diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 19:33:14 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 19:33:14 +0000 |
commit | 36d22d82aa202bb199967e9512281e9a53db42c9 (patch) | |
tree | 105e8c98ddea1c1e4784a60a5a6410fa416be2de /intl/icu/source/tools | |
parent | Initial commit. (diff) | |
download | firefox-esr-36d22d82aa202bb199967e9512281e9a53db42c9.tar.xz firefox-esr-36d22d82aa202bb199967e9512281e9a53db42c9.zip |
Adding upstream version 115.7.0esr. (refs: upstream/115.7.0esr, upstream)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'intl/icu/source/tools')
239 files changed, 65951 insertions, 0 deletions
diff --git a/intl/icu/source/tools/Makefile.in b/intl/icu/source/tools/Makefile.in new file mode 100644 index 0000000000..e0896f1206 --- /dev/null +++ b/intl/icu/source/tools/Makefile.in @@ -0,0 +1,87 @@ +## Makefile.in for ICU tools +## Copyright (C) 2016 and later: Unicode, Inc. and others. +## License & terms of use: http://www.unicode.org/copyright.html +## Copyright (c) 1999-2012, International Business Machines Corporation and +## others. All Rights Reserved. + +## Source directory information +srcdir = @srcdir@ +top_srcdir = @top_srcdir@ + +top_builddir = .. + +include $(top_builddir)/icudefs.mk + +## Build directory information +subdir = tools + +SUBDIRS = toolutil ctestfw makeconv genrb genbrk \ +gencnval gensprep icuinfo genccode gencmn icupkg pkgdata \ +gentest gennorm2 gencfu gendict icuexportdata + +ifneq (@platform_make_fragment_name@,mh-cygwin-msvc) +SUBDIRS += escapesrc +endif + +## List of phony targets +.PHONY : all all-local all-recursive install install-local \ +install-recursive clean clean-local clean-recursive distclean \ +distclean-local distclean-recursive dist dist-local dist-recursive \ +check check-local check-recursive build-local check-exhaustive + +## Clear suffix list +.SUFFIXES : + +## List of standard targets +all: all-recursive +install: install-recursive +clean: clean-local clean-recursive +distclean : distclean-recursive +dist: dist-recursive +check: all check-recursive + +check-exhaustive: check + +## Recursive targets +all-recursive install-recursive clean-recursive distclean-recursive dist-recursive check-recursive: + @dot_seen=no; \ + target=`echo $@ | sed s/-recursive//`; \ + list='$(SUBDIRS)'; for subdir in $$list; do \ + echo "$(MAKE)[$(MAKELEVEL)]: Making \`$$target' in \`$$subdir'"; \ + if test "$$subdir" = "."; then \ + dot_seen=yes; \ + local_target="$$target-local"; \ + else \ + local_target="$$target"; \ + fi; \ + (cd $$subdir && $(MAKE) $$local_target) || exit; \ + done; \ + if test "$$dot_seen" = "no"; then \ + 
$(MAKE) "$$target-local" || exit; \ + fi + +all-local: build-local + + +## Files to remove for 'make clean' +CLEANFILES = *~ + +install-local: + +dist-local: + +clean-local: + test -z "$(CLEANFILES)" || $(RMV) $(CLEANFILES) + +# Clean up any old variations.. +distclean-local: clean-local + $(RMV) Makefile + +build-local: + +check-local: + +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + cd $(top_builddir) \ + && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status + diff --git a/intl/icu/source/tools/ctestfw/Makefile.in b/intl/icu/source/tools/ctestfw/Makefile.in new file mode 100644 index 0000000000..2ad1fbe579 --- /dev/null +++ b/intl/icu/source/tools/ctestfw/Makefile.in @@ -0,0 +1,149 @@ +# Copyright (C) 2016 and later: Unicode, Inc. and others. +# License & terms of use: http://www.unicode.org/copyright.html +#****************************************************************************** +# +# Copyright (C) 1999-2011, International Business Machines +# Corporation and others. All Rights Reserved. +# +#****************************************************************************** +## Makefile.in for ICU - tools/ctestfw +## Stephen F. Booth + +## Source directory information +srcdir = @srcdir@ +top_srcdir = @top_srcdir@ + +top_builddir = ../.. + +## All the flags and other definitions are included here. 
+include $(top_builddir)/icudefs.mk + +## Build directory information +subdir = tools/ctestfw + +## Extra files to remove for 'make clean' +CLEANFILES = *~ $(DEPS) $(IMPORT_LIB) $(MIDDLE_IMPORT_LIB) $(FINAL_IMPORT_LIB) + +## Target information + +TARGET_STUBNAME=$(CTESTFW_STUBNAME) + +ifneq ($(ENABLE_STATIC),) +TARGET = $(LIBSICU)$(TARGET_STUBNAME)$(ICULIBSUFFIX).$(A) +endif + +ifneq ($(ENABLE_SHARED),) +SO_TARGET = $(LIBICU)$(TARGET_STUBNAME)$(ICULIBSUFFIX).$(SO) +ALL_SO_TARGETS = $(SO_TARGET) $(MIDDLE_SO_TARGET) $(FINAL_SO_TARGET) $(SHARED_OBJECT) +endif + +ALL_TARGETS = $(TARGET) $(ALL_SO_TARGETS) + +DYNAMICCPPFLAGS = $(SHAREDLIBCPPFLAGS) +DYNAMICCFLAGS = $(SHAREDLIBCFLAGS) +DYNAMICCXXFLAGS = $(SHAREDLIBCXXFLAGS) +CFLAGS += $(LIBCFLAGS) +CXXFLAGS += $(LIBCXXFLAGS) + +CPPFLAGS += -I$(top_srcdir)/common -I$(top_srcdir)/i18n -I$(srcdir)/../toolutil -I$(srcdir) $(LIBCPPFLAGS) $(CPPFLAGSCTESTFW) +DEFS += -DT_CTEST_IMPLEMENTATION +LDFLAGS += $(LDFLAGSCTESTFW) +LIBS = $(LIBICUTOOLUTIL) $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS) + +SOURCES = $(shell cat $(srcdir)/sources.txt) +OBJECTS = $(patsubst %.cpp,%.o,$(patsubst %.c,%.o, $(SOURCES))) + +STATIC_OBJECTS = $(OBJECTS:.o=.$(STATIC_O)) + +DEPS = $(OBJECTS:.o=.d) + +-include Makefile.local + +## List of phony targets +.PHONY : all all-local install install-local clean clean-local \ +distclean distclean-local dist dist-local check check-local + +## Clear suffix list +.SUFFIXES : + +## List of standard targets +all: all-local +install: install-local +clean: clean-local +distclean : distclean-local +dist: dist-local +check: all check-local + +all-local: $(ALL_TARGETS) + +install-local: install-library + +install-library: all-local + $(MKINSTALLDIRS) $(DESTDIR)$(libdir) +ifneq ($(ENABLE_STATIC),) + $(INSTALL-L) $(TARGET) $(DESTDIR)$(libdir) +endif +ifneq ($(ENABLE_SHARED),) +# For MinGW, do we want the DLL to go in the bin location? 
+ifeq ($(MINGW_MOVEDLLSTOBINDIR),YES) + $(MKINSTALLDIRS) $(DESTDIR)$(bindir) + $(INSTALL-L) $(FINAL_SO_TARGET) $(DESTDIR)$(bindir) +else + $(INSTALL-L) $(FINAL_SO_TARGET) $(DESTDIR)$(libdir) +ifneq ($(FINAL_SO_TARGET),$(SO_TARGET)) + cd $(DESTDIR)$(libdir) && $(RM) $(notdir $(SO_TARGET)) && ln -s $(notdir $(FINAL_SO_TARGET)) $(notdir $(SO_TARGET)) +ifneq ($(FINAL_SO_TARGET),$(MIDDLE_SO_TARGET)) + cd $(DESTDIR)$(libdir) && $(RM) $(notdir $(MIDDLE_SO_TARGET)) && ln -s $(notdir $(FINAL_SO_TARGET)) $(notdir $(MIDDLE_SO_TARGET)) +endif +endif +endif +ifneq ($(IMPORT_LIB_EXT),) + $(INSTALL-L) $(FINAL_IMPORT_LIB) $(DESTDIR)$(libdir) +ifneq ($(IMPORT_LIB),$(FINAL_IMPORT_LIB)) + cd $(DESTDIR)$(libdir) && $(RM) $(notdir $(IMPORT_LIB)) && ln -s $(notdir $(FINAL_IMPORT_LIB)) $(notdir $(IMPORT_LIB)) +endif +ifneq ($(MIDDLE_IMPORT_LIB),$(FINAL_IMPORT_LIB)) + cd $(DESTDIR)$(libdir) && $(RM) $(notdir $(MIDDLE_IMPORT_LIB)) && ln -s $(notdir $(FINAL_IMPORT_LIB)) $(notdir $(MIDDLE_IMPORT_LIB)) +endif +endif +endif + +dist-local: + +clean-local: + test -z "$(CLEANFILES)" || $(RMV) $(CLEANFILES) + $(RMV) $(OBJECTS) $(STATIC_OBJECTS) $(ALL_TARGETS) + +distclean-local: clean-local + $(RMV) Makefile + +check-local: all-local + +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + cd $(top_builddir) \ + && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status + +ifneq ($(ENABLE_STATIC),) +$(TARGET): $(STATIC_OBJECTS) + $(AR) $(ARFLAGS) $(AR_OUTOPT)$@ $^ + $(RANLIB) $@ +endif + +ifneq ($(ENABLE_SHARED),) +$(SHARED_OBJECT): $(OBJECTS) + $(SHLIB.cc) $(LD_SONAME) $(OUTOPT)$@ $^ $(LIBS) +ifeq ($(ENABLE_RPATH),YES) +ifneq ($(wildcard $(libdir)/$(MIDDLE_SO_TARGET)),) + $(warning RPATH warning: --enable-rpath means test programs may use existing $(libdir)/$(MIDDLE_SO_TARGET)) +endif +endif +endif + +ifeq (,$(MAKECMDGOALS)) +-include $(DEPS) +else +ifneq ($(patsubst %clean,,$(MAKECMDGOALS)),) +-include $(DEPS) +endif +endif + diff --git 
a/intl/icu/source/tools/ctestfw/ctest.c b/intl/icu/source/tools/ctestfw/ctest.c new file mode 100644 index 0000000000..99f9789d3f --- /dev/null +++ b/intl/icu/source/tools/ctestfw/ctest.c @@ -0,0 +1,1333 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************** +* +* Copyright (C) 1996-2014, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************** +*/ +#include <assert.h> +#include <ctype.h> +#include <stdarg.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "unicode/utrace.h" +#include "unicode/uclean.h" +#include "putilimp.h" +#include "udbgutil.h" + +/* NOTES: + 3/20/1999 srl - strncpy called w/o setting nulls at the end + */ + +#define MAXTESTNAME 128 +#define MAXTESTS 512 +#define MAX_TEST_LOG 4096 + +/** + * How may columns to indent the 'OK' markers. + */ +#define FLAG_INDENT 45 +/** + * How many lines of scrollage can go by before we need to remind the user what the test is. + */ +#define PAGE_SIZE_LIMIT 25 + +#ifndef SHOW_TIMES +#define SHOW_TIMES 1 +#endif + +struct TestNode +{ + void (*test)(void); + struct TestNode* sibling; + struct TestNode* child; + char name[1]; /* This is dynamically allocated off the end with malloc. 
*/ +}; + + +static const struct TestNode* currentTest; + +typedef enum { RUNTESTS, SHOWTESTS } TestMode; +#define TEST_SEPARATOR '/' + +#ifndef C_TEST_IMPL +#define C_TEST_IMPL +#endif + +#include "unicode/ctest.h" + +static char ERROR_LOG[MAX_TEST_LOG][MAXTESTNAME]; + +/* Local prototypes */ +static TestNode* addTestNode( TestNode *root, const char *name ); + +static TestNode *createTestNode(const char* name, int32_t nameLen); + +static int strncmp_nullcheck( const char* s1, + const char* s2, + int n ); + +static void getNextLevel( const char* name, + int* nameLen, + const char** nextName ); + +static void iterateTestsWithLevel( const TestNode *root, int depth, + const TestNode** nodeList, + TestMode mode); + +static void help ( const char *argv0 ); + +/** + * Do the work of logging an error. Doesn't increase the error count. + * + * @prefix optional prefix prepended to message, or NULL. + * @param pattern printf style pattern + * @param ap vprintf style arg list + */ +static void vlog_err(const char *prefix, const char *pattern, va_list ap); +static void vlog_verbose(const char *prefix, const char *pattern, va_list ap); +static UBool vlog_knownIssue(const char *ticket, const char *pattern, va_list ap); + +/** + * Log test structure, with indent + * @param pattern printf pattern + */ +static void log_testinfo_i(const char *pattern, ...); + +/** + * Log test structure, NO indent + * @param pattern printf pattern + */ +static void log_testinfo(const char *pattern, ...); + +/* If we need to make the framework multi-thread safe + we need to pass around the following vars +*/ +static int ERRONEOUS_FUNCTION_COUNT = 0; +static int ERROR_COUNT = 0; /* Count of errors from all tests. */ +static int ONE_ERROR = 0; /* were there any other errors? 
*/ +static int DATA_ERROR_COUNT = 0; /* count of data related errors or warnings */ +static int INDENT_LEVEL = 0; +static UBool NO_KNOWN = false; +static void *knownList = NULL; +static char gTestName[1024] = ""; +static UBool ON_LINE = false; /* are we on the top line with our test name? */ +static UBool HANGING_OUTPUT = false; /* did the user leave us without a trailing \n ? */ +static int GLOBAL_PRINT_COUNT = 0; /* global count of printouts */ +int REPEAT_TESTS_INIT = 0; /* Was REPEAT_TESTS initialized? */ +int REPEAT_TESTS = 1; /* Number of times to run the test */ +int VERBOSITY = 0; /* be No-verbose by default */ +int ERR_MSG =1; /* error messages will be displayed by default*/ +int QUICK = 1; /* Skip some of the slower tests? */ +int WARN_ON_MISSING_DATA = 0; /* Reduce data errs to warnings? */ +UTraceLevel ICU_TRACE = UTRACE_OFF; /* ICU tracing level */ +int WRITE_GOLDEN_DATA = 0; /* Overwrite golden data files? */ +size_t MINIMUM_MEMORY_SIZE_FAILURE = (size_t)-1; /* Minimum library memory allocation window that will fail. */ +size_t MAXIMUM_MEMORY_SIZE_FAILURE = (size_t)-1; /* Maximum library memory allocation window that will fail. 
*/ +static const char *ARGV_0 = "[ALL]"; +static const char *XML_FILE_NAME=NULL; +static char XML_PREFIX[256]; +static const char *SUMMARY_FILE = NULL; +FILE *XML_FILE = NULL; +/*-------------------------------------------*/ + +/* strncmp that also makes sure there's a \0 at s2[0] */ +static int strncmp_nullcheck( const char* s1, + const char* s2, + int n ) +{ + if (((int)strlen(s2) >= n) && s2[n] != 0) { + return 3; /* null check fails */ + } + else { + return strncmp ( s1, s2, n ); + } +} + +static void getNextLevel( const char* name, + int* nameLen, + const char** nextName ) +{ + /* Get the next component of the name */ + *nextName = strchr(name, TEST_SEPARATOR); + + if( *nextName != 0 ) + { + char n[255]; + *nameLen = (int)((*nextName) - name); + (*nextName)++; /* skip '/' */ + strncpy(n, name, *nameLen); + n[*nameLen] = 0; + /*printf("->%s-< [%d] -> [%s]\n", name, *nameLen, *nextName);*/ + } + else { + *nameLen = (int)strlen(name); + } +} + +static TestNode *createTestNode(const char* name, int32_t nameLen) +{ + TestNode *newNode; + + newNode = (TestNode*)malloc(sizeof(TestNode) + (nameLen + 1)); + + newNode->test = NULL; + newNode->sibling = NULL; + newNode->child = NULL; + + strncpy( newNode->name, name, nameLen ); + newNode->name[nameLen] = 0; + + return newNode; +} + +void T_CTEST_EXPORT2 +cleanUpTestTree(TestNode *tn) +{ + if(tn->child != NULL) { + cleanUpTestTree(tn->child); + } + if(tn->sibling != NULL) { + cleanUpTestTree(tn->sibling); + } + + free(tn); + +} + + +void T_CTEST_EXPORT2 +addTest(TestNode** root, + TestFunctionPtr test, + const char* name ) +{ + TestNode *newNode; + + /*if this is the first Test created*/ + if (*root == NULL) + *root = createTestNode("", 0); + + newNode = addTestNode( *root, name ); + assert(newNode != 0 ); + /* printf("addTest: nreName = %s\n", newNode->name );*/ + + newNode->test = test; +} + +/* non recursive insert function */ +static TestNode *addTestNode ( TestNode *root, const char *name ) +{ + const char* nextName; 
+ TestNode *nextNode, *curNode; + int nameLen; /* length of current 'name' */ + + /* remove leading slash */ + if ( *name == TEST_SEPARATOR ) + name++; + + curNode = root; + + for(;;) + { + /* Start with the next child */ + nextNode = curNode->child; + + getNextLevel ( name, &nameLen, &nextName ); + + /* printf("* %s\n", name );*/ + + /* if nextNode is already null, then curNode has no children + -- add them */ + if( nextNode == NULL ) + { + /* Add all children of the node */ + do + { + /* Get the next component of the name */ + getNextLevel(name, &nameLen, &nextName); + + /* update curName to have the next name segment */ + curNode->child = createTestNode(name, nameLen); + /* printf("*** added %s\n", curNode->child->name );*/ + curNode = curNode->child; + name = nextName; + } + while( name != NULL ); + + return curNode; + } + + /* Search across for the name */ + while (strncmp_nullcheck ( name, nextNode->name, nameLen) != 0 ) + { + curNode = nextNode; + nextNode = nextNode -> sibling; + + if ( nextNode == NULL ) + { + /* Did not find 'name' on this level. */ + nextNode = createTestNode(name, nameLen); + curNode->sibling = nextNode; + break; + } + } + + /* nextNode matches 'name' */ + + if (nextName == NULL) /* end of the line */ + { + return nextNode; + } + + /* Loop again with the next item */ + name = nextName; + curNode = nextNode; + } +} + +/** + * Log the time taken. May not output anything. + * @param deltaTime change in time + */ +void T_CTEST_EXPORT2 str_timeDelta(char *str, UDate deltaTime) { + if (deltaTime > 110000.0 ) { + double mins = uprv_floor(deltaTime/60000.0); + sprintf(str, "[(%.0fm %.1fs)]", mins, (deltaTime-(mins*60000.0))/1000.0); + } else if (deltaTime > 1500.0) { + sprintf(str, "((%.1fs))", deltaTime/1000.0); + } else if(deltaTime>900.0) { + sprintf(str, "( %.2fs )", deltaTime/1000.0); + } else if(deltaTime > 5.0) { + sprintf(str, " (%.0fms) ", deltaTime); + } else { + str[0]=0; /* at least terminate it. 
*/ + } +} + +static void print_timeDelta(UDate deltaTime) { + char str[256]; + str_timeDelta(str, deltaTime); + if(str[0]) { + printf("%s", str); + } +} + +/** + * Run or list tests (according to mode) in a subtree. + * + * @param root root of the subtree to operate on + * @param depth The depth of this tree (0=root) + * @param nodeList an array of MAXTESTS depth that's used for keeping track of where we are. nodeList[depth] points to the 'parent' at depth depth. + * @param mode what mode we are operating in. + */ +static void iterateTestsWithLevel ( const TestNode* root, + int depth, + const TestNode** nodeList, + TestMode mode) +{ + int i; + + char pathToFunction[MAXTESTNAME] = ""; + char separatorString[2] = { TEST_SEPARATOR, '\0'}; +#if SHOW_TIMES + UDate allStartTime = -1, allStopTime = -1; +#endif + + if(depth<2) { + allStartTime = uprv_getRawUTCtime(); + } + + if ( root == NULL ) + return; + + /* record the current root node, and increment depth. */ + nodeList[depth++] = root; + /* depth is now the depth of root's children. */ + + /* Collect the 'path' to the current subtree. */ + for ( i=0;i<(depth-1);i++ ) + { + strcat(pathToFunction, nodeList[i]->name); + strcat(pathToFunction, separatorString); + } + strcat(pathToFunction, nodeList[i]->name); /* including 'root' */ + + /* print test name and space. 
*/ + INDENT_LEVEL = depth-1; + if(root->name[0]) { + log_testinfo_i("%s ", root->name); + } else { + log_testinfo_i("(%s) ", ARGV_0); + } + ON_LINE = true; /* we are still on the line with the test name */ + + + if ( (mode == RUNTESTS) && + (root->test != NULL)) /* if root is a leaf node, run it */ + { + int myERROR_COUNT = ERROR_COUNT; + int myGLOBAL_PRINT_COUNT = GLOBAL_PRINT_COUNT; +#if SHOW_TIMES + UDate startTime, stopTime; + char timeDelta[256]; + char timeSeconds[256]; +#else + const char timeDelta[] = "(unknown)"; + const char timeSeconds[] = "0.000"; +#endif + currentTest = root; + INDENT_LEVEL = depth; /* depth of subitems */ + ONE_ERROR=0; + HANGING_OUTPUT=false; +#if SHOW_TIMES + startTime = uprv_getRawUTCtime(); +#endif + strcpy(gTestName, pathToFunction); + root->test(); /* PERFORM THE TEST ************************/ +#if SHOW_TIMES + stopTime = uprv_getRawUTCtime(); +#endif + if(HANGING_OUTPUT) { + log_testinfo("\n"); + HANGING_OUTPUT=false; + } + INDENT_LEVEL = depth-1; /* depth of root */ + currentTest = NULL; + if((ONE_ERROR>0)&&(ERROR_COUNT==0)) { + ERROR_COUNT++; /* There was an error without a newline */ + } + ONE_ERROR=0; + +#if SHOW_TIMES + str_timeDelta(timeDelta, stopTime-startTime); + sprintf(timeSeconds, "%f", (stopTime-startTime)/1000.0); +#endif + ctest_xml_testcase(pathToFunction, pathToFunction, timeSeconds, (myERROR_COUNT!=ERROR_COUNT)?"error":NULL); + + if (myERROR_COUNT != ERROR_COUNT) { + log_testinfo_i("} ---[%d ERRORS in %s] ", ERROR_COUNT - myERROR_COUNT, pathToFunction); + strcpy(ERROR_LOG[ERRONEOUS_FUNCTION_COUNT++], pathToFunction); + } else { + if(!ON_LINE) { /* had some output */ + int spaces = FLAG_INDENT-(depth-1); + log_testinfo_i("} %*s[OK] ", spaces, "---"); + if((GLOBAL_PRINT_COUNT-myGLOBAL_PRINT_COUNT)>PAGE_SIZE_LIMIT) { + log_testinfo(" %s ", pathToFunction); /* in case they forgot. */ + } + } else { + /* put -- out at 30 sp. 
*/ + int spaces = FLAG_INDENT - ((int)strlen(root->name) + depth); + if(spaces<0) spaces=0; + log_testinfo(" %*s[OK] ", spaces,"---"); + } + } + +#if SHOW_TIMES + if(timeDelta[0]) printf("%s", timeDelta); +#endif + + ON_LINE = true; /* we are back on-line */ + } + + INDENT_LEVEL = depth-1; /* root */ + + /* we want these messages to be at 0 indent. so just push the indent level briefly. */ + if(mode==SHOWTESTS) { + log_testinfo("---%s%c\n",pathToFunction, nodeList[i]->test?' ':TEST_SEPARATOR ); + } + + INDENT_LEVEL = depth; + + if(root->child) { + int myERROR_COUNT = ERROR_COUNT; + int myGLOBAL_PRINT_COUNT = GLOBAL_PRINT_COUNT; + if(mode!=SHOWTESTS) { + INDENT_LEVEL=depth-1; + log_testinfo("{\n"); + INDENT_LEVEL=depth; + } + + iterateTestsWithLevel ( root->child, depth, nodeList, mode ); + + if(mode!=SHOWTESTS) { + INDENT_LEVEL=depth-1; + log_testinfo_i("} "); /* TODO: summarize subtests */ + if((depth>1) && (ERROR_COUNT > myERROR_COUNT)) { + log_testinfo("[%d %s in %s] ", ERROR_COUNT-myERROR_COUNT, (ERROR_COUNT-myERROR_COUNT)==1?"error":"errors", pathToFunction); + } else if((GLOBAL_PRINT_COUNT-myGLOBAL_PRINT_COUNT)>PAGE_SIZE_LIMIT || (depth<1)) { + if(pathToFunction[0]) { + log_testinfo(" %s ", pathToFunction); /* in case they forgot. */ + } else { + log_testinfo(" / (%s) ", ARGV_0); + } + } + + ON_LINE=true; + } + } + depth--; + +#if SHOW_TIMES + if(depth<2) { + allStopTime = uprv_getRawUTCtime(); + print_timeDelta(allStopTime-allStartTime); + } +#endif + + if(mode!=SHOWTESTS && ON_LINE) { + log_testinfo("\n"); + } + + if ( depth != 0 ) { /* DO NOT iterate over siblings of the root. TODO: why not? 
*/ + iterateTestsWithLevel ( root->sibling, depth, nodeList, mode ); + } +} + + + +void T_CTEST_EXPORT2 +showTests ( const TestNode *root ) +{ + /* make up one for them */ + const TestNode *nodeList[MAXTESTS]; + + if (root == NULL) + log_err("TEST CAN'T BE FOUND!"); + + iterateTestsWithLevel ( root, 0, nodeList, SHOWTESTS ); + +} + +void T_CTEST_EXPORT2 +runTests ( const TestNode *root ) +{ + int i; + const TestNode *nodeList[MAXTESTS]; + /* make up one for them */ + + + if (root == NULL) + log_err("TEST CAN'T BE FOUND!\n"); + + ERRONEOUS_FUNCTION_COUNT = ERROR_COUNT = 0; + iterateTestsWithLevel ( root, 0, nodeList, RUNTESTS ); + + /*print out result summary*/ + + ON_LINE=false; /* just in case */ + + if(knownList != NULL) { + if( udbg_knownIssue_print(knownList) ) { + fprintf(stdout, "(To run suppressed tests, use the -K option.) \n\n"); + } + udbg_knownIssue_close(knownList); + knownList = NULL; + } + + if (ERROR_COUNT) + { + fprintf(stdout,"\nSUMMARY:\n"); + fflush(stdout); + fprintf(stdout,"******* [Total error count:\t%d]\n", ERROR_COUNT); + fflush(stdout); + fprintf(stdout, " Errors in\n"); + for (i=0;i < ERRONEOUS_FUNCTION_COUNT; i++) + fprintf(stdout, "[%s]\n",ERROR_LOG[i]); + if(SUMMARY_FILE != NULL) { + FILE *summf = fopen(SUMMARY_FILE, "w"); + if(summf!=NULL) { + for (i=0;i < ERRONEOUS_FUNCTION_COUNT; i++) + fprintf(summf, "%s\n",ERROR_LOG[i]); + fclose(summf); + } + } + } + else + { + log_testinfo("\n[All tests passed successfully...]\n"); + } + + if(DATA_ERROR_COUNT) { + if(WARN_ON_MISSING_DATA==0) { + log_testinfo("\t*Note* some errors are data-loading related. 
If the data used is not the \n" + "\tstock ICU data (i.e some have been added or removed), consider using\n" + "\tthe '-w' option to turn these errors into warnings.\n"); + } else { + log_testinfo("\t*WARNING* some data-loading errors were ignored by the -w option.\n"); + } + } +} + +const char* T_CTEST_EXPORT2 +getTestName(void) +{ + if(currentTest != NULL) { + return currentTest->name; + } else { + return NULL; + } +} + +const TestNode* T_CTEST_EXPORT2 +getTest(const TestNode* root, const char* name) +{ + const char* nextName; + TestNode *nextNode; + const TestNode* curNode; + int nameLen; /* length of current 'name' */ + + if (root == NULL) { + log_err("TEST CAN'T BE FOUND!\n"); + return NULL; + } + /* remove leading slash */ + if ( *name == TEST_SEPARATOR ) + name++; + + curNode = root; + + for(;;) + { + /* Start with the next child */ + nextNode = curNode->child; + + getNextLevel ( name, &nameLen, &nextName ); + + /* printf("* %s\n", name );*/ + + /* if nextNode is already null, then curNode has no children + -- add them */ + if( nextNode == NULL ) + { + return NULL; + } + + /* Search across for the name */ + while (strncmp_nullcheck ( name, nextNode->name, nameLen) != 0 ) + { + curNode = nextNode; + nextNode = nextNode -> sibling; + + if ( nextNode == NULL ) + { + /* Did not find 'name' on this level. 
*/ + return NULL; + } + } + + /* nextNode matches 'name' */ + + if (nextName == NULL) /* end of the line */ + { + return nextNode; + } + + /* Loop again with the next item */ + name = nextName; + curNode = nextNode; + } +} + +/* =========== io functions ======== */ + +static void go_offline_with_marker(const char *mrk) { + UBool wasON_LINE = ON_LINE; + + if(ON_LINE) { + log_testinfo(" {\n"); + ON_LINE=false; + } + + if(!HANGING_OUTPUT || wasON_LINE) { + if(mrk != NULL) { + fputs(mrk, stdout); + } + } +} + +static void go_offline() { + go_offline_with_marker(NULL); +} + +static void go_offline_err() { + go_offline(); +} + +static void first_line_verbose() { + go_offline_with_marker("v"); +} + +static void first_line_err() { + go_offline_with_marker("!"); +} + +static void first_line_info() { + go_offline_with_marker("\""); +} + +static void first_line_test() { + fputs(" ", stdout); +} + + +static void vlog_err(const char *prefix, const char *pattern, va_list ap) +{ + if( ERR_MSG == false){ + return; + } + fputs("!", stdout); /* col 1 - bang */ + fprintf(stdout, "%-*s", INDENT_LEVEL,"" ); + if(prefix) { + fputs(prefix, stdout); + } + vfprintf(stdout, pattern, ap); + fflush(stdout); + va_end(ap); + if((*pattern==0) || (pattern[strlen(pattern)-1]!='\n')) { + HANGING_OUTPUT=1; + } else { + HANGING_OUTPUT=0; + } + GLOBAL_PRINT_COUNT++; +} + +static UBool vlog_knownIssue(const char *ticket, const char *pattern, va_list ap) +{ + char buf[2048]; + UBool firstForTicket; + UBool firstForWhere; + + if(NO_KNOWN) return false; + if(pattern==NULL) pattern=""; + + vsprintf(buf, pattern, ap); + knownList = udbg_knownIssue_open(knownList, ticket, gTestName, buf, + &firstForTicket, &firstForWhere); + + if(firstForTicket || firstForWhere) { + log_info("(Known issue %s) %s\n", ticket, buf); + } else { + log_verbose("(Known issue %s) %s\n", ticket, buf); + } + + return true; +} + + +void T_CTEST_EXPORT2 +vlog_info(const char *prefix, const char *pattern, va_list ap) +{ + 
first_line_info(); + fprintf(stdout, "%-*s", INDENT_LEVEL,"" ); + if(prefix) { + fputs(prefix, stdout); + } + vfprintf(stdout, pattern, ap); + fflush(stdout); + va_end(ap); + if((*pattern==0) || (pattern[strlen(pattern)-1]!='\n')) { + HANGING_OUTPUT=1; + } else { + HANGING_OUTPUT=0; + } + GLOBAL_PRINT_COUNT++; +} +/** + * Log test structure, with indent + */ +static void log_testinfo_i(const char *pattern, ...) +{ + va_list ap; + first_line_test(); + fprintf(stdout, "%-*s", INDENT_LEVEL,"" ); + va_start(ap, pattern); + vfprintf(stdout, pattern, ap); + fflush(stdout); + va_end(ap); + GLOBAL_PRINT_COUNT++; +} +/** + * Log test structure (no ident) + */ +static void log_testinfo(const char *pattern, ...) +{ + va_list ap; + va_start(ap, pattern); + first_line_test(); + vfprintf(stdout, pattern, ap); + fflush(stdout); + va_end(ap); + GLOBAL_PRINT_COUNT++; +} + + +static void vlog_verbose(const char *prefix, const char *pattern, va_list ap) +{ + if ( VERBOSITY == false ) + return; + + first_line_verbose(); + fprintf(stdout, "%-*s", INDENT_LEVEL,"" ); + if(prefix) { + fputs(prefix, stdout); + } + vfprintf(stdout, pattern, ap); + fflush(stdout); + va_end(ap); + GLOBAL_PRINT_COUNT++; + if((*pattern==0) || (pattern[strlen(pattern)-1]!='\n')) { + HANGING_OUTPUT=1; + } else { + HANGING_OUTPUT=0; + } +} + +void T_CTEST_EXPORT2 +log_err(const char* pattern, ...) +{ + va_list ap; + first_line_err(); + if(strchr(pattern, '\n') != NULL) { + /* + * Count errors only if there is a line feed in the pattern + * so that we do not exaggerate our error count. + */ + ++ERROR_COUNT; + } else { + /* Count at least one error. */ + ONE_ERROR=1; + } + va_start(ap, pattern); + vlog_err(NULL, pattern, ap); +} + +UBool T_CTEST_EXPORT2 +log_knownIssue(const char *ticket, const char *pattern, ...) { + va_list ap; + va_start(ap, pattern); + return vlog_knownIssue(ticket, pattern, ap); +} + +void T_CTEST_EXPORT2 +log_err_status(UErrorCode status, const char* pattern, ...) 
+{ + va_list ap; + va_start(ap, pattern); + + if ((status == U_FILE_ACCESS_ERROR || status == U_MISSING_RESOURCE_ERROR)) { + ++DATA_ERROR_COUNT; /* for informational message at the end */ + + if (WARN_ON_MISSING_DATA == 0) { + first_line_err(); + /* Fatal error. */ + if (strchr(pattern, '\n') != NULL) { + ++ERROR_COUNT; + } else { + ++ONE_ERROR; + } + vlog_err(NULL, pattern, ap); /* no need for prefix in default case */ + } else { + vlog_info("[DATA] ", pattern, ap); + } + } else { + first_line_err(); + /* Fatal error. */ + if(strchr(pattern, '\n') != NULL) { + ++ERROR_COUNT; + } else { + ++ONE_ERROR; + } + vlog_err(NULL, pattern, ap); /* no need for prefix in default case */ + } +} + +void T_CTEST_EXPORT2 +log_info(const char* pattern, ...) +{ + va_list ap; + + va_start(ap, pattern); + vlog_info(NULL, pattern, ap); +} + +void T_CTEST_EXPORT2 +log_verbose(const char* pattern, ...) +{ + va_list ap; + + va_start(ap, pattern); + vlog_verbose(NULL, pattern, ap); +} + + +void T_CTEST_EXPORT2 +log_data_err(const char* pattern, ...) +{ + va_list ap; + va_start(ap, pattern); + + go_offline_err(); + ++DATA_ERROR_COUNT; /* for informational message at the end */ + + if(WARN_ON_MISSING_DATA == 0) { + /* Fatal error. */ + if(strchr(pattern, '\n') != NULL) { + ++ERROR_COUNT; + } + vlog_err(NULL, pattern, ap); /* no need for prefix in default case */ + } else { + vlog_info("[DATA] ", pattern, ap); + } +} + + +/* + * Tracing functions. 
+ */ +static int traceFnNestingDepth = 0; +U_CDECL_BEGIN +static void U_CALLCONV TraceEntry(const void *context, int32_t fnNumber) { + (void)context; // suppress compiler warnings about unused variable + char buf[500]; + utrace_format(buf, sizeof(buf), traceFnNestingDepth*3, "%s() enter.\n", utrace_functionName(fnNumber)); + buf[sizeof(buf)-1]=0; + fputs(buf, stdout); + traceFnNestingDepth++; +} + +static void U_CALLCONV TraceExit(const void *context, int32_t fnNumber, const char *fmt, va_list args) { + (void)context; // suppress compiler warnings about unused variable + char buf[500]; + if (traceFnNestingDepth>0) { + traceFnNestingDepth--; + } + utrace_format(buf, sizeof(buf), traceFnNestingDepth*3, "%s() ", utrace_functionName(fnNumber)); + buf[sizeof(buf)-1]=0; + fputs(buf, stdout); + utrace_vformat(buf, sizeof(buf), traceFnNestingDepth*3, fmt, args); + buf[sizeof(buf)-1]=0; + fputs(buf, stdout); + putc('\n', stdout); +} + +static void U_CALLCONV TraceData(const void *context, int32_t fnNumber, + int32_t level, const char *fmt, va_list args) { + // suppress compiler warnings about unused variables + (void)context; + (void)fnNumber; + (void)level; + char buf[500]; + utrace_vformat(buf, sizeof(buf), traceFnNestingDepth*3, fmt, args); + buf[sizeof(buf)-1]=0; + fputs(buf, stdout); + putc('\n', stdout); +} + +static void *U_CALLCONV ctest_libMalloc(const void *context, size_t size) { + (void)context; // suppress compiler warnings about unused variable + /*if (VERBOSITY) { + printf("Allocated %ld\n", (long)size); + }*/ + if (MINIMUM_MEMORY_SIZE_FAILURE <= size && size <= MAXIMUM_MEMORY_SIZE_FAILURE) { + return NULL; + } + return malloc(size); +} +static void *U_CALLCONV ctest_libRealloc(const void *context, void *mem, size_t size) { + (void)context; // suppress compiler warnings about unused variable + /*if (VERBOSITY) { + printf("Reallocated %ld\n", (long)size); + }*/ + if (MINIMUM_MEMORY_SIZE_FAILURE <= size && size <= MAXIMUM_MEMORY_SIZE_FAILURE) { + /*free(mem);*/ 
/* Realloc doesn't free on failure. */ + return NULL; + } + return realloc(mem, size); +} +static void U_CALLCONV ctest_libFree(const void *context, void *mem) { + (void)context; // suppress compiler warnings about unused variable + free(mem); +} + +int T_CTEST_EXPORT2 +initArgs( int argc, const char* const argv[], ArgHandlerPtr argHandler, void *context) +{ + int i; + int argSkip = 0; + + VERBOSITY = false; + ERR_MSG = true; + + ARGV_0=argv[0]; + + for( i=1; i<argc; i++) + { + if ( argv[i][0] == '/' ) + { + /* We don't run the tests here. */ + continue; + } + else if ((strcmp( argv[i], "-a") == 0) || (strcmp(argv[i],"-all") == 0)) + { + /* We don't run the tests here. */ + continue; + } + else if (strcmp( argv[i], "-v" )==0 || strcmp( argv[i], "-verbose")==0) + { + VERBOSITY = true; + } + else if (strcmp( argv[i], "-l" )==0 ) + { + /* doList = true; */ + } + else if (strcmp( argv[i], "-e1") == 0) + { + QUICK = -1; + } + else if (strcmp( argv[i], "-e") ==0) + { + QUICK = 0; + } + else if (strcmp( argv[i], "-K") ==0) + { + NO_KNOWN = 1; + } + else if (strncmp( argv[i], "-E",2) ==0) + { + SUMMARY_FILE=argv[i]+2; + } + else if (strcmp( argv[i], "-w") ==0) + { + WARN_ON_MISSING_DATA = true; + } + else if (strcmp( argv[i], "-m") ==0) + { + UErrorCode errorCode = U_ZERO_ERROR; + if (i+1 < argc) { + char *endPtr = NULL; + i++; + MINIMUM_MEMORY_SIZE_FAILURE = (size_t)strtol(argv[i], &endPtr, 10); + if (endPtr == argv[i]) { + printf("Can't parse %s\n", argv[i]); + help(argv[0]); + return 0; + } + if (*endPtr == '-') { + char *maxPtr = endPtr+1; + endPtr = NULL; + MAXIMUM_MEMORY_SIZE_FAILURE = (size_t)strtol(maxPtr, &endPtr, 10); + if (endPtr == argv[i]) { + printf("Can't parse %s\n", argv[i]); + help(argv[0]); + return 0; + } + } + } + /* Use the default value */ + u_setMemoryFunctions(NULL, ctest_libMalloc, ctest_libRealloc, ctest_libFree, &errorCode); + if (U_FAILURE(errorCode)) { + printf("u_setMemoryFunctions returned %s\n", u_errorName(errorCode)); + return 0; + } + } 
+ else if(strcmp( argv[i], "-n") == 0 || strcmp( argv[i], "-no_err_msg") == 0) + { + ERR_MSG = false; + } + else if (strcmp( argv[i], "-r") == 0) + { + if (!REPEAT_TESTS_INIT) { + REPEAT_TESTS++; + } + } + else if (strcmp( argv[i], "-x") == 0) + { + if(++i>=argc) { + printf("* Error: '-x' option requires an argument. usage: '-x outfile.xml'.\n"); + return 0; + } + if(ctest_xml_setFileName(argv[i])) { /* set the name */ + return 0; + } + } + else if (strcmp( argv[i], "-t_info") == 0) { + ICU_TRACE = UTRACE_INFO; + } + else if (strcmp( argv[i], "-t_error") == 0) { + ICU_TRACE = UTRACE_ERROR; + } + else if (strcmp( argv[i], "-t_warn") == 0) { + ICU_TRACE = UTRACE_WARNING; + } + else if (strcmp( argv[i], "-t_verbose") == 0) { + ICU_TRACE = UTRACE_VERBOSE; + } + else if (strcmp( argv[i], "-t_oc") == 0) { + ICU_TRACE = UTRACE_OPEN_CLOSE; + } + else if (strcmp( argv[i], "-G") == 0) { + WRITE_GOLDEN_DATA = 1; + } + else if (strcmp( argv[i], "-h" )==0 || strcmp( argv[i], "--help" )==0) + { + help( argv[0] ); + return 0; + } + else if (argHandler != NULL && (argSkip = argHandler(i, argc, argv, context)) > 0) + { + i += argSkip - 1; + } + else + { + printf("* unknown option: %s\n", argv[i]); + help( argv[0] ); + return 0; + } + } + if (ICU_TRACE != UTRACE_OFF) { + utrace_setFunctions(NULL, TraceEntry, TraceExit, TraceData); + utrace_setLevel(ICU_TRACE); + } + + return 1; /* total error count */ +} + +int T_CTEST_EXPORT2 +runTestRequest(const TestNode* root, + int argc, + const char* const argv[]) +{ + /** + * This main will parse the l, v, h, n, and path arguments + */ + const TestNode* toRun; + int i; + int doList = false; + int subtreeOptionSeen = false; + + int errorCount = 0; + + toRun = root; + + if(ctest_xml_init(ARGV_0)) { + return 1; /* couldn't fire up XML thing */ + } + + for( i=1; i<argc; i++) + { + if ( argv[i][0] == '/' ) + { + printf("Selecting subtree '%s'\n", argv[i]); + + if ( argv[i][1] == 0 ) + toRun = root; + else + toRun = getTest(root, argv[i]); + + if ( 
toRun == NULL ) + { + printf("* Could not find any matching subtree\n"); + return -1; + } + + ON_LINE=false; /* just in case */ + + if( doList == true) + showTests(toRun); + else + runTests(toRun); + + ON_LINE=false; /* just in case */ + + errorCount += ERROR_COUNT; + + subtreeOptionSeen = true; + } else if ((strcmp( argv[i], "-a") == 0) || (strcmp(argv[i],"-all") == 0)) { + subtreeOptionSeen=false; + } else if (strcmp( argv[i], "-l") == 0) { + doList = true; + } + /* else option already handled by initArgs */ + } + + if( subtreeOptionSeen == false) /* no other subtree given, run the default */ + { + ON_LINE=false; /* just in case */ + if( doList == true) + showTests(toRun); + else + runTests(toRun); + ON_LINE=false; /* just in case */ + + errorCount += ERROR_COUNT; + } + else + { + if( ( doList == false ) && ( errorCount > 0 ) ) + printf(" Total errors: %d\n", errorCount ); + } + + REPEAT_TESTS_INIT = 1; + + if(ctest_xml_fini()) { + errorCount++; + } + + return errorCount; /* total error count */ +} + +/** + * Display program invocation arguments + */ + +static void help ( const char *argv0 ) +{ + printf("Usage: %s [ -l ] [ -v ] [ -verbose] [-a] [ -all] [-n] [ -no_err_msg]\n" + " [ -h ] [-t_info | -t_error | -t_warn | -t_oc | -t_verbose] [-m n[-q] ]\n" + " [ /path/to/test ]\n", + argv0); + printf(" -l To get a list of test names\n"); + printf(" -e to do exhaustive testing\n"); + printf(" -verbose To turn ON verbosity\n"); + printf(" -v To turn ON verbosity(same as -verbose)\n"); + printf(" -x file.xml Write junit format output to file.xml\n"); + printf(" -h To print this message\n"); + printf(" -K to turn OFF suppressing known issues\n"); + printf(" -n To turn OFF printing error messages\n"); + printf(" -w Don't fail on data-loading errs, just warn. 
Useful if\n" + " user has reduced/changed the common set of ICU data \n"); + printf(" -t_info | -t_error | -t_warn | -t_oc | -t_verbose Enable ICU tracing\n"); + printf(" -no_err_msg (same as -n) \n"); + printf(" -m n[-q] Min-Max memory size that will cause an allocation failure.\n"); + printf(" The default is the maximum value of size_t. Max is optional.\n"); + printf(" -r Repeat tests after calling u_cleanup \n"); + printf(" -G Write golden data files \n"); + printf(" [/subtest] To run a subtest \n"); + printf(" eg: to run just the utility tests type: cintltest /tsutil) \n"); +} + +int32_t T_CTEST_EXPORT2 +getTestOption ( int32_t testOption ) { + switch (testOption) { + case VERBOSITY_OPTION: + return VERBOSITY; + case WARN_ON_MISSING_DATA_OPTION: + return WARN_ON_MISSING_DATA; + case QUICK_OPTION: + return QUICK; + case REPEAT_TESTS_OPTION: + return REPEAT_TESTS; + case ERR_MSG_OPTION: + return ERR_MSG; + case ICU_TRACE_OPTION: + return ICU_TRACE; + case WRITE_GOLDEN_DATA_OPTION: + return WRITE_GOLDEN_DATA; + default : + return 0; + } +} + +void T_CTEST_EXPORT2 +setTestOption ( int32_t testOption, int32_t value) { + if (value == DECREMENT_OPTION_VALUE) { + value = getTestOption(testOption); + --value; + } + switch (testOption) { + case VERBOSITY_OPTION: + VERBOSITY = value; + break; + case WARN_ON_MISSING_DATA_OPTION: + WARN_ON_MISSING_DATA = value; + break; + case QUICK_OPTION: + QUICK = value; + break; + case REPEAT_TESTS_OPTION: + REPEAT_TESTS = value; + break; + case ICU_TRACE_OPTION: + ICU_TRACE = (UTraceLevel)value; + break; + case WRITE_GOLDEN_DATA_OPTION: + WRITE_GOLDEN_DATA = value; + default : + break; + } +} + + +/* + * ================== JUnit support ================================ + */ + +int32_t +T_CTEST_EXPORT2 +ctest_xml_setFileName(const char *name) { + XML_FILE_NAME=name; + return 0; +} + + +int32_t +T_CTEST_EXPORT2 +ctest_xml_init(const char *rootName) { + if(!XML_FILE_NAME) return 0; + XML_FILE = fopen(XML_FILE_NAME,"w"); + if(!XML_FILE) { 
+ perror("fopen"); + fprintf(stderr," Error: couldn't open XML output file %s\n", XML_FILE_NAME); + return 1; + } + while(*rootName&&!isalnum((int)*rootName)) { + rootName++; + } + strcpy(XML_PREFIX,rootName); + { + char *p = XML_PREFIX+strlen(XML_PREFIX); + for(p--;*p&&p>XML_PREFIX&&!isalnum((int)*p);p--) { + *p=0; + } + } + /* write prefix */ + fprintf(XML_FILE, "<testsuite name=\"%s\">\n", XML_PREFIX); + + return 0; +} + +int32_t +T_CTEST_EXPORT2 +ctest_xml_fini(void) { + if(!XML_FILE) return 0; + + fprintf(XML_FILE, "</testsuite>\n"); + fclose(XML_FILE); + printf(" ( test results written to %s )\n", XML_FILE_NAME); + XML_FILE=0; + return 0; +} + + +int32_t +T_CTEST_EXPORT2 +ctest_xml_testcase(const char *classname, const char *name, const char *timeSeconds, const char *failMsg) { + if(!XML_FILE) return 0; + + fprintf(XML_FILE, "\t<testcase classname=\"%s:%s\" name=\"%s:%s\" time=\"%s\"", XML_PREFIX, classname, XML_PREFIX, name, timeSeconds); + if(failMsg) { + fprintf(XML_FILE, ">\n\t\t<failure type=\"err\" message=\"%s\"/>\n\t</testcase>\n", failMsg); + } else { + fprintf(XML_FILE, "/>\n"); + } + + return 0; +} + + diff --git a/intl/icu/source/tools/ctestfw/ctestfw.vcxproj b/intl/icu/source/tools/ctestfw/ctestfw.vcxproj new file mode 100644 index 0000000000..f55c3f0b00 --- /dev/null +++ b/intl/icu/source/tools/ctestfw/ctestfw.vcxproj @@ -0,0 +1,104 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <PropertyGroup Label="Globals"> + <ProjectGuid>{ECA6B435-B4FA-4F9F-BF95-F451D078FC47}</ProjectGuid> + </PropertyGroup> + <PropertyGroup Label="Configuration"> + <ConfigurationType>DynamicLibrary</ConfigurationType> + <UseOfMfc>false</UseOfMfc> + <CharacterSet>MultiByte</CharacterSet> + </PropertyGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> + <!-- The following import will include the 'default' configuration options for VS projects. 
--> + <Import Project="..\..\allinone\Build.Windows.ProjectConfiguration.props" /> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> + <ImportGroup Label="ExtensionSettings"> + </ImportGroup> + <PropertyGroup Label="UserMacros" /> + <PropertyGroup> + <_ProjectFileVersion>10.0.30319.1</_ProjectFileVersion> + <OutDir>.\$(Platform)\$(Configuration)\</OutDir> + <IntDir>.\$(Platform)\$(Configuration)\</IntDir> + <!-- The ICU projects use "Win32" to mean "x86", so we need to special case it. --> + <OutDir Condition="'$(Platform)'=='Win32'">.\x86\$(Configuration)\</OutDir> + <IntDir Condition="'$(Platform)'=='Win32'">.\x86\$(Configuration)\</IntDir> + <!-- Disable Incremental Linking for Release builds as it prevents Link-time Code Generation --> + <LinkIncremental Condition="'$(Configuration)'=='Debug'">true</LinkIncremental> + <LinkIncremental Condition="'$(Configuration)'=='Release'">false</LinkIncremental> + </PropertyGroup> + <!-- Options that are common to *all* project configurations --> + <ItemDefinitionGroup> + <Midl> + <TypeLibraryName>$(OutDir)/icutest.tlb</TypeLibraryName> + </Midl> + <ClCompile> + <AdditionalIncludeDirectories>..\..\..\include;..\..\common;..\toolutil;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> + <PreprocessorDefinitions>T_CTEST_IMPLEMENTATION;%(PreprocessorDefinitions)</PreprocessorDefinitions> + <DisableLanguageExtensions>true</DisableLanguageExtensions> + <WarningLevel>Level3</WarningLevel> + <CompileAs>Default</CompileAs> + <PrecompiledHeaderOutputFile>$(OutDir)/icutest.pch</PrecompiledHeaderOutputFile> + <AssemblerListingLocation>$(OutDir)/</AssemblerListingLocation> + <ObjectFileName>$(OutDir)/</ObjectFileName> + <ProgramDataBaseFileName>$(OutDir)/icutest.pdb</ProgramDataBaseFileName> + </ClCompile> + <Link> + <AdditionalLibraryDirectories>..\..\..\$(IcuLibOutputDir);%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories> + </Link> + </ItemDefinitionGroup> + <!-- Options that are common to all 'Debug' 
project configurations --> + <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'"> + <ClCompile> + <BrowseInformation>true</BrowseInformation> + <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary> + </ClCompile> + <Link> + <OutputFile>..\..\..\$(IcuBinOutputDir)\icutest$(IcuMajorVersion)d.exe</OutputFile> + <ProgramDatabaseFile>.\..\..\..\$(IcuLibOutputDir)\icutestd.pdb</ProgramDatabaseFile> + <ImportLibrary>.\..\..\..\$(IcuLibOutputDir)\icutestd.lib</ImportLibrary> + <AdditionalDependencies>icuucd.lib;icutud.lib;%(AdditionalDependencies)</AdditionalDependencies> + </Link> + </ItemDefinitionGroup> + <!-- Options that are common to all 'Release' project configurations --> + <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'"> + <ClCompile> + <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary> + <FunctionLevelLinking>true</FunctionLevelLinking> + </ClCompile> + <Link> + <OutputFile>..\..\..\$(IcuBinOutputDir)\icutest$(IcuMajorVersion).exe</OutputFile> + <ProgramDatabaseFile>.\..\..\..\$(IcuLibOutputDir)\icutest.pdb</ProgramDatabaseFile> + <ImportLibrary>.\..\..\..\$(IcuLibOutputDir)\icutest.lib</ImportLibrary> + <AdditionalDependencies>icuuc.lib;icutu.lib;%(AdditionalDependencies)</AdditionalDependencies> + </Link> + </ItemDefinitionGroup> + <ItemGroup> + <ClCompile Include="ctest.c" /> + <ClCompile Include="datamap.cpp" /> + <ClCompile Include="testdata.cpp"> + <DisableLanguageExtensions>false</DisableLanguageExtensions> + </ClCompile> + <ClCompile Include="tstdtmod.cpp"> + <DisableLanguageExtensions>false</DisableLanguageExtensions> + </ClCompile> + <ClCompile Include="ucln_ct.c"> + <DisableLanguageExtensions>false</DisableLanguageExtensions> + </ClCompile> + <ClCompile Include="uperf.cpp"> + <DisableLanguageExtensions>false</DisableLanguageExtensions> + </ClCompile> + </ItemGroup> + <ItemGroup> + <ClInclude Include="unicode\ctest.h" /> + <ClInclude Include="unicode\datamap.h" /> + <ClInclude Include="unicode\testdata.h" /> + <ClInclude 
Include="unicode\testlog.h" /> + <ClInclude Include="unicode\testtype.h" /> + <ClInclude Include="unicode\tstdtmod.h" /> + <ClInclude Include="unicode\uperf.h" /> + <ClInclude Include="unicode\utimer.h" /> + </ItemGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> + <ImportGroup Label="ExtensionTargets"> + </ImportGroup> +</Project> diff --git a/intl/icu/source/tools/ctestfw/ctestfw.vcxproj.filters b/intl/icu/source/tools/ctestfw/ctestfw.vcxproj.filters new file mode 100644 index 0000000000..31da517dd1 --- /dev/null +++ b/intl/icu/source/tools/ctestfw/ctestfw.vcxproj.filters @@ -0,0 +1,63 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <ItemGroup> + <Filter Include="Source Files"> + <UniqueIdentifier>{852ed8c9-5bc0-4d29-8eb6-be22c01226a8}</UniqueIdentifier> + <Extensions>cpp;c;cxx;rc;def;r;odl;idl;hpj;bat</Extensions> + </Filter> + <Filter Include="Header Files"> + <UniqueIdentifier>{b2dfb7a8-10dc-4668-bc01-42b2b3403944}</UniqueIdentifier> + <Extensions>h;hpp;hxx;hm;inl</Extensions> + </Filter> + <Filter Include="Resource Files"> + <UniqueIdentifier>{76918e76-1025-421a-9363-11071191fbbc}</UniqueIdentifier> + <Extensions>ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe</Extensions> + </Filter> + </ItemGroup> + <ItemGroup> + <ClCompile Include="ctest.c"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="datamap.cpp"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="testdata.cpp"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="tstdtmod.cpp"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="ucln_ct.c"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="uperf.cpp"> + <Filter>Source Files</Filter> + </ClCompile> + </ItemGroup> + <ItemGroup> + <ClInclude Include="unicode\ctest.h"> + <Filter>Header Files</Filter> + </ClInclude> + <ClInclude Include="unicode\datamap.h"> + 
<Filter>Header Files</Filter> + </ClInclude> + <ClInclude Include="unicode\testdata.h"> + <Filter>Header Files</Filter> + </ClInclude> + <ClInclude Include="unicode\testlog.h"> + <Filter>Header Files</Filter> + </ClInclude> + <ClInclude Include="unicode\testtype.h"> + <Filter>Header Files</Filter> + </ClInclude> + <ClInclude Include="unicode\tstdtmod.h"> + <Filter>Header Files</Filter> + </ClInclude> + <ClInclude Include="unicode\uperf.h"> + <Filter>Header Files</Filter> + </ClInclude> + <ClInclude Include="unicode\utimer.h"> + <Filter>Header Files</Filter> + </ClInclude> + </ItemGroup> +</Project>
\ No newline at end of file diff --git a/intl/icu/source/tools/ctestfw/datamap.cpp b/intl/icu/source/tools/ctestfw/datamap.cpp new file mode 100644 index 0000000000..0dd86f4f5b --- /dev/null +++ b/intl/icu/source/tools/ctestfw/datamap.cpp @@ -0,0 +1,224 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/******************************************************************** + * COPYRIGHT: + * Copyright (c) 2002-2006, International Business Machines Corporation and + * others. All Rights Reserved. + ********************************************************************/ + +/* Created by weiv 05/09/2002 */ + +#include "unicode/datamap.h" +#include "unicode/resbund.h" +#include "unicode/unistr.h" +#include "hash.h" +#include <stdlib.h> + +DataMap::~DataMap() {} +DataMap::DataMap() {} + +int32_t +DataMap::utoi(const UnicodeString &s) const +{ + char ch[256]; + const char16_t *u = toUCharPtr(s.getBuffer()); + int32_t len = s.length(); + u_UCharsToChars(u, ch, len); + ch[len] = 0; /* include terminating \0 */ + return atoi(ch); +} + +U_CDECL_BEGIN +void U_CALLCONV +deleteResBund(void *obj) { + delete (ResourceBundle *)obj; +} +U_CDECL_END + + +RBDataMap::~RBDataMap() +{ + delete fData; +} + +RBDataMap::RBDataMap() +{ + UErrorCode status = U_ZERO_ERROR; + fData = new Hashtable(true, status); + fData->setValueDeleter(deleteResBund); +} + +// init from table resource +// will put stuff in hashtable according to +// keys. 
+RBDataMap::RBDataMap(UResourceBundle *data, UErrorCode &status) +{ + fData = new Hashtable(true, status); + fData->setValueDeleter(deleteResBund); + init(data, status); +} + +// init from headers and resource +// with checking the whether the size of resource matches +// header size +RBDataMap::RBDataMap(UResourceBundle *headers, UResourceBundle *data, UErrorCode &status) +{ + fData = new Hashtable(true, status); + fData->setValueDeleter(deleteResBund); + init(headers, data, status); +} + + +void RBDataMap::init(UResourceBundle *data, UErrorCode &status) { + int32_t i = 0; + fData->removeAll(); + UResourceBundle *t = nullptr; + for(i = 0; i < ures_getSize(data); i++) { + t = ures_getByIndex(data, i, t, &status); + fData->put(UnicodeString(ures_getKey(t), -1, US_INV), new ResourceBundle(t, status), status); + } + ures_close(t); +} + +void RBDataMap::init(UResourceBundle *headers, UResourceBundle *data, UErrorCode &status) +{ + int32_t i = 0; + fData->removeAll(); + UResourceBundle *t = nullptr; + const char16_t *key = nullptr; + int32_t keyLen = 0; + if(ures_getSize(headers) == ures_getSize(data)) { + for(i = 0; i < ures_getSize(data); i++) { + t = ures_getByIndex(data, i, t, &status); + key = ures_getStringByIndex(headers, i, &keyLen, &status); + fData->put(UnicodeString(key, keyLen), new ResourceBundle(t, status), status); + } + } else { + // error + status = U_INVALID_FORMAT_ERROR; + } + ures_close(t); +} + +const ResourceBundle *RBDataMap::getItem(const char* key, UErrorCode &status) const +{ + if(U_FAILURE(status)) { + return nullptr; + } + + UnicodeString hashKey(key, -1, US_INV); + const ResourceBundle *r = (ResourceBundle *)fData->get(hashKey); + if(r != nullptr) { + return r; + } else { + status = U_MISSING_RESOURCE_ERROR; + return nullptr; + } +} + +const UnicodeString RBDataMap::getString(const char* key, UErrorCode &status) const +{ + const ResourceBundle *r = getItem(key, status); + if(U_SUCCESS(status)) { + return r->getString(status); + } else { + 
return UnicodeString(); + } +} + +int32_t +RBDataMap::getInt28(const char* key, UErrorCode &status) const +{ + const ResourceBundle *r = getItem(key, status); + if(U_SUCCESS(status)) { + return r->getInt(status); + } else { + return 0; + } +} + +uint32_t +RBDataMap::getUInt28(const char* key, UErrorCode &status) const +{ + const ResourceBundle *r = getItem(key, status); + if(U_SUCCESS(status)) { + return r->getUInt(status); + } else { + return 0; + } +} + +const int32_t * +RBDataMap::getIntVector(int32_t &length, const char *key, UErrorCode &status) const { + const ResourceBundle *r = getItem(key, status); + if(U_SUCCESS(status)) { + return r->getIntVector(length, status); + } else { + return nullptr; + } +} + +const uint8_t * +RBDataMap::getBinary(int32_t &length, const char *key, UErrorCode &status) const { + const ResourceBundle *r = getItem(key, status); + if(U_SUCCESS(status)) { + return r->getBinary(length, status); + } else { + return nullptr; + } +} + +int32_t RBDataMap::getInt(const char* key, UErrorCode &status) const +{ + UnicodeString r = this->getString(key, status); + if(U_SUCCESS(status)) { + return utoi(r); + } else { + return 0; + } +} + +const UnicodeString* RBDataMap::getStringArray(int32_t& count, const char* key, UErrorCode &status) const +{ + const ResourceBundle *r = getItem(key, status); + if(U_SUCCESS(status)) { + int32_t i = 0; + + count = r->getSize(); + if(count <= 0) { + return nullptr; + } + + UnicodeString *result = new UnicodeString[count]; + for(i = 0; i<count; i++) { + result[i] = r->getStringEx(i, status); + } + return result; + } else { + return nullptr; + } +} + +const int32_t* RBDataMap::getIntArray(int32_t& count, const char* key, UErrorCode &status) const +{ + const ResourceBundle *r = getItem(key, status); + if(U_SUCCESS(status)) { + int32_t i = 0; + + count = r->getSize(); + if(count <= 0) { + return nullptr; + } + + int32_t *result = new int32_t[count]; + UnicodeString stringRes; + for(i = 0; i<count; i++) { + stringRes = 
r->getStringEx(i, status); + result[i] = utoi(stringRes); + } + return result; + } else { + return nullptr; + } +} + diff --git a/intl/icu/source/tools/ctestfw/sources.txt b/intl/icu/source/tools/ctestfw/sources.txt new file mode 100644 index 0000000000..30103db4a1 --- /dev/null +++ b/intl/icu/source/tools/ctestfw/sources.txt @@ -0,0 +1,6 @@ +ctest.c +datamap.cpp +testdata.cpp +tstdtmod.cpp +ucln_ct.c +uperf.cpp diff --git a/intl/icu/source/tools/ctestfw/testdata.cpp b/intl/icu/source/tools/ctestfw/testdata.cpp new file mode 100644 index 0000000000..2fb93381dc --- /dev/null +++ b/intl/icu/source/tools/ctestfw/testdata.cpp @@ -0,0 +1,144 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/******************************************************************** + * COPYRIGHT: + * Copyright (c) 2002-2005, International Business Machines Corporation and + * others. All Rights Reserved. + ********************************************************************/ + +/* Created by weiv 05/09/2002 */ + +#include "unicode/testdata.h" + + +TestData::TestData(const char* testName) +: name(testName), +fInfo(nullptr), +fCurrSettings(nullptr), +fCurrCase(nullptr), +fSettingsSize(0), +fCasesSize(0), +fCurrentSettings(0), +fCurrentCase(0) + +{ +} + +TestData::~TestData() { + if(fInfo != nullptr) { + delete fInfo; + } + if(fCurrSettings != nullptr) { + delete fCurrSettings; + } + if(fCurrCase != nullptr) { + delete fCurrCase; + } +} + +const char * TestData::getName() const +{ + return name; +} + + + +RBTestData::RBTestData(const char* testName) +: TestData(testName), +fData(nullptr), +fHeaders(nullptr), +fSettings(nullptr), +fCases(nullptr) +{ +} + +RBTestData::RBTestData(UResourceBundle *data, UResourceBundle *headers, UErrorCode& status) +: TestData(ures_getKey(data)), +fData(data), +fHeaders(headers), +fSettings(nullptr), +fCases(nullptr) +{ + UErrorCode intStatus = U_ZERO_ERROR; + UResourceBundle *currHeaders = 
ures_getByKey(data, "Headers", nullptr, &intStatus); + if(intStatus == U_ZERO_ERROR) { + ures_close(fHeaders); + fHeaders = currHeaders; + } else { + intStatus = U_ZERO_ERROR; + } + fSettings = ures_getByKey(data, "Settings", nullptr, &intStatus); + fSettingsSize = ures_getSize(fSettings); + UResourceBundle *info = ures_getByKey(data, "Info", nullptr, &intStatus); + if(U_SUCCESS(intStatus)) { + fInfo = new RBDataMap(info, status); + } else { + intStatus = U_ZERO_ERROR; + } + fCases = ures_getByKey(data, "Cases", nullptr, &status); + fCasesSize = ures_getSize(fCases); + + ures_close(info); +} + + +RBTestData::~RBTestData() +{ + ures_close(fData); + ures_close(fHeaders); + ures_close(fSettings); + ures_close(fCases); +} + +UBool RBTestData::getInfo(const DataMap *& info, UErrorCode &/*status*/) const +{ + if(fInfo) { + info = fInfo; + return true; + } else { + info = nullptr; + return false; + } +} + +UBool RBTestData::nextSettings(const DataMap *& settings, UErrorCode &status) +{ + UErrorCode intStatus = U_ZERO_ERROR; + UResourceBundle *data = ures_getByIndex(fSettings, fCurrentSettings++, nullptr, &intStatus); + if(U_SUCCESS(intStatus)) { + // reset the cases iterator + fCurrentCase = 0; + if(fCurrSettings == nullptr) { + fCurrSettings = new RBDataMap(data, status); + } else { + ((RBDataMap *)fCurrSettings)->init(data, status); + } + ures_close(data); + settings = fCurrSettings; + return true; + } else { + settings = nullptr; + return false; + } +} + +UBool RBTestData::nextCase(const DataMap *& nextCase, UErrorCode &status) +{ + UErrorCode intStatus = U_ZERO_ERROR; + UResourceBundle *currCase = ures_getByIndex(fCases, fCurrentCase++, nullptr, &intStatus); + if(U_SUCCESS(intStatus)) { + if(fCurrCase == nullptr) { + fCurrCase = new RBDataMap(fHeaders, currCase, status); + } else { + ((RBDataMap *)fCurrCase)->init(fHeaders, currCase, status); + } + ures_close(currCase); + nextCase = fCurrCase; + return true; + } else { + nextCase = nullptr; + return false; + } +} + + 
diff --git a/intl/icu/source/tools/ctestfw/tstdtmod.cpp b/intl/icu/source/tools/ctestfw/tstdtmod.cpp new file mode 100644 index 0000000000..3ebe22466e --- /dev/null +++ b/intl/icu/source/tools/ctestfw/tstdtmod.cpp @@ -0,0 +1,293 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/******************************************************************** + * COPYRIGHT: + * Copyright (c) 2002-2014, International Business Machines Corporation and + * others. All Rights Reserved. + ********************************************************************/ + +/* Created by weiv 05/09/2002 */ + +#include <stdarg.h> + +#include "unicode/tstdtmod.h" +#include "cmemory.h" +#include <stdio.h> +#include "cstr.h" +#include "cstring.h" + +TestLog::~TestLog() {} + +IcuTestErrorCode::~IcuTestErrorCode() { + // Safe because our errlog() does not throw exceptions. + if(isFailure()) { + errlog(false, u"destructor: expected success", nullptr); + } +} + +UBool IcuTestErrorCode::errIfFailureAndReset() { + if(isFailure()) { + errlog(false, u"expected success", nullptr); + reset(); + return true; + } else { + reset(); + return false; + } +} + +UBool IcuTestErrorCode::errIfFailureAndReset(const char *fmt, ...) { + if(isFailure()) { + char buffer[4000]; + va_list ap; + va_start(ap, fmt); + vsnprintf(buffer, sizeof(buffer), fmt, ap); + va_end(ap); + errlog(false, u"expected success", buffer); + reset(); + return true; + } else { + reset(); + return false; + } +} + +UBool IcuTestErrorCode::errDataIfFailureAndReset() { + if(isFailure()) { + errlog(true, u"data: expected success", nullptr); + reset(); + return true; + } else { + reset(); + return false; + } +} + +UBool IcuTestErrorCode::errDataIfFailureAndReset(const char *fmt, ...) 
{ + if(isFailure()) { + char buffer[4000]; + va_list ap; + va_start(ap, fmt); + vsnprintf(buffer, sizeof(buffer), fmt, ap); + va_end(ap); + errlog(true, u"data: expected success", buffer); + reset(); + return true; + } else { + reset(); + return false; + } +} + +UBool IcuTestErrorCode::expectErrorAndReset(UErrorCode expectedError) { + if(get() != expectedError) { + errlog(false, UnicodeString(u"expected: ") + u_errorName(expectedError), nullptr); + } + UBool retval = isFailure(); + reset(); + return retval; +} + +UBool IcuTestErrorCode::expectErrorAndReset(UErrorCode expectedError, const char *fmt, ...) { + if(get() != expectedError) { + char buffer[4000]; + va_list ap; + va_start(ap, fmt); + vsnprintf(buffer, sizeof(buffer), fmt, ap); + va_end(ap); + errlog(false, UnicodeString(u"expected: ") + u_errorName(expectedError), buffer); + } + UBool retval = isFailure(); + reset(); + return retval; +} + +void IcuTestErrorCode::setScope(const char* message) { + scopeMessage.remove().append({ message, -1, US_INV }); +} + +void IcuTestErrorCode::setScope(const UnicodeString& message) { + scopeMessage = message; +} + +void IcuTestErrorCode::handleFailure() const { + errlog(false, u"(handleFailure)", nullptr); +} + +void IcuTestErrorCode::errlog(UBool dataErr, const UnicodeString& mainMessage, const char* extraMessage) const { + UnicodeString msg(testName, -1, US_INV); + msg.append(u' ').append(mainMessage); + msg.append(u" but got error: ").append(UnicodeString(errorName(), -1, US_INV)); + + if (!scopeMessage.isEmpty()) { + msg.append(u" scope: ").append(scopeMessage); + } + + if (extraMessage != nullptr) { + msg.append(u" - ").append(UnicodeString(extraMessage, -1, US_INV)); + } + + if (dataErr || get() == U_MISSING_RESOURCE_ERROR || get() == U_FILE_ACCESS_ERROR) { + testClass.dataerrln(msg); + } else { + testClass.errln(msg); + } +} + +TestDataModule *TestDataModule::getTestDataModule(const char* name, TestLog& log, UErrorCode &status) +{ + if(U_FAILURE(status)) { + return 
nullptr; + } + TestDataModule *result = nullptr; + + // TODO: probe for resource bundle and then for XML. + // According to that, construct an appropriate driver object + + result = new RBTestDataModule(name, log, status); + if(U_SUCCESS(status)) { + return result; + } else { + delete result; + return nullptr; + } +} + +TestDataModule::TestDataModule(const char* name, TestLog& log, UErrorCode& /*status*/) +: testName(name), +fInfo(nullptr), +fLog(log) +{ +} + +TestDataModule::~TestDataModule() { + if(fInfo != nullptr) { + delete fInfo; + } +} + +const char * TestDataModule::getName() const +{ + return testName; +} + + + +RBTestDataModule::~RBTestDataModule() +{ + ures_close(fTestData); + ures_close(fModuleBundle); + ures_close(fInfoRB); + uprv_free(tdpath); +} + +RBTestDataModule::RBTestDataModule(const char* name, TestLog& log, UErrorCode& status) +: TestDataModule(name, log, status), + fModuleBundle(nullptr), + fTestData(nullptr), + fInfoRB(nullptr), + tdpath(nullptr) +{ + fNumberOfTests = 0; + fDataTestValid = true; + fModuleBundle = getTestBundle(name, status); + if(fDataTestValid) { + fTestData = ures_getByKey(fModuleBundle, "TestData", nullptr, &status); + fNumberOfTests = ures_getSize(fTestData); + fInfoRB = ures_getByKey(fModuleBundle, "Info", nullptr, &status); + if(status != U_ZERO_ERROR) { + log.errln(UNICODE_STRING_SIMPLE("Unable to initialize test data - missing mandatory description resources!")); + fDataTestValid = false; + } else { + fInfo = new RBDataMap(fInfoRB, status); + } + } +} + +UBool RBTestDataModule::getInfo(const DataMap *& info, UErrorCode &/*status*/) const +{ + info = fInfo; + if(fInfo) { + return true; + } else { + return false; + } +} + +TestData* RBTestDataModule::createTestData(int32_t index, UErrorCode &status) const +{ + TestData *result = nullptr; + UErrorCode intStatus = U_ZERO_ERROR; + + if(fDataTestValid == true) { + // Both of these resources get adopted by a TestData object. 
+ UResourceBundle *DataFillIn = ures_getByIndex(fTestData, index, nullptr, &status); + UResourceBundle *headers = ures_getByKey(fInfoRB, "Headers", nullptr, &intStatus); + + if(U_SUCCESS(status)) { + result = new RBTestData(DataFillIn, headers, status); + + if(U_SUCCESS(status)) { + return result; + } else { + delete result; + } + } else { + ures_close(DataFillIn); + ures_close(headers); + } + } else { + status = U_MISSING_RESOURCE_ERROR; + } + return nullptr; +} + +TestData* RBTestDataModule::createTestData(const char* name, UErrorCode &status) const +{ + TestData *result = nullptr; + UErrorCode intStatus = U_ZERO_ERROR; + + if(fDataTestValid == true) { + // Both of these resources get adopted by a TestData object. + UResourceBundle *DataFillIn = ures_getByKey(fTestData, name, nullptr, &status); + UResourceBundle *headers = ures_getByKey(fInfoRB, "Headers", nullptr, &intStatus); + + if(U_SUCCESS(status)) { + result = new RBTestData(DataFillIn, headers, status); + if(U_SUCCESS(status)) { + return result; + } else { + delete result; + } + } else { + ures_close(DataFillIn); + ures_close(headers); + } + } else { + status = U_MISSING_RESOURCE_ERROR; + } + return nullptr; +} + + + +//Get test data from ResourceBundles +UResourceBundle* +RBTestDataModule::getTestBundle(const char* bundleName, UErrorCode &status) +{ + if(U_SUCCESS(status)) { + UResourceBundle *testBundle = nullptr; + const char* icu_data = fLog.getTestDataPath(status); + if (testBundle == nullptr) { + testBundle = ures_openDirect(icu_data, bundleName, &status); + if (status != U_ZERO_ERROR) { + fLog.dataerrln(UNICODE_STRING_SIMPLE("Could not load test data from resourcebundle: ") + UnicodeString(bundleName, -1, US_INV)); + fDataTestValid = false; + } + } + return testBundle; + } else { + return nullptr; + } +} + diff --git a/intl/icu/source/tools/ctestfw/ucln_ct.c b/intl/icu/source/tools/ctestfw/ucln_ct.c new file mode 100644 index 0000000000..a4d1ce86e8 --- /dev/null +++ 
b/intl/icu/source/tools/ctestfw/ucln_ct.c @@ -0,0 +1,19 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/******************************************************************** + * COPYRIGHT: + * Copyright (c) 2007-2013, International Business Machines Corporation and + * others. All Rights Reserved. + ********************************************************************/ + + +/** Auto-client **/ +#define UCLN_TYPE UCLN_CTESTFW +#include "ucln_imp.h" + +int uprv_dummyFunction_CT(void); +int uprv_dummyFunction_CT(void) +{ + /* this is here to prevent the compiler from complaining about an empty file */ + return 0; +} diff --git a/intl/icu/source/tools/ctestfw/unicode/ctest.h b/intl/icu/source/tools/ctestfw/unicode/ctest.h new file mode 100644 index 0000000000..da75be55b2 --- /dev/null +++ b/intl/icu/source/tools/ctestfw/unicode/ctest.h @@ -0,0 +1,321 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* + ******************************************************************************** + * + * Copyright (C) 1996-2013, International Business Machines + * Corporation and others. All Rights Reserved. + * + ******************************************************************************** + */ + +#ifndef CTEST_H +#define CTEST_H + +#include "unicode/testtype.h" +#include "unicode/utrace.h" + + +/* prototypes *********************************/ + +U_CDECL_BEGIN +typedef void (U_CALLCONV *TestFunctionPtr)(void); +typedef int (U_CALLCONV *ArgHandlerPtr)(int arg, int argc, const char* const argv[], void *context); +typedef struct TestNode TestNode; +U_CDECL_END + +/** + * This is use to set or get the option value for REPEAT_TESTS. + * Use with set/getTestOption(). + * + * @internal + */ +#define REPEAT_TESTS_OPTION 1 + +/** + * This is use to set or get the option value for VERBOSITY. 
+ * When option is set to zero, log_verbose() messages are disabled. + * Otherwise nonzero to see log_verbose() messages. + * Use with set/getTestOption(). + * + * @internal + */ +#define VERBOSITY_OPTION 2 + +/** + * This is used to set or get the option value for ERR_MSG. + * Use with set/getTestOption(). + * + * @internal + */ +#define ERR_MSG_OPTION 3 + +/** + * This is used to set or get the option value for QUICK. + * When option is zero, disable some of the slower tests. + * Otherwise nonzero to run the slower tests. + * Use with set/getTestOption(). + * + * @internal + */ +#define QUICK_OPTION 4 + +/** + * This is used to set or get the option value for WARN_ON_MISSING_DATA. + * When option is nonzero, warn on missing data. + * Otherwise, errors are propagated when data is not available. + * Affects the behavior of log_data_err. + * Use with set/getTestOption(). + * + * @see log_data_err + * @internal + */ +#define WARN_ON_MISSING_DATA_OPTION 5 + +/** + * This is used to set or get the option value for ICU_TRACE. + * The ICU tracing level is set by a command line option. + * Use with set/getTestOption(). + * + * @internal + */ +#define ICU_TRACE_OPTION 6 + +/** + * This is used to set or get the option value for WRITE_GOLDEN_DATA. + * Set to 1 to overwrite golden data files, such as those in testdata/ucptrie. + * Use with set/getTestOption(). + */ +#define WRITE_GOLDEN_DATA_OPTION 7 + +/** + * Maximum amount of memory uprv_malloc should allocate before returning NULL. + * + * @internal + */ +extern T_CTEST_EXPORT_API size_t MAX_MEMORY_ALLOCATION; + +/** + * If memory tracing was enabled, contains the number of unfreed allocations. + * + * @internal + */ +extern T_CTEST_EXPORT_API int32_t ALLOCATION_COUNT; + +/** + * Pass to setTestOption to decrement the test option value. + * + * @internal + */ +#define DECREMENT_OPTION_VALUE -99 + +/** + * Gets the test option set on commandline. 
+ * + * @param testOption macro definition for the individual test option + * @return value of test option, zero if option is not set or off + * @internal Internal APIs for testing purpose only + */ +T_CTEST_API int32_t T_CTEST_EXPORT2 +getTestOption ( int32_t testOption ); + +/** + * Sets the test option with value given on commandline. + * + * @param testOption macro definition for the individual test option + * @param value to set the test option to + * @internal Internal APIs for testing purpose only + */ +T_CTEST_API void T_CTEST_EXPORT2 +setTestOption ( int32_t testOption, int32_t value); + +/** + * Show the names of all nodes. + * + * @param root Subtree of tests. + * @internal Internal APIs for testing purpose only + */ +T_CTEST_API void T_CTEST_EXPORT2 +showTests ( const TestNode *root); + +/** + * Run a subtree of tests. + * + * @param root Subtree of tests. + * @internal Internal APIs for testing purpose only + */ +T_CTEST_API void T_CTEST_EXPORT2 +runTests ( const TestNode* root); + +/** + * Add a test to the subtree. + * Example usage: + * <PRE> + * TestNode* root=NULL; + * addTest(&root, &mytest, "/a/b/mytest" ); + * </PRE> + * @param root Pointer to the root pointer. + * @param test Pointer to 'void function(void)' for actual test. + * @param path Path from root under which test will be placed. Ex. '/a/b/mytest' + * @internal Internal APIs for testing purpose only + */ +T_CTEST_API void T_CTEST_EXPORT2 +addTest(TestNode** root, + TestFunctionPtr test, + const char *path); + +/** + * Clean up any allocated memory. + * Conditions for calling this function are the same as u_cleanup(). + * @see u_cleanup + * @internal Internal APIs for testing purpose only + */ +T_CTEST_API void T_CTEST_EXPORT2 +cleanUpTestTree(TestNode *tn); + +/** + * Retrieve a specific subtest. (subtree). + * + * @param root Pointer to the root. + * @param path Path relative to the root, Ex. '/a/b' + * @return The subtest, or NULL on failure. 
+ * @internal Internal APIs for testing purpose only + */ +T_CTEST_API const TestNode* T_CTEST_EXPORT2 +getTest(const TestNode* root, + const char *path); + + +/** + * Log an error message. (printf style) + * @param pattern printf-style format string + * @internal Internal APIs for testing purpose only + */ +T_CTEST_API void T_CTEST_EXPORT2 +log_err(const char* pattern, ...); + +T_CTEST_API void T_CTEST_EXPORT2 +log_err_status(UErrorCode status, const char* pattern, ...); +/** + * Log an informational message. (printf style) + * @param pattern printf-style format string + * @internal Internal APIs for testing purpose only + */ +T_CTEST_API void T_CTEST_EXPORT2 +log_info(const char* pattern, ...); + +/** + * Log an informational message. (vprintf style) + * @param prefix a string that is output before the pattern and without formatting + * @param pattern printf-style format string + * @param ap variable-arguments list + * @internal Internal APIs for testing purpose only + */ +T_CTEST_API void T_CTEST_EXPORT2 +vlog_info(const char *prefix, const char *pattern, va_list ap); + +/** + * Log a verbose informational message. (printf style) + * This message will only appear if the global VERBOSITY is nonzero + * @param pattern printf-style format string + * @internal Internal APIs for testing purpose only + */ +T_CTEST_API void T_CTEST_EXPORT2 +log_verbose(const char* pattern, ...); + +/** + * Log an error message concerning missing data. (printf style) + * If WARN_ON_MISSING_DATA is nonzero, this will cause a log_info (warning) to be + * printed, but if it is zero this will produce an error (log_err). + * @param pattern printf-style format string + * @internal Internal APIs for testing purpose only + */ +T_CTEST_API void T_CTEST_EXPORT2 +log_data_err(const char *pattern, ...); + +/** + * Log a known issue. + * @param ticket ticket number such as "ICU-12345" for ICU tickets or "CLDR-6636" for CLDR tickets. + * @param fmt ... sprintf-style format, optional message. 
can be NULL. + * @return true if known issue test should be skipped, false if it should be run + */ +T_CTEST_API UBool +T_CTEST_EXPORT2 +log_knownIssue(const char *ticket, const char *fmt, ...); + +/** + * Initialize the variables above. This allows the test to set up accordingly + * before running the tests. + * This must be called before runTests. + */ +T_CTEST_API int T_CTEST_EXPORT2 +initArgs( int argc, const char* const argv[], ArgHandlerPtr argHandler, void *context); + +/** + * Processes the command line arguments. + * This is a sample implementation + * <PRE>Usage: %s [ -l ] [ -v ] [ -? ] [ /path/to/test ] + * -l List only, do not run\ + * -v turn OFF verbosity + * -? print this message</PRE> + * @param root Testnode root with tests already attached to it + * @param argv argument list from main (stdio.h) + * @param argc argument list count from main (stdio.h) + * @return positive for error count, 0 for success, negative for illegal argument + * @internal Internal APIs for testing purpose only + */ +T_CTEST_API int T_CTEST_EXPORT2 +runTestRequest(const TestNode* root, + int argc, + const char* const argv[]); + + +T_CTEST_API const char* T_CTEST_EXPORT2 +getTestName(void); + +/** + * Append a time delta to str if it is significant (>5 ms) otherwise no change + * @param delta a delta in millis + * @param str a string to append to. + */ +T_CTEST_API void T_CTEST_EXPORT2 +str_timeDelta(char *str, UDate delta); + + +/* ======== XML (JUnit output) ========= */ + +/** + * Set the filename for the XML output. + * @param fileName file name. Caller must retain storage. + * @return 0 on success, 1 on failure. + */ +T_CTEST_API int32_t T_CTEST_EXPORT2 +ctest_xml_setFileName(const char *fileName); + + +/** + * Init XML subsystem. Call ctest_xml_setFileName first + * @param rootName the test root name to be written + * @return 0 on success, 1 on failure. 
+ */ +T_CTEST_API int32_t T_CTEST_EXPORT2 +ctest_xml_init(const char *rootName); + + +/** + * Finish the XML output started by ctest_xml_init. + * @return 0 on success, 1 on failure. + */ +T_CTEST_API int32_t T_CTEST_EXPORT2 +ctest_xml_fini(void); + + +/** + * Report a test case. + * @return 0 on success, 1 on failure. + */ +T_CTEST_API int32_t +T_CTEST_EXPORT2 +ctest_xml_testcase(const char *classname, const char *name, const char *time, const char *failMsg); + +#endif diff --git a/intl/icu/source/tools/ctestfw/unicode/datamap.h b/intl/icu/source/tools/ctestfw/unicode/datamap.h new file mode 100644 index 0000000000..b4f7f82fd6 --- /dev/null +++ b/intl/icu/source/tools/ctestfw/unicode/datamap.h @@ -0,0 +1,140 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/******************************************************************** + * COPYRIGHT: + * Copyright (c) 2002-2006, International Business Machines Corporation and + * others. All Rights Reserved. + ********************************************************************/ + +/* Created by weiv 05/09/2002 */ + +#ifndef U_TESTFW_DATAMAP +#define U_TESTFW_DATAMAP + +#include "unicode/resbund.h" +#include "unicode/testtype.h" + + + +U_NAMESPACE_BEGIN +class Hashtable; +U_NAMESPACE_END + +/** Holder of test data and settings. Allows addressing of items by name. + * For test cases, names are defined in the "Headers" section. For settings + * and info data, names are keys in data. Currently, we return scalar strings + * and integers and arrays of strings and integers. Arrays should be disposed + * of by the user. + */ +class T_CTEST_EXPORT_API DataMap { +public: + virtual ~DataMap(); + +protected: + DataMap(); + int32_t utoi(const UnicodeString &s) const; + + +public: + /** get the string from the DataMap. Addressed by name + * @param key name of the data field. 
+ * @return a string containing the data + */ + virtual const UnicodeString getString(const char* key, UErrorCode &status) const = 0; + + /** get an integer from the DataMap. Addressed by name. + * Parses a bundle string into an integer. + * @param key name of the data field. + * @return an integer containing the data + */ + virtual int32_t getInt(const char* key, UErrorCode &status) const = 0; + + /** + * Get a signed integer without runtime parsing. + * @param key name of the data field. + * @param status UErrorCode in/out parameter + * @return the integer + */ + virtual int32_t getInt28(const char* key, UErrorCode &status) const = 0; + + /** + * Get an unsigned integer without runtime parsing. + * @param key name of the data field. + * @param status UErrorCode in/out parameter + * @return the integer + */ + virtual uint32_t getUInt28(const char* key, UErrorCode &status) const = 0; + + /** + * Get a vector of integers without runtime parsing. + * @param length output parameter for the length of the vector + * @param key name of the data field. + * @param status UErrorCode in/out parameter + * @return the integer vector, do not delete + */ + virtual const int32_t *getIntVector(int32_t &length, const char *key, UErrorCode &status) const = 0; + + /** + * Get binary data without runtime parsing. + * @param length output parameter for the length of the data + * @param key name of the data field. + * @param status UErrorCode in/out parameter + * @return the binary data, do not delete + */ + virtual const uint8_t *getBinary(int32_t &length, const char *key, UErrorCode &status) const = 0; + + /** get an array of strings from the DataMap. Addressed by name. + * The user must dispose of it after usage, using delete. + * @param key name of the data field. + * @return a string array containing the data + */ + virtual const UnicodeString* getStringArray(int32_t& count, const char* key, UErrorCode &status) const = 0; + + /** get an array of integers from the DataMap. 
Addressed by name. + * The user must dispose of it after usage, using delete. + * @param key name of the data field. + * @return an integer array containing the data + */ + virtual const int32_t* getIntArray(int32_t& count, const char* key, UErrorCode &status) const = 0; + + // ... etc ... +}; + +// This one is already concrete - it is going to be instantiated from +// concrete data by TestData children... +class T_CTEST_EXPORT_API RBDataMap : public DataMap{ +private: + Hashtable *fData; + +public: + virtual ~RBDataMap(); + +public: + RBDataMap(); + + RBDataMap(UResourceBundle *data, UErrorCode &status); + RBDataMap(UResourceBundle *headers, UResourceBundle *data, UErrorCode &status); + +public: + void init(UResourceBundle *data, UErrorCode &status); + void init(UResourceBundle *headers, UResourceBundle *data, UErrorCode &status); + + virtual const ResourceBundle *getItem(const char* key, UErrorCode &status) const; + + virtual const UnicodeString getString(const char* key, UErrorCode &status) const override; + virtual int32_t getInt28(const char* key, UErrorCode &status) const override; + virtual uint32_t getUInt28(const char* key, UErrorCode &status) const override; + virtual const int32_t *getIntVector(int32_t &length, const char *key, UErrorCode &status) const override; + virtual const uint8_t *getBinary(int32_t &length, const char *key, UErrorCode &status) const override; + + virtual int32_t getInt(const char* key, UErrorCode &status) const override; + + virtual const UnicodeString* getStringArray(int32_t& count, const char* key, UErrorCode &status) const override; + virtual const int32_t* getIntArray(int32_t& count, const char* key, UErrorCode &status) const override; + + // ... etc ... 
+}; + + +#endif + diff --git a/intl/icu/source/tools/ctestfw/unicode/testdata.h b/intl/icu/source/tools/ctestfw/unicode/testdata.h new file mode 100644 index 0000000000..77db9ceaf1 --- /dev/null +++ b/intl/icu/source/tools/ctestfw/unicode/testdata.h @@ -0,0 +1,113 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/******************************************************************** + * COPYRIGHT: + * Copyright (c) 2002-2006, International Business Machines Corporation and + * others. All Rights Reserved. + ********************************************************************/ + +/* Created by weiv 05/09/2002 */ + +/* Base class for data driven tests */ + +#ifndef U_TESTFW_TESTDATA +#define U_TESTFW_TESTDATA + +#include "unicode/tstdtmod.h" +#include "unicode/datamap.h" + + + /** This is the class that abstracts one of the tests in a data file + * It is usually instantiated using TestDataModule::CreateTestData method + * This class provides two important methods: nextSettings and nextCase + * Usually, one walks through all settings and executes all cases for + * each setting. Each call to nextSettings resets the cases iterator. + * Individual test cases have to have the same number of fields as the + * number of entries in headers. Default headers can be specified in + * the TestDataModule info section. The default headers will be overridden + * by per-test headers. 
+ * Example: + * const DataMap *settings = nullptr; + * const DataMap *cases = nullptr; + * while(nextSettings(settings, status)) { + * // set settings for the subtest + * while(nextCase(cases, status)) { + * // process testcase + * } + * } + */ + +class T_CTEST_EXPORT_API TestData { + const char* name; + +protected: + DataMap *fInfo; + DataMap *fCurrSettings; + DataMap *fCurrCase; + int32_t fSettingsSize; + int32_t fCasesSize; + int32_t fCurrentSettings; + int32_t fCurrentCase; + /** constructor - don't use */ + TestData(const char* name); + +public: + virtual ~TestData(); + + const char* getName() const; + + /** Get a pointer to an object owned DataMap that contains more information on this + * TestData object. + * The usual field is "Description". + * @param info pass in a const DataMap pointer. If no info, it will be set to nullptr + */ + virtual UBool getInfo(const DataMap *& info, UErrorCode &status) const = 0; + + /** Gets the next set of settings for the test. Resets the cases iterator. + * DataMap is owned by the object and should not be deleted. + * @param settings a DataMap pointer provided by the user. Will be nullptr if + * no more settings are available. + * @param status for reporting unexpected errors. + * @return A boolean, true if there are settings, false if there are no more + * settings. + */ + virtual UBool nextSettings(const DataMap *& settings, UErrorCode &status) = 0; + + /** Gets the next test case. + * DataMap is owned by the object and should not be deleted. + * @param data a DataMap pointer provided by the user. Will be nullptr if + * no more cases are available. + * @param status for reporting unexpected errors. + * @return A boolean, true if there are cases, false if there are no more + * cases. 
+ */ + virtual UBool nextCase(const DataMap *& data, UErrorCode &status) = 0; +}; + +// implementation of TestData that uses resource bundles + +class T_CTEST_EXPORT_API RBTestData : public TestData { + UResourceBundle *fData; + UResourceBundle *fHeaders; + UResourceBundle *fSettings; + UResourceBundle *fCases; + +public: + RBTestData(const char* name); + RBTestData(UResourceBundle *data, UResourceBundle *headers, UErrorCode& status); +private: +// RBTestData() {}; +// RBTestData(const RBTestData& original) {}; + RBTestData& operator=(const RBTestData& /*original*/); + +public: + virtual ~RBTestData(); + + virtual UBool getInfo(const DataMap *& info, UErrorCode &status) const override; + + virtual UBool nextSettings(const DataMap *& settings, UErrorCode &status) override; + virtual UBool nextCase(const DataMap *& nextCase, UErrorCode &status) override; +}; + +#endif + diff --git a/intl/icu/source/tools/ctestfw/unicode/testlog.h b/intl/icu/source/tools/ctestfw/unicode/testlog.h new file mode 100644 index 0000000000..a7ffbc6084 --- /dev/null +++ b/intl/icu/source/tools/ctestfw/unicode/testlog.h @@ -0,0 +1,62 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/******************************************************************** + * COPYRIGHT: + * Copyright (c) 2004-2010, International Business Machines Corporation and + * others. All Rights Reserved. + ********************************************************************/ + +/* Created by grhoten 03/17/2004 */ + +/* Base class for data driven tests */ + +#ifndef U_TESTFW_TESTLOG +#define U_TESTFW_TESTLOG + +#include "unicode/errorcode.h" +#include "unicode/unistr.h" +#include "unicode/testtype.h" + +/** Facilitates internal logging of data driven test service + * It would be interesting to develop this into a full + * fledged control system as in Java. 
+ */ +class T_CTEST_EXPORT_API TestLog { +public: + virtual ~TestLog(); + virtual void errln( const UnicodeString &message ) = 0; + virtual void logln( const UnicodeString &message ) = 0; + virtual void dataerrln( const UnicodeString &message ) = 0; + virtual const char* getTestDataPath(UErrorCode& err) = 0; +}; + +class T_CTEST_EXPORT_API IcuTestErrorCode : public ErrorCode { +public: + IcuTestErrorCode(TestLog &callingTestClass, const char *callingTestName) + : testClass(callingTestClass), testName(callingTestName), scopeMessage() {} + virtual ~IcuTestErrorCode(); + + // Returns true if isFailure(). + UBool errIfFailureAndReset(); + UBool errIfFailureAndReset(const char *fmt, ...); + UBool errDataIfFailureAndReset(); + UBool errDataIfFailureAndReset(const char *fmt, ...); + UBool expectErrorAndReset(UErrorCode expectedError); + UBool expectErrorAndReset(UErrorCode expectedError, const char *fmt, ...); + + /** Sets an additional message string to be appended to failure output. */ + void setScope(const char* message); + void setScope(const UnicodeString& message); + +protected: + virtual void handleFailure() const override; + +private: + TestLog &testClass; + const char *const testName; + UnicodeString scopeMessage; + + void errlog(UBool dataErr, const UnicodeString& mainMessage, const char* extraMessage) const; +}; + +#endif diff --git a/intl/icu/source/tools/ctestfw/unicode/testtype.h b/intl/icu/source/tools/ctestfw/unicode/testtype.h new file mode 100644 index 0000000000..a5c70d577a --- /dev/null +++ b/intl/icu/source/tools/ctestfw/unicode/testtype.h @@ -0,0 +1,40 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* + ***************************************************************************************** + * Copyright (C) 2004-2011, International Business Machines + * Corporation and others. All Rights Reserved. 
+ ***************************************************************************************** + */ + +#include "unicode/utypes.h" + +/*Deals with imports and exports of the dynamic library*/ +#if !defined(U_STATIC_IMPLEMENTATION) + #define T_CTEST_EXPORT U_EXPORT + #define T_CTEST_IMPORT U_IMPORT +#else + #define T_CTEST_EXPORT + #define T_CTEST_IMPORT +#endif + +#if defined(_MSC_VER) +#define T_CTEST_EXPORT2 __cdecl +#else +#define T_CTEST_EXPORT2 +#endif + +#ifdef __cplusplus + #define C_CTEST_API extern "C" + U_NAMESPACE_USE +#else + #define C_CTEST_API +#endif + +#ifdef T_CTEST_IMPLEMENTATION + #define T_CTEST_API C_CTEST_API T_CTEST_EXPORT + #define T_CTEST_EXPORT_API T_CTEST_EXPORT +#else + #define T_CTEST_API C_CTEST_API T_CTEST_IMPORT + #define T_CTEST_EXPORT_API T_CTEST_IMPORT +#endif diff --git a/intl/icu/source/tools/ctestfw/unicode/tstdtmod.h b/intl/icu/source/tools/ctestfw/unicode/tstdtmod.h new file mode 100644 index 0000000000..fb2f19631d --- /dev/null +++ b/intl/icu/source/tools/ctestfw/unicode/tstdtmod.h @@ -0,0 +1,117 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/******************************************************************** + * COPYRIGHT: + * Copyright (c) 2002-2005, International Business Machines Corporation and + * others. All Rights Reserved. + ********************************************************************/ + +/* Created by weiv 05/09/2002 */ + +/* Base class for data driven tests */ + +#ifndef U_TESTFW_TESTMODULE +#define U_TESTFW_TESTMODULE + +#include "unicode/unistr.h" +#include "unicode/ures.h" +#include "unicode/testtype.h" +#include "unicode/testdata.h" +#include "unicode/datamap.h" +#include "unicode/testlog.h" + + +/* This class abstracts the actual organization of the + * data for data driven tests + */ + + +class DataMap; +class TestData; + + +/** Main data driven test class. Corresponds to one named data + * unit (such as a resource bundle. 
It is instantiated using + * a factory method getTestDataModule + */ +class T_CTEST_EXPORT_API TestDataModule { + const char* testName; + +protected: + DataMap *fInfo; + TestLog& fLog; + +public: + /** Factory method. + * @param name name of the test module. Usually name of a resource bundle or an XML file + * @param log a logging class, used for internal error reporting. + * @param status if something goes wrong, status will be set + * @return a TestDataModule object. Use it to get test data. + */ + static TestDataModule *getTestDataModule(const char* name, TestLog& log, UErrorCode &status); + virtual ~TestDataModule(); + +protected: + TestDataModule(const char* name, TestLog& log, UErrorCode& status); + +public: + /** Name of this TestData module. + * @return a name + */ + const char * getName() const; + + /** Get a pointer to an object owned DataMap that contains more information on this module. + * Usual fields are "Description", "LongDescription", "Settings". Also, if containing a + * field "Headers" these will be used as the default headers, so that you don't have to + * specify per test headers. + * @param info pass in a const DataMap pointer. If no info, it will be set to nullptr + */ + virtual UBool getInfo(const DataMap *& info, UErrorCode &status) const = 0; + + /** Create a test data object from an index. Helpful for integrating tests with current + * intltest framework which addresses the tests by index. + * @param index index of the test to be instantiated + * @return an instantiated TestData object, ready to provide settings and cases for + * the tests. + */ + virtual TestData* createTestData(int32_t index, UErrorCode &status) const = 0; + + /** Create a test data object from a name. + * @param name name of the test to be instantiated + * @return an instantiated TestData object, ready to provide settings and cases for + * the tests. 
+ */ + virtual TestData* createTestData(const char* name, UErrorCode &status) const = 0; +}; + +class T_CTEST_EXPORT_API RBTestDataModule : public TestDataModule { +public: + virtual ~RBTestDataModule(); + +public: + RBTestDataModule(const char* name, TestLog& log, UErrorCode& status); + +public: + virtual UBool getInfo(const DataMap *& info, UErrorCode &status) const override; + + virtual TestData* createTestData(int32_t index, UErrorCode &status) const override; + virtual TestData* createTestData(const char* name, UErrorCode &status) const override; + +private: + UResourceBundle *getTestBundle(const char* bundleName, UErrorCode &status); + +private: + UResourceBundle *fModuleBundle; + UResourceBundle *fTestData; + UResourceBundle *fInfoRB; + UBool fDataTestValid; + char *tdpath; + +/* const char* fTestName;*/ /* See name */ + int32_t fNumberOfTests; + +}; + + +#endif + diff --git a/intl/icu/source/tools/ctestfw/unicode/uperf.h b/intl/icu/source/tools/ctestfw/unicode/uperf.h new file mode 100644 index 0000000000..e578c46694 --- /dev/null +++ b/intl/icu/source/tools/ctestfw/unicode/uperf.h @@ -0,0 +1,200 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +********************************************************************** +* Copyright (c) 2002-2014, International Business Machines +* Corporation and others. All Rights Reserved. +********************************************************************** +*/ +#ifndef _UPERF_H +#define _UPERF_H + +#include "unicode/utypes.h" +#include "unicode/unistr.h" +#include "unicode/ustring.h" + +#include "unicode/testtype.h" +#include "unicode/utimer.h" +#include "ucbuf.h" + +// Forward declarations from uoptions.h. +struct UOption; +typedef struct UOption UOption; + +#if !UCONFIG_NO_CONVERSION + +U_NAMESPACE_USE +// Use the TESTCASE macro in subclasses of UPerfTest. 
Define the +// runIndexedTest method in this fashion: +// +//| void MyTest::runIndexedTest(int32_t index, UBool exec, +//| const char* &name, char* /*par*/) { +//| switch (index) { +//| TESTCASE(0,TestSomething); +//| TESTCASE(1,TestSomethingElse); +//| TESTCASE(2,TestAnotherThing); +//| default: +//| name = ""; +//| break; +//| } +//| return nullptr; +//| } +#define TESTCASE(id,test) \ + case id: \ + name = #test; \ + if (exec) { \ + return test(); \ + } \ + break + +// More convenient macros. These allow easy reordering of the test cases. +// Copied from intltest.h, and adjusted to not logln() but return a UPerfFunction. +// +//| void MyTest::runIndexedTest(int32_t index, UBool exec, +//| const char* &name, char* /*par*/) { +//| TESTCASE_AUTO_BEGIN; +//| TESTCASE_AUTO(TestSomething); +//| TESTCASE_AUTO(TestSomethingElse); +//| TESTCASE_AUTO(TestAnotherThing); +//| TESTCASE_AUTO_END; +//| return nullptr; +//| } +#define TESTCASE_AUTO_BEGIN \ + for(;;) { \ + int32_t testCaseAutoNumber = 0 + +#define TESTCASE_AUTO(test) \ + if (index == testCaseAutoNumber++) { \ + name = #test; \ + if (exec) { \ + return test(); \ + } \ + break; \ + } + +#define TESTCASE_AUTO_END \ + name = ""; \ + break; \ + } + +/** + * Subclasses of PerfTest will need to create subclasses of + * Function that define a call() method which contains the code to + * be timed. They then call setTestFunction() in their "Test..." + * method to establish this as the current test functor. + */ +class T_CTEST_EXPORT_API UPerfFunction { +public: + /** + * destructor + */ + virtual ~UPerfFunction(); + + /** + * Subclasses must implement this method to do the action to be + * measured. + */ + virtual void call(UErrorCode* status)=0; + + /** + * Subclasses must implement this method to return positive + * integer indicating the number of operations in a single + * call to this object's call() method. 
+ */ + virtual long getOperationsPerIteration()=0; + /** + * Subclasses should override this method to return either positive + * or negative integer indicating the number of events in a single + * call to this object's call() method, if applicable + * e.g: Number of breaks / iterations for break iterator + */ + virtual long getEventsPerIteration(){ + return -1; + } + /** + * Call call() n times in a tight loop and return the elapsed + * seconds. If n is small and call() is fast the return + * result may be zero. Small return values have limited + * meaningfulness, depending on the underlying CPU and OS. + */ + virtual double time(int32_t n, UErrorCode* status) { + UTimer start, stop; + utimer_getTime(&start); + while (n-- > 0) { + call(status); + } + utimer_getTime(&stop); + return utimer_getDeltaSeconds(&start,&stop); // seconds + } + +}; + + +class T_CTEST_EXPORT_API UPerfTest { +public: + UBool run(); + UBool runTest( char* name = nullptr, char* par = nullptr ); // not to be overridden + + virtual void usage() ; + + virtual ~UPerfTest(); + + void setCaller( UPerfTest* callingTest ); // for internal use only + + void setPath( char* path ); // for internal use only + + ULine* getLines(UErrorCode& status); + + const char16_t* getBuffer(int32_t& len,UErrorCode& status); + +protected: + UPerfTest(int32_t argc, const char* argv[], UErrorCode& status); + + UPerfTest(int32_t argc, const char* argv[], + UOption addOptions[], int32_t addOptionsCount, + const char *addUsage, + UErrorCode& status); + + void init(UOption addOptions[], int32_t addOptionsCount, + UErrorCode& status); + + virtual UPerfFunction* runIndexedTest( int32_t index, UBool exec, const char* &name, char* par = nullptr ); // override ! 
+ + virtual UBool runTestLoop( char* testname, char* par ); + + virtual UBool callTest( UPerfTest& testToBeCalled, char* par ); + + int32_t _argc; + const char** _argv; + const char * _addUsage; + char* resolvedFileName; + UCHARBUF* ucharBuf; + const char* encoding; + UBool uselen; + const char* fileName; + const char* sourceDir; + int32_t _remainingArgc; + ULine* lines; + int32_t numLines; + UBool line_mode; + char16_t* buffer; + int32_t bufferLen; + UBool verbose; + UBool bulk_mode; + int32_t passes; + int32_t iterations; + int32_t time; + const char* locale; +private: + UPerfTest* caller; + char* path; // specifies subtests + +// static members +public: + static UPerfTest* gTest; + static const char gUsageString[]; +}; + +#endif +#endif + diff --git a/intl/icu/source/tools/ctestfw/unicode/utimer.h b/intl/icu/source/tools/ctestfw/unicode/utimer.h new file mode 100644 index 0000000000..10f80833cb --- /dev/null +++ b/intl/icu/source/tools/ctestfw/unicode/utimer.h @@ -0,0 +1,282 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +************************************************************************ +* Copyright (c) 1997-2012, International Business Machines +* Corporation and others. All Rights Reserved. +************************************************************************ +*/ + +#ifndef _UTIMER_H +#define _UTIMER_H + +#include "unicode/utypes.h" + +#if U_PLATFORM_USES_ONLY_WIN32_API +# define VC_EXTRALEAN +# define WIN32_LEAN_AND_MEAN +# include <windows.h> +#else +# if U_PLATFORM == U_PF_OS390 && !defined(__UU) +# define __UU /* Universal Unix - for struct timeval */ +# endif +# include <time.h> +# include <sys/time.h> +# include <unistd.h> +#endif + +/** + * This API provides functions for performing performance measurement + * There are 3 main usage scenarios. 
+ * i) Loop until a threshold time is reached: + * Example: + * <code> + * typedef Params Params; + * struct Params{ + * char16_t* target; + * int32_t targetLen; + * const char16_t* source; + * int32_t sourceLen; + * UNormalizationMode mode; + * } + * void NormFn( void* param){ + * Params* parameters = ( Params*) param; + * UErrorCode error = U_ZERO_ERROR; + * unorm_normalize(parameters->source, parameters->sourceLen, parameters->mode, 0, parameters->target, parameters->targetLen, &error); + * if(U_FAILURE(error)){ + * printf("Normalization failed\n"); + * } + * } + * + * int main(){ + * // time the normalization function + * double timeTaken = 0; + * Params param; + * param.source // set up the source buffer + * param.target // set up the target buffer + * .... so on ... + * UTimer timer; + * // time the loop for 10 seconds at least and find out the loop count and time taken + * timeTaken = utimer_loopUntilDone((double)10,(void*) param, NormFn, &loopCount); + * } + * </code> + * + * ii) Measure the time taken + * Example: + * <code> + * double perfNormalization(NormFn fn,const char* mode,Line* fileLines,int32_t loopCount){ + * int line; + * int loops; + * UErrorCode error = U_ZERO_ERROR; + * char16_t* dest=nullptr; + * int32_t destCapacity=0; + * int len =-1; + * double elapsedTime = 0; + * int retVal=0; + * + * char16_t arr[5000]; + * dest=arr; + * destCapacity = 5000; + * UTimer start; + * + * // Initialize cache and ensure the data is loaded. + * // This loop checks for errors in Normalization. 
Once we pass the initialization + * // without errors we can safely assume that there are no errors while timing the + * // function + * for (loops=0; loops<10; loops++) { + * for (line=0; line < gNumFileLines; line++) { + * if (opt_uselen) { + * len = fileLines[line].len; + * } + * + * retVal= fn(fileLines[line].name,len,dest,destCapacity,&error); + * #if U_PLATFORM_HAS_WIN32_API + * if(retVal==0 ){ + * fprintf(stderr,"Normalization of string in Windows API failed for mode %s. ErrorNo: %i at line number %i\n",mode,GetLastError(),line); + * return 0; + * } + * #endif + * if(U_FAILURE(error)){ + * fprintf(stderr,"Normalization of string in ICU API failed for mode %s. Error: %s at line number %i\n",mode,u_errorName(error),line); + * return 0; + * } + * + * } + * } + * + * //compute the time + * + * utimer_getTime(&start); + * for (loops=0; loops<loopCount; loops++) { + * for (line=0; line < gNumFileLines; line++) { + * if (opt_uselen) { + * len = fileLines[line].len; + * } + * + * retVal= fn(fileLines[line].name,len,dest,destCapacity,&error); + * + * } + * } + * + * return utimer_getElapsedSeconds(&start); + * } + * </code> + * + * iii) Let a higher level function do the calculation of confidence levels etc. 
+ * Example: + * <code> + * void perf(UTimer* timer, char16_t* source, int32_t sourceLen, char16_t* target, int32_t targetLen, int32_t loopCount,UNormalizationMode mode, UErrorCode* error){ + * int32_t loops; + * for (loops=0; loops<loopCount; loops++) { + * unorm_normalize(source,sourceLen,mode,0,target,targetLen,error); + * } + * utimer_getTime(timer); + * } + * int main(int argc, const char* argv[]){ + * // read the file and setup the data + * // set up options + * UTimer start,timer1, timer2, timer3, timer4; + * double NFDTimeTaken, NFCTimeTaken, FCDTimeTaken; + * switch(opt){ + * case 0: + * utimer_getTime(&start); + * perf(&timer1, source,sourceLen, target, targetLen,loopCount,UNORM_NFD,&error); + * NFDTimeTaken = utimer_getDeltaSeconds(&start,&timer1); + * break; + * case 1: + * utimer_getTime(&start); + * perf(&timer2,source,sourceLen,target,targetLen,loopCount,UNORM_NFC,&error); + * NFCTimeTaken = utimer_getDeltaSeconds(&start,&timer2); + * perf(&timer3, source, sourceLen, target,targetLen, loopCount, UNORM_FCD,&error); + * // ........so on ............. 
+ * } + * // calculate confidence levels etc and print + * + * } + * + * </code> + * + */ + +typedef struct UTimer UTimer; + +typedef void FunctionToBeTimed(void* param); /* callback type invoked by utimer_loopUntilDone() with the caller-supplied param */ + + +#if U_PLATFORM_USES_ONLY_WIN32_API + + struct UTimer{ + LARGE_INTEGER start; /* filled by QueryPerformanceCounter() in uprv_start() */ + LARGE_INTEGER placeHolder; /* filled by QueryPerformanceFrequency() in uprv_initFrequency() */ + }; + +static int uprv_initFrequency(UTimer* timer) + { + return QueryPerformanceFrequency(&timer->placeHolder); + } +static void uprv_start(UTimer* timer) + { + QueryPerformanceCounter(&timer->start); + } +static double uprv_delta(UTimer* timer1, UTimer* timer2){ + return ((double)(timer2->start.QuadPart - timer1->start.QuadPart))/((double)timer1->placeHolder.QuadPart); + } +static UBool uprv_compareFrequency(UTimer* timer1, UTimer* timer2){ + return (timer1->placeHolder.QuadPart == timer2->placeHolder.QuadPart); + } + +#else + + struct UTimer{ + struct timeval start; /* filled by gettimeofday() in uprv_start() */ + struct timeval placeHolder; /* not read or written by the functions of this branch */ + }; + +static int32_t uprv_initFrequency(UTimer* /*timer*/) + { + return 0; + } +static void uprv_start(UTimer* timer) + { + gettimeofday(&timer->start, 0); + } +static double uprv_delta(UTimer* timer1, UTimer* timer2){ + double t1, t2; + + t1 = (double)timer1->start.tv_sec + (double)timer1->start.tv_usec/(1000*1000); + t2 = (double)timer2->start.tv_sec + (double)timer2->start.tv_usec/(1000*1000); + return (t2-t1); + } +static UBool uprv_compareFrequency(UTimer* /*timer1*/, UTimer* /*timer2*/){ + return true; + } + +#endif +/** + * Initializes the timer with the current time + * + * @param timer A pointer to UTimer struct to receive the current time + */ +static inline void U_EXPORT2 +utimer_getTime(UTimer* timer){ + uprv_initFrequency(timer); + uprv_start(timer); +} + +/** + * Returns the difference in times between timer1 and timer2 by subtracting + * timer1's time from timer2's time + * + * @param timer1 A pointer to UTimer struct to be used as starting time + * @param timer2 A pointer to UTimer struct to be used as end time + * @return Time in seconds, or -1 if uprv_compareFrequency() rejects the two timers + */ +static inline double U_EXPORT2 
+utimer_getDeltaSeconds(UTimer* timer1, UTimer* timer2){ + if(uprv_compareFrequency(timer1,timer2)){ + return uprv_delta(timer1,timer2); + } + /* got error return -1 */ + return -1; +} + +/** + * Returns the time elapsed from the starting time represented by the + * UTimer struct pointer passed + * @param timer A pointer to UTimer struct to be used as starting time + * @return Time elapsed in seconds + */ +static inline double U_EXPORT2 +utimer_getElapsedSeconds(UTimer* timer){ + UTimer temp; + utimer_getTime(&temp); + return uprv_delta(timer,&temp); +} + +/** + * Executes the function pointed to for a given time and returns exact time + * taken and number of iterations of the loop + * @param thresholdTimeVal Time in seconds; fn keeps being called until the elapsed time reaches this value + * @param loopCount output param to receive the number of iterations + * @param fn The function to be executed + * @param param Parameters to be passed to the fn + * @return the time elapsed in seconds + */ +static inline double U_EXPORT2 +utimer_loopUntilDone(double thresholdTimeVal, + int32_t* loopCount, + FunctionToBeTimed fn, + void* param){ + UTimer timer; + double currentVal=0; + *loopCount = 0; + utimer_getTime(&timer); + for(;currentVal<thresholdTimeVal;){ + fn(param); + currentVal = utimer_getElapsedSeconds(&timer); + (*loopCount)++; + } + return currentVal; +} + +#endif + diff --git a/intl/icu/source/tools/ctestfw/uperf.cpp b/intl/icu/source/tools/ctestfw/uperf.cpp new file mode 100644 index 0000000000..9e92b7714b --- /dev/null +++ b/intl/icu/source/tools/ctestfw/uperf.cpp @@ -0,0 +1,533 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/******************************************************************** + * COPYRIGHT: + * Copyright (c) 2002-2012, International Business Machines Corporation and + * others. All Rights Reserved. + ********************************************************************/ + +// Defines _XOPEN_SOURCE for access to POSIX functions. 
+// Must be before any other #includes. +#include "uposixdefs.h" + +#include "unicode/uperf.h" +#include "uoptions.h" +#include "cmemory.h" +#include <stdio.h> +#include <stdlib.h> + +#if !UCONFIG_NO_CONVERSION + +UPerfFunction::~UPerfFunction() {} + +static const char delim = '/'; +static int32_t execCount = 0; +UPerfTest* UPerfTest::gTest = nullptr; +static const int MAXLINES = 40000; +const char UPerfTest::gUsageString[] = + "Usage: %s [OPTIONS] [FILES]\n" + "\tReads the input file and prints out time taken in seconds\n" + "Options:\n" + "\t-h or -? or --help this usage text\n" + "\t-v or --verbose print extra information when processing files\n" + "\t-s or --sourcedir source directory for files followed by path\n" + "\t followed by path\n" + "\t-e or --encoding encoding of source files\n" + "\t-u or --uselen perform timing analysis on non-null terminated buffer using length\n" + "\t-f or --file-name file to be used as input data\n" + "\t-p or --passes Number of passes to be performed. Requires Numeric argument.\n" + "\t Cannot be used with --time\n" + "\t-i or --iterations Number of iterations to be performed. Requires Numeric argument\n" + "\t-t or --time Threshold time for looping until in seconds. 
Requires Numeric argument.\n" + "\t Cannot be used with --iterations\n" + "\t-l or --line-mode The data file should be processed in line mode\n" + "\t-b or --bulk-mode The data file should be processed in file based.\n" + "\t Cannot be used with --line-mode\n" + "\t-L or --locale Locale for the test\n"; + +enum +{ + HELP1, + HELP2, + VERBOSE, + SOURCEDIR, + ENCODING, + USELEN, + FILE_NAME, + PASSES, + ITERATIONS, + TIME, + LINE_MODE, + BULK_MODE, + LOCALE, + OPTIONS_COUNT +}; + + +static UOption options[OPTIONS_COUNT+20]={ + UOPTION_HELP_H, + UOPTION_HELP_QUESTION_MARK, + UOPTION_VERBOSE, + UOPTION_SOURCEDIR, + UOPTION_ENCODING, + UOPTION_DEF( "uselen", 'u', UOPT_NO_ARG), + UOPTION_DEF( "file-name", 'f', UOPT_REQUIRES_ARG), + UOPTION_DEF( "passes", 'p', UOPT_REQUIRES_ARG), + UOPTION_DEF( "iterations", 'i', UOPT_REQUIRES_ARG), + UOPTION_DEF( "time", 't', UOPT_REQUIRES_ARG), + UOPTION_DEF( "line-mode", 'l', UOPT_NO_ARG), + UOPTION_DEF( "bulk-mode", 'b', UOPT_NO_ARG), + UOPTION_DEF( "locale", 'L', UOPT_REQUIRES_ARG) +}; + +UPerfTest::UPerfTest(int32_t argc, const char* argv[], UErrorCode& status) + : _argc(argc), _argv(argv), _addUsage(nullptr), + ucharBuf(nullptr), encoding(""), + uselen(false), + fileName(nullptr), sourceDir("."), + lines(nullptr), numLines(0), line_mode(true), + buffer(nullptr), bufferLen(0), + verbose(false), bulk_mode(false), + passes(1), iterations(0), time(0), + locale(nullptr) { + init(nullptr, 0, status); +} + +UPerfTest::UPerfTest(int32_t argc, const char* argv[], + UOption addOptions[], int32_t addOptionsCount, + const char *addUsage, + UErrorCode& status) + : _argc(argc), _argv(argv), _addUsage(addUsage), + ucharBuf(nullptr), encoding(""), + uselen(false), + fileName(nullptr), sourceDir("."), + lines(nullptr), numLines(0), line_mode(true), + buffer(nullptr), bufferLen(0), + verbose(false), bulk_mode(false), + passes(1), iterations(0), time(0), + locale(nullptr) { + init(addOptions, addOptionsCount, status); +} + +void 
UPerfTest::init(UOption addOptions[], int32_t addOptionsCount, + UErrorCode& status) { + //initialize the argument list + U_MAIN_INIT_ARGS(_argc, _argv); + + resolvedFileName = nullptr; + + // add specific options + int32_t optionsCount = OPTIONS_COUNT; + if (addOptionsCount > 0) { + memcpy(options+optionsCount, addOptions, addOptionsCount*sizeof(UOption)); + optionsCount += addOptionsCount; + } + + //parse the arguments + _remainingArgc = u_parseArgs(_argc, (char**)_argv, optionsCount, options); + + // copy back values for additional options + if (addOptionsCount > 0) { + memcpy(addOptions, options+OPTIONS_COUNT, addOptionsCount*sizeof(UOption)); + } + + // Now setup the arguments + if(_argc==1 || options[HELP1].doesOccur || options[HELP2].doesOccur) { + status = U_ILLEGAL_ARGUMENT_ERROR; + return; + } + + if(options[VERBOSE].doesOccur) { + verbose = true; + } + + if(options[SOURCEDIR].doesOccur) { + sourceDir = options[SOURCEDIR].value; + } + + if(options[ENCODING].doesOccur) { + encoding = options[ENCODING].value; + } + + if(options[USELEN].doesOccur) { + uselen = true; + } + + if(options[FILE_NAME].doesOccur){ + fileName = options[FILE_NAME].value; + } + + if(options[PASSES].doesOccur) { + passes = atoi(options[PASSES].value); + } + if(options[ITERATIONS].doesOccur) { + iterations = atoi(options[ITERATIONS].value); + if(options[TIME].doesOccur) { + status = U_ILLEGAL_ARGUMENT_ERROR; + return; + } + } else if(options[TIME].doesOccur) { + time = atoi(options[TIME].value); + } else { + iterations = 1000; // some default + } + + if(options[LINE_MODE].doesOccur) { + line_mode = true; + bulk_mode = false; + } + + if(options[BULK_MODE].doesOccur) { + bulk_mode = true; + line_mode = false; + } + + if(options[LOCALE].doesOccur) { + locale = options[LOCALE].value; + } + + int32_t len = 0; + if(fileName!=nullptr){ + //pre-flight + ucbuf_resolveFileName(sourceDir, fileName, nullptr, &len, &status); + resolvedFileName = (char*) uprv_malloc(len); + 
if(resolvedFileName==nullptr){ + status= U_MEMORY_ALLOCATION_ERROR; + return; + } + if(status == U_BUFFER_OVERFLOW_ERROR){ + status = U_ZERO_ERROR; + } + ucbuf_resolveFileName(sourceDir, fileName, resolvedFileName, &len, &status); + ucharBuf = ucbuf_open(resolvedFileName,&encoding,true,false,&status); + + if(U_FAILURE(status)){ + printf("Could not open the input file %s. Error: %s\n", fileName, u_errorName(status)); + return; + } + } +} + +ULine* UPerfTest::getLines(UErrorCode& status){ + if (U_FAILURE(status)) { + return nullptr; + } + if (lines != nullptr) { + return lines; // don't do it again + } + lines = new ULine[MAXLINES]; + int maxLines = MAXLINES; + numLines=0; + const char16_t* line=nullptr; + int32_t len =0; + for (;;) { + line = ucbuf_readline(ucharBuf,&len,&status); + if(line == nullptr || U_FAILURE(status)){ + break; + } + lines[numLines].name = new char16_t[len]; + lines[numLines].len = len; + memcpy(lines[numLines].name, line, len * U_SIZEOF_UCHAR); + + numLines++; + len = 0; + if (numLines >= maxLines) { + maxLines += MAXLINES; + ULine *newLines = new ULine[maxLines]; + if(newLines == nullptr) { + fprintf(stderr, "Out of memory reading line %d.\n", (int)numLines); + status= U_MEMORY_ALLOCATION_ERROR; + delete []lines; + return nullptr; + } + + memcpy(newLines, lines, numLines*sizeof(ULine)); + delete []lines; + lines = newLines; + } + } + return lines; +} +const char16_t* UPerfTest::getBuffer(int32_t& len, UErrorCode& status){ + if (U_FAILURE(status)) { + return nullptr; + } + len = ucbuf_size(ucharBuf); + buffer = (char16_t*) uprv_malloc(U_SIZEOF_UCHAR * (len+1)); + u_strncpy(buffer,ucbuf_getBuffer(ucharBuf,&bufferLen,&status),len); + buffer[len]=0; + len = bufferLen; + return buffer; +} +UBool UPerfTest::run(){ + if(_remainingArgc==1){ + // Testing all methods + return runTest(); + } + UBool res=false; + // Test only the specified function + for (int i = 1; i < _remainingArgc; ++i) { + if (_argv[i][0] != '-') { + char* name = (char*) _argv[i]; + 
if(verbose==true){ + //fprintf(stdout, "\n=== Handling test: %s: ===\n", name); + //fprintf(stdout, "\n%s:\n", name); + } + char* parameter = strchr( name, '@' ); + if (parameter) { + *parameter = 0; + parameter += 1; + } + execCount = 0; + res = runTest( name, parameter ); + if (!res || (execCount <= 0)) { + fprintf(stdout, "\n---ERROR: Test doesn't exist: %s!\n", name); + return false; + } + } + } + return res; +} +UBool UPerfTest::runTest(char* name, char* par ){ + UBool rval; + char* pos = nullptr; + + if (name) + pos = strchr( name, delim ); // check if name contains path (by looking for '/') + if (pos) { + path = pos+1; // store subpath for calling subtest + *pos = 0; // split into two strings + }else{ + path = nullptr; + } + + if (!name || (name[0] == 0) || (strcmp(name, "*") == 0)) { + rval = runTestLoop( nullptr, nullptr ); + + }else if (strcmp( name, "LIST" ) == 0) { + this->usage(); + rval = true; + + }else{ + rval = runTestLoop( name, par ); + } + + if (pos) + *pos = delim; // restore original value at pos + return rval; +} + + +void UPerfTest::setPath( char* pathVal ) +{ + this->path = pathVal; +} + +// call individual tests, to be overridden to call implementations +UPerfFunction* UPerfTest::runIndexedTest( int32_t /*index*/, UBool /*exec*/, const char* & /*name*/, char* /*par*/ ) +{ + // to be overridden by a method like: + /* + switch (index) { + case 0: name = "First Test"; if (exec) FirstTest( par ); break; + case 1: name = "Second Test"; if (exec) SecondTest( par ); break; + default: name = ""; break; + } + */ + fprintf(stderr,"*** runIndexedTest needs to be overridden! 
***"); + return nullptr; +} + + +UBool UPerfTest::runTestLoop( char* testname, char* par ) +{ + int32_t index = 0; + const char* name; + UBool run_this_test; + UBool rval = false; + UErrorCode status = U_ZERO_ERROR; + UPerfTest* saveTest = gTest; + gTest = this; + int32_t loops = 0; + double t=0; + int32_t n = 1; + long ops; + do { + this->runIndexedTest( index, false, name ); + if (!name || (name[0] == 0)) + break; + if (!testname) { + run_this_test = true; + }else{ + run_this_test = (UBool) (strcmp( name, testname ) == 0); + } + if (run_this_test) { + UPerfFunction* testFunction = this->runIndexedTest( index, true, name, par ); + execCount++; + rval=true; + if(testFunction==nullptr){ + fprintf(stderr,"%s function returned nullptr", name); + return false; + } + ops = testFunction->getOperationsPerIteration(); + if (ops < 1) { + fprintf(stderr, "%s returned an illegal operations/iteration()\n", name); + return false; + } + if(iterations == 0) { + n = time; + // Run for specified duration in seconds + if(verbose==true){ + fprintf(stdout,"= %s calibrating %i seconds \n", name, (int)n); + } + + //n *= 1000; // s => ms + //System.out.println("# " + meth.getName() + " " + n + " sec"); + int32_t failsafe = 1; // last resort for very fast methods + t = 0; + while (t < (int)(n * 0.9)) { // 90% is close enough + if (loops == 0 || t == 0) { + loops = failsafe; + failsafe *= 10; + } else { + //System.out.println("# " + meth.getName() + " x " + loops + " = " + t); + loops = (int)((double)n / t * loops + 0.5); + if (loops == 0) { + fprintf(stderr,"Unable to converge on desired duration"); + return false; + } + } + //System.out.println("# " + meth.getName() + " x " + loops); + t = testFunction->time(loops,&status); + if(U_FAILURE(status)){ + printf("Performance test failed with error: %s \n", u_errorName(status)); + break; + } + } + } else { + loops = iterations; + } + + double min_t=1000000.0, sum_t=0.0; + long events = -1; + + for(int32_t ps =0; ps < passes; ps++){ + 
if(verbose==true){ + fprintf(stdout,"= %s begin " ,name); + if(iterations > 0) { + fprintf(stdout, "%i\n", (int)loops); + } else { + fprintf(stdout, "%i\n", (int)n); + } + } + t = testFunction->time(loops, &status); + if(U_FAILURE(status)){ + printf("Performance test failed with error: %s \n", u_errorName(status)); + break; + } + sum_t+=t; + if(t<min_t) { + min_t=t; + } + events = testFunction->getEventsPerIteration(); + //print info only in verbose mode + if(verbose==true){ + if(events == -1){ + fprintf(stdout, "= %s end: %f loops: %i operations: %li \n", name, t, (int)loops, ops); + }else{ + fprintf(stdout, "= %s end: %f loops: %i operations: %li events: %li\n", name, t, (int)loops, ops, events); + } + } + } + if(verbose && U_SUCCESS(status)) { + double avg_t = sum_t/passes; + if (loops == 0 || ops == 0) { + fprintf(stderr, "%s did not run\n", name); + } + else if(events == -1) { + fprintf(stdout, "%%= %s avg: %.4g loops: %i avg/op: %.4g ns\n", + name, avg_t, (int)loops, (avg_t*1E9)/(loops*ops)); + fprintf(stdout, "_= %s min: %.4g loops: %i min/op: %.4g ns\n", + name, min_t, (int)loops, (min_t*1E9)/(loops*ops)); + } + else { + fprintf(stdout, "%%= %s avg: %.4g loops: %i avg/op: %.4g ns avg/event: %.4g ns\n", + name, avg_t, (int)loops, (avg_t*1E9)/(loops*ops), (avg_t*1E9)/(loops*events)); + fprintf(stdout, "_= %s min: %.4g loops: %i min/op: %.4g ns min/event: %.4g ns\n", + name, min_t, (int)loops, (min_t*1E9)/(loops*ops), (min_t*1E9)/(loops*events)); + } + } + else if(U_SUCCESS(status)) { + // Print results in ndjson format for GHA Benchmark to process. + fprintf(stdout, + "{\"biggerIsBetter\":false,\"name\":\"%s\",\"unit\":\"ns/iter\",\"value\":%.4f}\n", + name, (min_t*1E9)/(loops*ops)); + } + delete testFunction; + } + index++; + }while(name); + + gTest = saveTest; + return rval; +} + +/** +* Print a usage message for this test class. 
+*/ +void UPerfTest::usage() +{ + puts(gUsageString); + if (_addUsage != nullptr) { + puts(_addUsage); + } + + UBool save_verbose = verbose; + verbose = true; + fprintf(stdout,"Test names:\n"); + fprintf(stdout,"-----------\n"); + + int32_t index = 0; + const char* name = nullptr; + do{ + this->runIndexedTest( index, false, name ); + if (!name) + break; + fprintf(stdout, "%s\n", name); + index++; + }while (name && (name[0] != 0)); + verbose = save_verbose; +} + + + + +void UPerfTest::setCaller( UPerfTest* callingTest ) +{ + caller = callingTest; + if (caller) { + verbose = caller->verbose; + } +} + +UBool UPerfTest::callTest( UPerfTest& testToBeCalled, char* par ) +{ + execCount--; // correct a previously assumed test-exec, as this only calls a subtest + testToBeCalled.setCaller( this ); + return testToBeCalled.runTest( path, par ); +} + +UPerfTest::~UPerfTest(){ + if(lines!=nullptr){ + delete[] lines; + } + if(buffer!=nullptr){ + uprv_free(buffer); + } + if(resolvedFileName!=nullptr){ + uprv_free(resolvedFileName); + } + ucbuf_close(ucharBuf); +} + +#endif diff --git a/intl/icu/source/tools/escapesrc/Makefile.in b/intl/icu/source/tools/escapesrc/Makefile.in new file mode 100644 index 0000000000..7580ccdc31 --- /dev/null +++ b/intl/icu/source/tools/escapesrc/Makefile.in @@ -0,0 +1,112 @@ +## Makefile.in for ICU - tools/escapesrc +## Copyright (C) 2016 and later: Unicode, Inc. and others. +## License & terms of use: http://www.unicode.org/copyright.html +## Copyright (c) 1999-2011, International Business Machines Corporation and +## others. All Rights Reserved. +## Steven R. Loomis + +# To avoid recursion +SKIP_ESCAPING=YES + +## Source directory information +srcdir = @srcdir@ +top_srcdir = @top_srcdir@ + +top_builddir = ../.. 
+ +include $(top_builddir)/icudefs.mk + +## Build directory information +subdir = tools/escapesrc + +TARGET_STUB_NAME = escapesrc + +SECTION = 8 + +#MAN_FILES = $(TARGET_STUB_NAME).$(SECTION) + +## Extra files to remove for 'make clean' +CLEANFILES = *~ $(DEPS) $(MAN_FILES) ./output-*.cpp + +## Target information +TARGET = $(BINDIR)/$(TARGET_STUB_NAME)$(EXEEXT) + +CPPFLAGS += -I$(top_srcdir)/common -I$(srcdir)/../toolutil +#LIBS = $(LIBICUTOOLUTIL) $(LIBICUI18N) $(LIBICUUC) +LIBS += $(DEFAULT_LIBS) $(LIB_M) + +OBJECTS = escapesrc.o + +DEPS = $(OBJECTS:.o=.d) + +## List of phony targets +.PHONY : all all-local install install-local clean clean-local \ +distclean distclean-local dist dist-local check check-local install-man + +## Clear suffix list +.SUFFIXES : + +## List of standard targets +all: all-local +install: install-local +clean: clean-local +distclean : distclean-local +dist: dist-local +check: all check-local + +all-local: $(TARGET) $(MAN_FILES) + +install-local: all-local install-man + $(MKINSTALLDIRS) $(DESTDIR)$(sbindir) + $(INSTALL) $(TARGET) $(DESTDIR)$(sbindir) + +install-man: $(MAN_FILES) +# $(MKINSTALLDIRS) $(DESTDIR)$(mandir)/man$(SECTION) +# $(INSTALL_DATA) $? 
$(DESTDIR)$(mandir)/man$(SECTION) + + +dist-local: + +clean-local: + test -z "$(CLEANFILES)" || $(RMV) $(CLEANFILES) + $(RMV) $(TARGET) $(OBJECTS) + +distclean-local: clean-local + $(RMV) Makefile + +check-local: all-local + @echo Testing test-nochange.cpp + @$(INVOKE) $(TARGET) $(srcdir)/test-nochange.cpp ./output-nochange.cpp + @-diff -I '#line.*' $(srcdir)/test-nochange.cpp ./output-nochange.cpp || (echo >&2 'warning: diff failed or not found' ; true) + @echo Testing test-simple.cpp + @$(INVOKE) $(TARGET) $(srcdir)/test-simple.cpp ./output-simple.cpp + @-diff -I '#line.*' $(srcdir)/expect-simple.cpp ./output-simple.cpp || (echo >&2 'warning: diff failed or not found' ; true) + +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + cd $(top_builddir) \ + && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status + +$(TARGET) : $(OBJECTS) + $(LINK.cc) $(OUTOPT)$@ $^ $(LIBS) + $(POST_BUILD_STEP) + + +%.$(SECTION): $(srcdir)/%.$(SECTION).in + cd $(top_builddir) \ + && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status + +# depends on ICU being built +gen-table: tblgen$(EXEEXT) + $(INVOKE) ./tblgen$(EXEEXT) > $(srcdir)/cptbl.h + +tblgen$(EXEEXT): tblgen.o + $(LINK.cc) $(OUTOPT)$@ $^ $(LIBS) $(LIBICUUC) + +ifeq (,$(MAKECMDGOALS)) +-include $(DEPS) +else +ifneq ($(patsubst %clean,,$(MAKECMDGOALS)),) +-include $(DEPS) +endif +endif + diff --git a/intl/icu/source/tools/escapesrc/cptbl.h b/intl/icu/source/tools/escapesrc/cptbl.h new file mode 100644 index 0000000000..898e16c925 --- /dev/null +++ b/intl/icu/source/tools/escapesrc/cptbl.h @@ -0,0 +1,521 @@ +// Copyright (C) 2016 and later: Unicode, Inc. and others. License & terms of use: http://www.unicode.org/copyright.html +// generated by tblgen. You weren't going to edit it by hand, were you? 
+ +static const char cp1047_8859_1[256] = { + (char)0x00, /* 00 */ + (char)0x01, /* 01 */ + (char)0x02, /* 02 */ + (char)0x03, /* 03 */ + (char)0x9C, /* 04 */ + (char)0x09, /* 05 */ + (char)0x86, /* 06 */ + (char)0x7F, /* 07 */ + (char)0x97, /* 08 */ + (char)0x8D, /* 09 */ + (char)0x8E, /* 0A */ + (char)0x0B, /* 0B */ + (char)0x0C, /* 0C */ + (char)0x0D, /* 0D */ + (char)0x0E, /* 0E */ + (char)0x0F, /* 0F */ + (char)0x10, /* 10 */ + (char)0x11, /* 11 */ + (char)0x12, /* 12 */ + (char)0x13, /* 13 */ + (char)0x9D, /* 14 */ + (char)0x85, /* 15 */ + (char)0x08, /* 16 */ + (char)0x87, /* 17 */ + (char)0x18, /* 18 */ + (char)0x19, /* 19 */ + (char)0x92, /* 1A */ + (char)0x8F, /* 1B */ + (char)0x1C, /* 1C */ + (char)0x1D, /* 1D */ + (char)0x1E, /* 1E */ + (char)0x1F, /* 1F */ + (char)0x80, /* 20 */ + (char)0x81, /* 21 */ + (char)0x82, /* 22 */ + (char)0x83, /* 23 */ + (char)0x84, /* 24 */ + (char)0x0A, /* 25 */ + (char)0x17, /* 26 */ + (char)0x1B, /* 27 */ + (char)0x88, /* 28 */ + (char)0x89, /* 29 */ + (char)0x8A, /* 2A */ + (char)0x8B, /* 2B */ + (char)0x8C, /* 2C */ + (char)0x05, /* 2D */ + (char)0x06, /* 2E */ + (char)0x07, /* 2F */ + (char)0x90, /* 30 */ + (char)0x91, /* 31 */ + (char)0x16, /* 32 */ + (char)0x93, /* 33 */ + (char)0x94, /* 34 */ + (char)0x95, /* 35 */ + (char)0x96, /* 36 */ + (char)0x04, /* 37 */ + (char)0x98, /* 38 */ + (char)0x99, /* 39 */ + (char)0x9A, /* 3A */ + (char)0x9B, /* 3B */ + (char)0x14, /* 3C */ + (char)0x15, /* 3D */ + (char)0x9E, /* 3E */ + (char)0x1A, /* 3F */ + (char)0x20, /* 40 */ + (char)0xA0, /* 41 */ + (char)0xE2, /* 42 */ + (char)0xE4, /* 43 */ + (char)0xE0, /* 44 */ + (char)0xE1, /* 45 */ + (char)0xE3, /* 46 */ + (char)0xE5, /* 47 */ + (char)0xE7, /* 48 */ + (char)0xF1, /* 49 */ + (char)0xA2, /* 4A */ + (char)0x2E, /* 4B */ + (char)0x3C, /* 4C */ + (char)0x28, /* 4D */ + (char)0x2B, /* 4E */ + (char)0x7C, /* 4F */ + (char)0x26, /* 50 */ + (char)0xE9, /* 51 */ + (char)0xEA, /* 52 */ + (char)0xEB, /* 53 */ + (char)0xE8, /* 54 */ 
+ (char)0xED, /* 55 */ + (char)0xEE, /* 56 */ + (char)0xEF, /* 57 */ + (char)0xEC, /* 58 */ + (char)0xDF, /* 59 */ + (char)0x21, /* 5A */ + (char)0x24, /* 5B */ + (char)0x2A, /* 5C */ + (char)0x29, /* 5D */ + (char)0x3B, /* 5E */ + (char)0x5E, /* 5F */ + (char)0x2D, /* 60 */ + (char)0x2F, /* 61 */ + (char)0xC2, /* 62 */ + (char)0xC4, /* 63 */ + (char)0xC0, /* 64 */ + (char)0xC1, /* 65 */ + (char)0xC3, /* 66 */ + (char)0xC5, /* 67 */ + (char)0xC7, /* 68 */ + (char)0xD1, /* 69 */ + (char)0xA6, /* 6A */ + (char)0x2C, /* 6B */ + (char)0x25, /* 6C */ + (char)0x5F, /* 6D */ + (char)0x3E, /* 6E */ + (char)0x3F, /* 6F */ + (char)0xF8, /* 70 */ + (char)0xC9, /* 71 */ + (char)0xCA, /* 72 */ + (char)0xCB, /* 73 */ + (char)0xC8, /* 74 */ + (char)0xCD, /* 75 */ + (char)0xCE, /* 76 */ + (char)0xCF, /* 77 */ + (char)0xCC, /* 78 */ + (char)0x60, /* 79 */ + (char)0x3A, /* 7A */ + (char)0x23, /* 7B */ + (char)0x40, /* 7C */ + (char)0x27, /* 7D */ + (char)0x3D, /* 7E */ + (char)0x22, /* 7F */ + (char)0xD8, /* 80 */ + (char)0x61, /* 81 */ + (char)0x62, /* 82 */ + (char)0x63, /* 83 */ + (char)0x64, /* 84 */ + (char)0x65, /* 85 */ + (char)0x66, /* 86 */ + (char)0x67, /* 87 */ + (char)0x68, /* 88 */ + (char)0x69, /* 89 */ + (char)0xAB, /* 8A */ + (char)0xBB, /* 8B */ + (char)0xF0, /* 8C */ + (char)0xFD, /* 8D */ + (char)0xFE, /* 8E */ + (char)0xB1, /* 8F */ + (char)0xB0, /* 90 */ + (char)0x6A, /* 91 */ + (char)0x6B, /* 92 */ + (char)0x6C, /* 93 */ + (char)0x6D, /* 94 */ + (char)0x6E, /* 95 */ + (char)0x6F, /* 96 */ + (char)0x70, /* 97 */ + (char)0x71, /* 98 */ + (char)0x72, /* 99 */ + (char)0xAA, /* 9A */ + (char)0xBA, /* 9B */ + (char)0xE6, /* 9C */ + (char)0xB8, /* 9D */ + (char)0xC6, /* 9E */ + (char)0xA4, /* 9F */ + (char)0xB5, /* A0 */ + (char)0x7E, /* A1 */ + (char)0x73, /* A2 */ + (char)0x74, /* A3 */ + (char)0x75, /* A4 */ + (char)0x76, /* A5 */ + (char)0x77, /* A6 */ + (char)0x78, /* A7 */ + (char)0x79, /* A8 */ + (char)0x7A, /* A9 */ + (char)0xA1, /* AA */ + (char)0xBF, /* AB 
*/ + (char)0xD0, /* AC */ + (char)0x5B, /* AD */ + (char)0xDE, /* AE */ + (char)0xAE, /* AF */ + (char)0xAC, /* B0 */ + (char)0xA3, /* B1 */ + (char)0xA5, /* B2 */ + (char)0xB7, /* B3 */ + (char)0xA9, /* B4 */ + (char)0xA7, /* B5 */ + (char)0xB6, /* B6 */ + (char)0xBC, /* B7 */ + (char)0xBD, /* B8 */ + (char)0xBE, /* B9 */ + (char)0xDD, /* BA */ + (char)0xA8, /* BB */ + (char)0xAF, /* BC */ + (char)0x5D, /* BD */ + (char)0xB4, /* BE */ + (char)0xD7, /* BF */ + (char)0x7B, /* C0 */ + (char)0x41, /* C1 */ + (char)0x42, /* C2 */ + (char)0x43, /* C3 */ + (char)0x44, /* C4 */ + (char)0x45, /* C5 */ + (char)0x46, /* C6 */ + (char)0x47, /* C7 */ + (char)0x48, /* C8 */ + (char)0x49, /* C9 */ + (char)0xAD, /* CA */ + (char)0xF4, /* CB */ + (char)0xF6, /* CC */ + (char)0xF2, /* CD */ + (char)0xF3, /* CE */ + (char)0xF5, /* CF */ + (char)0x7D, /* D0 */ + (char)0x4A, /* D1 */ + (char)0x4B, /* D2 */ + (char)0x4C, /* D3 */ + (char)0x4D, /* D4 */ + (char)0x4E, /* D5 */ + (char)0x4F, /* D6 */ + (char)0x50, /* D7 */ + (char)0x51, /* D8 */ + (char)0x52, /* D9 */ + (char)0xB9, /* DA */ + (char)0xFB, /* DB */ + (char)0xFC, /* DC */ + (char)0xF9, /* DD */ + (char)0xFA, /* DE */ + (char)0xFF, /* DF */ + (char)0x5C, /* E0 */ + (char)0xF7, /* E1 */ + (char)0x53, /* E2 */ + (char)0x54, /* E3 */ + (char)0x55, /* E4 */ + (char)0x56, /* E5 */ + (char)0x57, /* E6 */ + (char)0x58, /* E7 */ + (char)0x59, /* E8 */ + (char)0x5A, /* E9 */ + (char)0xB2, /* EA */ + (char)0xD4, /* EB */ + (char)0xD6, /* EC */ + (char)0xD2, /* ED */ + (char)0xD3, /* EE */ + (char)0xD5, /* EF */ + (char)0x30, /* F0 */ + (char)0x31, /* F1 */ + (char)0x32, /* F2 */ + (char)0x33, /* F3 */ + (char)0x34, /* F4 */ + (char)0x35, /* F5 */ + (char)0x36, /* F6 */ + (char)0x37, /* F7 */ + (char)0x38, /* F8 */ + (char)0x39, /* F9 */ + (char)0xB3, /* FA */ + (char)0xDB, /* FB */ + (char)0xDC, /* FC */ + (char)0xD9, /* FD */ + (char)0xDA, /* FE */ + (char)0x9F, /* FF */ +}; + +static const bool oldIllegal[256] = { + false, /* U+0000 
*/ + false, /* U+0001 */ + false, /* U+0002 */ + false, /* U+0003 */ + false, /* U+0004 */ + false, /* U+0005 */ + false, /* U+0006 */ + false, /* U+0007 */ + false, /* U+0008 */ + false, /* U+0009 */ + false, /* U+000A */ + false, /* U+000B */ + false, /* U+000C */ + false, /* U+000D */ + false, /* U+000E */ + false, /* U+000F */ + false, /* U+0010 */ + false, /* U+0011 */ + false, /* U+0012 */ + false, /* U+0013 */ + false, /* U+0014 */ + false, /* U+0015 */ + false, /* U+0016 */ + false, /* U+0017 */ + false, /* U+0018 */ + false, /* U+0019 */ + false, /* U+001A */ + false, /* U+001B */ + false, /* U+001C */ + false, /* U+001D */ + false, /* U+001E */ + false, /* U+001F */ + true, /* U+0020 */ + true, /* U+0021 */ + true, /* U+0022 */ + true, /* U+0023 */ + false, /* U+0024 */ + true, /* U+0025 */ + true, /* U+0026 */ + true, /* U+0027 */ + true, /* U+0028 */ + true, /* U+0029 */ + true, /* U+002A */ + true, /* U+002B */ + true, /* U+002C */ + true, /* U+002D */ + true, /* U+002E */ + true, /* U+002F */ + true, /* U+0030 */ + true, /* U+0031 */ + true, /* U+0032 */ + true, /* U+0033 */ + true, /* U+0034 */ + true, /* U+0035 */ + true, /* U+0036 */ + true, /* U+0037 */ + true, /* U+0038 */ + true, /* U+0039 */ + true, /* U+003A */ + true, /* U+003B */ + true, /* U+003C */ + true, /* U+003D */ + true, /* U+003E */ + true, /* U+003F */ + false, /* U+0040 */ + true, /* U+0041 */ + true, /* U+0042 */ + true, /* U+0043 */ + true, /* U+0044 */ + true, /* U+0045 */ + true, /* U+0046 */ + true, /* U+0047 */ + true, /* U+0048 */ + true, /* U+0049 */ + true, /* U+004A */ + true, /* U+004B */ + true, /* U+004C */ + true, /* U+004D */ + true, /* U+004E */ + true, /* U+004F */ + true, /* U+0050 */ + true, /* U+0051 */ + true, /* U+0052 */ + true, /* U+0053 */ + true, /* U+0054 */ + true, /* U+0055 */ + true, /* U+0056 */ + true, /* U+0057 */ + true, /* U+0058 */ + true, /* U+0059 */ + true, /* U+005A */ + true, /* U+005B */ + false, /* U+005C */ + true, /* U+005D */ + true, 
/* U+005E */ + true, /* U+005F */ + false, /* U+0060 */ + true, /* U+0061 */ + true, /* U+0062 */ + true, /* U+0063 */ + true, /* U+0064 */ + true, /* U+0065 */ + true, /* U+0066 */ + true, /* U+0067 */ + true, /* U+0068 */ + true, /* U+0069 */ + true, /* U+006A */ + true, /* U+006B */ + true, /* U+006C */ + true, /* U+006D */ + true, /* U+006E */ + true, /* U+006F */ + true, /* U+0070 */ + true, /* U+0071 */ + true, /* U+0072 */ + true, /* U+0073 */ + true, /* U+0074 */ + true, /* U+0075 */ + true, /* U+0076 */ + true, /* U+0077 */ + true, /* U+0078 */ + true, /* U+0079 */ + true, /* U+007A */ + true, /* U+007B */ + true, /* U+007C */ + true, /* U+007D */ + true, /* U+007E */ + false, /* U+007F */ + false, /* U+0080 */ + false, /* U+0081 */ + false, /* U+0082 */ + false, /* U+0083 */ + false, /* U+0084 */ + false, /* U+0085 */ + false, /* U+0086 */ + false, /* U+0087 */ + false, /* U+0088 */ + false, /* U+0089 */ + false, /* U+008A */ + false, /* U+008B */ + false, /* U+008C */ + false, /* U+008D */ + false, /* U+008E */ + false, /* U+008F */ + false, /* U+0090 */ + false, /* U+0091 */ + false, /* U+0092 */ + false, /* U+0093 */ + false, /* U+0094 */ + false, /* U+0095 */ + false, /* U+0096 */ + false, /* U+0097 */ + false, /* U+0098 */ + false, /* U+0099 */ + false, /* U+009A */ + false, /* U+009B */ + false, /* U+009C */ + false, /* U+009D */ + false, /* U+009E */ + false, /* U+009F */ + false, /* U+00A0 */ + false, /* U+00A1 */ + false, /* U+00A2 */ + false, /* U+00A3 */ + false, /* U+00A4 */ + false, /* U+00A5 */ + false, /* U+00A6 */ + false, /* U+00A7 */ + false, /* U+00A8 */ + false, /* U+00A9 */ + false, /* U+00AA */ + false, /* U+00AB */ + false, /* U+00AC */ + false, /* U+00AD */ + false, /* U+00AE */ + false, /* U+00AF */ + false, /* U+00B0 */ + false, /* U+00B1 */ + false, /* U+00B2 */ + false, /* U+00B3 */ + false, /* U+00B4 */ + false, /* U+00B5 */ + false, /* U+00B6 */ + false, /* U+00B7 */ + false, /* U+00B8 */ + false, /* U+00B9 */ + false, /* 
U+00BA */ + false, /* U+00BB */ + false, /* U+00BC */ + false, /* U+00BD */ + false, /* U+00BE */ + false, /* U+00BF */ + false, /* U+00C0 */ + false, /* U+00C1 */ + false, /* U+00C2 */ + false, /* U+00C3 */ + false, /* U+00C4 */ + false, /* U+00C5 */ + false, /* U+00C6 */ + false, /* U+00C7 */ + false, /* U+00C8 */ + false, /* U+00C9 */ + false, /* U+00CA */ + false, /* U+00CB */ + false, /* U+00CC */ + false, /* U+00CD */ + false, /* U+00CE */ + false, /* U+00CF */ + false, /* U+00D0 */ + false, /* U+00D1 */ + false, /* U+00D2 */ + false, /* U+00D3 */ + false, /* U+00D4 */ + false, /* U+00D5 */ + false, /* U+00D6 */ + false, /* U+00D7 */ + false, /* U+00D8 */ + false, /* U+00D9 */ + false, /* U+00DA */ + false, /* U+00DB */ + false, /* U+00DC */ + false, /* U+00DD */ + false, /* U+00DE */ + false, /* U+00DF */ + false, /* U+00E0 */ + false, /* U+00E1 */ + false, /* U+00E2 */ + false, /* U+00E3 */ + false, /* U+00E4 */ + false, /* U+00E5 */ + false, /* U+00E6 */ + false, /* U+00E7 */ + false, /* U+00E8 */ + false, /* U+00E9 */ + false, /* U+00EA */ + false, /* U+00EB */ + false, /* U+00EC */ + false, /* U+00ED */ + false, /* U+00EE */ + false, /* U+00EF */ + false, /* U+00F0 */ + false, /* U+00F1 */ + false, /* U+00F2 */ + false, /* U+00F3 */ + false, /* U+00F4 */ + false, /* U+00F5 */ + false, /* U+00F6 */ + false, /* U+00F7 */ + false, /* U+00F8 */ + false, /* U+00F9 */ + false, /* U+00FA */ + false, /* U+00FB */ + false, /* U+00FC */ + false, /* U+00FD */ + false, /* U+00FE */ + false, /* U+00FF */ +}; + diff --git a/intl/icu/source/tools/escapesrc/escapesrc.cpp b/intl/icu/source/tools/escapesrc/escapesrc.cpp new file mode 100644 index 0000000000..10ac3d1aef --- /dev/null +++ b/intl/icu/source/tools/escapesrc/escapesrc.cpp @@ -0,0 +1,427 @@ +// © 2016 and later: Unicode, Inc. and others. 
+// License & terms of use: http://www.unicode.org/copyright.html + +#include <stdio.h> +#include <string> +#include <stdlib.h> +#include <errno.h> +#include <string.h> +#include <iostream> +#include <fstream> + +// We only use U8_* macros, which are entirely inline. +#include "unicode/utf8.h" + +// This contains a codepage and ISO 14882:1998 illegality table. +// Use "make gen-table" to rebuild it. +#include "cptbl.h" + +/** + * What is this? + * + * "This" is a preprocessor that makes an attempt to convert fully valid C++11 source code + * in utf-8 into something consumable by certain compilers (Solaris, xlC) + * which aren't quite standards compliant. + * + * - u"<unicode>" or u'<unicode>' gets converted to u"\uNNNN" or u'\uNNNN' + * - u8"<unicode>" gets converted to "\xAA\xBB\xCC\xDD" etc. + * (some compilers do not support the u8 prefix correctly.) + * - if the system is EBCDIC-based, that is used to correct the input characters. + * + * Usage: + * escapesrc infile.cpp outfile.cpp + * Normally this is invoked by the build stage, with a rule such as: + * + * _%.cpp: $(srcdir)/%.cpp + * @$(BINDIR)/escapesrc$(EXEEXT) $< $@ + * %.o: _%.cpp + * $(COMPILE.cc) ... $@ $< + * + * In the Makefiles, SKIP_ESCAPING=YES is used to prevent escapesrc.cpp + * from being itself escaped. + */ + + +static const char + kSPACE = 0x20, + kTAB = 0x09, + kLF = 0x0A, + kCR = 0x0D; + +// For convenience +# define cp1047_to_8859(c) cp1047_8859_1[c] + +// Our app's name +std::string prog; + +/** + * Give the usual 1-line documentation and exit + */ +void usage() { + fprintf(stderr, "%s: usage: %s infile.cpp outfile.cpp\n", prog.c_str(), prog.c_str()); +} + +/** + * Delete the output file (if any) + * We want to delete even if we didn't generate, because it might be stale. 
+ */ +int cleanup(const std::string &outfile) { + const char *outstr = outfile.c_str(); + if(outstr && *outstr) { + int rc = std::remove(outstr); + if(rc == 0) { + fprintf(stderr, "%s: deleted %s\n", prog.c_str(), outstr); + return 0; + } else { + if( errno == ENOENT ) { + return 0; // File did not exist - no error. + } else { + perror("std::remove"); + return 1; + } + } + } + return 0; +} + +/** + * Skip across any known whitespace. + * @param p startpoint + * @param e limit + * @return first non-whitespace char + */ +inline const char *skipws(const char *p, const char *e) { + for(;p<e;p++) { + switch(*p) { + case kSPACE: + case kTAB: + case kLF: + case kCR: + break; + default: + return p; // non ws + } + } + return p; +} + +/** + * Append a byte, hex encoded + * @param outstr sstring to append to + * @param byte the byte to append + */ +void appendByte(std::string &outstr, + uint8_t byte) { + char tmp2[5]; + snprintf(tmp2, sizeof(tmp2), "\\x%02X", 0xFF & (int)(byte)); + outstr += tmp2; +} + +/** + * Append the bytes from 'linestr' into outstr, with escaping + * @param outstr the output buffer + * @param linestr the input buffer + * @param pos in/out: the current char under consideration + * @param chars the number of chars to consider + * @return true on failure + */ +bool appendUtf8(std::string &outstr, + const std::string &linestr, + size_t &pos, + size_t chars) { + char tmp[9]; + for(size_t i=0;i<chars;i++) { + tmp[i] = linestr[++pos]; + } + tmp[chars] = 0; + unsigned int c; + sscanf(tmp, "%X", &c); + UChar32 ch = c & 0x1FFFFF; + + // now to append \\x%% etc + uint8_t bytesNeeded = U8_LENGTH(ch); + if(bytesNeeded == 0) { + fprintf(stderr, "Illegal code point U+%X\n", ch); + return true; + } + uint8_t bytes[4]; + uint8_t *s = bytes; + size_t i = 0; + U8_APPEND_UNSAFE(s, i, ch); + for(size_t t = 0; t<i; t++) { + appendByte(outstr, s[t]); + } + return false; +} + +/** + * Fixup u8"x" + * @param linestr string to mutate. Already escaped into \u format. 
+ * @param origpos beginning, points to 'u8"' + * @param pos end, points to " + * @return false for no-problem, true for failure! + */ +bool fixu8(std::string &linestr, size_t origpos, size_t &endpos) { + size_t pos = origpos + 3; + std::string outstr; + outstr += '\"'; // local encoding + for(;pos<endpos;pos++) { + char c = linestr[pos]; + if(c == '\\') { + char c2 = linestr[++pos]; + switch(c2) { + case '\'': + case '"': +#if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY) + c2 = cp1047_to_8859(c2); +#endif + appendByte(outstr, c2); + break; + case 'u': + appendUtf8(outstr, linestr, pos, 4); + break; + case 'U': + appendUtf8(outstr, linestr, pos, 8); + break; + } + } else { +#if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY) + c = cp1047_to_8859(c); +#endif + appendByte(outstr, c); + } + } + outstr += ('\"'); + + linestr.replace(origpos, (endpos-origpos+1), outstr); + + return false; // OK +} + +/** + * fix the u"x"/u'x'/u8"x" string at the position + * u8'x' is not supported, sorry. + * @param linestr the input string + * @param pos the position + * @return false = no err, true = had err + */ +bool fixAt(std::string &linestr, size_t pos) { + size_t origpos = pos; + + if(linestr[pos] != 'u') { + fprintf(stderr, "Not a 'u'?"); + return true; + } + + pos++; // past 'u' + + bool utf8 = false; + + if(linestr[pos] == '8') { // u8" + utf8 = true; + pos++; + } + + char quote = linestr[pos]; + + if(quote != '\'' && quote != '\"') { + fprintf(stderr, "Quote is '%c' - not sure what to do.\n", quote); + return true; + } + + if(quote == '\'' && utf8) { + fprintf(stderr, "Cannot do u8'...'\n"); + return true; + } + + pos ++; + + //printf("u%c…%c\n", quote, quote); + + for(; pos < linestr.size(); pos++) { + if(linestr[pos] == quote) { + if(utf8) { + return fixu8(linestr, origpos, pos); // fix u8"..." + } else { + return false; // end of quote + } + } + if(linestr[pos] == '\\') { + pos++; + if(linestr[pos] == quote) continue; // quoted quote + if(linestr[pos] == 'u') continue; // for now ... 
unicode escape + if(linestr[pos] == '\\') continue; + // some other escape… ignore + } else { + size_t old_pos = pos; + int32_t i = pos; +#if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY) + // mogrify 1-4 bytes from 1047 'back' to utf-8 + char old_byte = linestr[pos]; + linestr[pos] = cp1047_to_8859(linestr[pos]); + // how many more? + int32_t trail = U8_COUNT_TRAIL_BYTES(linestr[pos]); + for(size_t pos2 = pos+1; trail>0; pos2++,trail--) { + linestr[pos2] = cp1047_to_8859(linestr[pos2]); + if(linestr[pos2] == 0x0A) { + linestr[pos2] = 0x85; // NL is ambiguous here + } + } +#endif + + // Proceed to decode utf-8 + const uint8_t *s = (const uint8_t*) (linestr.c_str()); + int32_t length = linestr.size(); + UChar32 c; + if(U8_IS_SINGLE((uint8_t)s[i]) && oldIllegal[s[i]]) { +#if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY) + linestr[pos] = old_byte; // put it back +#endif + continue; // single code point not previously legal for \u escaping + } + + // otherwise, convert it to \u / \U + { + U8_NEXT(s, i, length, c); + } + if(c<0) { + fprintf(stderr, "Illegal utf-8 sequence at Column: %d\n", (int)old_pos); + fprintf(stderr, "Line: >>%s<<\n", linestr.c_str()); + return true; + } + + size_t seqLen = (i-pos); + + //printf("U+%04X pos %d [len %d]\n", c, pos, seqLen);fflush(stdout); + + char newSeq[20]; + if( c <= 0xFFFF) { + snprintf(newSeq, sizeof(newSeq), "\\u%04X", c); + } else { + snprintf(newSeq, sizeof(newSeq), "\\U%08X", c); + } + linestr.replace(pos, seqLen, newSeq); + pos += strlen(newSeq) - 1; + } + } + + return false; +} + +/** + * Fixup an entire line + * false = no err + * true = had err + * @param no the line number (not used) + * @param linestr the string to fix + * @return true if any err, else false + */ +bool fixLine(int /*no*/, std::string &linestr) { + const char *line = linestr.c_str(); + size_t len = linestr.size(); + + // no u' in the line? + if(!strstr(line, "u'") && !strstr(line, "u\"") && !strstr(line, "u8\"")) { + return false; // Nothing to do. 
No u' or u" detected + } + + // start from the end and find all u" cases + size_t pos = len = linestr.size(); + if(len>INT32_MAX/2) { + return true; + } + while((pos>0) && (pos = linestr.rfind("u\"", pos)) != std::string::npos) { + //printf("found doublequote at %d\n", pos); + if(fixAt(linestr, pos)) return true; + if(pos == 0) break; + pos--; + } + + // reset and find all u' cases + pos = len = linestr.size(); + while((pos>0) && (pos = linestr.rfind("u'", pos)) != std::string::npos) { + //printf("found singlequote at %d\n", pos); + if(fixAt(linestr, pos)) return true; + if(pos == 0) break; + pos--; + } + + // reset and find all u8" cases + pos = len = linestr.size(); + while((pos>0) && (pos = linestr.rfind("u8\"", pos)) != std::string::npos) { + if(fixAt(linestr, pos)) return true; + if(pos == 0) break; + pos--; + } + + //fprintf(stderr, "%d - fixed\n", no); + return false; +} + +/** + * Convert a whole file + * @param infile + * @param outfile + * @return 1 on err, 0 otherwise + */ +int convert(const std::string &infile, const std::string &outfile) { + fprintf(stderr, "escapesrc: %s -> %s\n", infile.c_str(), outfile.c_str()); + + std::ifstream inf; + + inf.open(infile.c_str(), std::ios::in); + + if(!inf.is_open()) { + fprintf(stderr, "%s: could not open input file %s\n", prog.c_str(), infile.c_str()); + cleanup(outfile); + return 1; + } + + std::ofstream outf; + + outf.open(outfile.c_str(), std::ios::out); + + if(!outf.is_open()) { + fprintf(stderr, "%s: could not open output file %s\n", prog.c_str(), outfile.c_str()); + return 1; + } + + // TODO: any platform variations of #line? 
+ outf << "#line 1 \"" << infile << "\"" << '\n'; + + int no = 0; + std::string linestr; + while( getline( inf, linestr)) { + no++; + if(fixLine(no, linestr)) { + goto fail; + } + outf << linestr << '\n'; + } + + if(inf.eof()) { + return 0; + } +fail: + outf.close(); + fprintf(stderr, "%s:%d: Fixup failed by %s\n", infile.c_str(), no, prog.c_str()); + cleanup(outfile); + return 1; +} + +/** + * Main function + */ +int main(int argc, const char *argv[]) { + prog = argv[0]; + + if(argc != 3) { + usage(); + return 1; + } + + std::string infile = argv[1]; + std::string outfile = argv[2]; + + return convert(infile, outfile); +} diff --git a/intl/icu/source/tools/escapesrc/expect-simple.cpp b/intl/icu/source/tools/escapesrc/expect-simple.cpp new file mode 100644 index 0000000000..a6019a8d40 --- /dev/null +++ b/intl/icu/source/tools/escapesrc/expect-simple.cpp @@ -0,0 +1,17 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html + +u"sa\u0127\u0127a"; +u'\u6587'; +u"\U000219F2"; +u"\u039C\u03C5\u03C3\u03C4\u03AE\u03C1\u03B9\u03BF"; + + u"sa\u0127\u0127a"; + u'\u6587'; u"\U000219F2"; + +"\x20\xCC\x81"; +"\xCC\x88\x20"; +"\x73\x61\xC4\xA7\xC4\xA7\x61"; +"\xE6\x96\x87"; +"\xF0\xA1\xA7\xB2"; +"\x73\x61\xC4\xA7\xC4\xA7\x61"; diff --git a/intl/icu/source/tools/escapesrc/tblgen.cpp b/intl/icu/source/tools/escapesrc/tblgen.cpp new file mode 100644 index 0000000000..dce4af6867 --- /dev/null +++ b/intl/icu/source/tools/escapesrc/tblgen.cpp @@ -0,0 +1,80 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html + +#include "unicode/utypes.h" +#include "unicode/ucnv.h" +#include "unicode/uniset.h" +#include <stdio.h> + +static const char *kConverter = "ibm-1047"; + +int main(int argc, const char *argv[]) { + printf("// %s\n", U_COPYRIGHT_STRING); + printf("// generated by tblgen. 
You weren't going to edit it by hand, were you?\n"); + printf("\n"); + + UErrorCode status = U_ZERO_ERROR; + LocalUConverterPointer cnv(ucnv_open(kConverter, &status)); + + if(U_FAILURE(status)) { + fprintf(stderr, "Failed to open %s: %s\n", kConverter, u_errorName(status)); + return 1; + } + + printf("static const char cp1047_8859_1[256] = { \n"); + for(int i=0x00; i<0x100; i++) { + char cp1047[1]; + cp1047[0] = i; + char16_t u[1]; + char16_t *target = u; + const char *source = cp1047; + ucnv_toUnicode(cnv.getAlias(), &target, u+1, &source, cp1047+1, nullptr, true, &status); + if(U_FAILURE(status)) { + fprintf(stderr, "Conversion failure at #%X: %s\n", i, u_errorName(status)); + return 2; + } + printf(" (char)0x%02X, /* %02X */\n", u[0], i); + } + printf("};\n\n"); + + // + // UnicodeSet oldIllegal("[:print:]", status); // [a-zA-Z0-9_}{#)(><%:;.?*+-/^&|~!=,\\u005b\\u005d\\u005c]", status); + UnicodeSet oldIllegal("[0-9 a-z A-Z " + "_ \\{ \\} \\[ \\] # \\( \\) < > % \\: ; . " + "? * + \\- / \\^ \\& | ~ ! = , \\ \" ' ]", status); + + /* + +http://www.lirmm.fr/~ducour/Doc-objets/ISO+IEC+14882-1998.pdf ( note: 1998 ) page 10, section 2.2 says: + +1 The basic source character set consists of 96 characters: the space character, the control characters repre- 15) +senting horizontal tab, vertical tab, form feed, and new-line, plus the following 91 graphical characters: +a b c d e f g h i j k l m n opqrstuvwxyz +A B C D E F G H I J K L M N OPQRSTUVWXYZ +0 12 3 4 5 6 7 8 9 + _ { } [ ] # ( ) < > % : ; . ?*+-/^&|~!=,\" +2 The universal-character-name construct provides a way to name other characters. 
hex-quad: +hexadecimal-digit hexadecimal-digit hexadecimal-digit hexadecimal-digit +universal-character-name: \u hex-quad +\U hex-quad hex-quad +The character designated by the universal-character-name \UNNNNNNNN is that character whose character short name in ISO/IEC 10646 is NNNNNNNN; the character designated by the universal-character-name \uNNNN is that character whose character short name in ISO/IEC 10646 is 0000NNNN. If the hexadecimal value for a universal character name is less than 0x20 or in the range 0x7F-0x9F (inclusive), or if the uni- versal character name designates a character in the basic source character set, then the program is ill- formed. + + +So basically: printable ASCII plus 0x00-0x1F, 0x7F-0x9F, was all illegal. + +Some discussion at http://unicode.org/mail-arch/unicode-ml/y2003-m10/0471.html + + */ + + + + printf("static const bool oldIllegal[256] = { \n"); + for(char16_t i=0x00; i<0x100;i++) { + printf(" %s, /* U+%04X */\n", + (oldIllegal.contains(i))?" true":"false", + i); + } + printf("};\n\n"); + + return 0; +} diff --git a/intl/icu/source/tools/escapesrc/test-nochange.cpp b/intl/icu/source/tools/escapesrc/test-nochange.cpp new file mode 100644 index 0000000000..8c0d04b809 --- /dev/null +++ b/intl/icu/source/tools/escapesrc/test-nochange.cpp @@ -0,0 +1,5 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html + +// This is a source file with no changes needed in it. +// In fact, the only non-ASCII character is the comment line at top. diff --git a/intl/icu/source/tools/escapesrc/test-simple.cpp b/intl/icu/source/tools/escapesrc/test-simple.cpp new file mode 100644 index 0000000000..b03f28f706 --- /dev/null +++ b/intl/icu/source/tools/escapesrc/test-simple.cpp @@ -0,0 +1,17 @@ +// © 2016 and later: Unicode, Inc. and others. 
+// License & terms of use: http://www.unicode.org/copyright.html + +u"saħħa"; +u'文'; +u"𡧲"; +u"Μυστήριο"; + + u"saħħa"; + u'文'; u"𡧲"; + +u8" \u0301"; +u8"\u0308 "; +u8"saħħa"; +u8"文"; +u8"𡧲"; +u8"saħ\u0127a"; diff --git a/intl/icu/source/tools/genbrk/Makefile.in b/intl/icu/source/tools/genbrk/Makefile.in new file mode 100644 index 0000000000..bcb684eed7 --- /dev/null +++ b/intl/icu/source/tools/genbrk/Makefile.in @@ -0,0 +1,96 @@ +## Makefile.in for ICU - tools/genbrk +## Copyright (C) 2016 and later: Unicode, Inc. and others. +## License & terms of use: http://www.unicode.org/copyright.html +## Copyright (c) 2002-2011 International Business Machines Corporation and +## others. All Rights Reserved. + +## Source directory information +srcdir = @srcdir@ +top_srcdir = @top_srcdir@ + +top_builddir = ../.. + +include $(top_builddir)/icudefs.mk + +## Build directory information +subdir = tools/genbrk + +TARGET_STUB_NAME = genbrk + +SECTION = 1 + +MAN_FILES = $(TARGET_STUB_NAME).$(SECTION) + + +## Extra files to remove for 'make clean' +CLEANFILES = *~ $(DEPS) $(MAN_FILES) + +## Target information +TARGET = $(BINDIR)/$(TARGET_STUB_NAME)$(EXEEXT) + +CPPFLAGS += -I$(top_srcdir)/common -I$(srcdir)/../toolutil +LIBS = $(LIBICUTOOLUTIL) $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS) $(LIB_M) + +SOURCES = $(shell cat $(srcdir)/sources.txt) +OBJECTS = $(SOURCES:.cpp=.o) + +DEPS = $(OBJECTS:.o=.d) + +## List of phony targets +.PHONY : all all-local install install-local clean clean-local \ +distclean distclean-local dist dist-local check check-local install-man + +## Clear suffix list +.SUFFIXES : + +## List of standard targets +all: all-local +install: install-local +clean: clean-local +distclean : distclean-local +dist: dist-local +check: all check-local + +all-local: $(TARGET) $(MAN_FILES) + +install-local: all-local install-man + $(MKINSTALLDIRS) $(DESTDIR)$(bindir) + $(INSTALL) $(TARGET) $(DESTDIR)$(bindir) + +install-man: $(MAN_FILES) + $(MKINSTALLDIRS) 
$(DESTDIR)$(mandir)/man$(SECTION) + $(INSTALL_DATA) $? $(DESTDIR)$(mandir)/man$(SECTION) + +dist-local: + +clean-local: + test -z "$(CLEANFILES)" || $(RMV) $(CLEANFILES) + $(RMV) $(TARGET) $(OBJECTS) + +distclean-local: clean-local + $(RMV) Makefile + +check-local: all-local + +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + cd $(top_builddir) \ + && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status + +$(TARGET) : $(OBJECTS) + $(LINK.cc) $(OUTOPT)$@ $^ $(LIBS) + $(POST_BUILD_STEP) + + +%.$(SECTION): $(srcdir)/%.$(SECTION).in + cd $(top_builddir) \ + && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status + + +ifeq (,$(MAKECMDGOALS)) +-include $(DEPS) +else +ifneq ($(patsubst %clean,,$(MAKECMDGOALS)),) +-include $(DEPS) +endif +endif + diff --git a/intl/icu/source/tools/genbrk/genbrk.1.in b/intl/icu/source/tools/genbrk/genbrk.1.in new file mode 100644 index 0000000000..9b21093960 --- /dev/null +++ b/intl/icu/source/tools/genbrk/genbrk.1.in @@ -0,0 +1,114 @@ +.\" Hey, Emacs! This is -*-nroff-*- you know... +.\" +.\" genbrk.1: manual page for the genbrk utility +.\" +.\" Copyright (C) 2016 and later: Unicode, Inc. and others. 
+.\" License & terms of use: http://www.unicode.org/copyright.html +.\" Copyright (C) 2005-2006 International Business Machines Corporation and others +.\" +.TH GENBRK 1 "2 December 2005" "ICU MANPAGE" "ICU @VERSION@ Manual" +.SH NAME +.B genbrk +\- Compiles ICU break iteration rules source files into binary data files +.SH SYNOPSIS +.B genbrk +[ +.BR "\-h\fP, \fB\-?\fP, \fB\-\-help" +] +[ +.BR "\-V\fP, \fB\-\-version" +] +[ +.BR "\-c\fP, \fB\-\-copyright" +] +[ +.BR "\-v\fP, \fB\-\-verbose" +] +[ +.BI "\-d\fP, \fB\-\-destdir" " destination" +] +[ +.BI "\-i\fP, \fB\-\-icudatadir" " directory" +] +.BI "\-r\fP, \fB\-\-rules" " rule\-file" +.BI "\-o\fP, \fB\-\-out" " output\-file" +.SH DESCRIPTION +.B genbrk +reads the break (boundary) rule source code from +.I rule-file +and creates a break iteration data file. Normally this data file has the +.B .brk +extension. +.PP +The details of the rule syntax can be found in ICU's User Guide. +.SH OPTIONS +.TP +.BR "\-h\fP, \fB\-?\fP, \fB\-\-help" +Print help about usage and exit. +.TP +.BR "\-V\fP, \fB\-\-version" +Print the version of +.B genbrk +and exit. +.TP +.BR "\-c\fP, \fB\-\-copyright" +Embeds the standard ICU copyright into the +.IR output-file . +.TP +.BR "\-v\fP, \fB\-\-verbose" +Display extra informative messages during execution. +.TP +.BI "\-d\fP, \fB\-\-destdir" " destination" +Set the destination directory of the +.IR output-file +to +.IR destination . +.TP +.BI "\-i\fP, \fB\-\-icudatadir" " directory" +Look for any necessary ICU data files in +.IR directory . +For example, the file +.B pnames.icu +must be located when ICU's data is not built as a shared library. +The default ICU data directory is specified by the environment variable +.BR ICU_DATA . +Most configurations of ICU do not require this argument. +.TP +.BI "\-r\fP, \fB\-\-rules" " rule\-file" +The source file to read. +.TP +.BI "\-o\fP, \fB\-\-out" " output\-file" +The output data file to write. 
+.SH CAVEATS +When the +.IR rule-file +contains a byte order mark (BOM) at the beginning of the file, which is the Unicode character +.B U+FEFF, +then the +.IR rule-file +is interpreted as Unicode. Without the BOM, +the file is interpreted in the current operating system default codepage. +In order to eliminate any ambiguity of the encoding for how the +.IR rule-file +was written, it is recommended that you write this file in UTF-8 +with the BOM. +.SH ENVIRONMENT +.TP 10 +.B ICU_DATA +Specifies the directory containing ICU data. Defaults to +.BR @thepkgicudatadir@/@PACKAGE@/@VERSION@/ . +Some tools in ICU depend on the presence of the trailing slash. It is thus +important to make sure that it is present if +.B ICU_DATA +is set. +.SH AUTHORS +George Rhoten +.br +Andy Heninger +.SH VERSION +1.0 +.SH COPYRIGHT +Copyright (C) 2005 International Business Machines Corporation and others +.SH SEE ALSO +.BR http://www.icu-project.org/userguide/boundaryAnalysis.html + diff --git a/intl/icu/source/tools/genbrk/genbrk.cpp b/intl/icu/source/tools/genbrk/genbrk.cpp new file mode 100644 index 0000000000..2b175d459b --- /dev/null +++ b/intl/icu/source/tools/genbrk/genbrk.cpp @@ -0,0 +1,352 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +********************************************************************** +* Copyright (C) 2002-2016, International Business Machines +* Corporation and others. All Rights Reserved. +********************************************************************** +* +* File genbrk.c +*/ + +//-------------------------------------------------------------------- +// +// Tool for generating RuleBasedBreakIterator data files (.brk files). +// .brk files contain the precompiled rules for standard types +// of iterators - word, line, sentence, etc. +// +// Usage: genbrk [options] -r rule-file.txt -o output-file.brk +// +// options: -v verbose +// -? 
or -h help +// +// The input rule file is a plain text file containing break rules +// in the input format accepted by RuleBasedBreakIterators. The +// file can be encoded as utf-8, or utf-16 (either endian), or +// in the default code page (platform dependent.). utf encoded +// files must include a BOM. +// +//-------------------------------------------------------------------- + +#include "unicode/utypes.h" +#include "unicode/ucnv.h" +#include "unicode/unistr.h" +#include "unicode/rbbi.h" +#include "unicode/uclean.h" +#include "unicode/udata.h" +#include "unicode/putil.h" + +#include "uoptions.h" +#include "unewdata.h" +#include "ucmndata.h" +#include "rbbidata.h" +#include "cmemory.h" + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +U_NAMESPACE_USE + +static char *progName; +static UOption options[]={ + UOPTION_HELP_H, /* 0 */ + UOPTION_HELP_QUESTION_MARK, /* 1 */ + UOPTION_VERBOSE, /* 2 */ + { "rules", nullptr, nullptr, nullptr, 'r', UOPT_REQUIRES_ARG, 0 }, /* 3 */ + { "out", nullptr, nullptr, nullptr, 'o', UOPT_REQUIRES_ARG, 0 }, /* 4 */ + UOPTION_ICUDATADIR, /* 5 */ + UOPTION_DESTDIR, /* 6 */ + UOPTION_COPYRIGHT, /* 7 */ + UOPTION_QUIET, /* 8 */ +}; + +void usageAndDie(int retCode) { + printf("Usage: %s [-v] [-options] -r rule-file -o output-file\n", progName); + printf("\tRead in break iteration rules text and write out the binary data\n" + "options:\n" + "\t-h or -? or --help this usage text\n" + "\t-V or --version show a version message\n" + "\t-c or --copyright include a copyright notice\n" + "\t-v or --verbose turn on verbose output\n" + "\t-q or --quiet do not display warnings and progress\n" + "\t-i or --icudatadir directory for locating any needed intermediate data files,\n" + "\t followed by path, defaults to %s\n" + "\t-d or --destdir destination directory, followed by the path\n", + u_getDataDirectory()); + exit (retCode); +} + + +#if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO + +/* dummy UDataInfo cf. 
udata.h */ +static UDataInfo dummyDataInfo = { + sizeof(UDataInfo), + 0, + + U_IS_BIG_ENDIAN, + U_CHARSET_FAMILY, + U_SIZEOF_UCHAR, + 0, + + { 0, 0, 0, 0 }, /* dummy dataFormat */ + { 0, 0, 0, 0 }, /* dummy formatVersion */ + { 0, 0, 0, 0 } /* dummy dataVersion */ +}; + +#else + +// +// Set up the ICU data header, defined in ucmndata.h +// +DataHeader dh ={ + {sizeof(DataHeader), // Struct MappedData + 0xda, + 0x27}, + + { // struct UDataInfo + sizeof(UDataInfo), // size + 0, // reserved + U_IS_BIG_ENDIAN, + U_CHARSET_FAMILY, + U_SIZEOF_UCHAR, + 0, // reserved + + { 0x42, 0x72, 0x6b, 0x20 }, // dataFormat="Brk " + { 0xff, 0, 0, 0 }, // formatVersion. Filled in later with values + // from the RBBI rule builder. The values declared + // here should never appear in any real RBBI data. + { 4, 1, 0, 0 } // dataVersion (Unicode version) + }}; + +#endif + +//---------------------------------------------------------------------------- +// +// main for genbrk +// +//---------------------------------------------------------------------------- +int main(int argc, char **argv) { + UErrorCode status = U_ZERO_ERROR; + const char *ruleFileName; + const char *outFileName; + const char *outDir = nullptr; + const char *copyright = nullptr; + + // + // Pick up and check the command line arguments, + // using the standard ICU tool utils option handling. + // + U_MAIN_INIT_ARGS(argc, argv); + progName = argv[0]; + argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options); + if(argc<0) { + // Unrecognized option + fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]); + usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); + } + + if(options[0].doesOccur || options[1].doesOccur) { + // -? or -h for help. 
+ usageAndDie(0); + } + + if (!(options[3].doesOccur && options[4].doesOccur)) { + fprintf(stderr, "rule file and output file must both be specified.\n"); + usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); + } + ruleFileName = options[3].value; + outFileName = options[4].value; + + if (options[5].doesOccur) { + u_setDataDirectory(options[5].value); + } + + status = U_ZERO_ERROR; + + /* Combine the directory with the file name */ + if(options[6].doesOccur) { + outDir = options[6].value; + } + if (options[7].doesOccur) { + copyright = U_COPYRIGHT_STRING; + } + +#if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO + + UNewDataMemory *pData; + char msg[1024]; + + /* write message with just the name */ + snprintf(msg, sizeof(msg), "genbrk writes dummy %s because of UCONFIG_NO_BREAK_ITERATION and/or UCONFIG_NO_FILE_IO, see uconfig.h", outFileName); + fprintf(stderr, "%s\n", msg); + + /* write the dummy data file */ + pData = udata_create(outDir, nullptr, outFileName, &dummyDataInfo, nullptr, &status); + udata_writeBlock(pData, msg, strlen(msg)); + udata_finish(pData, &status); + return (int)status; + +#else + /* Initialize ICU */ + u_init(&status); + if (U_FAILURE(status)) { + fprintf(stderr, "%s: can not initialize ICU. 
status = %s\n", + argv[0], u_errorName(status)); + exit(1); + } + status = U_ZERO_ERROR; + + // + // Read in the rule source file + // + long result; + long ruleFileSize; + FILE *file; + char *ruleBufferC; + + file = fopen(ruleFileName, "rb"); + if( file == 0 ) { + fprintf(stderr, "Could not open file \"%s\"\n", ruleFileName); + exit(-1); + } + fseek(file, 0, SEEK_END); + ruleFileSize = ftell(file); + fseek(file, 0, SEEK_SET); + ruleBufferC = new char[ruleFileSize+10]; + + result = (long)fread(ruleBufferC, 1, ruleFileSize, file); + if (result != ruleFileSize) { + fprintf(stderr, "Error reading file \"%s\"\n", ruleFileName); + exit (-1); + } + ruleBufferC[ruleFileSize]=0; + fclose(file); + + // + // Look for a Unicode Signature (BOM) on the rule file + // + int32_t signatureLength; + const char * ruleSourceC = ruleBufferC; + const char* encoding = ucnv_detectUnicodeSignature( + ruleSourceC, ruleFileSize, &signatureLength, &status); + if (U_FAILURE(status)) { + exit(status); + } + if(encoding!=nullptr ){ + ruleSourceC += signatureLength; + ruleFileSize -= signatureLength; + } + + // + // Open a converter to take the rule file to UTF-16 + // + UConverter* conv; + conv = ucnv_open(encoding, &status); + if (U_FAILURE(status)) { + fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status)); + exit(status); + } + + // + // Convert the rules to char16_t. + // Preflight first to determine required buffer size. 
+ // + uint32_t destCap = ucnv_toUChars(conv, + nullptr, // dest, + 0, // destCapacity, + ruleSourceC, + ruleFileSize, + &status); + if (status != U_BUFFER_OVERFLOW_ERROR) { + fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); + exit(status); + } + + status = U_ZERO_ERROR; + char16_t *ruleSourceU = new char16_t[destCap+1]; + ucnv_toUChars(conv, + ruleSourceU, // dest, + destCap+1, + ruleSourceC, + ruleFileSize, + &status); + if (U_FAILURE(status)) { + fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); + exit(status); + } + ucnv_close(conv); + + + // + // Put the source rules into a UnicodeString + // + UnicodeString ruleSourceS(false, ruleSourceU, destCap); + + // + // Create the break iterator from the rules + // This will compile the rules. + // + UParseError parseError; + parseError.line = 0; + parseError.offset = 0; + RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(ruleSourceS, parseError, status); + if (U_FAILURE(status)) { + fprintf(stderr, "createRuleBasedBreakIterator: ICU Error \"%s\" at line %d, column %d\n", + u_errorName(status), (int)parseError.line, (int)parseError.offset); + exit(status); + } + + + // + // Get the compiled rule data from the break iterator. + // + uint32_t outDataSize; + const uint8_t *outData; + outData = bi->getBinaryRules(outDataSize); + + // Copy the data format version numbers from the RBBI data header into the UDataMemory header. + uprv_memcpy(dh.info.formatVersion, ((RBBIDataHeader *)outData)->fFormatVersion, sizeof(dh.info.formatVersion)); + + // + // Create the output file + // + size_t bytesWritten; + UNewDataMemory *pData; + pData = udata_create(outDir, nullptr, outFileName, &(dh.info), copyright, &status); + if(U_FAILURE(status)) { + fprintf(stderr, "genbrk: Could not open output file \"%s\", \"%s\"\n", + outFileName, u_errorName(status)); + exit(status); + } + + + // Write the data itself. 
+ udata_writeBlock(pData, outData, outDataSize); + // finish up + bytesWritten = udata_finish(pData, &status); + if(U_FAILURE(status)) { + fprintf(stderr, "genbrk: error %d writing the output file\n", status); + exit(status); + } + + if (bytesWritten != outDataSize) { + fprintf(stderr, "Error writing to output file \"%s\"\n", outFileName); + exit(-1); + } + + delete bi; + delete[] ruleSourceU; + delete[] ruleBufferC; + u_cleanup(); + + + if(!options[8].doesOccur) { + printf("genbrk: tool completed successfully.\n"); + } + return 0; + +#endif /* #if !UCONFIG_NO_BREAK_ITERATION */ +} + diff --git a/intl/icu/source/tools/genbrk/genbrk.vcxproj b/intl/icu/source/tools/genbrk/genbrk.vcxproj new file mode 100644 index 0000000000..44cb00ed8f --- /dev/null +++ b/intl/icu/source/tools/genbrk/genbrk.vcxproj @@ -0,0 +1,80 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <PropertyGroup Label="Globals"> + <ProjectGuid>{C2BE5000-7501-4E87-9724-B8D82494FAE6}</ProjectGuid> + </PropertyGroup> + <PropertyGroup Label="Configuration"> + <ConfigurationType>Application</ConfigurationType> + <UseOfMfc>false</UseOfMfc> + <CharacterSet>MultiByte</CharacterSet> + </PropertyGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> + <!-- The following import will include the 'default' configuration options for VS projects. --> + <Import Project="..\..\allinone\Build.Windows.ProjectConfiguration.props" /> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> + <ImportGroup Label="ExtensionSettings"> + </ImportGroup> + <PropertyGroup Label="UserMacros" /> + <PropertyGroup> + <_ProjectFileVersion>10.0.30319.1</_ProjectFileVersion> + <OutDir>.\$(Platform)\$(Configuration)\</OutDir> + <IntDir>.\$(Platform)\$(Configuration)\</IntDir> + <!-- The ICU projects use "Win32" to mean "x86", so we need to special case it. 
--> + <OutDir Condition="'$(Platform)'=='Win32'">.\x86\$(Configuration)\</OutDir> + <IntDir Condition="'$(Platform)'=='Win32'">.\x86\$(Configuration)\</IntDir> + <!-- Disable Incremental Linking for Release builds as it prevents Link-time Code Generation --> + <LinkIncremental Condition="'$(Configuration)'=='Debug'">true</LinkIncremental> + <LinkIncremental Condition="'$(Configuration)'=='Release'">false</LinkIncremental> + </PropertyGroup> + <!-- Options that are common to *all* configurations --> + <ItemDefinitionGroup> + <Midl> + <TypeLibraryName>$(OutDir)/genbrk.tlb</TypeLibraryName> + </Midl> + <ClCompile> + <WarningLevel>Level3</WarningLevel> + <CompileAs>Default</CompileAs> + <DisableLanguageExtensions>false</DisableLanguageExtensions> + <AdditionalIncludeDirectories>..\..\common;..\toolutil;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> + <PrecompiledHeaderOutputFile>$(OutDir)/genbrk.pch</PrecompiledHeaderOutputFile> + <AssemblerListingLocation>$(OutDir)/</AssemblerListingLocation> + <ObjectFileName>$(OutDir)/</ObjectFileName> + <ProgramDataBaseFileName>$(OutDir)/genbrk.pdb</ProgramDataBaseFileName> + </ClCompile> + <Link> + <SubSystem>Console</SubSystem> + <OutputFile>$(OutDir)/genbrk.exe</OutputFile> + <AdditionalLibraryDirectories>..\..\..\$(IcuLibOutputDir);%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories> + </Link> + <CustomBuildStep> + <Command>copy "$(TargetPath)" ..\..\..\$(IcuBinOutputDir)</Command> + <Outputs>..\..\..\$(IcuBinOutputDir)\$(TargetFileName);%(Outputs)</Outputs> + </CustomBuildStep> + </ItemDefinitionGroup> + <!-- Options that are common to all 'Debug' project configurations --> + <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'"> + <ClCompile> + <BrowseInformation>true</BrowseInformation> + <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary> + </ClCompile> + <Link> + <AdditionalDependencies>icuucd.lib;icutud.lib;%(AdditionalDependencies)</AdditionalDependencies> + </Link> + 
</ItemDefinitionGroup> + <!-- Options that are common to all 'Release' project configurations --> + <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'"> + <ClCompile> + <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary> + <FunctionLevelLinking>true</FunctionLevelLinking> + </ClCompile> + <Link> + <AdditionalDependencies>icuuc.lib;icutu.lib;%(AdditionalDependencies)</AdditionalDependencies> + </Link> + </ItemDefinitionGroup> + <ItemGroup> + <ClCompile Include="genbrk.cpp" /> + </ItemGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> + <ImportGroup Label="ExtensionTargets"> + </ImportGroup> +</Project>
\ No newline at end of file diff --git a/intl/icu/source/tools/genbrk/genbrk.vcxproj.filters b/intl/icu/source/tools/genbrk/genbrk.vcxproj.filters new file mode 100644 index 0000000000..cfa644e39c --- /dev/null +++ b/intl/icu/source/tools/genbrk/genbrk.vcxproj.filters @@ -0,0 +1,22 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <ItemGroup> + <Filter Include="Source Files"> + <UniqueIdentifier>{82d916a5-20c7-4274-ae1e-2af434b33866}</UniqueIdentifier> + <Extensions>cpp;c;cxx;rc;def;r;odl;idl;hpj;bat</Extensions> + </Filter> + <Filter Include="Header Files"> + <UniqueIdentifier>{1e6ae8a2-19da-42f9-a533-f032edd29aa9}</UniqueIdentifier> + <Extensions>h;hpp;hxx;hm;inl</Extensions> + </Filter> + <Filter Include="Resource Files"> + <UniqueIdentifier>{1034b176-e390-4db1-bb83-307d3f9924b5}</UniqueIdentifier> + <Extensions>ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe</Extensions> + </Filter> + </ItemGroup> + <ItemGroup> + <ClCompile Include="genbrk.cpp"> + <Filter>Source Files</Filter> + </ClCompile> + </ItemGroup> +</Project>
\ No newline at end of file diff --git a/intl/icu/source/tools/genbrk/sources.txt b/intl/icu/source/tools/genbrk/sources.txt new file mode 100644 index 0000000000..a750aebb35 --- /dev/null +++ b/intl/icu/source/tools/genbrk/sources.txt @@ -0,0 +1 @@ +genbrk.cpp diff --git a/intl/icu/source/tools/genccode/Makefile.in b/intl/icu/source/tools/genccode/Makefile.in new file mode 100644 index 0000000000..6a1bab571b --- /dev/null +++ b/intl/icu/source/tools/genccode/Makefile.in @@ -0,0 +1,97 @@ +## Makefile.in for ICU - tools/genccode +## Copyright (C) 2016 and later: Unicode, Inc. and others. +## License & terms of use: http://www.unicode.org/copyright.html +## Copyright (c) 1999-2011, International Business Machines Corporation and +## others. All Rights Reserved. +## Steven R. Loomis + +## Source directory information +srcdir = @srcdir@ +top_srcdir = @top_srcdir@ + +top_builddir = ../.. + +include $(top_builddir)/icudefs.mk + +## Build directory information +subdir = tools/genccode + +TARGET_STUB_NAME = genccode + +SECTION = 8 + +MAN_FILES = $(TARGET_STUB_NAME).$(SECTION) + + +## Extra files to remove for 'make clean' +CLEANFILES = *~ $(DEPS) $(MAN_FILES) + +## Target information +TARGET = $(BINDIR)/$(TARGET_STUB_NAME)$(EXEEXT) + +CPPFLAGS += -I$(top_srcdir)/common -I$(srcdir)/../toolutil +LIBS = $(LIBICUTOOLUTIL) $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS) $(LIB_M) + +SOURCES = $(shell cat $(srcdir)/sources.txt) +OBJECTS = $(SOURCES:.c=.o) + +DEPS = $(OBJECTS:.o=.d) + +## List of phony targets +.PHONY : all all-local install install-local clean clean-local \ +distclean distclean-local dist dist-local check check-local install-man + +## Clear suffix list +.SUFFIXES : + +## List of standard targets +all: all-local +install: install-local +clean: clean-local +distclean : distclean-local +dist: dist-local +check: all check-local + +all-local: $(TARGET) $(MAN_FILES) + +install-local: all-local install-man + $(MKINSTALLDIRS) $(DESTDIR)$(sbindir) + $(INSTALL) $(TARGET) 
$(DESTDIR)$(sbindir) + +install-man: $(MAN_FILES) + $(MKINSTALLDIRS) $(DESTDIR)$(mandir)/man$(SECTION) + $(INSTALL_DATA) $? $(DESTDIR)$(mandir)/man$(SECTION) + +dist-local: + +clean-local: + test -z "$(CLEANFILES)" || $(RMV) $(CLEANFILES) + $(RMV) $(TARGET) $(OBJECTS) + +distclean-local: clean-local + $(RMV) Makefile + +check-local: all-local + +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + cd $(top_builddir) \ + && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status + +$(TARGET) : $(OBJECTS) + $(LINK.cc) $(OUTOPT)$@ $^ $(LIBS) + $(POST_BUILD_STEP) + + +%.$(SECTION): $(srcdir)/%.$(SECTION).in + cd $(top_builddir) \ + && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status + + +ifeq (,$(MAKECMDGOALS)) +-include $(DEPS) +else +ifneq ($(patsubst %clean,,$(MAKECMDGOALS)),) +-include $(DEPS) +endif +endif + diff --git a/intl/icu/source/tools/genccode/genccode.8.in b/intl/icu/source/tools/genccode/genccode.8.in new file mode 100644 index 0000000000..f39980db70 --- /dev/null +++ b/intl/icu/source/tools/genccode/genccode.8.in @@ -0,0 +1,108 @@ +.\" Hey, Emacs! This is -*-nroff-*- you know... +.\" +.\" genccode.8: manual page for the genccode utility +.\" +.\" Copyright (C) 2016 and later: Unicode, Inc. and others. +.\" License & terms of use: http://www.unicode.org/copyright.html +.\" Copyright (C) 2003-2004 IBM, Inc. and others. +.\" +.TH GENCCODE 8 "11 March 2004" "ICU MANPAGE" "ICU @VERSION@ Manual" +.SH NAME +.B genccode +\- generate C or platform specific assembly code from an ICU data file. +.SH SYNOPSIS +.B genccode +[ +.BR "\-h\fP, \fB\-?\fP, \fB\-\-help" +] +[ +.BI "\-a\fP, \fB\-\-assembly" " name" +] +[ +.BI "\-d\fP, \fB\-\-destdir" " destination" +] +[ +.BI "\-n\fP, \fB\-\-name" " name" +] +[ +.BI "\-e\fP, \fB\-\-entrypoint" " name" +] +[ +.BI "\-f\fP, \fB\-\-filename" " name" +] +[ +.IR filename " .\|.\|."
+] +.SH DESCRIPTION +.B genccode +reads each of the supplied +.I filename +and writes out a C file containing a compilable definition of the data in +the data file. +The C file name is made by taking the base name of the data +.IR filename , +replacing dots by underscores, and adding a +.I .c +file extension. +.PP +If the \fB-a\fP option is used, platform specific assembly +code is generated instead of C code. +Most C compilers will accept both C and assembly files. +Instead of writing a filename with a +.I .c +file extension, a filename with a +.I .s +will be written instead. +.PP +If +.B genccode +is called with no +.I filename +it terminates gracefully. +.SH OPTIONS +.TP +.BR "\-h\fP, \fB\-?\fP, \fB\-\-help" +Print help about usage and exit. +.TP +.BI "\-a\fP, \fB\-\-assembly" " name" +Output assembly code instead of C code. +Use \fB-h\fP to see the list of available types of assembly to generate and +to specify for this option. +.TP +.BI "\-d\fP, \fB\-\-destdir" " destination" +Set the destination directory to +.IR destination . +The default destination directory is the current directory. +.TP +.BI "\-n\fP, \fB\-\-name" " name" +Set the data name to +.I name +instead of the default. This name is also used as the base name of the +output. The default name is made of the +.I icudt +prefix, followed by a two-digit version number corresponding to +the current version of the ICU release, and a single letter indicating +the endianness of the data (the letter +.I b +indicates big endian data, and the letter +.I l +indicates little endian ones). +.TP +.BI "\-f\fP, \fB\-\-filename" " name" +Normally, an ICU data file such as mydata.icu will be turned into mydata_icu.c and mydata_icu.o. +However, if this parameter was set to "somedata", the output files will be somedata.o and +somedata.c, respectively. +.TP +.BI "\-e\fP, \fB\-\-entrypoint" " name" +Set the data entry point (used for linking against the data in a +shared library form) to +.IR name .
+The default entry point name is made of the data (set by the +.BI "\-n\fP, \fB\-\-name" +option) followed by an underscore and the type of the data (set by the +.BI "\-t\fP, \fB\-\-type" +option). +.SH VERSION +@VERSION@ +.SH COPYRIGHT +Copyright (C) 2000-2004 IBM, Inc. and others. diff --git a/intl/icu/source/tools/genccode/genccode.c b/intl/icu/source/tools/genccode/genccode.c new file mode 100644 index 0000000000..9fb7dbcdf2 --- /dev/null +++ b/intl/icu/source/tools/genccode/genccode.c @@ -0,0 +1,214 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* + ******************************************************************************* + * Copyright (C) 1999-2016, International Business Machines + * Corporation and others. All Rights Reserved. + ******************************************************************************* + * file name: genccode.c + * encoding: UTF-8 + * tab size: 8 (not used) + * indentation:4 + * + * created on: 1999nov01 + * created by: Markus W. Scherer + * + * This program reads a binary file and creates a C source code file + * with a byte array that contains the data of the binary file. + * + * 12/09/1999 weiv Added multiple file handling + */ + +#include "unicode/utypes.h" + +#if U_PLATFORM_HAS_WIN32_API +# define VC_EXTRALEAN +# define WIN32_LEAN_AND_MEAN +# define NOUSER +# define NOSERVICE +# define NOIME +# define NOMCX +#include <windows.h> +#include <time.h> +#endif + +#if U_PLATFORM_IS_LINUX_BASED && U_HAVE_ELF_H +# define U_ELF +#endif + +#ifdef U_ELF +# include <elf.h> +# if defined(ELFCLASS64) +# define U_ELF64 +# endif + /* Old elf.h headers may not have EM_X86_64, or have EM_X8664 instead.
*/ +# ifndef EM_X86_64 +# define EM_X86_64 62 +# endif +# define ICU_ENTRY_OFFSET 0 +#endif + +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include "unicode/putil.h" +#include "cmemory.h" +#include "cstring.h" +#include "filestrm.h" +#include "toolutil.h" +#include "unicode/uclean.h" +#include "uoptions.h" +#include "pkg_genc.h" + +enum { + kOptHelpH = 0, + kOptHelpQuestionMark, + kOptDestDir, + kOptQuiet, + kOptName, + kOptEntryPoint, +#ifdef CAN_GENERATE_OBJECTS + kOptObject, + kOptMatchArch, + kOptSkipDllExport, +#endif + kOptFilename, + kOptAssembly +}; + +static UOption options[]={ +/*0*/UOPTION_HELP_H, + UOPTION_HELP_QUESTION_MARK, + UOPTION_DESTDIR, + UOPTION_QUIET, + UOPTION_DEF("name", 'n', UOPT_REQUIRES_ARG), + UOPTION_DEF("entrypoint", 'e', UOPT_REQUIRES_ARG), +#ifdef CAN_GENERATE_OBJECTS +/*6*/UOPTION_DEF("object", 'o', UOPT_NO_ARG), + UOPTION_DEF("match-arch", 'm', UOPT_REQUIRES_ARG), + UOPTION_DEF("skip-dll-export", '\0', UOPT_NO_ARG), +#endif + UOPTION_DEF("filename", 'f', UOPT_REQUIRES_ARG), + UOPTION_DEF("assembly", 'a', UOPT_REQUIRES_ARG) +}; + +#define CALL_WRITECCODE 'c' +#define CALL_WRITEASSEMBLY 'a' +#define CALL_WRITEOBJECT 'o' +extern int +main(int argc, char* argv[]) { + UBool verbose = true; + char writeCode; + + U_MAIN_INIT_ARGS(argc, argv); + + options[kOptDestDir].value = "."; + + /* read command line options */ + argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options); + + /* error handling, printing usage message */ + if(argc<0) { + fprintf(stderr, + "error in command line argument \"%s\"\n", + argv[-argc]); + } + if(argc<0 || options[kOptHelpH].doesOccur || options[kOptHelpQuestionMark].doesOccur) { + fprintf(stderr, + "usage: %s [-options] filename1 filename2 ...\n" + "\tread each binary input file and \n" + "\tcreate a .c file with a byte array that contains the input file's data\n" + "options:\n" + "\t-h or -? 
or --help this usage text\n" + "\t-d or --destdir destination directory, followed by the path\n" + "\t-q or --quiet do not display warnings and progress\n" + "\t-n or --name symbol prefix, followed by the prefix\n" + "\t-e or --entrypoint entry point name, followed by the name (_dat will be appended)\n" + "\t-r or --revision Specify a version\n" + , argv[0]); +#ifdef CAN_GENERATE_OBJECTS + fprintf(stderr, + "\t-o or --object write a .obj file instead of .c\n" + "\t-m or --match-arch file.o match the architecture (CPU, 32/64 bits) of the specified .o\n" + "\t ELF format defaults to i386. Windows defaults to the native platform.\n" + "\t--skip-dll-export Don't export the ICU data entry point symbol (for use when statically linking)\n"); +#endif + fprintf(stderr, + "\t-f or --filename Specify an alternate base filename. (default: symbolname_typ)\n" + "\t-a or --assembly Create assembly file. (possible values are: "); + + printAssemblyHeadersToStdErr(); + } else { + const char *message, *filename; + /* TODO: remove void (*writeCode)(const char *, const char *); */ + + if(options[kOptAssembly].doesOccur) { + message="generating assembly code for %s\n"; + writeCode = CALL_WRITEASSEMBLY; + /* TODO: remove writeCode=&writeAssemblyCode; */ + + if (!checkAssemblyHeaderName(options[kOptAssembly].value)) { + fprintf(stderr, + "Assembly type \"%s\" is unknown.\n", options[kOptAssembly].value); + return -1; + } + } +#ifdef CAN_GENERATE_OBJECTS + else if(options[kOptObject].doesOccur) { + message="generating object code for %s\n"; + writeCode = CALL_WRITEOBJECT; + /* TODO: remove writeCode=&writeObjectCode; */ + } +#endif + else + { + message="generating C code for %s\n"; + writeCode = CALL_WRITECCODE; + /* TODO: remove writeCode=&writeCCode; */ + } + if (options[kOptQuiet].doesOccur) { + verbose = false; + } + while(--argc) { + filename=getLongPathname(argv[argc]); + if (verbose) { + fprintf(stdout, message, filename); + } + + switch (writeCode) { + case CALL_WRITECCODE: + 
writeCCode(filename, options[kOptDestDir].value, + options[kOptEntryPoint].doesOccur ? options[kOptEntryPoint].value : NULL, + options[kOptName].doesOccur ? options[kOptName].value : NULL, + options[kOptFilename].doesOccur ? options[kOptFilename].value : NULL, + NULL, + 0); + break; + case CALL_WRITEASSEMBLY: + writeAssemblyCode(filename, options[kOptDestDir].value, + options[kOptEntryPoint].doesOccur ? options[kOptEntryPoint].value : NULL, + options[kOptFilename].doesOccur ? options[kOptFilename].value : NULL, + NULL, + 0); + break; +#ifdef CAN_GENERATE_OBJECTS + case CALL_WRITEOBJECT: + writeObjectCode(filename, options[kOptDestDir].value, + options[kOptEntryPoint].doesOccur ? options[kOptEntryPoint].value : NULL, + options[kOptMatchArch].doesOccur ? options[kOptMatchArch].value : NULL, + options[kOptFilename].doesOccur ? options[kOptFilename].value : NULL, + NULL, + 0, + !options[kOptSkipDllExport].doesOccur); + break; +#endif + default: + /* Should never occur. */ + break; + } + /* TODO: remove writeCode(filename, options[kOptDestDir].value); */ + } + } + + return 0; +} diff --git a/intl/icu/source/tools/genccode/genccode.vcxproj b/intl/icu/source/tools/genccode/genccode.vcxproj new file mode 100644 index 0000000000..aad028f8c7 --- /dev/null +++ b/intl/icu/source/tools/genccode/genccode.vcxproj @@ -0,0 +1,80 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <PropertyGroup Label="Globals"> + <ProjectGuid>{FDD3C4F2-9805-44EB-9A77-BC1C1C95B547}</ProjectGuid> + </PropertyGroup> + <PropertyGroup Label="Configuration"> + <ConfigurationType>Application</ConfigurationType> + <UseOfMfc>false</UseOfMfc> + <CharacterSet>MultiByte</CharacterSet> + </PropertyGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> + <!-- The following import will include the 'default' configuration options for VS projects. 
--> + <Import Project="..\..\allinone\Build.Windows.ProjectConfiguration.props" /> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> + <ImportGroup Label="ExtensionSettings"> + </ImportGroup> + <PropertyGroup Label="UserMacros" /> + <PropertyGroup> + <_ProjectFileVersion>10.0.30319.1</_ProjectFileVersion> + <OutDir>.\$(Platform)\$(Configuration)\</OutDir> + <IntDir>.\$(Platform)\$(Configuration)\</IntDir> + <!-- The ICU projects use "Win32" to mean "x86", so we need to special case it. --> + <OutDir Condition="'$(Platform)'=='Win32'">.\x86\$(Configuration)\</OutDir> + <IntDir Condition="'$(Platform)'=='Win32'">.\x86\$(Configuration)\</IntDir> + <!-- Disable Incremental Linking for Release builds as it prevents Link-time Code Generation --> + <LinkIncremental Condition="'$(Configuration)'=='Debug'">true</LinkIncremental> + <LinkIncremental Condition="'$(Configuration)'=='Release'">false</LinkIncremental> + </PropertyGroup> + <!-- Options that are common to *all* configurations --> + <ItemDefinitionGroup> + <Midl> + <TypeLibraryName>$(OutDir)/genccode.tlb</TypeLibraryName> + </Midl> + <ClCompile> + <WarningLevel>Level3</WarningLevel> + <CompileAs>Default</CompileAs> + <DisableLanguageExtensions>false</DisableLanguageExtensions> + <AdditionalIncludeDirectories>..\..\common;..\toolutil;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> + <PrecompiledHeaderOutputFile>$(OutDir)/genccode.pch</PrecompiledHeaderOutputFile> + <AssemblerListingLocation>$(OutDir)/</AssemblerListingLocation> + <ObjectFileName>$(OutDir)/</ObjectFileName> + <ProgramDataBaseFileName>$(OutDir)/genccode.pdb</ProgramDataBaseFileName> + </ClCompile> + <Link> + <SubSystem>Console</SubSystem> + <OutputFile>$(OutDir)/genccode.exe</OutputFile> + <AdditionalLibraryDirectories>..\..\..\$(IcuLibOutputDir);%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories> + </Link> + <CustomBuildStep> + <Command>copy "$(TargetPath)" ..\..\..\$(IcuBinOutputDir)</Command> + 
<Outputs>..\..\..\$(IcuBinOutputDir)\$(TargetFileName);%(Outputs)</Outputs> + </CustomBuildStep> + </ItemDefinitionGroup> + <!-- Options that are common to all 'Debug' project configurations --> + <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'"> + <ClCompile> + <BrowseInformation>true</BrowseInformation> + <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary> + </ClCompile> + <Link> + <AdditionalDependencies>icuucd.lib;icutud.lib;%(AdditionalDependencies)</AdditionalDependencies> + </Link> + </ItemDefinitionGroup> + <!-- Options that are common to all 'Release' project configurations --> + <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'"> + <ClCompile> + <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary> + <FunctionLevelLinking>true</FunctionLevelLinking> + </ClCompile> + <Link> + <AdditionalDependencies>icuuc.lib;icutu.lib;%(AdditionalDependencies)</AdditionalDependencies> + </Link> + </ItemDefinitionGroup> + <ItemGroup> + <ClCompile Include="genccode.c" /> + </ItemGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> + <ImportGroup Label="ExtensionTargets"> + </ImportGroup> +</Project>
\ No newline at end of file diff --git a/intl/icu/source/tools/genccode/genccode.vcxproj.filters b/intl/icu/source/tools/genccode/genccode.vcxproj.filters new file mode 100644 index 0000000000..87d4e05025 --- /dev/null +++ b/intl/icu/source/tools/genccode/genccode.vcxproj.filters @@ -0,0 +1,22 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <ItemGroup> + <Filter Include="Source Files"> + <UniqueIdentifier>{e71f343e-4f13-4d9f-a050-57e1a45e1d0e}</UniqueIdentifier> + <Extensions>cpp;c;cxx;rc;def;r;odl;idl;hpj;bat</Extensions> + </Filter> + <Filter Include="Header Files"> + <UniqueIdentifier>{e7ef9b3f-40af-4fb6-a566-a733e6b718ee}</UniqueIdentifier> + <Extensions>h;hpp;hxx;hm;inl</Extensions> + </Filter> + <Filter Include="Resource Files"> + <UniqueIdentifier>{a2128bc1-f7fe-41cb-bbbb-c298cb2b91db}</UniqueIdentifier> + <Extensions>ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe</Extensions> + </Filter> + </ItemGroup> + <ItemGroup> + <ClCompile Include="genccode.c"> + <Filter>Source Files</Filter> + </ClCompile> + </ItemGroup> +</Project>
\ No newline at end of file diff --git a/intl/icu/source/tools/genccode/sources.txt b/intl/icu/source/tools/genccode/sources.txt new file mode 100644 index 0000000000..d5eb02587e --- /dev/null +++ b/intl/icu/source/tools/genccode/sources.txt @@ -0,0 +1 @@ +genccode.c diff --git a/intl/icu/source/tools/gencfu/Makefile.in b/intl/icu/source/tools/gencfu/Makefile.in new file mode 100644 index 0000000000..6cd8e418db --- /dev/null +++ b/intl/icu/source/tools/gencfu/Makefile.in @@ -0,0 +1,96 @@ +## Makefile.in for ICU - tools/gencfu +## Copyright (C) 2016 and later: Unicode, Inc. and others. +## License & terms of use: http://www.unicode.org/copyright.html +## Copyright (c) 2009-2012 International Business Machines Corporation and +## others. All Rights Reserved. + +## Source directory information +srcdir = @srcdir@ +top_srcdir = @top_srcdir@ + +top_builddir = ../.. + +include $(top_builddir)/icudefs.mk + +## Build directory information +subdir = tools/gencfu + +TARGET_STUB_NAME = gencfu + +SECTION = 1 + +MAN_FILES = $(TARGET_STUB_NAME).$(SECTION) + + +## Extra files to remove for 'make clean' +CLEANFILES = *~ $(DEPS) $(MAN_FILES) + +## Target information +TARGET = $(BINDIR)/$(TARGET_STUB_NAME)$(EXEEXT) + +CPPFLAGS += -I$(top_srcdir)/common -I$(top_srcdir)/i18n -I$(srcdir)/../toolutil +LIBS = $(LIBICUTOOLUTIL) $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS) $(LIB_M) + +SOURCES = $(shell cat $(srcdir)/sources.txt) +OBJECTS = $(SOURCES:.cpp=.o) + +DEPS = $(OBJECTS:.o=.d) + +## List of phony targets +.PHONY : all all-local install install-local clean clean-local \ +distclean distclean-local dist dist-local check check-local install-man + +## Clear suffix list +.SUFFIXES : + +## List of standard targets +all: all-local +install: install-local +clean: clean-local +distclean : distclean-local +dist: dist-local +check: all check-local + +all-local: $(TARGET) $(MAN_FILES) + +install-local: all-local install-man + $(MKINSTALLDIRS) $(DESTDIR)$(bindir) + $(INSTALL) $(TARGET) 
$(DESTDIR)$(bindir) + +install-man: $(MAN_FILES) + $(MKINSTALLDIRS) $(DESTDIR)$(mandir)/man$(SECTION) + $(INSTALL_DATA) $? $(DESTDIR)$(mandir)/man$(SECTION) + +dist-local: + +clean-local: + test -z "$(CLEANFILES)" || $(RMV) $(CLEANFILES) + $(RMV) $(TARGET) $(OBJECTS) + +distclean-local: clean-local + $(RMV) Makefile + +check-local: all-local + +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + cd $(top_builddir) \ + && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status + +$(TARGET) : $(OBJECTS) + $(LINK.cc) $(OUTOPT)$@ $^ $(LIBS) + $(POST_BUILD_STEP) + + +%.$(SECTION): $(srcdir)/%.$(SECTION).in + cd $(top_builddir) \ + && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status + + +ifeq (,$(MAKECMDGOALS)) +-include $(DEPS) +else +ifneq ($(patsubst %clean,,$(MAKECMDGOALS)),) +-include $(DEPS) +endif +endif + diff --git a/intl/icu/source/tools/gencfu/gencfu.1.in b/intl/icu/source/tools/gencfu/gencfu.1.in new file mode 100644 index 0000000000..2b3240b2ab --- /dev/null +++ b/intl/icu/source/tools/gencfu/gencfu.1.in @@ -0,0 +1,93 @@ +.\" Hey, Emacs! This is -*-nroff-*- you know... +.\" +.\" gencfu.1: manual page for the gencfu utility +.\" +.\" Copyright (C) 2016 and later: Unicode, Inc. and others. 
+.\" License & terms of use: http://www.unicode.org/copyright.html +.\" Copyright (C) 2012 International Business Machines Corporation and others +.\" +.TH GENCFU 1 "24 May 2009" "ICU MANPAGE" "ICU @VERSION@ Manual" +.SH NAME +.B gencfu +\- Generates Unicode Confusable data files +.SH SYNOPSIS +.B gencfu +[ +.BR "\-h\fP, \fB\-?\fP, \fB\-\-help" +] +[ +.BR "\-V\fP, \fB\-\-version" +] +[ +.BR "\-c\fP, \fB\-\-copyright" +] +[ +.BR "\-v\fP, \fB\-\-verbose" +] +[ +.BI "\-d\fP, \fB\-\-destdir" " destination" +] +[ +.BI "\-i\fP, \fB\-\-icudatadir" " directory" +] +.BI "\-r\fP, \fB\-\-rules" " rule\-file" +.BI "\-w\fP, \fB\-\-wsrules" " whole\-script\-rule\-file" +.BI "\-o\fP, \fB\-\-out" " output\-file" +.SH DESCRIPTION +.B gencfu +reads confusable character definitions in the input file, which are +plain text files containing confusable character +definitions in the input format defined by Unicode UAX39 for the files +.I confusables.txt +and +.I confusablesWholeScript.txt. +This source (.txt) format is also accepted by ICU spoof detectors. +The files must be encoded in utf-8 format, with or without a BOM. +Normally the output data file has the +.B .cfu +extension. +.SH OPTIONS +.TP +.BR "\-h\fP, \fB\-?\fP, \fB\-\-help" +Print help about usage and exit. +.TP +.BR "\-V\fP, \fB\-\-version" +Print the version of +.B gencfu +and exit. +.TP +.BR "\-c\fP, \fB\-\-copyright" +Embeds the standard ICU copyright into the +.IR output-file . +.TP +.BR "\-v\fP, \fB\-\-verbose" +Display extra informative messages during execution. +.TP +.BI "\-d\fP, \fB\-\-destdir" " destination" +Set the destination directory of the +.IR output-file +to +.IR destination . +.TP +.BI "\-i\fP, \fB\-\-icudatadir" " directory" +Look for any necessary ICU data files in +.IR directory . +For example, the file +.B pnames.icu +must be located when ICU's data is not built as a shared library. +The default ICU data directory is specified by the environment variable +.BR ICU_DATA . 
+Most configurations of ICU do not require this argument. +.TP +.BI "\-r\fP, \fB\-\-rules" " rule\-file" +The source file to read. +.TP +.BI "\-w\fP, \fB\-\-wsrules" " whole\-script\-rule\-file" +The whole script source file to read. +.TP +.BI "\-o\fP, \fB\-\-out" " output\-file" +The output data file to write. +.SH VERSION +1.0 +.SH COPYRIGHT +Copyright (C) 2009 International Business Machines Corporation and others diff --git a/intl/icu/source/tools/gencfu/gencfu.cpp b/intl/icu/source/tools/gencfu/gencfu.cpp new file mode 100644 index 0000000000..543cd76afa --- /dev/null +++ b/intl/icu/source/tools/gencfu/gencfu.cpp @@ -0,0 +1,332 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +********************************************************************** +* Copyright (C) 2009-2016, International Business Machines +* Corporation and others. All Rights Reserved. +********************************************************************** +* +* File gencfu.cpp +*/ + +//-------------------------------------------------------------------- +// +// Tool for generating Unicode Confusable data files (.cfu files). +// .cfu files contain the compiled form of the confusable data +// derived from the Unicode Consortium data described in +// Unicode UAX 39. +// +// Usage: gencfu [options] -r confusables-file.txt -o output-file.cfu +// +// options: -v verbose +// -? or -h help +// +// The input rule files are plain text files containing confusable character +// definitions in the input format defined by Unicode UAX39 for the file +// confusables.txt. This source (.txt) format +// is also accepted by ICU spoof detectors. The +// files must be encoded in utf-8 format, with or without a BOM. +// +// This tool also used to compile confusablesWholeScript.txt into the CFU file, +// until the Unicode consortium deprecated it.
+// +//-------------------------------------------------------------------- + +#include "unicode/utypes.h" +#include "unicode/unistr.h" +#include "unicode/uclean.h" +#include "unicode/udata.h" +#include "unicode/putil.h" + +#include "uoptions.h" +#include "unewdata.h" +#include "ucmndata.h" +#include "uspoof_impl.h" +#include "cmemory.h" + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +U_NAMESPACE_USE + +static char *progName; +static UOption options[]={ + UOPTION_HELP_H, /* 0 */ + UOPTION_HELP_QUESTION_MARK, /* 1 */ + UOPTION_VERBOSE, /* 2 */ + { "rules", nullptr, nullptr, nullptr, 'r', UOPT_REQUIRES_ARG, 0 }, /* 3 */ + { "wsrules", nullptr, nullptr, nullptr, 'w', UOPT_REQUIRES_ARG, 0}, /* 4 */ // deprecated + { "out", nullptr, nullptr, nullptr, 'o', UOPT_REQUIRES_ARG, 0 }, /* 5 */ + UOPTION_ICUDATADIR, /* 6 */ + UOPTION_DESTDIR, /* 7 */ + UOPTION_COPYRIGHT, /* 8 */ + UOPTION_QUIET, /* 9 */ +}; + +void usageAndDie(int retCode) { + printf("Usage: %s [-v] [-options] -r confusablesRules.txt -o output-file\n", progName); + printf("\tRead in Unicode confusable character definitions and write out the binary data\n" + "options:\n" + "\t-h or -? or --help this usage text\n" + "\t-V or --version show a version message\n" + "\t-c or --copyright include a copyright notice\n" + "\t-v or --verbose turn on verbose output\n" + "\t-q or --quiet do not display warnings and progress\n" + "\t-i or --icudatadir directory for locating any needed intermediate data files,\n" + "\t followed by path, defaults to %s\n" + "\t-d or --destdir destination directory, followed by the path\n", + u_getDataDirectory()); + exit (retCode); +} + + +#if UCONFIG_NO_REGULAR_EXPRESSIONS || UCONFIG_NO_NORMALIZATION || UCONFIG_NO_FILE_IO + +/* dummy UDataInfo cf. 
udata.h */ +static UDataInfo dummyDataInfo = { + sizeof(UDataInfo), + 0, + + U_IS_BIG_ENDIAN, + U_CHARSET_FAMILY, + U_SIZEOF_UCHAR, + 0, + + { 0, 0, 0, 0 }, /* dummy dataFormat */ + { 0, 0, 0, 0 }, /* dummy formatVersion */ + { 0, 0, 0, 0 } /* dummy dataVersion */ +}; + +#else + +// +// Set up the ICU data header, defined in ucmndata.h +// +DataHeader dh ={ + {sizeof(DataHeader), // Struct MappedData + 0xda, + 0x27}, + + { // struct UDataInfo + sizeof(UDataInfo), // size + 0, // reserved + U_IS_BIG_ENDIAN, + U_CHARSET_FAMILY, + U_SIZEOF_UCHAR, + 0, // reserved + + { 0x43, 0x66, 0x75, 0x20 }, // dataFormat="Cfu " + { 0xff, 0, 0, 0 }, // formatVersion. Filled in later with values + // from the builder. The values declared + // here should never appear in any real data. + { 5, 1, 0, 0 } // dataVersion (Unicode version) + }}; + +#endif + +// Forward declaration for function for reading source files. +static const char *readFile(const char *fileName, int32_t *len); + +//---------------------------------------------------------------------------- +// +// main for gencfu +// +//---------------------------------------------------------------------------- +int main(int argc, char **argv) { + UErrorCode status = U_ZERO_ERROR; + const char *confFileName; + const char *outFileName; + const char *outDir = nullptr; + const char *copyright = nullptr; + + // + // Pick up and check the command line arguments, + // using the standard ICU tool utils option handling. + // + U_MAIN_INIT_ARGS(argc, argv); + progName = argv[0]; + argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options); + if(argc<0) { + // Unrecognized option + fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]); + usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); + } + + if(options[0].doesOccur || options[1].doesOccur) { + // -? or -h for help. 
+ usageAndDie(0); + } + + if (!(options[3].doesOccur && options[5].doesOccur)) { + fprintf(stderr, "confusables file and output file must all be specified.\n"); + usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); + } + confFileName = options[3].value; + outFileName = options[5].value; + + if (options[6].doesOccur) { + u_setDataDirectory(options[6].value); + } + + status = U_ZERO_ERROR; + + /* Combine the directory with the file name */ + if(options[7].doesOccur) { + outDir = options[7].value; + } + if (options[8].doesOccur) { + copyright = U_COPYRIGHT_STRING; + } + + UBool quiet = false; + if (options[9].doesOccur) { + quiet = true; + } + +#if UCONFIG_NO_REGULAR_EXPRESSIONS || UCONFIG_NO_NORMALIZATION || UCONFIG_NO_FILE_IO + // spoof detection data file parsing is dependent on regular expressions. + // TODO: have the tool return an error status. Requires fixing the ICU data build + // so that it doesn't abort entirely on that error. + + UNewDataMemory *pData; + char msg[1024]; + + /* write message with just the name */ + snprintf(msg, sizeof(msg), "gencfu writes dummy %s because of UCONFIG_NO_REGULAR_EXPRESSIONS and/or UCONFIG_NO_NORMALIZATION and/or UCONFIG_NO_FILE_IO, see uconfig.h", outFileName); + fprintf(stderr, "%s\n", msg); + + /* write the dummy data file */ + pData = udata_create(outDir, nullptr, outFileName, &dummyDataInfo, nullptr, &status); + udata_writeBlock(pData, msg, strlen(msg)); + udata_finish(pData, &status); + return (int)status; + +#else + /* Initialize ICU */ + u_init(&status); + if (U_FAILURE(status)) { + fprintf(stderr, "%s: can not initialize ICU. 
status = %s\n", + argv[0], u_errorName(status)); + exit(1); + } + status = U_ZERO_ERROR; + + // Read in the confusables source file + + int32_t confusablesLen = 0; + const char *confusables = readFile(confFileName, &confusablesLen); + if (confusables == nullptr) { + printf("gencfu: error reading file \"%s\"\n", confFileName); + exit(-1); + } + + // + // Create the Spoof Detector from the source confusables files. + // This will compile the data. + // + UParseError parseError; + parseError.line = 0; + parseError.offset = 0; + int32_t errType; + USpoofChecker *sc = uspoof_openFromSource(confusables, confusablesLen, + nullptr, 0, + &errType, &parseError, &status); + if (U_FAILURE(status)) { + fprintf(stderr, "gencfu: uspoof_openFromSource error \"%s\" at file %s, line %d, column %d\n", + u_errorName(status), confFileName, (int)parseError.line, (int)parseError.offset); + exit(status); + } + + + // + // Get the compiled rule data from the USpoofChecker. + // + uint32_t outDataSize; + uint8_t *outData; + outDataSize = uspoof_serialize(sc, nullptr, 0, &status); + if (status != U_BUFFER_OVERFLOW_ERROR) { + fprintf(stderr, "gencfu: uspoof_serialize() returned %s\n", u_errorName(status)); + exit(status); + } + status = U_ZERO_ERROR; + outData = new uint8_t[outDataSize]; + uspoof_serialize(sc, outData, outDataSize, &status); + + // Copy the data format version numbers from the spoof data header into the UDataMemory header. + + uprv_memcpy(dh.info.formatVersion, + reinterpret_cast<SpoofDataHeader *>(outData)->fFormatVersion, + sizeof(dh.info.formatVersion)); + + // + // Create the output file + // + size_t bytesWritten; + UNewDataMemory *pData; + pData = udata_create(outDir, nullptr, outFileName, &(dh.info), copyright, &status); + if(U_FAILURE(status)) { + fprintf(stderr, "gencfu: Could not open output file \"%s\", \"%s\"\n", + outFileName, u_errorName(status)); + exit(status); + } + + + // Write the data itself. 
+ udata_writeBlock(pData, outData, outDataSize); + // finish up + bytesWritten = udata_finish(pData, &status); + if(U_FAILURE(status)) { + fprintf(stderr, "gencfu: Error %d writing the output file\n", status); + exit(status); + } + + if (bytesWritten != outDataSize) { + fprintf(stderr, "gencfu: Error writing to output file \"%s\"\n", outFileName); + exit(-1); + } + + uspoof_close(sc); + delete [] outData; + delete [] confusables; + u_cleanup(); + if (!quiet) { + printf("gencfu: tool completed successfully.\n"); + } + return 0; +#endif // UCONFIG_NO_REGULAR_EXPRESSIONS +} + + + // + // Read in a confusables source file + // + static const char *readFile(const char *fileName, int32_t *len) { + char *result; + long fileSize; + FILE *file; + + file = fopen(fileName, "rb"); + if( file == 0 ) { + return nullptr; + } + fseek(file, 0, SEEK_END); + fileSize = ftell(file); + fseek(file, 0, SEEK_SET); + result = new char[fileSize+10]; + if (result==nullptr) { + fclose(file); + return nullptr; + } + + long t = static_cast<long>(fread(result, 1, fileSize, file)); + if (t != fileSize) { + delete [] result; + fclose(file); + return nullptr; + } + result[fileSize]=0; + *len = static_cast<int32_t>(fileSize); + fclose(file); + return result; + } diff --git a/intl/icu/source/tools/gencfu/gencfu.vcxproj b/intl/icu/source/tools/gencfu/gencfu.vcxproj new file mode 100644 index 0000000000..4018d5078d --- /dev/null +++ b/intl/icu/source/tools/gencfu/gencfu.vcxproj @@ -0,0 +1,80 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <PropertyGroup Label="Globals"> + <ProjectGuid>{691EE0C0-DC57-4A48-8AEE-8ED75EB3A057}</ProjectGuid> + </PropertyGroup> + <PropertyGroup Label="Configuration"> + <ConfigurationType>Application</ConfigurationType> + <UseOfMfc>false</UseOfMfc> + <CharacterSet>MultiByte</CharacterSet> + </PropertyGroup> + <Import 
Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> + <!-- The following import will include the 'default' configuration options for VS projects. --> + <Import Project="..\..\allinone\Build.Windows.ProjectConfiguration.props" /> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> + <ImportGroup Label="ExtensionSettings"> + </ImportGroup> + <PropertyGroup Label="UserMacros" /> + <PropertyGroup> + <_ProjectFileVersion>10.0.30319.1</_ProjectFileVersion> + <OutDir>.\$(Platform)\$(Configuration)\</OutDir> + <IntDir>.\$(Platform)\$(Configuration)\</IntDir> + <!-- The ICU projects use "Win32" to mean "x86", so we need to special case it. --> + <OutDir Condition="'$(Platform)'=='Win32'">.\x86\$(Configuration)\</OutDir> + <IntDir Condition="'$(Platform)'=='Win32'">.\x86\$(Configuration)\</IntDir> + <!-- Disable Incremental Linking for Release builds as it prevents Link-time Code Generation --> + <LinkIncremental Condition="'$(Configuration)'=='Debug'">true</LinkIncremental> + <LinkIncremental Condition="'$(Configuration)'=='Release'">false</LinkIncremental> + </PropertyGroup> + <!-- Options that are common to *all* configurations --> + <ItemDefinitionGroup> + <Midl> + <TypeLibraryName>$(OutDir)\gencfu.tlb</TypeLibraryName> + </Midl> + <ClCompile> + <WarningLevel>Level3</WarningLevel> + <CompileAs>Default</CompileAs> + <DisableLanguageExtensions>false</DisableLanguageExtensions> + <AdditionalIncludeDirectories>..\..\common;..\..\i18n;..\toolutil;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> + <PrecompiledHeaderOutputFile>$(OutDir)\gencfu.pch</PrecompiledHeaderOutputFile> + <AssemblerListingLocation>$(OutDir)/</AssemblerListingLocation> + <ObjectFileName>$(OutDir)/</ObjectFileName> + <ProgramDataBaseFileName>$(OutDir)\gencfu.pdb</ProgramDataBaseFileName> + </ClCompile> + <Link> + <SubSystem>Console</SubSystem> + <OutputFile>$(OutDir)\gencfu.exe</OutputFile> + 
<AdditionalLibraryDirectories>..\..\..\$(IcuLibOutputDir);%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories> + </Link> + <CustomBuildStep> + <Command>copy "$(TargetPath)" ..\..\..\$(IcuBinOutputDir)</Command> + <Outputs>..\..\..\$(IcuBinOutputDir)\$(TargetFileName);%(Outputs)</Outputs> + </CustomBuildStep> + </ItemDefinitionGroup> + <!-- Options that are common to all 'Debug' project configurations --> + <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'"> + <ClCompile> + <BrowseInformation>true</BrowseInformation> + <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary> + </ClCompile> + <Link> + <AdditionalDependencies>icuucd.lib;icuind.lib;icutud.lib;%(AdditionalDependencies)</AdditionalDependencies> + </Link> + </ItemDefinitionGroup> + <!-- Options that are common to all 'Release' project configurations --> + <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'"> + <ClCompile> + <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary> + <FunctionLevelLinking>true</FunctionLevelLinking> + </ClCompile> + <Link> + <AdditionalDependencies>icuuc.lib;icuin.lib;icutu.lib;%(AdditionalDependencies)</AdditionalDependencies> + </Link> + </ItemDefinitionGroup> + <ItemGroup> + <ClCompile Include="gencfu.cpp" /> + </ItemGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> + <ImportGroup Label="ExtensionTargets"> + </ImportGroup> +</Project>
\ No newline at end of file diff --git a/intl/icu/source/tools/gencfu/gencfu.vcxproj.filters b/intl/icu/source/tools/gencfu/gencfu.vcxproj.filters new file mode 100644 index 0000000000..096a235f7c --- /dev/null +++ b/intl/icu/source/tools/gencfu/gencfu.vcxproj.filters @@ -0,0 +1,22 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <ItemGroup> + <Filter Include="Header Files"> + <UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier> + <Extensions>h;hpp;hxx;hm;inl;inc;xsd</Extensions> + </Filter> + <Filter Include="Resource Files"> + <UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier> + <Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx</Extensions> + </Filter> + <Filter Include="Source Files"> + <UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier> + <Extensions>cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx</Extensions> + </Filter> + </ItemGroup> + <ItemGroup> + <ClCompile Include="gencfu.cpp"> + <Filter>Source Files</Filter> + </ClCompile> + </ItemGroup> +</Project>
\ No newline at end of file diff --git a/intl/icu/source/tools/gencfu/sources.txt b/intl/icu/source/tools/gencfu/sources.txt new file mode 100644 index 0000000000..e5a05f8451 --- /dev/null +++ b/intl/icu/source/tools/gencfu/sources.txt @@ -0,0 +1 @@ +gencfu.cpp diff --git a/intl/icu/source/tools/gencmn/Makefile.in b/intl/icu/source/tools/gencmn/Makefile.in new file mode 100644 index 0000000000..bb7fc4e7b4 --- /dev/null +++ b/intl/icu/source/tools/gencmn/Makefile.in @@ -0,0 +1,96 @@ +## Makefile.in for ICU - tools/gencmn +## Copyright (C) 2016 and later: Unicode, Inc. and others. +## License & terms of use: http://www.unicode.org/copyright.html +## Copyright (c) 1999-2011, International Business Machines Corporation and +## others. All Rights Reserved. +## Steven R. Loomis + +## Source directory information +srcdir = @srcdir@ +top_srcdir = @top_srcdir@ + +top_builddir = ../.. + +include $(top_builddir)/icudefs.mk + +## Build directory information +subdir = tools/gencmn + +TARGET_STUB_NAME = gencmn + +SECTION = 8 + +MAN_FILES = $(TARGET_STUB_NAME).$(SECTION) + + +## Extra files to remove for 'make clean' +CLEANFILES = *~ $(DEPS) $(MAN_FILES) + +## Target information +TARGET = $(BINDIR)/$(TARGET_STUB_NAME)$(EXEEXT) + +CPPFLAGS += -I$(top_srcdir)/common -I$(srcdir)/../toolutil +LIBS = $(LIBICUTOOLUTIL) $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS) $(LIB_M) + +SOURCES = $(shell cat $(srcdir)/sources.txt) +OBJECTS = $(SOURCES:.c=.o) + +DEPS = $(OBJECTS:.o=.d) + +## List of phony targets +.PHONY : all all-local install install-local clean clean-local \ +distclean distclean-local dist dist-local check check-local install-man + +## Clear suffix list +.SUFFIXES : + +## List of standard targets +all: all-local +install: install-local +clean: clean-local +distclean : distclean-local +dist: dist-local +check: all check-local + +all-local: $(TARGET) $(MAN_FILES) + +install-local: all-local install-man + $(MKINSTALLDIRS) $(DESTDIR)$(sbindir) + $(INSTALL) $(TARGET) 
$(DESTDIR)$(sbindir) + +install-man: $(MAN_FILES) + $(MKINSTALLDIRS) $(DESTDIR)$(mandir)/man$(SECTION) + $(INSTALL_DATA) $? $(DESTDIR)$(mandir)/man$(SECTION) + +dist-local: + +clean-local: + test -z "$(CLEANFILES)" || $(RMV) $(CLEANFILES) + $(RMV) $(TARGET) $(OBJECTS) + +distclean-local: clean-local + $(RMV) Makefile + +check-local: all-local + +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + cd $(top_builddir) \ + && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status + +$(TARGET) : $(OBJECTS) + $(LINK.cc) $(OUTOPT)$@ $^ $(LIBS) + $(POST_BUILD_STEP) + +%.$(SECTION): $(srcdir)/%.$(SECTION).in + cd $(top_builddir) \ + && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status + + +ifeq (,$(MAKECMDGOALS)) +-include $(DEPS) +else +ifneq ($(patsubst %clean,,$(MAKECMDGOALS)),) +-include $(DEPS) +endif +endif + diff --git a/intl/icu/source/tools/gencmn/gencmn.8.in b/intl/icu/source/tools/gencmn/gencmn.8.in new file mode 100644 index 0000000000..c48fbeeab6 --- /dev/null +++ b/intl/icu/source/tools/gencmn/gencmn.8.in @@ -0,0 +1,131 @@ +.\" Hey, Emacs! This is -*-nroff-*- you know... +.\" +.\" gencmn.8: manual page for the gencmn utility +.\" +.\" Copyright (C) 2016 and later: Unicode, Inc. and others. +.\" License & terms of use: http://www.unicode.org/copyright.html +.\" Copyright (C) 2000-2001 IBM, Inc. and others. +.\" +.\" Manual page by Yves Arrouye <yves@realnames.com>. 
+.\" +.TH GENCMN 8 "5 November 2001" "ICU MANPAGE" "ICU @VERSION@ Manual" +.SH NAME +.B gencmn +\- generate an ICU memory-mappable data file +.SH SYNOPSIS +.B gencmn +[ +.BR "\-h\fP, \fB\-?\fP, \fB\-\-help" +] +[ +.BR "\-v\fP, \fB\-\-verbose" +] +[ +.BR "\-c\fP, \fB\-\-copyright" +| +.BI "\-C\fP, \fB\-\-comment" " comment" +] +[ +.BI "\-d\fP, \fB\-\-destdir" " destination" +] +[ +.BI "\-n\fP, \fB\-\-name" " name" +] +[ +.BI "\-t\fP, \fB\-\-type" " fileext" +] +[ +.BI "\-S\fP, \fB\-\-source" +] +[ +.BI "\-e\fP, \fB\-\-entrypoint" " name" +] +.I maxsize +[ +.I listfilename +] +.SH DESCRIPTION +.B gencmn +takes a set of files and packages them as an ICU memory-mappable data +file. The resulting data file can then be used directly by ICU. +.PP +.B gencmn +reads a list of files to be packaged from either the +supplied +.I listfilename +file, or from standard input. It packages all the files from +the list that are not bigger than +.I maxsize +bytes, except if +.I maxsize +is 0, which indicates that there is no size limit on files. +.SH OPTIONS +.TP +.BR "\-h\fP, \fB\-?\fP, \fB\-\-help" +Print help about usage and exit. +.TP +.BR "\-v\fP, \fB\-\-verbose" +Display extra informative messages during execution. +.TP +.BR "\-c\fP, \fB\-\-copyright" +Include the ICU copyright notice in the resulting data. +.TP +.BI "\-C\fP, \fB\-\-comment" " comment" +Include the specified +.I comment +in the resulting data instead of the ICU copyright notice. +.TP +.BI "\-d\fP, \fB\-\-destdir" " destination" +Set the destination directory to +.IR destination . +The default destination directory is specified by the environment variable +.BR ICU_DATA . +.TP +.BI "\-n\fP, \fB\-\-name" " name" +Set the data name to +.I name +instead of the default. This name is also used as the base name of the +output.
The default name is made of the +.I icudt +prefix, followed by a two-digit version number corresponding to +the current version of the ICU release, and a single letter indicating +the endianness of the data (the letter +.I b +indicates big endian data, and the letter +.I l +indicates little endian data). +.TP +.BI "\-t\fP, \fB\-\-type" " type" +Use +.I type +as the type of the data. This type is also used as the extension of +the generated data file. The default type is +.IR dat . +.TP +.BI "\-S\fP, \fB\-\-source" +Write a C source file with the table of contents of the data. +.TP +.BI "\-e\fP, \fB\-\-entrypoint" " name" +Set the data entry point (used for linking against the data in a +shared library form) to +.IR name . +The default entry point name is made of the data name (set by the +.BI "\-n\fP, \fB\-\-name" +option) followed by an underscore and the type of the data (set by the +.BI "\-t\fP, \fB\-\-type" +option). +.SH ENVIRONMENT +.TP 10 +.B ICU_DATA +Specifies the directory containing ICU data. Defaults to +.BR @thepkgicudatadir@/@PACKAGE@/@VERSION@/ . +Some tools in ICU depend on the presence of the trailing slash. It is thus +important to make sure that it is present if +.B ICU_DATA +is set. +.SH VERSION +@VERSION@ +.SH COPYRIGHT +Copyright (C) 2000-2001 IBM, Inc. and others. +.SH SEE ALSO +.BR decmn (8) diff --git a/intl/icu/source/tools/gencmn/gencmn.c b/intl/icu/source/tools/gencmn/gencmn.c new file mode 100644 index 0000000000..77f0c20c61 --- /dev/null +++ b/intl/icu/source/tools/gencmn/gencmn.c @@ -0,0 +1,126 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 1999-2016, International Business Machines +* Corporation and others. All Rights Reserved.
+* +******************************************************************************* +* file name: gencmn.c +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 1999nov01 +* created by: Markus W. Scherer +* +* This program reads a list of data files and combines them +* into one common, memory-mappable file. +*/ + +#include <stdio.h> +#include <stdlib.h> +#include "unicode/utypes.h" +#include "unicode/putil.h" +#include "cmemory.h" +#include "cstring.h" +#include "filestrm.h" +#include "toolutil.h" +#include "unicode/uclean.h" +#include "unewdata.h" +#include "uoptions.h" +#include "putilimp.h" +#include "pkg_gencmn.h" + +static UOption options[]={ +/*0*/ UOPTION_HELP_H, +/*1*/ UOPTION_HELP_QUESTION_MARK, +/*2*/ UOPTION_VERBOSE, +/*3*/ UOPTION_COPYRIGHT, +/*4*/ UOPTION_DESTDIR, +/*5*/ UOPTION_DEF( "comment", 'C', UOPT_REQUIRES_ARG), +/*6*/ UOPTION_DEF( "name", 'n', UOPT_REQUIRES_ARG), +/*7*/ UOPTION_DEF( "type", 't', UOPT_REQUIRES_ARG), +/*8*/ UOPTION_DEF( "source", 'S', UOPT_NO_ARG), +/*9*/ UOPTION_DEF( "entrypoint", 'e', UOPT_REQUIRES_ARG), +/*10*/UOPTION_SOURCEDIR, +}; + +extern int +main(int argc, char* argv[]) { + UBool sourceTOC, verbose; + uint32_t maxSize; + + U_MAIN_INIT_ARGS(argc, argv); + + /* preset then read command line options */ + argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options); + + /* error handling, printing usage message */ + if(argc<0) { + fprintf(stderr, + "error in command line argument \"%s\"\n", + argv[-argc]); + } else if(argc<2) { + argc=-1; + } + + if(argc<0 || options[0].doesOccur || options[1].doesOccur) { + FILE *where = argc < 0 ? stderr : stdout; + + /* + * Broken into chucks because the C89 standard says the minimum + * required supported string length is 509 bytes. 
+ */ + fprintf(where, + "%csage: %s [ -h, -?, --help ] [ -v, --verbose ] [ -c, --copyright ] [ -C, --comment comment ] [ -d, --destdir dir ] [ -n, --name filename ] [ -t, --type filetype ] [ -S, --source tocfile ] [ -e, --entrypoint name ] maxsize listfile\n", argc < 0 ? 'u' : 'U', *argv); + if (options[0].doesOccur || options[1].doesOccur) { + fprintf(where, "\n" + "Read the list file (default: standard input) and create a common data\n" + "file from specified files. Omit any files larger than maxsize, if maxsize > 0.\n"); + fprintf(where, "\n" + "Options:\n" + "\t-h, -?, --help this usage text\n" + "\t-v, --verbose verbose output\n" + "\t-c, --copyright include the ICU copyright notice\n" + "\t-C, --comment comment include a comment string\n" + "\t-d, --destdir dir destination directory\n"); + fprintf(where, + "\t-n, --name filename output filename, without .type extension\n" + "\t (default: " U_ICUDATA_NAME ")\n" + "\t-t, --type filetype type of the destination file\n" + "\t (default: \" dat \")\n" + "\t-S, --source tocfile write a .c source file with the table of\n" + "\t contents\n" + "\t-e, --entrypoint name override the c entrypoint name\n" + "\t (default: \"<name>_<type>\")\n"); + } + return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; + } + + sourceTOC=options[8].doesOccur; + + verbose = options[2].doesOccur; + + maxSize=(uint32_t)uprv_strtoul(argv[1], NULL, 0); + + createCommonDataFile(options[4].doesOccur ? options[4].value : NULL, + options[6].doesOccur ? options[6].value : NULL, + options[9].doesOccur ? options[9].value : options[6].doesOccur ? options[6].value : NULL, + options[7].doesOccur ? options[7].value : NULL, + options[10].doesOccur ? options[10].value : NULL, + options[3].doesOccur ? U_COPYRIGHT_STRING : options[5].doesOccur ? options[5].value : NULL, + argc == 2 ? 
NULL : argv[2], + maxSize, sourceTOC, verbose, NULL); + + return 0; +} +/* + * Hey, Emacs, please set the following: + * + * Local Variables: + * indent-tabs-mode: nil + * End: + * + */ diff --git a/intl/icu/source/tools/gencmn/gencmn.vcxproj b/intl/icu/source/tools/gencmn/gencmn.vcxproj new file mode 100644 index 0000000000..94c18b6ac5 --- /dev/null +++ b/intl/icu/source/tools/gencmn/gencmn.vcxproj @@ -0,0 +1,80 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <PropertyGroup Label="Globals"> + <ProjectGuid>{A8D36F8D-09E6-4174-91C3-7BEAA9C3F04F}</ProjectGuid> + </PropertyGroup> + <PropertyGroup Label="Configuration"> + <ConfigurationType>Application</ConfigurationType> + <UseOfMfc>false</UseOfMfc> + <CharacterSet>MultiByte</CharacterSet> + </PropertyGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> + <!-- The following import will include the 'default' configuration options for VS projects. --> + <Import Project="..\..\allinone\Build.Windows.ProjectConfiguration.props" /> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> + <ImportGroup Label="ExtensionSettings"> + </ImportGroup> + <PropertyGroup Label="UserMacros" /> + <PropertyGroup> + <_ProjectFileVersion>10.0.30319.1</_ProjectFileVersion> + <OutDir>.\$(Platform)\$(Configuration)\</OutDir> + <IntDir>.\$(Platform)\$(Configuration)\</IntDir> + <!-- The ICU projects use "Win32" to mean "x86", so we need to special case it. 
--> + <OutDir Condition="'$(Platform)'=='Win32'">.\x86\$(Configuration)\</OutDir> + <IntDir Condition="'$(Platform)'=='Win32'">.\x86\$(Configuration)\</IntDir> + <!-- Disable Incremental Linking for Release builds as it prevents Link-time Code Generation --> + <LinkIncremental Condition="'$(Configuration)'=='Debug'">true</LinkIncremental> + <LinkIncremental Condition="'$(Configuration)'=='Release'">false</LinkIncremental> + </PropertyGroup> + <!-- Options that are common to *all* configurations --> + <ItemDefinitionGroup> + <Midl> + <TypeLibraryName>$(OutDir)/gencmn.tlb</TypeLibraryName> + </Midl> + <ClCompile> + <WarningLevel>Level3</WarningLevel> + <CompileAs>Default</CompileAs> + <DisableLanguageExtensions>true</DisableLanguageExtensions> + <AdditionalIncludeDirectories>..\..\common;..\toolutil;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> + <PrecompiledHeaderOutputFile>$(OutDir)/gencmn.pch</PrecompiledHeaderOutputFile> + <AssemblerListingLocation>$(OutDir)/</AssemblerListingLocation> + <ObjectFileName>$(OutDir)/</ObjectFileName> + <ProgramDataBaseFileName>$(OutDir)/gencmn.pdb</ProgramDataBaseFileName> + </ClCompile> + <Link> + <SubSystem>Console</SubSystem> + <OutputFile>$(OutDir)/gencmn.exe</OutputFile> + <AdditionalLibraryDirectories>..\..\..\$(IcuLibOutputDir);%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories> + </Link> + <CustomBuildStep> + <Command>copy "$(TargetPath)" ..\..\..\$(IcuBinOutputDir)</Command> + <Outputs>..\..\..\$(IcuBinOutputDir)\$(TargetFileName);%(Outputs)</Outputs> + </CustomBuildStep> + </ItemDefinitionGroup> + <!-- Options that are common to all 'Debug' project configurations --> + <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'"> + <ClCompile> + <BrowseInformation>true</BrowseInformation> + <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary> + </ClCompile> + <Link> + <AdditionalDependencies>icuucd.lib;icutud.lib;%(AdditionalDependencies)</AdditionalDependencies> + </Link> + 
</ItemDefinitionGroup> + <!-- Options that are common to all 'Release' project configurations --> + <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'"> + <ClCompile> + <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary> + <FunctionLevelLinking>true</FunctionLevelLinking> + </ClCompile> + <Link> + <AdditionalDependencies>icuuc.lib;icutu.lib;%(AdditionalDependencies)</AdditionalDependencies> + </Link> + </ItemDefinitionGroup> + <ItemGroup> + <ClCompile Include="gencmn.c" /> + </ItemGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> + <ImportGroup Label="ExtensionTargets"> + </ImportGroup> +</Project>
\ No newline at end of file diff --git a/intl/icu/source/tools/gencmn/gencmn.vcxproj.filters b/intl/icu/source/tools/gencmn/gencmn.vcxproj.filters new file mode 100644 index 0000000000..d69b206638 --- /dev/null +++ b/intl/icu/source/tools/gencmn/gencmn.vcxproj.filters @@ -0,0 +1,22 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <ItemGroup> + <Filter Include="Source Files"> + <UniqueIdentifier>{5be5a226-8538-4456-89d2-b20fb1906561}</UniqueIdentifier> + <Extensions>cpp;c;cxx;rc;def;r;odl;idl;hpj;bat</Extensions> + </Filter> + <Filter Include="Header Files"> + <UniqueIdentifier>{56407937-8dde-450c-b05f-115430ced4a1}</UniqueIdentifier> + <Extensions>h;hpp;hxx;hm;inl</Extensions> + </Filter> + <Filter Include="Resource Files"> + <UniqueIdentifier>{7de2655e-2d51-47cf-8e41-c3fe55c0f921}</UniqueIdentifier> + <Extensions>ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe</Extensions> + </Filter> + </ItemGroup> + <ItemGroup> + <ClCompile Include="gencmn.c"> + <Filter>Source Files</Filter> + </ClCompile> + </ItemGroup> +</Project>
\ No newline at end of file diff --git a/intl/icu/source/tools/gencmn/sources.txt b/intl/icu/source/tools/gencmn/sources.txt new file mode 100644 index 0000000000..6ffa7777aa --- /dev/null +++ b/intl/icu/source/tools/gencmn/sources.txt @@ -0,0 +1 @@ +gencmn.c diff --git a/intl/icu/source/tools/gencnval/Makefile.in b/intl/icu/source/tools/gencnval/Makefile.in new file mode 100644 index 0000000000..b808b0f9e4 --- /dev/null +++ b/intl/icu/source/tools/gencnval/Makefile.in @@ -0,0 +1,97 @@ +## Makefile.in for ICU - tools/gencnval +## Copyright (C) 2016 and later: Unicode, Inc. and others. +## License & terms of use: http://www.unicode.org/copyright.html +## Copyright (c) 1999-2011, International Business Machines Corporation and +## others. All Rights Reserved. +## Steven R. Loomis + +## Source directory information +srcdir = @srcdir@ +top_srcdir = @top_srcdir@ + +top_builddir = ../.. + +include $(top_builddir)/icudefs.mk + +## Build directory information +subdir = tools/gencnval + +TARGET_STUB_NAME = gencnval + +SECTION = 1 + +MAN_FILES = $(TARGET_STUB_NAME).$(SECTION) + + +## Extra files to remove for 'make clean' +CLEANFILES = *~ $(DEPS) $(MAN_FILES) + +## Target information +TARGET = $(BINDIR)/$(TARGET_STUB_NAME)$(EXEEXT) + +CPPFLAGS += -I$(top_srcdir)/common -I$(srcdir)/../toolutil +LIBS = $(LIBICUTOOLUTIL) $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS) $(LIB_M) + +SOURCES = $(shell cat $(srcdir)/sources.txt) +OBJECTS = $(SOURCES:.c=.o) + +DEPS = $(OBJECTS:.o=.d) + +## List of phony targets +.PHONY : all all-local install install-local clean clean-local \ +distclean distclean-local dist dist-local check check-local install-man + +## Clear suffix list +.SUFFIXES : + +## List of standard targets +all: all-local +install: install-local +clean: clean-local +distclean : distclean-local +dist: dist-local +check: all check-local + +all-local: $(TARGET) $(MAN_FILES) + +install-local: all-local install-man + $(MKINSTALLDIRS) $(DESTDIR)$(bindir) + $(INSTALL) $(TARGET) 
$(DESTDIR)$(bindir) + +install-man: $(MAN_FILES) + $(MKINSTALLDIRS) $(DESTDIR)$(mandir)/man$(SECTION) + $(INSTALL_DATA) $? $(DESTDIR)$(mandir)/man$(SECTION) + +dist-local: + +clean-local: + test -z "$(CLEANFILES)" || $(RMV) $(CLEANFILES) + $(RMV) $(TARGET) $(OBJECTS) + +distclean-local: clean-local + $(RMV) Makefile + +check-local: all-local + +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + cd $(top_builddir) \ + && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status + +$(TARGET) : $(OBJECTS) + $(LINK.cc) $(OUTOPT)$@ $^ $(LIBS) + $(POST_BUILD_STEP) + + +%.$(SECTION): $(srcdir)/%.$(SECTION).in + cd $(top_builddir) \ + && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status + + +ifeq (,$(MAKECMDGOALS)) +-include $(DEPS) +else +ifneq ($(patsubst %clean,,$(MAKECMDGOALS)),) +-include $(DEPS) +endif +endif + diff --git a/intl/icu/source/tools/gencnval/gencnval.1.in b/intl/icu/source/tools/gencnval/gencnval.1.in new file mode 100644 index 0000000000..c563e1ba0a --- /dev/null +++ b/intl/icu/source/tools/gencnval/gencnval.1.in @@ -0,0 +1,93 @@ +.\" Hey, Emacs! This is -*-nroff-*- you know... +.\" +.\" gencnval.1: manual page for the gencnval utility +.\" +.\" Copyright (C) 2016 and later: Unicode, Inc. and others. +.\" License & terms of use: http://www.unicode.org/copyright.html +.\" Copyright (C) 2000-2004 IBM, Inc. and others. +.\" +.\" Manual page by Yves Arrouye <yves@realnames.com>. 
+.\" Manual page by George Rhoten +.\" +.TH GENCNVAL 1 "11 March 2004" "ICU MANPAGE" "ICU @VERSION@ Manual" +.SH NAME +.B gencnval +\- compile the converters aliases file +.SH SYNOPSIS +.B gencnval +[ +.BR "\-h\fP, \fB\-?\fP, \fB\-\-help" +] +[ +.BR "\-v\fP, \fB\-\-verbose" +] +[ +.BR "\-c\fP, \fB\-\-copyright" +] +[ +.BI "\-s\fP, \fB\-\-sourcedir" " source" +] +[ +.BI "\-d\fP, \fB\-\-destdir" " destination" +] +[ +.I converterfile +] +.SH DESCRIPTION +.B gencnval +converts the ICU aliases file +.I converterfile +into the binary file +.BR cnvalias.icu . +This binary file can then be read directly by ICU, or used by +.BR pkgdata (1) +for incorporation into a larger archive or library. +.PP +If +.I converterfile +is not provided, the default ICU +.B convrtrs.txt +file is used. +.SH OPTIONS +.TP +.BR "\-h\fP, \fB\-?\fP, \fB\-\-help" +Print help about usage and exit. +.TP +.BR "\-v\fP, \fB\-\-verbose" +Display verbose output. This information can include information about +conflicting aliases and the converters the aliases resolve to. +.TP +.BR "\-c\fP, \fB\-\-copyright" +Include a copyright notice in the binary data. +.TP +.BI "\-s\fP, \fB\-\-sourcedir" " source" +Set the source directory to +.IR source . +The default source directory is specified by the environment variable +.BR ICU_DATA . +.TP +.BI "\-d\fP, \fB\-\-destdir" " destination" +Set the destination directory to +.IR destination . +The default destination directory is specified by the environment variable +.BR ICU_DATA . +.SH ENVIRONMENT +.TP 10 +.B ICU_DATA +Specifies the directory containing ICU data. Defaults to +.BR @thepkgicudatadir@/@PACKAGE@/@VERSION@/ . +Some tools in ICU depend on the presence of the trailing slash. It is thus +important to make sure that it is present if +.B ICU_DATA +is set. +.SH FILES +.TP \w'\fB@PACKAGE@/source/data/mappings/convrtrs.txt'u+3n +.B @PACKAGE@/source/data/mappings/convrtrs.txt +Description of ICU's converters and their aliases. 
This data file is not +normally installed, but it is available as part of the ICU source code. +.SH VERSION +@VERSION@ +.SH COPYRIGHT +Copyright (C) 2000-2004 IBM, Inc. and others. +.SH SEE ALSO +.BR pkgdata (1) diff --git a/intl/icu/source/tools/gencnval/gencnval.c b/intl/icu/source/tools/gencnval/gencnval.c new file mode 100644 index 0000000000..54b41fb57d --- /dev/null +++ b/intl/icu/source/tools/gencnval/gencnval.c @@ -0,0 +1,1142 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 1999-2016 International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: gencnval.c +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 1999nov05 +* created by: Markus W. Scherer +* +* This program reads convrtrs.txt and writes a memory-mappable +* converter name alias table to cnvalias.icu. +* +* This program currently writes version 3.0.1 of the data format (the +* formatVersion in dataInfo below). See ucnv_io.c for more details on the +* format. Note that the earlier version 2.1 +* was written in such a way that a 2.0 reader will be able to use it, +* and a 2.1 reader will be able to read 2.0.
+*/
+
+#include "unicode/utypes.h"
+#include "unicode/putil.h"
+#include "unicode/ucnv.h" /* ucnv_compareNames() */
+#include "ucnv_io.h"
+#include "cmemory.h"
+#include "cstring.h"
+#include "uinvchar.h"
+#include "filestrm.h"
+#include "toolutil.h"
+#include "unicode/uclean.h"
+#include "unewdata.h"
+#include "uoptions.h"
+
+#include <ctype.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+/* TODO: Need to check alias name length is less than UCNV_MAX_CONVERTER_NAME_LENGTH */
+
+/* STRING_STORE_SIZE + TAG_STORE_SIZE <= ((2^16 - 1) * 2)
+   That is the maximum size for the string stores combined
+   because the strings are indexed at 16-bit boundaries by a
+   16-bit index, and there is only one section for the
+   strings.
+ */
+#define STRING_STORE_SIZE 0x1FBFE /* 130046 */
+#define TAG_STORE_SIZE 0x400 /* 1024 */
+
+/* The combined tag and converter count can affect the number of lists
+   created. The size of all lists must be less than (2^17 - 1)
+   because the lists are indexed as a 16-bit array with a 16-bit index.
+ */
+#define MAX_TAG_COUNT 0x3F /* 63 */
+#define MAX_CONV_COUNT UCNV_CONVERTER_INDEX_MASK
+#define MAX_ALIAS_COUNT 0xFFFF /* 65535 */
+
+/* The maximum number of aliases that a standard tag/converter combination can have.
+   At this moment 6/18/2002, IANA has 12 names for ASCII. Don't go below 15 for
+   this value. I don't recommend more than 31 for this value.
+ */
+#define MAX_TC_ALIAS_COUNT 0x1F /* 31 */
+
+/* Size of the line buffers used by parseFile(), and capacity of aliasLists[]. */
+#define MAX_LINE_SIZE 0x7FFF /* 32767 */
+#define MAX_LIST_SIZE 0xFFFF /* 65535 */
+
+/* The output file is named DATA_NAME "." DATA_TYPE (see udata_create() in main()). */
+#define DATA_NAME "cnvalias"
+#define DATA_TYPE "icu" /* ICU alias table */
+
+/* parseFile() registers the empty tag first (index 0, untagged aliases)
+   and the "ALL" tag second (index 1, every alias of a converter). */
+#define ALL_TAG_STR "ALL"
+#define ALL_TAG_NUM 1
+#define EMPTY_TAG_NUM 0
+
+/* UDataInfo cf. udata.h */
+static const UDataInfo dataInfo={
+    sizeof(UDataInfo),
+    0,
+
+    U_IS_BIG_ENDIAN,
+    U_CHARSET_FAMILY,
+    sizeof(UChar),
+    0,
+
+    {0x43, 0x76, 0x41, 0x6c}, /* dataFormat="CvAl" */
+    {3, 0, 1, 0}, /* formatVersion */
+    {1, 4, 2, 0} /* dataVersion */
+};
+
+/* A bump allocator over a fixed buffer; see allocString(). */
+typedef struct {
+    char *store;  /* backing buffer */
+    uint32_t top; /* offset of the first unused byte in store */
+    uint32_t max; /* capacity of store in bytes */
+} StringBlock;
+
+/* Storage for converter names and aliases; referenced by 16-bit index (byte offset / 2). */
+static char stringStore[STRING_STORE_SIZE];
+static StringBlock stringBlock = { stringStore, 0, STRING_STORE_SIZE };
+
+/* The aliases of one (tag, converter) pair. addAlias() reserves aliases[0]
+   for the default alias of the pair; it is 0 while no default has been set. */
+typedef struct {
+    uint16_t aliasCount;
+    uint16_t *aliases; /* Index into stringStore */
+} AliasList;
+
+typedef struct {
+    uint16_t converter; /* Index into stringStore */
+    uint16_t totalAliasCount; /* Total aliases in this column */
+} Converter;
+
+static Converter converters[MAX_CONV_COUNT];
+static uint16_t converterCount=0;
+
+/* Storage for tag names; referenced by 16-bit index (byte offset / 2). */
+static char tagStore[TAG_STORE_SIZE];
+static StringBlock tagBlock = { tagStore, 0, TAG_STORE_SIZE };
+
+typedef struct {
+    uint16_t tag; /* Index into tagStore */
+    uint16_t totalAliasCount; /* Total aliases in this row */
+    AliasList aliasList[MAX_CONV_COUNT];
+} Tag;
+
+/* Think of this as a 3D array. It's tagCount by converterCount by aliasCount */
+static Tag tags[MAX_TAG_COUNT];
+static uint16_t tagCount = 0;
+
+/* Used for storing all aliases */
+static uint16_t knownAliases[MAX_ALIAS_COUNT];
+static uint16_t knownAliasesCount = 0;
+/*static uint16_t duplicateKnownAliasesCount = 0;*/
+
+/* Used for storing the lists section that point to aliases */
+static uint16_t aliasLists[MAX_LIST_SIZE];
+static uint16_t aliasListsSize = 0;
+
+/* Were the standard tags declared before the aliases.
*/
+static UBool standardTagsUsed = false;
+static UBool verbose = false;
+static UBool quiet = false;
+static int lineNum = 1;
+
+static UConverterAliasOptions tableOptions = {
+    UCNV_IO_STD_NORMALIZED,
+    1 /* containsCnvOptionInfo */
+};
+
+
+/**
+ * path to convrtrs.txt
+ */
+const char *path;
+
+/* prototypes --------------------------------------------------------------- */
+
+static void
+parseLine(const char *line);
+
+static void
+parseFile(FileStream *in);
+
+static int32_t
+chomp(char *line);
+
+static void
+addOfficialTaggedStandards(char *line, int32_t lineLen);
+
+static uint16_t
+addAlias(const char *alias, uint16_t standard, uint16_t converter, UBool defaultName);
+
+static uint16_t
+addConverter(const char *converter);
+
+static char *
+allocString(StringBlock *block, const char *s, int32_t length);
+
+static uint16_t
+addToKnownAliases(const char *alias);
+
+static int
+compareAliases(const void *alias1, const void *alias2);
+
+static uint16_t
+getTagNumber(const char *tag, uint16_t tagLen);
+
+static void
+writeAliasTable(UNewDataMemory *out);
+
+/* -------------------------------------------------------------------------- */
+
+/* Convert a 16-bit string index into a pointer.
+   Presumes that you used allocString() */
+#define GET_ALIAS_STR(index) (stringStore + ((size_t)(index) << 1))
+#define GET_TAG_STR(index) (tagStore + ((size_t)(index) << 1))
+
+/* Convert a pointer returned by allocString() into a 16-bit string index.
+   Presumes that you used allocString() */
+#define GET_ALIAS_NUM(str) ((uint16_t)(((str) - stringStore) >> 1))
+#define GET_TAG_NUM(str) ((uint16_t)(((str) - tagStore) >> 1))
+
+/* Indexes into options[]; the order must match the initializer below. */
+enum
+{
+    HELP1,
+    HELP2,
+    VERBOSE,
+    COPYRIGHT,
+    DESTDIR,
+    SOURCEDIR,
+    QUIET
+};
+
+static UOption options[]={
+    UOPTION_HELP_H,
+    UOPTION_HELP_QUESTION_MARK,
+    UOPTION_VERBOSE,
+    UOPTION_COPYRIGHT,
+    UOPTION_DESTDIR,
+    UOPTION_SOURCEDIR,
+    UOPTION_QUIET
+};
+
+/*
+ * Entry point: parses the command line, reads the alias file (argv[1] or
+ * "convrtrs.txt", resolved against the source directory when one is set),
+ * and writes the binary alias table into the destination directory.
+ * Returns 0 on success; fatal errors exit() with a UErrorCode value.
+ */
+extern int
+main(int argc, char* argv[]) {
+    int i, n;
+    char pathBuf[512];
+    FileStream *in;
+    UNewDataMemory *out;
+    UErrorCode errorCode=U_ZERO_ERROR;
+
+    U_MAIN_INIT_ARGS(argc, argv);
+
+    /* preset then read command line options */
+    options[DESTDIR].value=options[SOURCEDIR].value=u_getDataDirectory();
+    argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options);
+
+    /* error handling, printing usage message */
+    if(argc<0) {
+        fprintf(stderr,
+            "error in command line argument \"%s\"\n",
+            argv[-argc]);
+    }
+    if(argc<0 || options[HELP1].doesOccur || options[HELP2].doesOccur) {
+        fprintf(stderr,
+            "usage: %s [-options] [convrtrs.txt]\n"
+            "\tread convrtrs.txt and create " U_ICUDATA_NAME "_" DATA_NAME "." DATA_TYPE "\n"
+            "options:\n"
+            "\t-h or -? or --help this usage text\n"
+            "\t-v or --verbose prints out extra information about the alias table\n"
+            "\t-q or --quiet do not display warnings and progress\n"
+            "\t-c or --copyright include a copyright notice\n"
+            "\t-d or --destdir destination directory, followed by the path\n"
+            "\t-s or --sourcedir source directory, followed by the path\n",
+            argv[0]);
+        return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
+    }
+
+    if(options[VERBOSE].doesOccur) {
+        verbose = true;
+    }
+
+    if(options[QUIET].doesOccur) {
+        quiet = true;
+    }
+
+    if (argc >= 2) {
+        path = argv[1];
+    } else {
+        path = "convrtrs.txt";
+    }
+
+    const char* sourcedir = options[SOURCEDIR].value;
+    if (sourcedir != NULL && *sourcedir != 0) {
+        size_t dirLen = uprv_strlen(sourcedir);
+        size_t sepLen = (sourcedir[dirLen - 1] != U_FILE_SEP_CHAR) ? 1 : 0;
+        char *end;
+
+        /* pathBuf is a fixed-size buffer: refuse paths that do not fit, including the NUL */
+        if (dirLen + sepLen + uprv_strlen(path) >= sizeof(pathBuf)) {
+            fprintf(stderr, "gencnval: the path to %s in %s is too long\n", path, sourcedir);
+            exit(U_BUFFER_OVERFLOW_ERROR);
+        }
+        uprv_strcpy(pathBuf, sourcedir);
+        end = pathBuf + dirLen;
+        if(sepLen != 0) {
+            *(end++)=U_FILE_SEP_CHAR;
+        }
+        uprv_strcpy(end, path);
+        path = pathBuf;
+    }
+
+    uprv_memset(stringStore, 0, sizeof(stringStore));
+    uprv_memset(tagStore, 0, sizeof(tagStore));
+    uprv_memset(converters, 0, sizeof(converters));
+    uprv_memset(tags, 0, sizeof(tags));
+    uprv_memset(aliasLists, 0, sizeof(aliasLists));
+    uprv_memset(knownAliases, 0, sizeof(knownAliases));
+
+
+    in=T_FileStream_open(path, "r");
+    if(in==NULL) {
+        fprintf(stderr, "gencnval: unable to open input file %s\n", path);
+        exit(U_FILE_ACCESS_ERROR);
+    }
+    parseFile(in);
+    T_FileStream_close(in);
+
+    /* create the output file */
+    out=udata_create(options[DESTDIR].value, DATA_TYPE, DATA_NAME, &dataInfo,
+                     options[COPYRIGHT].doesOccur ? U_COPYRIGHT_STRING : NULL, &errorCode);
+    if(U_FAILURE(errorCode)) {
+        fprintf(stderr, "gencnval: unable to open output file - error %s\n", u_errorName(errorCode));
+        exit(errorCode);
+    }
+
+    /* write the table of aliases based on a tag/converter name combination */
+    writeAliasTable(out);
+
+    /* finish */
+    udata_finish(out, &errorCode);
+    if(U_FAILURE(errorCode)) {
+        fprintf(stderr, "gencnval: error finishing output file - %s\n", u_errorName(errorCode));
+        exit(errorCode);
+    }
+
+    /* clean up tags */
+    for (i = 0; i < MAX_TAG_COUNT; i++) {
+        for (n = 0; n < MAX_CONV_COUNT; n++) {
+            if (tags[i].aliasList[n].aliases!=NULL) {
+                uprv_free(tags[i].aliasList[n].aliases);
+            }
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * Reads the whole alias file. A logical record is one line that does not
+ * start with white space plus all following lines that do; each record is
+ * handed to addOfficialTaggedStandards() (records starting with '{') or to
+ * parseLine() (converter records). Exits on malformed input.
+ */
+static void
+parseFile(FileStream *in) {
+    char line[MAX_LINE_SIZE];
+    char lastLine[MAX_LINE_SIZE];
+    int32_t lineSize = 0;
+    int32_t lastLineSize = 0;
+    UBool validParse = true;
+
+    lineNum = 0;
+
+    /* Add the empty tag, which is for untagged aliases */
+    getTagNumber("", 0);
+    getTagNumber(ALL_TAG_STR, 3);
+    allocString(&stringBlock, "", 0);
+
+    /* read the list of aliases */
+    while (validParse) {
+        validParse = false;
+
+        /* Read non-empty lines that don't start with a space character. */
+        while (T_FileStream_readLine(in, lastLine, MAX_LINE_SIZE) != NULL) {
+            lastLineSize = chomp(lastLine);
+            /* cast to unsigned char: passing a negative char to isspace() is undefined */
+            if (lineSize == 0 || (lastLineSize > 0 && isspace((unsigned char)*lastLine))) {
+                /* a record built from many continuation lines must still fit into line[] */
+                if (lineSize + lastLineSize >= MAX_LINE_SIZE) {
+                    fprintf(stderr, "%s:%d: error: line is too long\n", path, lineNum);
+                    exit(U_BUFFER_OVERFLOW_ERROR);
+                }
+                uprv_strcpy(line + lineSize, lastLine);
+                lineSize += lastLineSize;
+            } else if (lineSize > 0) {
+                /* lastLine starts the next record; process the current one first */
+                validParse = true;
+                break;
+            }
+            lineNum++;
+        }
+
+        if (validParse || lineSize > 0) {
+            if (isspace((unsigned char)*line)) {
+                fprintf(stderr, "%s:%d: error: cannot start an alias with a space\n", path, lineNum-1);
+                exit(U_PARSE_ERROR);
+            } else if (line[0] == '{') {
+                if (!standardTagsUsed && line[lineSize - 1] != '}') {
+                    fprintf(stderr, "%s:%d: error: alias needs to start with a converter name\n", path, lineNum);
+                    exit(U_PARSE_ERROR);
+                }
+                addOfficialTaggedStandards(line, lineSize);
+                standardTagsUsed = true;
+            } else {
+                if (standardTagsUsed) {
+                    parseLine(line);
+                }
+                else {
+                    fprintf(stderr, "%s:%d: error: alias table needs to start a list of standard tags\n", path, lineNum);
+                    exit(U_PARSE_ERROR);
+                }
+            }
+            /* Was the last line consumed */
+            if (lastLineSize > 0) {
+                uprv_strcpy(line, lastLine);
+                lineSize = lastLineSize;
+            }
+            else {
+                lineSize = 0;
+            }
+        }
+        lineNum++;
+    }
+}
+
+/* This works almost like the Perl chomp.
+ It removes the newlines, comments and trailing whitespace (not preceding whitespace).
+*/
+/*
+ * Truncates line in place at the first '\r', '\n' or '#', then cuts the
+ * white space that follows the last non-space character.
+ * A line without any non-space character is only truncated at the
+ * newline/comment: its leading blanks are kept, so parseFile() still sees
+ * an indented (continuation) line.
+ * Returns the resulting length of line.
+ */
+static int32_t
+chomp(char *line) {
+    char *s = line;
+    char *lastNonSpace = NULL; /* NULL until a non-space character is seen */
+    while(*s!=0) {
+        /* truncate at a newline or a comment */
+        if(*s == '\r' || *s == '\n' || *s == '#') {
+            *s = 0;
+            break;
+        }
+        /* cast to unsigned char: passing a negative char to isspace() is undefined */
+        if (!isspace((unsigned char)*s)) {
+            lastNonSpace = s;
+        }
+        ++s;
+    }
+    if (lastNonSpace != NULL) {
+        /* also correct when the only non-space character is line[0] */
+        lastNonSpace[1] = 0;
+        s = lastNonSpace + 1;
+    }
+    return (int32_t)(s - line);
+}
+
+/*
+ * Parses one converter record: "name [{tags}] alias [{tags}] ...".
+ * The first word becomes a new converter and is also added as its own alias
+ * (default name for the ALL tag). Every name is added to the ALL tag, to the
+ * list of known aliases, and either to each tag listed in the braces that
+ * follow it or, without braces, to the empty tag.
+ */
+static void
+parseLine(const char *line) {
+    uint16_t pos=0, start, limit, length, cnv;
+    char *converter, *alias;
+
+    /* There is no whitespace at the beginning anymore */
+
+    /* is there nothing on this line? */
+    if(line[pos]==0) {
+        return;
+    }
+
+    /* get the converter name */
+    start=pos;
+    while(line[pos]!=0 && !isspace((unsigned char)line[pos])) {
+        ++pos;
+    }
+    limit=pos;
+
+    /* store the converter name */
+    length=(uint16_t)(limit-start);
+    converter=allocString(&stringBlock, line+start, length);
+
+    /* add the converter to the converter table */
+    cnv=addConverter(converter);
+
+    /* The name itself may be tagged, so let's added it to the aliases list properly */
+    pos = start;
+
+    /* get all the real aliases */
+    for(;;) {
+
+        /* skip white space */
+        while(line[pos]!=0 && isspace((unsigned char)line[pos])) {
+            ++pos;
+        }
+
+        /* is there no more alias name on this line? */
+        if(line[pos]==0) {
+            break;
+        }
+
+        /* get an alias name */
+        start=pos;
+        while(line[pos]!=0 && line[pos]!='{' && !isspace((unsigned char)line[pos])) {
+            ++pos;
+        }
+        limit=pos;
+
+        /* store the alias name */
+        length=(uint16_t)(limit-start);
+        if (start == 0) {
+            /* add the converter as its own alias to the alias table */
+            alias = converter;
+            addAlias(alias, ALL_TAG_NUM, cnv, true);
+        }
+        else {
+            alias=allocString(&stringBlock, line+start, length);
+            addAlias(alias, ALL_TAG_NUM, cnv, false);
+        }
+        addToKnownAliases(alias);
+
+        /* skip whitespace */
+        while (line[pos] && isspace((unsigned char)line[pos])) {
+            ++pos;
+        }
+
+        /* handle tags if they are present */
+        if (line[pos] == '{') {
+            ++pos;
+            do {
+                start = pos;
+                while (line[pos] && line[pos] != '}' && !isspace((unsigned char)line[pos])) {
+                    ++pos;
+                }
+                limit = pos;
+
+                if (start != limit) {
+                    /* add the tag to the tag table; a trailing '*' marks the default alias */
+                    uint16_t tag = getTagNumber(line + start, (uint16_t)(limit - start));
+                    addAlias(alias, tag, cnv, (UBool)(line[limit-1] == '*'));
+                }
+
+                while (line[pos] && isspace((unsigned char)line[pos])) {
+                    ++pos;
+                }
+            } while (line[pos] && line[pos] != '}');
+
+            if (line[pos] == '}') {
+                ++pos;
+            } else {
+                fprintf(stderr, "%s:%d: Unterminated tag list\n", path, lineNum);
+                exit(U_UNMATCHED_BRACES);
+            }
+        } else {
+            /* untagged: the first untagged alias of a converter becomes its default */
+            addAlias(alias, EMPTY_TAG_NUM, cnv, (UBool)(tags[0].aliasList[cnv].aliasCount == 0));
+        }
+    }
+}
+
+/*
+ * Returns the index in tags[] of the tag named by the first tagLen
+ * characters of tag (a trailing '*' is ignored; the comparison is
+ * case-insensitive). An unknown tag is added, which is only allowed
+ * before the standard tags have been declared; afterwards it is fatal.
+ */
+static uint16_t
+getTagNumber(const char *tag, uint16_t tagLen) {
+    char *atag;
+    uint16_t t;
+    UBool preferredName = ((tagLen > 0) ? (tag[tagLen - 1] == '*') : (false));
+
+    if (preferredName) {
+        tagLen--;
+    }
+
+    /* Look the tag up first: a full table must not prevent finding existing tags. */
+    for (t = 0; t < tagCount; ++t) {
+        const char *currTag = GET_TAG_STR(tags[t].tag);
+        if (uprv_strlen(currTag) == tagLen && !uprv_strnicmp(currTag, tag, tagLen)) {
+            return t;
+        }
+    }
+
+    /* we need to add this tag */
+    if (tagCount >= MAX_TAG_COUNT) {
+        fprintf(stderr, "%s:%d: error: too many tags\n", path, lineNum);
+        exit(U_BUFFER_OVERFLOW_ERROR);
+    }
+
+    /* allocate a new entry in the tag table */
+    atag = allocString(&tagBlock, tag, tagLen);
+
+    if (standardTagsUsed) {
+        fprintf(stderr, "%s:%d: error: Tag \"%s\" is not declared at the beginning of the alias table.\n",
+            path, lineNum, atag);
+        exit(1);
+    }
+    /* compare the NUL-terminated copy: tag itself is not terminated at tagLen */
+    else if (tagLen > 0 && strcmp(atag, ALL_TAG_STR) != 0) {
+        fprintf(stderr, "%s:%d: warning: Tag \"%s\" was added to the list of standards because it was not declared at beginning of the alias table.\n",
+            path, lineNum, atag);
+    }
+
+    /* add the tag to the tag table */
+    tags[tagCount].tag = GET_TAG_NUM(atag);
+    /* The aliasList should be set to 0's already */
+
+    return tagCount++;
+}
+
+/*
+ * Parses the "{ TAG1 TAG2 ... }" declaration of the official standard tags
+ * and appends each tag to tags[] in the order given.
+ * line is modified in place (strtok). lineLen is unused.
+ */
+static void
+addOfficialTaggedStandards(char *line, int32_t lineLen) {
+    (void) lineLen; // suppress compiler warnings about unused variable
+    char *atag;
+    char *endTagExp;
+    char *tag;
+    static const char WHITESPACE[] = " \t";
+
+    if (tagCount > UCNV_NUM_RESERVED_TAGS) {
+        fprintf(stderr, "%s:%d: error: official tags already added\n", path, lineNum);
+        exit(U_BUFFER_OVERFLOW_ERROR);
+    }
+    tag = strchr(line, '{');
+    if (tag == NULL) {
+        /* Why were we called? */
+        fprintf(stderr, "%s:%d: error: Missing start of tag group\n", path, lineNum);
+        exit(U_PARSE_ERROR);
+    }
+    tag++;
+    endTagExp = strchr(tag, '}');
+    if (endTagExp == NULL) {
+        fprintf(stderr, "%s:%d: error: Missing end of tag group\n", path, lineNum);
+        exit(U_PARSE_ERROR);
+    }
+    endTagExp[0] = 0;
+
+    tag = strtok(tag, WHITESPACE);
+    while (tag != NULL) {
+        /* tags[] has a fixed size; do not write past its end */
+        if (tagCount >= MAX_TAG_COUNT) {
+            fprintf(stderr, "%s:%d: error: too many tags\n", path, lineNum);
+            exit(U_BUFFER_OVERFLOW_ERROR);
+        }
+
+        /* allocate a new entry in the tag table */
+        atag = allocString(&tagBlock, tag, -1);
+
+        /* add the tag to the tag table */
+        tags[tagCount++].tag = GET_TAG_NUM(atag);
+
+        /* The aliasList should already be set to 0's */
+
+        /* Get next tag */
+        tag = strtok(NULL, WHITESPACE);
+    }
+}
+
+/*
+ * Appends alias (a pointer returned by allocString() for stringBlock) to
+ * knownAliases[] and returns its string index. Duplicates are not removed
+ * here; resolveAliases() folds them after sorting.
+ */
+static uint16_t
+addToKnownAliases(const char *alias) {
+    if (knownAliasesCount >= MAX_ALIAS_COUNT) {
+        /* fatal, so report it as an error */
+        fprintf(stderr, "%s:%d: error: Too many aliases defined for all converters\n",
+            path, lineNum);
+        exit(U_BUFFER_OVERFLOW_ERROR);
+    }
+    /* TODO: We could try to unlist exact duplicates. */
+    return knownAliases[knownAliasesCount++] = GET_ALIAS_NUM(alias);
+}
+
+/*
+@param standard When standard is 0, then it's the "empty" tag.
+*/
+/*
+ * Adds alias to the alias list of the (standard, converter) pair.
+ * Slot 0 of every list is reserved for the default alias of the pair:
+ * with defaultName set the alias is stored there (a second default is
+ * fatal), otherwise it is appended after slot 0.
+ * Prints warnings for aliases that contain converter options and for
+ * duplicate aliases within the same tag.
+ * Returns the new number of entries in the list, including slot 0.
+ */
+static uint16_t
+addAlias(const char *alias, uint16_t standard, uint16_t converter, UBool defaultName) {
+    uint32_t idx, idx2;
+    UBool startEmptyWithoutDefault = false;
+    AliasList *aliasList;
+    uint16_t *newAliases;
+
+    if(standard>=MAX_TAG_COUNT) {
+        fprintf(stderr, "%s:%d: error: too many standard tags\n", path, lineNum);
+        exit(U_BUFFER_OVERFLOW_ERROR);
+    }
+    if(converter>=MAX_CONV_COUNT) {
+        fprintf(stderr, "%s:%d: error: too many converter names\n", path, lineNum);
+        exit(U_BUFFER_OVERFLOW_ERROR);
+    }
+    aliasList = &tags[standard].aliasList[converter];
+
+    /* NOTE(review): reported as an error but processing continues; confirm whether this should be fatal. */
+    if (strchr(alias, '}')) {
+        fprintf(stderr, "%s:%d: error: unmatched } found\n", path,
+            lineNum);
+    }
+
+    if(aliasList->aliasCount + 1 >= MAX_TC_ALIAS_COUNT) {
+        fprintf(stderr, "%s:%d: error: too many aliases for alias %s and converter %s\n", path,
+            lineNum, alias, GET_ALIAS_STR(converters[converter].converter));
+        exit(U_BUFFER_OVERFLOW_ERROR);
+    }
+
+    /* Show this warning only once. All aliases are added to the "ALL" tag. */
+    if (standard == ALL_TAG_NUM && GET_ALIAS_STR(converters[converter].converter) != alias) {
+        /* Normally these option values are parsed at runtime, and they can
+           be discarded when the alias is a default converter. Options should
+           only be on a converter and not an alias. */
+        if (uprv_strchr(alias, UCNV_OPTION_SEP_CHAR) != 0)
+        {
+            fprintf(stderr, "warning(line %d): alias %s contains a \""UCNV_OPTION_SEP_STRING"\". Options are parsed at run-time and do not need to be in the alias table.\n",
+                lineNum, alias);
+        }
+        if (uprv_strchr(alias, UCNV_VALUE_SEP_CHAR) != 0)
+        {
+            fprintf(stderr, "warning(line %d): alias %s contains an \""UCNV_VALUE_SEP_STRING"\". Options are parsed at run-time and do not need to be in the alias table.\n",
+                lineNum, alias);
+        }
+    }
+
+    if (standard != ALL_TAG_NUM) {
+        /* Check for duplicate aliases for this tag on all converters */
+        for (idx = 0; idx < converterCount; idx++) {
+            for (idx2 = 0; idx2 < tags[standard].aliasList[idx].aliasCount; idx2++) {
+                uint16_t aliasNum = tags[standard].aliasList[idx].aliases[idx2];
+                if (aliasNum
+                    && ucnv_compareNames(alias, GET_ALIAS_STR(aliasNum)) == 0)
+                {
+                    if (idx == converter) {
+                        /*
+                         * (alias, standard) duplicates are harmless if they map to the same converter.
+                         * Only print a warning in verbose mode, or if the alias is a precise duplicate,
+                         * not just a lenient-match duplicate.
+                         */
+                        if (verbose || 0 == uprv_strcmp(alias, GET_ALIAS_STR(aliasNum))) {
+                            fprintf(stderr, "%s:%d: warning: duplicate aliases %s and %s found for standard %s and converter %s\n", path,
+                                lineNum, alias, GET_ALIAS_STR(aliasNum),
+                                GET_TAG_STR(tags[standard].tag),
+                                GET_ALIAS_STR(converters[converter].converter));
+                        }
+                    } else {
+                        fprintf(stderr, "%s:%d: warning: duplicate aliases %s and %s found for standard tag %s between converter %s and converter %s\n", path,
+                            lineNum, alias, GET_ALIAS_STR(aliasNum),
+                            GET_TAG_STR(tags[standard].tag),
+                            GET_ALIAS_STR(converters[converter].converter),
+                            GET_ALIAS_STR(converters[idx].converter));
+                    }
+                    /* one report per converter is enough */
+                    break;
+                }
+            }
+        }
+
+        /* Duplicate default aliases for this converter across tags are not checked:
+           it's okay to have multiple standards prefer the same name. */
+    }
+
+    if (aliasList->aliasCount == 0) {
+        /* first alias of this pair: reserve slot 0 for the default alias */
+        aliasList->aliasCount++;
+        startEmptyWithoutDefault = true;
+    }
+    /* keep the old pointer until the reallocation is known to have succeeded */
+    newAliases = (uint16_t *)uprv_realloc(aliasList->aliases, (aliasList->aliasCount + 1) * sizeof(aliasList->aliases[0]));
+    if (newAliases == NULL) {
+        fprintf(stderr, "%s:%d: error: out of memory\n", path, lineNum);
+        exit(U_MEMORY_ALLOCATION_ERROR);
+    }
+    aliasList->aliases = newAliases;
+    if (startEmptyWithoutDefault) {
+        aliasList->aliases[0] = 0;
+    }
+    if (defaultName) {
+        if (aliasList->aliases[0] != 0) {
+            fprintf(stderr, "%s:%d: error: Alias %s and %s cannot both be the default alias for standard tag %s and converter %s\n", path,
+                lineNum,
+                alias,
+                GET_ALIAS_STR(aliasList->aliases[0]),
+                GET_TAG_STR(tags[standard].tag),
+                GET_ALIAS_STR(converters[converter].converter));
+            exit(U_PARSE_ERROR);
+        }
+        aliasList->aliases[0] = GET_ALIAS_NUM(alias);
+    } else {
+        aliasList->aliases[aliasList->aliasCount++] = GET_ALIAS_NUM(alias);
+    }
+
+    converters[converter].totalAliasCount++; /* One more to the column */
+    tags[standard].totalAliasCount++; /* One more to the row */
+
+    return aliasList->aliasCount;
+}
+
+/*
+ * Appends converter (a pointer returned by allocString() for stringBlock)
+ * to converters[] and returns its index. A name that matches an existing
+ * converter under ucnv_compareNames() is a fatal error.
+ */
+static uint16_t
+addConverter(const char *converter) {
+    uint32_t idx;
+    if(converterCount>=MAX_CONV_COUNT) {
+        fprintf(stderr, "%s:%d: error: too many converters\n", path, lineNum);
+        exit(U_BUFFER_OVERFLOW_ERROR);
+    }
+
+    for (idx = 0; idx < converterCount; idx++) {
+        if (ucnv_compareNames(converter, GET_ALIAS_STR(converters[idx].converter)) == 0) {
+            fprintf(stderr, "%s:%d: error: duplicate converter %s found!\n", path, lineNum, converter);
+            exit(U_PARSE_ERROR);
+        }
+    }
+
+    converters[converterCount].converter = GET_ALIAS_NUM(converter);
+    converters[converterCount].totalAliasCount = 0;
+
+    return converterCount++;
+}
+
+/* resolve this alias based on the prioritization of the standard tags.
*/
+/*
+ * Finds the first (tag, converter) pair whose alias list contains the string
+ * index alias. The declared standard tags are searched in declaration order,
+ * then the empty tag; the ALL tag is skipped.
+ * Sets *tagNum and *converterNum to UINT16_MAX and prints a warning when the
+ * alias is in none of these lists.
+ */
+static void
+resolveAliasToConverter(uint16_t alias, uint16_t *tagNum, uint16_t *converterNum) {
+    uint16_t idx, idx2, idx3;
+
+    for (idx = UCNV_NUM_RESERVED_TAGS; idx < tagCount; idx++) {
+        for (idx2 = 0; idx2 < converterCount; idx2++) {
+            for (idx3 = 0; idx3 < tags[idx].aliasList[idx2].aliasCount; idx3++) {
+                uint16_t aliasNum = tags[idx].aliasList[idx2].aliases[idx3];
+                if (aliasNum == alias) {
+                    *tagNum = idx;
+                    *converterNum = idx2;
+                    return;
+                }
+            }
+        }
+    }
+    /* Do the leftovers last, just in case */
+    /* There is no need to do the ALL tag */
+    idx = 0;
+    for (idx2 = 0; idx2 < converterCount; idx2++) {
+        for (idx3 = 0; idx3 < tags[idx].aliasList[idx2].aliasCount; idx3++) {
+            uint16_t aliasNum = tags[idx].aliasList[idx2].aliases[idx3];
+            if (aliasNum == alias) {
+                *tagNum = idx;
+                *converterNum = idx2;
+                return;
+            }
+        }
+    }
+    *tagNum = UINT16_MAX;
+    *converterNum = UINT16_MAX;
+    fprintf(stderr, "%s: warning: alias %s not found\n",
+        path,
+        GET_ALIAS_STR(alias));
+    return;
+}
+
+/*
+ * Returns the name of converter number convNum, or a placeholder when
+ * convNum is not a valid converter index (resolveAliasToConverter() reports
+ * UINT16_MAX for aliases it cannot resolve).
+ */
+static const char *
+converterNameOrUnresolved(uint16_t convNum) {
+    return (convNum < converterCount) ? GET_ALIAS_STR(converters[convNum].converter) : "(unresolved)";
+}
+
+/*
+ * Folds the sorted knownAliases[] into a list of unique aliases.
+ * uniqueAliasArr receives the string index (plus aliasOffset) of each unique
+ * alias, uniqueAliasToConverterArr the converter it resolves to, with
+ * UCNV_AMBIGUOUS_ALIAS_MAP_BIT set when equal names map to different
+ * converters and UCNV_CONTAINS_OPTION_BIT set when the converter name
+ * contains an option separator. Both arrays must have room for
+ * knownAliasesCount entries. Returns the number of unique aliases.
+ * The knownAliases should be sorted before calling this function
+ */
+static uint32_t
+resolveAliases(uint16_t *uniqueAliasArr, uint16_t *uniqueAliasToConverterArr, uint16_t aliasOffset) {
+    uint32_t uniqueAliasIdx = 0;
+    uint32_t idx;
+    uint16_t currTagNum, oldTagNum;
+    uint16_t currConvNum, oldConvNum;
+    const char *lastName;
+
+    if (knownAliasesCount != 0) {
+        resolveAliasToConverter(knownAliases[0], &oldTagNum, &currConvNum);
+        uniqueAliasToConverterArr[uniqueAliasIdx] = currConvNum;
+        oldConvNum = currConvNum;
+        uniqueAliasArr[uniqueAliasIdx] = knownAliases[0] + aliasOffset;
+        uniqueAliasIdx++;
+        lastName = GET_ALIAS_STR(knownAliases[0]);
+
+        for (idx = 1; idx < knownAliasesCount; idx++) {
+            resolveAliasToConverter(knownAliases[idx], &currTagNum, &currConvNum);
+            if (ucnv_compareNames(lastName, GET_ALIAS_STR(knownAliases[idx])) == 0) {
+                /* duplicate found */
+                if ((currTagNum < oldTagNum && currTagNum >= UCNV_NUM_RESERVED_TAGS)
+                    || oldTagNum == 0) {
+                    /* this occurrence comes from a higher-priority tag: it replaces the previous one */
+                    oldTagNum = currTagNum;
+                    uniqueAliasToConverterArr[uniqueAliasIdx - 1] = currConvNum;
+                    uniqueAliasArr[uniqueAliasIdx - 1] = knownAliases[idx] + aliasOffset;
+                    if (verbose) {
+                        printf("using %s instead of %s -> %s",
+                            GET_ALIAS_STR(knownAliases[idx]),
+                            lastName,
+                            converterNameOrUnresolved(currConvNum));
+                        if (oldConvNum != currConvNum) {
+                            printf(" (alias conflict)");
+                        }
+                        puts("");
+                    }
+                }
+                else {
+                    /* else ignore it */
+                    if (verbose) {
+                        printf("folding %s into %s -> %s",
+                            GET_ALIAS_STR(knownAliases[idx]),
+                            lastName,
+                            converterNameOrUnresolved(oldConvNum));
+                        if (oldConvNum != currConvNum) {
+                            printf(" (alias conflict)");
+                        }
+                        puts("");
+                    }
+                }
+                if (oldConvNum != currConvNum) {
+                    uniqueAliasToConverterArr[uniqueAliasIdx - 1] |= UCNV_AMBIGUOUS_ALIAS_MAP_BIT;
+                }
+            }
+            else {
+                uniqueAliasToConverterArr[uniqueAliasIdx] = currConvNum;
+                oldConvNum = currConvNum;
+                uniqueAliasArr[uniqueAliasIdx] = knownAliases[idx] + aliasOffset;
+                uniqueAliasIdx++;
+                lastName = GET_ALIAS_STR(knownAliases[idx]);
+                oldTagNum = currTagNum;
+            }
+            /* go through the helper: currConvNum is UINT16_MAX for an unresolved alias */
+            if (uprv_strchr(converterNameOrUnresolved(currConvNum), UCNV_OPTION_SEP_CHAR) != NULL) {
+                uniqueAliasToConverterArr[uniqueAliasIdx-1] |= UCNV_CONTAINS_OPTION_BIT;
+            }
+        }
+    }
+    return uniqueAliasIdx;
+}
+
+/*
+ * Appends the alias list of the (tag, converter) pair to aliasLists[] as
+ * "count, alias, alias, ..." and stores the 1-based position of its first
+ * alias in aliasArrLists[tag*converterCount + converter]; an empty list is
+ * recorded as 0. offset is added to every non-zero alias index.
+ */
+static void
+createOneAliasList(uint16_t *aliasArrLists, uint32_t tag, uint32_t converter, uint16_t offset) {
+    uint32_t aliasNum;
+    AliasList *aliasList = &tags[tag].aliasList[converter];
+
+    if (aliasList->aliasCount == 0) {
+        aliasArrLists[tag*converterCount + converter] = 0;
+    }
+    else {
+        aliasLists[aliasListsSize++] = aliasList->aliasCount;
+
+        /* write into the array area a 1's based index. */
+        aliasArrLists[tag*converterCount + converter] = aliasListsSize;
+
+        for (aliasNum = 0; aliasNum < aliasList->aliasCount; aliasNum++) {
+            uint16_t value;
+            if (aliasList->aliases[aliasNum]) {
+                value = aliasList->aliases[aliasNum] + offset;
+            } else {
+                /* only slot 0 can be empty: the pair has no default alias */
+                value = 0;
+                if (tag != 0 && !quiet) { /* Only show the warning when it's not the leftover tag. */
+                    fprintf(stderr, "%s: warning: tag %s does not have a default alias for %s\n",
+                        path,
+                        GET_TAG_STR(tags[tag].tag),
+                        GET_ALIAS_STR(converters[converter].converter));
+                }
+            }
+            aliasLists[aliasListsSize++] = value;
+            if (aliasListsSize >= MAX_LIST_SIZE) {
+                fprintf(stderr, "%s: error: Too many alias lists\n", path);
+                exit(U_BUFFER_OVERFLOW_ERROR);
+            }
+
+        }
+    }
+}
+
+/*
+ * Copies the stringBlockLength bytes of NUL-separated strings at
+ * origStringBlock to normalizedStrings and replaces every non-empty string
+ * by its ucnv_io_stripForCompare() form at the same offset. When the
+ * stripped form is non-empty, the rest of the string's slot is zero-filled.
+ */
+static void
+createNormalizedAliasStrings(char *normalizedStrings, const char *origStringBlock, int32_t stringBlockLength) {
+    int32_t currStrLen;
+    uprv_memcpy(normalizedStrings, origStringBlock, stringBlockLength);
+    while ((currStrLen = (int32_t)uprv_strlen(origStringBlock)) < stringBlockLength) {
+        int32_t currStrSize = currStrLen + 1;
+        if (currStrLen > 0) {
+            int32_t normStrLen;
+            ucnv_io_stripForCompare(normalizedStrings, origStringBlock);
+            normStrLen = (int32_t)uprv_strlen(normalizedStrings);
+            if (normStrLen > 0) {
+                uprv_memset(normalizedStrings + normStrLen, 0, currStrSize - normStrLen);
+            }
+        }
+        stringBlockLength -= currStrSize;
+        normalizedStrings += currStrSize;
+        origStringBlock += currStrSize;
+    }
+}
+
+/*
+ * Sorts and resolves the known aliases, builds the alias lists and writes
+ * all sections of the alias table to out: table of contents, converter list,
+ * tag list, unique aliases and their converters, the tag-by-converter array,
+ * the lists, the table options and the (optionally also normalized) strings.
+ * Exits when a temporary buffer cannot be allocated.
+ */
+static void
+writeAliasTable(UNewDataMemory *out) {
+    uint32_t i, j;
+    uint32_t uniqueAliasesSize;
+    uint16_t aliasOffset = (uint16_t)(tagBlock.top/sizeof(uint16_t));
+    uint16_t *aliasArrLists = (uint16_t *)uprv_malloc(tagCount * converterCount * sizeof(uint16_t));
+    uint16_t *uniqueAliases = (uint16_t *)uprv_malloc(knownAliasesCount * sizeof(uint16_t));
+    uint16_t *uniqueAliasesToConverter = (uint16_t *)uprv_malloc(knownAliasesCount * sizeof(uint16_t));
+
+    if (aliasArrLists == NULL || uniqueAliases == NULL || uniqueAliasesToConverter == NULL) {
+        fprintf(stderr, "%s: error: out of memory\n", path);
+        exit(U_MEMORY_ALLOCATION_ERROR);
+    }
+
+    qsort(knownAliases, knownAliasesCount, sizeof(knownAliases[0]), compareAliases);
+    uniqueAliasesSize = resolveAliases(uniqueAliases, uniqueAliasesToConverter, aliasOffset);
+
+    /* Array index starts at 1. aliasLists[0] is the size of the lists section. */
+    aliasListsSize = 0;
+
+    /* write the offsets of all the aliases lists in a 2D array, and create the lists. */
+    for (i = 0; i < tagCount; ++i) {
+        for (j = 0; j < converterCount; ++j) {
+            createOneAliasList(aliasArrLists, i, j, aliasOffset);
+        }
+    }
+
+    /* Write the size of the TOC */
+    if (tableOptions.stringNormalizationType == UCNV_IO_UNNORMALIZED) {
+        udata_write32(out, 8);
+    }
+    else {
+        udata_write32(out, 9);
+    }
+
+    /* Write the sizes of each section */
+    /* All sizes are the number of uint16_t units, not bytes */
+    udata_write32(out, converterCount);
+    udata_write32(out, tagCount);
+    udata_write32(out, uniqueAliasesSize); /* list of aliases */
+    udata_write32(out, uniqueAliasesSize); /* The preresolved form of mapping an untagged the alias to a converter */
+    udata_write32(out, tagCount * converterCount);
+    udata_write32(out, aliasListsSize + 1);
+    udata_write32(out, sizeof(tableOptions) / sizeof(uint16_t));
+    udata_write32(out, (tagBlock.top + stringBlock.top) / sizeof(uint16_t));
+    if (tableOptions.stringNormalizationType != UCNV_IO_UNNORMALIZED) {
+        udata_write32(out, (tagBlock.top + stringBlock.top) / sizeof(uint16_t));
+    }
+
+    /* write the table of converters */
+    /* Think of this as the column headers */
+    for(i=0; i<converterCount; ++i) {
+        udata_write16(out, (uint16_t)(converters[i].converter + aliasOffset));
+    }
+
+    /* write the table of tags */
+    /* Think of this as the row headers */
+    for(i=UCNV_NUM_RESERVED_TAGS; i<tagCount; ++i) {
+        udata_write16(out, tags[i].tag);
+    }
+    /* The empty tag is considered the leftover list, and put that at the end of the priority list. */
+    udata_write16(out, tags[EMPTY_TAG_NUM].tag);
+    udata_write16(out, tags[ALL_TAG_NUM].tag);
+
+    /* Write the unique list of aliases */
+    udata_writeBlock(out, uniqueAliases, uniqueAliasesSize * sizeof(uint16_t));
+
+    /* Write the converter that each unique alias resolves to */
+    udata_writeBlock(out, uniqueAliasesToConverter, uniqueAliasesSize * sizeof(uint16_t));
+
+    /* Write the array to the lists */
+    udata_writeBlock(out, (const void *)(aliasArrLists + (2*converterCount)), (((tagCount - 2) * converterCount) * sizeof(uint16_t)));
+    /* Now write the leftover part of the array for the EMPTY and ALL lists */
+    udata_writeBlock(out, (const void *)aliasArrLists, (2 * converterCount * sizeof(uint16_t)));
+
+    /* Offset the next array to make the index start at 1. */
+    udata_write16(out, 0xDEAD);
+
+    /* Write the lists */
+    udata_writeBlock(out, (const void *)aliasLists, aliasListsSize * sizeof(uint16_t));
+
+    /* Write any options for the alias table. */
+    udata_writeBlock(out, (const void *)&tableOptions, sizeof(tableOptions));
+
+    /* write the tags strings */
+    udata_writeString(out, tagBlock.store, tagBlock.top);
+
+    /* write the aliases strings */
+    udata_writeString(out, stringBlock.store, stringBlock.top);
+
+    /* write the normalized aliases strings */
+    if (tableOptions.stringNormalizationType != UCNV_IO_UNNORMALIZED) {
+        char *normalizedStrings = (char *)uprv_malloc(tagBlock.top + stringBlock.top);
+        if (normalizedStrings == NULL) {
+            fprintf(stderr, "%s: error: out of memory\n", path);
+            exit(U_MEMORY_ALLOCATION_ERROR);
+        }
+        createNormalizedAliasStrings(normalizedStrings, tagBlock.store, tagBlock.top);
+        createNormalizedAliasStrings(normalizedStrings + tagBlock.top, stringBlock.store, stringBlock.top);
+
+        /* Write out the complete normalized array. */
+        udata_writeString(out, normalizedStrings, tagBlock.top + stringBlock.top);
+        uprv_free(normalizedStrings);
+    }
+
+    uprv_free(uniqueAliasesToConverter);
+    uprv_free(uniqueAliases);
+    uprv_free(aliasArrLists);
+}
+
+/*
+ * Copies length bytes of s (the whole string when length is negative) into
+ * block, NUL-terminated and padded to an even size so that the copy can be
+ * addressed by a 16-bit index. Exits when the block is full or when the
+ * string contains non-invariant characters.
+ * Returns a pointer to the copy inside block->store.
+ */
+static char *
+allocString(StringBlock *block, const char *s, int32_t length) {
+    uint32_t top;
+    char *p;
+
+    if(length<0) {
+        length=(int32_t)uprv_strlen(s);
+    }
+
+    /*
+     * add 1 for the terminating NUL
+     * and round up (+1 &~1)
+     * to keep the addresses on a 16-bit boundary
+     */
+    top=block->top + (uint32_t)((length + 1 + 1) & ~1);
+
+    if(top >= block->max) {
+        fprintf(stderr, "%s:%d: error: out of memory\n", path, lineNum);
+        exit(U_MEMORY_ALLOCATION_ERROR);
+    }
+
+    /* get the pointer and copy the string */
+    p = block->store + block->top;
+    uprv_memcpy(p, s, length);
+    p[length] = 0; /* NUL-terminate it */
+    if((length & 1) == 0) {
+        p[length + 1] = 0; /* set the padding byte */
+    }
+
+    /* check for invariant characters now that we have a NUL-terminated string for easy output */
+    if(!uprv_isInvariantString(p, length)) {
+        fprintf(stderr, "%s:%d: error: the name %s contains not just invariant characters\n", path, lineNum, p);
+        exit(U_INVALID_TABLE_FORMAT);
+    }
+
+    block->top = top;
+    return p;
+}
+
+/*
+ * qsort() comparator for knownAliases[]: orders string indexes by
+ * ucnv_compareNames() and, for names that compare equal, by string length.
+ */
+static int
+compareAliases(const void *alias1, const void *alias2) {
+    const char *name1 = GET_ALIAS_STR(*(const uint16_t *)alias1);
+    const char *name2 = GET_ALIAS_STR(*(const uint16_t *)alias2);
+    /* Names like IBM850 and ibm-850 need to be sorted together */
+    int result = ucnv_compareNames(name1, name2);
+    if (!result) {
+        /* Sort the shortest first */
+        return (int)uprv_strlen(name1) - (int)uprv_strlen(name2);
+    }
+    return result;
+}
+
+/*
+ * Hey, Emacs, please set the following:
+ *
+ * Local Variables:
+ * indent-tabs-mode: nil
+ * End:
+ *
+ */
+
diff --git a/intl/icu/source/tools/gencnval/gencnval.vcxproj
b/intl/icu/source/tools/gencnval/gencnval.vcxproj @@ -0,0 +1,80 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <PropertyGroup Label="Globals"> + <ProjectGuid>{8B41752B-5A52-41E4-B7E0-07921C0CC6BF}</ProjectGuid> + </PropertyGroup> + <PropertyGroup Label="Configuration"> + <ConfigurationType>Application</ConfigurationType> + <UseOfMfc>false</UseOfMfc> + <CharacterSet>MultiByte</CharacterSet> + </PropertyGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> + <!-- The following import will include the 'default' configuration options for VS projects. --> + <Import Project="..\..\allinone\Build.Windows.ProjectConfiguration.props" /> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> + <ImportGroup Label="ExtensionSettings"> + </ImportGroup> + <PropertyGroup Label="UserMacros" /> + <PropertyGroup> + <_ProjectFileVersion>10.0.30319.1</_ProjectFileVersion> + <OutDir>.\$(Platform)\$(Configuration)\</OutDir> + <IntDir>.\$(Platform)\$(Configuration)\</IntDir> + <!-- The ICU projects use "Win32" to mean "x86", so we need to special case it. 
--> + <OutDir Condition="'$(Platform)'=='Win32'">.\x86\$(Configuration)\</OutDir> + <IntDir Condition="'$(Platform)'=='Win32'">.\x86\$(Configuration)\</IntDir> + <!-- Disable Incremental Linking for Release builds as it prevents Link-time Code Generation --> + <LinkIncremental Condition="'$(Configuration)'=='Debug'">true</LinkIncremental> + <LinkIncremental Condition="'$(Configuration)'=='Release'">false</LinkIncremental> + </PropertyGroup> + <!-- Options that are common to *all* configurations --> + <ItemDefinitionGroup> + <Midl> + <TypeLibraryName>$(OutDir)/gencnval.tlb</TypeLibraryName> + </Midl> + <ClCompile> + <WarningLevel>Level3</WarningLevel> + <CompileAs>Default</CompileAs> + <DisableLanguageExtensions>false</DisableLanguageExtensions> + <AdditionalIncludeDirectories>..\..\common;..\toolutil;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> + <PrecompiledHeaderOutputFile>$(OutDir)/gencnval.pch</PrecompiledHeaderOutputFile> + <AssemblerListingLocation>$(OutDir)/</AssemblerListingLocation> + <ObjectFileName>$(OutDir)/</ObjectFileName> + <ProgramDataBaseFileName>$(OutDir)/gencnval.pdb</ProgramDataBaseFileName> + </ClCompile> + <Link> + <SubSystem>Console</SubSystem> + <OutputFile>$(OutDir)/gencnval.exe</OutputFile> + <AdditionalLibraryDirectories>..\..\..\$(IcuLibOutputDir);%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories> + </Link> + <CustomBuildStep> + <Command>copy "$(TargetPath)" ..\..\..\$(IcuBinOutputDir)</Command> + <Outputs>..\..\..\$(IcuBinOutputDir)\$(TargetFileName);%(Outputs)</Outputs> + </CustomBuildStep> + </ItemDefinitionGroup> + <!-- Options that are common to all 'Debug' project configurations --> + <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'"> + <ClCompile> + <BrowseInformation>true</BrowseInformation> + <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary> + </ClCompile> + <Link> + <AdditionalDependencies>icuucd.lib;icutud.lib;%(AdditionalDependencies)</AdditionalDependencies> + </Link> + 
</ItemDefinitionGroup> + <!-- Options that are common to all 'Release' project configurations --> + <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'"> + <ClCompile> + <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary> + <FunctionLevelLinking>true</FunctionLevelLinking> + </ClCompile> + <Link> + <AdditionalDependencies>icuuc.lib;icutu.lib;%(AdditionalDependencies)</AdditionalDependencies> + </Link> + </ItemDefinitionGroup> + <ItemGroup> + <ClCompile Include="gencnval.c" /> + </ItemGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> + <ImportGroup Label="ExtensionTargets"> + </ImportGroup> +</Project>
\ No newline at end of file diff --git a/intl/icu/source/tools/gencnval/gencnval.vcxproj.filters b/intl/icu/source/tools/gencnval/gencnval.vcxproj.filters new file mode 100644 index 0000000000..20f10c506d --- /dev/null +++ b/intl/icu/source/tools/gencnval/gencnval.vcxproj.filters @@ -0,0 +1,22 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <ItemGroup> + <Filter Include="Source Files"> + <UniqueIdentifier>{570fb8ae-ac18-467d-8502-470a241a60d4}</UniqueIdentifier> + <Extensions>cpp;c;cxx;rc;def;r;odl;idl;hpj;bat</Extensions> + </Filter> + <Filter Include="Header Files"> + <UniqueIdentifier>{7b2185f2-4ff9-4419-b596-0a21e37414c9}</UniqueIdentifier> + <Extensions>h;hpp;hxx;hm;inl</Extensions> + </Filter> + <Filter Include="Resource Files"> + <UniqueIdentifier>{1dc5e7e3-4d1b-4031-a31f-c39b3a3e283a}</UniqueIdentifier> + <Extensions>ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe</Extensions> + </Filter> + </ItemGroup> + <ItemGroup> + <ClCompile Include="gencnval.c"> + <Filter>Source Files</Filter> + </ClCompile> + </ItemGroup> +</Project>
\ No newline at end of file diff --git a/intl/icu/source/tools/gencnval/sources.txt b/intl/icu/source/tools/gencnval/sources.txt new file mode 100644 index 0000000000..9206d402d9 --- /dev/null +++ b/intl/icu/source/tools/gencnval/sources.txt @@ -0,0 +1 @@ +gencnval.c diff --git a/intl/icu/source/tools/gencolusb/Makefile b/intl/icu/source/tools/gencolusb/Makefile new file mode 100644 index 0000000000..be13b5b106 --- /dev/null +++ b/intl/icu/source/tools/gencolusb/Makefile @@ -0,0 +1,45 @@ +## Makefile for rebuilding 'unsafe backward' data +## Copyright (C) 2016 and later: Unicode, Inc. and others. +## License & terms of use: http://www.unicode.org/copyright.html +## Copyright (c) 2015, International Business Machines Corporation and +## others. All Rights Reserved. + +## +## CONFIGURATION: +## 1. create Makefile.local containing overrides if necessary: +## BUILD_ROOT=/home/user/icu-build (location of 'config.status' etc.) +## PATH_VAR=DYLD_LIBRARY_PATH (if on OSX etc) +## +## UPDATING +## 1. make 'reset-icu' will reset ICU to 'bootstrap' state, zeroing out source/i18n/collunsafe.h +## 2. make 'gen-file' will generate and test source/i18n/collunsafe.h + +subdir=tools/gencolusb +srcdir=$(shell pwd) +SOURCE_ROOT=$(shell cd ../.. 
; pwd) +BUILD_ROOT=$(SOURCE_ROOT) +BUILD_HERE=$(BUILD_ROOT)/$(subdir) +TOOL=extract_unsafe_backwards +TEST=verify_uset +PATH_VAR=LD_LIBRARY_PATH + +-include Makefile.local + +GEN_FILE=$(SOURCE_ROOT)/i18n/collunsafe.h +BUILD_OPTS=-I$(SOURCE_ROOT)/common -I$(SOURCE_ROOT)/i18n -L$(BUILD_ROOT)/lib -licuuc -licui18n -licudata +RUN_OPTS=env $(PATH_VAR)=$(BUILD_ROOT)/lib + +reset-icu: + >$(GEN_FILE) + $(MAKE) -C $(BUILD_ROOT)/i18n + +gen-file: reset-icu + mkdir -p $(BUILD_HERE) + $(CXX) -o $(BUILD_HERE)/$(TOOL) $(srcdir)/$(TOOL).cpp $(BUILD_OPTS) + $(RUN_OPTS) $(BUILD_HERE)/$(TOOL) > $(GEN_FILE) || exit 1 + $(CXX) -o $(BUILD_HERE)/$(TEST) $(srcdir)/$(TEST).cpp $(BUILD_OPTS) + $(RUN_OPTS) $(BUILD_HERE)/$(TEST) || exit 1 + $(MAKE) -C $(BUILD_ROOT)/i18n + $(RUN_OPTS) $(BUILD_HERE)/$(TEST) || exit 1 + +.PHONY: reset-icu gen-file diff --git a/intl/icu/source/tools/gencolusb/README.md b/intl/icu/source/tools/gencolusb/README.md new file mode 100644 index 0000000000..b0d9bae091 --- /dev/null +++ b/intl/icu/source/tools/gencolusb/README.md @@ -0,0 +1,10 @@ +Unsafe-Backward Collator Data +=== + +This directory contains tools to build the `source/i18n/collunsafe.h` +precomputed data. + +See [Makefile](./Makefile) for more details. + +* Copyright (C) 2016 and later: Unicode, Inc. and others. License & terms of use: http://www.unicode.org/copyright.html +* Copyright (c) 2015, International Business Machines Corporation and others. All Rights Reserved. diff --git a/intl/icu/source/tools/gencolusb/extract_unsafe_backwards.cpp b/intl/icu/source/tools/gencolusb/extract_unsafe_backwards.cpp new file mode 100644 index 0000000000..ee12e69f9b --- /dev/null +++ b/intl/icu/source/tools/gencolusb/extract_unsafe_backwards.cpp @@ -0,0 +1,168 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/** + * Copyright (c) 1999-2016, International Business Machines Corporation and + * others. All Rights Reserved. 
+ * + * Generator for source/i18n/collunsafe.h + * see Makefile + */ + +#include <stdio.h> +#include "unicode/uversion.h" +#include "unicode/uniset.h" +#include "collationroot.h" +#include "collationtailoring.h" + +/** + * Define the type of generator to use. Choose one. + */ +#define SERIALIZE 1 //< Default: use UnicodeSet.serialize() and a new internal c'tor +#define RANGES 0 //< Enumerate ranges (works, not as fast. No support in collationdatareader.cpp) +#define PATTERN 0 //< Generate a UnicodeSet pattern (depends on #11891 AND probably slower. No support in collationdatareader.cpp) + +int main(int argc, const char *argv[]) { + UErrorCode errorCode = U_ZERO_ERROR; + + // Get the unsafeBackwardsSet + const CollationCacheEntry *rootEntry = CollationRoot::getRootCacheEntry(errorCode); + if(U_FAILURE(errorCode)) { + fprintf(stderr, "Err: %s getting root cache entry\n", u_errorName(errorCode)); + return 1; + } + const UVersionInfo &version = rootEntry->tailoring->version; + const UnicodeSet *unsafeBackwardSet = rootEntry->tailoring->unsafeBackwardSet; + char verString[20]; + u_versionToString(version, verString); + fprintf(stderr, "Generating data for ICU %s, Collation %s\n", U_ICU_VERSION, verString); + int32_t rangeCount = unsafeBackwardSet->getRangeCount(); + +#if SERIALIZE + fprintf(stderr, ".. 
serializing\n"); + // UnicodeSet serialization + + UErrorCode preflightCode = U_ZERO_ERROR; + // preflight + int32_t serializedCount = unsafeBackwardSet->serialize(nullptr,0,preflightCode); + if(U_FAILURE(preflightCode) && preflightCode != U_BUFFER_OVERFLOW_ERROR) { + fprintf(stderr, "Err: %s preflighting unicode set\n", u_errorName(preflightCode)); + return 1; + } + uint16_t *serializedData = new uint16_t[serializedCount]; + // serialize + unsafeBackwardSet->serialize(serializedData, serializedCount, errorCode); + if(U_FAILURE(errorCode)) { + delete [] serializedData; + fprintf(stderr, "Err: %s serializing unicodeset\n", u_errorName(errorCode)); + return 1; + } +#endif + +#if PATTERN + fprintf(stderr,".. pattern. (Note: collationdatareader.cpp does not support this form also see #11891)\n"); + // attempt to use pattern + + UnicodeString pattern; + UnicodeSet set(*unsafeBackwardSet); + set.compact(); + set.toPattern(pattern, false); + + if(U_SUCCESS(errorCode)) { + // This fails (bug# ?) - which is why this method was abandoned. + + // UnicodeSet usA(pattern, errorCode); + // fprintf(stderr, "\n%s:%d: err creating set A %s\n", __FILE__, __LINE__, u_errorName(errorCode)); + // return 1; + } + + + const char16_t *buf = pattern.getBuffer(); + int32_t needed = pattern.length(); + + // print + { + char buf2[2048]; + int32_t len2 = pattern.extract(0, pattern.length(), buf2, "utf-8"); + buf2[len2]=0; + fprintf(stderr,"===\n%s\n===\n", buf2); + } + + const UnicodeString unsafeBackwardPattern(false, buf, needed); + if(U_SUCCESS(errorCode)) { + //UnicodeSet us(unsafeBackwardPattern, errorCode); + // fprintf(stderr, "\n%s:%d: err creating set %s\n", __FILE__, __LINE__, u_errorName(errorCode)); + } else { + fprintf(stderr, "Uset OK - \n"); + } +#endif + + + // Generate the output file. 
+ + printf("// collunsafe.h\n"); + printf("// %s\n", U_COPYRIGHT_STRING); + printf("\n"); + printf("// To be included by collationdatareader.cpp, and generated by gencolusb.\n"); + printf("// Machine generated, do not edit.\n"); + printf("\n"); + printf("#ifndef COLLUNSAFE_H\n" + "#define COLLUNSAFE_H\n" + "\n" + "#include \"unicode/utypes.h\"\n" + "\n" + "#define COLLUNSAFE_ICU_VERSION \"" U_ICU_VERSION "\"\n"); + printf("#define COLLUNSAFE_COLL_VERSION \"%s\"\n", verString); + + + +#if PATTERN + printf("#define COLLUNSAFE_PATTERN 1\n"); + printf("static const int32_t collunsafe_len = %d;\n", needed); + printf("static const char16_t collunsafe_pattern[collunsafe_len] = {\n"); + for(int i=0;i<needed;i++) { + if( (i>0) && (i%8 == 0) ) { + printf(" // %d\n", i); + } + printf("0x%04X", buf[i]); // TODO check + if(i != (needed-1)) { + printf(", "); + } + } + printf(" //%d\n};\n", (needed-1)); +#endif + +#if RANGE + fprintf(stderr, "COLLUNSAFE_RANGE - no code support in collationdatareader.cpp for this\n"); + printf("#define COLLUNSAFE_RANGE 1\n"); + printf("static const int32_t unsafe_rangeCount = %d;\n", rangeCount); + printf("static const UChar32 unsafe_ranges[%d] = { \n", rangeCount*2); + for(int32_t i=0;i<rangeCount;i++) { + printf(" 0x%04X, 0x%04X, // %d\n", + unsafeBackwardSet->getRangeStart(i), + unsafeBackwardSet->getRangeEnd(i), + i); + } + printf("};\n"); +#endif + +#if SERIALIZE + printf("#define COLLUNSAFE_SERIALIZE 1\n"); + printf("static const int32_t unsafe_serializedCount = %d;\n", serializedCount); + printf("static const uint16_t unsafe_serializedData[%d] = { \n", serializedCount); + for(int32_t i=0;i<serializedCount;i++) { + if( (i>0) && (i%8 == 0) ) { + printf(" // %d\n", i); + } + printf("0x%04X", serializedData[i]); // TODO check + if(i != (serializedCount-1)) { + printf(", "); + } + } + printf("};\n"); +#endif + + printf("#endif\n"); + fflush(stderr); + fflush(stdout); + return(U_SUCCESS(errorCode)?0:1); +} diff --git 
a/intl/icu/source/tools/gencolusb/verify_uset.cpp b/intl/icu/source/tools/gencolusb/verify_uset.cpp new file mode 100644 index 0000000000..03a4930489 --- /dev/null +++ b/intl/icu/source/tools/gencolusb/verify_uset.cpp @@ -0,0 +1,71 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/** + * Copyright (c) 1999-2012, International Business Machines Corporation and + * others. All Rights Reserved. + * + * Test for source/i18n/collunsafe.h + */ + +#include <stdio.h> +#include "unicode/ucol.h" +#include "unicode/uniset.h" +#include "unicode/coll.h" +#include "collation.h" + +#include "collunsafe.h" + + +int main(int argc, const char *argv[]) { + puts("verify"); + UErrorCode errorCode = U_ZERO_ERROR; +#if defined (COLLUNSAFE_PATTERN) + puts("verify pattern"); + const UnicodeString unsafeBackwardPattern(false, collunsafe_pattern, collunsafe_len); + fprintf(stderr, "\n -- pat '%c%c%c%c%c'\n", + collunsafe_pattern[0], + collunsafe_pattern[1], + collunsafe_pattern[2], + collunsafe_pattern[3], + collunsafe_pattern[4]); + if(U_SUCCESS(errorCode)) { + UnicodeSet us(unsafeBackwardPattern, errorCode); + fprintf(stderr, "\n%s:%d: err creating set %s\n", __FILE__, __LINE__, u_errorName(errorCode)); + } +#endif + +#if defined (COLLUNSAFE_RANGE) + { + puts("verify range"); + UnicodeSet u; + for(int32_t i=0;i<unsafe_rangeCount*2;i+=2) { + u.add(unsafe_ranges[i+0],unsafe_ranges[i+1]); + } + printf("Finished with %d ranges\n", u.getRangeCount()); + } +#endif + +#if defined (COLLUNSAFE_SERIALIZE) + { + puts("verify serialize"); + UnicodeSet u(unsafe_serializedData, unsafe_serializedCount, UnicodeSet::kSerialized, errorCode); + fprintf(stderr, "\n%s:%d: err creating set %s\n", __FILE__, __LINE__, u_errorName(errorCode)); + printf("Finished deserialize with %d ranges\n", u.getRangeCount()); + } +#endif +// if(tailoring.unsafeBackwardSet == nullptr) { + // errorCode = U_MEMORY_ALLOCATION_ERROR; + // fprintf(stderr, "\n%s:%d: 
err %s\n", __FILE__, __LINE__, u_errorName(errorCode)); + // } + puts("verify col UCA"); + if(U_SUCCESS(errorCode)) { + Collator *col = Collator::createInstance(Locale::getEnglish(), errorCode); + fprintf(stderr, "\n%s:%d: err %s creating collator\n", __FILE__, __LINE__, u_errorName(errorCode)); + } + + if(U_FAILURE(errorCode)) { + return 1; + } else { + return 0; + } +} diff --git a/intl/icu/source/tools/gendict/Makefile.in b/intl/icu/source/tools/gendict/Makefile.in new file mode 100644 index 0000000000..d2cdb1c3c3 --- /dev/null +++ b/intl/icu/source/tools/gendict/Makefile.in @@ -0,0 +1,96 @@ +## Makefile.in for ICU - tools/gendict +## Copyright (C) 2016 and later: Unicode, Inc. and others. +## License & terms of use: http://www.unicode.org/copyright.html +## Copyright (c) 2002-2012 International Business Machines Corporation and +## others. All Rights Reserved. + +## Source directory information +srcdir = @srcdir@ +top_srcdir = @top_srcdir@ + +top_builddir = ../.. + +include $(top_builddir)/icudefs.mk + +## Build directory information +subdir = tools/gendict + +TARGET_STUB_NAME = gendict + +SECTION = 1 + +MAN_FILES = $(TARGET_STUB_NAME).$(SECTION) + + +## Extra files to remove for 'make clean' +CLEANFILES = *~ $(DEPS) $(MAN_FILES) + +## Target information +TARGET = $(BINDIR)/$(TARGET_STUB_NAME)$(EXEEXT) + +CPPFLAGS += -I$(top_srcdir)/common -I$(srcdir)/../toolutil +LIBS = $(LIBICUTOOLUTIL) $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS) $(LIB_M) + +SOURCES = $(shell cat $(srcdir)/sources.txt) +OBJECTS = $(SOURCES:.cpp=.o) + +DEPS = $(OBJECTS:.o=.d) + +## List of phony targets +.PHONY : all all-local install install-local clean clean-local \ +distclean distclean-local dist dist-local check check-local install-man + +## Clear suffix list +.SUFFIXES : + +## List of standard targets +all: all-local +install: install-local +clean: clean-local +distclean : distclean-local +dist: dist-local +check: all check-local + +all-local: $(TARGET) $(MAN_FILES) + +install-local: 
all-local install-man + $(MKINSTALLDIRS) $(DESTDIR)$(bindir) + $(INSTALL) $(TARGET) $(DESTDIR)$(bindir) + +install-man: $(MAN_FILES) + $(MKINSTALLDIRS) $(DESTDIR)$(mandir)/man$(SECTION) + $(INSTALL_DATA) $? $(DESTDIR)$(mandir)/man$(SECTION) + +dist-local: + +clean-local: + test -z "$(CLEANFILES)" || $(RMV) $(CLEANFILES) + $(RMV) $(TARGET) $(OBJECTS) + +distclean-local: clean-local + $(RMV) Makefile + +check-local: all-local + +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + cd $(top_builddir) \ + && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status + +$(TARGET) : $(OBJECTS) + $(LINK.cc) $(OUTOPT)$@ $^ $(LIBS) + $(POST_BUILD_STEP) + + +%.$(SECTION): $(srcdir)/%.$(SECTION).in + cd $(top_builddir) \ + && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status + + +ifeq (,$(MAKECMDGOALS)) +-include $(DEPS) +else +ifneq ($(patsubst %clean,,$(MAKECMDGOALS)),) +-include $(DEPS) +endif +endif + diff --git a/intl/icu/source/tools/gendict/gendict.1.in b/intl/icu/source/tools/gendict/gendict.1.in new file mode 100644 index 0000000000..f204f83e4e --- /dev/null +++ b/intl/icu/source/tools/gendict/gendict.1.in @@ -0,0 +1,133 @@ +.\" Hey, Emacs! This is -*-nroff-*- you know... +.\" +.\" gendict.1: manual page for the gendict utility +.\" +.\" Copyright (C) 2016 and later: Unicode, Inc. and others. 
+.\" License & terms of use: http://www.unicode.org/copyright.html +.\" Copyright (C) 2012 International Business Machines Corporation and others +.\" +.TH GENDICT 1 "1 June 2012" "ICU MANPAGE" "ICU @VERSION@ Manual" +.SH NAME +.B gendict +\- Compiles word list into ICU string trie dictionary +.SH SYNOPSIS +.B gendict +[ +.BR "\fB\-\-uchars" +| +.BR "\fB\-\-bytes" +.BI "\fB\-\-transform" " transform" +] +[ +.BR "\-h\fP, \fB\-?\fP, \fB\-\-help" +] +[ +.BR "\-V\fP, \fB\-\-version" +] +[ +.BR "\-c\fP, \fB\-\-copyright" +] +[ +.BR "\-v\fP, \fB\-\-verbose" +] +[ +.BI "\-i\fP, \fB\-\-icudatadir" " directory" +] +.IR " input-file" +.IR " output\-file" +.SH DESCRIPTION +.B gendict +reads the word list from +.I input-file +and creates a string trie dictionary file. Normally this data file has the +.B .dict +extension. +.PP +Words begin at the beginning of a line and are terminated by the first whitespace. +Lines that begin with whitespace are ignored. +.SH OPTIONS +.TP +.BR "\-h\fP, \fB\-?\fP, \fB\-\-help" +Print help about usage and exit. +.TP +.BR "\-V\fP, \fB\-\-version" +Print the version of +.B gendict +and exit. +.TP +.BR "\-c\fP, \fB\-\-copyright" +Embeds the standard ICU copyright into the +.IR output-file . +.TP +.BR "\-v\fP, \fB\-\-verbose" +Display extra informative messages during execution. +.TP +.BI "\-i\fP, \fB\-\-icudatadir" " directory" +Look for any necessary ICU data files in +.IR directory . +For example, the file +.B pnames.icu +must be located when ICU's data is not built as a shared library. +The default ICU data directory is specified by the environment variable +.BR ICU_DATA . +Most configurations of ICU do not require this argument. +.TP +.BR "\fB\-\-uchars" +Set the output trie type to UChar. Mutually exclusive with +.BR --bytes. +.TP +.BR "\fB\-\-bytes" +Set the output trie type to Bytes. Mutually exclusive with +.BR --uchars. +.TP +.BR "\fB\-\-transform" +Set the transform type. Should only be specified with +.BR --bytes. 
+Currently supported transforms are: +.BR offset-<hex-number>, +which specifies an offset to subtract from all input characters. +It should be noted that the offset transform also maps U+200D +to 0xFF and U+200C to 0xFE, in order to offer compatibility to +languages that require these characters. +A transform must be specified for a bytes trie, and when applied +to the non-value characters in the +.IR input-file +must produce output between 0x00 and 0xFF. +.TP +.BI " input\-file" +The source file to read. +.TP +.BI " output\-file" +The file to write the output dictionary to. +.SH CAVEATS +The +.IR input-file +is assumed to be encoded in UTF-8. +The integers in the +.IR input-file +that are used as values must be made up of ASCII digits. They +may be specified either in hex, by using a 0x prefix, or in +decimal. +Either +.BI --bytes +or +.BI --uchars +must be specified. +.SH ENVIRONMENT +.TP 10 +.B ICU_DATA +Specifies the directory containing ICU data. Defaults to +.BR @thepkgicudatadir@/@PACKAGE@/@VERSION@/ . +Some tools in ICU depend on the presence of the trailing slash. It is thus +important to make sure that it is present if +.B ICU_DATA +is set. +.SH AUTHORS +Maxime Serrano +.SH VERSION +1.0 +.SH COPYRIGHT +Copyright (C) 2012 International Business Machines Corporation and others +.SH SEE ALSO +.BR http://www.icu-project.org/userguide/boundaryAnalysis.html + diff --git a/intl/icu/source/tools/gendict/gendict.cpp b/intl/icu/source/tools/gendict/gendict.cpp new file mode 100644 index 0000000000..995bd32724 --- /dev/null +++ b/intl/icu/source/tools/gendict/gendict.cpp @@ -0,0 +1,480 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +********************************************************************** +* Copyright (C) 2002-2016, International Business Machines +* Corporation and others. All Rights Reserved. 
+********************************************************************** +* +* File gendict.cpp +*/ + +#include "unicode/utypes.h" +#include "unicode/uchar.h" +#include "unicode/ucnv.h" +#include "unicode/uniset.h" +#include "unicode/unistr.h" +#include "unicode/uclean.h" +#include "unicode/udata.h" +#include "unicode/putil.h" +#include "unicode/ucharstriebuilder.h" +#include "unicode/bytestriebuilder.h" +#include "unicode/ucharstrie.h" +#include "unicode/bytestrie.h" +#include "unicode/ucnv.h" +#include "unicode/ustring.h" +#include "unicode/utf16.h" + +#include "charstr.h" +#include "dictionarydata.h" +#include "uoptions.h" +#include "unewdata.h" +#include "cmemory.h" +#include "uassert.h" +#include "ucbuf.h" +#include "toolutil.h" +#include "cstring.h" + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "putilimp.h" +UDate startTime; + +static int elapsedTime() { + return (int)uprv_floor((uprv_getRawUTCtime()-startTime)/1000.0); +} + +U_NAMESPACE_USE + +static char *progName; +static UOption options[]={ + UOPTION_HELP_H, /* 0 */ + UOPTION_HELP_QUESTION_MARK, /* 1 */ + UOPTION_VERBOSE, /* 2 */ + UOPTION_ICUDATADIR, /* 4 */ + UOPTION_COPYRIGHT, /* 5 */ + { "uchars", nullptr, nullptr, nullptr, '\1', UOPT_NO_ARG, 0}, /* 6 */ + { "bytes", nullptr, nullptr, nullptr, '\1', UOPT_NO_ARG, 0}, /* 7 */ + { "transform", nullptr, nullptr, nullptr, '\1', UOPT_REQUIRES_ARG, 0}, /* 8 */ + UOPTION_QUIET, /* 9 */ +}; + +enum arguments { + ARG_HELP = 0, + ARG_QMARK, + ARG_VERBOSE, + ARG_ICUDATADIR, + ARG_COPYRIGHT, + ARG_UCHARS, + ARG_BYTES, + ARG_TRANSFORM, + ARG_QUIET +}; + +// prints out the standard usage method describing command line arguments, +// then bails out with the desired exit code +static void usageAndDie(UErrorCode retCode) { + fprintf((U_SUCCESS(retCode) ? stdout : stderr), "Usage: %s -trietype [-options] input-dictionary-file output-file\n", progName); + fprintf((U_SUCCESS(retCode) ? 
stdout : stderr), + "\tRead in a word list and write out a string trie dictionary\n" + "options:\n" + "\t-h or -? or --help this usage text\n" + "\t-V or --version show a version message\n" + "\t-c or --copyright include a copyright notice\n" + "\t-v or --verbose turn on verbose output\n" + "\t-q or --quiet do not display warnings and progress\n" + "\t-i or --icudatadir directory for locating any needed intermediate data files,\n" // TODO: figure out if we need this option + "\t followed by path, defaults to %s\n" + "\t--uchars output a UCharsTrie (mutually exclusive with -b!)\n" + "\t--bytes output a BytesTrie (mutually exclusive with -u!)\n" + "\t--transform the kind of transform to use (eg --transform offset-40A3,\n" + "\t which specifies an offset transform with constant 0x40A3)\n", + u_getDataDirectory()); + exit(retCode); +} + + +/* UDataInfo cf. udata.h */ +static UDataInfo dataInfo = { + sizeof(UDataInfo), + 0, + + U_IS_BIG_ENDIAN, + U_CHARSET_FAMILY, + U_SIZEOF_UCHAR, + 0, + + { 0x44, 0x69, 0x63, 0x74 }, /* "Dict" */ + { 1, 0, 0, 0 }, /* format version */ + { 0, 0, 0, 0 } /* data version */ +}; + +#if !UCONFIG_NO_BREAK_ITERATION + +// A wrapper for both BytesTrieBuilder and UCharsTrieBuilder. +// may want to put this somewhere in ICU, as it could be useful outside +// of this tool? +class DataDict { +private: + BytesTrieBuilder *bt; + UCharsTrieBuilder *ut; + UChar32 transformConstant; + int32_t transformType; +public: + // constructs a new data dictionary. 
if there is an error, + // it will be returned in status + // isBytesTrie != 0 will produce a BytesTrieBuilder, + // isBytesTrie == 0 will produce a UCharsTrieBuilder + DataDict(UBool isBytesTrie, UErrorCode &status) : bt(nullptr), ut(nullptr), + transformConstant(0), transformType(DictionaryData::TRANSFORM_NONE) { + if (isBytesTrie) { + bt = new BytesTrieBuilder(status); + } else { + ut = new UCharsTrieBuilder(status); + } + } + + ~DataDict() { + delete bt; + delete ut; + } + +private: + char transform(UChar32 c, UErrorCode &status) { + if (transformType == DictionaryData::TRANSFORM_TYPE_OFFSET) { + if (c == 0x200D) { return (char)0xFF; } + else if (c == 0x200C) { return (char)0xFE; } + int32_t delta = c - transformConstant; + if (delta < 0 || 0xFD < delta) { + fprintf(stderr, "Codepoint U+%04lx out of range for --transform offset-%04lx!\n", + (long)c, (long)transformConstant); + exit(U_ILLEGAL_ARGUMENT_ERROR); // TODO: should return and print the line number + } + return (char)delta; + } else { // no such transform type + status = U_INTERNAL_PROGRAM_ERROR; + return (char)c; // it should be noted this transform type will not generally work + } + } + + void transform(const UnicodeString &word, CharString &buf, UErrorCode &errorCode) { + UChar32 c = 0; + int32_t len = word.length(); + for (int32_t i = 0; i < len; i += U16_LENGTH(c)) { + c = word.char32At(i); + buf.append(transform(c, errorCode), errorCode); + } + } + +public: + // sets the desired transformation data. + // should be populated from a command line argument + // so far the only acceptable format is offset-<hex constant> + // eventually others (mask-<hex constant>?) 
may be enabled + // more complex functions may be more difficult + void setTransform(const char *t) { + if (strncmp(t, "offset-", 7) == 0) { + char *end; + unsigned long base = uprv_strtoul(t + 7, &end, 16); + if (end == (t + 7) || *end != 0 || base > 0x10FF80) { + fprintf(stderr, "Syntax for offset value in --transform offset-%s invalid!\n", t + 7); + usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); + } + transformType = DictionaryData::TRANSFORM_TYPE_OFFSET; + transformConstant = (UChar32)base; + } + else { + fprintf(stderr, "Invalid transform specified: %s\n", t); + usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); + } + } + + // add a word to the trie + void addWord(const UnicodeString &word, int32_t value, UErrorCode &status) { + if (bt) { + CharString buf; + transform(word, buf, status); + bt->add(buf.toStringPiece(), value, status); + } + if (ut) { ut->add(word, value, status); } + } + + // if we are a bytestrie, give back the StringPiece representing the serialized version of us + StringPiece serializeBytes(UErrorCode &status) { + return bt->buildStringPiece(USTRINGTRIE_BUILD_SMALL, status); + } + + // if we are a ucharstrie, produce the UnicodeString representing the serialized version of us + void serializeUChars(UnicodeString &s, UErrorCode &status) { + ut->buildUnicodeString(USTRINGTRIE_BUILD_SMALL, s, status); + } + + int32_t getTransform() { + return (int32_t)(transformType | transformConstant); + } +}; +#endif + +static const char16_t LINEFEED_CHARACTER = 0x000A; +static const char16_t CARRIAGE_RETURN_CHARACTER = 0x000D; + +static UBool readLine(UCHARBUF *f, UnicodeString &fileLine, IcuToolErrorCode &errorCode) { + int32_t lineLength; + const char16_t *line = ucbuf_readline(f, &lineLength, errorCode); + if(line == nullptr || errorCode.isFailure()) { return false; } + // Strip trailing CR/LF, comments, and spaces. 
+ const char16_t *comment = u_memchr(line, 0x23, lineLength); // '#' + if(comment != nullptr) { + lineLength = (int32_t)(comment - line); + } else { + while(lineLength > 0 && (line[lineLength - 1] == CARRIAGE_RETURN_CHARACTER || line[lineLength - 1] == LINEFEED_CHARACTER)) { --lineLength; } + } + while(lineLength > 0 && u_isspace(line[lineLength - 1])) { --lineLength; } + fileLine.setTo(false, line, lineLength); + return true; +} + +//---------------------------------------------------------------------------- +// +// main for gendict +// +//---------------------------------------------------------------------------- +int main(int argc, char **argv) { + // + // Pick up and check the command line arguments, + // using the standard ICU tool utils option handling. + // + U_MAIN_INIT_ARGS(argc, argv); + progName = argv[0]; + argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options); + if(argc<0) { + // Unrecognized option + fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]); + usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); + } + + if(options[ARG_HELP].doesOccur || options[ARG_QMARK].doesOccur) { + // -? or -h for help. 
+ usageAndDie(U_ZERO_ERROR); + } + + UBool verbose = options[ARG_VERBOSE].doesOccur; + UBool quiet = options[ARG_QUIET].doesOccur; + + if (argc < 3) { + fprintf(stderr, "input and output file must both be specified.\n"); + usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); + } + const char *outFileName = argv[2]; + const char *wordFileName = argv[1]; + + startTime = uprv_getRawUTCtime(); // initialize start timer + + if (options[ARG_ICUDATADIR].doesOccur) { + u_setDataDirectory(options[ARG_ICUDATADIR].value); + } + + const char *copyright = nullptr; + if (options[ARG_COPYRIGHT].doesOccur) { + copyright = U_COPYRIGHT_STRING; + } + + if (options[ARG_UCHARS].doesOccur == options[ARG_BYTES].doesOccur) { + fprintf(stderr, "you must specify exactly one type of trie to output!\n"); + usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); + } + UBool isBytesTrie = options[ARG_BYTES].doesOccur; + if (isBytesTrie != options[ARG_TRANSFORM].doesOccur) { + fprintf(stderr, "you must provide a transformation for a bytes trie, and must not provide one for a uchars trie!\n"); + usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); + } + + IcuToolErrorCode status("gendict/main()"); + +#if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO + const char* outDir=nullptr; + + UNewDataMemory *pData; + char msg[1024]; + UErrorCode tempstatus = U_ZERO_ERROR; + + /* write message with just the name */ // potential for a buffer overflow here... 
+ snprintf(msg, sizeof(msg), "gendict writes dummy %s because of UCONFIG_NO_BREAK_ITERATION and/or UCONFIG_NO_FILE_IO, see uconfig.h", outFileName); + fprintf(stderr, "%s\n", msg); + + /* write the dummy data file */ + pData = udata_create(outDir, nullptr, outFileName, &dataInfo, nullptr, &tempstatus); + udata_writeBlock(pData, msg, strlen(msg)); + udata_finish(pData, &tempstatus); + return (int)tempstatus; + +#else + // Read in the dictionary source file + if (verbose) { printf("Opening file %s...\n", wordFileName); } + const char *codepage = "UTF-8"; + LocalUCHARBUFPointer f(ucbuf_open(wordFileName, &codepage, true, false, status)); + if (status.isFailure()) { + fprintf(stderr, "error opening input file: ICU Error \"%s\"\n", status.errorName()); + exit(status.reset()); + } + if (verbose) { printf("Initializing dictionary builder of type %s...\n", (isBytesTrie ? "BytesTrie" : "UCharsTrie")); } + DataDict dict(isBytesTrie, status); + if (status.isFailure()) { + fprintf(stderr, "new DataDict: ICU Error \"%s\"\n", status.errorName()); + exit(status.reset()); + } + if (options[ARG_TRANSFORM].doesOccur) { + dict.setTransform(options[ARG_TRANSFORM].value); + } + + UnicodeString fileLine; + if (verbose) { puts("Adding words to dictionary..."); } + UBool hasValues = false; + UBool hasValuelessContents = false; + int lineCount = 0; + int wordCount = 0; + int minlen = 255; + int maxlen = 0; + UBool isOk = true; + while (readLine(f.getAlias(), fileLine, status)) { + lineCount++; + if (fileLine.isEmpty()) continue; + + // Parse word [spaces value]. 
+ int32_t keyLen; + for (keyLen = 0; keyLen < fileLine.length() && !u_isspace(fileLine[keyLen]); ++keyLen) {} + if (keyLen == 0) { + fprintf(stderr, "Error: no word on line %i!\n", lineCount); + isOk = false; + continue; + } + int32_t valueStart; + for (valueStart = keyLen; + valueStart < fileLine.length() && u_isspace(fileLine[valueStart]); + ++valueStart) {} + + if (keyLen < valueStart) { + int32_t valueLength = fileLine.length() - valueStart; + if (valueLength > 15) { + fprintf(stderr, "Error: value too long on line %i!\n", lineCount); + isOk = false; + continue; + } + char s[16]; + fileLine.extract(valueStart, valueLength, s, 16, US_INV); + char *end; + unsigned long value = uprv_strtoul(s, &end, 0); + if (end == s || *end != 0 || (int32_t)uprv_strlen(s) != valueLength || value > 0xffffffff) { + fprintf(stderr, "Error: value syntax error or value too large on line %i!\n", lineCount); + isOk = false; + continue; + } + dict.addWord(fileLine.tempSubString(0, keyLen), (int32_t)value, status); + hasValues = true; + wordCount++; + if (keyLen < minlen) minlen = keyLen; + if (keyLen > maxlen) maxlen = keyLen; + } else { + dict.addWord(fileLine.tempSubString(0, keyLen), 0, status); + hasValuelessContents = true; + wordCount++; + if (keyLen < minlen) minlen = keyLen; + if (keyLen > maxlen) maxlen = keyLen; + } + + if (status.isFailure()) { + fprintf(stderr, "ICU Error \"%s\": Failed to add word to trie at input line %d in input file\n", + status.errorName(), lineCount); + exit(status.reset()); + } + } + if (verbose) { printf("Processed %d lines, added %d words, minlen %d, maxlen %d\n", lineCount, wordCount, minlen, maxlen); } + + if (!isOk && status.isSuccess()) { + status.set(U_ILLEGAL_ARGUMENT_ERROR); + } + if (hasValues && hasValuelessContents) { + fprintf(stderr, "warning: file contained both valued and unvalued strings!\n"); + } + + if (verbose) { printf("Serializing data...isBytesTrie? 
%d\n", isBytesTrie); } + int32_t outDataSize; + const void *outData; + UnicodeString usp; + if (isBytesTrie) { + StringPiece sp = dict.serializeBytes(status); + outDataSize = sp.size(); + outData = sp.data(); + } else { + dict.serializeUChars(usp, status); + outDataSize = usp.length() * U_SIZEOF_UCHAR; + outData = usp.getBuffer(); + } + if (status.isFailure()) { + fprintf(stderr, "gendict: got failure of type %s while serializing, if U_ILLEGAL_ARGUMENT_ERROR possibly due to duplicate dictionary entries\n", status.errorName()); + exit(status.reset()); + } + if (verbose) { puts("Opening output file..."); } + UNewDataMemory *pData = udata_create(nullptr, nullptr, outFileName, &dataInfo, copyright, status); + if (status.isFailure()) { + fprintf(stderr, "gendict: could not open output file \"%s\", \"%s\"\n", outFileName, status.errorName()); + exit(status.reset()); + } + + if (verbose) { puts("Writing to output file..."); } + int32_t indexes[DictionaryData::IX_COUNT] = { + DictionaryData::IX_COUNT * sizeof(int32_t), 0, 0, 0, 0, 0, 0, 0 + }; + int32_t size = outDataSize + indexes[DictionaryData::IX_STRING_TRIE_OFFSET]; + indexes[DictionaryData::IX_RESERVED1_OFFSET] = size; + indexes[DictionaryData::IX_RESERVED2_OFFSET] = size; + indexes[DictionaryData::IX_TOTAL_SIZE] = size; + + indexes[DictionaryData::IX_TRIE_TYPE] = isBytesTrie ? 
DictionaryData::TRIE_TYPE_BYTES : DictionaryData::TRIE_TYPE_UCHARS; + if (hasValues) { + indexes[DictionaryData::IX_TRIE_TYPE] |= DictionaryData::TRIE_HAS_VALUES; + } + + indexes[DictionaryData::IX_TRANSFORM] = dict.getTransform(); + udata_writeBlock(pData, indexes, sizeof(indexes)); + udata_writeBlock(pData, outData, outDataSize); + size_t bytesWritten = udata_finish(pData, status); + if (status.isFailure()) { + fprintf(stderr, "gendict: error \"%s\" writing the output file\n", status.errorName()); + exit(status.reset()); + } + + if (bytesWritten != (size_t)size) { + fprintf(stderr, "Error writing to output file \"%s\"\n", outFileName); + exit(U_INTERNAL_PROGRAM_ERROR); + } + + if (!quiet) { printf("%s: done writing\t%s (%ds).\n", progName, outFileName, elapsedTime()); } + +#ifdef TEST_GENDICT + if (isBytesTrie) { + BytesTrie::Iterator it(outData, outDataSize, status); + while (it.hasNext()) { + it.next(status); + const StringPiece s = it.getString(); + int32_t val = it.getValue(); + printf("%s -> %i\n", s.data(), val); + } + } else { + UCharsTrie::Iterator it((const char16_t *)outData, outDataSize, status); + while (it.hasNext()) { + it.next(status); + const UnicodeString s = it.getString(); + int32_t val = it.getValue(); + char tmp[1024]; + s.extract(0, s.length(), tmp, 1024); + printf("%s -> %i\n", tmp, val); + } + } +#endif + + return 0; +#endif /* #if !UCONFIG_NO_BREAK_ITERATION */ +} diff --git a/intl/icu/source/tools/gendict/gendict.vcxproj b/intl/icu/source/tools/gendict/gendict.vcxproj new file mode 100644 index 0000000000..42c1ba7869 --- /dev/null +++ b/intl/icu/source/tools/gendict/gendict.vcxproj @@ -0,0 +1,80 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <PropertyGroup Label="Globals"> + <ProjectGuid>{9D4211F7-2C77-439C-82F0-30A4E43BA569}</ProjectGuid> + </PropertyGroup> + <PropertyGroup Label="Configuration"> + 
<ConfigurationType>Application</ConfigurationType> + <UseOfMfc>false</UseOfMfc> + <CharacterSet>MultiByte</CharacterSet> + </PropertyGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> + <!-- The following import will include the 'default' configuration options for VS projects. --> + <Import Project="..\..\allinone\Build.Windows.ProjectConfiguration.props" /> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> + <ImportGroup Label="ExtensionSettings"> + </ImportGroup> + <PropertyGroup Label="UserMacros" /> + <PropertyGroup> + <_ProjectFileVersion>10.0.30319.1</_ProjectFileVersion> + <OutDir>.\$(Platform)\$(Configuration)\</OutDir> + <IntDir>.\$(Platform)\$(Configuration)\</IntDir> + <!-- The ICU projects use "Win32" to mean "x86", so we need to special case it. --> + <OutDir Condition="'$(Platform)'=='Win32'">.\x86\$(Configuration)\</OutDir> + <IntDir Condition="'$(Platform)'=='Win32'">.\x86\$(Configuration)\</IntDir> + <!-- Disable Incremental Linking for Release builds as it prevents Link-time Code Generation --> + <LinkIncremental Condition="'$(Configuration)'=='Debug'">true</LinkIncremental> + <LinkIncremental Condition="'$(Configuration)'=='Release'">false</LinkIncremental> + </PropertyGroup> + <!-- Options that are common to *all* configurations --> + <ItemDefinitionGroup> + <Midl> + <TypeLibraryName>$(OutDir)\gendict.tlb</TypeLibraryName> + </Midl> + <ClCompile> + <WarningLevel>Level3</WarningLevel> + <CompileAs>Default</CompileAs> + <DisableLanguageExtensions>true</DisableLanguageExtensions> + <AdditionalIncludeDirectories>..\..\common;..\toolutil;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> + <PrecompiledHeaderOutputFile>$(OutDir)\gendict.pch</PrecompiledHeaderOutputFile> + <AssemblerListingLocation>$(OutDir)/</AssemblerListingLocation> + <ObjectFileName>$(OutDir)/</ObjectFileName> + <ProgramDataBaseFileName>$(OutDir)\gendict.pdb</ProgramDataBaseFileName> + </ClCompile> + <Link> + 
<SubSystem>Console</SubSystem> + <OutputFile>$(OutDir)\gendict.exe</OutputFile> + <AdditionalLibraryDirectories>..\..\..\$(IcuLibOutputDir);%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories> + </Link> + <CustomBuildStep> + <Command>copy "$(TargetPath)" ..\..\..\$(IcuBinOutputDir)</Command> + <Outputs>..\..\..\$(IcuBinOutputDir)\$(TargetFileName);%(Outputs)</Outputs> + </CustomBuildStep> + </ItemDefinitionGroup> + <!-- Options that are common to all 'Debug' project configurations --> + <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'"> + <ClCompile> + <BrowseInformation>true</BrowseInformation> + <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary> + </ClCompile> + <Link> + <AdditionalDependencies>icuucd.lib;icutud.lib;%(AdditionalDependencies)</AdditionalDependencies> + </Link> + </ItemDefinitionGroup> + <!-- Options that are common to all 'Release' project configurations --> + <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'"> + <ClCompile> + <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary> + <FunctionLevelLinking>true</FunctionLevelLinking> + </ClCompile> + <Link> + <AdditionalDependencies>icuuc.lib;icutu.lib;%(AdditionalDependencies)</AdditionalDependencies> + </Link> + </ItemDefinitionGroup> + <ItemGroup> + <ClCompile Include="gendict.cpp" /> + </ItemGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> + <ImportGroup Label="ExtensionTargets"> + </ImportGroup> +</Project>
\ No newline at end of file diff --git a/intl/icu/source/tools/gendict/gendict.vcxproj.filters b/intl/icu/source/tools/gendict/gendict.vcxproj.filters new file mode 100644 index 0000000000..52823d6f9a --- /dev/null +++ b/intl/icu/source/tools/gendict/gendict.vcxproj.filters @@ -0,0 +1,22 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <ItemGroup> + <Filter Include="Source Files"> + <UniqueIdentifier>{570fb8ae-ac18-467d-8502-470a241a60d4}</UniqueIdentifier> + <Extensions>cpp;c;cxx;rc;def;r;odl;idl;hpj;bat</Extensions> + </Filter> + <Filter Include="Header Files"> + <UniqueIdentifier>{7b2185f2-4ff9-4419-b596-0a21e37414c9}</UniqueIdentifier> + <Extensions>h;hpp;hxx;hm;inl</Extensions> + </Filter> + <Filter Include="Resource Files"> + <UniqueIdentifier>{1dc5e7e3-4d1b-4031-a31f-c39b3a3e283a}</UniqueIdentifier> + <Extensions>ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe</Extensions> + </Filter> + </ItemGroup> + <ItemGroup> + <ClCompile Include="gendict.cpp"> + <Filter>Source Files</Filter> + </ClCompile> + </ItemGroup> +</Project> diff --git a/intl/icu/source/tools/gendict/sources.txt b/intl/icu/source/tools/gendict/sources.txt new file mode 100644 index 0000000000..6647c5c2c8 --- /dev/null +++ b/intl/icu/source/tools/gendict/sources.txt @@ -0,0 +1 @@ +gendict.cpp diff --git a/intl/icu/source/tools/gennorm2/BUILD.bazel b/intl/icu/source/tools/gennorm2/BUILD.bazel new file mode 100644 index 0000000000..c602897baf --- /dev/null +++ b/intl/icu/source/tools/gennorm2/BUILD.bazel @@ -0,0 +1,39 @@ +# © 2021 and later: Unicode, Inc. and others. +# License & terms of use: http://www.unicode.org/copyright.html + +# This Bazel build file defines a target for the gennorm2 binary that generates +# headers needed for bootstrapping the ICU4C build process in a way that +# integrates the normalization data. 
+ +load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library") + +package( + default_visibility = ["//visibility:public"], +) + +cc_binary( + name = "gennorm2", + srcs = glob([ + "*.c", + "*.cpp", + "*.h", # cannot have hdrs section in cc_binary + ]), + deps = [ + "//icu4c/source/common:uhash", + "//icu4c/source/common:umutablecptrie", + "//icu4c/source/common:ucptrie", + "//icu4c/source/common:errorcode", + "//icu4c/source/common:uniset", + "//icu4c/source/common:uvector32", + + "//icu4c/source/common:platform", + "//icu4c/source/common:headers", + + "//icu4c/source/tools/toolutil:toolutil", + "//icu4c/source/tools/toolutil:unewdata", + "//icu4c/source/tools/toolutil:writesrc", + "//icu4c/source/tools/toolutil:uoptions", + "//icu4c/source/tools/toolutil:uparse", + ], + linkopts = ["-pthread"], +) diff --git a/intl/icu/source/tools/gennorm2/Makefile.in b/intl/icu/source/tools/gennorm2/Makefile.in new file mode 100644 index 0000000000..84f5830e67 --- /dev/null +++ b/intl/icu/source/tools/gennorm2/Makefile.in @@ -0,0 +1,82 @@ +## Makefile.in for ICU - tools/gennorm2 +## Copyright (C) 2016 and later: Unicode, Inc. and others. +## License & terms of use: http://www.unicode.org/copyright.html +## Copyright (c) 2009-2011, International Business Machines Corporation and +## others. All Rights Reserved. +## Steven R. Loomis/Markus W. Scherer + +## Source directory information +srcdir = @srcdir@ +top_srcdir = @top_srcdir@ + +top_builddir = ../.. 
+ +include $(top_builddir)/icudefs.mk + +## Build directory information +subdir = tools/gennorm2 + +TARGET_STUB_NAME = gennorm2 + +## Extra files to remove for 'make clean' +CLEANFILES = *~ $(DEPS) + +## Target information +TARGET = $(BINDIR)/$(TARGET_STUB_NAME)$(EXEEXT) + +CPPFLAGS += -I$(srcdir) -I$(top_srcdir)/common -I$(srcdir)/../toolutil +LIBS = $(LIBICUTOOLUTIL) $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS) $(LIB_M) + +SOURCES = $(shell cat $(srcdir)/sources.txt) +OBJECTS = $(SOURCES:.cpp=.o) + +DEPS = $(OBJECTS:.o=.d) + +## List of phony targets +.PHONY : all all-local install install-local clean clean-local \ +distclean distclean-local dist dist-local check check-local install-man + +## Clear suffix list +.SUFFIXES : + +## List of standard targets +all: all-local +install: install-local +clean: clean-local +distclean : distclean-local +dist: dist-local +check: all check-local + +all-local: $(TARGET) + +install-local: all-local + $(MKINSTALLDIRS) $(DESTDIR)$(sbindir) + $(INSTALL) $(TARGET) $(DESTDIR)$(sbindir) + +dist-local: + +clean-local: + test -z "$(CLEANFILES)" || $(RMV) $(CLEANFILES) + $(RMV) $(TARGET) $(OBJECTS) + +distclean-local: clean-local + $(RMV) Makefile + +check-local: all-local + +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + cd $(top_builddir) \ + && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status + +$(TARGET) : $(OBJECTS) + $(LINK.cc) $(OUTOPT)$@ $^ $(LIBS) + $(POST_BUILD_STEP) + + +ifeq (,$(MAKECMDGOALS)) +-include $(DEPS) +else +ifneq ($(patsubst %clean,,$(MAKECMDGOALS)),) +-include $(DEPS) +endif +endif diff --git a/intl/icu/source/tools/gennorm2/extradata.cpp b/intl/icu/source/tools/gennorm2/extradata.cpp new file mode 100644 index 0000000000..f31bc418ea --- /dev/null +++ b/intl/icu/source/tools/gennorm2/extradata.cpp @@ -0,0 +1,254 @@ +// © 2017 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html + +// extradata.cpp +// created: 2017jun04 Markus W. 
Scherer +// (pulled out of n2builder.cpp) + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_NORMALIZATION + +#include <stdio.h> +#include <stdlib.h> +#include "unicode/errorcode.h" +#include "unicode/unistr.h" +#include "unicode/utf16.h" +#include "extradata.h" +#include "normalizer2impl.h" +#include "norms.h" +#include "toolutil.h" +#include "utrie2.h" +#include "uvectr32.h" + +U_NAMESPACE_BEGIN + +ExtraData::ExtraData(Norms &n, UBool fast) : + Norms::Enumerator(n), + yesYesCompositions(1000, (UChar32)0xffff, 2), // 0=inert, 1=Jamo L, 2=start of compositions + yesNoMappingsAndCompositions(1000, (UChar32)0, 1), // 0=Hangul LV, 1=start of normal data + yesNoMappingsOnly(1000, (UChar32)0, 1), // 0=Hangul LVT, 1=start of normal data + optimizeFast(fast) { + // Hangul LV algorithmically decomposes to two Jamo. + // Some code may harmlessly read this firstUnit. + yesNoMappingsAndCompositions.setCharAt(0, 2); + // Hangul LVT algorithmically decomposes to three Jamo. + // Some code may harmlessly read this firstUnit. + yesNoMappingsOnly.setCharAt(0, 3); +} + +int32_t ExtraData::writeMapping(UChar32 c, const Norm &norm, UnicodeString &dataString) { + UnicodeString &m=*norm.mapping; + int32_t length=m.length(); + // Write the mapping & raw mapping extraData. 
+ int32_t firstUnit=length|(norm.trailCC<<8); + int32_t preMappingLength=0; + if(norm.rawMapping!=nullptr) { + UnicodeString &rm=*norm.rawMapping; + int32_t rmLength=rm.length(); + if(rmLength>Normalizer2Impl::MAPPING_LENGTH_MASK) { + fprintf(stderr, + "gennorm2 error: " + "raw mapping for U+%04lX longer than maximum of %d\n", + (long)c, Normalizer2Impl::MAPPING_LENGTH_MASK); + exit(U_INVALID_FORMAT_ERROR); + } + char16_t rm0=rm.charAt(0); + if( rmLength==length-1 && + // 99: overlong substring lengths get pinned to remainder lengths anyway + 0==rm.compare(1, 99, m, 2, 99) && + rm0>Normalizer2Impl::MAPPING_LENGTH_MASK + ) { + // Compression: + // rawMapping=rm0+mapping.substring(2) -> store only rm0 + // + // The raw mapping is the same as the final mapping after replacing + // the final mapping's first two code units with the raw mapping's first one. + // In this case, we store only that first unit, rm0. + // This helps with a few hundred mappings. + dataString.append(rm0); + preMappingLength=1; + } else { + // Store the raw mapping with its length. + dataString.append(rm); + dataString.append((char16_t)rmLength); + preMappingLength=rmLength+1; + } + firstUnit|=Normalizer2Impl::MAPPING_HAS_RAW_MAPPING; + } + int32_t cccLccc=norm.cc|(norm.leadCC<<8); + if(cccLccc!=0) { + dataString.append((char16_t)cccLccc); + ++preMappingLength; + firstUnit|=Normalizer2Impl::MAPPING_HAS_CCC_LCCC_WORD; + } + dataString.append((char16_t)firstUnit); + dataString.append(m); + return preMappingLength; +} + +int32_t ExtraData::writeNoNoMapping(UChar32 c, const Norm &norm, + UnicodeString &dataString, + Hashtable &previousMappings) { + UnicodeString newMapping; + int32_t offset=writeMapping(c, norm, newMapping); + UBool found=false; + int32_t previousOffset=previousMappings.getiAndFound(newMapping, found); + if(found) { + // Duplicate, point to the identical mapping that has already been stored. 
+ offset=previousOffset; + } else { + // Append this new mapping and + // enter it into the hashtable, avoiding value 0 which is "not found". + offset=dataString.length()+offset; + dataString.append(newMapping); + IcuToolErrorCode errorCode("gennorm2/writeExtraData()/Hashtable.putiAllowZero()"); + previousMappings.putiAllowZero(newMapping, offset, errorCode); + } + return offset; +} + +UBool ExtraData::setNoNoDelta(UChar32 c, Norm &norm) const { + // Try a compact, algorithmic encoding to a single compYesAndZeroCC code point. + // Do not map from ASCII to non-ASCII. + if(norm.mappingCP>=0 && + !(c<=0x7f && norm.mappingCP>0x7f) && + norms.getNormRef(norm.mappingCP).type<Norm::NO_NO_COMP_YES) { + int32_t delta=norm.mappingCP-c; + if(-Normalizer2Impl::MAX_DELTA<=delta && delta<=Normalizer2Impl::MAX_DELTA) { + norm.type=Norm::NO_NO_DELTA; + norm.offset=delta; + return true; + } + } + return false; +} + +void ExtraData::writeCompositions(UChar32 c, const Norm &norm, UnicodeString &dataString) { + if(norm.cc!=0) { + fprintf(stderr, + "gennorm2 error: " + "U+%04lX combines-forward and has ccc!=0, not possible in Unicode normalization\n", + (long)c); + exit(U_INVALID_FORMAT_ERROR); + } + int32_t length; + const CompositionPair *pairs=norm.getCompositionPairs(length); + for(int32_t i=0; i<length; ++i) { + const CompositionPair &pair=pairs[i]; + // 22 bits for the composite character and whether it combines forward. + UChar32 compositeAndFwd=pair.composite<<1; + if(norms.getNormRef(pair.composite).compositions!=nullptr) { + compositeAndFwd|=1; // The composite character also combines-forward. + } + // Encode most pairs in two units and some in three. 
+ int32_t firstUnit, secondUnit, thirdUnit; + if(pair.trail<Normalizer2Impl::COMP_1_TRAIL_LIMIT) { + if(compositeAndFwd<=0xffff) { + firstUnit=pair.trail<<1; + secondUnit=compositeAndFwd; + thirdUnit=-1; + } else { + firstUnit=(pair.trail<<1)|Normalizer2Impl::COMP_1_TRIPLE; + secondUnit=compositeAndFwd>>16; + thirdUnit=compositeAndFwd; + } + } else { + firstUnit=(Normalizer2Impl::COMP_1_TRAIL_LIMIT+ + (pair.trail>>Normalizer2Impl::COMP_1_TRAIL_SHIFT))| + Normalizer2Impl::COMP_1_TRIPLE; + secondUnit=(pair.trail<<Normalizer2Impl::COMP_2_TRAIL_SHIFT)| + (compositeAndFwd>>16); + thirdUnit=compositeAndFwd; + } + // Set the high bit of the first unit if this is the last composition pair. + if(i==(length-1)) { + firstUnit|=Normalizer2Impl::COMP_1_LAST_TUPLE; + } + dataString.append((char16_t)firstUnit).append((char16_t)secondUnit); + if(thirdUnit>=0) { + dataString.append((char16_t)thirdUnit); + } + } +} + +void ExtraData::rangeHandler(UChar32 start, UChar32 end, Norm &norm) { + if(start!=end) { + fprintf(stderr, + "gennorm2 error: unexpected shared data for " + "multiple code points U+%04lX..U+%04lX\n", + (long)start, (long)end); + exit(U_INTERNAL_PROGRAM_ERROR); + } + if(norm.error!=nullptr) { + fprintf(stderr, "gennorm2 error: U+%04lX %s\n", (long)start, norm.error); + exit(U_INVALID_FORMAT_ERROR); + } + writeExtraData(start, norm); +} + +// Ticket #13342 - Disable optimizations on MSVC for this function as a workaround. 
+#if (defined(_MSC_VER) && (_MSC_VER >= 1900) && defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 190024210)) +#pragma optimize( "", off ) +#endif + +void ExtraData::writeExtraData(UChar32 c, Norm &norm) { + switch(norm.type) { + case Norm::INERT: + break; // no extra data + case Norm::YES_YES_COMBINES_FWD: + norm.offset=yesYesCompositions.length(); + writeCompositions(c, norm, yesYesCompositions); + break; + case Norm::YES_NO_COMBINES_FWD: + norm.offset=yesNoMappingsAndCompositions.length()+ + writeMapping(c, norm, yesNoMappingsAndCompositions); + writeCompositions(c, norm, yesNoMappingsAndCompositions); + break; + case Norm::YES_NO_MAPPING_ONLY: + norm.offset=yesNoMappingsOnly.length()+ + writeMapping(c, norm, yesNoMappingsOnly); + break; + case Norm::NO_NO_COMP_YES: + if(!optimizeFast && setNoNoDelta(c, norm)) { + break; + } + norm.offset=writeNoNoMapping(c, norm, noNoMappingsCompYes, previousNoNoMappingsCompYes); + break; + case Norm::NO_NO_COMP_BOUNDARY_BEFORE: + if(!optimizeFast && setNoNoDelta(c, norm)) { + break; + } + norm.offset=writeNoNoMapping( + c, norm, noNoMappingsCompBoundaryBefore, previousNoNoMappingsCompBoundaryBefore); + break; + case Norm::NO_NO_COMP_NO_MAYBE_CC: + norm.offset=writeNoNoMapping( + c, norm, noNoMappingsCompNoMaybeCC, previousNoNoMappingsCompNoMaybeCC); + break; + case Norm::NO_NO_EMPTY: + // There can be multiple extra data entries for mappings to the empty string + // if they have different raw mappings. + norm.offset=writeNoNoMapping(c, norm, noNoMappingsEmpty, previousNoNoMappingsEmpty); + break; + case Norm::MAYBE_YES_COMBINES_FWD: + norm.offset=maybeYesCompositions.length(); + writeCompositions(c, norm, maybeYesCompositions); + break; + case Norm::MAYBE_YES_SIMPLE: + break; // no extra data + case Norm::YES_YES_WITH_CC: + break; // no extra data + default: // Should not occur. + exit(U_INTERNAL_PROGRAM_ERROR); + } +} + +// Ticket #13342 - Turn optimization back on. 
+#if (defined(_MSC_VER) && (_MSC_VER >= 1900) && defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 190024210)) +#pragma optimize( "", on ) +#endif + +U_NAMESPACE_END + +#endif // #if !UCONFIG_NO_NORMALIZATION diff --git a/intl/icu/source/tools/gennorm2/extradata.h b/intl/icu/source/tools/gennorm2/extradata.h new file mode 100644 index 0000000000..0a8e73087d --- /dev/null +++ b/intl/icu/source/tools/gennorm2/extradata.h @@ -0,0 +1,70 @@ +// © 2017 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html + +// extradata.h +// created: 2017jun04 Markus W. Scherer +// (pulled out of n2builder.cpp) + +// Write mappings and compositions in compact form for Normalizer2 "extra data", +// the data that does not fit into the trie itself. + +#ifndef __EXTRADATA_H__ +#define __EXTRADATA_H__ + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_NORMALIZATION + +#include "unicode/errorcode.h" +#include "unicode/unistr.h" +#include "unicode/utf16.h" +#include "hash.h" +#include "norms.h" +#include "toolutil.h" +#include "utrie2.h" +#include "uvectr32.h" + +U_NAMESPACE_BEGIN + +class ExtraData : public Norms::Enumerator { +public: + ExtraData(Norms &n, UBool fast); + + void rangeHandler(UChar32 start, UChar32 end, Norm &norm) override; + + UnicodeString maybeYesCompositions; + UnicodeString yesYesCompositions; + UnicodeString yesNoMappingsAndCompositions; + UnicodeString yesNoMappingsOnly; + UnicodeString noNoMappingsCompYes; + UnicodeString noNoMappingsCompBoundaryBefore; + UnicodeString noNoMappingsCompNoMaybeCC; + UnicodeString noNoMappingsEmpty; + +private: + /** + * Requires norm.hasMapping(). + * Returns the offset of the "first unit" from the beginning of the extraData for c. + * That is the same as the length of the optional data + * for the raw mapping and the ccc/lccc word. 
+ */ + int32_t writeMapping(UChar32 c, const Norm &norm, UnicodeString &dataString); + int32_t writeNoNoMapping(UChar32 c, const Norm &norm, + UnicodeString &dataString, Hashtable &previousMappings); + UBool setNoNoDelta(UChar32 c, Norm &norm) const; + /** Requires norm.compositions!=nullptr. */ + void writeCompositions(UChar32 c, const Norm &norm, UnicodeString &dataString); + void writeExtraData(UChar32 c, Norm &norm); + + UBool optimizeFast; + Hashtable previousNoNoMappingsCompYes; // If constructed in runtime code, pass in UErrorCode. + Hashtable previousNoNoMappingsCompBoundaryBefore; + Hashtable previousNoNoMappingsCompNoMaybeCC; + Hashtable previousNoNoMappingsEmpty; +}; + +U_NAMESPACE_END + +#endif // #if !UCONFIG_NO_NORMALIZATION + +#endif // __EXTRADATA_H__ diff --git a/intl/icu/source/tools/gennorm2/gennorm2.cpp b/intl/icu/source/tools/gennorm2/gennorm2.cpp new file mode 100644 index 0000000000..2575bf7ed8 --- /dev/null +++ b/intl/icu/source/tools/gennorm2/gennorm2.cpp @@ -0,0 +1,333 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2009-2014, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: gennorm2.cpp +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2009nov25 +* created by: Markus W. Scherer +* +* This program reads text files that define Unicode normalization, +* parses them, and builds a binary data file. 
+*/ + +#include "unicode/utypes.h" +#include "n2builder.h" + +#include <fstream> +#include <stdio.h> +#include <stdlib.h> +#include <string> +#include <string.h> +#include "unicode/errorcode.h" +#include "unicode/localpointer.h" +#include "unicode/putil.h" +#include "unicode/uchar.h" +#include "unicode/unistr.h" +#include "charstr.h" +#include "normalizer2impl.h" +#include "toolutil.h" +#include "uoptions.h" +#include "uparse.h" + +#if UCONFIG_NO_NORMALIZATION +#include "unewdata.h" +#endif + +U_NAMESPACE_BEGIN + +UBool beVerbose=false, haveCopyright=true; + +#if !UCONFIG_NO_NORMALIZATION +void parseFile(std::ifstream &f, Normalizer2DataBuilder &builder); +#endif + +/* -------------------------------------------------------------------------- */ + +enum { + HELP_H, + HELP_QUESTION_MARK, + VERBOSE, + COPYRIGHT, + SOURCEDIR, + OUTPUT_FILENAME, + UNICODE_VERSION, + WRITE_C_SOURCE, + WRITE_COMBINED_DATA, + OPT_FAST +}; + +static UOption options[]={ + UOPTION_HELP_H, + UOPTION_HELP_QUESTION_MARK, + UOPTION_VERBOSE, + UOPTION_COPYRIGHT, + UOPTION_SOURCEDIR, + UOPTION_DEF("output", 'o', UOPT_REQUIRES_ARG), + UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG), + UOPTION_DEF("csource", '\1', UOPT_NO_ARG), + UOPTION_DEF("combined", '\1', UOPT_NO_ARG), + UOPTION_DEF("fast", '\1', UOPT_NO_ARG) +}; + +extern "C" int +main(int argc, char* argv[]) { + U_MAIN_INIT_ARGS(argc, argv); + + /* preset then read command line options */ + options[SOURCEDIR].value=""; + argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[HELP_H]), options); + + /* error handling, printing usage message */ + if(argc<0) { + fprintf(stderr, + "error in command line argument \"%s\"\n", + argv[-argc]); + } + if(!options[OUTPUT_FILENAME].doesOccur) { + argc=-1; + } + if( argc<2 || + options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur + ) { + fprintf(stderr, + "Usage: %s [-options] infiles+ -o outputfilename\n" + "\n" + "Reads the infiles with normalization data and\n" + "creates a binary file, 
or a C source file (--csource), with the data,\n" + "or writes a data file with the combined data (--combined).\n" + "See https://unicode-org.github.io/icu/userguide/transforms/normalization#data-file-syntax\n" + "\n" + "Alternate usage: %s [-options] a.txt b.txt minus p.txt q.txt -o outputfilename\n" + "\n" + "Computes the difference of (a, b) minus (p, q) and writes the diff data\n" + "in input-file syntax to the outputfilename.\n" + "It is then possible to build (p, q, diff) to get the same data as (a, b).\n" + "(Useful for computing minimal incremental mapping data files.)\n" + "\n", + argv[0], argv[0]); + fprintf(stderr, + "Options:\n" + "\t-h or -? or --help this usage text\n" + "\t-v or --verbose verbose output\n" + "\t-c or --copyright include a copyright notice\n" + "\t-u or --unicode Unicode version, followed by the version like 5.2.0\n"); + fprintf(stderr, + "\t-s or --sourcedir source directory, followed by the path\n" + "\t-o or --output output filename\n" + "\t --csource writes a C source file with initializers\n" + "\t --combined writes a .txt file (input-file syntax) with the\n" + "\t combined data from all of the input files\n"); + fprintf(stderr, + "\t --fast optimize the data for fast normalization,\n" + "\t which might increase its size (Writes fully decomposed\n" + "\t regular mappings instead of delta mappings.\n" + "\t You should measure the runtime speed to make sure that\n" + "\t this is a good trade-off.)\n"); + return argc<0 ? 
U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; + } + + beVerbose=options[VERBOSE].doesOccur; + haveCopyright=options[COPYRIGHT].doesOccur; + + IcuToolErrorCode errorCode("gennorm2/main()"); + +#if UCONFIG_NO_NORMALIZATION + + fprintf(stderr, + "gennorm2 writes a dummy binary data file " + "because UCONFIG_NO_NORMALIZATION is set, \n" + "see icu/source/common/unicode/uconfig.h\n"); + udata_createDummy(nullptr, nullptr, options[OUTPUT_FILENAME].value, errorCode); + // Should not return an error since this is the expected behaviour if UCONFIG_NO_NORMALIZATION is on. + // return U_UNSUPPORTED_ERROR; + return 0; + +#else + + LocalPointer<Normalizer2DataBuilder> b1(new Normalizer2DataBuilder(errorCode), errorCode); + LocalPointer<Normalizer2DataBuilder> b2; + LocalPointer<Normalizer2DataBuilder> diff; + Normalizer2DataBuilder *builder = b1.getAlias(); + errorCode.assertSuccess(); + + if(options[UNICODE_VERSION].doesOccur) { + builder->setUnicodeVersion(options[UNICODE_VERSION].value); + } + + if(options[OPT_FAST].doesOccur) { + builder->setOptimization(Normalizer2DataBuilder::OPTIMIZE_FAST); + } + + // prepare the filename beginning with the source dir + CharString filename(options[SOURCEDIR].value, errorCode); + int32_t pathLength=filename.length(); + if( pathLength>0 && + filename[pathLength-1]!=U_FILE_SEP_CHAR && + filename[pathLength-1]!=U_FILE_ALT_SEP_CHAR + ) { + filename.append(U_FILE_SEP_CHAR, errorCode); + pathLength=filename.length(); + } + + bool doMinus = false; + for(int i=1; i<argc; ++i) { + printf("gennorm2: processing %s\n", argv[i]); + if(strcmp(argv[i], "minus") == 0) { + if(doMinus) { + fprintf(stderr, "gennorm2 error: only one 'minus' can be specified\n"); + exit(U_ILLEGAL_ARGUMENT_ERROR); + } + // Data from previous input files has been collected in b1. + // Collect data from further input files in b2. 
+ b2.adoptInsteadAndCheckErrorCode(new Normalizer2DataBuilder(errorCode), errorCode); + diff.adoptInsteadAndCheckErrorCode(new Normalizer2DataBuilder(errorCode), errorCode); + errorCode.assertSuccess(); + builder = b2.getAlias(); + if(options[UNICODE_VERSION].doesOccur) { + builder->setUnicodeVersion(options[UNICODE_VERSION].value); + } + if(options[OPT_FAST].doesOccur) { + builder->setOptimization(Normalizer2DataBuilder::OPTIMIZE_FAST); + } + doMinus = true; + continue; + } + filename.append(argv[i], errorCode); + std::ifstream f(filename.data()); + if(f.fail()) { + fprintf(stderr, "gennorm2 error: unable to open %s\n", filename.data()); + exit(U_FILE_ACCESS_ERROR); + } + builder->setOverrideHandling(Normalizer2DataBuilder::OVERRIDE_PREVIOUS); + parseFile(f, *builder); + filename.truncate(pathLength); + } + + if(doMinus) { + Normalizer2DataBuilder::computeDiff(*b1, *b2, *diff); + diff->writeDataFile(options[OUTPUT_FILENAME].value, /* writeRemoved= */ true); + } else if(options[WRITE_COMBINED_DATA].doesOccur) { + builder->writeDataFile(options[OUTPUT_FILENAME].value, /* writeRemoved= */ false); + } else if(options[WRITE_C_SOURCE].doesOccur) { + builder->writeCSourceFile(options[OUTPUT_FILENAME].value); + } else { + builder->writeBinaryFile(options[OUTPUT_FILENAME].value); + } + + return errorCode.get(); + +#endif +} + +#if !UCONFIG_NO_NORMALIZATION + +void parseFile(std::ifstream &f, Normalizer2DataBuilder &builder) { + IcuToolErrorCode errorCode("gennorm2/parseFile()"); + std::string lineString; + uint32_t startCP, endCP; + while(std::getline(f, lineString)) { + if (lineString.empty()) { + continue; // skip empty lines. 
+ } + char *line = &lineString.front(); + char *comment=(char *)strchr(line, '#'); + if(comment!=nullptr) { + *comment=0; + } + u_rtrim(line); + if(line[0]==0) { + continue; // skip empty and comment-only lines + } + if(line[0]=='*') { + const char *s=u_skipWhitespace(line+1); + if(0==strncmp(s, "Unicode", 7)) { + s=u_skipWhitespace(s+7); + builder.setUnicodeVersion(s); + } + continue; // reserved syntax + } + const char *delimiter; + int32_t rangeLength= + u_parseCodePointRangeAnyTerminator(line, &startCP, &endCP, &delimiter, errorCode); + if(errorCode.isFailure()) { + fprintf(stderr, "gennorm2 error: parsing code point range from %s\n", line); + exit(errorCode.reset()); + } + if (endCP >= 0xd800 && startCP <= 0xdfff) { + fprintf(stderr, "gennorm2 error: value or mapping for surrogate code points: %s\n", + line); + exit(U_ILLEGAL_ARGUMENT_ERROR); + } + delimiter=u_skipWhitespace(delimiter); + if(*delimiter==':') { + const char *s=u_skipWhitespace(delimiter+1); + char *end; + unsigned long value=strtoul(s, &end, 10); + if(end<=s || *u_skipWhitespace(end)!=0 || value>=0xff) { + fprintf(stderr, "gennorm2 error: parsing ccc from %s\n", line); + exit(U_PARSE_ERROR); + } + for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) { + builder.setCC(c, (uint8_t)value); + } + continue; + } + if(*delimiter=='-') { + if(*u_skipWhitespace(delimiter+1)!=0) { + fprintf(stderr, "gennorm2 error: parsing remove-mapping %s\n", line); + exit(U_PARSE_ERROR); + } + for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) { + builder.removeMapping(c); + } + continue; + } + if(*delimiter=='=' || *delimiter=='>') { + char16_t uchars[Normalizer2Impl::MAPPING_LENGTH_MASK]; + int32_t length=u_parseString(delimiter+1, uchars, UPRV_LENGTHOF(uchars), nullptr, errorCode); + if(errorCode.isFailure()) { + fprintf(stderr, "gennorm2 error: parsing mapping string from %s\n", line); + exit(errorCode.reset()); + } + UnicodeString mapping(false, uchars, length); + if(*delimiter=='=') { + 
if(rangeLength!=1) { + fprintf(stderr, + "gennorm2 error: round-trip mapping for more than 1 code point on %s\n", + line); + exit(U_PARSE_ERROR); + } + builder.setRoundTripMapping((UChar32)startCP, mapping); + } else { + for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) { + builder.setOneWayMapping(c, mapping); + } + } + continue; + } + fprintf(stderr, "gennorm2 error: unrecognized data line %s\n", line); + exit(U_PARSE_ERROR); + } +} + +#endif // !UCONFIG_NO_NORMALIZATION + +U_NAMESPACE_END + +/* + * Hey, Emacs, please set the following: + * + * Local Variables: + * indent-tabs-mode: nil + * End: + * + */ diff --git a/intl/icu/source/tools/gennorm2/gennorm2.vcxproj b/intl/icu/source/tools/gennorm2/gennorm2.vcxproj new file mode 100644 index 0000000000..2f41299098 --- /dev/null +++ b/intl/icu/source/tools/gennorm2/gennorm2.vcxproj @@ -0,0 +1,103 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <PropertyGroup Label="Globals"> + <ProjectGuid>{C7891A65-80AB-4245-912E-5F1E17B0E6C4}</ProjectGuid> + <RootNamespace>gennorm2</RootNamespace> + </PropertyGroup> + <PropertyGroup Label="Configuration"> + <ConfigurationType>Application</ConfigurationType> + <CharacterSet>Unicode</CharacterSet> + </PropertyGroup> + <PropertyGroup Condition="'$(Configuration)'=='Release'" Label="Configuration"> + <WholeProgramOptimization>true</WholeProgramOptimization> + </PropertyGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> + <!-- The following import will include the 'default' configuration options for VS projects. 
--> + <Import Project="..\..\allinone\Build.Windows.ProjectConfiguration.props" /> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> + <ImportGroup Label="ExtensionSettings"> + </ImportGroup> + <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="PropertySheets"> + <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> + </ImportGroup> + <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="PropertySheets"> + <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> + </ImportGroup> + <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets"> + <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> + </ImportGroup> + <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets"> + <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> + </ImportGroup> + <PropertyGroup Label="UserMacros" /> + <PropertyGroup> + <_ProjectFileVersion>10.0.30319.1</_ProjectFileVersion> + <OutDir>.\$(Platform)\$(Configuration)\</OutDir> + <IntDir>.\$(Platform)\$(Configuration)\</IntDir> + <!-- The ICU projects use "Win32" to mean "x86", so we need to special case it. 
--> + <OutDir Condition="'$(Platform)'=='Win32'">.\x86\$(Configuration)\</OutDir> + <IntDir Condition="'$(Platform)'=='Win32'">.\x86\$(Configuration)\</IntDir> + <!-- Disable Incremental Linking for Release builds as it prevents Link-time Code Generation --> + <LinkIncremental Condition="'$(Configuration)'=='Debug'">true</LinkIncremental> + <LinkIncremental Condition="'$(Configuration)'=='Release'">false</LinkIncremental> + </PropertyGroup> + <!-- Options that are common to *all* configurations --> + <ItemDefinitionGroup> + <Midl> + <TypeLibraryName>$(OutDir)\gennorm2.tlb</TypeLibraryName> + </Midl> + <ClCompile> + <WarningLevel>Level3</WarningLevel> + <CompileAs>Default</CompileAs> + <DisableLanguageExtensions>false</DisableLanguageExtensions> + <AdditionalIncludeDirectories>..\..\common;..\toolutil;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> + <PrecompiledHeaderOutputFile>$(OutDir)\gennorm2.pch</PrecompiledHeaderOutputFile> + <AssemblerListingLocation>$(OutDir)/</AssemblerListingLocation> + <ObjectFileName>$(OutDir)/</ObjectFileName> + <ProgramDataBaseFileName>$(OutDir)\gennorm2.pdb</ProgramDataBaseFileName> + </ClCompile> + <Link> + <SubSystem>Console</SubSystem> + <OutputFile>$(OutDir)\gennorm2.exe</OutputFile> + <AdditionalLibraryDirectories>..\..\..\$(IcuLibOutputDir);%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories> + </Link> + <CustomBuildStep> + <Command>copy "$(TargetPath)" ..\..\..\$(IcuBinOutputDir)</Command> + <Outputs>..\..\..\$(IcuBinOutputDir)\$(TargetFileName);%(Outputs)</Outputs> + </CustomBuildStep> + </ItemDefinitionGroup> + <!-- Options that are common to all 'Debug' project configurations --> + <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'"> + <ClCompile> + <BrowseInformation>true</BrowseInformation> + <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary> + </ClCompile> + <Link> + <AdditionalDependencies>icuucd.lib;icutud.lib;%(AdditionalDependencies)</AdditionalDependencies> + </Link> + 
</ItemDefinitionGroup> + <!-- Options that are common to all 'Release' project configurations --> + <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'"> + <ClCompile> + <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary> + <FunctionLevelLinking>true</FunctionLevelLinking> + </ClCompile> + <Link> + <AdditionalDependencies>icuuc.lib;icutu.lib;%(AdditionalDependencies)</AdditionalDependencies> + </Link> + </ItemDefinitionGroup> + <ItemGroup> + <ClCompile Include="extradata.cpp" /> + <ClCompile Include="gennorm2.cpp" /> + <ClCompile Include="n2builder.cpp" /> + <ClCompile Include="norms.cpp" /> + </ItemGroup> + <ItemGroup> + <ClInclude Include="extradata.h" /> + <ClInclude Include="n2builder.h" /> + <ClInclude Include="norms.h" /> + </ItemGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> + <ImportGroup Label="ExtensionTargets"> + </ImportGroup> +</Project>
\ No newline at end of file diff --git a/intl/icu/source/tools/gennorm2/n2builder.cpp b/intl/icu/source/tools/gennorm2/n2builder.cpp new file mode 100644 index 0000000000..a07327145d --- /dev/null +++ b/intl/icu/source/tools/gennorm2/n2builder.cpp @@ -0,0 +1,1051 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2009-2016, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: n2builder.cpp +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2009nov25 +* created by: Markus W. Scherer +* +* Builds Normalizer2 data and writes a binary .nrm file. +* For the file format see source/common/normalizer2impl.h. +*/ + +#include "unicode/utypes.h" +#include "n2builder.h" + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <vector> +#include "unicode/errorcode.h" +#include "unicode/localpointer.h" +#include "unicode/putil.h" +#include "unicode/ucptrie.h" +#include "unicode/udata.h" +#include "unicode/umutablecptrie.h" +#include "unicode/uniset.h" +#include "unicode/unistr.h" +#include "unicode/usetiter.h" +#include "unicode/ustring.h" +#include "charstr.h" +#include "extradata.h" +#include "hash.h" +#include "normalizer2impl.h" +#include "norms.h" +#include "toolutil.h" +#include "unewdata.h" +#include "uvectr32.h" +#include "writesrc.h" + +#if !UCONFIG_NO_NORMALIZATION + +/* UDataInfo cf. 
udata.h */ +static UDataInfo dataInfo={ + sizeof(UDataInfo), + 0, + + U_IS_BIG_ENDIAN, + U_CHARSET_FAMILY, + U_SIZEOF_UCHAR, + 0, + + { 0x4e, 0x72, 0x6d, 0x32 }, /* dataFormat="Nrm2" */ + { 4, 0, 0, 0 }, /* formatVersion */ + { 11, 0, 0, 0 } /* dataVersion (Unicode version) */ +}; + +U_NAMESPACE_BEGIN + +class HangulIterator { +public: + struct Range { + UChar32 start, end; + }; + + HangulIterator() : rangeIndex(0) {} + const Range *nextRange() { + if(rangeIndex<UPRV_LENGTHOF(ranges)) { + return ranges+rangeIndex++; + } else { + return nullptr; + } + } +private: + static const Range ranges[4]; + int32_t rangeIndex; +}; + +const HangulIterator::Range HangulIterator::ranges[4]={ + { Hangul::JAMO_L_BASE, Hangul::JAMO_L_END }, + { Hangul::JAMO_V_BASE, Hangul::JAMO_V_END }, + // JAMO_T_BASE+1: not U+11A7 + { Hangul::JAMO_T_BASE+1, Hangul::JAMO_T_END }, + { Hangul::HANGUL_BASE, Hangul::HANGUL_END }, +}; + +Normalizer2DataBuilder::Normalizer2DataBuilder(UErrorCode &errorCode) : + norms(errorCode), + phase(0), overrideHandling(OVERRIDE_PREVIOUS), optimization(OPTIMIZE_NORMAL), + norm16TrieBytes(nullptr), norm16TrieLength(0) { + memset(unicodeVersion, 0, sizeof(unicodeVersion)); + memset(indexes, 0, sizeof(indexes)); + memset(smallFCD, 0, sizeof(smallFCD)); +} + +Normalizer2DataBuilder::~Normalizer2DataBuilder() { + delete[] norm16TrieBytes; +} + +void +Normalizer2DataBuilder::setUnicodeVersion(const char *v) { + UVersionInfo nullVersion={ 0, 0, 0, 0 }; + UVersionInfo version; + u_versionFromString(version, v); + if( 0!=memcmp(version, unicodeVersion, U_MAX_VERSION_LENGTH) && + 0!=memcmp(nullVersion, unicodeVersion, U_MAX_VERSION_LENGTH) + ) { + char buffer[U_MAX_VERSION_STRING_LENGTH]; + u_versionToString(unicodeVersion, buffer); + fprintf(stderr, "gennorm2 error: multiple inconsistent Unicode version numbers %s vs. 
%s\n", + buffer, v); + exit(U_ILLEGAL_ARGUMENT_ERROR); + } + memcpy(unicodeVersion, version, U_MAX_VERSION_LENGTH); +} + +Norm *Normalizer2DataBuilder::checkNormForMapping(Norm *p, UChar32 c) { + if(p!=nullptr) { + if(p->mappingType!=Norm::NONE) { + if( overrideHandling==OVERRIDE_NONE || + (overrideHandling==OVERRIDE_PREVIOUS && p->mappingPhase==phase) + ) { + fprintf(stderr, + "error in gennorm2 phase %d: " + "not permitted to override mapping for U+%04lX from phase %d\n", + (int)phase, (long)c, (int)p->mappingPhase); + exit(U_INVALID_FORMAT_ERROR); + } + delete p->mapping; + p->mapping=nullptr; + } + p->mappingPhase=phase; + } + return p; +} + +void Normalizer2DataBuilder::setOverrideHandling(OverrideHandling oh) { + overrideHandling=oh; + ++phase; +} + +void Normalizer2DataBuilder::setCC(UChar32 c, uint8_t cc) { + norms.createNorm(c)->cc=cc; + norms.ccSet.add(c); +} + +static UBool isWellFormed(const UnicodeString &s) { + UErrorCode errorCode=U_ZERO_ERROR; + u_strToUTF8(nullptr, 0, nullptr, toUCharPtr(s.getBuffer()), s.length(), &errorCode); + return U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR; +} + +void Normalizer2DataBuilder::setOneWayMapping(UChar32 c, const UnicodeString &m) { + if(!isWellFormed(m)) { + fprintf(stderr, + "error in gennorm2 phase %d: " + "illegal one-way mapping from U+%04lX to malformed string\n", + (int)phase, (long)c); + exit(U_INVALID_FORMAT_ERROR); + } + Norm *p=checkNormForMapping(norms.createNorm(c), c); + p->mapping=new UnicodeString(m); + p->mappingType=Norm::ONE_WAY; + p->setMappingCP(); + norms.mappingSet.add(c); +} + +void Normalizer2DataBuilder::setRoundTripMapping(UChar32 c, const UnicodeString &m) { + if(U_IS_SURROGATE(c)) { + fprintf(stderr, + "error in gennorm2 phase %d: " + "illegal round-trip mapping from surrogate code point U+%04lX\n", + (int)phase, (long)c); + exit(U_INVALID_FORMAT_ERROR); + } + if(!isWellFormed(m)) { + fprintf(stderr, + "error in gennorm2 phase %d: " + "illegal round-trip mapping from 
U+%04lX to malformed string\n", + (int)phase, (long)c); + exit(U_INVALID_FORMAT_ERROR); + } + int32_t numCP=u_countChar32(toUCharPtr(m.getBuffer()), m.length()); + if(numCP!=2) { + fprintf(stderr, + "error in gennorm2 phase %d: " + "illegal round-trip mapping from U+%04lX to %d!=2 code points\n", + (int)phase, (long)c, (int)numCP); + exit(U_INVALID_FORMAT_ERROR); + } + Norm *p=checkNormForMapping(norms.createNorm(c), c); + p->mapping=new UnicodeString(m); + p->mappingType=Norm::ROUND_TRIP; + p->mappingCP=U_SENTINEL; + norms.mappingSet.add(c); +} + +void Normalizer2DataBuilder::removeMapping(UChar32 c) { + // createNorm(c), not getNorm(c), to record a non-mapping and detect conflicting data. + Norm *p=checkNormForMapping(norms.createNorm(c), c); + p->mappingType=Norm::REMOVED; + norms.mappingSet.add(c); +} + +UBool Normalizer2DataBuilder::mappingHasCompBoundaryAfter(const BuilderReorderingBuffer &buffer, + Norm::MappingType mappingType) const { + if(buffer.isEmpty()) { + return false; // Maps-to-empty-string is no boundary of any kind. + } + int32_t lastStarterIndex=buffer.lastStarterIndex(); + if(lastStarterIndex<0) { + return false; // no starter + } + const int32_t lastIndex=buffer.length()-1; + if(mappingType==Norm::ONE_WAY && lastStarterIndex<lastIndex && buffer.ccAt(lastIndex)>1) { + // One-way mapping where after the last starter is at least one combining mark + // with a combining class greater than 1, + // which means that another combining mark can reorder before it. + // By contrast, in a round-trip mapping this does not prevent a boundary as long as + // the starter or composite does not combine-forward with a following combining mark. + return false; + } + UChar32 starter=buffer.charAt(lastStarterIndex); + if(lastStarterIndex==0 && norms.combinesBack(starter)) { + // The last starter is at the beginning of the mapping and combines backward. 
+ return false; + } + if(Hangul::isJamoL(starter) || + (Hangul::isJamoV(starter) && + 0<lastStarterIndex && Hangul::isJamoL(buffer.charAt(lastStarterIndex-1)))) { + // A Jamo leading consonant or an LV pair combines-forward if it is at the end, + // otherwise it is blocked. + return lastStarterIndex!=lastIndex; + } + // Note: There can be no Hangul syllable in the fully decomposed mapping. + + // Multiple starters can combine into one. + // Look for the first of the last sequence of starters, excluding Jamos. + int32_t i=lastStarterIndex; + UChar32 c; + while(0<i && buffer.ccAt(i-1)==0 && !Hangul::isJamo(c=buffer.charAt(i-1))) { + starter=c; + --i; + } + // Compose as far as possible, and see if further compositions with + // characters following this mapping are possible. + const Norm *starterNorm=norms.getNorm(starter); + if(i==lastStarterIndex && + (starterNorm==nullptr || starterNorm->compositions==nullptr)) { + return true; // The last starter does not combine forward. + } + uint8_t prevCC=0; + while(++i<buffer.length()) { + uint8_t cc=buffer.ccAt(i); // !=0 if after last starter + if(i>lastStarterIndex && norms.combinesWithCCBetween(*starterNorm, prevCC, cc)) { + // The starter combines with a mark that reorders before the current one. + return false; + } + UChar32 c=buffer.charAt(i); + if(starterNorm!=nullptr && (prevCC<cc || prevCC==0) && + norms.getNormRef(c).combinesBack && (starter=starterNorm->combine(c))>=0) { + // The starter combines with c into a composite replacement starter. + starterNorm=norms.getNorm(starter); + if(i>=lastStarterIndex && + (starterNorm==nullptr || starterNorm->compositions==nullptr)) { + return true; // The composite does not combine further. + } + // Keep prevCC because we "removed" the combining mark. + } else if(cc==0) { + starterNorm=norms.getNorm(c); + if(i==lastStarterIndex && + (starterNorm==nullptr || starterNorm->compositions==nullptr)) { + return true; // The new starter does not combine forward. 
+ } + prevCC=0; + } else { + prevCC=cc; + } + } + if(prevCC==0) { + return false; // forward-combining starter at the very end + } + if(norms.combinesWithCCBetween(*starterNorm, prevCC, 256)) { + // The starter combines with another mark. + return false; + } + return true; +} + +UBool Normalizer2DataBuilder::mappingRecomposes(const BuilderReorderingBuffer &buffer) const { + if(buffer.lastStarterIndex()<0) { + return false; // no starter + } + const Norm *starterNorm=nullptr; + uint8_t prevCC=0; + for(int32_t i=0; i<buffer.length(); ++i) { + UChar32 c=buffer.charAt(i); + uint8_t cc=buffer.ccAt(i); + if(starterNorm!=nullptr && (prevCC<cc || prevCC==0) && + norms.getNormRef(c).combinesBack && starterNorm->combine(c)>=0) { + return true; // normal composite + } else if(cc==0) { + if(Hangul::isJamoL(c)) { + if((i+1)<buffer.length() && Hangul::isJamoV(buffer.charAt(i+1))) { + return true; // Hangul syllable + } + starterNorm=nullptr; + } else { + starterNorm=norms.getNorm(c); + } + } + prevCC=cc; + } + return false; +} + +void Normalizer2DataBuilder::postProcess(Norm &norm) { + // Prerequisites: Compositions are built, mappings are recursively decomposed. + // Mappings are not yet in canonical order. + // + // This function works on a Norm struct. We do not know which code point(s) map(s) to it. + // Therefore, we cannot compute algorithmic mapping deltas here. + // Error conditions are checked, but printed later when we do know the offending code point. + if(norm.hasMapping()) { + if(norm.mapping->length()>Normalizer2Impl::MAPPING_LENGTH_MASK) { + norm.error="mapping longer than maximum of 31"; + return; + } + // Ensure canonical order. 
+ BuilderReorderingBuffer buffer; + if(norm.rawMapping!=nullptr) { + norms.reorder(*norm.rawMapping, buffer); + buffer.reset(); + } + norms.reorder(*norm.mapping, buffer); + if(buffer.isEmpty()) { + // A character that is deleted (maps to an empty string) must + // get the worst-case lccc and tccc values because arbitrary + // characters on both sides will become adjacent. + norm.leadCC=1; + norm.trailCC=0xff; + } else { + norm.leadCC=buffer.ccAt(0); + norm.trailCC=buffer.ccAt(buffer.length()-1); + } + + norm.hasCompBoundaryBefore= + !buffer.isEmpty() && norm.leadCC==0 && !norms.combinesBack(buffer.charAt(0)); + norm.hasCompBoundaryAfter= + norm.compositions==nullptr && mappingHasCompBoundaryAfter(buffer, norm.mappingType); + + if(norm.combinesBack) { + norm.error="combines-back and decomposes, not possible in Unicode normalization"; + } else if(norm.mappingType==Norm::ROUND_TRIP) { + if(norm.compositions!=nullptr) { + norm.type=Norm::YES_NO_COMBINES_FWD; + } else { + norm.type=Norm::YES_NO_MAPPING_ONLY; + } + } else { // one-way mapping + if(norm.compositions!=nullptr) { + norm.error="combines-forward and has a one-way mapping, " + "not possible in Unicode normalization"; + } else if(buffer.isEmpty()) { + norm.type=Norm::NO_NO_EMPTY; + } else if(!norm.hasCompBoundaryBefore) { + norm.type=Norm::NO_NO_COMP_NO_MAYBE_CC; + } else if(mappingRecomposes(buffer)) { + norm.type=Norm::NO_NO_COMP_BOUNDARY_BEFORE; + } else { + // The mapping is comp-normalized. + norm.type=Norm::NO_NO_COMP_YES; + } + } + } else { // no mapping + norm.leadCC=norm.trailCC=norm.cc; + + norm.hasCompBoundaryBefore= + norm.cc==0 && !norm.combinesBack; + norm.hasCompBoundaryAfter= + norm.cc==0 && !norm.combinesBack && norm.compositions==nullptr; + + if(norm.combinesBack) { + if(norm.compositions!=nullptr) { + // Earlier code checked ccc=0. 
+ norm.type=Norm::MAYBE_YES_COMBINES_FWD; + } else { + norm.type=Norm::MAYBE_YES_SIMPLE; // any ccc + } + } else if(norm.compositions!=nullptr) { + // Earlier code checked ccc=0. + norm.type=Norm::YES_YES_COMBINES_FWD; + } else if(norm.cc!=0) { + norm.type=Norm::YES_YES_WITH_CC; + } else { + norm.type=Norm::INERT; + } + } +} + +class Norm16Writer : public Norms::Enumerator { +public: + Norm16Writer(UMutableCPTrie *trie, Norms &n, Normalizer2DataBuilder &b) : + Norms::Enumerator(n), builder(b), norm16Trie(trie) {} + void rangeHandler(UChar32 start, UChar32 end, Norm &norm) override { + builder.writeNorm16(norm16Trie, start, end, norm); + } + Normalizer2DataBuilder &builder; + UMutableCPTrie *norm16Trie; +}; + +void Normalizer2DataBuilder::setSmallFCD(UChar32 c) { + UChar32 lead= c<=0xffff ? c : U16_LEAD(c); + smallFCD[lead>>8]|=(uint8_t)1<<((lead>>5)&7); +} + +void Normalizer2DataBuilder::writeNorm16(UMutableCPTrie *norm16Trie, UChar32 start, UChar32 end, Norm &norm) { + if((norm.leadCC|norm.trailCC)!=0) { + for(UChar32 c=start; c<=end; ++c) { + setSmallFCD(c); + } + } + + int32_t norm16; + switch(norm.type) { + case Norm::INERT: + norm16=Normalizer2Impl::INERT; + break; + case Norm::YES_YES_COMBINES_FWD: + norm16=norm.offset*2; + break; + case Norm::YES_NO_COMBINES_FWD: + norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO]+norm.offset*2; + break; + case Norm::YES_NO_MAPPING_ONLY: + norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]+norm.offset*2; + break; + case Norm::NO_NO_COMP_YES: + norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO]+norm.offset*2; + break; + case Norm::NO_NO_COMP_BOUNDARY_BEFORE: + norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE]+norm.offset*2; + break; + case Norm::NO_NO_COMP_NO_MAYBE_CC: + norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_NO_MAYBE_CC]+norm.offset*2; + break; + case Norm::NO_NO_EMPTY: + norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO_EMPTY]+norm.offset*2; + break; + case Norm::NO_NO_DELTA: + { + // Positive 
offset from minNoNoDelta, shifted left for additional bits. + int32_t offset=(norm.offset+Normalizer2Impl::MAX_DELTA)<<Normalizer2Impl::DELTA_SHIFT; + if(norm.trailCC==0) { + // DELTA_TCCC_0==0 + } else if(norm.trailCC==1) { + offset|=Normalizer2Impl::DELTA_TCCC_1; + } else { + offset|=Normalizer2Impl::DELTA_TCCC_GT_1; + } + norm16=getMinNoNoDelta()+offset; + break; + } + case Norm::MAYBE_YES_COMBINES_FWD: + norm16=indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]+norm.offset*2; + break; + case Norm::MAYBE_YES_SIMPLE: + norm16=Normalizer2Impl::MIN_NORMAL_MAYBE_YES+norm.cc*2; // ccc=0..255 + break; + case Norm::YES_YES_WITH_CC: + U_ASSERT(norm.cc!=0); + norm16=Normalizer2Impl::MIN_YES_YES_WITH_CC-2+norm.cc*2; // ccc=1..255 + break; + default: // Should not occur. + exit(U_INTERNAL_PROGRAM_ERROR); + } + U_ASSERT((norm16&1)==0); + if(norm.hasCompBoundaryAfter) { + norm16|=Normalizer2Impl::HAS_COMP_BOUNDARY_AFTER; + } + IcuToolErrorCode errorCode("gennorm2/writeNorm16()"); + umutablecptrie_setRange(norm16Trie, start, end, (uint32_t)norm16, errorCode); + + // Set the minimum code points for real data lookups in the quick check loops. + UBool isDecompNo= + (Norm::YES_NO_COMBINES_FWD<=norm.type && norm.type<=Norm::NO_NO_DELTA) || + norm.cc!=0; + if(isDecompNo && start<indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]) { + indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=start; + } + UBool isCompNoMaybe= norm.type>=Norm::NO_NO_COMP_YES; + if(isCompNoMaybe && start<indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]) { + indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=start; + } + if(norm.leadCC!=0 && start<indexes[Normalizer2Impl::IX_MIN_LCCC_CP]) { + indexes[Normalizer2Impl::IX_MIN_LCCC_CP]=start; + } +} + +void Normalizer2DataBuilder::setHangulData(UMutableCPTrie *norm16Trie) { + HangulIterator hi; + const HangulIterator::Range *range; + // Check that none of the Hangul/Jamo code points have data. 
+ while((range=hi.nextRange())!=nullptr) { + for(UChar32 c=range->start; c<=range->end; ++c) { + if(umutablecptrie_get(norm16Trie, c)>Normalizer2Impl::INERT) { + fprintf(stderr, + "gennorm2 error: " + "illegal mapping/composition/ccc data for Hangul or Jamo U+%04lX\n", + (long)c); + exit(U_INVALID_FORMAT_ERROR); + } + } + } + // Set data for algorithmic runtime handling. + IcuToolErrorCode errorCode("gennorm2/setHangulData()"); + + // Jamo V/T are maybeYes + if(Hangul::JAMO_V_BASE<indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]) { + indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=Hangul::JAMO_V_BASE; + } + umutablecptrie_setRange(norm16Trie, Hangul::JAMO_L_BASE, Hangul::JAMO_L_END, + Normalizer2Impl::JAMO_L, errorCode); + umutablecptrie_setRange(norm16Trie, Hangul::JAMO_V_BASE, Hangul::JAMO_V_END, + Normalizer2Impl::JAMO_VT, errorCode); + // JAMO_T_BASE+1: not U+11A7 + umutablecptrie_setRange(norm16Trie, Hangul::JAMO_T_BASE+1, Hangul::JAMO_T_END, + Normalizer2Impl::JAMO_VT, errorCode); + + // Hangul LV encoded as minYesNo + uint32_t lv=indexes[Normalizer2Impl::IX_MIN_YES_NO]; + // Hangul LVT encoded as minYesNoMappingsOnly|HAS_COMP_BOUNDARY_AFTER + uint32_t lvt=indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]| + Normalizer2Impl::HAS_COMP_BOUNDARY_AFTER; + if(Hangul::HANGUL_BASE<indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]) { + indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=Hangul::HANGUL_BASE; + } + // Set the first LV, then write all other Hangul syllables as LVT, + // then overwrite the remaining LV. 
+ umutablecptrie_set(norm16Trie, Hangul::HANGUL_BASE, lv, errorCode); + umutablecptrie_setRange(norm16Trie, Hangul::HANGUL_BASE+1, Hangul::HANGUL_END, lvt, errorCode); + UChar32 c=Hangul::HANGUL_BASE; + while((c+=Hangul::JAMO_T_COUNT)<=Hangul::HANGUL_END) { + umutablecptrie_set(norm16Trie, c, lv, errorCode); + } + errorCode.assertSuccess(); +} + +LocalUCPTriePointer Normalizer2DataBuilder::processData() { + // Build composition lists before recursive decomposition, + // so that we still have the raw, pair-wise mappings. + CompositionBuilder compBuilder(norms); + norms.enumRanges(compBuilder); + + // Recursively decompose all mappings. + Decomposer decomposer(norms); + do { + decomposer.didDecompose=false; + norms.enumRanges(decomposer); + } while(decomposer.didDecompose); + + // Set the Norm::Type and other properties. + int32_t normsLength=norms.length(); + for(int32_t i=1; i<normsLength; ++i) { + postProcess(norms.getNormRefByIndex(i)); + } + + // Write the properties, mappings and composition lists to + // appropriate parts of the "extra data" array. 
+ ExtraData extra(norms, optimization==OPTIMIZE_FAST); + norms.enumRanges(extra); + + extraData=extra.yesYesCompositions; + indexes[Normalizer2Impl::IX_MIN_YES_NO]=extraData.length()*2; + extraData.append(extra.yesNoMappingsAndCompositions); + indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]=extraData.length()*2; + extraData.append(extra.yesNoMappingsOnly); + indexes[Normalizer2Impl::IX_MIN_NO_NO]=extraData.length()*2; + extraData.append(extra.noNoMappingsCompYes); + indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE]=extraData.length()*2; + extraData.append(extra.noNoMappingsCompBoundaryBefore); + indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_NO_MAYBE_CC]=extraData.length()*2; + extraData.append(extra.noNoMappingsCompNoMaybeCC); + indexes[Normalizer2Impl::IX_MIN_NO_NO_EMPTY]=extraData.length()*2; + extraData.append(extra.noNoMappingsEmpty); + indexes[Normalizer2Impl::IX_LIMIT_NO_NO]=extraData.length()*2; + + // Pad the maybeYesCompositions length to a multiple of 4, + // so that NO_NO_DELTA bits 2..1 can be used without subtracting the center. + while(extra.maybeYesCompositions.length()&3) { + extra.maybeYesCompositions.append((char16_t)0); + } + extraData.insert(0, extra.maybeYesCompositions); + indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]= + Normalizer2Impl::MIN_NORMAL_MAYBE_YES- + extra.maybeYesCompositions.length()*2; + + // Pad to even length for 4-byte alignment of following data. + if(extraData.length()&1) { + extraData.append((char16_t)0); + } + + int32_t minNoNoDelta=getMinNoNoDelta(); + U_ASSERT((minNoNoDelta&7)==0); + if(indexes[Normalizer2Impl::IX_LIMIT_NO_NO]>minNoNoDelta) { + fprintf(stderr, + "gennorm2 error: " + "data structure overflow, too much mapping composition data\n"); + exit(U_BUFFER_OVERFLOW_ERROR); + } + + // writeNorm16() and setHangulData() reduce these as needed. 
+ indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=0x110000; + indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=0x110000; + indexes[Normalizer2Impl::IX_MIN_LCCC_CP]=0x110000; + + IcuToolErrorCode errorCode("gennorm2/processData()"); + UMutableCPTrie *norm16Trie = umutablecptrie_open( + Normalizer2Impl::INERT, Normalizer2Impl::INERT, errorCode); + errorCode.assertSuccess(); + + // Map each code point to its norm16 value, + // including the properties that fit directly, + // and the offset to the "extra data" if necessary. + Norm16Writer norm16Writer(norm16Trie, norms, *this); + norms.enumRanges(norm16Writer); + // TODO: iterate via getRange() instead of callback? + + setHangulData(norm16Trie); + + // Look for the "worst" norm16 value of any supplementary code point + // corresponding to a lead surrogate, and set it as that surrogate's value. + // Enables UTF-16 quick check inner loops to look at only code units. + // + // We could be more sophisticated: + // We could collect a bit set for whether there are values in the different + // norm16 ranges (yesNo, maybeYes, yesYesWithCC etc.) + // and select the best value that only breaks the composition and/or decomposition + // inner loops if necessary. + // However, that seems like overkill for an optimization for supplementary characters. + // + // First check that surrogate code *points* are inert. + // The parser should have rejected values/mappings for them. + uint32_t value; + UChar32 end = umutablecptrie_getRange(norm16Trie, 0xd800, UCPMAP_RANGE_NORMAL, 0, + nullptr, nullptr, &value); + if (value != Normalizer2Impl::INERT || end < 0xdfff) { + fprintf(stderr, + "gennorm2 error: not all surrogate code points are inert: U+d800..U+%04x=%lx\n", + (int)end, (long)value); + exit(U_INTERNAL_PROGRAM_ERROR); + } + uint32_t maxNorm16 = 0; + // ANDing values yields 0 bits where any value has a 0. + // Used for worst-case HAS_COMP_BOUNDARY_AFTER. 
+ uint32_t andedNorm16 = 0; + end = 0; + for (UChar32 start = 0x10000;;) { + if (start > end) { + end = umutablecptrie_getRange(norm16Trie, start, UCPMAP_RANGE_NORMAL, 0, + nullptr, nullptr, &value); + if (end < 0) { break; } + } + if ((start & 0x3ff) == 0) { + // Data for a new lead surrogate. + maxNorm16 = andedNorm16 = value; + } else { + if (value > maxNorm16) { + maxNorm16 = value; + } + andedNorm16 &= value; + } + // Intersect each range with the code points for one lead surrogate. + UChar32 leadEnd = start | 0x3ff; + if (leadEnd <= end) { + // End of the supplementary block for a lead surrogate. + if (maxNorm16 >= (uint32_t)indexes[Normalizer2Impl::IX_LIMIT_NO_NO]) { + // Set noNo ("worst" value) if it got into "less-bad" maybeYes or ccc!=0. + // Otherwise it might end up at something like JAMO_VT which stays in + // the inner decomposition quick check loop. + maxNorm16 = (uint32_t)indexes[Normalizer2Impl::IX_LIMIT_NO_NO]; + } + maxNorm16 = + (maxNorm16 & ~Normalizer2Impl::HAS_COMP_BOUNDARY_AFTER)| + (andedNorm16 & Normalizer2Impl::HAS_COMP_BOUNDARY_AFTER); + if (maxNorm16 != Normalizer2Impl::INERT) { + umutablecptrie_set(norm16Trie, U16_LEAD(start), maxNorm16, errorCode); + } + if (value == Normalizer2Impl::INERT) { + // Potentially skip inert supplementary blocks for several lead surrogates. + start = (end + 1) & ~0x3ff; + } else { + start = leadEnd + 1; + } + } else { + start = end + 1; + } + } + + // Adjust supplementary minimum code points to break quick check loops at their lead surrogates. + // For an empty data file, minCP=0x110000 turns into 0xdc00 (first trail surrogate) + // which is harmless. + // As a result, the minimum code points are always BMP code points. 
+ int32_t minCP=indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]; + if(minCP>=0x10000) { + indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=U16_LEAD(minCP); + } + minCP=indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]; + if(minCP>=0x10000) { + indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=U16_LEAD(minCP); + } + minCP=indexes[Normalizer2Impl::IX_MIN_LCCC_CP]; + if(minCP>=0x10000) { + indexes[Normalizer2Impl::IX_MIN_LCCC_CP]=U16_LEAD(minCP); + } + + LocalUCPTriePointer builtTrie( + umutablecptrie_buildImmutable(norm16Trie, UCPTRIE_TYPE_FAST, UCPTRIE_VALUE_BITS_16, errorCode)); + norm16TrieLength=ucptrie_toBinary(builtTrie.getAlias(), nullptr, 0, errorCode); + if(errorCode.get()!=U_BUFFER_OVERFLOW_ERROR) { + fprintf(stderr, "gennorm2 error: unable to build/serialize the normalization trie - %s\n", + errorCode.errorName()); + exit(errorCode.reset()); + } + umutablecptrie_close(norm16Trie); + errorCode.reset(); + norm16TrieBytes=new uint8_t[norm16TrieLength]; + ucptrie_toBinary(builtTrie.getAlias(), norm16TrieBytes, norm16TrieLength, errorCode); + errorCode.assertSuccess(); + + int32_t offset=(int32_t)sizeof(indexes); + indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET]=offset; + offset+=norm16TrieLength; + indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET]=offset; + offset+=extraData.length()*2; + indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET]=offset; + offset+=sizeof(smallFCD); + int32_t totalSize=offset; + for(int32_t i=Normalizer2Impl::IX_RESERVED3_OFFSET; i<=Normalizer2Impl::IX_TOTAL_SIZE; ++i) { + indexes[i]=totalSize; + } + + if(beVerbose) { + printf("size of normalization trie: %5ld bytes\n", (long)norm16TrieLength); + printf("size of 16-bit extra data: %5ld uint16_t\n", (long)extraData.length()); + printf("size of small-FCD data: %5ld bytes\n", (long)sizeof(smallFCD)); + printf("size of binary data file contents: %5ld bytes\n", (long)totalSize); + printf("minDecompNoCodePoint: U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]); + 
printf("minCompNoMaybeCodePoint: U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]); + printf("minLcccCodePoint: U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_LCCC_CP]); + printf("minYesNo: (with compositions) 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_YES_NO]); + printf("minYesNoMappingsOnly: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]); + printf("minNoNo: (comp-normalized) 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_NO_NO]); + printf("minNoNoCompBoundaryBefore: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE]); + printf("minNoNoCompNoMaybeCC: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_NO_MAYBE_CC]); + printf("minNoNoEmpty: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_NO_NO_EMPTY]); + printf("limitNoNo: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_LIMIT_NO_NO]); + printf("minNoNoDelta: 0x%04x\n", (int)minNoNoDelta); + printf("minMaybeYes: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]); + } + + UVersionInfo nullVersion={ 0, 0, 0, 0 }; + if(0==memcmp(nullVersion, unicodeVersion, 4)) { + u_versionFromString(unicodeVersion, U_UNICODE_VERSION); + } + memcpy(dataInfo.dataVersion, unicodeVersion, 4); + return builtTrie; +} + +void Normalizer2DataBuilder::writeBinaryFile(const char *filename) { + processData(); + + IcuToolErrorCode errorCode("gennorm2/writeBinaryFile()"); + UNewDataMemory *pData= + udata_create(nullptr, nullptr, filename, &dataInfo, + haveCopyright ? 
U_COPYRIGHT_STRING : nullptr, errorCode); + if(errorCode.isFailure()) { + fprintf(stderr, "gennorm2 error: unable to create the output file %s - %s\n", + filename, errorCode.errorName()); + exit(errorCode.reset()); + } + udata_writeBlock(pData, indexes, sizeof(indexes)); + udata_writeBlock(pData, norm16TrieBytes, norm16TrieLength); + udata_writeUString(pData, toUCharPtr(extraData.getBuffer()), extraData.length()); + udata_writeBlock(pData, smallFCD, sizeof(smallFCD)); + int32_t writtenSize=udata_finish(pData, errorCode); + if(errorCode.isFailure()) { + fprintf(stderr, "gennorm2: error %s writing the output file\n", errorCode.errorName()); + exit(errorCode.reset()); + } + int32_t totalSize=indexes[Normalizer2Impl::IX_TOTAL_SIZE]; + if(writtenSize!=totalSize) { + fprintf(stderr, "gennorm2 error: written size %ld != calculated size %ld\n", + (long)writtenSize, (long)totalSize); + exit(U_INTERNAL_PROGRAM_ERROR); + } +} + +void +Normalizer2DataBuilder::writeCSourceFile(const char *filename) { + LocalUCPTriePointer norm16Trie = processData(); + + IcuToolErrorCode errorCode("gennorm2/writeCSourceFile()"); + const char *basename=findBasename(filename); + CharString path(filename, (int32_t)(basename-filename), errorCode); + CharString dataName(basename, errorCode); + const char *extension=strrchr(basename, '.'); + if(extension!=nullptr) { + dataName.truncate((int32_t)(extension-basename)); + } + const char *name=dataName.data(); + errorCode.assertSuccess(); + + FILE *f=usrc_create(path.data(), basename, 2016, "icu/source/tools/gennorm2/n2builder.cpp"); + if(f==nullptr) { + fprintf(stderr, "gennorm2/writeCSourceFile() error: unable to create the output file %s\n", + filename); + exit(U_FILE_ACCESS_ERROR); + } + fputs("#ifdef INCLUDED_FROM_NORMALIZER2_CPP\n\n", f); + + char line[100]; + snprintf(line, sizeof(line), "static const UVersionInfo %s_formatVersion={", name); + usrc_writeArray(f, line, dataInfo.formatVersion, 8, 4, "", "};\n"); + snprintf(line, sizeof(line), "static 
const UVersionInfo %s_dataVersion={", name); + usrc_writeArray(f, line, dataInfo.dataVersion, 8, 4, "", "};\n\n"); + snprintf(line, sizeof(line), "static const int32_t %s_indexes[Normalizer2Impl::IX_COUNT]={\n", name); + usrc_writeArray(f, line, indexes, 32, Normalizer2Impl::IX_COUNT, "", "\n};\n\n"); + + usrc_writeUCPTrie(f, name, norm16Trie.getAlias(), UPRV_TARGET_SYNTAX_CCODE); + + snprintf(line, sizeof(line), "static const uint16_t %s_extraData[%%ld]={\n", name); + usrc_writeArray(f, line, extraData.getBuffer(), 16, extraData.length(), "", "\n};\n\n"); + snprintf(line, sizeof(line), "static const uint8_t %s_smallFCD[%%ld]={\n", name); + usrc_writeArray(f, line, smallFCD, 8, sizeof(smallFCD), "", "\n};\n\n"); + + fputs("#endif // INCLUDED_FROM_NORMALIZER2_CPP\n", f); + fclose(f); +} + +namespace { + +bool equalStrings(const UnicodeString *s1, const UnicodeString *s2) { + if(s1 == nullptr) { + return s2 == nullptr; + } else if(s2 == nullptr) { + return false; + } else { + return *s1 == *s2; + } +} + +const char *typeChars = "?-=>"; + +void writeMapping(FILE *f, const UnicodeString *m) { + if(m != nullptr && !m->isEmpty()) { + int32_t i = 0; + UChar32 c = m->char32At(i); + fprintf(f, "%04lX", (long)c); + while((i += U16_LENGTH(c)) < m->length()) { + c = m->char32At(i); + fprintf(f, " %04lX", (long)c); + } + } + fputs("\n", f); +} + +} // namespace + +void +Normalizer2DataBuilder::writeDataFile(const char *filename, bool writeRemoved) const { + // Do not processData() before writing the input-syntax data file. 
+ FILE *f = fopen(filename, "w"); + if(f == nullptr) { + fprintf(stderr, "gennorm2/writeDataFile() error: unable to create the output file %s\n", + filename); + exit(U_FILE_ACCESS_ERROR); + return; + } + + if(unicodeVersion[0] != 0 || unicodeVersion[1] != 0 || + unicodeVersion[2] != 0 || unicodeVersion[3] != 0) { + char uv[U_MAX_VERSION_STRING_LENGTH]; + u_versionToString(unicodeVersion, uv); + fprintf(f, "* Unicode %s\n\n", uv); + } + + UnicodeSetIterator ccIter(norms.ccSet); + UChar32 start = U_SENTINEL; + UChar32 end = U_SENTINEL; + uint8_t prevCC = 0; + bool done = false; + bool didWrite = false; + do { + UChar32 c; + uint8_t cc; + if(ccIter.next() && !ccIter.isString()) { + c = ccIter.getCodepoint(); + cc = norms.getCC(c); + } else { + c = 0x110000; + cc = 0; + done = true; + } + if(cc == prevCC && c == (end + 1)) { + end = c; + } else { + if(prevCC != 0) { + if(start == end) { + fprintf(f, "%04lX:%d\n", (long)start, (int)prevCC); + } else { + fprintf(f, "%04lX..%04lX:%d\n", (long)start, (long)end, (int)prevCC); + } + didWrite = true; + } + start = end = c; + prevCC = cc; + } + } while(!done); + if(didWrite) { + fputs("\n", f); + } + + UnicodeSetIterator mIter(norms.mappingSet); + start = U_SENTINEL; + end = U_SENTINEL; + const UnicodeString *prevMapping = nullptr; + Norm::MappingType prevType = Norm::NONE; + done = false; + do { + UChar32 c; + const Norm *norm; + if(mIter.next() && !mIter.isString()) { + c = mIter.getCodepoint(); + norm = norms.getNorm(c); + } else { + c = 0x110000; + norm = nullptr; + done = true; + } + const UnicodeString *mapping; + Norm::MappingType type; + if(norm == nullptr) { + mapping = nullptr; + type = Norm::NONE; + } else { + type = norm->mappingType; + if(type == Norm::NONE) { + mapping = nullptr; + } else { + mapping = norm->mapping; + } + } + if(type == prevType && equalStrings(mapping, prevMapping) && c == (end + 1)) { + end = c; + } else { + if(writeRemoved ? 
prevType != Norm::NONE : prevType > Norm::REMOVED) { + if(start == end) { + fprintf(f, "%04lX%c", (long)start, typeChars[prevType]); + } else { + fprintf(f, "%04lX..%04lX%c", (long)start, (long)end, typeChars[prevType]); + } + writeMapping(f, prevMapping); + } + start = end = c; + prevMapping = mapping; + prevType = type; + } + } while(!done); + + fclose(f); +} + +void +Normalizer2DataBuilder::computeDiff(const Normalizer2DataBuilder &b1, + const Normalizer2DataBuilder &b2, + Normalizer2DataBuilder &diff) { + // Compute diff = b1 - b2 + // so that we should be able to get b1 = b2 + diff. + if(0 != memcmp(b1.unicodeVersion, b2.unicodeVersion, U_MAX_VERSION_LENGTH)) { + memcpy(diff.unicodeVersion, b1.unicodeVersion, U_MAX_VERSION_LENGTH); + } + + UnicodeSet ccSet(b1.norms.ccSet); + ccSet.addAll(b2.norms.ccSet); + UnicodeSetIterator ccIter(ccSet); + while(ccIter.next() && !ccIter.isString()) { + UChar32 c = ccIter.getCodepoint(); + uint8_t cc1 = b1.norms.getCC(c); + uint8_t cc2 = b2.norms.getCC(c); + if(cc1 != cc2) { + diff.setCC(c, cc1); + } + } + + UnicodeSet mSet(b1.norms.mappingSet); + mSet.addAll(b2.norms.mappingSet); + UnicodeSetIterator mIter(mSet); + while(mIter.next() && !mIter.isString()) { + UChar32 c = mIter.getCodepoint(); + const Norm *norm1 = b1.norms.getNorm(c); + const Norm *norm2 = b2.norms.getNorm(c); + const UnicodeString *mapping1; + Norm::MappingType type1; + if(norm1 == nullptr || !norm1->hasMapping()) { + mapping1 = nullptr; + type1 = Norm::NONE; + } else { + mapping1 = norm1->mapping; + type1 = norm1->mappingType; + } + const UnicodeString *mapping2; + Norm::MappingType type2; + if(norm2 == nullptr || !norm2->hasMapping()) { + mapping2 = nullptr; + type2 = Norm::NONE; + } else { + mapping2 = norm2->mapping; + type2 = norm2->mappingType; + } + if(type1 == type2 && equalStrings(mapping1, mapping2)) { + // Nothing to do. 
+ } else if(type1 == Norm::NONE) { + diff.removeMapping(c); + } else if(type1 == Norm::ROUND_TRIP) { + diff.setRoundTripMapping(c, *mapping1); + } else if(type1 == Norm::ONE_WAY) { + diff.setOneWayMapping(c, *mapping1); + } + } +} + +U_NAMESPACE_END + +#endif /* #if !UCONFIG_NO_NORMALIZATION */ + +/* + * Hey, Emacs, please set the following: + * + * Local Variables: + * indent-tabs-mode: nil + * End: + */ diff --git a/intl/icu/source/tools/gennorm2/n2builder.h b/intl/icu/source/tools/gennorm2/n2builder.h new file mode 100644 index 0000000000..b3698253be --- /dev/null +++ b/intl/icu/source/tools/gennorm2/n2builder.h @@ -0,0 +1,122 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2009-2014, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: n2builder.h +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2009nov25 +* created by: Markus W. 
Scherer +*/ + +#ifndef __N2BUILDER_H__ +#define __N2BUILDER_H__ + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_NORMALIZATION + +#include "unicode/errorcode.h" +#include "unicode/umutablecptrie.h" +#include "unicode/unistr.h" +#include "normalizer2impl.h" // for IX_COUNT +#include "toolutil.h" +#include "norms.h" + +U_NAMESPACE_BEGIN + +extern UBool beVerbose, haveCopyright; + +class Normalizer2DataBuilder { +public: + Normalizer2DataBuilder(UErrorCode &errorCode); + ~Normalizer2DataBuilder(); + + enum OverrideHandling { + OVERRIDE_NONE, + OVERRIDE_ANY, + OVERRIDE_PREVIOUS + }; + + void setOverrideHandling(OverrideHandling oh); + + enum Optimization { + OPTIMIZE_NORMAL, + OPTIMIZE_FAST + }; + + void setOptimization(Optimization opt) { optimization=opt; } + + void setCC(UChar32 c, uint8_t cc); + void setOneWayMapping(UChar32 c, const UnicodeString &m); + void setRoundTripMapping(UChar32 c, const UnicodeString &m); + void removeMapping(UChar32 c); + + void setUnicodeVersion(const char *v); + + void writeBinaryFile(const char *filename); + void writeCSourceFile(const char *filename); + void writeDataFile(const char *filename, bool writeRemoved) const; + + static void computeDiff(const Normalizer2DataBuilder &b1, + const Normalizer2DataBuilder &b2, + Normalizer2DataBuilder &diff); + +private: + friend class Norm16Writer; + + Normalizer2DataBuilder(const Normalizer2DataBuilder &other) = delete; + Normalizer2DataBuilder &operator=(const Normalizer2DataBuilder &other) = delete; + + Norm *checkNormForMapping(Norm *p, UChar32 c); // check for permitted overrides + + /** + * A starter character with a mapping does not have a composition boundary after it + * if the character itself combines-forward (which is tested by the caller of this function), + * or it is deleted (mapped to the empty string), + * or its mapping contains no starter, + * or the last starter combines-forward. 
+ */ + UBool mappingHasCompBoundaryAfter(const BuilderReorderingBuffer &buffer, + Norm::MappingType mappingType) const; + /** Returns true if the mapping by itself recomposes, that is, it is not comp-normalized. */ + UBool mappingRecomposes(const BuilderReorderingBuffer &buffer) const; + void postProcess(Norm &norm); + + void setSmallFCD(UChar32 c); + int32_t getMinNoNoDelta() const { + return indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]- + ((2*Normalizer2Impl::MAX_DELTA+1)<<Normalizer2Impl::DELTA_SHIFT); + } + void writeNorm16(UMutableCPTrie *norm16Trie, UChar32 start, UChar32 end, Norm &norm); + void setHangulData(UMutableCPTrie *norm16Trie); + LocalUCPTriePointer processData(); + + Norms norms; + + int32_t phase; + OverrideHandling overrideHandling; + + Optimization optimization; + + int32_t indexes[Normalizer2Impl::IX_COUNT]; + uint8_t *norm16TrieBytes; + int32_t norm16TrieLength; + UnicodeString extraData; + uint8_t smallFCD[0x100]; + + UVersionInfo unicodeVersion; +}; + +U_NAMESPACE_END + +#endif // #if !UCONFIG_NO_NORMALIZATION + +#endif // __N2BUILDER_H__ diff --git a/intl/icu/source/tools/gennorm2/norms.cpp b/intl/icu/source/tools/gennorm2/norms.cpp new file mode 100644 index 0000000000..9dd8dca977 --- /dev/null +++ b/intl/icu/source/tools/gennorm2/norms.cpp @@ -0,0 +1,324 @@ +// © 2017 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html + +// norms.cpp +// created: 2017jun04 Markus W. 
Scherer +// (pulled out of n2builder.cpp) + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_NORMALIZATION + +#include <stdio.h> +#include <stdlib.h> +#include "unicode/errorcode.h" +#include "unicode/umutablecptrie.h" +#include "unicode/unistr.h" +#include "unicode/utf16.h" +#include "normalizer2impl.h" +#include "norms.h" +#include "toolutil.h" +#include "uvectr32.h" + +U_NAMESPACE_BEGIN + +void BuilderReorderingBuffer::append(UChar32 c, uint8_t cc) { + if(cc==0 || fLength==0 || ccAt(fLength-1)<=cc) { + if(cc==0) { + fLastStarterIndex=fLength; + } + fArray[fLength++]=(c<<8)|cc; + return; + } + // Let this character bubble back to its canonical order. + int32_t i=fLength-1; + while(i>fLastStarterIndex && ccAt(i)>cc) { + --i; + } + ++i; // after the last starter or prevCC<=cc + // Move this and the following characters forward one to make space. + for(int32_t j=fLength; i<j; --j) { + fArray[j]=fArray[j-1]; + } + fArray[i]=(c<<8)|cc; + ++fLength; + fDidReorder=true; +} + +void BuilderReorderingBuffer::toString(UnicodeString &dest) const { + dest.remove(); + for(int32_t i=0; i<fLength; ++i) { + dest.append(charAt(i)); + } +} + +UChar32 Norm::combine(UChar32 trail) const { + int32_t length; + const CompositionPair *pairs=getCompositionPairs(length); + for(int32_t i=0; i<length; ++i) { + if(trail==pairs[i].trail) { + return pairs[i].composite; + } + if(trail<pairs[i].trail) { + break; + } + } + return U_SENTINEL; +} + +Norms::Norms(UErrorCode &errorCode) { + normTrie = umutablecptrie_open(0, 0, &errorCode); + normMem=utm_open("gennorm2 normalization structs", 10000, 0x110100, sizeof(Norm)); + // Default "inert" Norm struct at index 0. Practically immutable. 
+ norms=allocNorm(); + norms->type=Norm::INERT; +} + +Norms::~Norms() { + umutablecptrie_close(normTrie); + int32_t normsLength=utm_countItems(normMem); + for(int32_t i=1; i<normsLength; ++i) { + delete norms[i].mapping; + delete norms[i].rawMapping; + delete norms[i].compositions; + } + utm_close(normMem); +} + +Norm *Norms::allocNorm() { + Norm *p=(Norm *)utm_alloc(normMem); + norms=(Norm *)utm_getStart(normMem); // in case it got reallocated + return p; +} + +Norm *Norms::getNorm(UChar32 c) { + uint32_t i = umutablecptrie_get(normTrie, c); + if(i==0) { + return nullptr; + } + return norms+i; +} + +const Norm *Norms::getNorm(UChar32 c) const { + uint32_t i = umutablecptrie_get(normTrie, c); + if(i==0) { + return nullptr; + } + return norms+i; +} + +const Norm &Norms::getNormRef(UChar32 c) const { + return norms[umutablecptrie_get(normTrie, c)]; +} + +Norm *Norms::createNorm(UChar32 c) { + uint32_t i=umutablecptrie_get(normTrie, c); + if(i!=0) { + return norms+i; + } else { + /* allocate Norm */ + Norm *p=allocNorm(); + IcuToolErrorCode errorCode("gennorm2/createNorm()"); + umutablecptrie_set(normTrie, c, (uint32_t)(p - norms), errorCode); + return p; + } +} + +void Norms::reorder(UnicodeString &mapping, BuilderReorderingBuffer &buffer) const { + int32_t length=mapping.length(); + U_ASSERT(length<=Normalizer2Impl::MAPPING_LENGTH_MASK); + const char16_t *s=mapping.getBuffer(); + int32_t i=0; + UChar32 c; + while(i<length) { + U16_NEXT(s, i, length, c); + buffer.append(c, getCC(c)); + } + if(buffer.didReorder()) { + buffer.toString(mapping); + } +} + +UBool Norms::combinesWithCCBetween(const Norm &norm, uint8_t lowCC, int32_t highCC) const { + if((highCC-lowCC)>=2) { + int32_t length; + const CompositionPair *pairs=norm.getCompositionPairs(length); + for(int32_t i=0; i<length; ++i) { + uint8_t trailCC=getCC(pairs[i].trail); + if(lowCC<trailCC && trailCC<highCC) { + return true; + } + } + } + return false; +} + +void Norms::enumRanges(Enumerator &e) { + UChar32 start 
= 0, end; + uint32_t i; + while ((end = umutablecptrie_getRange(normTrie, start, UCPMAP_RANGE_NORMAL, 0, + nullptr, nullptr, &i)) >= 0) { + if (i > 0) { + e.rangeHandler(start, end, norms[i]); + } + start = end + 1; + } +} + +Norms::Enumerator::~Enumerator() {} + +void CompositionBuilder::rangeHandler(UChar32 start, UChar32 end, Norm &norm) { + if(norm.mappingType!=Norm::ROUND_TRIP) { return; } + if(start!=end) { + fprintf(stderr, + "gennorm2 error: same round-trip mapping for " + "more than 1 code point U+%04lX..U+%04lX\n", + (long)start, (long)end); + exit(U_INVALID_FORMAT_ERROR); + } + if(norm.cc!=0) { + fprintf(stderr, + "gennorm2 error: " + "U+%04lX has a round-trip mapping and ccc!=0, " + "not possible in Unicode normalization\n", + (long)start); + exit(U_INVALID_FORMAT_ERROR); + } + // setRoundTripMapping() ensured that there are exactly two code points. + const UnicodeString &m=*norm.mapping; + UChar32 lead=m.char32At(0); + UChar32 trail=m.char32At(m.length()-1); + if(norms.getCC(lead)!=0) { + fprintf(stderr, + "gennorm2 error: " + "U+%04lX's round-trip mapping's starter U+%04lX has ccc!=0, " + "not possible in Unicode normalization\n", + (long)start, (long)lead); + exit(U_INVALID_FORMAT_ERROR); + } + // Flag for trailing character. + norms.createNorm(trail)->combinesBack=true; + // Insert (trail, composite) pair into compositions list for the lead character. + IcuToolErrorCode errorCode("gennorm2/addComposition()"); + Norm *leadNorm=norms.createNorm(lead); + UVector32 *compositions=leadNorm->compositions; + int32_t i; + if(compositions==nullptr) { + compositions=leadNorm->compositions=new UVector32(errorCode); + i=0; // "insert" the first pair at index 0 + } else { + // Insertion sort, and check for duplicate trail characters. 
+ int32_t length; + const CompositionPair *pairs=leadNorm->getCompositionPairs(length); + for(i=0; i<length; ++i) { + if(trail==pairs[i].trail) { + fprintf(stderr, + "gennorm2 error: same round-trip mapping for " + "more than 1 code point (e.g., U+%04lX) to U+%04lX + U+%04lX\n", + (long)start, (long)lead, (long)trail); + exit(U_INVALID_FORMAT_ERROR); + } + if(trail<pairs[i].trail) { + break; + } + } + } + compositions->insertElementAt(trail, 2*i, errorCode); + compositions->insertElementAt(start, 2*i+1, errorCode); +} + +void Decomposer::rangeHandler(UChar32 start, UChar32 end, Norm &norm) { + if(!norm.hasMapping()) { return; } + const UnicodeString &m=*norm.mapping; + UnicodeString *decomposed=nullptr; + const char16_t *s=toUCharPtr(m.getBuffer()); + int32_t length=m.length(); + int32_t prev, i=0; + UChar32 c; + while(i<length) { + prev=i; + U16_NEXT(s, i, length, c); + if(start<=c && c<=end) { + fprintf(stderr, + "gennorm2 error: U+%04lX maps to itself directly or indirectly\n", + (long)c); + exit(U_INVALID_FORMAT_ERROR); + } + const Norm &cNorm=norms.getNormRef(c); + if(cNorm.hasMapping()) { + if(norm.mappingType==Norm::ROUND_TRIP) { + if(prev==0) { + if(cNorm.mappingType!=Norm::ROUND_TRIP) { + fprintf(stderr, + "gennorm2 error: " + "U+%04lX's round-trip mapping's starter " + "U+%04lX one-way-decomposes, " + "not possible in Unicode normalization\n", + (long)start, (long)c); + exit(U_INVALID_FORMAT_ERROR); + } + uint8_t myTrailCC=norms.getCC(m.char32At(i)); + UChar32 cTrailChar=cNorm.mapping->char32At(cNorm.mapping->length()-1); + uint8_t cTrailCC=norms.getCC(cTrailChar); + if(cTrailCC>myTrailCC) { + fprintf(stderr, + "gennorm2 error: " + "U+%04lX's round-trip mapping's starter " + "U+%04lX decomposes and the " + "inner/earlier tccc=%hu > outer/following tccc=%hu, " + "not possible in Unicode normalization\n", + (long)start, (long)c, + (short)cTrailCC, (short)myTrailCC); + exit(U_INVALID_FORMAT_ERROR); + } + } else { + fprintf(stderr, + "gennorm2 error: " + 
"U+%04lX's round-trip mapping's non-starter " + "U+%04lX decomposes, " + "not possible in Unicode normalization\n", + (long)start, (long)c); + exit(U_INVALID_FORMAT_ERROR); + } + } + if(decomposed==nullptr) { + decomposed=new UnicodeString(m, 0, prev); + } + decomposed->append(*cNorm.mapping); + } else if(Hangul::isHangul(c)) { + char16_t buffer[3]; + int32_t hangulLength=Hangul::decompose(c, buffer); + if(norm.mappingType==Norm::ROUND_TRIP && prev!=0) { + fprintf(stderr, + "gennorm2 error: " + "U+%04lX's round-trip mapping's non-starter " + "U+%04lX decomposes, " + "not possible in Unicode normalization\n", + (long)start, (long)c); + exit(U_INVALID_FORMAT_ERROR); + } + if(decomposed==nullptr) { + decomposed=new UnicodeString(m, 0, prev); + } + decomposed->append(buffer, hangulLength); + } else if(decomposed!=nullptr) { + decomposed->append(m, prev, i-prev); + } + } + if(decomposed!=nullptr) { + if(norm.rawMapping==nullptr) { + // Remember the original mapping when decomposing recursively. + norm.rawMapping=norm.mapping; + } else { + delete norm.mapping; + } + norm.mapping=decomposed; + // Not norm.setMappingCP(); because the original mapping + // is most likely to be encodable as a delta. + didDecompose|=true; + } +} + +U_NAMESPACE_END + +#endif // #if !UCONFIG_NO_NORMALIZATION diff --git a/intl/icu/source/tools/gennorm2/norms.h b/intl/icu/source/tools/gennorm2/norms.h new file mode 100644 index 0000000000..f2778d9509 --- /dev/null +++ b/intl/icu/source/tools/gennorm2/norms.h @@ -0,0 +1,215 @@ +// © 2017 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html + +// norms.h +// created: 2017jun04 Markus W. Scherer +// (pulled out of n2builder.cpp) + +// Storing & manipulating Normalizer2 builder data. 
+ +#ifndef __NORMS_H__ +#define __NORMS_H__ + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_NORMALIZATION + +#include "unicode/errorcode.h" +#include "unicode/umutablecptrie.h" +#include "unicode/uniset.h" +#include "unicode/unistr.h" +#include "unicode/utf16.h" +#include "normalizer2impl.h" +#include "toolutil.h" +#include "uvectr32.h" + +U_NAMESPACE_BEGIN + +class BuilderReorderingBuffer { +public: + BuilderReorderingBuffer() : fLength(0), fLastStarterIndex(-1), fDidReorder(false) {} + void reset() { + fLength=0; + fLastStarterIndex=-1; + fDidReorder=false; + } + int32_t length() const { return fLength; } + UBool isEmpty() const { return fLength==0; } + int32_t lastStarterIndex() const { return fLastStarterIndex; } + UChar32 charAt(int32_t i) const { return fArray[i]>>8; } + uint8_t ccAt(int32_t i) const { return (uint8_t)fArray[i]; } + UBool didReorder() const { return fDidReorder; } + + void append(UChar32 c, uint8_t cc); + void toString(UnicodeString &dest) const; + +private: + int32_t fArray[Normalizer2Impl::MAPPING_LENGTH_MASK]; + int32_t fLength; + int32_t fLastStarterIndex; + UBool fDidReorder; +}; + +struct CompositionPair { + CompositionPair(UChar32 t, UChar32 c) : trail(t), composite(c) {} + UChar32 trail, composite; +}; + +struct Norm { + enum MappingType { NONE, REMOVED, ROUND_TRIP, ONE_WAY }; + + UBool hasMapping() const { return mappingType>REMOVED; } + + // Requires hasMapping() and well-formed mapping. 
+ void setMappingCP() { + UChar32 c; + if(!mapping->isEmpty() && mapping->length()==U16_LENGTH(c=mapping->char32At(0))) { + mappingCP=c; + } else { + mappingCP=U_SENTINEL; + } + } + + const CompositionPair *getCompositionPairs(int32_t &length) const { + if(compositions==nullptr) { + length=0; + return nullptr; + } else { + length=compositions->size()/2; + return reinterpret_cast<const CompositionPair *>(compositions->getBuffer()); + } + } + UChar32 combine(UChar32 trail) const; + + UnicodeString *mapping; + UnicodeString *rawMapping; // non-nullptr if the mapping is further decomposed + UChar32 mappingCP; // >=0 if mapping to 1 code point + int32_t mappingPhase; + MappingType mappingType; + + UVector32 *compositions; // (trail, composite) pairs + uint8_t cc, leadCC, trailCC; + UBool combinesBack; + UBool hasCompBoundaryBefore, hasCompBoundaryAfter; + + /** + * Overall type of normalization properties. + * Set after most processing is done. + * + * Corresponds to the rows in the chart on + * https://icu.unicode.org/design/normalization/custom + * in numerical (but reverse visual) order. + * + * YES_NO means composition quick check=yes, decomposition QC=no -- etc. + */ + enum Type { + /** Initial value until most processing is done. */ + UNKNOWN, + /** No mapping, does not combine, ccc=0. */ + INERT, + /** Starter, no mapping, has compositions. */ + YES_YES_COMBINES_FWD, + /** Starter with a round-trip mapping and compositions. */ + YES_NO_COMBINES_FWD, + /** Starter with a round-trip mapping but no compositions. */ + YES_NO_MAPPING_ONLY, + /** Has a one-way mapping which is comp-normalized. */ + NO_NO_COMP_YES, + /** Has a one-way mapping which is not comp-normalized but has a comp boundary before. */ + NO_NO_COMP_BOUNDARY_BEFORE, + /** Has a one-way mapping which does not have a comp boundary before. */ + NO_NO_COMP_NO_MAYBE_CC, + /** Has a one-way mapping to the empty string. */ + NO_NO_EMPTY, + /** Has an algorithmic one-way mapping to a single code point. 
*/ + NO_NO_DELTA, + /** + * Combines both backward and forward, has compositions. + * Allowed, but not normally used. + */ + MAYBE_YES_COMBINES_FWD, + /** Combines only backward. */ + MAYBE_YES_SIMPLE, + /** Non-zero ccc but does not combine backward. */ + YES_YES_WITH_CC + } type; + /** Offset into the type's part of the extra data, or the algorithmic-mapping delta. */ + int32_t offset; + + /** + * Error string set by processing functions that do not have access + * to the code point, deferred for readable reporting. + */ + const char *error; +}; + +class Norms { +public: + Norms(UErrorCode &errorCode); + ~Norms(); + + int32_t length() const { return utm_countItems(normMem); } + const Norm &getNormRefByIndex(int32_t i) const { return norms[i]; } + Norm &getNormRefByIndex(int32_t i) { return norms[i]; } + + Norm *allocNorm(); + /** Returns an existing Norm unit, or nullptr if c has no data. */ + Norm *getNorm(UChar32 c); + const Norm *getNorm(UChar32 c) const; + /** Returns a Norm unit, creating a new one if necessary. */ + Norm *createNorm(UChar32 c); + /** Returns an existing Norm unit, or an immutable empty object if c has no data. */ + const Norm &getNormRef(UChar32 c) const; + uint8_t getCC(UChar32 c) const { return getNormRef(c).cc; } + UBool combinesBack(UChar32 c) const { + return Hangul::isJamoV(c) || Hangul::isJamoT(c) || getNormRef(c).combinesBack; + } + + void reorder(UnicodeString &mapping, BuilderReorderingBuffer &buffer) const; + + // int32_t highCC not uint8_t so that we can pass in 256 as the upper limit. + UBool combinesWithCCBetween(const Norm &norm, uint8_t lowCC, int32_t highCC) const; + + class Enumerator { + public: + Enumerator(Norms &n) : norms(n) {} + virtual ~Enumerator(); + /** Called for enumerated value!=0. 
*/ + virtual void rangeHandler(UChar32 start, UChar32 end, Norm &norm) = 0; + protected: + Norms &norms; + }; + + void enumRanges(Enumerator &e); + + UnicodeSet ccSet, mappingSet; + +private: + Norms(const Norms &other) = delete; + Norms &operator=(const Norms &other) = delete; + + UMutableCPTrie *normTrie; + UToolMemory *normMem; + Norm *norms; +}; + +class CompositionBuilder : public Norms::Enumerator { +public: + CompositionBuilder(Norms &n) : Norms::Enumerator(n) {} + /** Adds a composition mapping for the first character in a round-trip mapping. */ + void rangeHandler(UChar32 start, UChar32 end, Norm &norm) override; +}; + +class Decomposer : public Norms::Enumerator { +public: + Decomposer(Norms &n) : Norms::Enumerator(n), didDecompose(false) {} + /** Decomposes each character of the current mapping. Sets didDecompose if any. */ + void rangeHandler(UChar32 start, UChar32 end, Norm &norm) override; + UBool didDecompose; +}; + +U_NAMESPACE_END + +#endif // #if !UCONFIG_NO_NORMALIZATION + +#endif // __NORMS_H__ diff --git a/intl/icu/source/tools/gennorm2/sources.txt b/intl/icu/source/tools/gennorm2/sources.txt new file mode 100644 index 0000000000..76452d26ea --- /dev/null +++ b/intl/icu/source/tools/gennorm2/sources.txt @@ -0,0 +1,4 @@ +extradata.cpp +gennorm2.cpp +n2builder.cpp +norms.cpp diff --git a/intl/icu/source/tools/genrb/Makefile.in b/intl/icu/source/tools/genrb/Makefile.in new file mode 100644 index 0000000000..336d839448 --- /dev/null +++ b/intl/icu/source/tools/genrb/Makefile.in @@ -0,0 +1,114 @@ +################################################################################# +## Makefile.in for ICU - tools/genrb # +## Copyright (C) 2016 and later: Unicode, Inc. and others. # +## License & terms of use: http://www.unicode.org/copyright.html # +## Copyright (c) 1999-2014, International Business Machines Corporation and # +## others. All Rights Reserved. 
# +################################################################################# + +## Source directory information +srcdir = @srcdir@ +top_srcdir = @top_srcdir@ + +top_builddir = ../.. + +include $(top_builddir)/icudefs.mk + +## Build directory information +subdir = tools/genrb + +TARGET_STUB_NAME = genrb +DERB_STUB_NAME = derb + +SECTION = 1 + +MAN_FILES = $(TARGET_STUB_NAME).$(SECTION) +@ICUIO_TRUE@MAN_FILES += $(DERB_STUB_NAME).$(SECTION) + +## Extra files to remove for 'make clean' +CLEANFILES = *~ $(MAN_FILES) $(DEPS) $(DERB_DEPS) + +## Target information +TARGET = $(BINDIR)/$(TARGET_STUB_NAME)$(EXEEXT) +# derb depends on icuio +@ICUIO_TRUE@DERB = $(BINDIR)/$(DERB_STUB_NAME)$(EXEEXT) + +CPPFLAGS += -I$(srcdir) -I$(top_srcdir)/common -I$(top_srcdir)/i18n -I$(srcdir)/../toolutil -I$(top_srcdir)/io +CPPFLAGS += -DUNISTR_FROM_CHAR_EXPLICIT=explicit -DUNISTR_FROM_STRING_EXPLICIT=explicit +LIBS = $(LIBICUTOOLUTIL) $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS) $(LIB_M) + +SOURCES = $(shell cat $(srcdir)/sources.txt) +OBJECTS = $(patsubst %.cpp,%.o,$(patsubst %.c,%.o, $(SOURCES))) +DERB_SOURCES = derb.cpp +DERB_OBJ = $(DERB_SOURCES:.cpp=.o) + +DEPS = $(OBJECTS:.o=.d) +DERB_DEPS = $(DERB_OBJ:.o=.d) + +-include Makefile.local + +## List of phony targets +.PHONY : all all-local install install-local clean clean-local \ +distclean distclean-local dist dist-local check check-local install-man + +## Clear suffix list +.SUFFIXES : + +## List of standard targets +all: all-local +install: install-local +clean: clean-local +distclean : distclean-local +dist: dist-local +check: all check-local + +all-local: $(TARGET) $(DERB) $(MAN_FILES) + +install-local: all-local install-man + $(MKINSTALLDIRS) $(DESTDIR)$(bindir) + $(INSTALL) $(TARGET) $(DESTDIR)$(bindir) +@ICUIO_TRUE@ $(INSTALL) $(DERB) $(DESTDIR)$(bindir) + +install-man: $(MAN_FILES) + $(MKINSTALLDIRS) $(DESTDIR)$(mandir)/man$(SECTION) + $(INSTALL_DATA) $? 
$(DESTDIR)$(mandir)/man$(SECTION) + +dist-local: + +clean-local: + test -z "$(CLEANFILES)" || $(RMV) $(CLEANFILES) + $(RMV) $(TARGET) $(DERB) $(OBJECTS) $(DERB_OBJ) + +distclean-local: clean-local + $(RMV) Makefile + +check-local: all-local + +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + cd $(top_builddir) \ + && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status + +$(TARGET) : $(OBJECTS) + $(LINK.cc) $(OUTOPT)$@ $^ $(LIBS) + $(POST_BUILD_STEP) + +$(DERB) : $(DERB_OBJ) + $(LINK.cc) $(OUTOPT)$@ $^ $(LIBICUIO) $(LIBS) + $(POST_BUILD_STEP) + +# This line is needed to serialize builds when the gmake -j option is used. +$(TARGET_STUB_NAME).$(SECTION): $(DERB_STUB_NAME).$(SECTION) + +%.$(SECTION): $(srcdir)/%.$(SECTION).in + cd $(top_builddir) \ + && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status + + +ifeq (,$(MAKECMDGOALS)) +-include $(DEPS) +-include $(DERB_DEPS) +else +ifneq ($(patsubst %clean,,$(MAKECMDGOALS)),) +-include $(DEPS) +endif +endif diff --git a/intl/icu/source/tools/genrb/derb.1.in b/intl/icu/source/tools/genrb/derb.1.in new file mode 100644 index 0000000000..725b571ce2 --- /dev/null +++ b/intl/icu/source/tools/genrb/derb.1.in @@ -0,0 +1,198 @@ +.\" Hey, Emacs! This is -*-nroff-*- you know... +.\" +.\" derb.1: manual page for the derb utility +.\" +.\" Copyright (C) 2016 and later: Unicode, Inc. and others. +.\" License & terms of use: http://www.unicode.org/copyright.html +.\" Copyright (C) 2000-2014 IBM, Inc. and others. 
+.\" +.TH DERB 1 "7 Mar 2014" "ICU MANPAGE" "ICU @VERSION@ Manual" +.SH NAME +.B derb +\- disassemble a resource bundle +.SH SYNOPSIS +.B derb +[ +.BR "\-h\fP, \fB\-?\fP, \fB\-\-help" +] +[ +.BR "\-V\fP, \fB\-\-version" +] +[ +.BR "\-v\fP, \fB\-\-verbose" +] +[ +.BI "\-e\fP, \fB\-\-encoding" " encoding" +] +[ +.BI "\-\-bom" +] +[ +.BI "\-t\fP, \fB\-\-truncate" " \fR[ \fPsize\fR ]\fP" +] +[ +.BI "\-s\fP, \fB\-\-sourcedir" " source" +] +[ +.BI "\-d\fP, \fB\-\-destdir" " destination" +] +[ +.BI "\-i\fP, \fB\-\-icudatadir" " directory" +] +[ +.BI "\-c\fP, \fB\-\-to\-stdout" +] +.IR bundle " \.\.\." +.SH DESCRIPTION +.B derb +reads the compiled resource +.I bundle +files passed on the command line and writes them back in text form. +The resulting text files have a +.B .txt +extension while compiled resource bundle source files typically have a +.B .res +extension. +.PP +It is customary to name the resource bundles by their locale name, +i.e. to use a locale identifier for the +.I bundle +filename, e.g. +.B ja_JP.res +for Japanese (Japan) data, or +.B root.res +for the root bundle. +This is especially important for +.B derb +since the locale name is not accessible directly from the compiled +resource bundle, and to know which locale to ask for when opening +the bundle. +.B derb +will produce a file whose base name is the base name of the compiled resource file itself. +If the +.BI "\-\-to\-stdout\fP, \fB\-c\fP" +option is used, however, the text will be written on the standard output. +.SH OPTIONS +.TP +.BR "\-h\fP, \fB\-?\fP, \fB\-\-help" +Print help about usage and exit. +.TP +.BR "\-V\fP, \fB\-\-version" +Print the version of +.B derb +and exit. +.TP +.BR "\-v\fP, \fB\-\-verbose" +Display extra informative messages during execution. +.TP +.BR "\-A\fP, \fB\-\-suppressAliases" +Don't follow aliases when producing output. +.TP +.BI "\-e\fP, \fB\-\-encoding" " encoding" +Set the encoding used to write output files to +.IR encoding . 
+The default encoding is the invariant (subset of ASCII or EBCDIC) +codepage for the system (see section +.BR "INVARIANT CHARACTERS" ). +The choice of the encoding does not affect the data, just their +representation. Characters that cannot be represented in the +.I encoding +will be represented using +.BI \eu "hhhh" +escape sequences. +.TP +.BI "\-\-bom" +Write a byte order mark (BOM) at the beginning of the file. +.TP +.BI "\-l\fP, \fB\-\-locale" " locale" +Set the +.I locale +for the resource bundle, which is used both in the generated text and +as the base name of the output file. +.TP +.BI "\-t\fP, \fB\-\-truncate" " \fR[ \fPsize\fR ]\fP" +Truncate individual resources (strings or binary data) to +.I size +bytes. The default if +.I size +is not specified is +.B 80 +bytes. +.TP +.BI "\-s\fP, \fB\-\-sourcedir" " source" +Set the source directory to +.IR source . +The default source directory is the current directory. +If +.B - +is passed for +.IR source , +then the +.I bundle +will be looked for in its default location, specified by +the +.B ICU_DATA +environment variable (or defaulting to +the location set when ICU was built if +.B ICU_DATA +is not set). +.TP +.BI "\-d\fP, \fB\-\-destdir" " destination" +Set the destination directory to +.IR destination . +The default destination directory is specified by the environment variable +.BR ICU_DATA +or is the location set when ICU was built if +.B ICU_DATA +is not set. +.TP +.BI "\-i\fP, \fB\-\-icudatadir" " directory" +Look for any necessary ICU data files in +.IR directory . +For example, when processing collation overrides, the file +.B ucadata.dat +must be located. +The default ICU data directory is specified by the environment variable +.BR ICU_DATA . +.TP +.BI "\-c\fP, \fB\-\-to\-stdout" +Write the disassembled +.I bundle +on standard output instead of into a file. 
+.SH CAVEATS +When the option +.BI \-\-bom +is used, the character +.B U+FEFF +is written in the destination +.I encoding +regardless of whether it is a Unicode transformation format (UTF) or not. +This option should only be used with a UTF encoding, as byte order marks +are not meaningful for other encodings. +.SH INVARIANT CHARACTERS +The +.B invariant character set +consists of the following set of characters, expressed as a standard POSIX +regular expression: +.BR "[a-z]|[A-Z]|[0-9]|_| |+|-|*|/" . +This is the set which is guaranteed to be available regardless of code page. +.SH ENVIRONMENT +.TP 10 +.B ICU_DATA +Specifies the directory containing ICU data. Defaults to +.BR @thepkgicudatadir@/@PACKAGE@/@VERSION@/ . +Some tools in ICU depend on the presence of the trailing slash. It is thus +important to make sure that it is present if +.B ICU_DATA +is set. +.SH AUTHORS +Vladimir Weinstein +.br +Yves Arrouye +.SH VERSION +1.0 +.SH COPYRIGHT +Copyright (C) 2002 IBM, Inc. and others. +.SH SEE ALSO +.BR genrb (1) + diff --git a/intl/icu/source/tools/genrb/derb.cpp b/intl/icu/source/tools/genrb/derb.cpp new file mode 100644 index 0000000000..3b28289569 --- /dev/null +++ b/intl/icu/source/tools/genrb/derb.cpp @@ -0,0 +1,657 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 1999-2016, International Business Machines +* Corporation and others. All Rights Reserved. 
+* +******************************************************************************* +* file name: derb.cpp +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2000sep6 +* created by: Vladimir Weinstein as an ICU workshop example +* maintained by: Yves Arrouye <yves@realnames.com> +*/ + +#include "unicode/stringpiece.h" +#include "unicode/ucnv.h" +#include "unicode/unistr.h" +#include "unicode/ustring.h" +#include "unicode/putil.h" +#include "unicode/ustdio.h" + +#include "charstr.h" +#include "uresimp.h" +#include "cmemory.h" +#include "cstring.h" +#include "uoptions.h" +#include "toolutil.h" +#include "ustrfmt.h" + +#if !UCONFIG_NO_FORMATTING + +#define DERB_VERSION "1.1" + +#define DERB_DEFAULT_TRUNC 80 + +static const int32_t indentsize = 4; +static int32_t truncsize = DERB_DEFAULT_TRUNC; +static UBool opt_truncate = false; + +static const char *getEncodingName(const char *encoding); +static void reportError(const char *pname, UErrorCode *status, const char *when); +static char16_t *quotedString(const char16_t *string); +static void printOutBundle(UFILE *out, UResourceBundle *resource, int32_t indent, const char *pname, UErrorCode *status); +static void printString(UFILE *out, const char16_t *str, int32_t len); +static void printCString(UFILE *out, const char *str, int32_t len); +static void printIndent(UFILE *out, int32_t indent); +static void printHex(UFILE *out, uint8_t what); + +static UOption options[]={ + UOPTION_HELP_H, + UOPTION_HELP_QUESTION_MARK, +/* 2 */ UOPTION_ENCODING, +/* 3 */ { "to-stdout", nullptr, nullptr, nullptr, 'c', UOPT_NO_ARG, 0 } , +/* 4 */ { "truncate", nullptr, nullptr, nullptr, 't', UOPT_OPTIONAL_ARG, 0 }, +/* 5 */ UOPTION_VERBOSE, +/* 6 */ UOPTION_DESTDIR, +/* 7 */ UOPTION_SOURCEDIR, +/* 8 */ { "bom", nullptr, nullptr, nullptr, 0, UOPT_NO_ARG, 0 }, +/* 9 */ UOPTION_ICUDATADIR, +/* 10 */ UOPTION_VERSION, +/* 11 */ { "suppressAliases", nullptr, nullptr, nullptr, 'A', UOPT_NO_ARG, 0 }, +}; + +static UBool 
verbose = false; +static UBool suppressAliases = false; +static UFILE *ustderr = nullptr; + +extern int +main(int argc, char* argv[]) { + const char *encoding = nullptr; + const char *outputDir = nullptr; /* nullptr = no output directory, use current */ + const char *inputDir = "."; + int tostdout = 0; + int prbom = 0; + + const char *pname; + + UResourceBundle *bundle = nullptr; + int32_t i = 0; + + const char* arg; + + /* Get the name of tool. */ + pname = uprv_strrchr(*argv, U_FILE_SEP_CHAR); +#if U_FILE_SEP_CHAR != U_FILE_ALT_SEP_CHAR + if (!pname) { + pname = uprv_strrchr(*argv, U_FILE_ALT_SEP_CHAR); + } +#endif + if (!pname) { + pname = *argv; + } else { + ++pname; + } + + /* error handling, printing usage message */ + argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options); + + /* error handling, printing usage message */ + if(argc<0) { + fprintf(stderr, + "%s: error in command line argument \"%s\"\n", pname, + argv[-argc]); + } + if(argc<0 || options[0].doesOccur || options[1].doesOccur) { + fprintf(argc < 0 ? stderr : stdout, + "%csage: %s [ -h, -?, --help ] [ -V, --version ]\n" + " [ -v, --verbose ] [ -e, --encoding encoding ] [ --bom ]\n" + " [ -t, --truncate [ size ] ]\n" + " [ -s, --sourcedir source ] [ -d, --destdir destination ]\n" + " [ -i, --icudatadir directory ] [ -c, --to-stdout ]\n" + " [ -A, --suppressAliases]\n" + " bundle ...\n", argc < 0 ? 'u' : 'U', + pname); + return argc<0 ? 
U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; + } + + if(options[10].doesOccur) { + fprintf(stderr, + "%s version %s (ICU version %s).\n" + "%s\n", + pname, DERB_VERSION, U_ICU_VERSION, U_COPYRIGHT_STRING); + return U_ZERO_ERROR; + } + if(options[2].doesOccur) { + encoding = options[2].value; + } + + if (options[3].doesOccur) { + if(options[2].doesOccur) { + fprintf(stderr, "%s: Error: don't specify an encoding (-e) when writing to stdout (-c).\n", pname); + return 3; + } + tostdout = 1; + } + + if(options[4].doesOccur) { + opt_truncate = true; + if(options[4].value != nullptr) { + truncsize = atoi(options[4].value); /* user defined printable size */ + } else { + truncsize = DERB_DEFAULT_TRUNC; /* we'll use default omitting size */ + } + } else { + opt_truncate = false; + } + + if(options[5].doesOccur) { + verbose = true; + } + + if (options[6].doesOccur) { + outputDir = options[6].value; + } + + if(options[7].doesOccur) { + inputDir = options[7].value; /* we'll use users resources */ + } + + if (options[8].doesOccur) { + prbom = 1; + } + + if (options[9].doesOccur) { + u_setDataDirectory(options[9].value); + } + + if (options[11].doesOccur) { + suppressAliases = true; + } + + fflush(stderr); // use ustderr now. 
+ ustderr = u_finit(stderr, nullptr, nullptr); + + for (i = 1; i < argc; ++i) { + static const char16_t sp[] = { 0x0020 }; /* " " */ + + arg = getLongPathname(argv[i]); + + if (verbose) { + u_fprintf(ustderr, "processing bundle \"%s\"\n", argv[i]); + } + + icu::CharString locale; + UErrorCode status = U_ZERO_ERROR; + { + const char *p = findBasename(arg); + const char *q = uprv_strrchr(p, '.'); + if (q == nullptr) { + locale.append(p, status); + } else { + locale.append(p, (int32_t)(q - p), status); + } + } + if (U_FAILURE(status)) { + return status; + } + + icu::CharString infile; + const char *thename = nullptr; + UBool fromICUData = !uprv_strcmp(inputDir, "-"); + if (!fromICUData) { + UBool absfilename = *arg == U_FILE_SEP_CHAR; +#if U_PLATFORM_HAS_WIN32_API + if (!absfilename) { + absfilename = (uprv_strlen(arg) > 2 && isalpha(arg[0]) + && arg[1] == ':' && arg[2] == U_FILE_SEP_CHAR); + } +#endif + if (absfilename) { + thename = arg; + } else { + const char *q = uprv_strrchr(arg, U_FILE_SEP_CHAR); +#if U_FILE_SEP_CHAR != U_FILE_ALT_SEP_CHAR + if (q == nullptr) { + q = uprv_strrchr(arg, U_FILE_ALT_SEP_CHAR); + } +#endif + infile.append(inputDir, status); + if(q != nullptr) { + infile.appendPathPart(icu::StringPiece(arg, (int32_t)(q - arg)), status); + } + if (U_FAILURE(status)) { + return status; + } + thename = infile.data(); + } + } + if (thename) { + bundle = ures_openDirect(thename, locale.data(), &status); + } else { + bundle = ures_open(fromICUData ? 
0 : inputDir, locale.data(), &status); + } + if (U_SUCCESS(status)) { + UFILE *out = nullptr; + + const char *filename = 0; + const char *ext = 0; + + if (locale.isEmpty() || !tostdout) { + filename = findBasename(arg); + ext = uprv_strrchr(filename, '.'); + if (!ext) { + ext = uprv_strchr(filename, 0); + } + } + + if (tostdout) { + out = u_get_stdout(); + } else { + icu::CharString thefile; + if (outputDir) { + thefile.append(outputDir, status); + } + thefile.appendPathPart(filename, status); + if (*ext) { + thefile.truncate(thefile.length() - (int32_t)uprv_strlen(ext)); + } + thefile.append(".txt", status); + if (U_FAILURE(status)) { + return status; + } + + out = u_fopen(thefile.data(), "w", nullptr, encoding); + if (!out) { + u_fprintf(ustderr, "%s: couldn't create %s\n", pname, thefile.data()); + u_fclose(ustderr); + return 4; + } + } + + // now, set the callback. + ucnv_setFromUCallBack(u_fgetConverter(out), UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_C, 0, 0, &status); + if (U_FAILURE(status)) { + u_fprintf(ustderr, "%s: couldn't configure converter for encoding\n", pname); + u_fclose(ustderr); + if(!tostdout) { + u_fclose(out); + } + return 3; + } + + if (prbom) { /* XXX: Should be done only for UTFs */ + u_fputc(0xFEFF, out); + } + u_fprintf(out, "// -*- Coding: %s; -*-\n//\n", encoding ? 
encoding : getEncodingName(ucnv_getDefaultName())); + u_fprintf(out, "// This file was dumped by derb(8) from "); + if (thename) { + u_fprintf(out, "%s", thename); + } else if (fromICUData) { + u_fprintf(out, "the ICU internal %s locale", locale.data()); + } + + u_fprintf(out, "\n// derb(8) by Vladimir Weinstein and Yves Arrouye\n\n"); + + if (!locale.isEmpty()) { + u_fprintf(out, "%s", locale.data()); + } else { + u_fprintf(out, "%.*s%.*S", (int32_t)(ext - filename), filename, UPRV_LENGTHOF(sp), sp); + } + printOutBundle(out, bundle, 0, pname, &status); + + if (!tostdout) { + u_fclose(out); + } + } + else { + reportError(pname, &status, "opening resource file"); + } + + ures_close(bundle); + } + + return 0; +} + +static char16_t *quotedString(const char16_t *string) { + int len = u_strlen(string); + int alen = len; + const char16_t *sp; + char16_t *newstr, *np; + + for (sp = string; *sp; ++sp) { + switch (*sp) { + case '\n': + case 0x0022: + ++alen; + break; + } + } + + newstr = (char16_t *) uprv_malloc((1 + alen) * U_SIZEOF_UCHAR); + for (sp = string, np = newstr; *sp; ++sp) { + switch (*sp) { + case '\n': + *np++ = 0x005C; + *np++ = 0x006E; + break; + + case 0x0022: + *np++ = 0x005C; + U_FALLTHROUGH; + default: + *np++ = *sp; + break; + } + } + *np = 0; + + return newstr; +} + + +static void printString(UFILE *out, const char16_t *str, int32_t len) { + u_file_write(str, len, out); +} + +static void printCString(UFILE *out, const char *str, int32_t len) { + if(len==-1) { + u_fprintf(out, "%s", str); + } else { + u_fprintf(out, "%.*s", len, str); + } +} + +static void printIndent(UFILE *out, int32_t indent) { + icu::UnicodeString inchar(indent, 0x20, indent); + printString(out, inchar.getBuffer(), indent); +} + +static void printHex(UFILE *out, uint8_t what) { + static const char map[] = "0123456789ABCDEF"; + char16_t hex[2]; + + hex[0] = map[what >> 4]; + hex[1] = map[what & 0xf]; + + printString(out, hex, 2); +} + +static void printOutAlias(UFILE *out, 
UResourceBundle *parent, Resource r, const char *key, int32_t indent, const char *pname, UErrorCode *status) { + static const char16_t cr[] = { 0xA }; // LF + int32_t len = 0; + const char16_t* thestr = res_getAlias(&(parent->getResData()), r, &len); + char16_t *string = quotedString(thestr); + if(opt_truncate && len > truncsize) { + char msg[128]; + printIndent(out, indent); + snprintf(msg, sizeof(msg), "// WARNING: this resource, size %li is truncated to %li\n", + (long)len, (long)truncsize/2); + printCString(out, msg, -1); + len = truncsize; + } + if(U_SUCCESS(*status)) { + static const char16_t openStr[] = { 0x003A, 0x0061, 0x006C, 0x0069, 0x0061, 0x0073, 0x0020, 0x007B, 0x0020, 0x0022 }; /* ":alias { \"" */ + static const char16_t closeStr[] = { 0x0022, 0x0020, 0x007D, 0x0020 }; /* "\" } " */ + printIndent(out, indent); + if(key != nullptr) { + printCString(out, key, -1); + } + printString(out, openStr, UPRV_LENGTHOF(openStr)); + printString(out, string, len); + printString(out, closeStr, UPRV_LENGTHOF(closeStr)); + if(verbose) { + printCString(out, " // ALIAS", -1); + } + printString(out, cr, UPRV_LENGTHOF(cr)); + } else { + reportError(pname, status, "getting binary value"); + } + uprv_free(string); +} + +static void printOutBundle(UFILE *out, UResourceBundle *resource, int32_t indent, const char *pname, UErrorCode *status) +{ + static const char16_t cr[] = { 0xA }; // LF + +/* int32_t noOfElements = ures_getSize(resource);*/ + int32_t i = 0; + const char *key = ures_getKey(resource); + + switch(ures_getType(resource)) { + case URES_STRING : + { + int32_t len=0; + const char16_t* thestr = ures_getString(resource, &len, status); + char16_t *string = quotedString(thestr); + + /* TODO: String truncation */ + if(opt_truncate && len > truncsize) { + char msg[128]; + printIndent(out, indent); + snprintf(msg, sizeof(msg), "// WARNING: this resource, size %li is truncated to %li\n", + (long)len, (long)(truncsize/2)); + printCString(out, msg, -1); + len = 
truncsize/2; + } + printIndent(out, indent); + if(key != nullptr) { + static const char16_t openStr[] = { 0x0020, 0x007B, 0x0020, 0x0022 }; /* " { \"" */ + static const char16_t closeStr[] = { 0x0022, 0x0020, 0x007D }; /* "\" }" */ + printCString(out, key, (int32_t)uprv_strlen(key)); + printString(out, openStr, UPRV_LENGTHOF(openStr)); + printString(out, string, len); + printString(out, closeStr, UPRV_LENGTHOF(closeStr)); + } else { + static const char16_t openStr[] = { 0x0022 }; /* "\"" */ + static const char16_t closeStr[] = { 0x0022, 0x002C }; /* "\"," */ + + printString(out, openStr, UPRV_LENGTHOF(openStr)); + printString(out, string, (int32_t)(u_strlen(string))); + printString(out, closeStr, UPRV_LENGTHOF(closeStr)); + } + + if(verbose) { + printCString(out, "// STRING", -1); + } + printString(out, cr, UPRV_LENGTHOF(cr)); + + uprv_free(string); + } + break; + + case URES_INT : + { + static const char16_t openStr[] = { 0x003A, 0x0069, 0x006E, 0x0074, 0x0020, 0x007B, 0x0020 }; /* ":int { " */ + static const char16_t closeStr[] = { 0x0020, 0x007D }; /* " }" */ + char16_t num[20]; + + printIndent(out, indent); + if(key != nullptr) { + printCString(out, key, -1); + } + printString(out, openStr, UPRV_LENGTHOF(openStr)); + uprv_itou(num, 20, ures_getInt(resource, status), 10, 0); + printString(out, num, u_strlen(num)); + printString(out, closeStr, UPRV_LENGTHOF(closeStr)); + + if(verbose) { + printCString(out, "// INT", -1); + } + printString(out, cr, UPRV_LENGTHOF(cr)); + break; + } + case URES_BINARY : + { + int32_t len = 0; + const int8_t *data = (const int8_t *)ures_getBinary(resource, &len, status); + if(opt_truncate && len > truncsize) { + char msg[128]; + printIndent(out, indent); + snprintf(msg, sizeof(msg), "// WARNING: this resource, size %li is truncated to %li\n", + (long)len, (long)(truncsize/2)); + printCString(out, msg, -1); + len = truncsize; + } + if(U_SUCCESS(*status)) { + static const char16_t openStr[] = { 0x003A, 0x0062, 0x0069, 0x006E, 0x0061, 
0x0072, 0x0079, 0x0020, 0x007B, 0x0020 }; /* ":binary { " */ + static const char16_t closeStr[] = { 0x0020, 0x007D, 0x0020 }; /* " } " */ + printIndent(out, indent); + if(key != nullptr) { + printCString(out, key, -1); + } + printString(out, openStr, UPRV_LENGTHOF(openStr)); + for(i = 0; i<len; i++) { + printHex(out, *data++); + } + printString(out, closeStr, UPRV_LENGTHOF(closeStr)); + if(verbose) { + printCString(out, " // BINARY", -1); + } + printString(out, cr, UPRV_LENGTHOF(cr)); + } else { + reportError(pname, status, "getting binary value"); + } + } + break; + case URES_INT_VECTOR : + { + int32_t len = 0; + const int32_t *data = ures_getIntVector(resource, &len, status); + if(U_SUCCESS(*status)) { + static const char16_t openStr[] = { 0x003A, 0x0069, 0x006E, 0x0074, 0x0076, 0x0065, 0x0063, 0x0074, 0x006F, 0x0072, 0x0020, 0x007B, 0x0020 }; /* ":intvector { " */ + static const char16_t closeStr[] = { 0x0020, 0x007D, 0x0020 }; /* " } " */ + char16_t num[20]; + + printIndent(out, indent); + if(key != nullptr) { + printCString(out, key, -1); + } + printString(out, openStr, UPRV_LENGTHOF(openStr)); + for(i = 0; i < len - 1; i++) { + int32_t numLen = uprv_itou(num, 20, data[i], 10, 0); + num[numLen++] = 0x002C; /* ',' */ + num[numLen++] = 0x0020; /* ' ' */ + num[numLen] = 0; + printString(out, num, u_strlen(num)); + } + if(len > 0) { + uprv_itou(num, 20, data[len - 1], 10, 0); + printString(out, num, u_strlen(num)); + } + printString(out, closeStr, UPRV_LENGTHOF(closeStr)); + if(verbose) { + printCString(out, "// INTVECTOR", -1); + } + printString(out, cr, UPRV_LENGTHOF(cr)); + } else { + reportError(pname, status, "getting int vector"); + } + } + break; + case URES_TABLE : + case URES_ARRAY : + { + static const char16_t openStr[] = { 0x007B }; /* "{" */ + static const char16_t closeStr[] = { 0x007D, '\n' }; /* "}\n" */ + + UResourceBundle *t = nullptr; + ures_resetIterator(resource); + printIndent(out, indent); + if(key != nullptr) { + printCString(out, key, -1); 
+ } + printString(out, openStr, UPRV_LENGTHOF(openStr)); + if(verbose) { + if(ures_getType(resource) == URES_TABLE) { + printCString(out, "// TABLE", -1); + } else { + printCString(out, "// ARRAY", -1); + } + } + printString(out, cr, UPRV_LENGTHOF(cr)); + + if(suppressAliases == false) { + while(U_SUCCESS(*status) && ures_hasNext(resource)) { + t = ures_getNextResource(resource, t, status); + if(U_SUCCESS(*status)) { + printOutBundle(out, t, indent+indentsize, pname, status); + } else { + reportError(pname, status, "While processing table"); + *status = U_ZERO_ERROR; + } + } + } else { /* we have to use low level access to do this */ + Resource r; + int32_t resSize = ures_getSize(resource); + UBool isTable = (UBool)(ures_getType(resource) == URES_TABLE); + for(i = 0; i < resSize; i++) { + /* need to know if it's an alias */ + if(isTable) { + r = res_getTableItemByIndex(&resource->getResData(), resource->fRes, i, &key); + } else { + r = res_getArrayItem(&resource->getResData(), resource->fRes, i); + } + if(U_SUCCESS(*status)) { + if(res_getPublicType(r) == URES_ALIAS) { + printOutAlias(out, resource, r, key, indent+indentsize, pname, status); + } else { + t = ures_getByIndex(resource, i, t, status); + printOutBundle(out, t, indent+indentsize, pname, status); + } + } else { + reportError(pname, status, "While processing table"); + *status = U_ZERO_ERROR; + } + } + } + + printIndent(out, indent); + printString(out, closeStr, UPRV_LENGTHOF(closeStr)); + ures_close(t); + } + break; + default: + break; + } + +} + +static const char *getEncodingName(const char *encoding) { + UErrorCode err; + const char *enc; + + err = U_ZERO_ERROR; + if (!(enc = ucnv_getStandardName(encoding, "MIME", &err))) { + err = U_ZERO_ERROR; + if (!(enc = ucnv_getStandardName(encoding, "IANA", &err))) { + // do nothing + } + } + + return enc; +} + +static void reportError(const char *pname, UErrorCode *status, const char *when) { + u_fprintf(ustderr, "%s: error %d while %s: %s\n", pname, *status, 
when, u_errorName(*status)); +} + +#else +extern int +main(int argc, char* argv[]) { + /* Changing stdio.h ustdio.h requires that formatting not be disabled. */ + return 3; +} +#endif /* !UCONFIG_NO_FORMATTING */ + +/* + * Local Variables: + * indent-tabs-mode: nil + * End: + */ diff --git a/intl/icu/source/tools/genrb/derb.vcxproj b/intl/icu/source/tools/genrb/derb.vcxproj new file mode 100644 index 0000000000..f5ba9bf22f --- /dev/null +++ b/intl/icu/source/tools/genrb/derb.vcxproj @@ -0,0 +1,80 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <PropertyGroup Label="Globals"> + <ProjectGuid>{D3065ADB-8820-4CC7-9B6C-9510833961A3}</ProjectGuid> + </PropertyGroup> + <PropertyGroup Label="Configuration"> + <ConfigurationType>Application</ConfigurationType> + <UseOfMfc>false</UseOfMfc> + <CharacterSet>MultiByte</CharacterSet> + </PropertyGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> + <!-- The following import will include the 'default' configuration options for VS projects. --> + <Import Project="..\..\allinone\Build.Windows.ProjectConfiguration.props" /> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> + <ImportGroup Label="ExtensionSettings"> + </ImportGroup> + <PropertyGroup Label="UserMacros" /> + <PropertyGroup> + <_ProjectFileVersion>10.0.30319.1</_ProjectFileVersion> + <OutDir>.\$(Platform)\$(Configuration)\</OutDir> + <IntDir>.\$(Platform)\$(Configuration)\</IntDir> + <!-- The ICU projects use "Win32" to mean "x86", so we need to special case it. 
--> + <OutDir Condition="'$(Platform)'=='Win32'">.\x86\$(Configuration)\</OutDir> + <IntDir Condition="'$(Platform)'=='Win32'">.\x86\$(Configuration)\</IntDir> + <!-- Disable Incremental Linking for Release builds as it prevents Link-time Code Generation --> + <LinkIncremental Condition="'$(Configuration)'=='Debug'">true</LinkIncremental> + <LinkIncremental Condition="'$(Configuration)'=='Release'">false</LinkIncremental> + </PropertyGroup> + <!-- Options that are common to *all* configurations --> + <ItemDefinitionGroup> + <Midl> + <TypeLibraryName>$(OutDir)/derb.tlb</TypeLibraryName> + </Midl> + <ClCompile> + <WarningLevel>Level3</WarningLevel> + <CompileAs>Default</CompileAs> + <DisableLanguageExtensions>true</DisableLanguageExtensions> + <AdditionalIncludeDirectories>..\..\i18n;..\..\common;..\toolutil;..\..\io;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> + <PrecompiledHeaderOutputFile>$(OutDir)/derb.pch</PrecompiledHeaderOutputFile> + <AssemblerListingLocation>$(OutDir)/</AssemblerListingLocation> + <ObjectFileName>$(OutDir)/</ObjectFileName> + <ProgramDataBaseFileName>$(OutDir)/derb.pdb</ProgramDataBaseFileName> + </ClCompile> + <Link> + <SubSystem>Console</SubSystem> + <OutputFile>$(OutDir)/derb.exe</OutputFile> + <AdditionalLibraryDirectories>..\..\..\$(IcuLibOutputDir);%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories> + </Link> + <CustomBuildStep> + <Command>copy "$(TargetPath)" ..\..\..\$(IcuBinOutputDir)</Command> + <Outputs>..\..\..\$(IcuBinOutputDir)\$(TargetFileName);%(Outputs)</Outputs> + </CustomBuildStep> + </ItemDefinitionGroup> + <!-- Options that are common to all 'Debug' project configurations --> + <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'"> + <ClCompile> + <BrowseInformation>true</BrowseInformation> + <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary> + </ClCompile> + <Link> + 
<AdditionalDependencies>icuucd.lib;icuind.lib;icuiod.lib;icutud.lib;%(AdditionalDependencies)</AdditionalDependencies> + </Link> + </ItemDefinitionGroup> + <!-- Options that are common to all 'Release' project configurations --> + <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'"> + <ClCompile> + <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary> + <FunctionLevelLinking>true</FunctionLevelLinking> + </ClCompile> + <Link> + <AdditionalDependencies>icuuc.lib;icuin.lib;icuio.lib;icutu.lib;%(AdditionalDependencies)</AdditionalDependencies> + </Link> + </ItemDefinitionGroup> + <ItemGroup> + <ClCompile Include="derb.cpp" /> + </ItemGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> + <ImportGroup Label="ExtensionTargets"> + </ImportGroup> +</Project>
\ No newline at end of file diff --git a/intl/icu/source/tools/genrb/derb.vcxproj.filters b/intl/icu/source/tools/genrb/derb.vcxproj.filters new file mode 100644 index 0000000000..c62d612888 --- /dev/null +++ b/intl/icu/source/tools/genrb/derb.vcxproj.filters @@ -0,0 +1,22 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <ItemGroup> + <Filter Include="Source Files"> + <UniqueIdentifier>{b10d3c34-0b4c-43e9-9c28-e17fdabee575}</UniqueIdentifier> + <Extensions>cpp;c;cxx;rc;def;r;odl;idl;hpj;bat</Extensions> + </Filter> + <Filter Include="Header Files"> + <UniqueIdentifier>{0f0a70a2-7e7e-4e7a-88ab-b3bf739fabed}</UniqueIdentifier> + <Extensions>h;hpp;hxx;hm;inl</Extensions> + </Filter> + <Filter Include="Resource Files"> + <UniqueIdentifier>{ac6d5215-57af-486d-81ed-badc17745780}</UniqueIdentifier> + <Extensions>ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe</Extensions> + </Filter> + </ItemGroup> + <ItemGroup> + <ClCompile Include="derb.cpp"> + <Filter>Source Files</Filter> + </ClCompile> + </ItemGroup> +</Project>
\ No newline at end of file diff --git a/intl/icu/source/tools/genrb/errmsg.c b/intl/icu/source/tools/genrb/errmsg.c new file mode 100644 index 0000000000..a99d797ec5 --- /dev/null +++ b/intl/icu/source/tools/genrb/errmsg.c @@ -0,0 +1,75 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 1998-2011, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* +* File error.c +* +* Modification History: +* +* Date Name Description +* 05/28/99 stephen Creation. +******************************************************************************* +*/ + +#include <stdarg.h> +#include <stdbool.h> +#include <stdio.h> +#include "cstring.h" +#include "errmsg.h" +#include "toolutil.h" + +U_CFUNC void error(uint32_t linenumber, const char *msg, ...) +{ + va_list va; + + va_start(va, msg); + fprintf(stderr, "%s:%u: ", gCurrentFileName, (int)linenumber); + vfprintf(stderr, msg, va); + fprintf(stderr, "\n"); + va_end(va); +} + +static UBool gShowWarning = true; + +U_CFUNC void setShowWarning(UBool val) +{ + gShowWarning = val; +} + +U_CFUNC UBool getShowWarning(){ + return gShowWarning; +} + +static UBool gStrict =false; +U_CFUNC UBool isStrict(){ + return gStrict; +} +U_CFUNC void setStrict(UBool val){ + gStrict = val; +} +static UBool gVerbose =false; +U_CFUNC UBool isVerbose(){ + return gVerbose; +} +U_CFUNC void setVerbose(UBool val){ + gVerbose = val; +} +U_CFUNC void warning(uint32_t linenumber, const char *msg, ...) 
+{ + if (gShowWarning) + { + va_list va; + + va_start(va, msg); + fprintf(stderr, "%s:%u: warning: ", gCurrentFileName, (int)linenumber); + vfprintf(stderr, msg, va); + fprintf(stderr, "\n"); + va_end(va); + } +} diff --git a/intl/icu/source/tools/genrb/errmsg.h b/intl/icu/source/tools/genrb/errmsg.h new file mode 100644 index 0000000000..e01b9558f0 --- /dev/null +++ b/intl/icu/source/tools/genrb/errmsg.h @@ -0,0 +1,46 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 1998-2016, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* +* File error.h +* +* Modification History: +* +* Date Name Description +* 05/28/99 stephen Creation. +******************************************************************************* +*/ + +#ifndef ERROR_H +#define ERROR_H 1 + +#include "unicode/utypes.h" + +U_CDECL_BEGIN + +extern const char *gCurrentFileName; + +U_CFUNC void error(uint32_t linenumber, const char *msg, ...); +U_CFUNC void warning(uint32_t linenumber, const char *msg, ...); + +/* Show warnings? */ +U_CFUNC void setShowWarning(UBool val); +U_CFUNC UBool getShowWarning(void); + +/* strict */ +U_CFUNC void setStrict(UBool val); +U_CFUNC UBool isStrict(void); + +/* verbosity */ +U_CFUNC void setVerbose(UBool val); +U_CFUNC UBool isVerbose(void); + +U_CDECL_END + +#endif diff --git a/intl/icu/source/tools/genrb/filterrb.cpp b/intl/icu/source/tools/genrb/filterrb.cpp new file mode 100644 index 0000000000..dcc02fc621 --- /dev/null +++ b/intl/icu/source/tools/genrb/filterrb.cpp @@ -0,0 +1,239 @@ +// © 2018 and later: Unicode, Inc. and others. 
+// License & terms of use: http://www.unicode.org/copyright.html + +#include <iostream> +#include <stack> + +#include "filterrb.h" +#include "errmsg.h" + + +const char* PathFilter::kEInclusionNames[] = { + "INCLUDE", + "PARTIAL", + "EXCLUDE" +}; + + +ResKeyPath::ResKeyPath() {} +/* Splits an absolute path such as "/a/b" into its keys. Sets U_PARSE_ERROR when the path does not start with '/' or contains an empty key (a doubled or trailing slash); a lone "/" gives the empty path. */ +ResKeyPath::ResKeyPath(const std::string& path, UErrorCode& status) { + if (path.empty() || path[0] != '/') { + std::cerr << "genrb error: path must start with /: " << path << std::endl; + status = U_PARSE_ERROR; + return; + } + if (path.length() == 1) { + return; + } + size_t i; + size_t j = 0; + while (true) { + i = j + 1; + j = path.find('/', i); + std::string key = path.substr(i, j - i); + if (key.empty()) { + std::cerr << "genrb error: empty subpaths and trailing slashes are not allowed: " << path << std::endl; + status = U_PARSE_ERROR; + return; + } + push(key); + if (j == std::string::npos) { + break; + } + } +} + +void ResKeyPath::push(const std::string& key) { + fPath.push_back(key); +} + +void ResKeyPath::pop() { + fPath.pop_back(); +} + +const std::list<std::string>& ResKeyPath::pieces() const { + return fPath; +} +/* Writes the path as "/key1/key2"; the empty path is written as "/". */ +std::ostream& operator<<(std::ostream& out, const ResKeyPath& value) { + if (value.pieces().empty()) { + out << "/"; + } else for (auto& key : value.pieces()) { + out << "/" << key; + } + return out; +} + + +PathFilter::~PathFilter() = default; + +/* Parses one rule line: '+' (include) or '-' (exclude) followed by a path. Sets U_PARSE_ERROR for an empty line or any other first character. */ +void SimpleRuleBasedPathFilter::addRule(const std::string& ruleLine, UErrorCode& status) { + if (ruleLine.empty()) { + std::cerr << "genrb error: empty filter rules are not allowed" << std::endl; + status = U_PARSE_ERROR; + return; + } + bool inclusionRule = false; + if (ruleLine[0] == '+') { + inclusionRule = true; + } else if (ruleLine[0] != '-') { + std::cerr << "genrb error: rules must start with + or -: " << ruleLine << std::endl; + status = U_PARSE_ERROR; + return; + } + ResKeyPath path(ruleLine.substr(1), status); + addRule(path, inclusionRule, status); +} +/* Applies an already-parsed rule to the filter tree; does nothing when status already holds a failure. */ +void SimpleRuleBasedPathFilter::addRule(const 
ResKeyPath& path, bool inclusionRule, UErrorCode& status) { + if (U_FAILURE(status)) { + return; + } + fRoot.applyRule(path, path.pieces().begin(), inclusionRule, status); +} +/* Walks the filter tree along the keys of path. Returns PARTIAL when the walk ends on a node that still has children or a wildcard; otherwise returns that node's own INCLUDE/EXCLUDE, or the nearest non-PARTIAL value seen on the way down. */ +PathFilter::EInclusion SimpleRuleBasedPathFilter::match(const ResKeyPath& path) const { + const Tree* node = &fRoot; + + // defaultResult "bubbles up" the nearest "definite" inclusion/exclusion rule + EInclusion defaultResult = INCLUDE; + if (node->fIncluded != PARTIAL) { + // rules handled here: "+/" and "-/" + defaultResult = node->fIncluded; + } + + // isLeaf is whether the filter tree can provide no additional information + // even if additional subpaths are added to the given key + bool isLeaf = false; + + for (auto& key : path.pieces()) { + auto child = node->fChildren.find(key); + // Leaf case 1: input path descends outside the filter tree + if (child == node->fChildren.end()) { + if (node->fWildcard) { + // A wildcard pattern is present; continue checking + node = node->fWildcard.get(); + } else { + isLeaf = true; + break; + } + } else { + node = &child->second; + } + if (node->fIncluded != PARTIAL) { + defaultResult = node->fIncluded; + } + } + + // Leaf case 2: input path exactly matches a filter leaf + if (node->isLeaf()) { + isLeaf = true; + } + + // Always return PARTIAL if we are not at a leaf + if (!isLeaf) { + return PARTIAL; + } + + // If leaf node is PARTIAL, return the default + if (node->fIncluded == PARTIAL) { + return defaultResult; + } + + return node->fIncluded; +} + +/* Deep copy: the children map is copied by value and the wildcard subtree, if any, is cloned. */ +SimpleRuleBasedPathFilter::Tree::Tree(const Tree& other) + : fIncluded(other.fIncluded), fChildren(other.fChildren) { + // Note: can't use the default copy constructor because of the std::unique_ptr + if (other.fWildcard) { + fWildcard.reset(new Tree(*other.fWildcard)); + } +} +/* True when this node has neither named children nor a wildcard subtree. */ +bool SimpleRuleBasedPathFilter::Tree::isLeaf() const { + return fChildren.empty() && !fWildcard; +} +/* Applies the rule for path to this subtree, starting at the key that it points to. it is passed by value, so moving it here does not affect the caller. */ +void SimpleRuleBasedPathFilter::Tree::applyRule( + const ResKeyPath& path, + std::list<std::string>::const_iterator it, + 
bool inclusionRule, + UErrorCode& status) { + + // Base Case + if (it == path.pieces().end()) { + if (isVerbose() && (fIncluded != PARTIAL || !isLeaf())) { + std::cout << "genrb info: rule on path " << path + << " overrides previous rules" << std::endl; + } + fIncluded = inclusionRule ? INCLUDE : EXCLUDE; + fChildren.clear(); + fWildcard.reset(); + return; + } + + // Recursive Step + auto& key = *it; + if (key == "*") { + // Case 1: Wildcard + if (!fWildcard) { + fWildcard.reset(new Tree()); + } + // Apply the rule to fWildcard and also to all existing children. + it++; + fWildcard->applyRule(path, it, inclusionRule, status); + for (auto& child : fChildren) { + child.second.applyRule(path, it, inclusionRule, status); + } + it--; + + } else { + // Case 2: Normal Key + auto search = fChildren.find(key); + if (search == fChildren.end()) { + if (fWildcard) { + // Deep-copy the existing wildcard tree into the new key + search = fChildren.emplace(key, Tree(*fWildcard)).first; + } else { + search = fChildren.emplace(key, Tree()).first; + } + } + it++; + search->second.applyRule(path, it, inclusionRule, status); + it--; + } +} +/* Prints this node's inclusion value, then each child and the wildcard subtree in braces, indenting every line with indent tab characters. */ +void SimpleRuleBasedPathFilter::Tree::print(std::ostream& out, int32_t indent) const { + for (int32_t i=0; i<indent; i++) out << "\t"; + out << "included: " << kEInclusionNames[fIncluded] << std::endl; + for (auto& child : fChildren) { + for (int32_t i=0; i<indent; i++) out << "\t"; + out << child.first << ": {" << std::endl; + child.second.print(out, indent + 1); + for (int32_t i=0; i<indent; i++) out << "\t"; + out << "}" << std::endl; + } + if (fWildcard) { + for (int32_t i=0; i<indent; i++) out << "\t"; + out << "* {" << std::endl; + fWildcard->print(out, indent + 1); + for (int32_t i=0; i<indent; i++) out << "\t"; + out << "}" << std::endl; + } +} +/* Prints the whole rule tree wrapped in "SimpleRuleBasedPathFilter { ... }". */ +void SimpleRuleBasedPathFilter::print(std::ostream& out) const { + out << "SimpleRuleBasedPathFilter {" << std::endl; + fRoot.print(out, 1); + out << "}" << std::endl; +} +/* Stream form of SimpleRuleBasedPathFilter::print(). */ +std::ostream& 
operator<<(std::ostream& out, const SimpleRuleBasedPathFilter& value) { + value.print(out); + return out; +} diff --git a/intl/icu/source/tools/genrb/filterrb.h b/intl/icu/source/tools/genrb/filterrb.h new file mode 100644 index 0000000000..cf54766041 --- /dev/null +++ b/intl/icu/source/tools/genrb/filterrb.h @@ -0,0 +1,180 @@ +// © 2018 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html + +#ifndef __FILTERRB_H__ +#define __FILTERRB_H__ + +#include <list> +#include <map> +#include <memory> +#include <ostream> +#include <string> + +#include "unicode/utypes.h" + + +/** + * Represents an absolute path into a resource bundle. + * For example: "/units/length/meter" + */ +class ResKeyPath { +public: + /** Constructs an empty path (top of tree) */ + ResKeyPath(); + + /** Constructs from a string path */ + ResKeyPath(const std::string& path, UErrorCode& status); + + /** Appends a key to the end of the path */ + void push(const std::string& key); + /** Removes the last key of the path */ + void pop(); + + /** Returns the keys of the path, in order from the top of the tree */ + const std::list<std::string>& pieces() const; + + private: + std::list<std::string> fPath; +}; + +std::ostream& operator<<(std::ostream& out, const ResKeyPath& value); + + +/** + * Interface used to determine whether to include or reject pieces of a + * resource bundle based on their absolute path. + */ +class PathFilter { +public: + enum EInclusion { + INCLUDE, + PARTIAL, + EXCLUDE + }; + + /** Printable names, indexed by EInclusion value */ + static const char* kEInclusionNames[]; + + virtual ~PathFilter(); + + /** + * Returns an EInclusion on whether or not the given path should be included. + * + * INCLUDE = include the whole subtree + * PARTIAL = recurse into the subtree + * EXCLUDE = reject the whole subtree + */ + virtual EInclusion match(const ResKeyPath& path) const = 0; +}; + + +/** + * Implementation of PathFilter for a list of inclusion/exclusion rules. + * + * The wildcard pattern "*" means that the subsequent filters are applied to + * every other tree sharing the same parent. 
+ * + * For example, given this list of filter rules: + */ +// -/alabama +// +/alabama/alaska/arizona +// -/fornia/hawaii +// -/mississippi +// +/mississippi/michigan +// +/mississippi/*/maine +// -/mississippi/*/iowa +// +/mississippi/louisiana/iowa +/* + * You get the following structure: + * + * SimpleRuleBasedPathFilter { + * included: PARTIAL + * alabama: { + * included: EXCLUDE + * alaska: { + * included: PARTIAL + * arizona: { + * included: INCLUDE + * } + * } + * } + * fornia: { + * included: PARTIAL + * hawaii: { + * included: EXCLUDE + * } + * } + * mississippi: { + * included: EXCLUDE + * louisiana: { + * included: PARTIAL + * iowa: { + * included: INCLUDE + * } + * maine: { + * included: INCLUDE + * } + * } + * michigan: { + * included: INCLUDE + * iowa: { + * included: EXCLUDE + * } + * maine: { + * included: INCLUDE + * } + * } + * * { + * included: PARTIAL + * iowa: { + * included: EXCLUDE + * } + * maine: { + * included: INCLUDE + * } + * } + * } + * } + */ +class SimpleRuleBasedPathFilter : public PathFilter { +public: + /** Adds a rule given as text: '+' (include) or '-' (exclude) followed by an absolute path. */ + void addRule(const std::string& ruleLine, UErrorCode& status); + /** Adds an already-parsed rule; inclusionRule is true for a '+' rule and false for a '-' rule. */ + void addRule(const ResKeyPath& path, bool inclusionRule, UErrorCode& status); + + /** Evaluates the rules added so far against the given path. */ + EInclusion match(const ResKeyPath& path) const override; + + /** Writes the rule tree to out in the nested form shown in the example above. */ + void print(std::ostream& out) const; + +private: + struct Tree { + + Tree() = default; + + /** Copy constructor */ + Tree(const Tree& other); + + /** + * Information on the USER-SPECIFIED inclusion/exclusion. 
+ * + * INCLUDE = this path exactly matches a "+" rule + * PARTIAL = this path does not match any rule, but subpaths exist + * EXCLUDE = this path exactly matches a "-" rule + */ + EInclusion fIncluded = PARTIAL; + /** Subtrees for explicitly named keys. */ + std::map<std::string, Tree> fChildren; + /** Subtree for the "*" wildcard; null when no wildcard rule applies at this level. */ + std::unique_ptr<Tree> fWildcard; + + /** Applies a rule to this subtree, starting at the key of path that it points to. */ + void applyRule( + const ResKeyPath& path, + std::list<std::string>::const_iterator it, + bool inclusionRule, + UErrorCode& status); + + /** Returns true when this node has no named children and no wildcard subtree. */ + bool isLeaf() const; + + /** Prints this subtree, indenting each line with indent tab characters. */ + void print(std::ostream& out, int32_t indent) const; + }; + + /** Root of the rule tree; rules on the path "/" apply to it directly. */ + Tree fRoot; +}; + +std::ostream& operator<<(std::ostream& out, const SimpleRuleBasedPathFilter& value); + + +#endif //__FILTERRB_H__ diff --git a/intl/icu/source/tools/genrb/genrb.1.in b/intl/icu/source/tools/genrb/genrb.1.in new file mode 100644 index 0000000000..a457719238 --- /dev/null +++ b/intl/icu/source/tools/genrb/genrb.1.in @@ -0,0 +1,148 @@ +.\" Hey, Emacs! This is -*-nroff-*- you know... +.\" +.\" genrb.1: manual page for the genrb utility +.\" +.\" Copyright (C) 2016 and later: Unicode, Inc. and others. +.\" License & terms of use: http://www.unicode.org/copyright.html +.\" Copyright (C) 2000-2002 IBM, Inc. and others. +.\" +.\" Manual page by Yves Arrouye <yves@realnames.com>. +.\" +.TH GENRB 1 "16 April 2002" "ICU MANPAGE" "ICU @VERSION@ Manual" +.SH NAME +.B genrb +\- compile a resource bundle +.SH SYNOPSIS +.B genrb +[ +.BR "\-h\fP, \fB\-?\fP, \fB\-\-help" +] +[ +.BR "\-V\fP, \fB\-\-version" +] +[ +.BR "\-v\fP, \fB\-\-verbose" +] +[ +.BI "\-e\fP, \fB\-\-encoding" " encoding" +] +[ +.BI "\-j\fP, \fB\-\-write\-java" " \fR[ \fPencoding\fR ]\fP" +] +[ +.BI "\-s\fP, \fB\-\-sourcedir" " source" +] +[ +.BI "\-d\fP, \fB\-\-destdir" " destination" +] +[ +.BI "\-i\fP, \fB\-\-icudatadir" " directory" +] +.IR bundle " \.\.\." +.SH DESCRIPTION +.B genrb +converts the resource +.I bundle +source files passed on the command line to their binary form or to +a Java source file for use with ICU4J. 
+The resulting binary files have a +.B .res +extension while resource bundle source files typically have a +.B .txt +extension. Java source files have a +.B .java +extension and follow the ICU4J naming conventions. +.PP +It is customary to name the resource bundles by their locale name, +i.e. to use a locale identifier for the +.I bundle +filename, e.g. +.B ja_JP.txt +for Japanese (Japan) data, or +.B root.txt +for the root bundle. +In any case, +.B genrb +will produce a file whose base name is the name of the locale found +in the resource file, not the base name of the resource file itself. +.PP +The binary files can be read directly by ICU, or used by +.BR pkgdata (1) +for incorporation into a larger archive or library. +.SH OPTIONS +.TP +.BR "\-h\fP, \fB\-?\fP, \fB\-\-help" +Print help about usage and exit. +.TP +.BR "\-V\fP, \fB\-\-version" +Print the version of +.B genrb +and exit. +.TP +.BR "\-v\fP, \fB\-\-verbose" +Display extra informative messages during execution. +.TP +.BI "\-e\fP, \fB\-\-encoding" " encoding" +Set the encoding used to read input files to +.IR encoding . +The default encoding is the invariant (subset of ASCII or EBCDIC) +codepage for the system (see section +.BR "INVARIANT CHARACTERS" ). +The encodings UTF-8, UTF-16BE, and UTF-16LE are automatically detected +if a byte order mark (BOM) is present. +.TP +.BI "\-j\fP, \fB\-\-write\-java" " \fR[ \fPencoding\fR ]\fP" +Generate a Java source file for use with ICU4J. An optional +.I encoding +for the Java file can be given. +.TP +.BI "\-s\fP, \fB\-\-sourcedir" " source" +Set the source directory to +.IR source . +The default source directory is specified by the environment variable +.BR ICU_DATA , +or the location set when ICU was built if +.B ICU_DATA +is not set. +.TP +.BI "\-d\fP, \fB\-\-destdir" " destination" +Set the destination directory to +.IR destination . 
+The default destination directory is specified by the environment variable +.BR ICU_DATA +or is the location set when ICU was built if +.B ICU_DATA +is not set. +.TP +.BI "\-i\fP, \fB\-\-icudatadir" " directory" +Look for any necessary ICU data files in +.IR directory . +For example, when processing collation overrides, the file +.B ucadata.dat +must be located. +The default ICU data directory is specified by the environment variable +.BR ICU_DATA . +.SH INVARIANT CHARACTERS +The +.B invariant character set +consists of the following set of characters, expressed as a standard POSIX +regular expression: +.BR "[a-z]|[A-Z]|[0-9]|_| |+|-|*|/" . +This is the set which is guaranteed to be available regardless of code page. +.SH ENVIRONMENT +.TP 10 +.B ICU_DATA +Specifies the directory containing ICU data. Defaults to +.BR @thepkgicudatadir@/@PACKAGE@/@VERSION@/ . +Some tools in ICU depend on the presence of the trailing slash. It is thus +important to make sure that it is present if +.B ICU_DATA +is set. +.SH VERSION +@VERSION@ +.SH COPYRIGHT +Copyright (C) 2000-2002 IBM, Inc. and others. +.SH SEE ALSO +.BR derb (1) +.br +.BR pkgdata (1) diff --git a/intl/icu/source/tools/genrb/genrb.cpp b/intl/icu/source/tools/genrb/genrb.cpp new file mode 100644 index 0000000000..fbf396d468 --- /dev/null +++ b/intl/icu/source/tools/genrb/genrb.cpp @@ -0,0 +1,869 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 1998-2016, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* +* File genrb.cpp +* +* Modification History: +* +* Date Name Description +* 05/25/99 stephen Creation. 
+* 5/10/01 Ram removed ustdio dependency +******************************************************************************* +*/ + +#include <fstream> +#include <iostream> +#include <list> +#include <string> + +#include <assert.h> +#include "genrb.h" +#include "unicode/localpointer.h" +#include "unicode/uclean.h" +#include "unicode/utf16.h" +#include "charstr.h" +#include "cmemory.h" +#include "filterrb.h" +#include "reslist.h" +#include "ucmndata.h" /* TODO: for reading the pool bundle */ +#include "collationroot.h" + +U_NAMESPACE_USE + +/* Protos */ +void processFile(const char *filename, const char* cp, + const char *inputDir, const char *outputDir, const char *filterDir, + const char *packageName, + SRBRoot *newPoolBundle, UBool omitBinaryCollation, UErrorCode &status); +static char *make_res_filename(const char *filename, const char *outputDir, + const char *packageName, UErrorCode &status); + +/* File suffixes */ +#define RES_SUFFIX ".res" +#define COL_SUFFIX ".col" + +const char *gCurrentFileName = nullptr; +#ifdef XP_MAC_CONSOLE +#include <console.h> +#endif + +void ResFile::close() { + delete[] fBytes; + fBytes = nullptr; + delete fStrings; + fStrings = nullptr; +} + +enum +{ + HELP1, + HELP2, + VERBOSE, + QUIET, + VERSION, + SOURCEDIR, + DESTDIR, + ENCODING, + ICUDATADIR, + WRITE_JAVA, + COPYRIGHT, + JAVA_PACKAGE, + BUNDLE_NAME, + WRITE_XLIFF, + STRICT, + NO_BINARY_COLLATION, + LANGUAGE, + NO_COLLATION_RULES, + FORMAT_VERSION, + WRITE_POOL_BUNDLE, + USE_POOL_BUNDLE, + INCLUDE_UNIHAN_COLL, + FILTERDIR, + ICU4X_MODE, + UCADATA +}; + +UOption options[]={ + UOPTION_HELP_H, + UOPTION_HELP_QUESTION_MARK, + UOPTION_VERBOSE, + UOPTION_QUIET, + UOPTION_VERSION, + UOPTION_SOURCEDIR, + UOPTION_DESTDIR, + UOPTION_ENCODING, + UOPTION_ICUDATADIR, + UOPTION_WRITE_JAVA, + UOPTION_COPYRIGHT, + UOPTION_DEF("java-package", '\x01', UOPT_REQUIRES_ARG), + UOPTION_BUNDLE_NAME, + UOPTION_DEF("write-xliff", 'x', UOPT_OPTIONAL_ARG), + UOPTION_DEF("strict", 'k', UOPT_NO_ARG), /* 14 */ 
+ UOPTION_DEF("noBinaryCollation", 'C', UOPT_NO_ARG),/* 15 */ + UOPTION_DEF("language", 'l', UOPT_REQUIRES_ARG), /* 16 */ + UOPTION_DEF("omitCollationRules", 'R', UOPT_NO_ARG),/* 17 */ + UOPTION_DEF("formatVersion", '\x01', UOPT_REQUIRES_ARG),/* 18 */ + UOPTION_DEF("writePoolBundle", '\x01', UOPT_OPTIONAL_ARG),/* 19 */ + UOPTION_DEF("usePoolBundle", '\x01', UOPT_OPTIONAL_ARG),/* 20 */ + UOPTION_DEF("includeUnihanColl", '\x01', UOPT_NO_ARG),/* 21 */ /* temporary, don't display in usage info */ + UOPTION_DEF("filterDir", '\x01', UOPT_OPTIONAL_ARG), /* 22 */ + UOPTION_DEF("icu4xMode", 'X', UOPT_NO_ARG),/* 23 */ + UOPTION_DEF("ucadata", '\x01', UOPT_REQUIRES_ARG),/* 24 */ + }; + +static UBool write_java = false; +static UBool write_xliff = false; +static const char* outputEnc =""; + +static ResFile poolBundle; + +/*added by Jing*/ +static const char* language = nullptr; +static const char* xliffOutputFileName = nullptr; +int +main(int argc, + char* argv[]) +{ + UErrorCode status = U_ZERO_ERROR; + const char *arg = nullptr; + const char *outputDir = nullptr; /* nullptr = no output directory, use current */ + const char *inputDir = nullptr; + const char *filterDir = nullptr; + const char *encoding = ""; + int i; + UBool illegalArg = false; + + U_MAIN_INIT_ARGS(argc, argv); + + options[JAVA_PACKAGE].value = "com.ibm.icu.impl.data"; + options[BUNDLE_NAME].value = "LocaleElements"; + argc = u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options); + + /* error handling, printing usage message */ + if(argc<0) { + fprintf(stderr, "%s: error in command line argument \"%s\"\n", argv[0], argv[-argc]); + illegalArg = true; + } else if(argc<2) { + illegalArg = true; + } + if(options[WRITE_POOL_BUNDLE].doesOccur && options[USE_POOL_BUNDLE].doesOccur) { + fprintf(stderr, "%s: cannot combine --writePoolBundle and --usePoolBundle\n", argv[0]); + illegalArg = true; + } + if (options[ICU4X_MODE].doesOccur && !options[UCADATA].doesOccur) { + fprintf(stderr, "%s: --icu4xMode requires 
--ucadata\n", argv[0]); + illegalArg = true; + } + if(options[FORMAT_VERSION].doesOccur) { + const char *s = options[FORMAT_VERSION].value; + if(uprv_strlen(s) != 1 || (s[0] < '1' && '3' < s[0])) { + fprintf(stderr, "%s: unsupported --formatVersion %s\n", argv[0], s); + illegalArg = true; + } else if(s[0] == '1' && + (options[WRITE_POOL_BUNDLE].doesOccur || options[USE_POOL_BUNDLE].doesOccur) + ) { + fprintf(stderr, "%s: cannot combine --formatVersion 1 with --writePoolBundle or --usePoolBundle\n", argv[0]); + illegalArg = true; + } else { + setFormatVersion(s[0] - '0'); + } + } + + if((options[JAVA_PACKAGE].doesOccur || options[BUNDLE_NAME].doesOccur) && + !options[WRITE_JAVA].doesOccur) { + fprintf(stderr, + "%s error: command line argument --java-package or --bundle-name " + "without --write-java\n", + argv[0]); + illegalArg = true; + } + + if(options[VERSION].doesOccur) { + fprintf(stderr, + "%s version %s (ICU version %s).\n" + "%s\n", + argv[0], GENRB_VERSION, U_ICU_VERSION, U_COPYRIGHT_STRING); + if(!illegalArg) { + return U_ZERO_ERROR; + } + } + + if(illegalArg || options[HELP1].doesOccur || options[HELP2].doesOccur) { + /* + * Broken into chunks because the C89 standard says the minimum + * required supported string length is 509 bytes. + */ + fprintf(stderr, + "Usage: %s [OPTIONS] [FILES]\n" + "\tReads the list of resource bundle source files and creates\n" + "\tbinary version of resource bundles (.res files)\n", + argv[0]); + fprintf(stderr, + "Options:\n" + "\t-h or -? 
or --help this usage text\n" + "\t-q or --quiet do not display warnings\n" + "\t-v or --verbose print extra information when processing files\n" + "\t-V or --version prints out version number and exits\n" + "\t-c or --copyright include copyright notice\n"); + fprintf(stderr, + "\t-e or --encoding encoding of source files\n" + "\t-d or --destdir destination directory, followed by the path, defaults to '%s'\n" + "\t-s or --sourcedir source directory for files followed by path, defaults to '%s'\n" + "\t-i or --icudatadir directory for locating any needed intermediate data files,\n" + "\t followed by path, defaults to '%s'\n", + u_getDataDirectory(), u_getDataDirectory(), u_getDataDirectory()); + fprintf(stderr, + "\t-j or --write-java write a Java ListResourceBundle for ICU4J, followed by optional encoding\n" + "\t defaults to ASCII and \\uXXXX format.\n" + "\t --java-package For --write-java: package name for writing the ListResourceBundle,\n" + "\t defaults to com.ibm.icu.impl.data\n"); + fprintf(stderr, + "\t-b or --bundle-name For --write-java: root resource bundle name for writing the ListResourceBundle,\n" + "\t defaults to LocaleElements\n" + "\t-x or --write-xliff write an XLIFF file for the resource bundle. 
Followed by\n" + "\t an optional output file name.\n" + "\t-k or --strict use pedantic parsing of syntax\n" + /*added by Jing*/ + "\t-l or --language for XLIFF: language code compliant with BCP 47.\n"); + fprintf(stderr, + "\t-C or --noBinaryCollation do not generate binary collation image;\n" + "\t makes .res file smaller but collator instantiation much slower;\n" + "\t maintains ability to get tailoring rules\n" + "\t-R or --omitCollationRules do not include collation (tailoring) rules;\n" + "\t makes .res file smaller and maintains collator instantiation speed\n" + "\t but tailoring rules will not be available (they are rarely used)\n"); + fprintf(stderr, + "\t --formatVersion write a .res file compatible with the requested formatVersion (single digit);\n" + "\t for example, --formatVersion 1\n"); + fprintf(stderr, + "\t --writePoolBundle [directory] write a pool.res file with all of the keys of all input bundles\n" + "\t --usePoolBundle [directory] point to keys from the pool.res keys pool bundle if they are available there;\n" + "\t makes .res files smaller but dependent on the pool bundle\n" + "\t (--writePoolBundle and --usePoolBundle cannot be combined)\n"); + fprintf(stderr, + "\t --filterDir Input directory where filter files are available.\n" + "\t For more on filter files, see ICU Data Build Tool.\n"); + + return illegalArg ? 
U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; + } + + if(options[VERBOSE].doesOccur) { + setVerbose(true); + } + + if(options[QUIET].doesOccur) { + setShowWarning(false); + } + if(options[STRICT].doesOccur) { + setStrict(true); + } + if(options[COPYRIGHT].doesOccur){ + setIncludeCopyright(true); + } + + if(options[SOURCEDIR].doesOccur) { + inputDir = options[SOURCEDIR].value; + } + + if(options[DESTDIR].doesOccur) { + outputDir = options[DESTDIR].value; + } + + if (options[FILTERDIR].doesOccur) { + filterDir = options[FILTERDIR].value; + } + + if(options[ENCODING].doesOccur) { + encoding = options[ENCODING].value; + } + + if(options[ICUDATADIR].doesOccur) { + u_setDataDirectory(options[ICUDATADIR].value); + } + /* Initialize ICU */ + u_init(&status); + if (U_FAILURE(status) && status != U_FILE_ACCESS_ERROR) { + /* Note: u_init() will try to open ICU property data. + * failures here are expected when building ICU from scratch. + * ignore them. + */ + fprintf(stderr, "%s: can not initialize ICU. 
status = %s\n", + argv[0], u_errorName(status)); + exit(1); + } + status = U_ZERO_ERROR; + if(options[WRITE_JAVA].doesOccur) { + write_java = true; + outputEnc = options[WRITE_JAVA].value; + } + + if(options[WRITE_XLIFF].doesOccur) { + write_xliff = true; + if(options[WRITE_XLIFF].value != nullptr){ + xliffOutputFileName = options[WRITE_XLIFF].value; + } + } + + if (options[UCADATA].doesOccur) { +#if !UCONFIG_NO_COLLATION + CollationRoot::forceLoadFromFile(options[UCADATA].value, status); +#else + fprintf(stderr, "--ucadata was used with UCONFIG_NO_COLLATION\n"); + return status; +#endif + } + + initParser(); + + /*added by Jing*/ + if(options[LANGUAGE].doesOccur) { + language = options[LANGUAGE].value; + } + + LocalPointer<SRBRoot> newPoolBundle; + if(options[WRITE_POOL_BUNDLE].doesOccur) { + newPoolBundle.adoptInsteadAndCheckErrorCode(new SRBRoot(nullptr, true, status), status); + if(U_FAILURE(status)) { + fprintf(stderr, "unable to create an empty bundle for the pool keys: %s\n", u_errorName(status)); + return status; + } else { + const char *poolResName = "pool.res"; + char *nameWithoutSuffix = static_cast<char *>(uprv_malloc(uprv_strlen(poolResName) + 1)); + if (nameWithoutSuffix == nullptr) { + fprintf(stderr, "out of memory error\n"); + return U_MEMORY_ALLOCATION_ERROR; + } + uprv_strcpy(nameWithoutSuffix, poolResName); + *uprv_strrchr(nameWithoutSuffix, '.') = 0; + newPoolBundle->fLocale = nameWithoutSuffix; + } + } + + if(options[USE_POOL_BUNDLE].doesOccur) { + const char *poolResName = "pool.res"; + FileStream *poolFile; + int32_t poolFileSize; + int32_t indexLength; + /* + * TODO: Consolidate inputDir/filename handling from main() and processFile() + * into a common function, and use it here as well. + * Try to create toolutil functions for dealing with dir/filenames and + * loading ICU data files without udata_open(). + * Share code with icupkg? + * Also, make_res_filename() seems to be unused. Review and remove. 
+ */ + CharString poolFileName; + if (options[USE_POOL_BUNDLE].value!=nullptr) { + poolFileName.append(options[USE_POOL_BUNDLE].value, status); + } else if (inputDir) { + poolFileName.append(inputDir, status); + } + poolFileName.appendPathPart(poolResName, status); + if (U_FAILURE(status)) { + return status; + } + poolFile = T_FileStream_open(poolFileName.data(), "rb"); + if (poolFile == nullptr) { + fprintf(stderr, "unable to open pool bundle file %s\n", poolFileName.data()); + return 1; + } + poolFileSize = T_FileStream_size(poolFile); + if (poolFileSize < 32) { + fprintf(stderr, "the pool bundle file %s is too small\n", poolFileName.data()); + return 1; + } + poolBundle.fBytes = new uint8_t[(poolFileSize + 15) & ~15]; + if (poolFileSize > 0 && poolBundle.fBytes == nullptr) { + fprintf(stderr, "unable to allocate memory for the pool bundle file %s\n", poolFileName.data()); + return U_MEMORY_ALLOCATION_ERROR; + } + + UDataSwapper *ds; + const DataHeader *header; + int32_t bytesRead = T_FileStream_read(poolFile, poolBundle.fBytes, poolFileSize); + if (bytesRead != poolFileSize) { + fprintf(stderr, "unable to read the pool bundle file %s\n", poolFileName.data()); + return 1; + } + /* + * Swap the pool bundle so that a single checked-in file can be used. + * The swapper functions also test that the data looks like + * a well-formed .res file. 
+ */ + ds = udata_openSwapperForInputData(poolBundle.fBytes, bytesRead, + U_IS_BIG_ENDIAN, U_CHARSET_FAMILY, &status); + if (U_FAILURE(status)) { + fprintf(stderr, "udata_openSwapperForInputData(pool bundle %s) failed: %s\n", + poolFileName.data(), u_errorName(status)); + return status; + } + ures_swap(ds, poolBundle.fBytes, bytesRead, poolBundle.fBytes, &status); + udata_closeSwapper(ds); + if (U_FAILURE(status)) { + fprintf(stderr, "ures_swap(pool bundle %s) failed: %s\n", + poolFileName.data(), u_errorName(status)); + return status; + } + header = (const DataHeader *)poolBundle.fBytes; + if (header->info.formatVersion[0] < 2) { + fprintf(stderr, "invalid format of pool bundle file %s\n", poolFileName.data()); + return U_INVALID_FORMAT_ERROR; + } + const int32_t *pRoot = (const int32_t *)( + (const char *)header + header->dataHeader.headerSize); + poolBundle.fIndexes = pRoot + 1; + indexLength = poolBundle.fIndexes[URES_INDEX_LENGTH] & 0xff; + if (indexLength <= URES_INDEX_POOL_CHECKSUM) { + fprintf(stderr, "insufficient indexes[] in pool bundle file %s\n", poolFileName.data()); + return U_INVALID_FORMAT_ERROR; + } + int32_t keysBottom = 1 + indexLength; + int32_t keysTop = poolBundle.fIndexes[URES_INDEX_KEYS_TOP]; + poolBundle.fKeys = (const char *)(pRoot + keysBottom); + poolBundle.fKeysLength = (keysTop - keysBottom) * 4; + poolBundle.fChecksum = poolBundle.fIndexes[URES_INDEX_POOL_CHECKSUM]; + + for (i = 0; i < poolBundle.fKeysLength; ++i) { + if (poolBundle.fKeys[i] == 0) { + ++poolBundle.fKeysCount; + } + } + + // 16BitUnits[] begins with strings-v2. + // The strings-v2 may optionally be terminated by what looks like + // an explicit string length that exceeds the number of remaining 16-bit units. 
+ int32_t stringUnitsLength = (poolBundle.fIndexes[URES_INDEX_16BIT_TOP] - keysTop) * 2; + if (stringUnitsLength >= 2 && getFormatVersion() >= 3) { + poolBundle.fStrings = new PseudoListResource(nullptr, status); + if (poolBundle.fStrings == nullptr) { + fprintf(stderr, "unable to allocate memory for the pool bundle strings %s\n", + poolFileName.data()); + return U_MEMORY_ALLOCATION_ERROR; + } + // The PseudoListResource constructor call did not allocate further memory. + assert(U_SUCCESS(status)); + const char16_t *p = (const char16_t *)(pRoot + keysTop); + int32_t remaining = stringUnitsLength; + do { + int32_t first = *p; + int8_t numCharsForLength; + int32_t length; + if (!U16_IS_TRAIL(first)) { + // NUL-terminated + numCharsForLength = 0; + for (length = 0; + length < remaining && p[length] != 0; + ++length) {} + } else if (first < 0xdfef) { + numCharsForLength = 1; + length = first & 0x3ff; + } else if (first < 0xdfff && remaining >= 2) { + numCharsForLength = 2; + length = ((first - 0xdfef) << 16) | p[1]; + } else if (first == 0xdfff && remaining >= 3) { + numCharsForLength = 3; + length = ((int32_t)p[1] << 16) | p[2]; + } else { + break; // overrun + } + // Check for overrun before changing remaining, + // so that it is always accurate after the loop body. + if ((numCharsForLength + length) >= remaining || + p[numCharsForLength + length] != 0) { + break; // overrun or explicitly terminated + } + int32_t poolStringIndex = stringUnitsLength - remaining; + // Maximum pool string index when suffix-sharing the last character. 
+ int32_t maxStringIndex = poolStringIndex + numCharsForLength + length - 1; + if (maxStringIndex >= RES_MAX_OFFSET) { + // pool string index overrun + break; + } + p += numCharsForLength; + remaining -= numCharsForLength; + if (length != 0) { + StringResource *sr = + new StringResource(poolStringIndex, numCharsForLength, + p, length, status); + if (sr == nullptr) { + fprintf(stderr, "unable to allocate memory for a pool bundle string %s\n", + poolFileName.data()); + return U_MEMORY_ALLOCATION_ERROR; + } + poolBundle.fStrings->add(sr); + poolBundle.fStringIndexLimit = maxStringIndex + 1; + // The StringResource constructor did not allocate further memory. + assert(U_SUCCESS(status)); + } + p += length + 1; + remaining -= length + 1; + } while (remaining > 0); + if (poolBundle.fStrings->fCount == 0) { + delete poolBundle.fStrings; + poolBundle.fStrings = nullptr; + } + } + + T_FileStream_close(poolFile); + setUsePoolBundle(true); + if (isVerbose() && poolBundle.fStrings != nullptr) { + printf("number of shared strings: %d\n", (int)poolBundle.fStrings->fCount); + int32_t length = poolBundle.fStringIndexLimit + 1; // incl. last NUL + printf("16-bit units for strings: %6d = %6d bytes\n", + (int)length, (int)length * 2); + } + } + + if(!options[FORMAT_VERSION].doesOccur && getFormatVersion() == 3 && + poolBundle.fStrings == nullptr && + !options[WRITE_POOL_BUNDLE].doesOccur) { + // If we just default to formatVersion 3 + // but there are no pool bundle strings to share + // and we do not write a pool bundle, + // then write formatVersion 2 which is just as good. 
+ setFormatVersion(2); + } + + if(options[INCLUDE_UNIHAN_COLL].doesOccur) { + puts("genrb option --includeUnihanColl ignored: \n" + "CLDR 26/ICU 54 unihan data is small, except\n" + "the ucadata-unihan.icu version of the collation root data\n" + "is about 300kB larger than the ucadata-implicithan.icu version."); + } + + if((argc-1)!=1) { + printf("genrb number of files: %d\n", argc - 1); + } + /* generate the binary files */ + for(i = 1; i < argc; ++i) { + status = U_ZERO_ERROR; + arg = getLongPathname(argv[i]); + + CharString theCurrentFileName; + if (inputDir) { + theCurrentFileName.append(inputDir, status); + } + theCurrentFileName.appendPathPart(arg, status); + if (U_FAILURE(status)) { + break; + } + + gCurrentFileName = theCurrentFileName.data(); + if (isVerbose()) { + printf("Processing file \"%s\"\n", theCurrentFileName.data()); + } + processFile(arg, encoding, inputDir, outputDir, filterDir, nullptr, + newPoolBundle.getAlias(), + options[NO_BINARY_COLLATION].doesOccur, status); + } + + poolBundle.close(); + + if(U_SUCCESS(status) && options[WRITE_POOL_BUNDLE].doesOccur) { + const char* writePoolDir; + if (options[WRITE_POOL_BUNDLE].value!=nullptr) { + writePoolDir = options[WRITE_POOL_BUNDLE].value; + } else { + writePoolDir = outputDir; + } + char outputFileName[256]; + newPoolBundle->write(writePoolDir, nullptr, outputFileName, sizeof(outputFileName), status); + if(U_FAILURE(status)) { + fprintf(stderr, "unable to write the pool bundle: %s\n", u_errorName(status)); + } + } + + u_cleanup(); + + /* Don't return warnings as a failure */ + if (U_SUCCESS(status)) { + return 0; + } + + return status; +} + +/* Process a file */ +void +processFile(const char *filename, const char *cp, + const char *inputDir, const char *outputDir, const char *filterDir, + const char *packageName, + SRBRoot *newPoolBundle, + UBool omitBinaryCollation, UErrorCode &status) { + LocalPointer<SRBRoot> data; + LocalUCHARBUFPointer ucbuf; + CharString openFileName; + CharString 
inputDirBuf; + + char outputFileName[256]; + int32_t dirlen = 0; + + if (U_FAILURE(status)) { + return; + } + if(filename==nullptr){ + status=U_ILLEGAL_ARGUMENT_ERROR; + return; + } + + if(inputDir == nullptr) { + const char *filenameBegin = uprv_strrchr(filename, U_FILE_SEP_CHAR); + if (filenameBegin != nullptr) { + /* + * When a filename ../../../data/root.txt is specified, + * we presume that the input directory is ../../../data + * This is very important when the resource file includes + * another file, like UCARules.txt or thaidict.brk. + */ + int32_t filenameSize = (int32_t)(filenameBegin - filename + 1); + inputDirBuf.append(filename, filenameSize, status); + + inputDir = inputDirBuf.data(); + dirlen = inputDirBuf.length(); + } + }else{ + dirlen = (int32_t)uprv_strlen(inputDir); + + if(inputDir[dirlen-1] != U_FILE_SEP_CHAR) { + /* + * append the input dir to openFileName if the first char in + * filename is not file separation char and the last char input directory is not '.'. + * This is to support : + * genrb -s. /home/icu/data + * genrb -s. icu/data + * The user cannot mix notations like + * genrb -s. /icu/data --- the absolute path specified. -s redundant + * user should use + * genrb -s. icu/data --- start from CWD and look in icu/data dir + */ + if( (filename[0] != U_FILE_SEP_CHAR) && (inputDir[dirlen-1] !='.')){ + openFileName.append(inputDir, status); + } + } else { + openFileName.append(inputDir, status); + } + } + openFileName.appendPathPart(filename, status); + + // Test for CharString failure + if (U_FAILURE(status)) { + return; + } + + ucbuf.adoptInstead(ucbuf_open(openFileName.data(), &cp,getShowWarning(),true, &status)); + if(status == U_FILE_ACCESS_ERROR) { + + fprintf(stderr, "couldn't open file %s\n", openFileName.data()); + return; + } + if (ucbuf.isNull() || U_FAILURE(status)) { + fprintf(stderr, "An error occurred processing file %s. 
Error: %s\n", + openFileName.data(), u_errorName(status)); + return; + } + /* auto detected popular encodings? */ + if (cp!=nullptr && isVerbose()) { + printf("autodetected encoding %s\n", cp); + } + /* Parse the data into an SRBRoot */ + data.adoptInstead(parse(ucbuf.getAlias(), inputDir, outputDir, filename, + !omitBinaryCollation, options[NO_COLLATION_RULES].doesOccur, options[ICU4X_MODE].doesOccur, &status)); + + if (data.isNull() || U_FAILURE(status)) { + fprintf(stderr, "couldn't parse the file %s. Error:%s\n", filename, u_errorName(status)); + return; + } + + // Run filtering before writing pool bundle + if (filterDir != nullptr) { + CharString filterFileName(filterDir, status); + filterFileName.appendPathPart(filename, status); + if (U_FAILURE(status)) { + return; + } + + // Open the file and read it into filter + SimpleRuleBasedPathFilter filter; + std::ifstream f(filterFileName.data()); + if (f.fail()) { + std::cerr << "genrb error: unable to open " << filterFileName.data() << std::endl; + status = U_FILE_ACCESS_ERROR; + return; + } + std::string currentLine; + while (std::getline(f, currentLine)) { + // Ignore # comments and empty lines + if (currentLine.empty() || currentLine[0] == '#') { + continue; + } + filter.addRule(currentLine, status); + if (U_FAILURE(status)) { + return; + } + } + + if (isVerbose()) { + filter.print(std::cout); + } + + // Apply the filter to the data + ResKeyPath path; + data->fRoot->applyFilter(filter, path, data.getAlias()); + } + + if(options[WRITE_POOL_BUNDLE].doesOccur) { + data->fWritePoolBundle = newPoolBundle; + data->compactKeys(status); + int32_t newKeysLength; + const char *newKeys = data->getKeyBytes(&newKeysLength); + newPoolBundle->addKeyBytes(newKeys, newKeysLength, status); + if(U_FAILURE(status)) { + fprintf(stderr, "bundle_compactKeys(%s) or bundle_getKeyBytes() failed: %s\n", + filename, u_errorName(status)); + return; + } + /* count the number of just-added key strings */ + for(const char *newKeysLimit = 
newKeys + newKeysLength; newKeys < newKeysLimit; ++newKeys) { + if(*newKeys == 0) { + ++newPoolBundle->fKeysCount; + } + } + } + + if(options[USE_POOL_BUNDLE].doesOccur) { + data->fUsePoolBundle = &poolBundle; + } + + /* Determine the target rb filename */ + uprv_free(make_res_filename(filename, outputDir, packageName, status)); + if(U_FAILURE(status)) { + fprintf(stderr, "couldn't make the res fileName for bundle %s. Error:%s\n", + filename, u_errorName(status)); + return; + } + if(write_java== true){ + bundle_write_java(data.getAlias(), outputDir, outputEnc, + outputFileName, sizeof(outputFileName), + options[JAVA_PACKAGE].value, options[BUNDLE_NAME].value, &status); + }else if(write_xliff ==true){ + bundle_write_xml(data.getAlias(), outputDir, outputEnc, + filename, outputFileName, sizeof(outputFileName), + language, xliffOutputFileName, &status); + }else{ + /* Write the data to the file */ + data->write(outputDir, packageName, outputFileName, sizeof(outputFileName), status); + } + if (U_FAILURE(status)) { + fprintf(stderr, "couldn't write bundle %s. 
Error:%s\n", outputFileName, u_errorName(status)); + } +} + +/* Generate the target .res file name from the input file name */ +static char* +make_res_filename(const char *filename, + const char *outputDir, + const char *packageName, + UErrorCode &status) { + char *basename; + char *dirname; + char *resName; + + int32_t pkgLen = 0; /* length of package prefix */ + + + if (U_FAILURE(status)) { + return 0; + } + + if(packageName != nullptr) + { + pkgLen = (int32_t)(1 + uprv_strlen(packageName)); + } + + /* setup */ + basename = dirname = resName = 0; + + /* determine basename, and compiled file names */ + basename = (char*) uprv_malloc(sizeof(char) * (uprv_strlen(filename) + 1)); + if(basename == 0) { + status = U_MEMORY_ALLOCATION_ERROR; + goto finish; + } + + get_basename(basename, filename); + + dirname = (char*) uprv_malloc(sizeof(char) * (uprv_strlen(filename) + 1)); + if(dirname == 0) { + status = U_MEMORY_ALLOCATION_ERROR; + goto finish; + } + + get_dirname(dirname, filename); + + if (outputDir == nullptr) { + /* output in same dir as .txt */ + resName = (char*) uprv_malloc(sizeof(char) * (uprv_strlen(dirname) + + pkgLen + + uprv_strlen(basename) + + uprv_strlen(RES_SUFFIX) + 8)); + if(resName == 0) { + status = U_MEMORY_ALLOCATION_ERROR; + goto finish; + } + + uprv_strcpy(resName, dirname); + + if(packageName != nullptr) + { + uprv_strcat(resName, packageName); + uprv_strcat(resName, "_"); + } + + uprv_strcat(resName, basename); + + } else { + int32_t dirlen = (int32_t)uprv_strlen(outputDir); + int32_t basenamelen = (int32_t)uprv_strlen(basename); + + resName = (char*) uprv_malloc(sizeof(char) * (dirlen + pkgLen + basenamelen + 8)); + + if (resName == nullptr) { + status = U_MEMORY_ALLOCATION_ERROR; + goto finish; + } + + uprv_strcpy(resName, outputDir); + + if(outputDir[dirlen] != U_FILE_SEP_CHAR) { + resName[dirlen] = U_FILE_SEP_CHAR; + resName[dirlen + 1] = '\0'; + } + + if(packageName != nullptr) + { + uprv_strcat(resName, packageName); + 
uprv_strcat(resName, "_"); + } + + uprv_strcat(resName, basename); + } + +finish: + uprv_free(basename); + uprv_free(dirname); + + return resName; +} + +/* + * Local Variables: + * indent-tabs-mode: nil + * End: + */ diff --git a/intl/icu/source/tools/genrb/genrb.h b/intl/icu/source/tools/genrb/genrb.h new file mode 100644 index 0000000000..019020a34a --- /dev/null +++ b/intl/icu/source/tools/genrb/genrb.h @@ -0,0 +1,52 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2002-2015, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* +* File genrb.h +*/ + +#ifndef GENRB_H +#define GENRB_H + +#include <stdio.h> +#include "unicode/utypes.h" +#include "unicode/putil.h" +#include "cmemory.h" +#include "cstring.h" +#include "filestrm.h" + + +#include "ucbuf.h" +#include "errmsg.h" +#include "parse.h" +#include "rbutil.h" + +#include "toolutil.h" +#include "uoptions.h" + +#include "unicode/ucol.h" +#include "unicode/uloc.h" + +/* The version of genrb */ +#define GENRB_VERSION "56" + +U_CDECL_BEGIN + +U_CAPI void processFile( + const char *filename, + const char* cp, + const char *inputDir, + const char *outputDir, + const char *packageName, + UBool omitBinaryCollation, + UErrorCode *status); + +U_CDECL_END + +#endif diff --git a/intl/icu/source/tools/genrb/genrb.vcxproj b/intl/icu/source/tools/genrb/genrb.vcxproj new file mode 100644 index 0000000000..66651c11d0 --- /dev/null +++ b/intl/icu/source/tools/genrb/genrb.vcxproj @@ -0,0 +1,113 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <PropertyGroup Label="Globals"> + <ProjectGuid>{97521D06-EC47-45D4-8BD0-9E16B3F93B2A}</ProjectGuid> 
+ </PropertyGroup> + <PropertyGroup Label="Configuration"> + <ConfigurationType>Application</ConfigurationType> + <UseOfMfc>false</UseOfMfc> + <CharacterSet>MultiByte</CharacterSet> + </PropertyGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> + <!-- The following import will include the 'default' configuration options for VS projects. --> + <Import Project="..\..\allinone\Build.Windows.ProjectConfiguration.props" /> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> + <ImportGroup Label="ExtensionSettings"> + </ImportGroup> + <PropertyGroup Label="UserMacros" /> + <PropertyGroup> + <_ProjectFileVersion>10.0.30319.1</_ProjectFileVersion> + <OutDir>.\$(Platform)\$(Configuration)\</OutDir> + <IntDir>.\$(Platform)\$(Configuration)\</IntDir> + <!-- The ICU projects use "Win32" to mean "x86", so we need to special case it. --> + <OutDir Condition="'$(Platform)'=='Win32'">.\x86\$(Configuration)\</OutDir> + <IntDir Condition="'$(Platform)'=='Win32'">.\x86\$(Configuration)\</IntDir> + <!-- Disable Incremental Linking for Release builds as it prevents Link-time Code Generation --> + <LinkIncremental Condition="'$(Configuration)'=='Debug'">true</LinkIncremental> + <LinkIncremental Condition="'$(Configuration)'=='Release'">false</LinkIncremental> + </PropertyGroup> + <!-- Options that are common to *all* configurations --> + <ItemDefinitionGroup> + <Midl> + <TypeLibraryName>$(OutDir)/genrb.tlb</TypeLibraryName> + </Midl> + <ClCompile> + <WarningLevel>Level3</WarningLevel> + <CompileAs>Default</CompileAs> + <DisableLanguageExtensions>true</DisableLanguageExtensions> + <AdditionalIncludeDirectories>..\..\..\include;..\..\common;..\toolutil;..\..\i18n;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> + <PrecompiledHeaderOutputFile>$(OutDir)/genrb.pch</PrecompiledHeaderOutputFile> + <AssemblerListingLocation>$(OutDir)/</AssemblerListingLocation> + <ObjectFileName>$(OutDir)/</ObjectFileName> + 
<ProgramDataBaseFileName>$(OutDir)/genrb.pdb</ProgramDataBaseFileName> + </ClCompile> + <Link> + <SubSystem>Console</SubSystem> + <OutputFile>$(OutDir)/genrb.exe</OutputFile> + <AdditionalLibraryDirectories>..\..\..\$(IcuLibOutputDir);%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories> + </Link> + <CustomBuildStep> + <Command>copy "$(TargetPath)" ..\..\..\$(IcuBinOutputDir)</Command> + <Outputs>..\..\..\$(IcuBinOutputDir)\$(TargetFileName);%(Outputs)</Outputs> + </CustomBuildStep> + </ItemDefinitionGroup> + <!-- Options that are common to all 'Debug' project configurations --> + <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'"> + <ClCompile> + <BrowseInformation>true</BrowseInformation> + <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary> + </ClCompile> + <Link> + <AdditionalDependencies>icuucd.lib;icuind.lib;icutud.lib;%(AdditionalDependencies)</AdditionalDependencies> + </Link> + </ItemDefinitionGroup> + <!-- Options that are common to all 'Release' project configurations --> + <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'"> + <ClCompile> + <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary> + <FunctionLevelLinking>true</FunctionLevelLinking> + </ClCompile> + <Link> + <AdditionalDependencies>icuuc.lib;icuin.lib;icutu.lib;%(AdditionalDependencies)</AdditionalDependencies> + </Link> + </ItemDefinitionGroup> + <ItemGroup> + <ClCompile Include="errmsg.c" /> + <ClCompile Include="filterrb.cpp" /> + <ClCompile Include="genrb.cpp" /> + <ClCompile Include="parse.cpp"> + <DisableLanguageExtensions>false</DisableLanguageExtensions> + </ClCompile> + <ClCompile Include="prscmnts.cpp"> + <DisableLanguageExtensions>false</DisableLanguageExtensions> + </ClCompile> + <ClCompile Include="rbutil.c" /> + <ClCompile Include="read.c" /> + <ClCompile Include="reslist.cpp"> + <DisableLanguageExtensions>false</DisableLanguageExtensions> + </ClCompile> + <ClCompile Include="rle.c" /> + <ClCompile Include="ustr.c" /> + <ClCompile 
Include="wrtjava.cpp"> + <DisableLanguageExtensions>false</DisableLanguageExtensions> + </ClCompile> + <ClCompile Include="wrtxml.cpp"> + <DisableLanguageExtensions>false</DisableLanguageExtensions> + </ClCompile> + </ItemGroup> + <ItemGroup> + <ClInclude Include="errmsg.h" /> + <ClInclude Include="genrb.h" /> + <ClInclude Include="filterrb.h" /> + <ClInclude Include="parse.h" /> + <ClInclude Include="prscmnts.h" /> + <ClInclude Include="rbutil.h" /> + <ClInclude Include="read.h" /> + <ClInclude Include="reslist.h" /> + <ClInclude Include="rle.h" /> + <ClInclude Include="ustr.h" /> + </ItemGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> + <ImportGroup Label="ExtensionTargets"> + </ImportGroup> +</Project>
\ No newline at end of file diff --git a/intl/icu/source/tools/genrb/genrb.vcxproj.filters b/intl/icu/source/tools/genrb/genrb.vcxproj.filters new file mode 100644 index 0000000000..1f2f5b3b8c --- /dev/null +++ b/intl/icu/source/tools/genrb/genrb.vcxproj.filters @@ -0,0 +1,87 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <ItemGroup> + <Filter Include="Source Files"> + <UniqueIdentifier>{2dee2c2f-25a5-43f0-985f-de4ba26925b4}</UniqueIdentifier> + <Extensions>cpp;c;cxx;rc;def;r;odl;idl;hpj;bat</Extensions> + </Filter> + <Filter Include="Header Files"> + <UniqueIdentifier>{7156c811-7116-4eef-8bb1-0400c51f9fd3}</UniqueIdentifier> + <Extensions>h;hpp;hxx;hm;inl</Extensions> + </Filter> + <Filter Include="Resource Files"> + <UniqueIdentifier>{df647868-56cc-475d-a3f6-1d1f50aa5e4f}</UniqueIdentifier> + <Extensions>ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe</Extensions> + </Filter> + </ItemGroup> + <ItemGroup> + <ClCompile Include="errmsg.c"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="filterrb.cpp"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="genrb.cpp"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="parse.cpp"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="prscmnts.cpp"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="rbutil.c"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="read.c"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="reslist.cpp"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="rle.c"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="ustr.c"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="wrtjava.cpp"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="wrtxml.cpp"> + <Filter>Source Files</Filter> + </ClCompile> + </ItemGroup> + 
<ItemGroup> + <ClInclude Include="errmsg.h"> + <Filter>Header Files</Filter> + </ClInclude> + <ClInclude Include="filterrb.h"> + <Filter>Header Files</Filter> + </ClInclude> + <ClInclude Include="genrb.h"> + <Filter>Header Files</Filter> + </ClInclude> + <ClInclude Include="parse.h"> + <Filter>Header Files</Filter> + </ClInclude> + <ClInclude Include="prscmnts.h"> + <Filter>Header Files</Filter> + </ClInclude> + <ClInclude Include="rbutil.h"> + <Filter>Header Files</Filter> + </ClInclude> + <ClInclude Include="read.h"> + <Filter>Header Files</Filter> + </ClInclude> + <ClInclude Include="reslist.h"> + <Filter>Header Files</Filter> + </ClInclude> + <ClInclude Include="rle.h"> + <Filter>Header Files</Filter> + </ClInclude> + <ClInclude Include="ustr.h"> + <Filter>Header Files</Filter> + </ClInclude> + </ItemGroup> +</Project>
\ No newline at end of file diff --git a/intl/icu/source/tools/genrb/parse.cpp b/intl/icu/source/tools/genrb/parse.cpp new file mode 100644 index 0000000000..1e82bda6e5 --- /dev/null +++ b/intl/icu/source/tools/genrb/parse.cpp @@ -0,0 +1,2435 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 1998-2015, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* +* File parse.cpp +* +* Modification History: +* +* Date Name Description +* 05/26/99 stephen Creation. +* 02/25/00 weiv Overhaul to write udata +* 5/10/01 Ram removed ustdio dependency +* 06/10/2001 Dominic Ludlam <dom@recoil.org> Rewritten +******************************************************************************* +*/ + +// Safer use of UnicodeString. +#include <cstdint> +#include "unicode/umachine.h" +#ifndef UNISTR_FROM_CHAR_EXPLICIT +# define UNISTR_FROM_CHAR_EXPLICIT explicit +#endif + +// Less important, but still a good idea. 
+#ifndef UNISTR_FROM_STRING_EXPLICIT +# define UNISTR_FROM_STRING_EXPLICIT explicit +#endif + +#include <assert.h> +#include "parse.h" +#include "errmsg.h" +#include "uhash.h" +#include "cmemory.h" +#include "cstring.h" +#include "uinvchar.h" +#include "read.h" +#include "ustr.h" +#include "reslist.h" +#include "rbt_pars.h" +#include "genrb.h" +#include "unicode/normalizer2.h" +#include "unicode/stringpiece.h" +#include "unicode/unistr.h" +#include "unicode/ustring.h" +#include "unicode/uscript.h" +#include "unicode/utf16.h" +#include "unicode/putil.h" +#include "charstr.h" +#include "collationbuilder.h" +#include "collationdata.h" +#include "collationdatareader.h" +#include "collationdatawriter.h" +#include "collationfastlatinbuilder.h" +#include "collationinfo.h" +#include "collationroot.h" +#include "collationruleparser.h" +#include "collationtailoring.h" +#include <stdio.h> +#include "writesrc.h" + +/* Number of tokens to read ahead of the current stream position */ +#define MAX_LOOKAHEAD 3 + +#define CR 0x000D +#define LF 0x000A +#define SPACE 0x0020 +#define TAB 0x0009 +#define ESCAPE 0x005C +#define HASH 0x0023 +#define QUOTE 0x0027 +#define ZERO 0x0030 +#define STARTCOMMAND 0x005B +#define ENDCOMMAND 0x005D +#define OPENSQBRACKET 0x005B +#define CLOSESQBRACKET 0x005D + +#define ICU4X_DIACRITIC_BASE 0x0300 +#define ICU4X_DIACRITIC_LIMIT 0x034F + +using icu::CharString; +using icu::LocalMemory; +using icu::LocalPointer; +using icu::LocalUCHARBUFPointer; +using icu::StringPiece; +using icu::UnicodeString; + +struct Lookahead +{ + enum ETokenType type; + struct UString value; + struct UString comment; + uint32_t line; +}; + +/* keep in sync with token defines in read.h */ +const char *tokenNames[TOK_TOKEN_COUNT] = +{ + "string", /* A string token, such as "MonthNames" */ + "'{'", /* An opening brace character */ + "'}'", /* A closing brace character */ + "','", /* A comma */ + "':'", /* A colon */ + + "<end of file>", /* End of the file has been reached 
successfully */ + "<end of line>" +}; + +/* Just to store "TRUE" */ +//static const char16_t trueValue[] = {0x0054, 0x0052, 0x0055, 0x0045, 0x0000}; + +typedef struct { + struct Lookahead lookahead[MAX_LOOKAHEAD + 1]; + uint32_t lookaheadPosition; + UCHARBUF *buffer; + struct SRBRoot *bundle; + const char *inputdir; + uint32_t inputdirLength; + const char *outputdir; + uint32_t outputdirLength; + const char *filename; + UBool makeBinaryCollation; + UBool omitCollationRules; + UBool icu4xMode; +} ParseState; + +typedef struct SResource * +ParseResourceFunction(ParseState* state, char *tag, uint32_t startline, const struct UString* comment, UErrorCode *status); + +static struct SResource *parseResource(ParseState* state, char *tag, const struct UString *comment, UErrorCode *status); + +/* The nature of the lookahead buffer: + There are MAX_LOOKAHEAD + 1 slots, used as a circular buffer. This provides + MAX_LOOKAHEAD lookahead tokens and a slot for the current token and value. + When getToken is called, the current pointer is moved to the next slot and the + old slot is filled with the next token from the reader by calling getNextToken. + The token values are stored in the slot, which means that token values don't + survive a call to getToken, ie. 
+ + UString *value; + + getToken(&value, nullptr, status); + getToken(nullptr, nullptr, status); bad - value is now a different string +*/ +static void +initLookahead(ParseState* state, UCHARBUF *buf, UErrorCode *status) +{ + static uint32_t initTypeStrings = 0; + uint32_t i; + + if (!initTypeStrings) + { + initTypeStrings = 1; + } + + state->lookaheadPosition = 0; + state->buffer = buf; + + resetLineNumber(); + + for (i = 0; i < MAX_LOOKAHEAD; i++) + { + state->lookahead[i].type = getNextToken(state->buffer, &state->lookahead[i].value, &state->lookahead[i].line, &state->lookahead[i].comment, status); + if (U_FAILURE(*status)) + { + return; + } + } + + *status = U_ZERO_ERROR; +} + +static void +cleanupLookahead(ParseState* state) +{ + uint32_t i; + for (i = 0; i <= MAX_LOOKAHEAD; i++) + { + ustr_deinit(&state->lookahead[i].value); + ustr_deinit(&state->lookahead[i].comment); + } + +} + +static enum ETokenType +getToken(ParseState* state, struct UString **tokenValue, struct UString* comment, uint32_t *linenumber, UErrorCode *status) +{ + enum ETokenType result; + uint32_t i; + + result = state->lookahead[state->lookaheadPosition].type; + + if (tokenValue != nullptr) + { + *tokenValue = &state->lookahead[state->lookaheadPosition].value; + } + + if (linenumber != nullptr) + { + *linenumber = state->lookahead[state->lookaheadPosition].line; + } + + if (comment != nullptr) + { + ustr_cpy(comment, &(state->lookahead[state->lookaheadPosition].comment), status); + } + + i = (state->lookaheadPosition + MAX_LOOKAHEAD) % (MAX_LOOKAHEAD + 1); + state->lookaheadPosition = (state->lookaheadPosition + 1) % (MAX_LOOKAHEAD + 1); + ustr_setlen(&state->lookahead[i].comment, 0, status); + ustr_setlen(&state->lookahead[i].value, 0, status); + state->lookahead[i].type = getNextToken(state->buffer, &state->lookahead[i].value, &state->lookahead[i].line, &state->lookahead[i].comment, status); + + /* printf("getToken, returning %s\n", tokenNames[result]); */ + + return result; +} + +static 
enum ETokenType +peekToken(ParseState* state, uint32_t lookaheadCount, struct UString **tokenValue, uint32_t *linenumber, struct UString *comment, UErrorCode *status) +{ + uint32_t i = (state->lookaheadPosition + lookaheadCount) % (MAX_LOOKAHEAD + 1); + + if (U_FAILURE(*status)) + { + return TOK_ERROR; + } + + if (lookaheadCount >= MAX_LOOKAHEAD) + { + *status = U_INTERNAL_PROGRAM_ERROR; + return TOK_ERROR; + } + + if (tokenValue != nullptr) + { + *tokenValue = &state->lookahead[i].value; + } + + if (linenumber != nullptr) + { + *linenumber = state->lookahead[i].line; + } + + if(comment != nullptr){ + ustr_cpy(comment, &(state->lookahead[state->lookaheadPosition].comment), status); + } + + return state->lookahead[i].type; +} + +static void +expect(ParseState* state, enum ETokenType expectedToken, struct UString **tokenValue, struct UString *comment, uint32_t *linenumber, UErrorCode *status) +{ + uint32_t line; + + enum ETokenType token = getToken(state, tokenValue, comment, &line, status); + + if (linenumber != nullptr) + { + *linenumber = line; + } + + if (U_FAILURE(*status)) + { + return; + } + + if (token != expectedToken) + { + *status = U_INVALID_FORMAT_ERROR; + error(line, "expecting %s, got %s", tokenNames[expectedToken], tokenNames[token]); + } + else + { + *status = U_ZERO_ERROR; + } +} + +static char *getInvariantString(ParseState* state, uint32_t *line, struct UString *comment, + int32_t &stringLength, UErrorCode *status) +{ + struct UString *tokenValue; + char *result; + + expect(state, TOK_STRING, &tokenValue, comment, line, status); + + if (U_FAILURE(*status)) + { + return nullptr; + } + + if(!uprv_isInvariantUString(tokenValue->fChars, tokenValue->fLength)) { + *status = U_INVALID_FORMAT_ERROR; + error(*line, "invariant characters required for table keys, binary data, etc."); + return nullptr; + } + + result = static_cast<char *>(uprv_malloc(tokenValue->fLength+1)); + + if (result == nullptr) + { + *status = U_MEMORY_ALLOCATION_ERROR; + return 
nullptr; + } + + u_UCharsToChars(tokenValue->fChars, result, tokenValue->fLength+1); + stringLength = tokenValue->fLength; + return result; +} + +static struct SResource * +parseUCARules(ParseState* state, char *tag, uint32_t startline, const struct UString* /*comment*/, UErrorCode *status) +{ + struct SResource *result = nullptr; + struct UString *tokenValue; + FileStream *file = nullptr; + char filename[256] = { '\0' }; + char cs[128] = { '\0' }; + uint32_t line; + UBool quoted = false; + UCHARBUF *ucbuf=nullptr; + UChar32 c = 0; + const char* cp = nullptr; + char16_t *pTarget = nullptr; + char16_t *target = nullptr; + char16_t *targetLimit = nullptr; + int32_t size = 0; + + expect(state, TOK_STRING, &tokenValue, nullptr, &line, status); + + if(isVerbose()){ + printf(" %s at line %i \n", (tag == nullptr) ? "(null)" : tag, (int)startline); + } + + if (U_FAILURE(*status)) + { + return nullptr; + } + /* make the filename including the directory */ + if (state->inputdir != nullptr) + { + uprv_strcat(filename, state->inputdir); + + if (state->inputdir[state->inputdirLength - 1] != U_FILE_SEP_CHAR) + { + uprv_strcat(filename, U_FILE_SEP_STRING); + } + } + + u_UCharsToChars(tokenValue->fChars, cs, tokenValue->fLength); + + expect(state, TOK_CLOSE_BRACE, nullptr, nullptr, nullptr, status); + + if (U_FAILURE(*status)) + { + return nullptr; + } + uprv_strcat(filename, cs); + + if(state->omitCollationRules) { + return res_none(); + } + + ucbuf = ucbuf_open(filename, &cp, getShowWarning(),false, status); + + if (U_FAILURE(*status)) { + error(line, "An error occurred while opening the input file %s\n", filename); + return nullptr; + } + + /* We allocate more space than actually required + * since the actual size needed for storing UChars + * is not known in UTF-8 byte stream + */ + size = ucbuf_size(ucbuf) + 1; + pTarget = (char16_t*) uprv_malloc(U_SIZEOF_UCHAR * size); + uprv_memset(pTarget, 0, size*U_SIZEOF_UCHAR); + target = pTarget; + targetLimit = pTarget+size; + + /* 
read the rules into the buffer */ + while (target < targetLimit) + { + c = ucbuf_getc(ucbuf, status); + if(c == QUOTE) { + quoted = (UBool)!quoted; + } + /* weiv (06/26/2002): adding the following: + * - preserving spaces in commands [...] + * - # comments until the end of line + */ + if (c == STARTCOMMAND && !quoted) + { + /* preserve commands + * closing bracket will be handled by the + * append at the end of the loop + */ + /* Also stop at end of input: with an unterminated '[' this loop + * would otherwise keep appending and never exit. */ + while(c != ENDCOMMAND && c != (UChar32)U_EOF) { + U_APPEND_CHAR32_ONLY(c, target); + c = ucbuf_getc(ucbuf, status); + } + } + else if (c == HASH && !quoted) { + /* skip comments; a comment on the last line may end at EOF + * instead of a line terminator */ + while(c != CR && c != LF && c != (UChar32)U_EOF) { + c = ucbuf_getc(ucbuf, status); + } + continue; + } + else if (c == ESCAPE) + { + c = unescape(ucbuf, status); + + if (c == (UChar32)U_ERR) + { + /* release the input buffer too, as the normal exit path does */ + ucbuf_close(ucbuf); + uprv_free(pTarget); + T_FileStream_close(file); + return nullptr; + } + } + else if (!quoted && (c == SPACE || c == TAB || c == CR || c == LF)) + { + /* ignore spaces carriage returns + * and line feed unless in the form \uXXXX + */ + continue; + } + + /* Append char16_t * after dissembling if c > 0xffff*/ + if (c != (UChar32)U_EOF) + { + U_APPEND_CHAR32_ONLY(c, target); + } + else + { + break; + } + } + + /* terminate the string */ + if(target < targetLimit){ + *target = 0x0000; + } + + result = string_open(state->bundle, tag, pTarget, (int32_t)(target - pTarget), nullptr, status); + + + ucbuf_close(ucbuf); + uprv_free(pTarget); + T_FileStream_close(file); + + return result; +} + +static struct SResource * +parseTransliterator(ParseState* state, char *tag, uint32_t startline, const struct UString* /*comment*/, UErrorCode *status) +{ + struct SResource *result = nullptr; + struct UString *tokenValue; + FileStream *file = nullptr; + char filename[256] = { '\0' }; + char cs[128] = { '\0' }; + uint32_t line; + UCHARBUF *ucbuf=nullptr; + const char* cp = nullptr; + char16_t *pTarget = nullptr; + const char16_t *pSource = nullptr; + int32_t size = 0; + + expect(state, TOK_STRING, &tokenValue, nullptr, &line, status); + 
+ if(isVerbose()){ + printf(" %s at line %i \n", (tag == nullptr) ? "(null)" : tag, (int)startline); + } + + if (U_FAILURE(*status)) + { + return nullptr; + } + /* make the filename including the directory */ + if (state->inputdir != nullptr) + { + uprv_strcat(filename, state->inputdir); + + if (state->inputdir[state->inputdirLength - 1] != U_FILE_SEP_CHAR) + { + uprv_strcat(filename, U_FILE_SEP_STRING); + } + } + + u_UCharsToChars(tokenValue->fChars, cs, tokenValue->fLength); + + expect(state, TOK_CLOSE_BRACE, nullptr, nullptr, nullptr, status); + + if (U_FAILURE(*status)) + { + return nullptr; + } + uprv_strcat(filename, cs); + + + ucbuf = ucbuf_open(filename, &cp, getShowWarning(),false, status); + + if (U_FAILURE(*status)) { + error(line, "An error occurred while opening the input file %s\n", filename); + return nullptr; + } + + /* We allocate more space than actually required + * since the actual size needed for storing UChars + * is not known in UTF-8 byte stream + */ + pSource = ucbuf_getBuffer(ucbuf, &size, status); + pTarget = (char16_t*) uprv_malloc(U_SIZEOF_UCHAR * (size + 1)); + uprv_memset(pTarget, 0, size*U_SIZEOF_UCHAR); + +#if !UCONFIG_NO_TRANSLITERATION + size = utrans_stripRules(pSource, size, pTarget, status); +#else + size = 0; + fprintf(stderr, " Warning: writing empty transliteration data ( UCONFIG_NO_TRANSLITERATION ) \n"); +#endif + result = string_open(state->bundle, tag, pTarget, size, nullptr, status); + + ucbuf_close(ucbuf); + uprv_free(pTarget); + T_FileStream_close(file); + + return result; +} +static ArrayResource* dependencyArray = nullptr; + +static struct SResource * +parseDependency(ParseState* state, char *tag, uint32_t startline, const struct UString* comment, UErrorCode *status) +{ + struct SResource *result = nullptr; + struct SResource *elem = nullptr; + struct UString *tokenValue; + uint32_t line; + char filename[256] = { '\0' }; + char cs[128] = { '\0' }; + + expect(state, TOK_STRING, &tokenValue, nullptr, &line, status); + 
+ if(isVerbose()){ + printf(" %s at line %i \n", (tag == nullptr) ? "(null)" : tag, (int)startline); + } + + if (U_FAILURE(*status)) + { + return nullptr; + } + /* make the filename including the directory */ + if (state->outputdir != nullptr) + { + uprv_strcat(filename, state->outputdir); + + if (state->outputdir[state->outputdirLength - 1] != U_FILE_SEP_CHAR) + { + uprv_strcat(filename, U_FILE_SEP_STRING); + } + } + + u_UCharsToChars(tokenValue->fChars, cs, tokenValue->fLength); + + if (U_FAILURE(*status)) + { + return nullptr; + } + uprv_strcat(filename, cs); + if(!T_FileStream_file_exists(filename)){ + if(isStrict()){ + error(line, "The dependency file %s does not exist. Please make sure it exists.\n",filename); + }else{ + warning(line, "The dependency file %s does not exist. Please make sure it exists.\n",filename); + } + } + if(dependencyArray==nullptr){ + dependencyArray = array_open(state->bundle, "%%DEPENDENCY", nullptr, status); + } + if(tag!=nullptr){ + result = string_open(state->bundle, tag, tokenValue->fChars, tokenValue->fLength, comment, status); + } + elem = string_open(state->bundle, nullptr, tokenValue->fChars, tokenValue->fLength, comment, status); + + dependencyArray->add(elem); + + if (U_FAILURE(*status)) + { + return nullptr; + } + expect(state, TOK_CLOSE_BRACE, nullptr, nullptr, nullptr, status); + return result; +} +static struct SResource * +parseString(ParseState* state, char *tag, uint32_t startline, const struct UString* comment, UErrorCode *status) +{ + struct UString *tokenValue; + struct SResource *result = nullptr; + +/* if (tag != nullptr && uprv_strcmp(tag, "%%UCARULES") == 0) + { + return parseUCARules(tag, startline, status); + }*/ + if(isVerbose()){ + printf(" string %s at line %i \n", (tag == nullptr) ? 
"(null)" : tag, (int)startline); + } + expect(state, TOK_STRING, &tokenValue, nullptr, nullptr, status); + + if (U_SUCCESS(*status)) + { + /* create the string now - tokenValue doesn't survive a call to getToken (and therefore + doesn't survive expect either) */ + + result = string_open(state->bundle, tag, tokenValue->fChars, tokenValue->fLength, comment, status); + if(U_SUCCESS(*status) && result) { + expect(state, TOK_CLOSE_BRACE, nullptr, nullptr, nullptr, status); + + if (U_FAILURE(*status)) + { + res_close(result); + return nullptr; + } + } + } + + return result; +} + +static struct SResource * +parseAlias(ParseState* state, char *tag, uint32_t startline, const struct UString *comment, UErrorCode *status) +{ + struct UString *tokenValue; + struct SResource *result = nullptr; + + expect(state, TOK_STRING, &tokenValue, nullptr, nullptr, status); + + if(isVerbose()){ + printf(" alias %s at line %i \n", (tag == nullptr) ? "(null)" : tag, (int)startline); + } + + if (U_SUCCESS(*status)) + { + /* create the string now - tokenValue doesn't survive a call to getToken (and therefore + doesn't survive expect either) */ + + result = alias_open(state->bundle, tag, tokenValue->fChars, tokenValue->fLength, comment, status); + + expect(state, TOK_CLOSE_BRACE, nullptr, nullptr, nullptr, status); + + if (U_FAILURE(*status)) + { + res_close(result); + return nullptr; + } + } + + return result; +} + +#if !UCONFIG_NO_COLLATION + +namespace { + +static struct SResource* resLookup(struct SResource* res, const char* key){ + if (res == res_none() || !res->isTable()) { + return nullptr; + } + + TableResource *list = static_cast<TableResource *>(res); + SResource *current = list->fFirst; + while (current != nullptr) { + if (uprv_strcmp(((list->fRoot->fKeys) + (current->fKey)), key) == 0) { + return current; + } + current = current->fNext; + } + return nullptr; +} + +class GenrbImporter : public icu::CollationRuleParser::Importer { +public: + GenrbImporter(const char *in, const char 
*out) : inputDir(in), outputDir(out) {} + virtual ~GenrbImporter(); + virtual void getRules( + const char *localeID, const char *collationType, + UnicodeString &rules, + const char *&errorReason, UErrorCode &errorCode) override; + +private: + const char *inputDir; + const char *outputDir; +}; + +GenrbImporter::~GenrbImporter() {} + +void +GenrbImporter::getRules( + const char *localeID, const char *collationType, + UnicodeString &rules, + const char *& /*errorReason*/, UErrorCode &errorCode) { + CharString filename(localeID, errorCode); + for(int32_t i = 0; i < filename.length(); i++){ + if(filename[i] == '-'){ + filename.data()[i] = '_'; + } + } + filename.append(".txt", errorCode); + if (U_FAILURE(errorCode)) { + return; + } + CharString inputDirBuf; + CharString openFileName; + if(inputDir == nullptr) { + const char *filenameBegin = uprv_strrchr(filename.data(), U_FILE_SEP_CHAR); + if (filenameBegin != nullptr) { + /* + * When a filename ../../../data/root.txt is specified, + * we presume that the input directory is ../../../data + * This is very important when the resource file includes + * another file, like UCARules.txt or thaidict.brk. + */ + StringPiece dir = filename.toStringPiece(); + const char *filenameLimit = filename.data() + filename.length(); + dir.remove_suffix((int32_t)(filenameLimit - filenameBegin)); + inputDirBuf.append(dir, errorCode); + inputDir = inputDirBuf.data(); + } + }else{ + int32_t dirlen = (int32_t)uprv_strlen(inputDir); + + if((filename[0] != U_FILE_SEP_CHAR) && (inputDir[dirlen-1] !='.')) { + /* + * append the input dir to openFileName if the first char in + * filename is not file separator char and the last char input directory is not '.'. + * This is to support : + * genrb -s. /home/icu/data + * genrb -s. icu/data + * The user cannot mix notations like + * genrb -s. /icu/data --- the absolute path specified. -s redundant + * user should use + * genrb -s. 
icu/data --- start from CWD and look in icu/data dir + */ + openFileName.append(inputDir, dirlen, errorCode); + if(inputDir[dirlen-1] != U_FILE_SEP_CHAR) { + openFileName.append(U_FILE_SEP_CHAR, errorCode); + } + } + } + openFileName.append(filename, errorCode); + if(U_FAILURE(errorCode)) { + return; + } + // printf("GenrbImporter::getRules(%s, %s) reads %s\n", localeID, collationType, openFileName.data()); + const char* cp = ""; + LocalUCHARBUFPointer ucbuf( + ucbuf_open(openFileName.data(), &cp, getShowWarning(), true, &errorCode)); + if(errorCode == U_FILE_ACCESS_ERROR) { + fprintf(stderr, "couldn't open file %s\n", openFileName.data()); + return; + } + if (ucbuf.isNull() || U_FAILURE(errorCode)) { + fprintf(stderr, "An error occurred processing file %s. Error: %s\n", openFileName.data(), u_errorName(errorCode)); + return; + } + + /* Parse the data into an SRBRoot */ + LocalPointer<SRBRoot> data( + parse(ucbuf.getAlias(), inputDir, outputDir, filename.data(), false, false, false, &errorCode)); + if (U_FAILURE(errorCode)) { + return; + } + + struct SResource *root = data->fRoot; + struct SResource *collations = resLookup(root, "collations"); + if (collations != nullptr) { + struct SResource *collation = resLookup(collations, collationType); + if (collation != nullptr) { + struct SResource *sequence = resLookup(collation, "Sequence"); + if (sequence != nullptr && sequence->isString()) { + // No string pointer aliasing so that we need not hold onto the resource bundle. + StringResource *sr = static_cast<StringResource *>(sequence); + rules = sr->fString; + } + } + } +} + +// Quick-and-dirty escaping function. +// Assumes that we are on an ASCII-based platform. 
+static void +escape(const char16_t *s, char *buffer, size_t n) { + int32_t length = u_strlen(s); + int32_t i = 0; + for (;;) { + UChar32 c; + U16_NEXT(s, i, length, c); + if (c == 0) { + *buffer = 0; + return; + } else if (0x20 <= c && c <= 0x7e) { + // printable ASCII + *buffer++ = (char)c; // assumes ASCII-based platform + } else { + buffer += snprintf(buffer, n, "\\u%04X", (int)c); + } + } +} + +} // namespace + +static FILE* +openTOML(const char* outputdir, const char* name, const char* collationType, const char* structType, UErrorCode *status) { + CharString baseName; + baseName.append(name, *status); + baseName.append("_", *status); + baseName.append(collationType, *status); + baseName.append("_", *status); + baseName.append(structType, *status); + + CharString outFileName; + if (outputdir && *outputdir) { + outFileName.append(outputdir, *status).ensureEndsWithFileSeparator(*status); + } + outFileName.append(baseName, *status); + outFileName.append(".toml", *status); + if (U_FAILURE(*status)) { + return nullptr; + } + + FILE* f = fopen(outFileName.data(), "w"); + if (!f) { + *status = U_FILE_ACCESS_ERROR; + return nullptr; + } + usrc_writeFileNameGeneratedBy(f, "#", baseName.data(), "genrb -X"); + + return f; +} + +static void +writeCollationMetadataTOML(const char* outputdir, const char* name, const char* collationType, const uint32_t metadataBits, UErrorCode *status) { + FILE* f = openTOML(outputdir, name, collationType, "meta", status); + if (!f) { + return; + } + // printf("writeCollationMetadataTOML %s %s\n", name, collationType); + fprintf(f, "bits = 0x%X\n", metadataBits); + fclose(f); +} + +static UChar32 +writeCollationDiacriticsTOML(const char* outputdir, const char* name, const char* collationType, const icu::CollationData* data, UErrorCode *status) { + UChar32 limit = ICU4X_DIACRITIC_LIMIT; + FILE* f = openTOML(outputdir, name, collationType, "dia", status); + if (!f) { + return limit; + } + // printf("writeCollationDiacriticsTOML %s %s\n", name, 
collationType); + uint16_t secondaries[ICU4X_DIACRITIC_LIMIT-ICU4X_DIACRITIC_BASE]; + for (UChar32 c = ICU4X_DIACRITIC_BASE; c < ICU4X_DIACRITIC_LIMIT; ++c) { + uint16_t secondary = 0; + uint32_t ce32 = data->getCE32(c); + if (ce32 == icu::Collation::FALLBACK_CE32) { + ce32 = data->base->getCE32(c); + } + if (c == 0x0340 || c == 0x0341 || c == 0x0343 || c == 0x0344) { + // These never occur in NFD data + } else if (!icu::Collation::isSimpleOrLongCE32(ce32)) { + if (uprv_strcmp(name, "root") == 0) { + printf("UNSUPPORTED DIACRITIC CE32 in root: TAG: %X CE32: %X char: %X\n", icu::Collation::tagFromCE32(ce32), ce32, c); + fclose(f); + *status = U_INTERNAL_PROGRAM_ERROR; + return limit; + } + limit = c; + break; + } else { + uint64_t ce = uint64_t(icu::Collation::ceFromCE32(ce32)); + if ((ce & 0xFFFFFFFF0000FFFF) != uint64_t(icu::Collation::COMMON_TERTIARY_CE)) { + // Not a CE where only the secondary weight differs from the expected + // pattern. + limit = c; + break; + } + secondary = uint16_t(ce >> 16); + } + secondaries[c - ICU4X_DIACRITIC_BASE] = secondary; + + } + usrc_writeArray(f, "secondaries = [\n ", secondaries, 16, limit-ICU4X_DIACRITIC_BASE, " ", "\n]\n"); + fclose(f); + return limit; +} + +static void +writeCollationReorderingTOML(const char* outputdir, const char* name, const char* collationType, const icu::CollationSettings* settings, UErrorCode *status) { + FILE* f = openTOML(outputdir, name, collationType, "reord", status); + if (!f) { + return; + } + // printf("writeCollationReorderingTOML %s %s\n", name, collationType); + fprintf(f, "min_high_no_reorder = 0x%X\n", settings->minHighNoReorder); + usrc_writeArray(f, "reorder_table = [\n ", settings->reorderTable, 8, 256, " ", "\n]\n"); + usrc_writeArray(f, "reorder_ranges = [\n ", settings->reorderRanges, 32, settings->reorderRangesLength, " ", "\n]\n"); + fclose(f); +} + + +static void +writeCollationJamoTOML(const char* outputdir, const char* name, const char* collationType, const icu::CollationData* 
data, UErrorCode *status) { + FILE* f = openTOML(outputdir, name, collationType, "jamo", status); + if (!f) { + printf("writeCollationJamoTOML FAILED TO OPEN FILE %s %s\n", name, collationType); + return; + } + uint32_t jamo[0x1200-0x1100]; + for (UChar32 c = 0x1100; c < 0x1200; ++c) { + uint32_t ce32 = data->getCE32(c); + if (ce32 == icu::Collation::FALLBACK_CE32) { + ce32 = data->base->getCE32(c); + } + // Can't reject complex CE32s, because search collations have expansions. + // These expansions refer to the tailoring, which foils the reuse of the + // these jamo tables. + // XXX Figure out what to do. Perhaps instead of having Latin mini expansions, + // there should be Hangul mini expansions. + // XXX in any case, validate that modern jamo are self-contained. + jamo[c - 0x1100] = ce32; + + } + usrc_writeArray(f, "ce32s = [\n ", jamo, 32, 0x1200-0x1100, " ", "\n]\n"); + fclose(f); +} + +static UBool +convertTrie(const void *context, UChar32 start, UChar32 end, uint32_t value) { + if (start >= 0x1100 && start < 0x1200 && end >= 0x1100 && end < 0x1200) { + // Range entirely in conjoining jamo block. 
+ return true; + } + icu::IcuToolErrorCode status("genrb: convertTrie"); + umutablecptrie_setRange((UMutableCPTrie*)context, start, end, value, status); + return !U_FAILURE(*status); +} + +static void +writeCollationDataTOML(const char* outputdir, const char* name, const char* collationType, const icu::CollationData* data, UBool root, UChar32 diacriticLimit, UErrorCode *status) { + FILE* f = openTOML(outputdir, name, collationType, "data", status); + if (!f) { + return; + } + // printf("writeCollationDataTOML %s %s\n", name, collationType); + + icu::UnicodeSet tailoringSet; + + if (data->base) { + tailoringSet.addAll(*(data->unsafeBackwardSet)); + tailoringSet.removeAll(*(data->base->unsafeBackwardSet)); + } else { + tailoringSet.addAll(*(data->unsafeBackwardSet)); + } + + // Use the same value for out-of-range and default in the hope of not having to allocate + // different blocks, since ICU4X never does out-of-range queries. + uint32_t trieDefault = root ? icu::Collation::UNASSIGNED_CE32 : icu::Collation::FALLBACK_CE32; + icu::LocalUMutableCPTriePointer builder(umutablecptrie_open(trieDefault, trieDefault, status)); + + utrie2_enum(data->trie, nullptr, &convertTrie, builder.getAlias()); + + // If the diacritic table was cut short, copy CE32s between the lowered + // limit and the max limit from the root to the tailoring. As of June 2022, + // no collation in CLDR needs this. + for (UChar32 c = diacriticLimit; c < ICU4X_DIACRITIC_LIMIT; ++c) { + if (c == 0x0340 || c == 0x0341 || c == 0x0343 || c == 0x0344) { + // These never occur in NFD data. + continue; + } + uint32_t ce32 = data->getCE32(c); + if (ce32 == icu::Collation::FALLBACK_CE32) { + ce32 = data->base->getCE32(c); + umutablecptrie_set(builder.getAlias(), c, ce32, status); + } + } + + // Ensure that the range covered by the diacritic table isn't duplicated + // in the trie. 
+ for (UChar32 c = ICU4X_DIACRITIC_BASE; c < diacriticLimit; ++c) { + if (umutablecptrie_get(builder.getAlias(), c) != trieDefault) { + umutablecptrie_set(builder.getAlias(), c, trieDefault, status); + } + } + + icu::LocalUCPTriePointer utrie(umutablecptrie_buildImmutable( + builder.getAlias(), + UCPTRIE_TYPE_SMALL, + UCPTRIE_VALUE_BITS_32, + status)); + usrc_writeArray(f, "contexts = [\n ", data->contexts, 16, data->contextsLength, " ", "\n]\n"); + usrc_writeArray(f, "ce32s = [\n ", data->ce32s, 32, data->ce32sLength, " ", "\n]\n"); + usrc_writeArray(f, "ces = [\n ", data->ces, 64, data->cesLength, " ", "\n]\n"); + fprintf(f, "[trie]\n"); + usrc_writeUCPTrie(f, "trie", utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML); + + fclose(f); +} + +static void +writeCollationSpecialPrimariesTOML(const char* outputdir, const char* name, const char* collationType, const icu::CollationData* data, UErrorCode *status) { + FILE* f = openTOML(outputdir, name, collationType, "prim", status); + if (!f) { + return; + } + // printf("writeCollationSpecialPrimariesTOML %s %s\n", name, collationType); + + uint16_t lastPrimaries[4]; + for (int32_t i = 0; i < 4; ++i) { + // getLastPrimaryForGroup subtracts one from a 16-bit value, so we add one + // back to get a value that fits in 16 bits. 
+ lastPrimaries[i] = (uint16_t)((data->getLastPrimaryForGroup(UCOL_REORDER_CODE_FIRST + i) + 1) >> 16); + } + + uint32_t numericPrimary = data->numericPrimary; + if (numericPrimary & 0xFFFFFF) { + printf("Lower 24 bits set in numeric primary"); + *status = U_INTERNAL_PROGRAM_ERROR; + return; + } + + usrc_writeArray(f, "last_primaries = [\n ", lastPrimaries, 16, 4, " ", "\n]\n"); + fprintf(f, "numeric_primary = 0x%X\n", numericPrimary >> 24); + fclose(f); +} + +static void +writeCollationTOML(const char* outputdir, const char* name, const char* collationType, const icu::CollationData* data, const icu::CollationSettings* settings, UErrorCode *status) { + UBool tailored = false; + UBool tailoredDiacritics = false; + UBool lithuanianDotAbove = (uprv_strcmp(name, "lt") == 0); + UBool reordering = false; + UBool isRoot = uprv_strcmp(name, "root") == 0; + UChar32 diacriticLimit = ICU4X_DIACRITIC_LIMIT; + if (!data->base && isRoot) { + diacriticLimit = writeCollationDiacriticsTOML(outputdir, name, collationType, data, status); + if (U_FAILURE(*status)) { + return; + } + writeCollationJamoTOML(outputdir, name, collationType, data, status); + if (U_FAILURE(*status)) { + return; + } + writeCollationSpecialPrimariesTOML(outputdir, name, collationType, data, status); + if (U_FAILURE(*status)) { + return; + } + } else if (data->base && !lithuanianDotAbove) { + for (UChar32 c = ICU4X_DIACRITIC_BASE; c < ICU4X_DIACRITIC_LIMIT; ++c) { + if (c == 0x0340 || c == 0x0341 || c == 0x0343 || c == 0x0344) { + // These never occur in NFD data. + continue; + } + uint32_t ce32 = data->getCE32(c); + if ((ce32 != icu::Collation::FALLBACK_CE32) && (ce32 != data->base->getCE32(c))) { + tailoredDiacritics = true; + diacriticLimit = writeCollationDiacriticsTOML(outputdir, name, collationType, data, status); + if (U_FAILURE(*status)) { + return; + } + break; + } + } + } + + if (settings->hasReordering()) { + reordering = true; + // Note: There are duplicate reorderings. 
Expecting the ICU4X provider + // to take care of deduplication. + writeCollationReorderingTOML(outputdir, name, collationType, settings, status); + if (U_FAILURE(*status)) { + return; + } + } + + // Write collation data if either base is non-null or the name is root. + // Languages that only reorder scripts are otherwise root-like and have + // null base. + if (data->base || isRoot) { + tailored = !isRoot; + writeCollationDataTOML(outputdir, name, collationType, data, (!data->base && isRoot), diacriticLimit, status); + if (U_FAILURE(*status)) { + return; + } + } + + uint32_t maxVariable = (uint32_t)settings->getMaxVariable(); + if (maxVariable >= 4) { + printf("Max variable out of range"); + *status = U_INTERNAL_PROGRAM_ERROR; + return; + } + + uint32_t metadataBits = maxVariable; + if (tailored) { + metadataBits |= (1 << 3); + } + if (tailoredDiacritics) { + metadataBits |= (1 << 4); + } + if (reordering) { + metadataBits |= (1 << 5); + } + if (lithuanianDotAbove) { + metadataBits |= (1 << 6); + } + if ((settings->options & icu::CollationSettings::BACKWARD_SECONDARY) != 0) { + metadataBits |= (1 << 7); + } + if (settings->getAlternateHandling() == UCOL_SHIFTED) { + metadataBits |= (1 << 8); + } + switch (settings->getCaseFirst()) { + case UCOL_OFF: + break; + case UCOL_UPPER_FIRST: + metadataBits |= (1 << 9); + metadataBits |= (1 << 10); + break; + case UCOL_LOWER_FIRST: + metadataBits |= (1 << 9); + break; + default: + *status = U_INTERNAL_PROGRAM_ERROR; + return; + } + + writeCollationMetadataTOML(outputdir, name, collationType, metadataBits, status); +} + +#endif // !UCONFIG_NO_COLLATION + +static TableResource * +addCollation(ParseState* state, TableResource *result, const char *collationType, + uint32_t startline, UErrorCode *status) +{ + // TODO: Use LocalPointer for result, or make caller close it when there is a failure. 
+ struct SResource *member = nullptr; + struct UString *tokenValue; + struct UString comment; + enum ETokenType token; + char subtag[1024]; + UnicodeString rules; + UBool haveRules = false; + UVersionInfo version; + uint32_t line; + + /* '{' . (name resource)* '}' */ + version[0]=0; version[1]=0; version[2]=0; version[3]=0; + + for (;;) + { + ustr_init(&comment); + token = getToken(state, &tokenValue, &comment, &line, status); + + if (token == TOK_CLOSE_BRACE) + { + break; + } + + if (token != TOK_STRING) + { + res_close(result); + *status = U_INVALID_FORMAT_ERROR; + + if (token == TOK_EOF) + { + error(startline, "unterminated table"); + } + else + { + error(line, "Unexpected token %s", tokenNames[token]); + } + + return nullptr; + } + + u_UCharsToChars(tokenValue->fChars, subtag, u_strlen(tokenValue->fChars) + 1); + + if (U_FAILURE(*status)) + { + res_close(result); + return nullptr; + } + + member = parseResource(state, subtag, nullptr, status); + + if (U_FAILURE(*status)) + { + res_close(result); + return nullptr; + } + if (result == nullptr) + { + // Ignore the parsed resources, continue parsing. + } + else if (uprv_strcmp(subtag, "Version") == 0 && member->isString()) + { + StringResource *sr = static_cast<StringResource *>(member); + char ver[40]; + int32_t length = sr->length(); + + if (length >= UPRV_LENGTHOF(ver)) + { + length = UPRV_LENGTHOF(ver) - 1; + } + + sr->fString.extract(0, length, ver, UPRV_LENGTHOF(ver), US_INV); + u_versionFromString(version, ver); + + result->add(member, line, *status); + member = nullptr; + } + else if(uprv_strcmp(subtag, "%%CollationBin")==0) + { + /* discard duplicate %%CollationBin if any*/ + } + else if (uprv_strcmp(subtag, "Sequence") == 0 && member->isString()) + { + StringResource *sr = static_cast<StringResource *>(member); + rules = sr->fString; + haveRules = true; + // Defer building the collator until we have seen + // all sub-elements of the collation table, including the Version. 
+ /* in order to achieve smaller data files, we can direct genrb */ + /* to omit collation rules */ + if(!state->omitCollationRules) { + result->add(member, line, *status); + member = nullptr; + } + } + else // Just copy non-special items. + { + result->add(member, line, *status); + member = nullptr; + } + res_close(member); // TODO: use LocalPointer + if (U_FAILURE(*status)) + { + res_close(result); + return nullptr; + } + } + + if (!haveRules) { return result; } + +#if UCONFIG_NO_COLLATION || UCONFIG_NO_FILE_IO + warning(line, "Not building collation elements because of UCONFIG_NO_COLLATION and/or UCONFIG_NO_FILE_IO, see uconfig.h"); + (void)collationType; +#else + // CLDR ticket #3949, ICU ticket #8082: + // Do not build collation binary data for for-import-only "private" collation rule strings. + if (uprv_strncmp(collationType, "private-", 8) == 0) { + if(isVerbose()) { + printf("Not building %s~%s collation binary\n", state->filename, collationType); + } + return result; + } + + if(!state->makeBinaryCollation) { + if(isVerbose()) { + printf("Not building %s~%s collation binary\n", state->filename, collationType); + } + return result; + } + UErrorCode intStatus = U_ZERO_ERROR; + UParseError parseError; + uprv_memset(&parseError, 0, sizeof(parseError)); + GenrbImporter importer(state->inputdir, state->outputdir); + const icu::CollationTailoring *base = icu::CollationRoot::getRoot(intStatus); + if(U_FAILURE(intStatus)) { + error(line, "failed to load root collator (ucadata.icu) - %s", u_errorName(intStatus)); + res_close(result); + return nullptr; // TODO: use LocalUResourceBundlePointer for result + } + icu::CollationBuilder builder(base, state->icu4xMode, intStatus); + if(state->icu4xMode || (uprv_strncmp(collationType, "search", 6) == 0)) { + builder.disableFastLatin(); // build fast-Latin table unless search collator or ICU4X + } + LocalPointer<icu::CollationTailoring> t( + builder.parseAndBuild(rules, version, &importer, &parseError, intStatus)); + 
if(U_FAILURE(intStatus)) { + const char *reason = builder.getErrorReason(); + if(reason == nullptr) { reason = ""; } + error(line, "CollationBuilder failed at %s~%s/Sequence rule offset %ld: %s %s", + state->filename, collationType, + (long)parseError.offset, u_errorName(intStatus), reason); + if(parseError.preContext[0] != 0 || parseError.postContext[0] != 0) { + // Print pre- and post-context. + char preBuffer[100], postBuffer[100]; + escape(parseError.preContext, preBuffer, sizeof(preBuffer)); + escape(parseError.postContext, postBuffer, sizeof(postBuffer)); + error(line, " error context: \"...%s\" ! \"%s...\"", preBuffer, postBuffer); + } + if(isStrict() || t.isNull()) { + *status = intStatus; + res_close(result); + return nullptr; + } + } + if (state->icu4xMode) { + char *nameWithoutSuffix = static_cast<char *>(uprv_malloc(uprv_strlen(state->filename) + 1)); + if (nameWithoutSuffix == nullptr) { + *status = U_MEMORY_ALLOCATION_ERROR; + res_close(result); + return nullptr; + } + uprv_strcpy(nameWithoutSuffix, state->filename); + *uprv_strrchr(nameWithoutSuffix, '.') = 0; + + writeCollationTOML(state->outputdir, nameWithoutSuffix, collationType, t->data, t->settings, status); + uprv_free(nameWithoutSuffix); + } + icu::LocalMemory<uint8_t> buffer; + int32_t capacity = 100000; + uint8_t *dest = buffer.allocateInsteadAndCopy(capacity); + if(dest == nullptr) { + fprintf(stderr, "memory allocation (%ld bytes) for file contents failed\n", + (long)capacity); + *status = U_MEMORY_ALLOCATION_ERROR; + res_close(result); + return nullptr; + } + int32_t indexes[icu::CollationDataReader::IX_TOTAL_SIZE + 1]; + int32_t totalSize = icu::CollationDataWriter::writeTailoring( + *t, *t->settings, indexes, dest, capacity, intStatus); + if(intStatus == U_BUFFER_OVERFLOW_ERROR) { + intStatus = U_ZERO_ERROR; + capacity = totalSize; + dest = buffer.allocateInsteadAndCopy(capacity); + if(dest == nullptr) { + fprintf(stderr, "memory allocation (%ld bytes) for file contents failed\n", + 
(long)capacity); + *status = U_MEMORY_ALLOCATION_ERROR; + res_close(result); + return nullptr; + } + totalSize = icu::CollationDataWriter::writeTailoring( + *t, *t->settings, indexes, dest, capacity, intStatus); + } + if(U_FAILURE(intStatus)) { + fprintf(stderr, "CollationDataWriter::writeTailoring() failed: %s\n", + u_errorName(intStatus)); + res_close(result); + return nullptr; + } + if(isVerbose()) { + printf("%s~%s collation tailoring part sizes:\n", state->filename, collationType); + icu::CollationInfo::printSizes(totalSize, indexes); + if(t->settings->hasReordering()) { + printf("%s~%s collation reordering ranges:\n", state->filename, collationType); + icu::CollationInfo::printReorderRanges( + *t->data, t->settings->reorderCodes, t->settings->reorderCodesLength); + } +#if 0 // debugging output + } else { + printf("%s~%s collation tailoring part sizes:\n", state->filename, collationType); + icu::CollationInfo::printSizes(totalSize, indexes); +#endif + } + struct SResource *collationBin = bin_open(state->bundle, "%%CollationBin", totalSize, dest, nullptr, nullptr, status); + result->add(collationBin, line, *status); + if (U_FAILURE(*status)) { + res_close(result); + return nullptr; + } +#endif + return result; +} + +static UBool +keepCollationType(const char * /*type*/) { + return true; +} + +static struct SResource * +parseCollationElements(ParseState* state, char *tag, uint32_t startline, UBool newCollation, UErrorCode *status) +{ + TableResource *result = nullptr; + struct SResource *member = nullptr; + struct UString *tokenValue; + struct UString comment; + enum ETokenType token; + char subtag[1024], typeKeyword[1024]; + uint32_t line; + + result = table_open(state->bundle, tag, nullptr, status); + + if (result == nullptr || U_FAILURE(*status)) + { + return nullptr; + } + if(isVerbose()){ + printf(" collation elements %s at line %i \n", (tag == nullptr) ? 
"(null)" : tag, (int)startline); + } + if(!newCollation) { + return addCollation(state, result, "(no type)", startline, status); + } + else { + for(;;) { + ustr_init(&comment); + token = getToken(state, &tokenValue, &comment, &line, status); + + if (token == TOK_CLOSE_BRACE) + { + return result; + } + + if (token != TOK_STRING) + { + res_close(result); + *status = U_INVALID_FORMAT_ERROR; + + if (token == TOK_EOF) + { + error(startline, "unterminated table"); + } + else + { + error(line, "Unexpected token %s", tokenNames[token]); + } + + return nullptr; + } + + u_UCharsToChars(tokenValue->fChars, subtag, u_strlen(tokenValue->fChars) + 1); + + if (U_FAILURE(*status)) + { + res_close(result); + return nullptr; + } + + if (uprv_strcmp(subtag, "default") == 0) + { + member = parseResource(state, subtag, nullptr, status); + + if (U_FAILURE(*status)) + { + res_close(result); + return nullptr; + } + + result->add(member, line, *status); + } + else + { + token = peekToken(state, 0, &tokenValue, &line, &comment, status); + /* this probably needs to be refactored or recursively use the parser */ + /* first we assume that our collation table won't have the explicit type */ + /* then, we cannot handle aliases */ + if(token == TOK_OPEN_BRACE) { + token = getToken(state, &tokenValue, &comment, &line, status); + TableResource *collationRes; + if (keepCollationType(subtag)) { + collationRes = table_open(state->bundle, subtag, nullptr, status); + } else { + collationRes = nullptr; + } + // need to parse the collation data regardless + collationRes = addCollation(state, collationRes, subtag, startline, status); + if (collationRes != nullptr) { + result->add(collationRes, startline, *status); + } + } else if(token == TOK_COLON) { /* right now, we'll just try to see if we have aliases */ + /* we could have a table too */ + token = peekToken(state, 1, &tokenValue, &line, &comment, status); + u_UCharsToChars(tokenValue->fChars, typeKeyword, u_strlen(tokenValue->fChars) + 1); + 
if(uprv_strcmp(typeKeyword, "alias") == 0) { + member = parseResource(state, subtag, nullptr, status); + if (U_FAILURE(*status)) + { + res_close(result); + return nullptr; + } + + result->add(member, line, *status); + } else { + res_close(result); + *status = U_INVALID_FORMAT_ERROR; + return nullptr; + } + } else { + res_close(result); + *status = U_INVALID_FORMAT_ERROR; + return nullptr; + } + } + + /*member = string_open(bundle, subtag, tokenValue->fChars, tokenValue->fLength, status);*/ + + /*expect(TOK_CLOSE_BRACE, nullptr, nullptr, status);*/ + + if (U_FAILURE(*status)) + { + res_close(result); + return nullptr; + } + } + } +} + +/* Necessary, because CollationElements requires the bundle->fRoot member to be present which, + if this weren't special-cased, wouldn't be set until the entire file had been processed. */ +static struct SResource * +realParseTable(ParseState* state, TableResource *table, char *tag, uint32_t startline, UErrorCode *status) +{ + struct SResource *member = nullptr; + struct UString *tokenValue=nullptr; + struct UString comment; + enum ETokenType token; + char subtag[1024]; + uint32_t line; + UBool readToken = false; + + /* '{' . (name resource)* '}' */ + + if(isVerbose()){ + printf(" parsing table %s at line %i \n", (tag == nullptr) ? 
"(null)" : tag, (int)startline); + } + for (;;) + { + ustr_init(&comment); + token = getToken(state, &tokenValue, &comment, &line, status); + + if (token == TOK_CLOSE_BRACE) + { + if (!readToken && isVerbose()) { + warning(startline, "Encountered empty table"); + } + return table; + } + + if (token != TOK_STRING) + { + *status = U_INVALID_FORMAT_ERROR; + + if (token == TOK_EOF) + { + error(startline, "unterminated table"); + } + else + { + error(line, "unexpected token %s", tokenNames[token]); + } + + return nullptr; + } + + if(uprv_isInvariantUString(tokenValue->fChars, -1)) { + u_UCharsToChars(tokenValue->fChars, subtag, u_strlen(tokenValue->fChars) + 1); + } else { + *status = U_INVALID_FORMAT_ERROR; + error(line, "invariant characters required for table keys"); + return nullptr; + } + + if (U_FAILURE(*status)) + { + error(line, "parse error. Stopped parsing tokens with %s", u_errorName(*status)); + return nullptr; + } + + member = parseResource(state, subtag, &comment, status); + + if (member == nullptr || U_FAILURE(*status)) + { + error(line, "parse error. Stopped parsing resource with %s", u_errorName(*status)); + return nullptr; + } + + table->add(member, line, *status); + + if (U_FAILURE(*status)) + { + error(line, "parse error. Stopped parsing table with %s", u_errorName(*status)); + return nullptr; + } + readToken = true; + ustr_deinit(&comment); + } + + /* not reached */ + /* A compiler warning will appear if all paths don't contain a return statement. 
*/ +/* *status = U_INTERNAL_PROGRAM_ERROR; + return nullptr;*/ +} + +static struct SResource * +parseTable(ParseState* state, char *tag, uint32_t startline, const struct UString *comment, UErrorCode *status) +{ + if (tag != nullptr && uprv_strcmp(tag, "CollationElements") == 0) + { + return parseCollationElements(state, tag, startline, false, status); + } + if (tag != nullptr && uprv_strcmp(tag, "collations") == 0) + { + return parseCollationElements(state, tag, startline, true, status); + } + if(isVerbose()){ + printf(" table %s at line %i \n", (tag == nullptr) ? "(null)" : tag, (int)startline); + } + + TableResource *result = table_open(state->bundle, tag, comment, status); + + if (result == nullptr || U_FAILURE(*status)) + { + return nullptr; + } + return realParseTable(state, result, tag, startline, status); +} + +static struct SResource * +parseArray(ParseState* state, char *tag, uint32_t startline, const struct UString *comment, UErrorCode *status) +{ + struct SResource *member = nullptr; + struct UString *tokenValue; + struct UString memberComments; + enum ETokenType token; + UBool readToken = false; + + ArrayResource *result = array_open(state->bundle, tag, comment, status); + + if (result == nullptr || U_FAILURE(*status)) + { + return nullptr; + } + if(isVerbose()){ + printf(" array %s at line %i \n", (tag == nullptr) ? "(null)" : tag, (int)startline); + } + + ustr_init(&memberComments); + + /* '{' . 
resource [','] '}' */ + for (;;) + { + /* reset length */ + ustr_setlen(&memberComments, 0, status); + + /* check for end of array, but don't consume next token unless it really is the end */ + token = peekToken(state, 0, &tokenValue, nullptr, &memberComments, status); + + + if (token == TOK_CLOSE_BRACE) + { + getToken(state, nullptr, nullptr, nullptr, status); + if (!readToken) { + warning(startline, "Encountered empty array"); + } + break; + } + + if (token == TOK_EOF) + { + res_close(result); + *status = U_INVALID_FORMAT_ERROR; + error(startline, "unterminated array"); + return nullptr; + } + + /* string arrays are a special case */ + if (token == TOK_STRING) + { + getToken(state, &tokenValue, &memberComments, nullptr, status); + member = string_open(state->bundle, nullptr, tokenValue->fChars, tokenValue->fLength, &memberComments, status); + } + else + { + member = parseResource(state, nullptr, &memberComments, status); + } + + if (member == nullptr || U_FAILURE(*status)) + { + res_close(result); + return nullptr; + } + + result->add(member); + + /* eat optional comma if present */ + token = peekToken(state, 0, nullptr, nullptr, nullptr, status); + + if (token == TOK_COMMA) + { + getToken(state, nullptr, nullptr, nullptr, status); + } + + if (U_FAILURE(*status)) + { + res_close(result); + return nullptr; + } + readToken = true; + } + + ustr_deinit(&memberComments); + return result; +} + +static struct SResource * +parseIntVector(ParseState* state, char *tag, uint32_t startline, const struct UString *comment, UErrorCode *status) +{ + enum ETokenType token; + char *string; + int32_t value; + UBool readToken = false; + char *stopstring; + struct UString memberComments; + + IntVectorResource *result = intvector_open(state->bundle, tag, comment, status); + + if (result == nullptr || U_FAILURE(*status)) + { + return nullptr; + } + + if(isVerbose()){ + printf(" vector %s at line %i \n", (tag == nullptr) ? 
"(null)" : tag, (int)startline); + } + ustr_init(&memberComments); + /* '{' . string [','] '}' */ + for (;;) + { + ustr_setlen(&memberComments, 0, status); + + /* check for end of array, but don't consume next token unless it really is the end */ + token = peekToken(state, 0, nullptr, nullptr,&memberComments, status); + + if (token == TOK_CLOSE_BRACE) + { + /* it's the end, consume the close brace */ + getToken(state, nullptr, nullptr, nullptr, status); + if (!readToken) { + warning(startline, "Encountered empty int vector"); + } + ustr_deinit(&memberComments); + return result; + } + + int32_t stringLength; + string = getInvariantString(state, nullptr, nullptr, stringLength, status); + + if (U_FAILURE(*status)) + { + res_close(result); + return nullptr; + } + + /* For handling illegal char in the Intvector */ + value = uprv_strtoul(string, &stopstring, 0);/* make intvector support decimal,hexdigit,octal digit ranging from -2^31-2^32-1*/ + int32_t len = (int32_t)(stopstring-string); + + if(len==stringLength) + { + result->add(value, *status); + uprv_free(string); + token = peekToken(state, 0, nullptr, nullptr, nullptr, status); + } + else + { + uprv_free(string); + *status=U_INVALID_CHAR_FOUND; + } + + if (U_FAILURE(*status)) + { + res_close(result); + return nullptr; + } + + /* the comma is optional (even though it is required to prevent the reader from concatenating + consecutive entries) so that a missing comma on the last entry isn't an error */ + if (token == TOK_COMMA) + { + getToken(state, nullptr, nullptr, nullptr, status); + } + readToken = true; + } + + /* not reached */ + /* A compiler warning will appear if all paths don't contain a return statement. 
*/ +/* intvector_close(result, status); + *status = U_INTERNAL_PROGRAM_ERROR; + return nullptr;*/ +} + +static struct SResource * +parseBinary(ParseState* state, char *tag, uint32_t startline, const struct UString *comment, UErrorCode *status) +{ + uint32_t line; + int32_t stringLength; + LocalMemory<char> string(getInvariantString(state, &line, nullptr, stringLength, status)); + if (string.isNull() || U_FAILURE(*status)) + { + return nullptr; + } + + expect(state, TOK_CLOSE_BRACE, nullptr, nullptr, nullptr, status); + if (U_FAILURE(*status)) + { + return nullptr; + } + + if(isVerbose()){ + printf(" binary %s at line %i \n", (tag == nullptr) ? "(null)" : tag, (int)startline); + } + + LocalMemory<uint8_t> value; + int32_t count = 0; + if (stringLength > 0 && value.allocateInsteadAndCopy(stringLength) == nullptr) + { + *status = U_MEMORY_ALLOCATION_ERROR; + return nullptr; + } + + char toConv[3] = {'\0', '\0', '\0'}; + for (int32_t i = 0; i < stringLength;) + { + // Skip spaces (which may have been line endings). 
+ char c0 = string[i++]; + if (c0 == ' ') { continue; } + if (i == stringLength) { + *status=U_INVALID_CHAR_FOUND; + error(line, "Encountered invalid binary value (odd number of hex digits)"); + return nullptr; + } + toConv[0] = c0; + toConv[1] = string[i++]; + + char *stopstring; + value[count++] = (uint8_t) uprv_strtoul(toConv, &stopstring, 16); + uint32_t len=(uint32_t)(stopstring-toConv); + + if(len!=2) + { + *status=U_INVALID_CHAR_FOUND; + error(line, "Encountered invalid binary value (not all pairs of hex digits)"); + return nullptr; + } + } + + if (count == 0) { + warning(startline, "Encountered empty binary value"); + return bin_open(state->bundle, tag, 0, nullptr, "", comment, status); + } else { + return bin_open(state->bundle, tag, count, value.getAlias(), nullptr, comment, status); + } +} + +static struct SResource * +parseInteger(ParseState* state, char *tag, uint32_t startline, const struct UString *comment, UErrorCode *status) +{ + struct SResource *result = nullptr; + int32_t value; + char *string; + char *stopstring; + + int32_t stringLength; + string = getInvariantString(state, nullptr, nullptr, stringLength, status); + + if (string == nullptr || U_FAILURE(*status)) + { + return nullptr; + } + + expect(state, TOK_CLOSE_BRACE, nullptr, nullptr, nullptr, status); + + if (U_FAILURE(*status)) + { + uprv_free(string); + return nullptr; + } + + if(isVerbose()){ + printf(" integer %s at line %i \n", (tag == nullptr) ? "(null)" : tag, (int)startline); + } + + if (stringLength == 0) + { + warning(startline, "Encountered empty integer. 
Default value is 0."); + } + + /* Allow integer support for hexdecimal, octal digit and decimal*/ + /* and handle illegal char in the integer*/ + value = uprv_strtoul(string, &stopstring, 0); + int32_t len = (int32_t)(stopstring-string); + if(len==stringLength) + { + result = int_open(state->bundle, tag, value, comment, status); + } + else + { + *status=U_INVALID_CHAR_FOUND; + } + uprv_free(string); + + return result; +} + +static struct SResource * +parseImport(ParseState* state, char *tag, uint32_t startline, const struct UString* comment, UErrorCode *status) +{ + uint32_t line; + int32_t stringLength; + LocalMemory<char> filename(getInvariantString(state, &line, nullptr, stringLength, status)); + if (U_FAILURE(*status)) + { + return nullptr; + } + + expect(state, TOK_CLOSE_BRACE, nullptr, nullptr, nullptr, status); + + if (U_FAILURE(*status)) + { + return nullptr; + } + + if(isVerbose()){ + printf(" import %s at line %i \n", (tag == nullptr) ? "(null)" : tag, (int)startline); + } + + /* Open the input file for reading */ + CharString fullname; + if (state->inputdir != nullptr) { + fullname.append(state->inputdir, *status); + } + fullname.appendPathPart(filename.getAlias(), *status); + if (U_FAILURE(*status)) { + return nullptr; + } + + FileStream *file = T_FileStream_open(fullname.data(), "rb"); + if (file == nullptr) + { + error(line, "couldn't open input file %s", filename.getAlias()); + *status = U_FILE_ACCESS_ERROR; + return nullptr; + } + + int32_t len = T_FileStream_size(file); + LocalMemory<uint8_t> data; + if(data.allocateInsteadAndCopy(len) == nullptr) + { + *status = U_MEMORY_ALLOCATION_ERROR; + T_FileStream_close (file); + return nullptr; + } + + /* int32_t numRead = */ T_FileStream_read(file, data.getAlias(), len); + T_FileStream_close (file); + + return bin_open(state->bundle, tag, len, data.getAlias(), fullname.data(), comment, status); +} + +static struct SResource * +parseInclude(ParseState* state, char *tag, uint32_t startline, const struct 
UString* comment, UErrorCode *status) +{ + struct SResource *result; + int32_t len=0; + char *filename; + uint32_t line; + char16_t *pTarget = nullptr; + + UCHARBUF *ucbuf; + char *fullname = nullptr; + const char* cp = nullptr; + const char16_t* uBuffer = nullptr; + + int32_t stringLength; + filename = getInvariantString(state, &line, nullptr, stringLength, status); + + if (U_FAILURE(*status)) + { + return nullptr; + } + + expect(state, TOK_CLOSE_BRACE, nullptr, nullptr, nullptr, status); + + if (U_FAILURE(*status)) + { + uprv_free(filename); + return nullptr; + } + + if(isVerbose()){ + printf(" include %s at line %i \n", (tag == nullptr) ? "(null)" : tag, (int)startline); + } + + fullname = (char *) uprv_malloc(state->inputdirLength + stringLength + 2); + /* test for nullptr */ + if(fullname == nullptr) + { + *status = U_MEMORY_ALLOCATION_ERROR; + uprv_free(filename); + return nullptr; + } + + if(state->inputdir!=nullptr){ + if (state->inputdir[state->inputdirLength - 1] != U_FILE_SEP_CHAR) + { + + uprv_strcpy(fullname, state->inputdir); + + fullname[state->inputdirLength] = U_FILE_SEP_CHAR; + fullname[state->inputdirLength + 1] = '\0'; + + uprv_strcat(fullname, filename); + } + else + { + uprv_strcpy(fullname, state->inputdir); + uprv_strcat(fullname, filename); + } + }else{ + uprv_strcpy(fullname,filename); + } + + ucbuf = ucbuf_open(fullname, &cp,getShowWarning(),false,status); + + if (U_FAILURE(*status)) { + error(line, "couldn't open input file %s\n", filename); + return nullptr; + } + + uBuffer = ucbuf_getBuffer(ucbuf,&len,status); + result = string_open(state->bundle, tag, uBuffer, len, comment, status); + + ucbuf_close(ucbuf); + + uprv_free(pTarget); + + uprv_free(filename); + uprv_free(fullname); + + return result; +} + + + + + +U_STRING_DECL(k_type_string, "string", 6); +U_STRING_DECL(k_type_binary, "binary", 6); +U_STRING_DECL(k_type_bin, "bin", 3); +U_STRING_DECL(k_type_table, "table", 5); +U_STRING_DECL(k_type_table_no_fallback, "table(nofallback)", 
17); +U_STRING_DECL(k_type_int, "int", 3); +U_STRING_DECL(k_type_integer, "integer", 7); +U_STRING_DECL(k_type_array, "array", 5); +U_STRING_DECL(k_type_alias, "alias", 5); +U_STRING_DECL(k_type_intvector, "intvector", 9); +U_STRING_DECL(k_type_import, "import", 6); +U_STRING_DECL(k_type_include, "include", 7); + +/* Various non-standard processing plugins that create one or more special resources. */ +U_STRING_DECL(k_type_plugin_uca_rules, "process(uca_rules)", 18); +U_STRING_DECL(k_type_plugin_collation, "process(collation)", 18); +U_STRING_DECL(k_type_plugin_transliterator, "process(transliterator)", 23); +U_STRING_DECL(k_type_plugin_dependency, "process(dependency)", 19); + +typedef enum EResourceType +{ + RESTYPE_UNKNOWN, + RESTYPE_STRING, + RESTYPE_BINARY, + RESTYPE_TABLE, + RESTYPE_TABLE_NO_FALLBACK, + RESTYPE_INTEGER, + RESTYPE_ARRAY, + RESTYPE_ALIAS, + RESTYPE_INTVECTOR, + RESTYPE_IMPORT, + RESTYPE_INCLUDE, + RESTYPE_PROCESS_UCA_RULES, + RESTYPE_PROCESS_COLLATION, + RESTYPE_PROCESS_TRANSLITERATOR, + RESTYPE_PROCESS_DEPENDENCY, + RESTYPE_RESERVED +} EResourceType; + +static struct { + const char *nameChars; /* only used for debugging */ + const char16_t *nameUChars; + ParseResourceFunction *parseFunction; +} gResourceTypes[] = { + {"Unknown", nullptr, nullptr}, + {"string", k_type_string, parseString}, + {"binary", k_type_binary, parseBinary}, + {"table", k_type_table, parseTable}, + {"table(nofallback)", k_type_table_no_fallback, nullptr}, /* parseFunction will never be called */ + {"integer", k_type_integer, parseInteger}, + {"array", k_type_array, parseArray}, + {"alias", k_type_alias, parseAlias}, + {"intvector", k_type_intvector, parseIntVector}, + {"import", k_type_import, parseImport}, + {"include", k_type_include, parseInclude}, + {"process(uca_rules)", k_type_plugin_uca_rules, parseUCARules}, + {"process(collation)", k_type_plugin_collation, nullptr /* not implemented yet */}, + {"process(transliterator)", k_type_plugin_transliterator, 
parseTransliterator}, + {"process(dependency)", k_type_plugin_dependency, parseDependency}, + {"reserved", nullptr, nullptr} +}; + +void initParser() +{ + U_STRING_INIT(k_type_string, "string", 6); + U_STRING_INIT(k_type_binary, "binary", 6); + U_STRING_INIT(k_type_bin, "bin", 3); + U_STRING_INIT(k_type_table, "table", 5); + U_STRING_INIT(k_type_table_no_fallback, "table(nofallback)", 17); + U_STRING_INIT(k_type_int, "int", 3); + U_STRING_INIT(k_type_integer, "integer", 7); + U_STRING_INIT(k_type_array, "array", 5); + U_STRING_INIT(k_type_alias, "alias", 5); + U_STRING_INIT(k_type_intvector, "intvector", 9); + U_STRING_INIT(k_type_import, "import", 6); + U_STRING_INIT(k_type_include, "include", 7); + + U_STRING_INIT(k_type_plugin_uca_rules, "process(uca_rules)", 18); + U_STRING_INIT(k_type_plugin_collation, "process(collation)", 18); + U_STRING_INIT(k_type_plugin_transliterator, "process(transliterator)", 23); + U_STRING_INIT(k_type_plugin_dependency, "process(dependency)", 19); +} + +static inline UBool isTable(enum EResourceType type) { + return (UBool)(type==RESTYPE_TABLE || type==RESTYPE_TABLE_NO_FALLBACK); +} + +static enum EResourceType +parseResourceType(ParseState* state, UErrorCode *status) +{ + struct UString *tokenValue; + struct UString comment; + enum EResourceType result = RESTYPE_UNKNOWN; + uint32_t line=0; + ustr_init(&comment); + expect(state, TOK_STRING, &tokenValue, &comment, &line, status); + + if (U_FAILURE(*status)) + { + return RESTYPE_UNKNOWN; + } + + *status = U_ZERO_ERROR; + + /* Search for normal types */ + result=RESTYPE_UNKNOWN; + while ((result=(EResourceType)(result+1)) < RESTYPE_RESERVED) { + if (u_strcmp(tokenValue->fChars, gResourceTypes[result].nameUChars) == 0) { + break; + } + } + /* Now search for the aliases */ + if (u_strcmp(tokenValue->fChars, k_type_int) == 0) { + result = RESTYPE_INTEGER; + } + else if (u_strcmp(tokenValue->fChars, k_type_bin) == 0) { + result = RESTYPE_BINARY; + } + else if (result == RESTYPE_RESERVED) { 
+ char tokenBuffer[1024]; + u_austrncpy(tokenBuffer, tokenValue->fChars, sizeof(tokenBuffer)); + tokenBuffer[sizeof(tokenBuffer) - 1] = 0; + *status = U_INVALID_FORMAT_ERROR; + error(line, "unknown resource type '%s'", tokenBuffer); + } + + return result; +} + +/* parse a non-top-level resource */ +static struct SResource * +parseResource(ParseState* state, char *tag, const struct UString *comment, UErrorCode *status) +{ + enum ETokenType token; + enum EResourceType resType = RESTYPE_UNKNOWN; + ParseResourceFunction *parseFunction = nullptr; + struct UString *tokenValue; + uint32_t startline; + uint32_t line; + + + token = getToken(state, &tokenValue, nullptr, &startline, status); + + if(isVerbose()){ + printf(" resource %s at line %i \n", (tag == nullptr) ? "(null)" : tag, (int)startline); + } + + /* name . [ ':' type ] '{' resource '}' */ + /* This function parses from the colon onwards. If the colon is present, parse the + type then try to parse a resource of that type. If there is no explicit type, + work it out using the lookahead tokens. */ + switch (token) + { + case TOK_EOF: + *status = U_INVALID_FORMAT_ERROR; + error(startline, "Unexpected EOF encountered"); + return nullptr; + + case TOK_ERROR: + *status = U_INVALID_FORMAT_ERROR; + return nullptr; + + case TOK_COLON: + resType = parseResourceType(state, status); + expect(state, TOK_OPEN_BRACE, &tokenValue, nullptr, &startline, status); + + if (U_FAILURE(*status)) + { + return nullptr; + } + + break; + + case TOK_OPEN_BRACE: + break; + + default: + *status = U_INVALID_FORMAT_ERROR; + error(startline, "syntax error while reading a resource, expected '{' or ':'"); + return nullptr; + } + + + if (resType == RESTYPE_UNKNOWN) + { + /* No explicit type, so try to work it out. At this point, we've read the first '{'. 
+ We could have any of the following: + { { => array (nested) + { :/} => array + { string , => string array + + { string { => table + + { string :/{ => table + { string } => string + */ + + token = peekToken(state, 0, nullptr, &line, nullptr,status); + + if (U_FAILURE(*status)) + { + return nullptr; + } + + if (token == TOK_OPEN_BRACE || token == TOK_COLON ||token ==TOK_CLOSE_BRACE ) + { + resType = RESTYPE_ARRAY; + } + else if (token == TOK_STRING) + { + token = peekToken(state, 1, nullptr, &line, nullptr, status); + + if (U_FAILURE(*status)) + { + return nullptr; + } + + switch (token) + { + case TOK_COMMA: resType = RESTYPE_ARRAY; break; + case TOK_OPEN_BRACE: resType = RESTYPE_TABLE; break; + case TOK_CLOSE_BRACE: resType = RESTYPE_STRING; break; + case TOK_COLON: resType = RESTYPE_TABLE; break; + default: + *status = U_INVALID_FORMAT_ERROR; + error(line, "Unexpected token after string, expected ',', '{' or '}'"); + return nullptr; + } + } + else + { + *status = U_INVALID_FORMAT_ERROR; + error(line, "Unexpected token after '{'"); + return nullptr; + } + + /* printf("Type guessed as %s\n", resourceNames[resType]); */ + } else if(resType == RESTYPE_TABLE_NO_FALLBACK) { + *status = U_INVALID_FORMAT_ERROR; + error(startline, "error: %s resource type not valid except on top bundle level", gResourceTypes[resType].nameChars); + return nullptr; + } + + + /* We should now know what we need to parse next, so call the appropriate parser + function and return. 
*/ + parseFunction = gResourceTypes[resType].parseFunction; + if (parseFunction != nullptr) { + return parseFunction(state, tag, startline, comment, status); + } + else { + *status = U_INTERNAL_PROGRAM_ERROR; + error(startline, "internal error: %s resource type found and not handled", gResourceTypes[resType].nameChars); + } + + return nullptr; +} + +/* parse the top-level resource */ +struct SRBRoot * +parse(UCHARBUF *buf, const char *inputDir, const char *outputDir, const char *filename, + UBool makeBinaryCollation, UBool omitCollationRules, UBool icu4xMode, UErrorCode *status) +{ + struct UString *tokenValue; + struct UString comment; + uint32_t line; + enum EResourceType bundleType; + enum ETokenType token; + ParseState state; + uint32_t i; + + + for (i = 0; i < MAX_LOOKAHEAD + 1; i++) + { + ustr_init(&state.lookahead[i].value); + ustr_init(&state.lookahead[i].comment); + } + + initLookahead(&state, buf, status); + + state.inputdir = inputDir; + state.inputdirLength = (state.inputdir != nullptr) ? (uint32_t)uprv_strlen(state.inputdir) : 0; + state.outputdir = outputDir; + state.outputdirLength = (state.outputdir != nullptr) ? 
(uint32_t)uprv_strlen(state.outputdir) : 0; + state.filename = filename; + state.makeBinaryCollation = makeBinaryCollation; + state.omitCollationRules = omitCollationRules; + state.icu4xMode = icu4xMode; + + ustr_init(&comment); + expect(&state, TOK_STRING, &tokenValue, &comment, nullptr, status); + + state.bundle = new SRBRoot(&comment, false, *status); + + if (state.bundle == nullptr || U_FAILURE(*status)) + { + delete state.bundle; + + return nullptr; + } + + + state.bundle->setLocale(tokenValue->fChars, *status); + + /* The following code is to make Empty bundle work no matter with :table specifer or not */ + token = getToken(&state, nullptr, nullptr, &line, status); + if(token==TOK_COLON) { + *status=U_ZERO_ERROR; + bundleType=parseResourceType(&state, status); + + if(isTable(bundleType)) + { + expect(&state, TOK_OPEN_BRACE, nullptr, nullptr, &line, status); + } + else + { + *status=U_PARSE_ERROR; + error(line, "parse error. Stopped parsing with %s", u_errorName(*status)); + } + } + else + { + /* not a colon */ + if(token==TOK_OPEN_BRACE) + { + *status=U_ZERO_ERROR; + bundleType=RESTYPE_TABLE; + } + else + { + /* neither colon nor open brace */ + *status=U_PARSE_ERROR; + bundleType=RESTYPE_UNKNOWN; + error(line, "parse error, did not find open-brace '{' or colon ':', stopped with %s", u_errorName(*status)); + } + } + + if (U_FAILURE(*status)) + { + delete state.bundle; + return nullptr; + } + + if(bundleType==RESTYPE_TABLE_NO_FALLBACK) { + /* + * Parse a top-level table with the table(nofallback) declaration. + * This is the same as a regular table, but also sets the + * URES_ATT_NO_FALLBACK flag in indexes[URES_INDEX_ATTRIBUTES] . 
+ */ + state.bundle->fNoFallback=true; + } + /* top-level tables need not handle special table names like "collations" */ + assert(!state.bundle->fIsPoolBundle); + assert(state.bundle->fRoot->fType == URES_TABLE); + TableResource *rootTable = static_cast<TableResource *>(state.bundle->fRoot); + realParseTable(&state, rootTable, nullptr, line, status); + if(dependencyArray!=nullptr){ + rootTable->add(dependencyArray, 0, *status); + dependencyArray = nullptr; + } + if (U_FAILURE(*status)) + { + delete state.bundle; + res_close(dependencyArray); + return nullptr; + } + + if (getToken(&state, nullptr, nullptr, &line, status) != TOK_EOF) + { + warning(line, "extraneous text after resource bundle (perhaps unmatched braces)"); + if(isStrict()){ + *status = U_INVALID_FORMAT_ERROR; + return nullptr; + } + } + + cleanupLookahead(&state); + ustr_deinit(&comment); + return state.bundle; +} diff --git a/intl/icu/source/tools/genrb/parse.h b/intl/icu/source/tools/genrb/parse.h new file mode 100644 index 0000000000..fa90ede9d2 --- /dev/null +++ b/intl/icu/source/tools/genrb/parse.h @@ -0,0 +1,38 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 1998-2014, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* +* File parse.h +* +* Modification History: +* +* Date Name Description +* 05/26/99 stephen Creation. 
*******************************************************************************
*/

#ifndef PARSE_H
#define PARSE_H 1

#include "unicode/utypes.h"
#include "filestrm.h"
#include "ucbuf.h"

U_CDECL_BEGIN
/*
 * One-time parser initialization.
 * Fills in the UChar copies of the resource type names ("string", "table",
 * "process(uca_rules)", ...) that the parser compares type keywords against.
 * Call it before the first call to parse().
 */
void initParser();

/*
 * Parse a ResourceBundle text file.
 *
 * buf                  source text to tokenize
 * inputDir             directory used to resolve import/include file names; may be null
 * outputDir            stored in the parse state; may be null
 * filename             stored in the parse state
 * makeBinaryCollation,
 * omitCollationRules,
 * icu4xMode            option flags copied into the parse state
 * status               in/out ICU error code
 *
 * Returns the bundle built from the top-level table, or a null pointer with
 * *status set to a failure code when the text cannot be parsed. Text after the
 * closing brace only produces a warning unless strict mode is enabled, in which
 * case it is reported as U_INVALID_FORMAT_ERROR.
 */
struct SRBRoot* parse(UCHARBUF *buf, const char* inputDir, const char* outputDir,
                      const char *filename,
                      UBool makeBinaryCollation, UBool omitCollationRules, UBool icu4xMode, UErrorCode *status);

U_CDECL_END

#endif
diff --git a/intl/icu/source/tools/genrb/prscmnts.cpp b/intl/icu/source/tools/genrb/prscmnts.cpp new file mode 100644 index 0000000000..ea55352b41 --- /dev/null +++ b/intl/icu/source/tools/genrb/prscmnts.cpp @@ -0,0 +1,248 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
 *******************************************************************************
 *   Copyright (C) 2003-2014, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *******************************************************************************
 *
 * File prscmnts.cpp
 *
 * Modification History:
 *
 *   Date          Name        Description
 *   08/22/2003    ram         Creation.
 *******************************************************************************
 */

// Safer use of UnicodeString.
#ifndef UNISTR_FROM_CHAR_EXPLICIT
#   define UNISTR_FROM_CHAR_EXPLICIT explicit
#endif

// Less important, but still a good idea.
+#ifndef UNISTR_FROM_STRING_EXPLICIT +# define UNISTR_FROM_STRING_EXPLICIT explicit +#endif + +#include "unicode/regex.h" +#include "unicode/unistr.h" +#include "unicode/parseerr.h" +#include "prscmnts.h" +#include <stdio.h> +#include <stdlib.h> + +U_NAMESPACE_USE + +#if UCONFIG_NO_REGULAR_EXPRESSIONS==0 /* donot compile when RegularExpressions not available */ + +#define MAX_SPLIT_STRINGS 20 + +const char *patternStrings[UPC_LIMIT]={ + "^translate\\s*(.*)", + "^note\\s*(.*)" +}; + +U_CFUNC int32_t +removeText(char16_t *source, int32_t srcLen, + UnicodeString patString,uint32_t options, + UnicodeString replaceText, UErrorCode *status){ + + if(status == nullptr || U_FAILURE(*status)){ + return 0; + } + + UnicodeString src(source, srcLen); + + RegexMatcher myMatcher(patString, src, options, *status); + if(U_FAILURE(*status)){ + return 0; + } + UnicodeString dest; + + + dest = myMatcher.replaceAll(replaceText,*status); + + + return dest.extract(source, srcLen, *status); + +} +U_CFUNC int32_t +trim(char16_t *src, int32_t srcLen, UErrorCode *status){ + srcLen = removeText(src, srcLen, UnicodeString("^[ \\r\\n]+ "), 0, UnicodeString(), status); // remove leading new lines + srcLen = removeText(src, srcLen, UnicodeString("^\\s+"), 0, UnicodeString(), status); // remove leading spaces + srcLen = removeText(src, srcLen, UnicodeString("\\s+$"), 0, UnicodeString(), status); // remove trailing spcaes + return srcLen; +} + +U_CFUNC int32_t +removeCmtText(char16_t* source, int32_t srcLen, UErrorCode* status){ + srcLen = trim(source, srcLen, status); + UnicodeString patString("^\\s*?\\*\\s*?"); // remove pattern like " * " at the beginning of the line + srcLen = removeText(source, srcLen, patString, UREGEX_MULTILINE, UnicodeString(), status); + return removeText(source, srcLen, UnicodeString("[ \\r\\n]+"), 0, UnicodeString(" "), status);// remove new lines; +} + +U_CFUNC int32_t +getText(const char16_t* source, int32_t srcLen, + char16_t** dest, int32_t destCapacity, + 
UnicodeString patternString, + UErrorCode* status){ + + if(status == nullptr || U_FAILURE(*status)){ + return 0; + } + + UnicodeString stringArray[MAX_SPLIT_STRINGS]; + RegexPattern *pattern = RegexPattern::compile(UnicodeString("@"), 0, *status); + UnicodeString src (source,srcLen); + + if (U_FAILURE(*status)) { + return 0; + } + pattern->split(src, stringArray, MAX_SPLIT_STRINGS, *status); + + RegexMatcher matcher(patternString, UREGEX_DOTALL, *status); + if (U_FAILURE(*status)) { + return 0; + } + for(int32_t i=0; i<MAX_SPLIT_STRINGS; i++){ + matcher.reset(stringArray[i]); + if(matcher.lookingAt(*status)){ + UnicodeString out = matcher.group(1, *status); + + return out.extract(*dest, destCapacity,*status); + } + } + return 0; +} + + +#define AT_SIGN 0x0040 + +U_CFUNC int32_t +getDescription( const char16_t* source, int32_t srcLen, + char16_t** dest, int32_t destCapacity, + UErrorCode* status){ + if(status == nullptr || U_FAILURE(*status)){ + return 0; + } + + UnicodeString stringArray[MAX_SPLIT_STRINGS]; + RegexPattern *pattern = RegexPattern::compile(UnicodeString("@"), UREGEX_MULTILINE, *status); + UnicodeString src(source, srcLen); + + if (U_FAILURE(*status)) { + return 0; + } + pattern->split(src, stringArray,MAX_SPLIT_STRINGS , *status); + + if(stringArray[0].indexOf((char16_t)AT_SIGN)==-1){ + int32_t destLen = stringArray[0].extract(*dest, destCapacity, *status); + return trim(*dest, destLen, status); + } + return 0; +} + +U_CFUNC int32_t +getCount(const char16_t* source, int32_t srcLen, + UParseCommentsOption option, UErrorCode *status){ + + if(status == nullptr || U_FAILURE(*status)){ + return 0; + } + + UnicodeString stringArray[MAX_SPLIT_STRINGS]; + RegexPattern *pattern = RegexPattern::compile(UnicodeString("@"), UREGEX_MULTILINE, *status); + UnicodeString src (source, srcLen); + + + if (U_FAILURE(*status)) { + return 0; + } + int32_t retLen = pattern->split(src, stringArray, MAX_SPLIT_STRINGS, *status); + + UnicodeString 
patternString(patternStrings[option]); + RegexMatcher matcher(patternString, UREGEX_DOTALL, *status); + if (U_FAILURE(*status)) { + return 0; + } + int32_t count = 0; + for(int32_t i=0; i<retLen; i++){ + matcher.reset(stringArray[i]); + if(matcher.lookingAt(*status)){ + count++; + } + } + if(option == UPC_TRANSLATE && count > 1){ + fprintf(stderr, "Multiple @translate tags cannot be supported.\n"); + exit(U_UNSUPPORTED_ERROR); + } + return count; +} + +U_CFUNC int32_t +getAt(const char16_t* source, int32_t srcLen, + char16_t** dest, int32_t destCapacity, + int32_t index, + UParseCommentsOption option, + UErrorCode* status){ + + if(status == nullptr || U_FAILURE(*status)){ + return 0; + } + + UnicodeString stringArray[MAX_SPLIT_STRINGS]; + RegexPattern *pattern = RegexPattern::compile(UnicodeString("@"), UREGEX_MULTILINE, *status); + UnicodeString src (source, srcLen); + + + if (U_FAILURE(*status)) { + return 0; + } + int32_t retLen = pattern->split(src, stringArray, MAX_SPLIT_STRINGS, *status); + + UnicodeString patternString(patternStrings[option]); + RegexMatcher matcher(patternString, UREGEX_DOTALL, *status); + if (U_FAILURE(*status)) { + return 0; + } + int32_t count = 0; + for(int32_t i=0; i<retLen; i++){ + matcher.reset(stringArray[i]); + if(matcher.lookingAt(*status)){ + if(count == index){ + UnicodeString out = matcher.group(1, *status); + return out.extract(*dest, destCapacity,*status); + } + count++; + + } + } + return 0; + +} + +U_CFUNC int32_t +getTranslate( const char16_t* source, int32_t srcLen, + char16_t** dest, int32_t destCapacity, + UErrorCode* status){ + UnicodeString notePatternString("^translate\\s*?(.*)"); + + int32_t destLen = getText(source, srcLen, dest, destCapacity, notePatternString, status); + return trim(*dest, destLen, status); +} + +U_CFUNC int32_t +getNote(const char16_t* source, int32_t srcLen, + char16_t** dest, int32_t destCapacity, + UErrorCode* status){ + + UnicodeString notePatternString("^note\\s*?(.*)"); + int32_t destLen = 
getText(source, srcLen, dest, destCapacity, notePatternString, status); + return trim(*dest, destLen, status); + +} + +#endif /* UCONFIG_NO_REGULAR_EXPRESSIONS */ + diff --git a/intl/icu/source/tools/genrb/prscmnts.h b/intl/icu/source/tools/genrb/prscmnts.h new file mode 100644 index 0000000000..43195d2d30 --- /dev/null +++ b/intl/icu/source/tools/genrb/prscmnts.h @@ -0,0 +1,66 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 1998-2016, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* +* File read.h +* +* Modification History: +* +* Date Name Description +* 05/26/99 stephen Creation. +* 5/10/01 Ram removed ustdio dependency +******************************************************************************* +*/ + +#ifndef PRSCMNTS_H +#define PRSCMNTS_H 1 + +#include "unicode/utypes.h" + +#if UCONFIG_NO_REGULAR_EXPRESSIONS==0 /* donot compile when no RegularExpressions are available */ + +enum UParseCommentsOption { + UPC_TRANSLATE, + UPC_NOTE, + UPC_LIMIT +}; + +typedef enum UParseCommentsOption UParseCommentsOption; + +U_CFUNC int32_t +getNote(const UChar* source, int32_t srcLen, + UChar** dest, int32_t destCapacity, + UErrorCode* status); +U_CFUNC int32_t +removeCmtText(UChar* source, int32_t srcLen, UErrorCode* status); + +U_CFUNC int32_t +getDescription( const UChar* source, int32_t srcLen, + UChar** dest, int32_t destCapacity, + UErrorCode* status); +U_CFUNC int32_t +getTranslate( const UChar* source, int32_t srcLen, + UChar** dest, int32_t destCapacity, + UErrorCode* status); + +U_CFUNC int32_t +getAt(const UChar* source, int32_t srcLen, + UChar** dest, int32_t destCapacity, + int32_t index, + UParseCommentsOption option, + UErrorCode* status); + +U_CFUNC int32_t 
+getCount(const UChar* source, int32_t srcLen, + UParseCommentsOption option, UErrorCode *status); + +#endif /* UCONFIG_NO_REGULAR_EXPRESSIONS */ + +#endif + diff --git a/intl/icu/source/tools/genrb/rbutil.c b/intl/icu/source/tools/genrb/rbutil.c new file mode 100644 index 0000000000..ed3e66b250 --- /dev/null +++ b/intl/icu/source/tools/genrb/rbutil.c @@ -0,0 +1,119 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 1998-2008, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* +* File util.c +* +* Modification History: +* +* Date Name Description +* 06/10/99 stephen Creation. +* 02/07/08 Spieth Correct XLIFF generation on EBCDIC platform +* +******************************************************************************* +*/ + +#include "unicode/putil.h" +#include "rbutil.h" +#include "cmemory.h" +#include "cstring.h" + + +/* go from "/usr/local/include/curses.h" to "/usr/local/include" */ +void +get_dirname(char *dirname, + const char *filename) +{ + const char *lastSlash = uprv_strrchr(filename, U_FILE_SEP_CHAR); + if (lastSlash != NULL) { + lastSlash++; + } + + if(lastSlash>filename) { + uprv_strncpy(dirname, filename, (lastSlash - filename)); + *(dirname + (lastSlash - filename)) = '\0'; + } else { + *dirname = '\0'; + } +} + +/* go from "/usr/local/include/curses.h" to "curses" */ +void +get_basename(char *basename, + const char *filename) +{ + /* strip off any leading directory portions */ + const char *lastSlash = uprv_strrchr(filename, U_FILE_SEP_CHAR); + if (lastSlash != NULL) { + lastSlash++; + } + char *lastDot; + + if(lastSlash>filename) { + uprv_strcpy(basename, lastSlash); + } else { + uprv_strcpy(basename, filename); + } + + /* strip off any suffix */ + lastDot = 
uprv_strrchr(basename, '.'); + + if(lastDot != NULL) { + *lastDot = '\0'; + } +} + +#define MAX_DIGITS 10 +int32_t +itostr(char * buffer, int32_t i, uint32_t radix, int32_t pad) +{ + const char digits[16] = {'0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f'}; + int32_t length = 0; + int32_t num = 0; + int32_t save = i; + int digit; + int32_t j; + char temp; + + /* if i is negative make it positive */ + if(i<0){ + i=-i; + } + + do{ + digit = (int)(i % radix); + buffer[length++]= digits[digit]; + i=i/radix; + } while(i); + + while (length < pad){ + buffer[length++] = '0';/*zero padding */ + } + + /* if i is negative add the negative sign */ + if(save < 0){ + buffer[length++]='-'; + } + + /* null terminate the buffer */ + if(length<MAX_DIGITS){ + buffer[length] = 0x0000; + } + + num= (pad>=length) ? pad :length; + + + /* Reverses the string */ + for (j = 0; j < (num / 2); j++){ + temp = buffer[(length-1) - j]; + buffer[(length-1) - j] = buffer[j]; + buffer[j] = temp; + } + return length; +} diff --git a/intl/icu/source/tools/genrb/rbutil.h b/intl/icu/source/tools/genrb/rbutil.h new file mode 100644 index 0000000000..9a12c50959 --- /dev/null +++ b/intl/icu/source/tools/genrb/rbutil.h @@ -0,0 +1,33 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 1998-2016, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* +* File rbutil.h +* +* Modification History: +* +* Date Name Description +* 06/10/99 stephen Creation. 
+******************************************************************************* +*/ + +#ifndef UTIL_H +#define UTIL_H 1 + +#include "unicode/utypes.h" + +U_CDECL_BEGIN + +void get_dirname(char *dirname, const char *filename); +void get_basename(char *basename, const char *filename); +int32_t itostr(char * buffer, int32_t i, uint32_t radix, int32_t pad); + +U_CDECL_END + +#endif /* ! UTIL_H */ diff --git a/intl/icu/source/tools/genrb/read.c b/intl/icu/source/tools/genrb/read.c new file mode 100644 index 0000000000..0d4a318a89 --- /dev/null +++ b/intl/icu/source/tools/genrb/read.c @@ -0,0 +1,479 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 1998-2012, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* +* File read.c +* +* Modification History: +* +* Date Name Description +* 05/26/99 stephen Creation. 
+* 5/10/01 Ram removed ustdio dependency +******************************************************************************* +*/ + +#include <stdbool.h> + +#include "read.h" +#include "errmsg.h" +#include "toolutil.h" +#include "unicode/ustring.h" +#include "unicode/utf16.h" + +#define OPENBRACE 0x007B +#define CLOSEBRACE 0x007D +#define COMMA 0x002C +#define QUOTE 0x0022 +#define ESCAPE 0x005C +#define SLASH 0x002F +#define ASTERISK 0x002A +#define SPACE 0x0020 +#define COLON 0x003A +#define BADBOM 0xFFFE +#define CR 0x000D +#define LF 0x000A + +static int32_t lineCount; + +/* Protos */ +static enum ETokenType getStringToken(UCHARBUF *buf, + UChar32 initialChar, + struct UString *token, + UErrorCode *status); + +static UChar32 getNextChar (UCHARBUF *buf, UBool skipwhite, struct UString *token, UErrorCode *status); +static void seekUntilNewline (UCHARBUF *buf, struct UString *token, UErrorCode *status); +static void seekUntilEndOfComment (UCHARBUF *buf, struct UString *token, UErrorCode *status); +static UBool isWhitespace (UChar32 c); +static UBool isNewline (UChar32 c); + +U_CFUNC void resetLineNumber() { + lineCount = 1; +} + +/* Read and return the next token from the stream. If the token is of + type eString, fill in the token parameter with the token. If the + token is eError, then the status parameter will contain the + specific error. This will be eItemNotFound at the end of file, + indicating that all tokens have been returned. This method will + never return eString twice in a row; instead, multiple adjacent + string tokens will be merged into one, with no intervening + space. 
*/
U_CFUNC enum ETokenType
getNextToken(UCHARBUF* buf,
             struct UString *token,
             uint32_t *linenumber, /* out: linenumber of token */
             struct UString *comment,
             UErrorCode *status) {
    enum ETokenType result;
    UChar32 c;

    if (U_FAILURE(*status)) {
        return TOK_ERROR;
    }

    /* Skip whitespace */
    /* comment is handed to getNextChar(), which fills it only for
       comments that open with a slash and two asterisks */
    c = getNextChar(buf, true, comment, status);

    if (U_FAILURE(*status)) {
        return TOK_ERROR;
    }

    /* line of the first character of the token */
    *linenumber = lineCount;

    switch(c) {
    case BADBOM:
        return TOK_ERROR;
    case OPENBRACE:
        return TOK_OPEN_BRACE;
    case CLOSEBRACE:
        return TOK_CLOSE_BRACE;
    case COMMA:
        return TOK_COMMA;
    case U_EOF:
        return TOK_EOF;
    case COLON:
        return TOK_COLON;

    default:
        result = getStringToken(buf, c, token, status);
    }

    /* string tokens may span lines, so report the line reached afterwards */
    *linenumber = lineCount;
    return result;
}

/* Copy a string token into the given UnicodeString. Upon entry, we
   have already read the first character of the string token, which is
   not a whitespace character (but may be a QUOTE or ESCAPE). This
   function reads all subsequent characters that belong with this
   string, and copy them into the token parameter. The other
   important, and slightly convoluted purpose of this function is to
   merge adjacent strings. It looks forward a bit, and if the next
   non comment, non whitespace item is a string, it reads it in as
   well. If two adjacent strings are quoted, they are merged without
   intervening space. Otherwise a single SPACE character is
   inserted. */
static enum ETokenType getStringToken(UCHARBUF* buf,
                                      UChar32 initialChar,
                                      struct UString *token,
                                      UErrorCode *status) {
    UBool lastStringWasQuoted;
    UChar32 c;
    UChar target[3] = { '\0' };  /* scratch buffer for the UTF-16 form of one code point */
    UChar *pTarget = target;
    int len=0;                   /* number of units currently in target */
    UBool isFollowingCharEscaped=false;
    UBool isNLUnescaped = false; /* the current CR/LF came from an escape sequence */
    UChar32 prevC=0;

    /* We are guaranteed on entry that initialChar is not a whitespace
       character. If we are at the EOF, or have some other problem, it
       doesn't matter; we still want to validly return the initialChar
       (if nothing else) as a string token. */

    if (U_FAILURE(*status)) {
        return TOK_ERROR;
    }

    /* setup */
    lastStringWasQuoted = false;
    c = initialChar;
    ustr_setlen(token, 0, status);

    if (U_FAILURE(*status)) {
        return TOK_ERROR;
    }

    /* one iteration per adjacent string piece; c is the first character of the piece */
    for (;;) {
        if (c == QUOTE) {
            /* a quoted piece directly after a quoted piece is joined without a space */
            if (!lastStringWasQuoted && token->fLength > 0) {
                ustr_ucat(token, SPACE, status);

                if (U_FAILURE(*status)) {
                    return TOK_ERROR;
                }
            }

            lastStringWasQuoted = true;

            /* consume characters up to the closing, unescaped QUOTE */
            for (;;) {
                c = ucbuf_getc(buf,status);

                /* EOF reached */
                if (c == U_EOF) {
                    return TOK_EOF;
                }

                /* Unterminated quoted strings */
                if (U_FAILURE(*status)) {
                    return TOK_ERROR;
                }

                if (c == QUOTE && !isFollowingCharEscaped) {
                    break;
                }

                if (c == ESCAPE && !isFollowingCharEscaped) {
                    pTarget = target;
                    c = unescape(buf, status);

                    if (c == U_ERR) {
                        return TOK_ERROR;
                    }
                    if(c == CR || c == LF){
                        isNLUnescaped = true;
                    }
                }

                /* the escape decoded to ESCAPE itself: remember that, and
                   append nothing on this pass */
                if(c==ESCAPE && !isFollowingCharEscaped){
                    isFollowingCharEscaped = true;
                }else{
                    U_APPEND_CHAR32(c, pTarget,len);
                    pTarget = target;
                    ustr_uscat(token, pTarget,len, status);
                    isFollowingCharEscaped = false;
                    len=0;
                    /* count a line only for a literal newline (not one that
                       came from an escape) and not when the previous
                       character was CR */
                    if(c == CR || c == LF){
                        if(isNLUnescaped == false && prevC!=CR){
                            lineCount++;
                        }
                        isNLUnescaped = false;
                    }
                }

                if (U_FAILURE(*status)) {
                    return TOK_ERROR;
                }
                prevC = c;
            }
        } else {
            /* unquoted piece: always separated from what precedes it by a SPACE */
            if (token->fLength > 0) {
                ustr_ucat(token, SPACE, status);

                if (U_FAILURE(*status)) {
                    return TOK_ERROR;
                }
            }

            if(lastStringWasQuoted){
                if(getShowWarning()){
                    warning(lineCount, "Mixing quoted and unquoted strings");
                }
                if(isStrict()){
                    return TOK_ERROR;
                }

            }

            lastStringWasQuoted = false;

            /* if we reach here we are mixing
             * quoted and unquoted strings
             * warn in normal mode and error in
             * pedantic mode
             */

            if (c == ESCAPE) {
                pTarget = target;
                c = unescape(buf, status);

                /* EOF reached */
                /* NOTE(review): the two other unescape() call sites in this
                   function compare against U_ERR, this one against U_EOF --
                   confirm which value signals failure. */
                if (c == U_EOF) {
                    return TOK_ERROR;
                }
            }

            U_APPEND_CHAR32(c, pTarget,len);
            pTarget = target;
            ustr_uscat(token, pTarget,len, status);
            len=0;

            if (U_FAILURE(*status)) {
                return TOK_ERROR;
            }

            /* consume the rest of the unquoted piece */
            for (;;) {
                /* DON'T skip whitespace */
                c = getNextChar(buf, false, NULL, status);

                /* EOF reached */
                if (c == U_EOF) {
                    ucbuf_ungetc(c, buf);
                    return TOK_STRING;
                }

                if (U_FAILURE(*status)) {
                    return TOK_STRING;
                }

                /* a structural character ends the piece and is left in the buffer */
                if (c == QUOTE
                        || c == OPENBRACE
                        || c == CLOSEBRACE
                        || c == COMMA
                        || c == COLON) {
                    ucbuf_ungetc(c, buf);
                    break;
                }

                /* whitespace ends the piece and is consumed */
                if (isWhitespace(c)) {
                    break;
                }

                if (c == ESCAPE) {
                    pTarget = target;
                    c = unescape(buf, status);

                    if (c == U_ERR) {
                        return TOK_ERROR;
                    }
                }

                U_APPEND_CHAR32(c, pTarget,len);
                pTarget = target;
                ustr_uscat(token, pTarget,len, status);
                len=0;
                if (U_FAILURE(*status)) {
                    return TOK_ERROR;
                }
            }
        }

        /* DO skip whitespace */
        c = getNextChar(buf, true, NULL, status);

        if (U_FAILURE(*status)) {
            return TOK_STRING;
        }

        /* a structural character ends the merged token; anything else
           starts another piece on the next pass of the outer loop */
        if (c == OPENBRACE || c == CLOSEBRACE || c == COMMA || c == COLON) {
            ucbuf_ungetc(c, buf);
            return TOK_STRING;
        }
    }
}

/* Retrieve the next character. If skipwhite is
   true, whitespace is skipped as well.
*/
static UChar32 getNextChar(UCHARBUF* buf,
                           UBool skipwhite,
                           struct UString *token,
                           UErrorCode *status) {
    UChar32 c, c2;

    if (U_FAILURE(*status)) {
        return U_EOF;
    }

    for (;;) {
        c = ucbuf_getc(buf,status);

        if (c == U_EOF) {
            return U_EOF;
        }

        /* isWhitespace() also advances lineCount on LF and U+2029 */
        if (skipwhite && isWhitespace(c)) {
            continue;
        }

        /* This also handles the get() failing case */
        if (c != SLASH) {
            return c;
        }

        c = ucbuf_getc(buf,status); /* "/c" */

        if (c == U_EOF) {
            return U_EOF;
        }

        switch (c) {
        case SLASH: /* "//" */
            seekUntilNewline(buf, NULL, status);
            break;

        case ASTERISK: /* " / * " */
            c2 = ucbuf_getc(buf, status); /* "/ * c" */
            if(c2 == ASTERISK){ /* "/ * *" */
                /* parse multi-line comment and store it in token*/
                /* NOTE(review): the second asterisk is consumed here, so in
                   an empty comment (slash, two asterisks, slash) the final
                   slash is not seen as the terminator -- confirm intended. */
                seekUntilEndOfComment(buf, token, status);
            } else {
                ucbuf_ungetc(c2, buf); /* c2 is the non-asterisk following "/ *". Include c2 back in buffer. */
                seekUntilEndOfComment(buf, NULL, status);
            }
            break;

        default:
            ucbuf_ungetc(c, buf); /* "/c" - put back the c */
            /* If get() failed this is a NOP */
            return SLASH;
        }

    }
}

/* Consume characters up to and including the next newline (as defined by
   isNewline()), or until end of input or until *status is no longer
   U_ZERO_ERROR. When token is not NULL every character read is appended to it. */
static void seekUntilNewline(UCHARBUF* buf,
                             struct UString *token,
                             UErrorCode *status) {
    UChar32 c;

    if (U_FAILURE(*status)) {
        return;
    }

    do {
        c = ucbuf_getc(buf,status);
        /* add the char to token */
        if(token!=NULL){
            ustr_u32cat(token, c, status);
        }
    } while (!isNewline(c) && c != U_EOF && *status == U_ZERO_ERROR);
}

/* Consume characters up to and including the asterisk-slash pair that closes
   a block comment. When token is not NULL the comment text before the
   terminator is appended to it. Reaching end of input first sets *status to
   U_INVALID_FORMAT_ERROR and reports the line on which the scan started. */
static void seekUntilEndOfComment(UCHARBUF *buf,
                                  struct UString *token,
                                  UErrorCode *status) {
    UChar32 c, d;
    uint32_t line;

    if (U_FAILURE(*status)) {
        return;
    }

    /* remembered for the error message below */
    line = lineCount;

    do {
        c = ucbuf_getc(buf, status);

        if (c == ASTERISK) {
            d = ucbuf_getc(buf, status);

            if (d != SLASH) {
                ucbuf_ungetc(d, buf);
            } else {
                break;
            }
        }
        /* add the char to token */
        if(token!=NULL){
            ustr_u32cat(token, c, status);
        }
        /* increment the lineCount */
        /* called only for its side effect; the result is ignored */
        isNewline(c);

    } while (c != U_EOF && *status == U_ZERO_ERROR);

    if (c == U_EOF) {
        *status = U_INVALID_FORMAT_ERROR;
        error(line, "unterminated comment detected");
    }
}

/* Decode the escape sequence whose ESCAPE character has already been read.
   Returns the result of ucbuf_getcx32(), or U_EOF when *status already
   indicates a failure on entry. */
U_CFUNC UChar32 unescape(UCHARBUF *buf, UErrorCode *status) {
    if (U_FAILURE(*status)) {
        return U_EOF;
    }

    /* We expect to be called after the ESCAPE has been seen, but
     * ucbuf_getcx32 is handed the ESCAPE again, so put it back first. */
    ucbuf_ungetc(ESCAPE, buf);

    return ucbuf_getcx32(buf, status);
}

/* Returns true for LF, U+2029, CR, SPACE, TAB and U+FEFF.
   Side effect: increments lineCount for LF and U+2029. */
static UBool isWhitespace(UChar32 c) {
    switch (c) {
        /* ' ', '\t', '\n', '\r', 0x2029, 0xFEFF */
    case 0x000A:
    case 0x2029:
        lineCount++;
        /* fallthrough */
    case 0x000D:
    case 0x0020:
    case 0x0009:
    case 0xFEFF:
        return true;

    default:
        return false;
    }
}

/* Returns true for LF, U+2029 and CR.
   Side effect: increments lineCount for LF and U+2029. */
static UBool isNewline(UChar32 c) {
    switch (c) {
        /* '\n', '\r', 0x2029 */
    case 0x000A:
    case 0x2029:
        lineCount++;
        /* fallthrough */
    case 0x000D:
        return true;

    default:
        return false;
    }
}
diff --git a/intl/icu/source/tools/genrb/read.h b/intl/icu/source/tools/genrb/read.h
new file mode 100644
index 0000000000..e5b8d155da
--- /dev/null
+++ b/intl/icu/source/tools/genrb/read.h
@@ -0,0 +1,54 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
* Copyright (C) 1998-2011, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
*
* File read.h
*
* Modification History:
*
* Date Name Description
* 05/26/99 stephen Creation.
* 5/10/01 Ram removed ustdio dependency
*******************************************************************************
*/

#ifndef READ_H
#define READ_H 1

#include "unicode/utypes.h"
#include "ustr.h"
#include "ucbuf.h"

/* The types of tokens which may be returned by getNextToken.
+ NOTE: Keep these in sync with tokenNames in parse.c */ +enum ETokenType +{ + TOK_STRING, /* A string token, such as "MonthNames" */ + TOK_OPEN_BRACE, /* An opening brace character */ + TOK_CLOSE_BRACE, /* A closing brace character */ + TOK_COMMA, /* A comma */ + TOK_COLON, /* A colon */ + + TOK_EOF, /* End of the file has been reached successfully */ + TOK_ERROR, /* An error, such an unterminated quoted string */ + TOK_TOKEN_COUNT /* Number of "real" token types */ +}; + +U_CFUNC UChar32 unescape(UCHARBUF *buf, UErrorCode *status); + +U_CFUNC void resetLineNumber(void); + +U_CFUNC enum ETokenType +getNextToken(UCHARBUF *buf, + struct UString *token, + uint32_t *linenumber, /* out: linenumber of token */ + struct UString *comment, + UErrorCode *status); + +#endif diff --git a/intl/icu/source/tools/genrb/reslist.cpp b/intl/icu/source/tools/genrb/reslist.cpp new file mode 100644 index 0000000000..e1c2d25061 --- /dev/null +++ b/intl/icu/source/tools/genrb/reslist.cpp @@ -0,0 +1,1794 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2000-2015, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* +* File reslist.cpp +* +* Modification History: +* +* Date Name Description +* 02/21/00 weiv Creation. +******************************************************************************* +*/ + +// Safer use of UnicodeString. +#ifndef UNISTR_FROM_CHAR_EXPLICIT +# define UNISTR_FROM_CHAR_EXPLICIT explicit +#endif + +// Less important, but still a good idea. 
+#ifndef UNISTR_FROM_STRING_EXPLICIT +# define UNISTR_FROM_STRING_EXPLICIT explicit +#endif + +#include <assert.h> +#include <iostream> +#include <set> +#include <stdio.h> + +#include "unicode/localpointer.h" +#include "reslist.h" +#include "unewdata.h" +#include "unicode/ures.h" +#include "unicode/putil.h" +#include "errmsg.h" +#include "filterrb.h" +#include "toolutil.h" + +#include "uarrsort.h" +#include "uelement.h" +#include "uhash.h" +#include "uinvchar.h" +#include "ustr_imp.h" +#include "unicode/utf16.h" +#include "uassert.h" + +/* + * Align binary data at a 16-byte offset from the start of the resource bundle, + * to be safe for any data type it may contain. + */ +#define BIN_ALIGNMENT 16 + +// This numeric constant must be at least 1. +// If StringResource.fNumUnitsSaved == 0 then the string occurs only once, +// and it makes no sense to move it to the pool bundle. +// The larger the threshold for fNumUnitsSaved +// the smaller the savings, and the smaller the pool bundle. +// We trade some total size reduction to reduce the pool bundle a bit, +// so that one can reasonably save data size by +// removing bundle files without rebuilding the pool bundle. +// This can also help to keep the pool and total (pool+local) string indexes +// within 16 bits, that is, within range of Table16 and Array16 containers. +#ifndef GENRB_MIN_16BIT_UNITS_SAVED_FOR_POOL_STRING +# define GENRB_MIN_16BIT_UNITS_SAVED_FOR_POOL_STRING 10 +#endif + +U_NAMESPACE_USE + +static UBool gIncludeCopyright = false; +static UBool gUsePoolBundle = false; +static UBool gIsDefaultFormatVersion = true; +static int32_t gFormatVersion = 3; + +/* How do we store string values? 
*/
enum {
    STRINGS_UTF16_V1, /* formatVersion 1: int length + UChars + NUL + padding to 4 bytes */
    STRINGS_UTF16_V2 /* formatVersion 2 & up: optional length in 1..3 UChars + UChars + NUL */
};

static const int32_t MAX_IMPLICIT_STRING_LENGTH = 40; /* do not store the length explicitly for such strings */

static const ResFile kNoPoolBundle;

/*
 * res_none() returns the address of kNoResource,
 * for use in non-error cases when no resource is to be added to the bundle.
 * (nullptr is used in error cases.)
 */
static SResource kNoResource; // TODO: const

static UDataInfo dataInfo= {
    sizeof(UDataInfo),
    0,

    U_IS_BIG_ENDIAN,
    U_CHARSET_FAMILY,
    sizeof(char16_t),
    0,

    {0x52, 0x65, 0x73, 0x42}, /* dataFormat="ResB" */
    {1, 3, 0, 0}, /* formatVersion */
    {1, 4, 0, 0} /* dataVersion take a look at version inside parsed resb*/
};

static const UVersionInfo gFormatVersions[4] = { /* indexed by a major-formatVersion integer */
    { 0, 0, 0, 0 },
    { 1, 3, 0, 0 },
    { 2, 0, 0, 0 },
    { 3, 0, 0, 0 }
};
// Remember to update genrb.h GENRB_VERSION when changing the data format.
// (Or maybe we should remove GENRB_VERSION and report the ICU version number?)

// Number of bytes (0..3) that must follow `size` bytes to reach the next
// multiple of sizeof(uint32_t).
static uint8_t calcPadding(uint32_t size) {
    /* returns space we need to pad */
    return (uint8_t) ((size % sizeof(uint32_t)) ? (sizeof(uint32_t) - (size % sizeof(uint32_t))) : 0);

}

void setIncludeCopyright(UBool val){
    gIncludeCopyright=val;
}

UBool getIncludeCopyright(){
    return gIncludeCopyright;
}

// Records an explicit format version; also clears gIsDefaultFormatVersion.
void setFormatVersion(int32_t formatVersion) {
    gIsDefaultFormatVersion = false;
    gFormatVersion = formatVersion;
}

int32_t getFormatVersion() {
    return gFormatVersion;
}

void setUsePoolBundle(UBool use) {
    gUsePoolBundle = use;
}

// TODO: return const pointer, or find another way to express "none"
struct SResource* res_none() {
    return &kNoResource;
}

// Default state: type URES_NONE, nothing written, no resource word
// (RES_BOGUS), no key (-1), no line number, not linked into any list.
SResource::SResource()
        : fType(URES_NONE), fWritten(false), fRes(RES_BOGUS), fRes16(-1), fKey(-1), fKey16(-1),
          line(0), fNext(nullptr) {
    ustr_init(&fComment);
}

// Creates a resource of the given type. When bundle is not null, tag is
// registered through bundle->addTag() and the result becomes fKey; otherwise
// fKey stays -1. A non-null comment is copied into fComment.
SResource::SResource(SRBRoot *bundle, const char *tag, int8_t type, const UString* comment,
                     UErrorCode &errorCode)
        : fType(type), fWritten(false), fRes(RES_BOGUS), fRes16(-1),
          fKey(bundle != nullptr ? bundle->addTag(tag, errorCode) : -1), fKey16(-1),
          line(0), fNext(nullptr) {
    ustr_init(&fComment);
    if(comment != nullptr) {
        ustr_cpy(&fComment, comment, &errorCode);
    }
}

SResource::~SResource() {
    ustr_deinit(&fComment);
}

// Deletes every item linked from fFirst through fNext.
ContainerResource::~ContainerResource() {
    SResource *current = fFirst;
    while (current != nullptr) {
        SResource *next = current->fNext;
        delete current;
        current = next;
    }
}

TableResource::~TableResource() {}

// Inserts res into the list so that it stays ordered by key string.
// Does nothing when errorCode already failed, or when res is null or is
// kNoResource. A key that compares equal to an existing one is reported via
// error() and sets errorCode to U_UNSUPPORTED_ERROR; res is then not linked.
// TODO: clarify that containers adopt new items, even in error cases; use LocalPointer
void TableResource::add(SResource *res, int linenumber, UErrorCode &errorCode) {
    if (U_FAILURE(errorCode) || res == nullptr || res == &kNoResource) {
        return;
    }

    /* remember this linenumber to report to the user if there is a duplicate key */
    res->line = linenumber;

    /* here we need to traverse the list */
    // NOTE(review): fCount is incremented before the duplicate check below,
    // so it is also bumped on the duplicate-key error path where res is not
    // linked in -- confirm callers never rely on fCount after an error.
    ++fCount;

    /* is the list still empty? */
    if (fFirst == nullptr) {
        fFirst = res;
        res->fNext = nullptr;
        return;
    }

    const char *resKeyString = fRoot->fKeys + res->fKey;

    SResource *current = fFirst;

    SResource *prev = nullptr;
    while (current != nullptr) {
        const char *currentKeyString = fRoot->fKeys + current->fKey;
        int diff;
        /*
         * formatVersion 1: compare key strings in native-charset order
         * formatVersion 2 and up: compare key strings in ASCII order
         */
        if (gFormatVersion == 1 || U_CHARSET_FAMILY == U_ASCII_FAMILY) {
            diff = uprv_strcmp(currentKeyString, resKeyString);
        } else {
            diff = uprv_compareInvCharsAsAscii(currentKeyString, resKeyString);
        }
        if (diff < 0) {
            prev = current;
            current = current->fNext;
        } else if (diff > 0) {
            /* we're either in front of the list, or in the middle */
            if (prev == nullptr) {
                /* front of the list */
                fFirst = res;
            } else {
                /* middle of the list */
                prev->fNext = res;
            }

            res->fNext = current;
            return;
        } else {
            /* Key already exists! ERROR! */
            error(linenumber, "duplicate key '%s' in table, first appeared at line %d", currentKeyString, current->line);
            errorCode = U_UNSUPPORTED_ERROR;
            return;
        }
    }

    /* end of list */
    prev->fNext = res;
    res->fNext = nullptr;
}

ArrayResource::~ArrayResource() {}

// Appends res at the tail (fLast); null and kNoResource are ignored.
void ArrayResource::add(SResource *res) {
    if (res != nullptr && res != &kNoResource) {
        if (fFirst == nullptr) {
            fFirst = res;
        } else {
            fLast->fNext = res;
        }
        fLast = res;
        ++fCount;
    }
}

PseudoListResource::~PseudoListResource() {}

// Prepends res at the head (fFirst); null and kNoResource are ignored.
void PseudoListResource::add(SResource *res) {
    if (res != nullptr && res != &kNoResource) {
        res->fNext = fFirst;
        fFirst = res;
        ++fCount;
    }
}

// Tagged string resource built from a UTF-16 buffer. With formatVersion > 1
// an empty value is encoded directly in fRes and nothing is stored;
// otherwise the text is copied into fString and NUL-terminated.
StringBaseResource::StringBaseResource(SRBRoot *bundle, const char *tag, int8_t type,
                                       const char16_t *value, int32_t len,
                                       const UString* comment, UErrorCode &errorCode)
        : SResource(bundle, tag, type, comment, errorCode) {
    if (len == 0 && gFormatVersion > 1) {
        fRes = URES_MAKE_EMPTY_RESOURCE(type);
        fWritten = true;
        return;
    }

    fString.setTo(ConstChar16Ptr(value), len);
    fString.getTerminatedBuffer(); // Some code relies on NUL-termination.
    if (U_SUCCESS(errorCode) && fString.isBogus()) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
    }
}

// Untagged string resource built from a UnicodeString; same handling of the
// empty value as the constructor above.
StringBaseResource::StringBaseResource(SRBRoot *bundle, int8_t type,
                                       const icu::UnicodeString &value, UErrorCode &errorCode)
        : SResource(bundle, nullptr, type, nullptr, errorCode), fString(value) {
    if (value.isEmpty() && gFormatVersion > 1) {
        fRes = URES_MAKE_EMPTY_RESOURCE(type);
        fWritten = true;
        return;
    }

    fString.getTerminatedBuffer(); // Some code relies on NUL-termination.
    if (U_SUCCESS(errorCode) && fString.isBogus()) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
    }
}

// Pool bundle string, alias the buffer. Guaranteed NUL-terminated and not empty.
StringBaseResource::StringBaseResource(int8_t type, const char16_t *value, int32_t len,
                                       UErrorCode &errorCode)
        : SResource(nullptr, nullptr, type, nullptr, errorCode), fString(true, value, len) {
    assert(len > 0);
    assert(!fString.isBogus());
}

StringBaseResource::~StringBaseResource() {}

// UHashtable hash callback: key.pointer is a StringResource, hashed by its text.
static int32_t U_CALLCONV
string_hash(const UElement key) {
    const StringResource *res = static_cast<const StringResource *>(key.pointer);
    return res->fString.hashCode();
}

// UHashtable comparison callback: two StringResources are equal when their text is.
static UBool U_CALLCONV
string_comp(const UElement key1, const UElement key2) {
    const StringResource *res1 = static_cast<const StringResource *>(key1.pointer);
    const StringResource *res2 = static_cast<const StringResource *>(key2.pointer);
    return res1->fString == res2->fString;
}

StringResource::~StringResource() {}

AliasResource::~AliasResource() {}

// The value is packed into the resource word right away (masked with
// RES_MAX_OFFSET), so the item is marked as already written.
IntResource::IntResource(SRBRoot *bundle, const char *tag, int32_t value,
                         const UString* comment, UErrorCode &errorCode)
        : SResource(bundle, tag, URES_INT, comment, errorCode) {
    fValue = value;
    fRes = URES_MAKE_RESOURCE(URES_INT, value & RES_MAX_OFFSET);
    fWritten = true;
}

IntResource::~IntResource() {}

// Starts empty with room for RESLIST_INT_VECTOR_INIT_SIZE values.
IntVectorResource::IntVectorResource(SRBRoot *bundle, const char *tag,
                  const UString* comment, UErrorCode &errorCode)
        : SResource(bundle, tag, URES_INT_VECTOR, comment, errorCode),
          fCount(0), fSize(RESLIST_INT_VECTOR_INIT_SIZE),
          fArray(new uint32_t[fSize]) {
    if (fArray == nullptr) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
}

IntVectorResource::~IntVectorResource() {
    delete[] fArray;
}

// Appends value, doubling the capacity first when the array is full.
// The value is stored only while errorCode indicates success.
void IntVectorResource::add(int32_t value, UErrorCode &errorCode) {
    if (fCount == fSize) {
        uint32_t* tmp = new uint32_t[2 * fSize];
        if (tmp == nullptr) {
            errorCode = U_MEMORY_ALLOCATION_ERROR;
            return;
        }
        uprv_memcpy(tmp, fArray, fSize * sizeof(uint32_t));
        delete[] fArray;
        fArray = tmp;
        fSize *= 2;
    }
    if (U_SUCCESS(errorCode)) {
        fArray[fCount++] = value;
    }
}

// Copies fileName (when non-empty) and the `length` bytes at data into newly
// allocated buffers. A zero-length value with formatVersion > 1 is encoded
// directly as an empty binary resource and marked written.
BinaryResource::BinaryResource(SRBRoot *bundle, const char *tag,
                               uint32_t length, uint8_t *data, const char* fileName,
                               const UString* comment, UErrorCode &errorCode)
        : SResource(bundle, tag, URES_BINARY, comment, errorCode),
          fLength(length), fData(nullptr), fFileName(nullptr) {
    if (U_FAILURE(errorCode)) {
        return;
    }
    if (fileName != nullptr && *fileName != 0){
        fFileName = new char[uprv_strlen(fileName)+1];
        if (fFileName == nullptr) {
            errorCode = U_MEMORY_ALLOCATION_ERROR;
            return;
        }
        uprv_strcpy(fFileName, fileName);
    }
    if (length > 0) {
        fData = new uint8_t[length];
        if (fData == nullptr) {
            errorCode = U_MEMORY_ALLOCATION_ERROR;
            return;
        }
        uprv_memcpy(fData, data, length);
    } else {
        if (gFormatVersion > 1) {
            fRes = URES_MAKE_EMPTY_RESOURCE(URES_BINARY);
            fWritten = true;
        }
    }
}

BinaryResource::~BinaryResource() {
    delete[] fData;
    delete[] fFileName;
}

/* Writing Functions */

// Looks this string up in stringSet. A hit records the earlier entry in
// fSame and bumps its copy count; a miss registers this string in the set
// and, unless the bundle uses STRINGS_UTF16_V1, adds its size to
// bundle->f16BitStringsLength.
void
StringResource::handlePreflightStrings(SRBRoot *bundle, UHashtable *stringSet,
                                       UErrorCode &errorCode) {
    assert(fSame == nullptr);
    fSame = static_cast<StringResource *>(uhash_get(stringSet, this));
    if (fSame != nullptr) {
        // This is a duplicate of a pool bundle string or of an earlier-visited string.
        if (++fSame->fNumCopies == 1) {
            assert(fSame->fWritten);
            int32_t poolStringIndex = (int32_t)RES_GET_OFFSET(fSame->fRes);
            if (poolStringIndex >= bundle->fPoolStringIndexLimit) {
                bundle->fPoolStringIndexLimit = poolStringIndex + 1;
            }
        }
        return;
    }
    /* Put this string into the set for finding duplicates. */
    fNumCopies = 1;
    uhash_put(stringSet, this, this, &errorCode);

    if (bundle->fStringsForm != STRINGS_UTF16_V1) {
        int32_t len = length();
        if (len <= MAX_IMPLICIT_STRING_LENGTH &&
                !U16_IS_TRAIL(fString[0]) && fString.indexOf((char16_t)0) < 0) {
            /*
             * This string will be stored without an explicit length.
             * Runtime will detect !U16_IS_TRAIL(s[0]) and call u_strlen().
             */
            fNumCharsForLength = 0;
        } else if (len <= 0x3ee) {
            fNumCharsForLength = 1;
        } else if (len <= 0xfffff) {
            fNumCharsForLength = 2;
        } else {
            fNumCharsForLength = 3;
        }
        bundle->f16BitStringsLength += fNumCharsForLength + len + 1; /* +1 for the NUL */
    }
}

// Preflights every child in list order.
void
ContainerResource::handlePreflightStrings(SRBRoot *bundle, UHashtable *stringSet,
                                          UErrorCode &errorCode) {
    for (SResource *current = fFirst; current != nullptr; current = current->fNext) {
        current->preflightStrings(bundle, stringSet, errorCode);
    }
}

// Entry point: skips items whose resource word is already known, otherwise
// dispatches to handlePreflightStrings().
void
SResource::preflightStrings(SRBRoot *bundle, UHashtable *stringSet, UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) {
        return;
    }
    if (fRes != RES_BOGUS) {
        /*
         * The resource item word was already precomputed, which means
         * no further data needs to be written.
         * This might be an integer, or an empty string/binary/etc.
         */
        return;
    }
    handlePreflightStrings(bundle, stringSet, errorCode);
}

void
SResource::handlePreflightStrings(SRBRoot * /*bundle*/, UHashtable * /*stringSet*/,
                                  UErrorCode & /*errorCode*/) {
    /* Neither a string nor a container. */
}

// Maps a 32-bit resource word to its 16-bit form. Returns 0 for the word 0,
// a value in 0..0xffff for a URES_STRING_V2 word whose (possibly rebased)
// offset fits, and -1 otherwise.
int32_t
SRBRoot::makeRes16(uint32_t resWord) const {
    if (resWord == 0) {
        return 0; /* empty string */
    }
    uint32_t type = RES_GET_TYPE(resWord);
    int32_t offset = (int32_t)RES_GET_OFFSET(resWord);
    if (type == URES_STRING_V2) {
        assert(offset > 0);
        if (offset < fPoolStringIndexLimit) {
            if (offset < fPoolStringIndex16Limit) {
                return offset;
            }
        } else {
            offset = offset - fPoolStringIndexLimit + fPoolStringIndex16Limit;
            if (offset <= 0xffff) {
                return offset;
            }
        }
    }
    return -1;
}

// Translates a key offset through fKeyMap; returns oldpos unchanged when
// there is no map.
int32_t
SRBRoot::mapKey(int32_t oldpos) const {
    const KeyMapEntry *map = fKeyMap;
    if (map == nullptr) {
        return oldpos;
    }
    int32_t i, start, limit;

    /* do a binary search for the old, pre-compactKeys() key offset */
    start = fUsePoolBundle->fKeysCount;
    limit = start + fKeysCount;
    while (start < limit - 1) {
        i = (start + limit) / 2;
        if (oldpos < map[i].oldpos) {
            limit = i;
        } else {
            start = i;
        }
    }
    assert(oldpos == map[start].oldpos);
    return map[start].newpos;
}

/*
 * Only called for UTF-16 v1 strings and duplicate UTF-16 v2 strings.
 * For unique UTF-16 v2 strings, write16() sees fRes != RES_BOGUS
 * and exits early.
 */
void
StringResource::handleWrite16(SRBRoot * /*bundle*/) {
    SResource *same;
    if ((same = fSame) != nullptr) {
        /* This is a duplicate. */
        assert(same->fRes != RES_BOGUS && same->fWritten);
        fRes = same->fRes;
        fWritten = same->fWritten;
    }
}

// Appends the 16-bit resource word of every child to bundle->f16BitUnits
// and marks this container as written.
void
ContainerResource::writeAllRes16(SRBRoot *bundle) {
    for (SResource *current = fFirst; current != nullptr; current = current->fNext) {
        bundle->f16BitUnits.append((char16_t)current->fRes16);
    }
    fWritten = true;
}

void
ArrayResource::handleWrite16(SRBRoot *bundle) {
    if (fCount == 0 && gFormatVersion > 1) {
        fRes = URES_MAKE_EMPTY_RESOURCE(URES_ARRAY);
        fWritten = true;
        return;
    }

    // OR-ing the children's fRes16 leaves the result negative as soon as one
    // child has no 16-bit form (-1).
    int32_t res16 = 0;
    for (SResource *current = fFirst; current != nullptr; current = current->fNext) {
        current->write16(bundle);
        res16 |= current->fRes16;
    }
    if (fCount <= 0xffff && res16 >= 0 && gFormatVersion > 1) {
        fRes = URES_MAKE_RESOURCE(URES_ARRAY16, bundle->f16BitUnits.length());
        bundle->f16BitUnits.append((char16_t)fCount);
        writeAllRes16(bundle);
    }
}

void
TableResource::handleWrite16(SRBRoot *bundle) {
    if (fCount == 0 && gFormatVersion > 1) {
        fRes = URES_MAKE_EMPTY_RESOURCE(URES_TABLE);
        fWritten = true;
        return;
    }
    /* Find the smallest table type that fits the data. */
    // As in ArrayResource::handleWrite16(), a negative OR result means at
    // least one child has no 16-bit key or value.
    int32_t key16 = 0;
    int32_t res16 = 0;
    for (SResource *current = fFirst; current != nullptr; current = current->fNext) {
        current->write16(bundle);
        key16 |= current->fKey16;
        res16 |= current->fRes16;
    }
    if(fCount > (uint32_t)bundle->fMaxTableLength) {
        bundle->fMaxTableLength = fCount;
    }
    if (fCount <= 0xffff && key16 >= 0) {
        if (res16 >= 0 && gFormatVersion > 1) {
            /* 16-bit count, key offsets and values */
            fRes = URES_MAKE_RESOURCE(URES_TABLE16, bundle->f16BitUnits.length());
            bundle->f16BitUnits.append((char16_t)fCount);
            for (SResource *current = fFirst; current != nullptr; current = current->fNext) {
                bundle->f16BitUnits.append((char16_t)current->fKey16);
            }
            writeAllRes16(bundle);
        } else {
            /* 16-bit count, 16-bit key offsets, 32-bit values */
            fTableType = URES_TABLE;
        }
    } else {
        /* 32-bit count, key offsets and values */
        fTableType = URES_TABLE32;
    }
}

// A pseudo-list is always emitted as an empty table.
void
PseudoListResource::handleWrite16(SRBRoot * /*bundle*/) {
    fRes = URES_MAKE_EMPTY_RESOURCE(URES_TABLE);
    fWritten = true;
}

// Entry point of the 16-bit pass: remaps the key, computes fKey16 when the
// key fits, lets the subclass write its 16-bit data if the resource word is
// still unknown, and finally derives fRes16 from fRes.
void
SResource::write16(SRBRoot *bundle) {
    if (fKey >= 0) {
        // A tagged resource has a non-negative key index into the parsed key strings.
        // compactKeys() built a map from parsed key index to the final key index.
        // After the mapping, negative key indexes are used for shared pool bundle keys.
        fKey = bundle->mapKey(fKey);
        // If the key index fits into a Key16 for a Table or Table16,
        // then set the fKey16 field accordingly.
        // Otherwise keep it at -1.
        if (fKey >= 0) {
            if (fKey < bundle->fLocalKeyLimit) {
                fKey16 = fKey;
            }
        } else {
            int32_t poolKeyIndex = fKey & 0x7fffffff;
            if (poolKeyIndex <= 0xffff) {
                poolKeyIndex += bundle->fLocalKeyLimit;
                if (poolKeyIndex <= 0xffff) {
                    fKey16 = poolKeyIndex;
                }
            }
        }
    }
    /*
     * fRes != RES_BOGUS:
     * The resource item word was already precomputed, which means
     * no further data needs to be written.
     * This might be an integer, or an empty or UTF-16 v2 string,
     * an empty binary, etc.
     */
    if (fRes == RES_BOGUS) {
        handleWrite16(bundle);
    }
    // Compute fRes16 for precomputed as well as just-computed fRes.
    fRes16 = bundle->makeRes16(fRes);
}

void
SResource::handleWrite16(SRBRoot * /*bundle*/) {
    /* Only a few resource types write 16-bit units. */
}

/*
 * Only called for UTF-16 v1 strings, and for aliases.
 * For UTF-16 v2 strings, preWrite() sees fRes != RES_BOGUS
 * and exits early.
 */
void
StringBaseResource::handlePreWrite(uint32_t *byteOffset) {
    /* Write the UTF-16 v1 string. */
    fRes = URES_MAKE_RESOURCE(fType, *byteOffset >> 2);
    // 4 bytes in front of the text, then the UChars plus the NUL
    *byteOffset += 4 + (length() + 1) * U_SIZEOF_UCHAR;
}

void
IntVectorResource::handlePreWrite(uint32_t *byteOffset) {
    if (fCount == 0 && gFormatVersion > 1) {
        fRes = URES_MAKE_EMPTY_RESOURCE(URES_INT_VECTOR);
        fWritten = true;
    } else {
        fRes = URES_MAKE_RESOURCE(URES_INT_VECTOR, *byteOffset >> 2);
        // one 32-bit word in front of the fCount values
        *byteOffset += (1 + fCount) * 4;
    }
}

void
BinaryResource::handlePreWrite(uint32_t *byteOffset) {
    uint32_t pad = 0;
    // position the data would have right after the length field
    uint32_t dataStart = *byteOffset + sizeof(fLength);

    // advance the offset so that the data lands on a BIN_ALIGNMENT boundary
    if (dataStart % BIN_ALIGNMENT) {
        pad = (BIN_ALIGNMENT - dataStart % BIN_ALIGNMENT);
        *byteOffset += pad; /* pad == 4 or 8 or 12 */
    }
    fRes = URES_MAKE_RESOURCE(URES_BINARY, *byteOffset >> 2);
    *byteOffset += 4 + fLength;
}

// Runs preWrite() on every child in list order.
void
ContainerResource::preWriteAllRes(uint32_t *byteOffset) {
    for (SResource *current = fFirst; current != nullptr; current = current->fNext) {
        current->preWrite(byteOffset);
    }
}

void
ArrayResource::handlePreWrite(uint32_t *byteOffset) {
    // children are laid out first, so the array itself follows them
    preWriteAllRes(byteOffset);
    fRes = URES_MAKE_RESOURCE(URES_ARRAY, *byteOffset >> 2);
    *byteOffset += (1 + fCount) * 4;
}

void
TableResource::handlePreWrite(uint32_t *byteOffset) {
    preWriteAllRes(byteOffset);
    if (fTableType == URES_TABLE) {
        /* 16-bit count, 16-bit key offsets, 32-bit values */
        fRes =
URES_MAKE_RESOURCE(URES_TABLE, *byteOffset >> 2); + *byteOffset += 2 + fCount * 6; + } else { + /* 32-bit count, key offsets and values */ + fRes = URES_MAKE_RESOURCE(URES_TABLE32, *byteOffset >> 2); + *byteOffset += 4 + fCount * 8; + } +} + +void +SResource::preWrite(uint32_t *byteOffset) { + if (fRes != RES_BOGUS) { + /* + * The resource item word was already precomputed, which means + * no further data needs to be written. + * This might be an integer, or an empty or UTF-16 v2 string, + * an empty binary, etc. + */ + return; + } + handlePreWrite(byteOffset); + *byteOffset += calcPadding(*byteOffset); +} + +void +SResource::handlePreWrite(uint32_t * /*byteOffset*/) { + assert(false); +} + +/* + * Only called for UTF-16 v1 strings, and for aliases. For UTF-16 v2 strings, + * write() sees fWritten and exits early. + */ +void +StringBaseResource::handleWrite(UNewDataMemory *mem, uint32_t *byteOffset) { + /* Write the UTF-16 v1 string. */ + int32_t len = length(); + udata_write32(mem, len); + udata_writeUString(mem, getBuffer(), len + 1); + *byteOffset += 4 + (len + 1) * U_SIZEOF_UCHAR; + fWritten = true; +} + +void +ContainerResource::writeAllRes(UNewDataMemory *mem, uint32_t *byteOffset) { + uint32_t i = 0; + for (SResource *current = fFirst; current != nullptr; ++i, current = current->fNext) { + current->write(mem, byteOffset); + } + assert(i == fCount); +} + +void +ContainerResource::writeAllRes32(UNewDataMemory *mem, uint32_t *byteOffset) { + for (SResource *current = fFirst; current != nullptr; current = current->fNext) { + udata_write32(mem, current->fRes); + } + *byteOffset += fCount * 4; +} + +void +ArrayResource::handleWrite(UNewDataMemory *mem, uint32_t *byteOffset) { + writeAllRes(mem, byteOffset); + udata_write32(mem, fCount); + *byteOffset += 4; + writeAllRes32(mem, byteOffset); +} + +void +IntVectorResource::handleWrite(UNewDataMemory *mem, uint32_t *byteOffset) { + udata_write32(mem, fCount); + for(uint32_t i = 0; i < fCount; ++i) { + 
udata_write32(mem, fArray[i]); + } + *byteOffset += (1 + fCount) * 4; +} + +void +BinaryResource::handleWrite(UNewDataMemory *mem, uint32_t *byteOffset) { + uint32_t pad = 0; + uint32_t dataStart = *byteOffset + sizeof(fLength); + + if (dataStart % BIN_ALIGNMENT) { + pad = (BIN_ALIGNMENT - dataStart % BIN_ALIGNMENT); + udata_writePadding(mem, pad); /* pad == 4 or 8 or 12 */ + *byteOffset += pad; + } + + udata_write32(mem, fLength); + if (fLength > 0) { + udata_writeBlock(mem, fData, fLength); + } + *byteOffset += 4 + fLength; +} + +void +TableResource::handleWrite(UNewDataMemory *mem, uint32_t *byteOffset) { + writeAllRes(mem, byteOffset); + if(fTableType == URES_TABLE) { + udata_write16(mem, (uint16_t)fCount); + for (SResource *current = fFirst; current != nullptr; current = current->fNext) { + udata_write16(mem, current->fKey16); + } + *byteOffset += (1 + fCount)* 2; + if ((fCount & 1) == 0) { + /* 16-bit count and even number of 16-bit key offsets need padding before 32-bit resource items */ + udata_writePadding(mem, 2); + *byteOffset += 2; + } + } else /* URES_TABLE32 */ { + udata_write32(mem, fCount); + for (SResource *current = fFirst; current != nullptr; current = current->fNext) { + udata_write32(mem, (uint32_t)current->fKey); + } + *byteOffset += (1 + fCount)* 4; + } + writeAllRes32(mem, byteOffset); +} + +void +SResource::write(UNewDataMemory *mem, uint32_t *byteOffset) { + if (fWritten) { + assert(fRes != RES_BOGUS); + return; + } + handleWrite(mem, byteOffset); + uint8_t paddingSize = calcPadding(*byteOffset); + if (paddingSize > 0) { + udata_writePadding(mem, paddingSize); + *byteOffset += paddingSize; + } + fWritten = true; +} + +void +SResource::handleWrite(UNewDataMemory * /*mem*/, uint32_t * /*byteOffset*/) { + assert(false); +} + +void SRBRoot::write(const char *outputDir, const char *outputPkg, + char *writtenFilename, int writtenFilenameLen, + UErrorCode &errorCode) { + UNewDataMemory *mem = nullptr; + uint32_t byteOffset = 0; + uint32_t top, 
size; + char dataName[1024]; + int32_t indexes[URES_INDEX_TOP]; + + compactKeys(errorCode); + /* + * Add padding bytes to fKeys so that fKeysTop is 4-aligned. + * Safe because the capacity is a multiple of 4. + */ + while (fKeysTop & 3) { + fKeys[fKeysTop++] = (char)0xaa; + } + /* + * In URES_TABLE, use all local key offsets that fit into 16 bits, + * and use the remaining 16-bit offsets for pool key offsets + * if there are any. + * If there are no local keys, then use the whole 16-bit space + * for pool key offsets. + * Note: This cannot be changed without changing the major formatVersion. + */ + if (fKeysBottom < fKeysTop) { + if (fKeysTop <= 0x10000) { + fLocalKeyLimit = fKeysTop; + } else { + fLocalKeyLimit = 0x10000; + } + } else { + fLocalKeyLimit = 0; + } + + UHashtable *stringSet; + if (gFormatVersion > 1) { + stringSet = uhash_open(string_hash, string_comp, string_comp, &errorCode); + if (U_SUCCESS(errorCode) && + fUsePoolBundle != nullptr && fUsePoolBundle->fStrings != nullptr) { + for (SResource *current = fUsePoolBundle->fStrings->fFirst; + current != nullptr; + current = current->fNext) { + StringResource *sr = static_cast<StringResource *>(current); + sr->fNumCopies = 0; + sr->fNumUnitsSaved = 0; + uhash_put(stringSet, sr, sr, &errorCode); + } + } + fRoot->preflightStrings(this, stringSet, errorCode); + } else { + stringSet = nullptr; + } + if (fStringsForm == STRINGS_UTF16_V2 && f16BitStringsLength > 0) { + compactStringsV2(stringSet, errorCode); + } + uhash_close(stringSet); + if (U_FAILURE(errorCode)) { + return; + } + + int32_t formatVersion = gFormatVersion; + if (fPoolStringIndexLimit != 0) { + int32_t sum = fPoolStringIndexLimit + fLocalStringIndexLimit; + if ((sum - 1) > RES_MAX_OFFSET) { + errorCode = U_BUFFER_OVERFLOW_ERROR; + return; + } + if (fPoolStringIndexLimit < 0x10000 && sum <= 0x10000) { + // 16-bit indexes work for all pool + local strings. 
+ fPoolStringIndex16Limit = fPoolStringIndexLimit; + } else { + // Set the pool index threshold so that 16-bit indexes work + // for some pool strings and some local strings. + fPoolStringIndex16Limit = (int32_t)( + ((int64_t)fPoolStringIndexLimit * 0xffff) / sum); + } + } else if (gIsDefaultFormatVersion && formatVersion == 3 && !fIsPoolBundle) { + // If we just default to formatVersion 3 + // but there are no pool bundle strings to share + // and we do not write a pool bundle, + // then write formatVersion 2 which is just as good. + formatVersion = 2; + } + + fRoot->write16(this); + if (f16BitUnits.isBogus()) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return; + } + if (f16BitUnits.length() & 1) { + f16BitUnits.append((char16_t)0xaaaa); /* pad to multiple of 4 bytes */ + } + + byteOffset = fKeysTop + f16BitUnits.length() * 2; + fRoot->preWrite(&byteOffset); + + /* total size including the root item */ + top = byteOffset; + + if (writtenFilename && writtenFilenameLen) { + *writtenFilename = 0; + } + + if (writtenFilename) { + int32_t off = 0, len = 0; + if (outputDir) { + uprv_strncpy(writtenFilename, outputDir, writtenFilenameLen); + } + if (writtenFilenameLen -= len) { + off += len; + writtenFilename[off] = U_FILE_SEP_CHAR; + if (--writtenFilenameLen) { + ++off; + if(outputPkg != nullptr) + { + uprv_strcpy(writtenFilename+off, outputPkg); + off += (int32_t)uprv_strlen(outputPkg); + writtenFilename[off] = '_'; + ++off; + } + + len = (int32_t)uprv_strlen(fLocale); + if (len > writtenFilenameLen) { + len = writtenFilenameLen; + } + uprv_strncpy(writtenFilename + off, fLocale, writtenFilenameLen - off); + if (writtenFilenameLen -= len) { + off += len; + uprv_strncpy(writtenFilename + off, ".res", writtenFilenameLen - off); + } + } + } + } + + if(outputPkg) + { + uprv_strcpy(dataName, outputPkg); + uprv_strcat(dataName, "_"); + uprv_strcat(dataName, fLocale); + } + else + { + uprv_strcpy(dataName, fLocale); + } + + uprv_memcpy(dataInfo.formatVersion, gFormatVersions 
+ formatVersion, sizeof(UVersionInfo)); + + mem = udata_create(outputDir, "res", dataName, + &dataInfo, (gIncludeCopyright==true)? U_COPYRIGHT_STRING:nullptr, &errorCode); + if(U_FAILURE(errorCode)){ + return; + } + + /* write the root item */ + udata_write32(mem, fRoot->fRes); + + /* + * formatVersion 1.1 (ICU 2.8): + * write int32_t indexes[] after root and before the key strings + * to make it easier to parse resource bundles in icuswap or from Java etc. + */ + uprv_memset(indexes, 0, sizeof(indexes)); + indexes[URES_INDEX_LENGTH]= fIndexLength; + indexes[URES_INDEX_KEYS_TOP]= fKeysTop>>2; + indexes[URES_INDEX_RESOURCES_TOP]= (int32_t)(top>>2); + indexes[URES_INDEX_BUNDLE_TOP]= indexes[URES_INDEX_RESOURCES_TOP]; + indexes[URES_INDEX_MAX_TABLE_LENGTH]= fMaxTableLength; + + /* + * formatVersion 1.2 (ICU 3.6): + * write indexes[URES_INDEX_ATTRIBUTES] with URES_ATT_NO_FALLBACK set or not set + * the memset() above initialized all indexes[] to 0 + */ + if (fNoFallback) { + indexes[URES_INDEX_ATTRIBUTES]=URES_ATT_NO_FALLBACK; + } + /* + * formatVersion 2.0 (ICU 4.4): + * more compact string value storage, optional pool bundle + */ + if (URES_INDEX_16BIT_TOP < fIndexLength) { + indexes[URES_INDEX_16BIT_TOP] = (fKeysTop>>2) + (f16BitUnits.length()>>1); + } + if (URES_INDEX_POOL_CHECKSUM < fIndexLength) { + if (fIsPoolBundle) { + indexes[URES_INDEX_ATTRIBUTES] |= URES_ATT_IS_POOL_BUNDLE | URES_ATT_NO_FALLBACK; + uint32_t checksum = computeCRC((const char *)(fKeys + fKeysBottom), + (uint32_t)(fKeysTop - fKeysBottom), 0); + if (f16BitUnits.length() <= 1) { + // no pool strings to checksum + } else if (U_IS_BIG_ENDIAN) { + checksum = computeCRC(reinterpret_cast<const char *>(f16BitUnits.getBuffer()), + (uint32_t)f16BitUnits.length() * 2, checksum); + } else { + // Swap to big-endian so we get the same checksum on all platforms + // (except for charset family, due to the key strings). 
+ UnicodeString s(f16BitUnits); + assert(!s.isBogus()); + // .getBuffer(capacity) returns a mutable buffer + char16_t* p = s.getBuffer(f16BitUnits.length()); + for (int32_t count = f16BitUnits.length(); count > 0; --count) { + uint16_t x = *p; + *p++ = (uint16_t)((x << 8) | (x >> 8)); + } + s.releaseBuffer(f16BitUnits.length()); + checksum = computeCRC((const char *)s.getBuffer(), + (uint32_t)f16BitUnits.length() * 2, checksum); + } + indexes[URES_INDEX_POOL_CHECKSUM] = (int32_t)checksum; + } else if (gUsePoolBundle) { + indexes[URES_INDEX_ATTRIBUTES] |= URES_ATT_USES_POOL_BUNDLE; + indexes[URES_INDEX_POOL_CHECKSUM] = fUsePoolBundle->fChecksum; + } + } + // formatVersion 3 (ICU 56): + // share string values via pool bundle strings + indexes[URES_INDEX_LENGTH] |= fPoolStringIndexLimit << 8; // bits 23..0 -> 31..8 + indexes[URES_INDEX_ATTRIBUTES] |= (fPoolStringIndexLimit >> 12) & 0xf000; // bits 27..24 -> 15..12 + indexes[URES_INDEX_ATTRIBUTES] |= fPoolStringIndex16Limit << 16; + + /* write the indexes[] */ + udata_writeBlock(mem, indexes, fIndexLength*4); + + /* write the table key strings */ + udata_writeBlock(mem, fKeys+fKeysBottom, + fKeysTop-fKeysBottom); + + /* write the v2 UTF-16 strings, URES_TABLE16 and URES_ARRAY16 */ + udata_writeBlock(mem, f16BitUnits.getBuffer(), f16BitUnits.length()*2); + + /* write all of the bundle contents: the root item and its children */ + byteOffset = fKeysTop + f16BitUnits.length() * 2; + fRoot->write(mem, &byteOffset); + assert(byteOffset == top); + + size = udata_finish(mem, &errorCode); + if(top != size) { + fprintf(stderr, "genrb error: wrote %u bytes but counted %u\n", + (int)size, (int)top); + errorCode = U_INTERNAL_PROGRAM_ERROR; + } +} + +/* Opening Functions */ + +TableResource* table_open(struct SRBRoot *bundle, const char *tag, const struct UString* comment, UErrorCode *status) { + LocalPointer<TableResource> res(new TableResource(bundle, tag, comment, *status), *status); + return U_SUCCESS(*status) ? 
res.orphan() : nullptr; +} + +ArrayResource* array_open(struct SRBRoot *bundle, const char *tag, const struct UString* comment, UErrorCode *status) { + LocalPointer<ArrayResource> res(new ArrayResource(bundle, tag, comment, *status), *status); + return U_SUCCESS(*status) ? res.orphan() : nullptr; +} + +struct SResource *string_open(struct SRBRoot *bundle, const char *tag, const char16_t *value, int32_t len, const struct UString* comment, UErrorCode *status) { + LocalPointer<SResource> res( + new StringResource(bundle, tag, value, len, comment, *status), *status); + return U_SUCCESS(*status) ? res.orphan() : nullptr; +} + +struct SResource *alias_open(struct SRBRoot *bundle, const char *tag, char16_t *value, int32_t len, const struct UString* comment, UErrorCode *status) { + LocalPointer<SResource> res( + new AliasResource(bundle, tag, value, len, comment, *status), *status); + return U_SUCCESS(*status) ? res.orphan() : nullptr; +} + +IntVectorResource *intvector_open(struct SRBRoot *bundle, const char *tag, const struct UString* comment, UErrorCode *status) { + LocalPointer<IntVectorResource> res( + new IntVectorResource(bundle, tag, comment, *status), *status); + return U_SUCCESS(*status) ? res.orphan() : nullptr; +} + +struct SResource *int_open(struct SRBRoot *bundle, const char *tag, int32_t value, const struct UString* comment, UErrorCode *status) { + LocalPointer<SResource> res(new IntResource(bundle, tag, value, comment, *status), *status); + return U_SUCCESS(*status) ? res.orphan() : nullptr; +} + +struct SResource *bin_open(struct SRBRoot *bundle, const char *tag, uint32_t length, uint8_t *data, const char* fileName, const struct UString* comment, UErrorCode *status) { + LocalPointer<SResource> res( + new BinaryResource(bundle, tag, length, data, fileName, comment, *status), *status); + return U_SUCCESS(*status) ? 
res.orphan() : nullptr; +} + +SRBRoot::SRBRoot(const UString *comment, UBool isPoolBundle, UErrorCode &errorCode) + : fRoot(nullptr), fLocale(nullptr), fIndexLength(0), fMaxTableLength(0), fNoFallback(false), + fStringsForm(STRINGS_UTF16_V1), fIsPoolBundle(isPoolBundle), + fKeys(nullptr), fKeyMap(nullptr), + fKeysBottom(0), fKeysTop(0), fKeysCapacity(0), + fKeysCount(0), fLocalKeyLimit(0), + f16BitUnits(), f16BitStringsLength(0), + fUsePoolBundle(&kNoPoolBundle), + fPoolStringIndexLimit(0), fPoolStringIndex16Limit(0), fLocalStringIndexLimit(0), + fWritePoolBundle(nullptr) { + if (U_FAILURE(errorCode)) { + return; + } + + if (gFormatVersion > 1) { + // f16BitUnits must start with a zero for empty resources. + // We might be able to omit it if there are no empty 16-bit resources. + f16BitUnits.append((char16_t)0); + } + + fKeys = (char *) uprv_malloc(sizeof(char) * KEY_SPACE_SIZE); + if (isPoolBundle) { + fRoot = new PseudoListResource(this, errorCode); + } else { + fRoot = new TableResource(this, nullptr, comment, errorCode); + } + if (fKeys == nullptr || fRoot == nullptr || U_FAILURE(errorCode)) { + if (U_SUCCESS(errorCode)) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + } + return; + } + + fKeysCapacity = KEY_SPACE_SIZE; + /* formatVersion 1.1 and up: start fKeysTop after the root item and indexes[] */ + if (gUsePoolBundle || isPoolBundle) { + fIndexLength = URES_INDEX_POOL_CHECKSUM + 1; + } else if (gFormatVersion >= 2) { + fIndexLength = URES_INDEX_16BIT_TOP + 1; + } else /* formatVersion 1 */ { + fIndexLength = URES_INDEX_ATTRIBUTES + 1; + } + fKeysBottom = (1 /* root */ + fIndexLength) * 4; + uprv_memset(fKeys, 0, fKeysBottom); + fKeysTop = fKeysBottom; + + if (gFormatVersion == 1) { + fStringsForm = STRINGS_UTF16_V1; + } else { + fStringsForm = STRINGS_UTF16_V2; + } +} + +/* Closing Functions */ + +void res_close(struct SResource *res) { + delete res; +} + +SRBRoot::~SRBRoot() { + delete fRoot; + uprv_free(fLocale); + uprv_free(fKeys); + uprv_free(fKeyMap); +} 
+ +/* Misc Functions */ + +void SRBRoot::setLocale(char16_t *locale, UErrorCode &errorCode) { + if(U_FAILURE(errorCode)) { + return; + } + + uprv_free(fLocale); + fLocale = (char*) uprv_malloc(sizeof(char) * (u_strlen(locale)+1)); + if(fLocale == nullptr) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return; + } + + u_UCharsToChars(locale, fLocale, u_strlen(locale)+1); +} + +const char * +SRBRoot::getKeyString(int32_t key) const { + if (key < 0) { + return fUsePoolBundle->fKeys + (key & 0x7fffffff); + } else { + return fKeys + key; + } +} + +const char * +SResource::getKeyString(const SRBRoot *bundle) const { + if (fKey == -1) { + return nullptr; + } + return bundle->getKeyString(fKey); +} + +const char * +SRBRoot::getKeyBytes(int32_t *pLength) const { + *pLength = fKeysTop - fKeysBottom; + return fKeys + fKeysBottom; +} + +int32_t +SRBRoot::addKeyBytes(const char *keyBytes, int32_t length, UErrorCode &errorCode) { + int32_t keypos; + + // It is not legal to add new key bytes after compactKeys is run! 
+ U_ASSERT(fKeyMap == nullptr); + + if (U_FAILURE(errorCode)) { + return -1; + } + if (length < 0 || (keyBytes == nullptr && length != 0)) { + errorCode = U_ILLEGAL_ARGUMENT_ERROR; + return -1; + } + if (length == 0) { + return fKeysTop; + } + + keypos = fKeysTop; + fKeysTop += length; + if (fKeysTop >= fKeysCapacity) { + /* overflow - resize the keys buffer */ + fKeysCapacity += KEY_SPACE_SIZE; + fKeys = static_cast<char *>(uprv_realloc(fKeys, fKeysCapacity)); + if(fKeys == nullptr) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return -1; + } + } + + uprv_memcpy(fKeys + keypos, keyBytes, length); + + return keypos; +} + +int32_t +SRBRoot::addTag(const char *tag, UErrorCode &errorCode) { + int32_t keypos; + + if (U_FAILURE(errorCode)) { + return -1; + } + + if (tag == nullptr) { + /* no error: the root table and array items have no keys */ + return -1; + } + + keypos = addKeyBytes(tag, (int32_t)(uprv_strlen(tag) + 1), errorCode); + if (U_SUCCESS(errorCode)) { + ++fKeysCount; + } + return keypos; +} + +static int32_t +compareInt32(int32_t lPos, int32_t rPos) { + /* + * Compare possibly-negative key offsets. Don't just return lPos - rPos + * because that is prone to negative-integer underflows. 
+ */ + if (lPos < rPos) { + return -1; + } else if (lPos > rPos) { + return 1; + } else { + return 0; + } +} + +static int32_t U_CALLCONV +compareKeySuffixes(const void *context, const void *l, const void *r) { + const struct SRBRoot *bundle=(const struct SRBRoot *)context; + int32_t lPos = ((const KeyMapEntry *)l)->oldpos; + int32_t rPos = ((const KeyMapEntry *)r)->oldpos; + const char *lStart = bundle->getKeyString(lPos); + const char *lLimit = lStart; + const char *rStart = bundle->getKeyString(rPos); + const char *rLimit = rStart; + int32_t diff; + while (*lLimit != 0) { ++lLimit; } + while (*rLimit != 0) { ++rLimit; } + /* compare keys in reverse character order */ + while (lStart < lLimit && rStart < rLimit) { + diff = (int32_t)(uint8_t)*--lLimit - (int32_t)(uint8_t)*--rLimit; + if (diff != 0) { + return diff; + } + } + /* sort equal suffixes by descending key length */ + diff = (int32_t)(rLimit - rStart) - (int32_t)(lLimit - lStart); + if (diff != 0) { + return diff; + } + /* Sort pool bundle keys first (negative oldpos), and otherwise keys in parsing order. */ + return compareInt32(lPos, rPos); +} + +static int32_t U_CALLCONV +compareKeyNewpos(const void * /*context*/, const void *l, const void *r) { + return compareInt32(((const KeyMapEntry *)l)->newpos, ((const KeyMapEntry *)r)->newpos); +} + +static int32_t U_CALLCONV +compareKeyOldpos(const void * /*context*/, const void *l, const void *r) { + return compareInt32(((const KeyMapEntry *)l)->oldpos, ((const KeyMapEntry *)r)->oldpos); +} + +void SResource::collectKeys(std::function<void(int32_t)> collector) const { + collector(fKey); +} + +void ContainerResource::collectKeys(std::function<void(int32_t)> collector) const { + collector(fKey); + for (SResource* curr = fFirst; curr != nullptr; curr = curr->fNext) { + curr->collectKeys(collector); + } +} + +void +SRBRoot::compactKeys(UErrorCode &errorCode) { + KeyMapEntry *map; + char *keys; + int32_t i; + + // Except for pool bundles, keys might not be used. 
+ // Do not add unused keys to the final bundle. + std::set<int32_t> keysInUse; + if (!fIsPoolBundle) { + fRoot->collectKeys([&keysInUse](int32_t key) { + if (key >= 0) { + keysInUse.insert(key); + } + }); + fKeysCount = static_cast<int32_t>(keysInUse.size()); + } + + int32_t keysCount = fUsePoolBundle->fKeysCount + fKeysCount; + if (U_FAILURE(errorCode) || fKeyMap != nullptr) { + return; + } + map = (KeyMapEntry *)uprv_malloc(keysCount * sizeof(KeyMapEntry)); + if (map == nullptr) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return; + } + keys = (char *)fUsePoolBundle->fKeys; + for (i = 0; i < fUsePoolBundle->fKeysCount; ++i) { + map[i].oldpos = + (int32_t)(keys - fUsePoolBundle->fKeys) | 0x80000000; /* negative oldpos */ + map[i].newpos = 0; + while (*keys != 0) { ++keys; } /* skip the key */ + ++keys; /* skip the NUL */ + } + keys = fKeys + fKeysBottom; + while (i < keysCount) { + int32_t keyOffset = static_cast<int32_t>(keys - fKeys); + if (!fIsPoolBundle && keysInUse.count(keyOffset) == 0) { + // Mark the unused key as deleted + while (*keys != 0) { *keys++ = 1; } + *keys++ = 1; + } else { + map[i].oldpos = keyOffset; + map[i].newpos = 0; + while (*keys != 0) { ++keys; } /* skip the key */ + ++keys; /* skip the NUL */ + i++; + } + } + if (keys != fKeys + fKeysTop) { + // Throw away any unused keys from the end + fKeysTop = static_cast<int32_t>(keys - fKeys); + } + /* Sort the keys so that each one is immediately followed by all of its suffixes. */ + uprv_sortArray(map, keysCount, (int32_t)sizeof(KeyMapEntry), + compareKeySuffixes, this, false, &errorCode); + /* + * Make suffixes point into earlier, longer strings that contain them + * and mark the old, now unused suffix bytes as deleted. + */ + if (U_SUCCESS(errorCode)) { + keys = fKeys; + for (i = 0; i < keysCount;) { + /* + * This key is not a suffix of the previous one; + * keep this one and delete the following ones that are + * suffixes of this one. 
+ */ + const char *key; + const char *keyLimit; + int32_t j = i + 1; + map[i].newpos = map[i].oldpos; + if (j < keysCount && map[j].oldpos < 0) { + /* Key string from the pool bundle, do not delete. */ + i = j; + continue; + } + key = getKeyString(map[i].oldpos); + for (keyLimit = key; *keyLimit != 0; ++keyLimit) {} + for (; j < keysCount && map[j].oldpos >= 0; ++j) { + const char *k; + char *suffix; + const char *suffixLimit; + int32_t offset; + suffix = keys + map[j].oldpos; + for (suffixLimit = suffix; *suffixLimit != 0; ++suffixLimit) {} + offset = static_cast<int32_t>((keyLimit - key) - (suffixLimit - suffix)); + if (offset < 0) { + break; /* suffix cannot be longer than the original */ + } + /* Is it a suffix of the earlier, longer key? */ + for (k = keyLimit; suffix < suffixLimit && *--k == *--suffixLimit;) {} + if (suffix == suffixLimit && *k == *suffixLimit) { + map[j].newpos = map[i].oldpos + offset; /* yes, point to the earlier key */ + // Mark the suffix as deleted + while (*suffix != 0) { *suffix++ = 1; } + *suffix = 1; + } else { + break; /* not a suffix, restart from here */ + } + } + i = j; + } + /* + * Re-sort by newpos, then modify the key characters array in-place + * to squeeze out unused bytes, and readjust the newpos offsets. 
+ */ + uprv_sortArray(map, keysCount, (int32_t)sizeof(KeyMapEntry), + compareKeyNewpos, nullptr, false, &errorCode); + if (U_SUCCESS(errorCode)) { + int32_t oldpos, newpos, limit; + oldpos = newpos = fKeysBottom; + limit = fKeysTop; + /* skip key offsets that point into the pool bundle rather than this new bundle */ + for (i = 0; i < keysCount && map[i].newpos < 0; ++i) {} + if (i < keysCount) { + while (oldpos < limit) { + if (keys[oldpos] == 1) { + ++oldpos; /* skip unused bytes */ + } else { + /* adjust the new offsets for keys starting here */ + while (i < keysCount && map[i].newpos == oldpos) { + map[i++].newpos = newpos; + } + /* move the key characters to their new position */ + keys[newpos++] = keys[oldpos++]; + } + } + U_ASSERT(i == keysCount); + } + fKeysTop = newpos; + /* Re-sort once more, by old offsets for binary searching. */ + uprv_sortArray(map, keysCount, (int32_t)sizeof(KeyMapEntry), + compareKeyOldpos, nullptr, false, &errorCode); + if (U_SUCCESS(errorCode)) { + /* key size reduction by limit - newpos */ + fKeyMap = map; + map = nullptr; + } + } + } + uprv_free(map); +} + +static int32_t U_CALLCONV +compareStringSuffixes(const void * /*context*/, const void *l, const void *r) { + const StringResource *left = *((const StringResource **)l); + const StringResource *right = *((const StringResource **)r); + const char16_t *lStart = left->getBuffer(); + const char16_t *lLimit = lStart + left->length(); + const char16_t *rStart = right->getBuffer(); + const char16_t *rLimit = rStart + right->length(); + int32_t diff; + /* compare keys in reverse character order */ + while (lStart < lLimit && rStart < rLimit) { + diff = (int32_t)*--lLimit - (int32_t)*--rLimit; + if (diff != 0) { + return diff; + } + } + /* sort equal suffixes by descending string length */ + return right->length() - left->length(); +} + +static int32_t U_CALLCONV +compareStringLengths(const void * /*context*/, const void *l, const void *r) { + const StringResource *left = *((const 
StringResource **)l); + const StringResource *right = *((const StringResource **)r); + int32_t diff; + /* Make "is suffix of another string" compare greater than a non-suffix. */ + diff = (int)(left->fSame != nullptr) - (int)(right->fSame != nullptr); + if (diff != 0) { + return diff; + } + /* sort by ascending string length */ + diff = left->length() - right->length(); + if (diff != 0) { + return diff; + } + // sort by descending size reduction + diff = right->fNumUnitsSaved - left->fNumUnitsSaved; + if (diff != 0) { + return diff; + } + // sort lexically + return left->fString.compare(right->fString); +} + /* Appends this string to dest in string-v2 form: zero to three length-prefix units selected by fNumCharsForLength, then the string contents, then a terminating NUL unit. Before appending, fRes is set to a URES_STRING_V2 resource word whose offset is base + the current dest.length(), and fWritten is set. */ +void +StringResource::writeUTF16v2(int32_t base, UnicodeString &dest) { + int32_t len = length(); + fRes = URES_MAKE_RESOURCE(URES_STRING_V2, base + dest.length()); + fWritten = true; + switch(fNumCharsForLength) { + case 0: + break; + case 1: + dest.append((char16_t)(0xdc00 + len)); + break; + case 2: + dest.append((char16_t)(0xdfef + (len >> 16))); + dest.append((char16_t)len); + break; + case 3: + dest.append((char16_t)0xdfff); + dest.append((char16_t)(len >> 16)); + dest.append((char16_t)len); + break; + default: + break; /* will not occur */ + } + dest.append(fString); + dest.append((char16_t)0); +} + /* Compacts the strings in stringSet into f16BitUnits. Steps: copy the set's StringResource keys into an array; sort with compareStringSuffixes and link each suffix to the longer string that ends with it (fSame / fSuffixOffset); re-sort with compareStringLengths; then either write only sufficiently shared strings (pool bundle) or write all non-suffix strings and resolve the suffix strings against them (regular bundle). Returns early, doing nothing further, whenever errorCode holds a failure. */ +void +SRBRoot::compactStringsV2(UHashtable *stringSet, UErrorCode &errorCode) { + if (U_FAILURE(errorCode)) { + return; + } + // Store the StringResource pointers in an array for + // easy sorting and processing. + // We enumerate a set of strings, so there are no duplicates. + int32_t count = uhash_count(stringSet); + LocalArray<StringResource *> array(new StringResource *[count], errorCode); + if (U_FAILURE(errorCode)) { + return; + } + for (int32_t pos = UHASH_FIRST, i = 0; i < count; ++i) { + array[i] = (StringResource *)uhash_nextElement(stringSet, &pos)->key.pointer; + } + /* Sort the strings so that each one is immediately followed by all of its suffixes. 
*/ + uprv_sortArray(array.getAlias(), count, (int32_t)sizeof(struct SResource **), + compareStringSuffixes, nullptr, false, &errorCode); + if (U_FAILURE(errorCode)) { + return; + } + /* + * Make suffixes point into earlier, longer strings that contain them. + * Temporarily use fSame and fSuffixOffset for suffix strings to + * refer to the remaining ones. + */ + for (int32_t i = 0; i < count;) { + /* + * This string is not a suffix of the previous one; + * write this one and subsume the following ones that are + * suffixes of this one. + */ + StringResource *res = array[i]; + res->fNumUnitsSaved = (res->fNumCopies - 1) * res->get16BitStringsLength(); + // Whole duplicates of pool strings are already accounted for in fPoolStringIndexLimit, + // see StringResource::handlePreflightStrings(). + int32_t j; + for (j = i + 1; j < count; ++j) { + StringResource *suffixRes = array[j]; + /* Is it a suffix of the earlier, longer string? */ + if (res->fString.endsWith(suffixRes->fString)) { + assert(res->length() != suffixRes->length()); // Set strings are unique. + if (suffixRes->fWritten) { + // Pool string, skip. + } else if (suffixRes->fNumCharsForLength == 0) { + /* yes, point to the earlier string */ + suffixRes->fSame = res; + suffixRes->fSuffixOffset = res->length() - suffixRes->length(); + if (res->fWritten) { + // Suffix-share res which is a pool string. + // Compute the resource word and collect the maximum. 
+ suffixRes->fRes = + res->fRes + res->fNumCharsForLength + suffixRes->fSuffixOffset; + int32_t poolStringIndex = (int32_t)RES_GET_OFFSET(suffixRes->fRes); + if (poolStringIndex >= fPoolStringIndexLimit) { + fPoolStringIndexLimit = poolStringIndex + 1; + } + suffixRes->fWritten = true; + } + res->fNumUnitsSaved += suffixRes->fNumCopies * suffixRes->get16BitStringsLength(); + } else { + /* write the suffix by itself if we need explicit length */ + } + } else { + break; /* not a suffix, restart from here */ + } + } + i = j; + } + /* + * Re-sort the strings by ascending length (except suffixes last) + * to optimize for URES_TABLE16 and URES_ARRAY16: + * Keep as many as possible within reach of 16-bit offsets. + */ + uprv_sortArray(array.getAlias(), count, (int32_t)sizeof(struct SResource **), + compareStringLengths, nullptr, false, &errorCode); + if (U_FAILURE(errorCode)) { + return; + } + if (fIsPoolBundle) { + // Write strings that are sufficiently shared. + // Avoid writing other strings. + int32_t numStringsWritten = 0; + int32_t numUnitsSaved = 0; + int32_t numUnitsNotSaved = 0; + for (int32_t i = 0; i < count; ++i) { + StringResource *res = array[i]; + // Maximum pool string index when suffix-sharing the last character. 
+ int32_t maxStringIndex = + f16BitUnits.length() + res->fNumCharsForLength + res->length() - 1; + if (res->fNumUnitsSaved >= GENRB_MIN_16BIT_UNITS_SAVED_FOR_POOL_STRING && + maxStringIndex < RES_MAX_OFFSET) { + res->writeUTF16v2(0, f16BitUnits); + ++numStringsWritten; + numUnitsSaved += res->fNumUnitsSaved; + } else { + numUnitsNotSaved += res->fNumUnitsSaved; + res->fRes = URES_MAKE_EMPTY_RESOURCE(URES_STRING); + res->fWritten = true; + } + } + if (f16BitUnits.isBogus()) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + } + if (getShowWarning()) { // not quiet + printf("number of shared strings: %d\n", (int)numStringsWritten); + printf("16-bit units for strings: %6d = %6d bytes\n", + (int)f16BitUnits.length(), (int)f16BitUnits.length() * 2); + printf("16-bit units saved: %6d = %6d bytes\n", + (int)numUnitsSaved, (int)numUnitsSaved * 2); + printf("16-bit units not saved: %6d = %6d bytes\n", + (int)numUnitsNotSaved, (int)numUnitsNotSaved * 2); + } + } else { + assert(fPoolStringIndexLimit <= fUsePoolBundle->fStringIndexLimit); + /* Write the non-suffix strings. 
*/ + int32_t i; + for (i = 0; i < count && array[i]->fSame == nullptr; ++i) { + StringResource *res = array[i]; + if (!res->fWritten) { + int32_t localStringIndex = f16BitUnits.length(); + if (localStringIndex >= fLocalStringIndexLimit) { + fLocalStringIndexLimit = localStringIndex + 1; + } + res->writeUTF16v2(fPoolStringIndexLimit, f16BitUnits); + } + } + if (f16BitUnits.isBogus()) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return; + } + if (fWritePoolBundle != nullptr && gFormatVersion >= 3) { + PseudoListResource *poolStrings = + static_cast<PseudoListResource *>(fWritePoolBundle->fRoot); + for (i = 0; i < count && array[i]->fSame == nullptr; ++i) { + assert(!array[i]->fString.isEmpty()); + StringResource *poolString = + new StringResource(fWritePoolBundle, array[i]->fString, errorCode); + if (poolString == nullptr) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + break; + } + poolStrings->add(poolString); + } + } + /* Write the suffix strings. Make each point to the real string. */ + for (; i < count; ++i) { + StringResource *res = array[i]; + if (res->fWritten) { + continue; + } + StringResource *same = res->fSame; + assert(res->length() != same->length()); // Set strings are unique. + res->fRes = same->fRes + same->fNumCharsForLength + res->fSuffixOffset; + int32_t localStringIndex = (int32_t)RES_GET_OFFSET(res->fRes) - fPoolStringIndexLimit; + // Suffixes of pool strings have been set already. + assert(localStringIndex >= 0); + if (localStringIndex >= fLocalStringIndexLimit) { + fLocalStringIndexLimit = localStringIndex + 1; + } + res->fWritten = true; + } + } + // +1 to account for the initial zero in f16BitUnits + assert(f16BitUnits.length() <= (f16BitStringsLength + 1)); +} + +void SResource::applyFilter( + const PathFilter& /*filter*/, + ResKeyPath& /*path*/, + const SRBRoot* /*bundle*/) { + // Only a few resource types (tables) are capable of being filtered. 
+} + /* Filters this table's children in place. For each child, its key is pushed onto path and matched against filter: INCLUDE keeps the whole subtree, EXCLUDE unlinks the child from the fFirst/fNext list, decrements fCount and deletes it, and the remaining case (asserted to be PARTIAL) recurses into the child. After a deletion curr is reset to prev, so the advance step at the bottom of the loop continues from the predecessor, or from fFirst when the head was removed. */ +void TableResource::applyFilter( + const PathFilter& filter, + ResKeyPath& path, + const SRBRoot* bundle) { + SResource* prev = nullptr; + SResource* curr = fFirst; + for (; curr != nullptr;) { + path.push(curr->getKeyString(bundle)); + auto inclusion = filter.match(path); + if (inclusion == PathFilter::EInclusion::INCLUDE) { + // Include whole subtree + // no-op + if (isVerbose()) { + std::cout << "genrb subtree: " << bundle->fLocale << ": INCLUDE: " << path << std::endl; + } + } else if (inclusion == PathFilter::EInclusion::EXCLUDE) { + // Reject the whole subtree + // Remove it from the linked list + if (isVerbose()) { + std::cout << "genrb subtree: " << bundle->fLocale << ": DELETE: " << path << std::endl; + } + if (prev == nullptr) { + fFirst = curr->fNext; + } else { + prev->fNext = curr->fNext; + } + fCount--; + delete curr; + curr = prev; + } else { + U_ASSERT(inclusion == PathFilter::EInclusion::PARTIAL); + // Recurse into the child + curr->applyFilter(filter, path, bundle); + } + path.pop(); + + prev = curr; + if (curr == nullptr) { + curr = fFirst; + } else { + curr = curr->fNext; + } + } +} diff --git a/intl/icu/source/tools/genrb/reslist.h b/intl/icu/source/tools/genrb/reslist.h new file mode 100644 index 0000000000..17797bc36c --- /dev/null +++ b/intl/icu/source/tools/genrb/reslist.h @@ -0,0 +1,446 @@ +// © 2016 and later: Unicode, Inc. and others. 
+// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2000-2015, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* +* File reslist.h +* +* Modification History: +* +* Date Name Description +* 02/21/00 weiv Creation. 
+******************************************************************************* +*/ + +#ifndef RESLIST_H +#define RESLIST_H + +#define KEY_SPACE_SIZE 65536 +#define RESLIST_INT_VECTOR_INIT_SIZE 2048 + +#include <functional> + +#include "unicode/utypes.h" +#include "unicode/unistr.h" +#include "unicode/ures.h" +#include "unicode/ustring.h" +#include "cmemory.h" +#include "cstring.h" +#include "uhash.h" +#include "unewdata.h" +#include "uresdata.h" +#include "ustr.h" + +U_CDECL_BEGIN + +class PathFilter; +class PseudoListResource; +class ResKeyPath; + +struct ResFile { + ResFile() + : fBytes(nullptr), fIndexes(nullptr), + fKeys(nullptr), fKeysLength(0), fKeysCount(0), + fStrings(nullptr), fStringIndexLimit(0), + fChecksum(0) {} + ~ResFile() { close(); } + + void close(); + + uint8_t *fBytes; + const int32_t *fIndexes; + const char *fKeys; + int32_t fKeysLength; + int32_t fKeysCount; + + PseudoListResource *fStrings; + int32_t fStringIndexLimit; + + int32_t fChecksum; +}; + +struct SResource; + +typedef struct KeyMapEntry { + int32_t oldpos, newpos; +} KeyMapEntry; + +/* Resource bundle root table */ +struct SRBRoot { + SRBRoot(const UString *comment, UBool isPoolBundle, UErrorCode &errorCode); + ~SRBRoot(); + + void write(const char *outputDir, const char *outputPkg, + char *writtenFilename, int writtenFilenameLen, UErrorCode &errorCode); + + void setLocale(char16_t *locale, UErrorCode &errorCode); + int32_t addTag(const char *tag, UErrorCode &errorCode); + + const char *getKeyString(int32_t key) const; + const char *getKeyBytes(int32_t *pLength) const; + + int32_t addKeyBytes(const char *keyBytes, int32_t length, UErrorCode &errorCode); + + void compactKeys(UErrorCode &errorCode); + + int32_t makeRes16(uint32_t resWord) const; + int32_t mapKey(int32_t oldpos) const; + +private: + void compactStringsV2(UHashtable *stringSet, UErrorCode &errorCode); + +public: + // TODO: private + + SResource *fRoot; // Normally a TableResource. 
+ char *fLocale; + int32_t fIndexLength; + int32_t fMaxTableLength; + UBool fNoFallback; /* see URES_ATT_NO_FALLBACK */ + int8_t fStringsForm; /* default STRINGS_UTF16_V1 */ + UBool fIsPoolBundle; + + char *fKeys; + KeyMapEntry *fKeyMap; + int32_t fKeysBottom, fKeysTop; + int32_t fKeysCapacity; + int32_t fKeysCount; + int32_t fLocalKeyLimit; /* key offset < limit fits into URES_TABLE */ + + icu::UnicodeString f16BitUnits; + int32_t f16BitStringsLength; + + const ResFile *fUsePoolBundle; + int32_t fPoolStringIndexLimit; + int32_t fPoolStringIndex16Limit; + int32_t fLocalStringIndexLimit; + SRBRoot *fWritePoolBundle; +}; + +/* write a java resource file */ +// TODO: C++ify +void bundle_write_java(struct SRBRoot *bundle, const char *outputDir, const char* outputEnc, char *writtenFilename, + int writtenFilenameLen, const char* packageName, const char* bundleName, UErrorCode *status); + +/* write a xml resource file */ +// TODO: C++ify +void bundle_write_xml(struct SRBRoot *bundle, const char *outputDir,const char* outputEnc, const char* rbname, + char *writtenFilename, int writtenFilenameLen, const char* language, const char* package, UErrorCode *status); + +/* Various resource types */ + +/* + * Return a unique pointer to a dummy object, + * for use in non-error cases when no resource is to be added to the bundle. + * (nullptr is used in error cases.) 
+ */ +struct SResource* res_none(); + +class ArrayResource; +class TableResource; +class IntVectorResource; + +TableResource *table_open(struct SRBRoot *bundle, const char *tag, const struct UString* comment, UErrorCode *status); + +ArrayResource *array_open(struct SRBRoot *bundle, const char *tag, const struct UString* comment, UErrorCode *status); + +struct SResource *string_open(struct SRBRoot *bundle, const char *tag, const char16_t *value, int32_t len, const struct UString* comment, UErrorCode *status); + +struct SResource *alias_open(struct SRBRoot *bundle, const char *tag, char16_t *value, int32_t len, const struct UString* comment, UErrorCode *status); + +IntVectorResource *intvector_open(struct SRBRoot *bundle, const char *tag, const struct UString* comment, UErrorCode *status); + +struct SResource *int_open(struct SRBRoot *bundle, const char *tag, int32_t value, const struct UString* comment, UErrorCode *status); + +struct SResource *bin_open(struct SRBRoot *bundle, const char *tag, uint32_t length, uint8_t *data, const char* fileName, const struct UString* comment, UErrorCode *status); + +/* Resource place holder */ + +struct SResource { + SResource(); + SResource(SRBRoot *bundle, const char *tag, int8_t type, const UString* comment, + UErrorCode &errorCode); + virtual ~SResource(); + + UBool isTable() const { return fType == URES_TABLE; } + UBool isString() const { return fType == URES_STRING; } + + const char *getKeyString(const SRBRoot *bundle) const; + + /** + * Preflights strings. + * Finds duplicates and counts the total number of string code units + * so that they can be written first to the 16-bit array, + * for minimal string and container storage. + * + * We walk the final parse tree, rather than collecting this information while building it, + * so that we need not deal with changes to the parse tree (especially removing resources). 
+ */ + void preflightStrings(SRBRoot *bundle, UHashtable *stringSet, UErrorCode &errorCode); + virtual void handlePreflightStrings(SRBRoot *bundle, UHashtable *stringSet, UErrorCode &errorCode); + + /** + * Writes resource values into f16BitUnits + * and determines the resource item word, if possible. + */ + void write16(SRBRoot *bundle); + virtual void handleWrite16(SRBRoot *bundle); + + /** + * Calculates ("preflights") and advances the *byteOffset + * by the size of the resource's data in the binary file and + * determines the resource item word. + * + * Most handlePreWrite() functions may add any number of bytes, but preWrite() + * will always pad it to a multiple of 4. + * The resource item type may be a related subtype of the fType. + * + * The preWrite() and write() functions start and end at the same + * byteOffset values. + * Prewriting allows bundle.write() to determine the root resource item word, + * before actually writing the bundle contents to the file, + * which is necessary because the root item is stored at the beginning. + */ + void preWrite(uint32_t *byteOffset); + virtual void handlePreWrite(uint32_t *byteOffset); + + /** + * Writes the resource's data to mem and updates the byteOffset + * in parallel. + */ + void write(UNewDataMemory *mem, uint32_t *byteOffset); + virtual void handleWrite(UNewDataMemory *mem, uint32_t *byteOffset); + + /** + * Applies the given filter with the given base path to this resource. + * Removes child resources rejected by the filter recursively. + * + * @param bundle Needed in order to access the key for this and child resources. + */ + virtual void applyFilter(const PathFilter& filter, ResKeyPath& path, const SRBRoot* bundle); + + /** + * Calls the given function for every key ID present in this tree. 
+ */ + virtual void collectKeys(std::function<void(int32_t)> collector) const; + + int8_t fType; /* nominal type: fRes (when != 0xffffffff) may use subtype */ + UBool fWritten; /* res_write() can exit early */ + uint32_t fRes; /* resource item word; RES_BOGUS=0xffffffff if not known yet */ + int32_t fRes16; /* Res16 version of fRes for Table, Table16, Array16; -1 if it does not fit. */ + int32_t fKey; /* Index into bundle->fKeys; -1 if no key. */ + int32_t fKey16; /* Key16 version of fKey for Table & Table16; -1 if no key or it does not fit. */ + int line; /* used internally to report duplicate keys in tables */ + SResource *fNext; /* This is for internal chaining while building */ + struct UString fComment; +}; + +class ContainerResource : public SResource { +public: + ContainerResource(SRBRoot *bundle, const char *tag, int8_t type, + const UString* comment, UErrorCode &errorCode) + : SResource(bundle, tag, type, comment, errorCode), + fCount(0), fFirst(nullptr) {} + virtual ~ContainerResource(); + + void handlePreflightStrings(SRBRoot *bundle, UHashtable *stringSet, UErrorCode &errorCode) override; + + void collectKeys(std::function<void(int32_t)> collector) const override; + +protected: + void writeAllRes16(SRBRoot *bundle); + void preWriteAllRes(uint32_t *byteOffset); + void writeAllRes(UNewDataMemory *mem, uint32_t *byteOffset); + void writeAllRes32(UNewDataMemory *mem, uint32_t *byteOffset); + +public: + // TODO: private with getter? 
+ uint32_t fCount; + SResource *fFirst; +}; + +class TableResource : public ContainerResource { +public: + TableResource(SRBRoot *bundle, const char *tag, + const UString* comment, UErrorCode &errorCode) + : ContainerResource(bundle, tag, URES_TABLE, comment, errorCode), + fTableType(URES_TABLE), fRoot(bundle) {} + virtual ~TableResource(); + + void add(SResource *res, int linenumber, UErrorCode &errorCode); + + void handleWrite16(SRBRoot *bundle) override; + void handlePreWrite(uint32_t *byteOffset) override; + void handleWrite(UNewDataMemory *mem, uint32_t *byteOffset) override; + + void applyFilter(const PathFilter& filter, ResKeyPath& path, const SRBRoot* bundle) override; + + int8_t fTableType; // determined by table_write16() for table_preWrite() & table_write() + SRBRoot *fRoot; +}; + +class ArrayResource : public ContainerResource { +public: + ArrayResource(SRBRoot *bundle, const char *tag, + const UString* comment, UErrorCode &errorCode) + : ContainerResource(bundle, tag, URES_ARRAY, comment, errorCode), + fLast(nullptr) {} + virtual ~ArrayResource(); + + void add(SResource *res); + + virtual void handleWrite16(SRBRoot *bundle) override; + virtual void handlePreWrite(uint32_t *byteOffset) override; + virtual void handleWrite(UNewDataMemory *mem, uint32_t *byteOffset) override; + + SResource *fLast; +}; + +/** + * List of resources for a pool bundle. + * Writes an empty table resource, rather than a container structure. 
+ */ +class PseudoListResource : public ContainerResource { +public: + PseudoListResource(SRBRoot *bundle, UErrorCode &errorCode) + : ContainerResource(bundle, nullptr, URES_TABLE, nullptr, errorCode) {} + virtual ~PseudoListResource(); + + void add(SResource *res); + + virtual void handleWrite16(SRBRoot *bundle) override; +}; + +class StringBaseResource : public SResource { +public: + StringBaseResource(SRBRoot *bundle, const char *tag, int8_t type, + const char16_t *value, int32_t len, + const UString* comment, UErrorCode &errorCode); + StringBaseResource(SRBRoot *bundle, int8_t type, + const icu::UnicodeString &value, UErrorCode &errorCode); + StringBaseResource(int8_t type, const char16_t *value, int32_t len, UErrorCode &errorCode); + virtual ~StringBaseResource(); + + const char16_t *getBuffer() const { return icu::toUCharPtr(fString.getBuffer()); } + int32_t length() const { return fString.length(); } + + virtual void handlePreWrite(uint32_t *byteOffset) override; + virtual void handleWrite(UNewDataMemory *mem, uint32_t *byteOffset) override; + + // TODO: private with getter? 
+ icu::UnicodeString fString; +}; + +class StringResource : public StringBaseResource { +public: + StringResource(SRBRoot *bundle, const char *tag, const char16_t *value, int32_t len, + const UString* comment, UErrorCode &errorCode) + : StringBaseResource(bundle, tag, URES_STRING, value, len, comment, errorCode), + fSame(nullptr), fSuffixOffset(0), + fNumCopies(0), fNumUnitsSaved(0), fNumCharsForLength(0) {} + StringResource(SRBRoot *bundle, const icu::UnicodeString &value, UErrorCode &errorCode) + : StringBaseResource(bundle, URES_STRING, value, errorCode), + fSame(nullptr), fSuffixOffset(0), + fNumCopies(0), fNumUnitsSaved(0), fNumCharsForLength(0) {} + StringResource(int32_t poolStringIndex, int8_t numCharsForLength, + const char16_t *value, int32_t length, + UErrorCode &errorCode) + : StringBaseResource(URES_STRING, value, length, errorCode), + fSame(nullptr), fSuffixOffset(0), + fNumCopies(0), fNumUnitsSaved(0), fNumCharsForLength(numCharsForLength) { + // v3 pool string encoded as string-v2 with low offset + fRes = URES_MAKE_RESOURCE(URES_STRING_V2, poolStringIndex); + fWritten = true; + } + virtual ~StringResource(); + + int32_t get16BitStringsLength() const { + return fNumCharsForLength + length() + 1; // +1 for the NUL + } + + virtual void handlePreflightStrings(SRBRoot *bundle, UHashtable *stringSet, UErrorCode &errorCode) override; + virtual void handleWrite16(SRBRoot *bundle) override; + + void writeUTF16v2(int32_t base, icu::UnicodeString &dest); + + StringResource *fSame; // used for duplicates + int32_t fSuffixOffset; // this string is a suffix of fSame at this offset + int32_t fNumCopies; // number of equal strings represented by one stringSet element + int32_t fNumUnitsSaved; // from not writing duplicates and suffixes + int8_t fNumCharsForLength; +}; + +class AliasResource : public StringBaseResource { +public: + AliasResource(SRBRoot *bundle, const char *tag, const char16_t *value, int32_t len, + const UString* comment, UErrorCode &errorCode) + : 
StringBaseResource(bundle, tag, URES_ALIAS, value, len, comment, errorCode) {} + virtual ~AliasResource(); +}; + +class IntResource : public SResource { +public: + IntResource(SRBRoot *bundle, const char *tag, int32_t value, + const UString* comment, UErrorCode &errorCode); + virtual ~IntResource(); + + // TODO: private with getter? + int32_t fValue; +}; + +class IntVectorResource : public SResource { +public: + IntVectorResource(SRBRoot *bundle, const char *tag, + const UString* comment, UErrorCode &errorCode); + virtual ~IntVectorResource(); + + void add(int32_t value, UErrorCode &errorCode); + + virtual void handlePreWrite(uint32_t *byteOffset) override; + virtual void handleWrite(UNewDataMemory *mem, uint32_t *byteOffset) override; + + // TODO: UVector32 + size_t fCount; + size_t fSize; + uint32_t *fArray; +}; + +class BinaryResource : public SResource { +public: + BinaryResource(SRBRoot *bundle, const char *tag, + uint32_t length, uint8_t *data, const char* fileName, + const UString* comment, UErrorCode &errorCode); + virtual ~BinaryResource(); + + virtual void handlePreWrite(uint32_t *byteOffset) override; + virtual void handleWrite(UNewDataMemory *mem, uint32_t *byteOffset) override; + + // TODO: CharString? 
+ uint32_t fLength; + uint8_t *fData; + // TODO: CharString + char* fFileName; // file name for binary or import binary tags if any +}; + +// TODO: use LocalPointer or delete +void res_close(struct SResource *res); + +void setIncludeCopyright(UBool val); +UBool getIncludeCopyright(); + +void setFormatVersion(int32_t formatVersion); + +int32_t getFormatVersion(); + +void setUsePoolBundle(UBool use); + +/* in wrtxml.cpp */ +uint32_t computeCRC(const char *ptr, uint32_t len, uint32_t lastcrc); + +U_CDECL_END +#endif /* #ifndef RESLIST_H */ diff --git a/intl/icu/source/tools/genrb/rle.c b/intl/icu/source/tools/genrb/rle.c new file mode 100644 index 0000000000..f737c45491 --- /dev/null +++ b/intl/icu/source/tools/genrb/rle.c @@ -0,0 +1,408 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2000-2003, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* +* File rle.c +* +* Modification History: +* +* Date Name Description +* 01/11/02 Ram Creation. +******************************************************************************* +*/ +#include <stdbool.h> +#include "rle.h" +/** + * The ESCAPE character is used during run-length encoding. It signals + * a run of identical chars. + */ +static const uint16_t ESCAPE = 0xA5A5; + +/** + * The ESCAPE_BYTE character is used during run-length encoding. It signals + * a run of identical bytes. + */ +static const uint8_t ESCAPE_BYTE = (uint8_t)0xA5; + +/** + * Append a byte to the given StringBuffer, packing two bytes into each + * character. The state parameter maintains intermediary data between + * calls. 
+ * @param state A two-element array, with state[0] == 0 if this is the + * first byte of a pair, or state[0] != 0 if this is the second byte + * of a pair, in which case state[1] is the first byte. + */ +static uint16_t* +appendEncodedByte(uint16_t* buffer, uint16_t* buffLimit, uint8_t value, uint8_t state[],UErrorCode* status) { + if(!status || U_FAILURE(*status)){ + return NULL; + } + if (state[0] != 0) { + uint16_t c = (uint16_t) ((state[1] << 8) | (((int32_t) value) & 0xFF)); + if(buffer < buffLimit){ + *buffer++ = c; + }else{ + *status = U_BUFFER_OVERFLOW_ERROR; + } + state[0] = 0; + return buffer; + } + else { + state[0] = 1; + state[1] = value; + return buffer; + } +} +/** + * Encode a run, possibly a degenerate run (of < 4 values). + * @param length The length of the run; must be > 0 && <= 0xFF. + */ +static uint16_t* +encodeRunByte(uint16_t* buffer,uint16_t* bufLimit, uint8_t value, int32_t length, uint8_t state[], UErrorCode* status) { + if(!status || U_FAILURE(*status)){ + return NULL; + } + if (length < 4) { + int32_t j=0; + for (; j<length; ++j) { + if (value == ESCAPE_BYTE) { + buffer = appendEncodedByte(buffer,bufLimit, ESCAPE_BYTE, state,status); + } + buffer = appendEncodedByte(buffer,bufLimit, value, state, status); + } + } + else { + if (length == ESCAPE_BYTE) { + if (value == ESCAPE_BYTE){ + buffer = appendEncodedByte(buffer, bufLimit,ESCAPE_BYTE, state,status); + } + buffer = appendEncodedByte(buffer,bufLimit, value, state, status); + --length; + } + buffer = appendEncodedByte(buffer,bufLimit, ESCAPE_BYTE, state,status); + buffer = appendEncodedByte(buffer,bufLimit, (char)length, state, status); + buffer = appendEncodedByte(buffer,bufLimit, value, state, status); /* Don't need to escape this value*/ + } + return buffer; +} + +#define APPEND( buffer, bufLimit, value, num, status) UPRV_BLOCK_MACRO_BEGIN { \ + if(buffer<bufLimit){ \ + *buffer++=(value); \ + }else{ \ + *status = U_BUFFER_OVERFLOW_ERROR; \ + } \ + num++; \ +} UPRV_BLOCK_MACRO_END + 
+/** + * Encode a run, possibly a degenerate run (of < 4 values). + * @param length The length of the run; must be > 0 && <= 0xFFFF. + */ +static uint16_t* +encodeRunShort(uint16_t* buffer,uint16_t* bufLimit, uint16_t value, int32_t length,UErrorCode* status) { + int32_t num=0; + if (length < 4) { + int j=0; + for (; j<length; ++j) { + if (value == (int32_t) ESCAPE){ + APPEND(buffer,bufLimit,ESCAPE, num, status); + + } + APPEND(buffer,bufLimit,value,num, status); + } + } + else { + if (length == (int32_t) ESCAPE) { + if (value == (int32_t) ESCAPE){ + APPEND(buffer,bufLimit,ESCAPE,num,status); + + } + APPEND(buffer,bufLimit,value,num,status); + --length; + } + APPEND(buffer,bufLimit,ESCAPE,num,status); + APPEND(buffer,bufLimit,(uint16_t) length, num,status); + APPEND(buffer,bufLimit,(uint16_t)value, num, status); /* Don't need to escape this value */ + } + return buffer; +} + +/** + * Construct a string representing a char array. Use run-length encoding. + * A character represents itself, unless it is the ESCAPE character. Then + * the following notations are possible: + * ESCAPE ESCAPE ESCAPE literal + * ESCAPE n c n instances of character c + * Since an encoded run occupies 3 characters, we only encode runs of 4 or + * more characters. Thus we have n > 0 and n != ESCAPE and n <= 0xFFFF. + * If we encounter a run where n == ESCAPE, we represent this as: + * c ESCAPE n-1 c + * The ESCAPE value is chosen so as not to collide with commonly + * seen values. 
+ */ +int32_t +usArrayToRLEString(const uint16_t* src,int32_t srcLen,uint16_t* buffer, int32_t bufLen,UErrorCode* status) { + uint16_t* bufLimit = buffer+bufLen; + uint16_t* saveBuffer = buffer; + if(buffer < bufLimit){ + *buffer++ = (uint16_t)(srcLen>>16); + if(buffer<bufLimit){ + uint16_t runValue = src[0]; + int32_t runLength = 1; + int i=1; + *buffer++ = (uint16_t) srcLen; + + for (; i<srcLen; ++i) { + uint16_t s = src[i]; + if (s == runValue && runLength < 0xFFFF){ + ++runLength; + }else { + buffer = encodeRunShort(buffer,bufLimit, (uint16_t)runValue, runLength,status); + runValue = s; + runLength = 1; + } + } + buffer= encodeRunShort(buffer,bufLimit,(uint16_t)runValue, runLength,status); + }else{ + *status = U_BUFFER_OVERFLOW_ERROR; + } + }else{ + *status = U_BUFFER_OVERFLOW_ERROR; + } + return (int32_t)(buffer - saveBuffer); +} + +/** + * Construct a string representing a byte array. Use run-length encoding. + * Two bytes are packed into a single char, with a single extra zero byte at + * the end if needed. A byte represents itself, unless it is the + * ESCAPE_BYTE. Then the following notations are possible: + * ESCAPE_BYTE ESCAPE_BYTE ESCAPE_BYTE literal + * ESCAPE_BYTE n b n instances of byte b + * Since an encoded run occupies 3 bytes, we only encode runs of 4 or + * more bytes. Thus we have n > 0 and n != ESCAPE_BYTE and n <= 0xFF. + * If we encounter a run where n == ESCAPE_BYTE, we represent this as: + * b ESCAPE_BYTE n-1 b + * The ESCAPE_BYTE value is chosen so as not to collide with commonly + * seen values. 
+ */ +int32_t +byteArrayToRLEString(const uint8_t* src,int32_t srcLen, uint16_t* buffer,int32_t bufLen, UErrorCode* status) { + const uint16_t* saveBuf = buffer; + uint16_t* bufLimit = buffer+bufLen; + if(buffer < bufLimit){ + *buffer++ = ((uint16_t) (srcLen >> 16)); + + if(buffer<bufLimit){ + uint8_t runValue = src[0]; + int runLength = 1; + uint8_t state[2]= {0}; + int i=1; + *buffer++=((uint16_t) srcLen); + for (; i<srcLen; ++i) { + uint8_t b = src[i]; + if (b == runValue && runLength < 0xFF){ + ++runLength; + } + else { + buffer = encodeRunByte(buffer, bufLimit,runValue, runLength, state,status); + runValue = b; + runLength = 1; + } + } + buffer = encodeRunByte(buffer,bufLimit, runValue, runLength, state, status); + + /* We must save the final byte, if there is one, by padding + * an extra zero. + */ + if (state[0] != 0) { + buffer = appendEncodedByte(buffer,bufLimit, 0, state ,status); + } + }else{ + *status = U_BUFFER_OVERFLOW_ERROR; + } + }else{ + *status = U_BUFFER_OVERFLOW_ERROR; + } + return (int32_t) (buffer - saveBuf); +} + + +/** + * Construct an array of shorts from a run-length encoded string. 
+ */ +int32_t +rleStringToUCharArray(uint16_t* src, int32_t srcLen, uint16_t* target, int32_t tgtLen, UErrorCode* status) { + int32_t length = 0; + int32_t ai = 0; + int i=2; + + if(!status || U_FAILURE(*status)){ + return 0; + } + /* the source is null terminated */ + if(srcLen == -1){ + srcLen = u_strlen(src); + } + if(srcLen <= 2){ + return 2; + } + length = (((int32_t) src[0]) << 16) | ((int32_t) src[1]); + + if(target == NULL){ + return length; + } + if(tgtLen < length){ + *status = U_BUFFER_OVERFLOW_ERROR; + return length; + } + + for (; i<srcLen; ++i) { + uint16_t c = src[i]; + if (c == ESCAPE) { + c = src[++i]; + if (c == ESCAPE) { + target[ai++] = c; + } else { + int32_t runLength = (int32_t) c; + uint16_t runValue = src[++i]; + int j=0; + for (; j<runLength; ++j) { + target[ai++] = runValue; + } + } + } + else { + target[ai++] = c; + } + } + + if (ai != length){ + *status = U_INTERNAL_PROGRAM_ERROR; + } + + return length; +} + +/** + * Construct an array of bytes from a run-length encoded string. + */ +int32_t +rleStringToByteArray(uint16_t* src, int32_t srcLen, uint8_t* target, int32_t tgtLen, UErrorCode* status) { + + int32_t length = 0; + UBool nextChar = true; + uint16_t c = 0; + int32_t node = 0; + int32_t runLength = 0; + int32_t i = 2; + int32_t ai=0; + + if(!status || U_FAILURE(*status)){ + return 0; + } + /* the source is null terminated */ + if(srcLen == -1){ + srcLen = u_strlen(src); + } + if(srcLen <= 2){ + return 2; + } + length = (((int32_t) src[0]) << 16) | ((int32_t) src[1]); + + if(target == NULL){ + return length; + } + if(tgtLen < length){ + *status = U_BUFFER_OVERFLOW_ERROR; + return length; + } + + for (; ai<tgtLen; ) { + /* This part of the loop places the next byte into the local + * variable 'b' each time through the loop. It keeps the + * current character in 'c' and uses the boolean 'nextChar' + * to see if we've taken both bytes out of 'c' yet. 
+ */ + uint8_t b; + if (nextChar) { + c = src[i++]; + b = (uint8_t) (c >> 8); + nextChar = false; + } + else { + b = (uint8_t) (c & 0xFF); + nextChar = true; + } + + /* This part of the loop is a tiny state machine which handles + * the parsing of the run-length encoding. This would be simpler + * if we could look ahead, but we can't, so we use 'node' to + * move between three nodes in the state machine. + */ + switch (node) { + case 0: + /* Normal idle node */ + if (b == ESCAPE_BYTE) { + node = 1; + } + else { + target[ai++] = b; + } + break; + case 1: + /* We have seen one ESCAPE_BYTE; we expect either a second + * one, or a run length and value. + */ + if (b == ESCAPE_BYTE) { + target[ai++] = ESCAPE_BYTE; + node = 0; + } + else { + runLength = b; + node = 2; + } + break; + case 2: + { + int j=0; + /* We have seen an ESCAPE_BYTE and length byte. We interpret + * the next byte as the value to be repeated. + */ + for (; j<runLength; ++j){ + if(ai<tgtLen){ + target[ai++] = b; + }else{ + *status = U_BUFFER_OVERFLOW_ERROR; + return ai; + } + } + node = 0; + break; + } + } + } + + if (node != 0){ + *status = U_INTERNAL_PROGRAM_ERROR; + /*("Bad run-length encoded byte array")*/ + return 0; + } + + + if (i != srcLen){ + /*("Excess data in RLE byte array string");*/ + *status = U_INTERNAL_PROGRAM_ERROR; + return ai; + } + + return ai; +} + diff --git a/intl/icu/source/tools/genrb/rle.h b/intl/icu/source/tools/genrb/rle.h new file mode 100644 index 0000000000..2684bbe6b2 --- /dev/null +++ b/intl/icu/source/tools/genrb/rle.h @@ -0,0 +1,74 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2000, International Business Machines +* Corporation and others. All Rights Reserved. 
+* +******************************************************************************* +* +* File writejava.c +* +* Modification History: +* +* Date Name Description +* 01/11/02 Ram Creation. +******************************************************************************* +*/ + +#ifndef RLE_H +#define RLE_H 1 + +#include "unicode/utypes.h" +#include "unicode/ustring.h" + +U_CDECL_BEGIN +/** + * Construct a string representing a byte array. Use run-length encoding. + * Two bytes are packed into a single char, with a single extra zero byte at + * the end if needed. A byte represents itself, unless it is the + * ESCAPE_BYTE. Then the following notations are possible: + * ESCAPE_BYTE ESCAPE_BYTE ESCAPE_BYTE literal + * ESCAPE_BYTE n b n instances of byte b + * Since an encoded run occupies 3 bytes, we only encode runs of 4 or + * more bytes. Thus we have n > 0 and n != ESCAPE_BYTE and n <= 0xFF. + * If we encounter a run where n == ESCAPE_BYTE, we represent this as: + * b ESCAPE_BYTE n-1 b + * The ESCAPE_BYTE value is chosen so as not to collide with commonly + * seen values. + */ +int32_t +byteArrayToRLEString(const uint8_t* src,int32_t srcLen, uint16_t* buffer,int32_t bufLen, UErrorCode* status); + + +/** + * Construct a string representing a char array. Use run-length encoding. + * A character represents itself, unless it is the ESCAPE character. Then + * the following notations are possible: + * ESCAPE ESCAPE ESCAPE literal + * ESCAPE n c n instances of character c + * Since an encoded run occupies 3 characters, we only encode runs of 4 or + * more characters. Thus we have n > 0 and n != ESCAPE and n <= 0xFFFF. + * If we encounter a run where n == ESCAPE, we represent this as: + * c ESCAPE n-1 c + * The ESCAPE value is chosen so as not to collide with commonly + * seen values. 
+ */ +int32_t +usArrayToRLEString(const uint16_t* src,int32_t srcLen,uint16_t* buffer, int32_t bufLen,UErrorCode* status); + +/** + * Construct an array of bytes from a run-length encoded string. + */ +int32_t +rleStringToByteArray(uint16_t* src, int32_t srcLen, uint8_t* target, int32_t tgtLen, UErrorCode* status); +/** + * Construct an array of shorts from a run-length encoded string. + */ +int32_t +rleStringToUCharArray(uint16_t* src, int32_t srcLen, uint16_t* target, int32_t tgtLen, UErrorCode* status); + +U_CDECL_END + +#endif diff --git a/intl/icu/source/tools/genrb/sources.txt b/intl/icu/source/tools/genrb/sources.txt new file mode 100644 index 0000000000..0128e2094f --- /dev/null +++ b/intl/icu/source/tools/genrb/sources.txt @@ -0,0 +1,12 @@ +errmsg.c +filterrb.cpp +genrb.cpp +parse.cpp +prscmnts.cpp +rbutil.c +read.c +reslist.cpp +rle.c +ustr.c +wrtjava.cpp +wrtxml.cpp diff --git a/intl/icu/source/tools/genrb/ustr.c b/intl/icu/source/tools/genrb/ustr.c new file mode 100644 index 0000000000..15f76a80ca --- /dev/null +++ b/intl/icu/source/tools/genrb/ustr.c @@ -0,0 +1,219 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 1998-2012, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* +* File ustr.c +* +* Modification History: +* +* Date Name Description +* 05/28/99 stephen Creation. +******************************************************************************* +*/ + +#include "ustr.h" +#include "cmemory.h" +#include "cstring.h" +#include "unicode/ustring.h" +#include "unicode/putil.h" +#include "unicode/utf16.h" + +/* Protos */ +static void ustr_resize(struct UString *s, int32_t len, UErrorCode *status); + +/* Macros */ +#define ALLOCATION(minSize) (minSize < 0x80 ? 
0x80 : (2 * minSize + 0x80) & ~(0x80 - 1)) + +U_CFUNC void +ustr_init(struct UString *s) +{ + s->fChars = 0; + s->fLength = s->fCapacity = 0; +} + +U_CFUNC void +ustr_initChars(struct UString *s, const char* source, int32_t length, UErrorCode *status) +{ + int i = 0; + if (U_FAILURE(*status)) return; + s->fChars = 0; + s->fLength = s->fCapacity = 0; + if (length == -1) { + length = (int32_t)uprv_strlen(source); + } + if(s->fCapacity < length) { + ustr_resize(s, ALLOCATION(length), status); + if(U_FAILURE(*status)) return; + } + for (; i < length; i++) + { + UChar charToAppend; + u_charsToUChars(source+i, &charToAppend, 1); + ustr_ucat(s, charToAppend, status); + /* +#if U_CHARSET_FAMILY==U_ASCII_FAMILY + ustr_ucat(s, (UChar)(uint8_t)(source[i]), status); +#elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY + ustr_ucat(s, (UChar)asciiFromEbcdic[(uint8_t)(*cs++)], status); +#else +# error U_CHARSET_FAMILY is not valid +#endif + */ + } +} + +U_CFUNC void +ustr_deinit(struct UString *s) +{ + if (s) { + uprv_free(s->fChars); + s->fChars = 0; + s->fLength = s->fCapacity = 0; + } +} + +U_CFUNC void +ustr_cpy(struct UString *dst, + const struct UString *src, + UErrorCode *status) +{ + if(U_FAILURE(*status) || dst == src) + return; + + if(dst->fCapacity < src->fLength) { + ustr_resize(dst, ALLOCATION(src->fLength), status); + if(U_FAILURE(*status)) + return; + } + if(src->fChars == NULL || dst->fChars == NULL){ + return; + } + u_memcpy(dst->fChars, src->fChars, src->fLength); + dst->fLength = src->fLength; + dst->fChars[dst->fLength] = 0x0000; +} + +U_CFUNC void +ustr_setlen(struct UString *s, + int32_t len, + UErrorCode *status) +{ + if(U_FAILURE(*status)) + return; + + if(s->fCapacity < (len + 1)) { + ustr_resize(s, ALLOCATION(len), status); + if(U_FAILURE(*status)) + return; + } + + s->fLength = len; + s->fChars[len] = 0x0000; +} + +U_CFUNC void +ustr_cat(struct UString *dst, + const struct UString *src, + UErrorCode *status) +{ + ustr_ncat(dst, src, src->fLength, status); +} + 
+U_CFUNC void +ustr_ncat(struct UString *dst, + const struct UString *src, + int32_t n, + UErrorCode *status) +{ + if(U_FAILURE(*status) || dst == src) + return; + + if(dst->fCapacity < (dst->fLength + n)) { + ustr_resize(dst, ALLOCATION(dst->fLength + n), status); + if(U_FAILURE(*status)) + return; + } + + uprv_memcpy(dst->fChars + dst->fLength, src->fChars, + sizeof(UChar) * n); + dst->fLength += src->fLength; + dst->fChars[dst->fLength] = 0x0000; +} + +U_CFUNC void +ustr_ucat(struct UString *dst, + UChar c, + UErrorCode *status) +{ + if(U_FAILURE(*status)) + return; + + if(dst->fCapacity < (dst->fLength + 1)) { + ustr_resize(dst, ALLOCATION(dst->fLength + 1), status); + if(U_FAILURE(*status)) + return; + } + + uprv_memcpy(dst->fChars + dst->fLength, &c, + sizeof(UChar) * 1); + dst->fLength += 1; + dst->fChars[dst->fLength] = 0x0000; +} +U_CFUNC void +ustr_u32cat(struct UString *dst, UChar32 c, UErrorCode *status){ + if(c > 0x10FFFF){ + *status = U_ILLEGAL_CHAR_FOUND; + return; + } + if(c >0xFFFF){ + ustr_ucat(dst, U16_LEAD(c), status); + ustr_ucat(dst, U16_TRAIL(c), status); + }else{ + ustr_ucat(dst, (UChar) c, status); + } +} +U_CFUNC void +ustr_uscat(struct UString *dst, + const UChar* src,int len, + UErrorCode *status) +{ + if(U_FAILURE(*status)) + return; + + if(dst->fCapacity < (dst->fLength + len)) { + ustr_resize(dst, ALLOCATION(dst->fLength + len), status); + if(U_FAILURE(*status)) + return; + } + + uprv_memcpy(dst->fChars + dst->fLength, src, + sizeof(UChar) * len); + dst->fLength += len; + dst->fChars[dst->fLength] = 0x0000; +} + +/* Destroys data in the string */ +static void +ustr_resize(struct UString *s, + int32_t len, + UErrorCode *status) +{ + if(U_FAILURE(*status)) + return; + + /* +1 for trailing 0x0000 */ + s->fChars = (UChar*) uprv_realloc(s->fChars, sizeof(UChar) * (len + 1)); + if(s->fChars == 0) { + *status = U_MEMORY_ALLOCATION_ERROR; + s->fLength = s->fCapacity = 0; + return; + } + + s->fCapacity = len; +} diff --git 
a/intl/icu/source/tools/genrb/ustr.h b/intl/icu/source/tools/genrb/ustr.h new file mode 100644 index 0000000000..8a69e9d4d5 --- /dev/null +++ b/intl/icu/source/tools/genrb/ustr.h @@ -0,0 +1,81 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 1998-2012, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* +* File ustr.h +* +* Modification History: +* +* Date Name Description +* 05/28/99 stephen Creation. +******************************************************************************* +*/ + +#ifndef USTR_H +#define USTR_H 1 + +#include "unicode/utypes.h" + +#define U_APPEND_CHAR32(c,target,len) UPRV_BLOCK_MACRO_BEGIN { \ + if (c <= 0xffff) \ + { \ + *(target)++ = (UChar) c; \ + len=1; \ + } \ + else \ + { \ + target[0] = U16_LEAD(c); \ + target[1] = U16_TRAIL(c); \ + len=2; \ + target +=2; \ + } \ +} UPRV_BLOCK_MACRO_END + +#define U_APPEND_CHAR32_ONLY(c,target) UPRV_BLOCK_MACRO_BEGIN { \ + if (c <= 0xffff) \ + { \ + *(target)++ = (UChar) c; \ + } \ + else \ + { \ + target[0] = U16_LEAD(c); \ + target[1] = U16_TRAIL(c); \ + target +=2; \ + } \ +} UPRV_BLOCK_MACRO_END + +/* A C representation of a string "object" (to avoid realloc all the time) */ +struct UString { + UChar *fChars; + int32_t fLength; + int32_t fCapacity; +}; + +U_CFUNC void ustr_init(struct UString *s); + +U_CFUNC void +ustr_initChars(struct UString *s, const char* source, int32_t length, UErrorCode *status); + +U_CFUNC void ustr_deinit(struct UString *s); + +U_CFUNC void ustr_setlen(struct UString *s, int32_t len, UErrorCode *status); + +U_CFUNC void ustr_cpy(struct UString *dst, const struct UString *src, + UErrorCode *status); + +U_CFUNC void ustr_cat(struct UString *dst, const struct UString *src, + UErrorCode 
*status); + +U_CFUNC void ustr_ncat(struct UString *dst, const struct UString *src, + int32_t n, UErrorCode *status); + +U_CFUNC void ustr_ucat(struct UString *dst, UChar c, UErrorCode *status); +U_CFUNC void ustr_u32cat(struct UString *dst, UChar32 c, UErrorCode *status); +U_CFUNC void ustr_uscat(struct UString *dst, const UChar* src,int len,UErrorCode *status); +#endif diff --git a/intl/icu/source/tools/genrb/wrtjava.cpp b/intl/icu/source/tools/genrb/wrtjava.cpp new file mode 100644 index 0000000000..cb04b5a44a --- /dev/null +++ b/intl/icu/source/tools/genrb/wrtjava.cpp @@ -0,0 +1,701 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2000-2015, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* +* File wrtjava.cpp +* +* Modification History: +* +* Date Name Description +* 01/11/02 Ram Creation. +* 02/12/08 Spieth Fix errant 'new Object[][]{' insertion +* 02/19/08 Spieth Removed ICUListResourceBundle dependency +******************************************************************************* +*/ + +#include <assert.h> +#include "unicode/unistr.h" +#include "reslist.h" +#include "unewdata.h" +#include "unicode/ures.h" +#include "errmsg.h" +#include "filestrm.h" +#include "cstring.h" +#include "unicode/ucnv.h" +#include "genrb.h" +#include "rle.h" +#include "uhash.h" +#include "uresimp.h" +#include "unicode/ustring.h" +#include "unicode/utf8.h" + +void res_write_java(struct SResource *res,UErrorCode *status); + + +static const char copyRight[] = + "/* \n" + " *******************************************************************************\n" + " *\n" + " * Copyright (C) International Business Machines\n" + " * Corporation and others. 
All Rights Reserved.\n" + " *\n" + " *******************************************************************************\n" + " * $" "Source: $ \n" + " * $" "Date: $ \n" + " * $" "Revision: $ \n" + " *******************************************************************************\n" + " */\n\n"; +static const char warningMsg[] = + "/*********************************************************************\n" + "######################################################################\n" + "\n" + " WARNING: This file is generated by genrb Version " GENRB_VERSION ".\n" + " If you edit this file, please make sure that, the source\n" + " of this file (XXXX.txt in LocaleElements_XXXX.java)\n" + " is also edited.\n" + "######################################################################\n" + " *********************************************************************\n" + " */\n\n"; +static const char* openBrace="{\n"; +static const char* closeClass=" };\n" + "}\n"; + +static const char* javaClass = "import java.util.ListResourceBundle;\n\n" + "public class "; + +static const char* javaClass1= " extends ListResourceBundle {\n\n" + " /**\n" + " * Overrides ListResourceBundle \n" + " */\n" + " public final Object[][] getContents() { \n" + " return contents;\n" + " }\n\n" + " private static Object[][] contents = {\n"; +/*static const char* javaClassICU= " extends ListResourceBundle {\n\n" + " public %s () {\n" + " super.contents = data;\n" + " }\n" + " static final Object[][] data = new Object[][] { \n";*/ +static int tabCount = 3; + +static FileStream* out=nullptr; +static struct SRBRoot* srBundle ; +/*static const char* outDir = nullptr;*/ + +static const char* bName=nullptr; +static const char* pName=nullptr; + +static void write_tabs(FileStream* os){ + int i=0; + for(;i<=tabCount;i++){ + T_FileStream_write(os," ",4); + } +} + +#define ZERO 0x30 + +static const char* enc =""; +static UConverter* conv = nullptr; + +static int32_t +uCharsToChars(char *target, int32_t targetLen, const 
char16_t *source, int32_t sourceLen, UErrorCode *status) { + int i=0, j=0; + char str[30]={'\0'}; + while(i<sourceLen){ + if (source[i] == '\n') { + if (j + 2 < targetLen) { + uprv_strcat(target, "\\n"); + } + j += 2; + }else if(source[i]==0x0D){ + if(j+2<targetLen){ + uprv_strcat(target,"\\f"); + } + j+=2; + }else if(source[i] == '"'){ + if(source[i-1]=='\''){ + if(j+2<targetLen){ + uprv_strcat(target,"\\"); + target[j+1]= (char)source[i]; + } + j+=2; + }else if(source[i-1]!='\\'){ + + if(j+2<targetLen){ + uprv_strcat(target,"\\"); + target[j+1]= (char)source[i]; + } + j+=2; + }else if(source[i-1]=='\\'){ + target[j++]= (char)source[i]; + } + }else if(source[i]=='\\'){ + if(i+1<sourceLen){ + switch(source[i+1]){ + case ',': + case '!': + case '?': + case '#': + case '.': + case '%': + case '&': + case ':': + case ';': + if(j+2<targetLen){ + uprv_strcat(target,"\\\\"); + } + j+=2; + break; + case '"': + case '\'': + if(j+3<targetLen){ + uprv_strcat(target,"\\\\\\"); + } + j+=3; + break; + default : + if(j<targetLen){ + target[j]=(char)source[i]; + } + j++; + break; + } + }else{ + if(j<targetLen){ + uprv_strcat(target,"\\\\"); + } + j+=2; + } + }else if(source[i]>=0x20 && source[i]<0x7F/*ASCII*/){ + if(j<targetLen){ + target[j] = (char) source[i]; + } + j++; + }else{ + if(*enc =='\0' || source[i]==0x0000){ + uprv_strcpy(str,"\\u"); + itostr(str+2,source[i],16,4); + if(j+6<targetLen){ + uprv_strcat(target,str); + } + j+=6; + }else{ + char dest[30] = {0}; + int retVal=ucnv_fromUChars(conv,dest,30,source+i,1,status); + if(U_FAILURE(*status)){ + return 0; + } + if(j+retVal<targetLen){ + uprv_strcat(target,dest); + } + j+=retVal; + } + } + i++; + } + return j; +} + + +static uint32_t +strrch(const char* source,uint32_t sourceLen,char find){ + const char* tSourceEnd =source + (sourceLen-1); + while(tSourceEnd>= source){ + if(*tSourceEnd==find){ + return (uint32_t)(tSourceEnd-source); + } + tSourceEnd--; + } + return (uint32_t)(tSourceEnd-source); +} + +static int32_t 
getColumnCount(int32_t len){ + int32_t columnCount = 80; + int32_t maxLines = 3000; + int32_t adjustedLen = len*5; /* assume that every codepoint is represented in \uXXXX format*/ + /* + * calculate the number of lines that + * may be required if column count is 80 + */ + if (maxLines < (adjustedLen / columnCount) ){ + columnCount = adjustedLen / maxLines; + } + return columnCount; +} +static void +str_write_java(const char16_t *src, int32_t srcLen, UBool printEndLine, UErrorCode *status) { + + uint32_t length = srcLen*8; + uint32_t bufLen = 0; + uint32_t columnCount; + char* buf = (char*) malloc(sizeof(char)*length); + + if(buf == nullptr) { + *status = U_MEMORY_ALLOCATION_ERROR; + return; + } + + columnCount = getColumnCount(srcLen); + memset(buf,0,length); + + bufLen = uCharsToChars(buf,length,src,srcLen,status); + // buflen accounts for extra bytes added due to multi byte encoding of + // non ASCII characters + if(printEndLine) + write_tabs(out); + + if(U_FAILURE(*status)){ + uprv_free(buf); + return; + } + + if(bufLen+(tabCount*4) > columnCount ){ + uint32_t len = 0; + char* current = buf; + uint32_t add; + while(len < bufLen){ + add = columnCount-(tabCount*4)-5/* for ", +\n */; + current = buf +len; + if (add < (bufLen-len)) { + uint32_t idx = strrch(current,add,'\\'); + if (idx > add) { + idx = add; + } else { + int32_t num =idx-1; + uint32_t seqLen; + while(num>0){ + if(current[num]=='\\'){ + num--; + }else{ + break; + } + } + if ((idx-num)%2==0) { + idx--; + } + seqLen = (current[idx+1]=='u') ? 
6 : 2; + if ((add-idx) < seqLen) { + add = idx + seqLen; + } + } + } + T_FileStream_write(out,"\"",1); + uint32_t byteIndex = 0; + uint32_t trailBytes = 0; + if(len+add<bufLen){ + // check the trail bytes to be added to the output line + while (byteIndex < add) { + if (U8_IS_LEAD(*(current + byteIndex))) { + trailBytes = U8_COUNT_TRAIL_BYTES(*(current + byteIndex)); + add += trailBytes; + } + byteIndex++; + } + T_FileStream_write(out,current,add); + if (len + add < bufLen) { + T_FileStream_write(out,"\" +\n",4); + write_tabs(out); + } + }else{ + T_FileStream_write(out,current,bufLen-len); + } + len+=add; + } + }else{ + T_FileStream_write(out,"\"",1); + T_FileStream_write(out, buf,bufLen); + } + if(printEndLine){ + T_FileStream_write(out,"\",\n",3); + }else{ + T_FileStream_write(out,"\"",1); + } + uprv_free(buf); +} + +/* Writing Functions */ +static void +string_write_java(const StringResource *res,UErrorCode *status) { + (void)res->getKeyString(srBundle); + + str_write_java(res->getBuffer(), res->length(), true, status); +} + +static void +array_write_java(const ArrayResource *res, UErrorCode *status) { + + uint32_t i = 0; + const char* arr ="new String[] { \n"; + struct SResource *current = nullptr; + UBool allStrings = true; + + if (U_FAILURE(*status)) { + return; + } + + if (res->fCount > 0) { + + current = res->fFirst; + i = 0; + while(current != nullptr){ + if(!current->isString()){ + allStrings = false; + break; + } + current= current->fNext; + } + + current = res->fFirst; + if(allStrings==false){ + const char* object = "new Object[]{\n"; + write_tabs(out); + T_FileStream_write(out, object, (int32_t)uprv_strlen(object)); + tabCount++; + }else{ + write_tabs(out); + T_FileStream_write(out, arr, (int32_t)uprv_strlen(arr)); + tabCount++; + } + while (current != nullptr) { + /*if(current->isString()){ + write_tabs(out); + }*/ + res_write_java(current, status); + if(U_FAILURE(*status)){ + return; + } + i++; + current = current->fNext; + } + 
T_FileStream_write(out,"\n",1); + + tabCount--; + write_tabs(out); + T_FileStream_write(out,"},\n",3); + + } else { + write_tabs(out); + T_FileStream_write(out,arr,(int32_t)uprv_strlen(arr)); + write_tabs(out); + T_FileStream_write(out,"},\n",3); + } +} + +static void +intvector_write_java(const IntVectorResource *res, UErrorCode * /*status*/) { + uint32_t i = 0; + const char* intArr = "new int[] {\n"; + /* const char* intC = "new Integer("; */ + const char* stringArr = "new String[]{\n"; + const char *resname = res->getKeyString(srBundle); + char buf[100]; + int len =0; + buf[0]=0; + write_tabs(out); + + if(resname != nullptr && uprv_strcmp(resname,"DateTimeElements")==0){ + T_FileStream_write(out, stringArr, (int32_t)uprv_strlen(stringArr)); + tabCount++; + for(i = 0; i<res->fCount; i++) { + write_tabs(out); + len=itostr(buf,res->fArray[i],10,0); + T_FileStream_write(out,"\"",1); + T_FileStream_write(out,buf,len); + T_FileStream_write(out,"\",",2); + T_FileStream_write(out,"\n",1); + } + }else{ + T_FileStream_write(out, intArr, (int32_t)uprv_strlen(intArr)); + tabCount++; + for(i = 0; i<res->fCount; i++) { + write_tabs(out); + /* T_FileStream_write(out, intC, (int32_t)uprv_strlen(intC)); */ + len=itostr(buf,res->fArray[i],10,0); + T_FileStream_write(out,buf,len); + /* T_FileStream_write(out,"),",2); */ + /* T_FileStream_write(out,"\n",1); */ + T_FileStream_write(out,",\n",2); + } + } + tabCount--; + write_tabs(out); + T_FileStream_write(out,"},\n",3); +} + +static void +int_write_java(const IntResource *res, UErrorCode * /*status*/) { + const char* intC = "new Integer("; + char buf[100]; + int len =0; + buf[0]=0; + + /* write the binary data */ + write_tabs(out); + T_FileStream_write(out, intC, (int32_t)uprv_strlen(intC)); + len=itostr(buf, res->fValue, 10, 0); + T_FileStream_write(out,buf,len); + T_FileStream_write(out,"),\n",3 ); + +} + +static void +bytes_write_java(const BinaryResource *res, UErrorCode * /*status*/) { + const char* type = "new byte[] {"; + 
const char* byteDecl = "%i, "; + char byteBuffer[100] = { 0 }; + uint8_t* byteArray = nullptr; + int byteIterator = 0; + int32_t srcLen=res->fLength; + if(srcLen>0 ) + { + byteArray = res->fData; + + write_tabs(out); + T_FileStream_write(out, type, (int32_t)uprv_strlen(type)); + T_FileStream_write(out, "\n", 1); + tabCount++; + + for (;byteIterator<srcLen;byteIterator++) + { + if (byteIterator%16 == 0) + { + write_tabs(out); + } + + if (byteArray[byteIterator] < 128) + { + snprintf(byteBuffer, sizeof(byteBuffer), byteDecl, byteArray[byteIterator]); + } + else + { + snprintf(byteBuffer, sizeof(byteBuffer), byteDecl, (byteArray[byteIterator]-256)); + } + + T_FileStream_write(out, byteBuffer, (int32_t)uprv_strlen(byteBuffer)); + + if (byteIterator%16 == 15) + { + T_FileStream_write(out, "\n", 1); + } + + } + + if (((byteIterator-1)%16) != 15) + { + T_FileStream_write(out, "\n", 1); + } + + tabCount--; + write_tabs(out); + T_FileStream_write(out, "},\n", 3); + + } + else + { + /* Empty array */ + write_tabs(out); + T_FileStream_write(out,type,(int32_t)uprv_strlen(type)); + T_FileStream_write(out,"},\n",3); + } + +} + +static UBool start = true; + +static void +table_write_java(const TableResource *res, UErrorCode *status) { + uint32_t i = 0; + struct SResource *current = nullptr; + const char* obj = "new Object[][]{\n"; + + if (U_FAILURE(*status)) { + return ; + } + + if (res->fCount > 0) { + if(start==false){ + write_tabs(out); + T_FileStream_write(out, obj, (int32_t)uprv_strlen(obj)); + tabCount++; + } + start = false; + current = res->fFirst; + i = 0; + + + while (current != nullptr) { + const char *currentKeyString = current->getKeyString(srBundle); + + assert(i < res->fCount); + write_tabs(out); + + T_FileStream_write(out, openBrace, 2); + + + tabCount++; + + write_tabs(out); + if(currentKeyString != nullptr) { + T_FileStream_write(out, "\"", 1); + T_FileStream_write(out, currentKeyString, + (int32_t)uprv_strlen(currentKeyString)); + T_FileStream_write(out, 
"\",\n", 2); + + T_FileStream_write(out, "\n", 1); + } + res_write_java(current, status); + if(U_FAILURE(*status)){ + return; + } + i++; + current = current->fNext; + tabCount--; + write_tabs(out); + T_FileStream_write(out, "},\n", 3); + } + if(tabCount>4){ + tabCount--; + write_tabs(out); + T_FileStream_write(out, "},\n", 3); + } + + } else { + write_tabs(out); + T_FileStream_write(out,obj,(int32_t)uprv_strlen(obj)); + + write_tabs(out); + T_FileStream_write(out,"},\n",3); + + } + +} + +void +res_write_java(struct SResource *res,UErrorCode *status) { + + if (U_FAILURE(*status)) { + return ; + } + + if (res != nullptr) { + switch (res->fType) { + case URES_STRING: + string_write_java (static_cast<const StringResource *>(res), status); + return; + case URES_ALIAS: + printf("Encountered unsupported resource type %d of alias\n", res->fType); + *status = U_UNSUPPORTED_ERROR; + return; + case URES_INT_VECTOR: + intvector_write_java (static_cast<const IntVectorResource *>(res), status); + return; + case URES_BINARY: + bytes_write_java (static_cast<const BinaryResource *>(res), status); + return; + case URES_INT: + int_write_java (static_cast<const IntResource *>(res), status); + return; + case URES_ARRAY: + array_write_java (static_cast<const ArrayResource *>(res), status); + return; + case URES_TABLE: + table_write_java (static_cast<const TableResource *>(res), status); + return; + default: + break; + } + } + + *status = U_INTERNAL_PROGRAM_ERROR; +} + +void +bundle_write_java(struct SRBRoot *bundle, const char *outputDir,const char* outputEnc, + char *writtenFilename, int writtenFilenameLen, + const char* packageName, const char* bundleName, + UErrorCode *status) { + + char fileName[256] = {'\0'}; + char className[256]={'\0'}; + /*char constructor[1000] = { 0 };*/ + /*UBool j1 =false;*/ + /*outDir = outputDir;*/ + + start = true; /* Reset the start indicator*/ + + bName = (bundleName==nullptr) ? "LocaleElements" : bundleName; + pName = (packageName==nullptr)? 
"com.ibm.icu.impl.data" : packageName; + + uprv_strcpy(className, bName); + srBundle = bundle; + if(uprv_strcmp(srBundle->fLocale,"root")!=0){ + uprv_strcat(className,"_"); + uprv_strcat(className,srBundle->fLocale); + } + if(outputDir){ + uprv_strcpy(fileName, outputDir); + if(outputDir[uprv_strlen(outputDir)-1] !=U_FILE_SEP_CHAR){ + uprv_strcat(fileName,U_FILE_SEP_STRING); + } + uprv_strcat(fileName,className); + uprv_strcat(fileName,".java"); + }else{ + uprv_strcat(fileName,className); + uprv_strcat(fileName,".java"); + } + + if (writtenFilename) { + uprv_strncpy(writtenFilename, fileName, writtenFilenameLen); + } + + if (U_FAILURE(*status)) { + return; + } + + out= T_FileStream_open(fileName,"w"); + + if(out==nullptr){ + *status = U_FILE_ACCESS_ERROR; + return; + } + if(getIncludeCopyright()){ + T_FileStream_write(out, copyRight, (int32_t)uprv_strlen(copyRight)); + T_FileStream_write(out, warningMsg, (int32_t)uprv_strlen(warningMsg)); + } + T_FileStream_write(out,"package ",(int32_t)uprv_strlen("package ")); + T_FileStream_write(out,pName,(int32_t)uprv_strlen(pName)); + T_FileStream_write(out,";\n\n",3); + T_FileStream_write(out, javaClass, (int32_t)uprv_strlen(javaClass)); + T_FileStream_write(out, className, (int32_t)uprv_strlen(className)); + T_FileStream_write(out, javaClass1, (int32_t)uprv_strlen(javaClass1)); + + /* if(j1){ + T_FileStream_write(out, javaClass1, (int32_t)uprv_strlen(javaClass1)); + }else{ + sprintf(constructor,javaClassICU,className); + T_FileStream_write(out, constructor, (int32_t)uprv_strlen(constructor)); + } + */ + + if(outputEnc && *outputEnc!='\0'){ + /* store the output encoding */ + enc = outputEnc; + conv=ucnv_open(enc,status); + if(U_FAILURE(*status)){ + return; + } + } + res_write_java(bundle->fRoot, status); + + T_FileStream_write(out, closeClass, (int32_t)uprv_strlen(closeClass)); + + T_FileStream_close(out); + + ucnv_close(conv); +} diff --git a/intl/icu/source/tools/genrb/wrtxml.cpp b/intl/icu/source/tools/genrb/wrtxml.cpp 
new file mode 100644 index 0000000000..16f67fabca --- /dev/null +++ b/intl/icu/source/tools/genrb/wrtxml.cpp @@ -0,0 +1,1213 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2002-2015, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* +* File wrtxml.cpp +* +* Modification History: +* +* Date Name Description +* 10/01/02 Ram Creation. +* 02/07/08 Spieth Correct XLIFF generation on EBCDIC platform +* +******************************************************************************* +*/ + +// Safer use of UnicodeString. +#ifndef UNISTR_FROM_CHAR_EXPLICIT +# define UNISTR_FROM_CHAR_EXPLICIT explicit +#endif + +// Less important, but still a good idea. +#ifndef UNISTR_FROM_STRING_EXPLICIT +# define UNISTR_FROM_STRING_EXPLICIT explicit +#endif + +#include "reslist.h" +#include "unewdata.h" +#include "unicode/ures.h" +#include "errmsg.h" +#include "filestrm.h" +#include "cstring.h" +#include "unicode/ucnv.h" +#include "genrb.h" +#include "rle.h" +#include "uhash.h" +#include "uresimp.h" +#include "unicode/ustring.h" +#include "unicode/uchar.h" +#include "ustr.h" +#include "prscmnts.h" +#include "unicode/unistr.h" +#include "unicode/utf8.h" +#include "unicode/utf16.h" +#include <time.h> + +U_NAMESPACE_USE + +static int tabCount = 0; + +static FileStream* out=nullptr; +static struct SRBRoot* srBundle ; +static const char* outDir = nullptr; +static const char* enc =""; +static UConverter* conv = nullptr; + +const char* const* ISOLanguages; +const char* const* ISOCountries; +const char* textExt = ".txt"; +const char* xliffExt = ".xlf"; + +static int32_t write_utf8_file(FileStream* fileStream, UnicodeString outString) +{ + UErrorCode status = U_ZERO_ERROR; + int32_t len = 0; + + // preflight to 
get the destination buffer size + u_strToUTF8(nullptr, + 0, + &len, + toUCharPtr(outString.getBuffer()), + outString.length(), + &status); + + // allocate the buffer + char* dest = (char*)uprv_malloc(len); + status = U_ZERO_ERROR; + + // convert the data + u_strToUTF8(dest, + len, + &len, + toUCharPtr(outString.getBuffer()), + outString.length(), + &status); + + // write data to out file + int32_t ret = T_FileStream_write(fileStream, dest, len); + uprv_free(dest); + return (ret); +} + +/*write indentation for formatting*/ +static void write_tabs(FileStream* os){ + int i=0; + for(;i<=tabCount;i++){ + write_utf8_file(os,UnicodeString(" ")); + } +} + +/*get ID for each element. ID is globally unique.*/ +static char* getID(const char* id, const char* curKey, char* result) { + if(curKey == nullptr) { + result = (char *)uprv_malloc(sizeof(char)*uprv_strlen(id) + 1); + uprv_memset(result, 0, sizeof(char)*uprv_strlen(id) + 1); + uprv_strcpy(result, id); + } else { + result = (char *)uprv_malloc(sizeof(char)*(uprv_strlen(id) + 1 + uprv_strlen(curKey)) + 1); + uprv_memset(result, 0, sizeof(char)*(uprv_strlen(id) + 1 + uprv_strlen(curKey)) + 1); + if(id[0]!='\0'){ + uprv_strcpy(result, id); + uprv_strcat(result, "_"); + } + uprv_strcat(result, curKey); + } + return result; +} + +/*compute CRC for binary code*/ +/* The code is from http://www.theorem.com/java/CRC32.java + * Calculates the CRC32 - 32 bit Cyclical Redundancy Check + * <P> This check is used in numerous systems to verify the integrity + * of information. It's also used as a hashing function. Unlike a regular + * checksum, it's sensitive to the order of the characters. + * It produces a 32 bit + * + * @author Michael Lecuyer (mjl@theorem.com) + * @version 1.1 August 11, 1998 + */ + +/* ICU is not endian portable, because ICU data generated on big endian machines can be + * ported to big endian machines but not to little endian machines and vice versa. 
The + * conversion is not portable across platforms with different endianness. + */ + +uint32_t computeCRC(const char *ptr, uint32_t len, uint32_t lastcrc){ + int32_t crc; + uint32_t temp1; + uint32_t temp2; + + int32_t crc_ta[256]; + int i = 0; + int j = 0; + uint32_t crc2 = 0; + +#define CRC32_POLYNOMIAL 0xEDB88320 + + /*build crc table*/ + for (i = 0; i <= 255; i++) { + crc2 = i; + for (j = 8; j > 0; j--) { + if ((crc2 & 1) == 1) { + crc2 = (crc2 >> 1) ^ CRC32_POLYNOMIAL; + } else { + crc2 >>= 1; + } + } + crc_ta[i] = crc2; + } + + crc = lastcrc; + while(len--!=0) { + temp1 = (uint32_t)crc>>8; + temp2 = crc_ta[(crc^*ptr) & 0xFF]; + crc = temp1^temp2; + ptr++; + } + return(crc); +} + +static void strnrepchr(char* src, int32_t srcLen, char s, char r){ + int32_t i = 0; + for(i=0;i<srcLen;i++){ + if(src[i]==s){ + src[i]=r; + } + } +} +/* Parse the filename, and get its language information. + * If it fails to get the language information from the filename, + * use "en" as the default value for language + */ +static char* parseFilename(const char* id, char* /*lang*/) { + int idLen = (int) uprv_strlen(id); + char* localeID = (char*) uprv_malloc(idLen); + int pos = 0; + int canonCapacity = 0; + char* canon = nullptr; + int canonLen = 0; + /*int i;*/ + UErrorCode status = U_ZERO_ERROR; + const char *ext = uprv_strchr(id, '.'); + + if(ext != nullptr){ + pos = (int) (ext - id); + } else { + pos = idLen; + } + uprv_memcpy(localeID, id, pos); + localeID[pos]=0; /* NUL terminate the string */ + + canonCapacity =pos*3; + canon = (char*) uprv_malloc(canonCapacity); + canonLen = uloc_canonicalize(localeID, canon, canonCapacity, &status); + + if(U_FAILURE(status)){ + fprintf(stderr, "Could not canonicalize the locale ID: %s. 
Error: %s\n", localeID, u_errorName(status)); + exit(status); + } + strnrepchr(canon, canonLen, '_', '-'); + return canon; +} + +static const char* xmlHeader = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n"; +#if 0 +static const char* bundleStart = "<xliff version = \"1.2\" " + "xmlns='urn:oasis:names:tc:xliff:document:1.2' " + "xmlns:xsi='http://www.w3.org/2001/XMLSchema-instance' " + "xsi:schemaLocation='urn:oasis:names:tc:xliff:document:1.2 xliff-core-1.2-transitional.xsd'>\n"; +#else +static const char* bundleStart = "<xliff version = \"1.1\" " + "xmlns='urn:oasis:names:tc:xliff:document:1.1' " + "xmlns:xsi='http://www.w3.org/2001/XMLSchema-instance' " + "xsi:schemaLocation='urn:oasis:names:tc:xliff:document:1.1 http://www.oasis-open.org/committees/xliff/documents/xliff-core-1.1.xsd'>\n"; +#endif +static const char* bundleEnd = "</xliff>\n"; + +void res_write_xml(struct SResource *res, const char* id, const char* language, UBool isTopLevel, UErrorCode *status); + +static char* convertAndEscape(char** pDest, int32_t destCap, int32_t* destLength, + const char16_t* src, int32_t srcLen, UErrorCode* status){ + int32_t srcIndex=0; + char* dest=nullptr; + char* temp=nullptr; + int32_t destLen=0; + UChar32 c = 0; + + if(status==nullptr || U_FAILURE(*status) || pDest==nullptr || srcLen==0 || src == nullptr){ + return nullptr; + } + dest =*pDest; + if(dest==nullptr || destCap <=0){ + destCap = srcLen * 8; + dest = (char*) uprv_malloc(sizeof(char) * destCap); + if(dest==nullptr){ + *status=U_MEMORY_ALLOCATION_ERROR; + return nullptr; + } + } + + dest[0]=0; + + while(srcIndex<srcLen){ + U16_NEXT(src, srcIndex, srcLen, c); + + if (U16_IS_LEAD(c) || U16_IS_TRAIL(c)) { + *status = U_ILLEGAL_CHAR_FOUND; + fprintf(stderr, "Illegal Surrogate! 
\n"); + uprv_free(dest); + return nullptr; + } + + if((destLen+U8_LENGTH(c)) < destCap){ + + /* ASCII Range */ + if(c <=0x007F){ + switch(c) { + case '\x26': + uprv_strcpy(dest+( destLen),"\x26\x61\x6d\x70\x3b"); /* &*/ + destLen+=(int32_t)uprv_strlen("\x26\x61\x6d\x70\x3b"); + break; + case '\x3c': + uprv_strcpy(dest+(destLen),"\x26\x6c\x74\x3b"); /* <*/ + destLen+=(int32_t)uprv_strlen("\x26\x6c\x74\x3b"); + break; + case '\x3e': + uprv_strcpy(dest+(destLen),"\x26\x67\x74\x3b"); /* >*/ + destLen+=(int32_t)uprv_strlen("\x26\x67\x74\x3b"); + break; + case '\x22': + uprv_strcpy(dest+(destLen),"\x26\x71\x75\x6f\x74\x3b"); /* "*/ + destLen+=(int32_t)uprv_strlen("\x26\x71\x75\x6f\x74\x3b"); + break; + case '\x27': + uprv_strcpy(dest+(destLen),"\x26\x61\x70\x6f\x73\x3b"); /* ' */ + destLen+=(int32_t)uprv_strlen("\x26\x61\x70\x6f\x73\x3b"); + break; + + /* Disallow C0 controls except TAB, CR, LF*/ + case 0x00: + case 0x01: + case 0x02: + case 0x03: + case 0x04: + case 0x05: + case 0x06: + case 0x07: + case 0x08: + /*case 0x09:*/ + /*case 0x0A: */ + case 0x0B: + case 0x0C: + /*case 0x0D:*/ + case 0x0E: + case 0x0F: + case 0x10: + case 0x11: + case 0x12: + case 0x13: + case 0x14: + case 0x15: + case 0x16: + case 0x17: + case 0x18: + case 0x19: + case 0x1A: + case 0x1B: + case 0x1C: + case 0x1D: + case 0x1E: + case 0x1F: + *status = U_ILLEGAL_CHAR_FOUND; + fprintf(stderr, "Illegal Character \\u%04X!\n",(int)c); + uprv_free(dest); + return nullptr; + default: + dest[destLen++]=(char)c; + } + }else{ + UBool isError = false; + U8_APPEND((unsigned char*)dest,destLen,destCap,c,isError); + if(isError){ + *status = U_ILLEGAL_CHAR_FOUND; + fprintf(stderr, "Illegal Character \\U%08X!\n",(int)c); + uprv_free(dest); + return nullptr; + } + } + }else{ + destCap += destLen; + + temp = (char*) uprv_malloc(sizeof(char)*destCap); + if(temp==nullptr){ + *status=U_MEMORY_ALLOCATION_ERROR; + uprv_free(dest); + return nullptr; + } + uprv_memmove(temp,dest,destLen); + destLen=0; + 
uprv_free(dest); + dest=temp; + temp=nullptr; + } + + } + *destLength = destLen; + return dest; +} + +#define ASTERISK 0x002A +#define SPACE 0x0020 +#define CR 0x000A +#define LF 0x000D +#define AT_SIGN 0x0040 + +#if UCONFIG_NO_REGULAR_EXPRESSIONS==0 +static void +trim(char **src, int32_t *len){ + + char *s = nullptr; + int32_t i = 0; + if(src == nullptr || *src == nullptr){ + return; + } + s = *src; + /* trim from the end */ + for( i=(*len-1); i>= 0; i--){ + switch(s[i]){ + case ASTERISK: + case SPACE: + case CR: + case LF: + s[i] = 0; + continue; + default: + break; + } + break; + + } + *len = i+1; +} + +static void +print(char16_t* src, int32_t srcLen,const char *tagStart,const char *tagEnd, UErrorCode *status){ + int32_t bufCapacity = srcLen*4; + char *buf = nullptr; + int32_t bufLen = 0; + + if(U_FAILURE(*status)){ + return; + } + + buf = (char*) (uprv_malloc(bufCapacity)); + if(buf==0){ + fprintf(stderr, "Could not allocate memory!!"); + exit(U_MEMORY_ALLOCATION_ERROR); + } + buf = convertAndEscape(&buf, bufCapacity, &bufLen, src, srcLen,status); + if(U_SUCCESS(*status)){ + trim(&buf,&bufLen); + write_utf8_file(out,UnicodeString(tagStart)); + write_utf8_file(out,UnicodeString(buf, bufLen, "UTF-8")); + write_utf8_file(out,UnicodeString(tagEnd)); + write_utf8_file(out,UnicodeString("\n")); + + } +} +#endif + +static void +printNoteElements(const UString *src, UErrorCode *status){ + +#if UCONFIG_NO_REGULAR_EXPRESSIONS==0 /* donot compile when no RegularExpressions are available */ + + int32_t capacity = 0; + char16_t* note = nullptr; + int32_t noteLen = 0; + int32_t count = 0,i; + + if(src == nullptr){ + return; + } + + capacity = src->fLength; + note = (char16_t*) uprv_malloc(U_SIZEOF_UCHAR * capacity); + + count = getCount(src->fChars,src->fLength, UPC_NOTE, status); + if(U_FAILURE(*status)){ + uprv_free(note); + return; + } + for(i=0; i < count; i++){ + noteLen = getAt(src->fChars,src->fLength, ¬e, capacity, i, UPC_NOTE, status); + if(U_FAILURE(*status)){ + 
uprv_free(note); + return; + } + if(noteLen > 0){ + write_tabs(out); + print(note, noteLen,"<note>", "</note>", status); + } + } + uprv_free(note); +#else + + fprintf(stderr, "Warning: Could not output comments to XLIFF file. ICU has been built without RegularExpression support.\n"); + +#endif /* UCONFIG_NO_REGULAR_EXPRESSIONS */ + +} + +static void printAttribute(const char *name, const char *value, int32_t /*len*/) +{ + write_utf8_file(out, UnicodeString(" ")); + write_utf8_file(out, UnicodeString(name)); + write_utf8_file(out, UnicodeString(" = \"")); + write_utf8_file(out, UnicodeString(value)); + write_utf8_file(out, UnicodeString("\"")); +} + +#if UCONFIG_NO_REGULAR_EXPRESSIONS==0 /* donot compile when no RegularExpressions are available */ +static void printAttribute(const char *name, const UnicodeString value, int32_t /*len*/) +{ + write_utf8_file(out, UnicodeString(" ")); + write_utf8_file(out, UnicodeString(name)); + write_utf8_file(out, UnicodeString(" = \"")); + write_utf8_file(out, value); + write_utf8_file(out, UnicodeString("\"")); +} +#endif + +static void +printComments(struct UString *src, const char *resName, UBool printTranslate, UErrorCode *status){ + +#if UCONFIG_NO_REGULAR_EXPRESSIONS==0 /* donot compile when no RegularExpressions are available */ + + if(status==nullptr || U_FAILURE(*status)){ + return; + } + + int32_t capacity = src->fLength + 1; + char* buf = nullptr; + int32_t bufLen = 0; + char16_t* desc = (char16_t*) uprv_malloc(U_SIZEOF_UCHAR * capacity); + char16_t* trans = (char16_t*) uprv_malloc(U_SIZEOF_UCHAR * capacity); + + int32_t descLen = 0, transLen=0; + if(desc==nullptr || trans==nullptr){ + *status = U_MEMORY_ALLOCATION_ERROR; + uprv_free(desc); + uprv_free(trans); + return; + } + // TODO: make src const, stop modifying it in-place, make printContainer() take const resource, etc. 
+ src->fLength = removeCmtText(src->fChars, src->fLength, status); + descLen = getDescription(src->fChars,src->fLength, &desc, capacity, status); + transLen = getTranslate(src->fChars,src->fLength, &trans, capacity, status); + + /* first print translate attribute */ + if(transLen > 0){ + if(printTranslate){ + /* print translate attribute */ + buf = convertAndEscape(&buf, 0, &bufLen, trans, transLen, status); + if(U_SUCCESS(*status)){ + printAttribute("translate", UnicodeString(buf, bufLen, "UTF-8"), bufLen); + write_utf8_file(out,UnicodeString(">\n")); + } + }else if(getShowWarning()){ + fprintf(stderr, "Warning: Translate attribute for resource %s cannot be set. XLIFF prohibits it.\n", resName); + /* no translate attribute .. just close the tag */ + write_utf8_file(out,UnicodeString(">\n")); + } + }else{ + /* no translate attribute .. just close the tag */ + write_utf8_file(out,UnicodeString(">\n")); + } + + if(descLen > 0){ + write_tabs(out); + print(desc, descLen, "<!--", "-->", status); + } + + uprv_free(desc); + uprv_free(trans); +#else + + fprintf(stderr, "Warning: Could not output comments to XLIFF file. 
ICU has been built without RegularExpression support.\n"); + +#endif /* UCONFIG_NO_REGULAR_EXPRESSIONS */ + +} + +/* + * Print out a containing element, like: + * <trans-unit id = "blah" resname = "blah" restype = "x-id-alias" translate = "no"> + * <group id "calendar_gregorian" resname = "gregorian" restype = "x-icu-array"> + */ +static char *printContainer(SResource *res, const char *container, const char *restype, const char *mimetype, const char *id, UErrorCode *status) +{ + const char *resname = nullptr; + char *sid = nullptr; + + write_tabs(out); + + resname = res->getKeyString(srBundle); + if (resname != nullptr && *resname != 0) { + sid = getID(id, resname, sid); + } else { + sid = getID(id, nullptr, sid); + } + + write_utf8_file(out, UnicodeString("<")); + write_utf8_file(out, UnicodeString(container)); + printAttribute("id", sid, (int32_t) uprv_strlen(sid)); + + if (resname != nullptr) { + printAttribute("resname", resname, (int32_t) uprv_strlen(resname)); + } + + if (mimetype != nullptr) { + printAttribute("mime-type", mimetype, (int32_t) uprv_strlen(mimetype)); + } + + if (restype != nullptr) { + printAttribute("restype", restype, (int32_t) uprv_strlen(restype)); + } + + tabCount += 1; + if (res->fComment.fLength > 0) { + /* printComments will print the closing ">\n" */ + printComments(&res->fComment, resname, true, status); + } else { + write_utf8_file(out, UnicodeString(">\n")); + } + + return sid; +} + +/* Writing Functions */ + +static const char *trans_unit = "trans-unit"; +static const char *close_trans_unit = "</trans-unit>\n"; +static const char *source = "<source>"; +static const char *close_source = "</source>\n"; +static const char *group = "group"; +static const char *close_group = "</group>\n"; + +static const char *bin_unit = "bin-unit"; +static const char *close_bin_unit = "</bin-unit>\n"; +static const char *bin_source = "<bin-source>\n"; +static const char *close_bin_source = "</bin-source>\n"; +static const char *external_file = 
"<external-file"; +/*static const char *close_external_file = "</external-file>\n";*/ +static const char *internal_file = "<internal-file"; +static const char *close_internal_file = "</internal-file>\n"; + +static const char *application_mimetype = "application"; /* add "/octet-stream"? */ + +static const char *alias_restype = "x-icu-alias"; +static const char *array_restype = "x-icu-array"; +static const char *binary_restype = "x-icu-binary"; +static const char *integer_restype = "x-icu-integer"; +static const char *intvector_restype = "x-icu-intvector"; +static const char *table_restype = "x-icu-table"; + +static void +string_write_xml(StringResource *res, const char* id, const char* /*language*/, UErrorCode *status) { + + char *sid = nullptr; + char* buf = nullptr; + int32_t bufLen = 0; + + if(status==nullptr || U_FAILURE(*status)){ + return; + } + + sid = printContainer(res, trans_unit, nullptr, nullptr, id, status); + + write_tabs(out); + + write_utf8_file(out, UnicodeString(source)); + + buf = convertAndEscape(&buf, 0, &bufLen, res->getBuffer(), res->length(), status); + + if (U_FAILURE(*status)) { + return; + } + + write_utf8_file(out, UnicodeString(buf, bufLen, "UTF-8")); + write_utf8_file(out, UnicodeString(close_source)); + + printNoteElements(&res->fComment, status); + + tabCount -= 1; + write_tabs(out); + + write_utf8_file(out, UnicodeString(close_trans_unit)); + + uprv_free(buf); + uprv_free(sid); +} + +static void +alias_write_xml(AliasResource *res, const char* id, const char* /*language*/, UErrorCode *status) { + char *sid = nullptr; + char* buf = nullptr; + int32_t bufLen=0; + + sid = printContainer(res, trans_unit, alias_restype, nullptr, id, status); + + write_tabs(out); + + write_utf8_file(out, UnicodeString(source)); + + buf = convertAndEscape(&buf, 0, &bufLen, res->getBuffer(), res->length(), status); + + if(U_FAILURE(*status)){ + return; + } + write_utf8_file(out, UnicodeString(buf, bufLen, "UTF-8")); + write_utf8_file(out, 
UnicodeString(close_source)); + + printNoteElements(&res->fComment, status); + + tabCount -= 1; + write_tabs(out); + + write_utf8_file(out, UnicodeString(close_trans_unit)); + + uprv_free(buf); + uprv_free(sid); +} + +static void +array_write_xml(ArrayResource *res, const char* id, const char* language, UErrorCode *status) { + char* sid = nullptr; + int index = 0; + + struct SResource *current = nullptr; + + sid = printContainer(res, group, array_restype, nullptr, id, status); + + current = res->fFirst; + + while (current != nullptr) { + char c[256] = {0}; + char* subId = nullptr; + + itostr(c, index, 10, 0); + index += 1; + subId = getID(sid, c, subId); + + res_write_xml(current, subId, language, false, status); + uprv_free(subId); + subId = nullptr; + + if(U_FAILURE(*status)){ + return; + } + + current = current->fNext; + } + + tabCount -= 1; + write_tabs(out); + write_utf8_file(out, UnicodeString(close_group)); + + uprv_free(sid); +} + +static void +intvector_write_xml(IntVectorResource *res, const char* id, const char* /*language*/, UErrorCode *status) { + char* sid = nullptr; + char* ivd = nullptr; + uint32_t i=0; + uint32_t len=0; + char buf[256] = {'0'}; + + sid = printContainer(res, group, intvector_restype, nullptr, id, status); + + for(i = 0; i < res->fCount; i += 1) { + char c[256] = {0}; + + itostr(c, i, 10, 0); + ivd = getID(sid, c, ivd); + len = itostr(buf, res->fArray[i], 10, 0); + + write_tabs(out); + write_utf8_file(out, UnicodeString("<")); + write_utf8_file(out, UnicodeString(trans_unit)); + + printAttribute("id", ivd, (int32_t)uprv_strlen(ivd)); + printAttribute("restype", integer_restype, (int32_t) strlen(integer_restype)); + + write_utf8_file(out, UnicodeString(">\n")); + + tabCount += 1; + write_tabs(out); + write_utf8_file(out, UnicodeString(source)); + + write_utf8_file(out, UnicodeString(buf, len)); + + write_utf8_file(out, UnicodeString(close_source)); + tabCount -= 1; + write_tabs(out); + write_utf8_file(out, 
UnicodeString(close_trans_unit)); + + uprv_free(ivd); + ivd = nullptr; + } + + tabCount -= 1; + write_tabs(out); + + write_utf8_file(out, UnicodeString(close_group)); + uprv_free(sid); + sid = nullptr; +} + +static void +int_write_xml(IntResource *res, const char* id, const char* /*language*/, UErrorCode *status) { + char* sid = nullptr; + char buf[256] = {0}; + uint32_t len = 0; + + sid = printContainer(res, trans_unit, integer_restype, nullptr, id, status); + + write_tabs(out); + + write_utf8_file(out, UnicodeString(source)); + + len = itostr(buf, res->fValue, 10, 0); + write_utf8_file(out, UnicodeString(buf, len)); + + write_utf8_file(out, UnicodeString(close_source)); + + printNoteElements(&res->fComment, status); + + tabCount -= 1; + write_tabs(out); + + write_utf8_file(out, UnicodeString(close_trans_unit)); + + uprv_free(sid); + sid = nullptr; +} + +static void +bin_write_xml(BinaryResource *res, const char* id, const char* /*language*/, UErrorCode *status) { + const char* m_type = application_mimetype; + char* sid = nullptr; + uint32_t crc = 0xFFFFFFFF; + + char fileName[1024] ={0}; + int32_t tLen = ( outDir == nullptr) ? 0 :(int32_t)uprv_strlen(outDir); + char* fn = (char*) uprv_malloc(sizeof(char) * (tLen+1024 + + (res->fFileName !=nullptr ? 
+ uprv_strlen(res->fFileName) :0))); + const char* ext = nullptr; + + char* f = nullptr; + + fn[0]=0; + + if(res->fFileName != nullptr){ + uprv_strcpy(fileName, res->fFileName); + f = uprv_strrchr(fileName, '\\'); + + if (f != nullptr) { + f++; + } else { + f = fileName; + } + + ext = uprv_strrchr(fileName, '.'); + + if (ext == nullptr) { + fprintf(stderr, "Error: %s is an unknown binary filename type.\n", fileName); + exit(U_ILLEGAL_ARGUMENT_ERROR); + } + + if(uprv_strcmp(ext, ".jpg")==0 || uprv_strcmp(ext, ".jpeg")==0 || uprv_strcmp(ext, ".gif")==0 ){ + m_type = "image"; + } else if(uprv_strcmp(ext, ".wav")==0 || uprv_strcmp(ext, ".au")==0 ){ + m_type = "audio"; + } else if(uprv_strcmp(ext, ".avi")==0 || uprv_strcmp(ext, ".mpg")==0 || uprv_strcmp(ext, ".mpeg")==0){ + m_type = "video"; + } else if(uprv_strcmp(ext, ".txt")==0 || uprv_strcmp(ext, ".text")==0){ + m_type = "text"; + } + + sid = printContainer(res, bin_unit, binary_restype, m_type, id, status); + + write_tabs(out); + + write_utf8_file(out, UnicodeString(bin_source)); + + tabCount+= 1; + write_tabs(out); + + write_utf8_file(out, UnicodeString(external_file)); + printAttribute("href", f, (int32_t)uprv_strlen(f)); + write_utf8_file(out, UnicodeString("/>\n")); + tabCount -= 1; + write_tabs(out); + + write_utf8_file(out, UnicodeString(close_bin_source)); + + printNoteElements(&res->fComment, status); + tabCount -= 1; + write_tabs(out); + write_utf8_file(out, UnicodeString(close_bin_unit)); + } else { + char temp[256] = {0}; + uint32_t i = 0; + int32_t len=0; + + sid = printContainer(res, bin_unit, binary_restype, m_type, id, status); + + write_tabs(out); + write_utf8_file(out, UnicodeString(bin_source)); + + tabCount += 1; + write_tabs(out); + + write_utf8_file(out, UnicodeString(internal_file)); + printAttribute("form", application_mimetype, (int32_t) uprv_strlen(application_mimetype)); + + while(i <res->fLength){ + len = itostr(temp, res->fData[i], 16, 2); + crc = computeCRC(temp, len, crc); + i++; + } + 
+ len = itostr(temp, crc, 10, 0); + printAttribute("crc", temp, len); + + write_utf8_file(out, UnicodeString(">")); + + i = 0; + while(i <res->fLength){ + len = itostr(temp, res->fData[i], 16, 2); + write_utf8_file(out, UnicodeString(temp)); + i += 1; + } + + write_utf8_file(out, UnicodeString(close_internal_file)); + + tabCount -= 2; + write_tabs(out); + + write_utf8_file(out, UnicodeString(close_bin_source)); + printNoteElements(&res->fComment, status); + + tabCount -= 1; + write_tabs(out); + write_utf8_file(out, UnicodeString(close_bin_unit)); + + uprv_free(sid); + sid = nullptr; + } + + uprv_free(fn); +} + + + +static void +table_write_xml(TableResource *res, const char* id, const char* language, UBool isTopLevel, UErrorCode *status) { + + struct SResource *current = nullptr; + char* sid = nullptr; + + if (U_FAILURE(*status)) { + return ; + } + + sid = printContainer(res, group, table_restype, nullptr, id, status); + + if(isTopLevel) { + sid[0] = '\0'; + } + + current = res->fFirst; + + while (current != nullptr) { + res_write_xml(current, sid, language, false, status); + + if(U_FAILURE(*status)){ + return; + } + + current = current->fNext; + } + + tabCount -= 1; + write_tabs(out); + + write_utf8_file(out, UnicodeString(close_group)); + + uprv_free(sid); + sid = nullptr; +} + +void +res_write_xml(struct SResource *res, const char* id, const char* language, UBool isTopLevel, UErrorCode *status) { + + if (U_FAILURE(*status)) { + return ; + } + + if (res != nullptr) { + switch (res->fType) { + case URES_STRING: + string_write_xml (static_cast<StringResource *>(res), id, language, status); + return; + + case URES_ALIAS: + alias_write_xml (static_cast<AliasResource *>(res), id, language, status); + return; + + case URES_INT_VECTOR: + intvector_write_xml (static_cast<IntVectorResource *>(res), id, language, status); + return; + + case URES_BINARY: + bin_write_xml (static_cast<BinaryResource *>(res), id, language, status); + return; + + case URES_INT: + int_write_xml 
(static_cast<IntResource *>(res), id, language, status); + return; + + case URES_ARRAY: + array_write_xml (static_cast<ArrayResource *>(res), id, language, status); + return; + + case URES_TABLE: + table_write_xml (static_cast<TableResource *>(res), id, language, isTopLevel, status); + return; + + default: + break; + } + } + + *status = U_INTERNAL_PROGRAM_ERROR; +} + +void +bundle_write_xml(struct SRBRoot *bundle, const char *outputDir,const char* outputEnc, const char* filename, + char *writtenFilename, int writtenFilenameLen, + const char* language, const char* outFileName, UErrorCode *status) { + + char* xmlfileName = nullptr; + char* outputFileName = nullptr; + char* originalFileName = nullptr; + const char* fileStart = "<file xml:space = \"preserve\" source-language = \""; + const char* file1 = "\" datatype = \"x-icu-resource-bundle\" "; + const char* file2 = "original = \""; + const char* file4 = "\" date = \""; + const char* fileEnd = "</file>\n"; + const char* headerStart = "<header>\n"; + const char* headerEnd = "</header>\n"; + const char* bodyStart = "<body>\n"; + const char* bodyEnd = "</body>\n"; + + const char *tool_start = "<tool"; + const char *tool_id = "genrb-" GENRB_VERSION "-icu-" U_ICU_VERSION; + const char *tool_name = "genrb"; + + char* temp = nullptr; + char* lang = nullptr; + const char* pos = nullptr; + int32_t first, index; + time_t currTime; + char timeBuf[128]; + + outDir = outputDir; + + srBundle = bundle; + + pos = uprv_strrchr(filename, '\\'); + if(pos != nullptr) { + first = (int32_t)(pos - filename + 1); + } else { + first = 0; + } + index = (int32_t)(uprv_strlen(filename) - uprv_strlen(textExt) - first); + originalFileName = (char *)uprv_malloc(sizeof(char)*index+1); + uprv_memset(originalFileName, 0, sizeof(char)*index+1); + uprv_strncpy(originalFileName, filename + first, index); + + if(uprv_strcmp(originalFileName, srBundle->fLocale) != 0) { + fprintf(stdout, "Warning: The file name is not same as the resource name!\n"); + } + 
+ temp = originalFileName; + originalFileName = (char *)uprv_malloc(sizeof(char)* (uprv_strlen(temp)+uprv_strlen(textExt)) + 1); + uprv_memset(originalFileName, 0, sizeof(char)* (uprv_strlen(temp)+uprv_strlen(textExt)) + 1); + uprv_strcat(originalFileName, temp); + uprv_strcat(originalFileName, textExt); + uprv_free(temp); + temp = nullptr; + + + if (language == nullptr) { +/* lang = parseFilename(filename, lang); + if (lang == nullptr) {*/ + /* now check if locale name is valid or not + * this is to cater for situation where + * pegasusServer.txt contains + * + * en{ + * .. + * } + */ + lang = parseFilename(srBundle->fLocale, lang); + /* + * Neither the file name nor the table name inside the + * txt file contain a valid country and language codes + * throw an error. + * pegasusServer.txt contains + * + * testelements{ + * .... + * } + */ + if(lang==nullptr){ + fprintf(stderr, "Error: The file name and table name do not contain a valid language code. Please use -l option to specify it.\n"); + exit(U_ILLEGAL_ARGUMENT_ERROR); + } + /* }*/ + } else { + lang = (char *)uprv_malloc(sizeof(char)*uprv_strlen(language) +1); + uprv_memset(lang, 0, sizeof(char)*uprv_strlen(language) +1); + uprv_strcpy(lang, language); + } + + if(outFileName) { + outputFileName = (char *)uprv_malloc(sizeof(char)*uprv_strlen(outFileName) + 1); + uprv_memset(outputFileName, 0, sizeof(char)*uprv_strlen(outFileName) + 1); + uprv_strcpy(outputFileName,outFileName); + } else { + outputFileName = (char *)uprv_malloc(sizeof(char)*uprv_strlen(srBundle->fLocale) + 1); + uprv_memset(outputFileName, 0, sizeof(char)*uprv_strlen(srBundle->fLocale) + 1); + uprv_strcpy(outputFileName,srBundle->fLocale); + } + + if(outputDir) { + xmlfileName = (char *)uprv_malloc(sizeof(char)*(uprv_strlen(outputDir) + uprv_strlen(outputFileName) + uprv_strlen(xliffExt) + 1) +1); + uprv_memset(xmlfileName, 0, sizeof(char)*(uprv_strlen(outputDir)+ uprv_strlen(outputFileName) + uprv_strlen(xliffExt) + 1) +1); + } else { + 
xmlfileName = (char *)uprv_malloc(sizeof(char)*(uprv_strlen(outputFileName) + uprv_strlen(xliffExt)) +1); + uprv_memset(xmlfileName, 0, sizeof(char)*(uprv_strlen(outputFileName) + uprv_strlen(xliffExt)) +1); + } + + if(outputDir){ + uprv_strcpy(xmlfileName, outputDir); + if(outputDir[uprv_strlen(outputDir)-1] !=U_FILE_SEP_CHAR){ + uprv_strcat(xmlfileName,U_FILE_SEP_STRING); + } + } + uprv_strcat(xmlfileName,outputFileName); + uprv_strcat(xmlfileName,xliffExt); + + if (writtenFilename) { + uprv_strncpy(writtenFilename, xmlfileName, writtenFilenameLen); + } + + if (U_FAILURE(*status)) { + goto cleanup_bundle_write_xml; + } + + out= T_FileStream_open(xmlfileName,"w"); + + if(out==nullptr){ + *status = U_FILE_ACCESS_ERROR; + goto cleanup_bundle_write_xml; + } + write_utf8_file(out, UnicodeString(xmlHeader)); + + if(outputEnc && *outputEnc!='\0'){ + /* store the output encoding */ + enc = outputEnc; + conv=ucnv_open(enc,status); + if(U_FAILURE(*status)){ + goto cleanup_bundle_write_xml; + } + } + write_utf8_file(out, UnicodeString(bundleStart)); + write_tabs(out); + write_utf8_file(out, UnicodeString(fileStart)); + /* check if lang and language are the same */ + if(language != nullptr && uprv_strcmp(lang, srBundle->fLocale)!=0){ + fprintf(stderr,"Warning: The top level tag in the resource and language specified are not the same. 
Please check the input.\n"); + } + write_utf8_file(out, UnicodeString(lang)); + write_utf8_file(out, UnicodeString(file1)); + write_utf8_file(out, UnicodeString(file2)); + write_utf8_file(out, UnicodeString(originalFileName)); + write_utf8_file(out, UnicodeString(file4)); + + time(&currTime); + strftime(timeBuf, sizeof(timeBuf), "%Y-%m-%dT%H:%M:%SZ", gmtime(&currTime)); + write_utf8_file(out, UnicodeString(timeBuf)); + write_utf8_file(out, UnicodeString("\">\n")); + + tabCount += 1; + write_tabs(out); + write_utf8_file(out, UnicodeString(headerStart)); + + tabCount += 1; + write_tabs(out); + + write_utf8_file(out, UnicodeString(tool_start)); + printAttribute("tool-id", tool_id, (int32_t) uprv_strlen(tool_id)); + printAttribute("tool-name", tool_name, (int32_t) uprv_strlen(tool_name)); + write_utf8_file(out, UnicodeString("/>\n")); + + tabCount -= 1; + write_tabs(out); + + write_utf8_file(out, UnicodeString(headerEnd)); + + write_tabs(out); + tabCount += 1; + + write_utf8_file(out, UnicodeString(bodyStart)); + + + res_write_xml(bundle->fRoot, bundle->fLocale, lang, true, status); + + tabCount -= 1; + write_tabs(out); + + write_utf8_file(out, UnicodeString(bodyEnd)); + tabCount--; + write_tabs(out); + write_utf8_file(out, UnicodeString(fileEnd)); + tabCount--; + write_tabs(out); + write_utf8_file(out, UnicodeString(bundleEnd)); + T_FileStream_close(out); + + ucnv_close(conv); + +cleanup_bundle_write_xml: + uprv_free(originalFileName); + uprv_free(lang); + if(xmlfileName != nullptr) { + uprv_free(xmlfileName); + } + if(outputFileName != nullptr){ + uprv_free(outputFileName); + } +} diff --git a/intl/icu/source/tools/genren/Makefile b/intl/icu/source/tools/genren/Makefile new file mode 100644 index 0000000000..f0ab666999 --- /dev/null +++ b/intl/icu/source/tools/genren/Makefile @@ -0,0 +1,105 @@ +# Copyright (C) 2016 and later: Unicode, Inc. and others. 
+# License & terms of use: http://www.unicode.org/copyright.html +#****************************************************************************** +# +# Copyright (C) 2002-2011, International Business Machines +# Corporation and others. All Rights Reserved. +# +#****************************************************************************** + +TOP=../.. + +srcdir = . +top_srcdir = $(TOP) +top_builddir = $(TOP) + +# override if you have an out-of-source build (not yet working.) +BUILDDIR = $(top_builddir) + +ICUDIR=ICUunrenamed + +# Extra flags to prevent internal API from being hidden. +# This is important because ELF (Linux) based platforms that don't hide internal +# API will allow a duplicate internal name to resolve to an external library. +# See the gcc manual on the "visibility" attribute for details. +FLAG_OVERRIDE= LIBCFLAGS= LIBCXXFLAGS= $(EXTRA_MAKE_OPTIONS) + +## any local overrides +-include Makefile.local + +# load definition of .SO, etc (but not if we are doing 'make clean') +ifneq ($(patsubst %clean,,$(MAKECMDGOALS)),) +include $(BUILDDIR)/icudefs.mk +endif + +# For MinGW, do we want the DLL to go in the bin location? +ifeq ($(MINGW_MOVEDLLSTOBINDIR),YES) +installdir = bin +else +installdir = lib +endif + +COM=$(ICUDIR)/$(installdir)/libicuuc.$(SO) +I18=$(ICUDIR)/$(installdir)/libicui18n.$(SO) +LEX=$(ICUDIR)/$(installdir)/libiculx.$(SO) +DAT=$(ICUDIR)/stubdata/libicudata.$(SO) +UIO=$(ICUDIR)/$(installdir)/libicuio.$(SO) + +LIBS=$(COM) $(I18) $(LEX) $(UIO) + +## Targets. + + +all: + @cat README + +clean: + -rm -rf $(ICUDIR) urename.* *~ + +# We use config.status to mean we have a valid out of source tree. 
+ +$(ICUDIR)/config.status: + -mv $(ICUDIR) $(ICUDIR)old + -(rm -rf $(ICUDIR)old &) + mkdir $(ICUDIR) + ( cd $(ICUDIR) ; CPPFLAGS="-DU_DISABLE_RENAMING=1 -DUCONFIG_ENABLE_PLUGINS" $(GENREN_CONFIGURE_ENV) $(top_srcdir)/../configure --with-data-packaging=archive --enable-tests=no --prefix=`pwd` $(GENREN_CONFIGURE_OPTS) ) + # cause lib and bin to be created, and any other general sanity + $(MAKE) $(FLAG_OVERRIDE) -C $(ICUDIR) clean + $(MAKE) $(FLAG_OVERRIDE) -C $(ICUDIR) all-local + +# build the libraries +$(DAT): $(ICUDIR)/config.status Makefile + $(MAKE) $(FLAG_OVERRIDE) -C $(ICUDIR)/stubdata all-local + +$(COM): $(DAT) $(ICUDIR)/config.status Makefile + $(MAKE) $(FLAG_OVERRIDE) -C $(ICUDIR)/common all-local + +$(I18): $(DAT) $(COM) $(ICUDIR)/config.status Makefile + $(MAKE) $(FLAG_OVERRIDE) -C $(ICUDIR)/i18n all-local + +$(LEX): $(DAT) $(I18) $(COM) $(ICUDIR)/config.status Makefile + $(MAKE) $(FLAG_OVERRIDE) -C $(ICUDIR)/layoutex all-local + +$(UIO): $(DAT) $(I18) $(COM) $(ICUDIR)/config.status Makefile + $(MAKE) $(FLAG_OVERRIDE) -C $(ICUDIR)/io all-local + +# the header itself +urename.h: $(LIBS) genren.pl + -cp urename.h urename.h.old + perl ./genren.pl $(GENREN_PL_OPTS) $(LIBS) + +# This is still here, but less useful with the "new" macro-based rename. Just use 'svn diff'. +sorts: urename.sort urename.old.sort + @echo "*** Please check urename.h manually before committing it." + @echo "Try 'diff --side-by-side urename.old.sort urename.sort'" + +urename.sort: urename.h + sort urename.h > $@ + +urename.old.sort: $(top_srcdir)/common/unicode/urename.h + sort $(top_srcdir)/common/unicode/urename.h > $@ + +install-header: urename.h + cp urename.h $(top_srcdir)/common/unicode/ + @echo "*** Please check urename.h manually before committing it." 
+ diff --git a/intl/icu/source/tools/genren/README b/intl/icu/source/tools/genren/README new file mode 100644 index 0000000000..a18c294136 --- /dev/null +++ b/intl/icu/source/tools/genren/README @@ -0,0 +1,47 @@ +Copyright (C) 2016 and later: Unicode, Inc. and others. License & terms of use: http://www.unicode.org/copyright.html +Copyright (c) 2002-2011, International Business Machines Corporation and others. All Rights Reserved. + +The genren.pl script is used to generate source/common/unicode/urename.h header file, which is needed for renaming the ICU exported names. + +This script is intended to be used on Linux, although it should work on any platform that has Perl and nm command. Makefile may need to be updated, it's not 100% portable. + +It also does not currently work well in an out-of-source situation. + +The following instructions are for Linux version. +- urename.h file should be generated after implementation is complete for a release. +- the version number for a release should be set according to the list in source/common/unicode/uvernum.h +- Note: If you are running the script in a clean checkout, you must run the runConfigureICU at least once before + running the make install-header command below. + +Before generating urename.h, the layout engine header files must be installed from the harfbuzz project. +This is prerequisite for the icu layoutex (Paragraph Layout) project, which is subject to renaming. +(Using the svn command is the simplest way of getting just the files from one subdirectory of the git project.) + + cd icu4c/source + svn export https://github.com/behdad/icu-le-hb/trunk/src layout + +(As an alternative to the above handling of layout engine header files, you can do the following: +1. In the Makefile in this directory, temporarily delete $(LEX) from the list of objects for LIBS + before running make install-header +2. After running make install-header, restore the deleted $(LEX) in the Makefile +3. 
Then when comparing the old urename.h to the newly generated one, copy all of the lines beginning + "#define pl_" from the old version to the new one. + - Peter E) + +- Regenerate urename.h + + cd icu4c/source/tools/genren + make install-header + +- urename.h will be updated in icu/source/common/unicode/urename.h **in your original source directory** +- Warnings concerning bad namespace (not 'icu') on UCaseMap can be ignored. +- The defines for "__bss_start", "_edata", and "_end" should be ignored/removed (See ICU-20176). +- Eyeball the new file for errors + + cd icu4c/source + git diff common/unicode/urename.h + +- Other make targets here + + clean - cleans out intermediate files + urename.h -just builds ./urename.h diff --git a/intl/icu/source/tools/genren/genren.pl b/intl/icu/source/tools/genren/genren.pl new file mode 100755 index 0000000000..f85b96ac60 --- /dev/null +++ b/intl/icu/source/tools/genren/genren.pl @@ -0,0 +1,275 @@ +#!/usr/bin/perl +# Copyright (C) 2016 and later: Unicode, Inc. and others. +# License & terms of use: http://www.unicode.org/copyright.html +#* +#******************************************************************************* +#* Copyright (C) 2001-2012, International Business Machines +#* Corporation and others. All Rights Reserved. +#******************************************************************************* +#* +#* file name: genren.pl +#* encoding: UTF-8 +#* tab size: 8 (not used) +#* indentation:4 +#* +#* Created by: Vladimir Weinstein +#* 07/19/2001 +#* +#* Used to generate renaming headers. 
+#* Run on UNIX platforms (linux) in order to catch all the exports + +use POSIX qw(strftime); + +$headername = 'urename.h'; + +$path = substr($0, 0, rindex($0, "/")+1)."../../common/unicode/uversion.h"; + +$nmopts = '-Cg -f s'; +$post = ''; + +$mode = 'POSIX'; + +(-e $path) || die "Cannot find uversion.h"; + +open(UVERSION, $path); + +while(<UVERSION>) { + if(/\#define U_ICU_VERSION_SUFFIX/) { + chop; + s/\#define U_ICU_VERSION_SUFFIX //; + $U_ICU_VERSION_SUFFIX = "$_"; + last; + } +} + +while($ARGV[0] =~ /^-/) { # detects whether there are any arguments + $_ = shift @ARGV; # extracts the argument for processing + /^-v/ && ($VERBOSE++, next); # verbose + /^-h/ && (&printHelpMsgAndExit, next); # help + /^-o/ && (($headername = shift (@ARGV)), next); # output file + /^-n/ && (($nmopts = shift (@ARGV)), next); # nm opts + /^-p/ && (($post = shift (@ARGV)), next); # nm opts + /^-x/ && (($mode = shift (@ARGV)), next); # nm opts + /^-S/ && (($U_ICU_VERSION_SUFFIX = shift(@ARGV)), next); # pick the suffix + warn("Invalid option $_\n"); + &printHelpMsgAndExit; +} + +unless(@ARGV > 0) { + warn "No libraries, exiting...\n"; + &printHelpMsgAndExit; +} + +#$headername = "uren".substr($ARGV[0], 6, index(".", $ARGV[0])-7).".h"; + +$HEADERDEF = uc($headername); # this is building the constant for #define +$HEADERDEF =~ s/\./_/; + + + open HEADER, ">$headername"; # opening a header file + +#We will print our copyright here + warnings + +print HEADER <<"EndOfHeaderComment"; +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* Copyright (C) 2002-2016, International Business Machines +* Corporation and others. All Rights Reserved. 
+******************************************************************************* +* +* file name: $headername +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* Created by: Perl script tools/genren.pl written by Vladimir Weinstein +* +* Contains data for renaming ICU exports. +* Gets included by umachine.h +* +* THIS FILE IS MACHINE-GENERATED, DON'T PLAY WITH IT IF YOU DON'T KNOW WHAT +* YOU ARE DOING, OTHERWISE VERY BAD THINGS WILL HAPPEN! +*/ + +#ifndef $HEADERDEF +#define $HEADERDEF + +/* U_DISABLE_RENAMING can be defined in the following ways: + * - when running configure, e.g. + * runConfigureICU Linux --disable-renaming + * - by changing the default setting of U_DISABLE_RENAMING in uconfig.h + */ + +#include "unicode/uconfig.h" + +#if !U_DISABLE_RENAMING + +// Disable Renaming for Visual Studio's IntelliSense feature, so that 'Go-to-Definition' (F12) will work. +#if !(defined(_MSC_VER) && defined(__INTELLISENSE__)) + +/* We need the U_ICU_ENTRY_POINT_RENAME definition. There's a default one in unicode/uvernum.h we can use, but we will give + the platform a chance to define it first. + Normally (if utypes.h or umachine.h was included first) this will not be necessary as it will already be defined. + */ + +#ifndef U_ICU_ENTRY_POINT_RENAME +#include "unicode/umachine.h" +#endif + +/* If we still don't have U_ICU_ENTRY_POINT_RENAME use the default. */ +#ifndef U_ICU_ENTRY_POINT_RENAME +#include "unicode/uvernum.h" +#endif + +/* Error out before the following defines cause very strange and unexpected code breakage */ +#ifndef U_ICU_ENTRY_POINT_RENAME +#error U_ICU_ENTRY_POINT_RENAME is not defined - cannot continue. Consider defining U_DISABLE_RENAMING if renaming should not be used. +#endif + +EndOfHeaderComment + +$fileCount = 0; +$itemCount = 0; +$symbolCount = 0; + +for(;@ARGV; shift(@ARGV)) { + $fileCount++; + @NMRESULT = `nm $nmopts $ARGV[0] $post`; + if($?) 
{ + warn "Couldn't do 'nm' for $ARGV[0], continuing...\n"; + next; # Couldn't do nm for the file + } + if($mode =~ /POSIX/) { + splice @NMRESULT, 0, 6; + } elsif ($mode =~ /Mach-O/) { +# splice @NMRESULT, 0, 10; + } + foreach (@NMRESULT) { # Process every line of result and stuff it in $_ + $itemCount++; + if($mode =~ /POSIX/) { + &verbose(" $_"); + ($_, $address, $type) = split(/\|/); + chop $qtype; + } elsif ($mode =~ /Mach-O/) { + ($address, $type, $_) = split(/ /); + if(/^_(.*)$/) { + $_ = $1; + } else { + next; + } + } else { + die "Unknown mode $mode"; + } + &verbose( "type: \"$type\" "); + if(!($type =~ /[UAwW?]/)) { + if(/@@/) { # These would be imports + &verbose( "Import: $_ \"$type\"\n"); + &verbose( "C++ method: $_\n"); + } elsif (/^[^\(]*::/) { # C++ methods, stuff class name in associative array + ## DON'T match ... ( foo::bar ... want :: to be to the left of paren + ## icu::CharString::~CharString(void) -> CharString + @CppName = split(/::/); ## remove scope stuff + + if(@CppName>1) { + ## MessageFormat virtual table -> MessageFormat + if(! 
($CppName[0] =~ /icu/ )) { + # *** WARNING Bad namespace (not 'icu') on ShoeSize::ShoeSize() + warn "*** WARNING Bad namespace (not 'icu') on $_\n"; + next; + } + &verbose ( "(Chopping scope $CppName[0] )"); + @CppName = split(/ /, $CppName[1]); ## remove debug stuff + } + ## ures_getUnicodeStringByIndex(UResourceBundle -> ures_getUnicodeStringByIndex + @CppName = split(/\(/, $CppName[0]); ## remove function args + if($CppName[0] =~ /^operator/) { + &verbose ("Skipping C++ function: $_\n"); + } elsif($CppName[0] =~ /^~/) { + &verbose ("Skipping C++ destructor: $_\n"); + } else { + &verbose( "Skipping C++ class: '$CppName[0]': $_ \n"); + # $CppClasses{$CppName[0]}++; + # $symbolCount++; + } + } elsif ( my ($cfn) = m/^([A-Za-z0-9_]*)\(.*/ ) { + &verbose ( "$ARGV[0]: got global C++ function $cfn with '$_'\n" ); + $CFuncs{$cfn}++; + $symbolCount++; + } elsif ( /\(/) { # These are strange functions + print STDERR "$ARGV[0]: Not sure what to do with '$_'\n"; + } elsif ( /^_init/ ) { + &verbose( "$ARGV[0]: Skipped initializer $_\n" ); + } elsif ( /^_fini/ ) { + &verbose( "$ARGV[0]: Skipped finilizer $_\n" ); + } elsif ( /icu_/) { + print STDERR "$ARGV[0]: Skipped strange mangled function $_\n"; + } elsif ( /^vtable for /) { + print STDERR "$ARGV[0]: Skipped vtable $_\n"; + } elsif ( /^typeinfo/) { + print STDERR "$ARGV[0]: Skipped typeinfo $_\n"; + } elsif ( /operator\+/ ) { + print STDERR "$ARGV[0]: Skipped ignored function $_\n"; + } else { # This is regular C function + &verbose( "C func: $_\n"); + @funcname = split(/[\(\s+]/); + $CFuncs{$funcname[0]}++; + $symbolCount++; + } + } else { + &verbose( "Skipped: $_ $1\n"); + } + } +} + +if( $fileCount == 0 ) { + die "Error: $itemCount lines from $fileCount files processed, but $symbolCount symbols were found.\n"; +} + +if( $symbolCount == 0 ) { + die "Error: $itemCount lines from $fileCount files processed, but $symbolCount symbols were found.\n"; +} + +print " Loaded $symbolCount symbols from $itemCount lines in 
$fileCount files.\n"; + +print HEADER "\n/* C exports renaming data */\n\n"; +foreach(sort keys(%CFuncs)) { + print HEADER "#define $_ U_ICU_ENTRY_POINT_RENAME($_)\n"; +# print HEADER "#define $_ $_$U_ICU_VERSION_SUFFIX\n"; +} + + +print HEADER <<"EndOfHeaderFooter"; + +#endif /* !(defined(_MSC_VER) && defined(__INTELLISENSE__)) */ +#endif /* U_DISABLE_RENAMING */ +#endif /* URENAME_H */ + +EndOfHeaderFooter + + +close HEADER; + +sub verbose { + if($VERBOSE) { + print STDERR @_; + } +} + + +sub printHelpMsgAndExit { + print STDERR <<"EndHelpText"; +Usage: $0 [OPTIONS] LIBRARY_FILES + Options: + -v - verbose + -h - help + -o - output file name (defaults to 'urename.h' + -S - suffix (defaults to _MAJOR_MINOR of current ICU version) +Will produce a renaming .h file + +EndHelpText + + exit 0; + +} diff --git a/intl/icu/source/tools/gensprep/Makefile.in b/intl/icu/source/tools/gensprep/Makefile.in new file mode 100644 index 0000000000..7f475aeb56 --- /dev/null +++ b/intl/icu/source/tools/gensprep/Makefile.in @@ -0,0 +1,97 @@ +## Makefile.in for ICU - tools/gensprep +## Copyright (C) 2016 and later: Unicode, Inc. and others. +## License & terms of use: http://www.unicode.org/copyright.html +## Copyright (c) 2001-2011, International Business Machines Corporation and +## others. All Rights Reserved. +## Steven R. Loomis/Markus W. Scherer + +## Source directory information +srcdir = @srcdir@ +top_srcdir = @top_srcdir@ + +top_builddir = ../.. 
+ +include $(top_builddir)/icudefs.mk + +## Build directory information +subdir = tools/gensprep + +TARGET_STUB_NAME = gensprep + +SECTION = 8 + +MAN_FILES = $(TARGET_STUB_NAME).$(SECTION) + + +## Extra files to remove for 'make clean' +CLEANFILES = *~ $(DEPS) $(MAN_FILES) + +## Target information +TARGET = $(BINDIR)/$(TARGET_STUB_NAME)$(EXEEXT) + +CPPFLAGS += -I$(top_srcdir)/common -I$(srcdir)/../toolutil +LIBS = $(LIBICUTOOLUTIL) $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS) $(LIB_M) + +SOURCES = $(shell cat $(srcdir)/sources.txt) +OBJECTS = $(SOURCES:.c=.o) + +DEPS = $(OBJECTS:.o=.d) + +## List of phony targets +.PHONY : all all-local install install-local clean clean-local \ +distclean distclean-local dist dist-local check check-local install-man + +## Clear suffix list +.SUFFIXES : + +## List of standard targets +all: all-local +install: install-local +clean: clean-local +distclean : distclean-local +dist: dist-local +check: all check-local + +all-local: $(TARGET) $(MAN_FILES) + +install-local: all-local install-man + $(MKINSTALLDIRS) $(DESTDIR)$(sbindir) + $(INSTALL) $(TARGET) $(DESTDIR)$(sbindir) + +install-man: $(MAN_FILES) + $(MKINSTALLDIRS) $(DESTDIR)$(mandir)/man$(SECTION) + $(INSTALL_DATA) $? 
$(DESTDIR)$(mandir)/man$(SECTION) + +dist-local: + +clean-local: + test -z "$(CLEANFILES)" || $(RMV) $(CLEANFILES) + $(RMV) $(TARGET) $(OBJECTS) + +distclean-local: clean-local + $(RMV) Makefile + +check-local: all-local + +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + cd $(top_builddir) \ + && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status + +$(TARGET) : $(OBJECTS) + $(LINK.cc) $(OUTOPT)$@ $^ $(LIBS) + $(POST_BUILD_STEP) + + +%.$(SECTION): $(srcdir)/%.$(SECTION).in + cd $(top_builddir) \ + && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status + + +ifeq (,$(MAKECMDGOALS)) +-include $(DEPS) +else +ifneq ($(patsubst %clean,,$(MAKECMDGOALS)),) +-include $(DEPS) +endif +endif + diff --git a/intl/icu/source/tools/gensprep/filterRFC3454.pl b/intl/icu/source/tools/gensprep/filterRFC3454.pl new file mode 100755 index 0000000000..321b03512c --- /dev/null +++ b/intl/icu/source/tools/gensprep/filterRFC3454.pl @@ -0,0 +1,678 @@ +#!/usr/bin/perl +# Copyright (C) 2016 and later: Unicode, Inc. and others. +# License & terms of use: http://www.unicode.org/copyright.html +# Copyright (c) 2001-2015 International Business Machines +# Corporation and others. All Rights Reserved. + +#################################################################################### +# filterRFC3454.pl: +# This tool filters the RFC-3454 txt file for StringPrep tables and creates a table +# to be used in NamePrepProfile +# +# Author: Ram Viswanadha +# +#################################################################################### + +use File::Find; +use File::Basename; +use IO::File; +use Cwd; +use File::Copy; +use Getopt::Long; +use File::Path; +use File::Copy; +use Time::localtime; + +$icu_copyright = "#####################################################################\n# Copyright (c) %d, International Business Machines Corporation and\n# others. 
All Rights Reserved.\n#####################################################################\n\n"; +$copyright = "###################\n# This file was generated from RFC 3454 (http://www.ietf.org/rfc/rfc3454.txt)\n# Copyright (C) The Internet Society (2002). All Rights Reserved. \n###################\n\n"; +$warning = "###################\n# WARNING: This table is generated by filterRFC3454.pl tool with\n# options: @ARGV \n###################\n\n"; +#run the program) +main(); + +#--------------------------------------------------------------------- +# The main program + +sub main(){ + GetOptions( + "--sourcedir=s" => \$sourceDir, + "--destdir=s" => \$destDir, + "--src-filename=s" => \$srcFileName, + "--dest-filename=s" => \$destFileName, + "--A1" => \$a1, + "--B1" => \$b1, + "--B2" => \$b2, + "--B3" => \$b3, + "--C11" => \$c11, + "--C12" => \$c12, + "--C21" => \$c21, + "--C22" => \$c22, + "--C3" => \$c3, + "--C4" => \$c4, + "--C5" => \$c5, + "--C6" => \$c6, + "--C7" => \$c7, + "--C8" => \$c8, + "--C9" => \$c9, + "--iscsi" => \$writeISCSIProhibitedExtra, + "--xmpp-node" => \$writeXMPPNodeProhibitedExtra, + "--sasl" => \$writeSASLMap, + "--ldap" => \$writeLDAPMap, + "--normalize" => \$norm, + "--check-bidi" => \$checkBidi, + ); + usage() unless defined $sourceDir; + usage() unless defined $destDir; + usage() unless defined $srcFileName; + usage() unless defined $destFileName; + + $infile = $sourceDir."/".$srcFileName; + $inFH = IO::File->new($infile,"r") + or die "could not open the file $infile for reading: $! \n"; + $outfile = $destDir."/".$destFileName; + + unlink($outfile); + $outFH = IO::File->new($outfile,"a") + or die "could not open the file $outfile for writing: $! 
\n"; + + printf $outFH $icu_copyright, localtime->year()+1900; + print $outFH $copyright; + print $outFH $warning; + + if(defined $norm) { + print $outFH "\@normalize;;\n"; + } + if(defined $checkBidi) { + print $outFH "\@check-bidi;;\n"; + } + print $outFH "\n"; + close($outFH); + + if(defined $b2 && defined $b3){ + die "ERROR: --B2 and --B3 are both specified\!\n"; + } + + while(defined ($line=<$inFH>)){ + next unless $line=~ /Start\sTable/; + if($line =~ /A.1/){ + createUnassignedTable($inFH,$outfile); + } + if($line =~ /B.1/ && defined $b1){ + createMapToNothing($inFH,$outfile); + } + if($line =~ /B.2/ && defined $b2){ + createCaseMapNorm($inFH,$outfile); + } + if($line =~ /B.3/ && defined $b3){ + createCaseMapNoNorm($inFH,$outfile); + } + if($line =~ /C.1.1/ && defined $c11 ){ + createProhibitedTable($inFH,$outfile,$line); + } + if($line =~ /C.1.2/ && defined $c12 ){ + createProhibitedTable($inFH,$outfile,$line); + } + if($line =~ /C.2.1/ && defined $c21 ){ + createProhibitedTable($inFH,$outfile,$line); + } + if($line =~ /C.2.2/ && defined $c22 ){ + createProhibitedTable($inFH,$outfile,$line); + } + if($line =~ /C.3/ && defined $c3 ){ + createProhibitedTable($inFH,$outfile,$line); + } + if($line =~ /C.4/ && defined $c4 ){ + createProhibitedTable($inFH,$outfile,$line); + } + if($line =~ /C.5/ && defined $c5 ){ + createProhibitedTable($inFH,$outfile,$line); + } + if($line =~ /C.6/ && defined $c6 ){ + createProhibitedTable($inFH,$outfile,$line); + } + if($line =~ /C.7/ && defined $c7 ){ + createProhibitedTable($inFH,$outfile,$line); + } + if($line =~ /C.8/ && defined $c8 ){ + createProhibitedTable($inFH,$outfile,$line); + } + if($line =~ /C.9/ && defined $c9 ){ + createProhibitedTable($inFH,$outfile,$line); + } + } + if( defined $writeISCSIProhibitedExtra){ + create_iSCSIExtraProhibitedTable($inFH, $outfile); + } + if( defined $writeXMPPNodeProhibitedExtra){ + create_XMPPNodeExtraProhibitedTable($inFH, $outfile); + } + if( defined $writeSASLMap){ + 
create_SASLMapTable($inFH, $outfile); + } + if( defined $writeLDAPMap){ + create_LDAPMapTable($inFH, $outfile); + } + close($inFH); +} + +#----------------------------------------------------------------------- +sub readPrint{ + local ($inFH, $outFH,$comment, $table) = @_; + $count = 0; + print $outFH $comment."\n"; + while(defined ($line = <$inFH>)){ + next if $line =~ /Hoffman\s\&\sBlanchet/; # ignore heading + next if $line =~ /RFC\s3454/; # ignore heading + next if $line =~ /\f/; # ignore form feed + next if $line eq "\n"; # ignore blank lines + # break if "End Table" is found + if( $line =~ /End\sTable/){ + print $outFH "\n# Total code points $count\n\n"; + return; + } + if($print==1){ + print $line; + } + $line =~ s/-/../; + $line =~ s/^\s+//; + if($line =~ /\;/){ + }else{ + $line =~ s/$/;/; + } + if($table =~ /A/ ){ + ($code, $noise) = split /;/ , $line; + $line = $code."; ; UNASSIGNED\n"; + }elsif ( $table =~ /B\.1/ ){ + $line =~ s/Map to nothing/MAP/; + }elsif ( $table =~ /B\.[23]/ ){ + $line =~ s/Case map/MAP/; + $line =~ s/Additional folding/MAP/; + }elsif ( $table =~ /C/ ) { + ($code, $noise) = split /;/ , $line; + $line = $code."; ; PROHIBITED\n"; + } + if($line =~ /\.\./){ + ($code, $noise) = split /;/ , $line; + ($startStr, $endStr ) = split /\.\./, $code; + $start = atoi($startStr); + $end = atoi($endStr); + #print $start." ".$end."\n"; + while($start <= $end){ + $count++; + $start++; + } + }else{ + $count++; + } + print $outFH $line; + } +} +#----------------------------------------------------------------------- +sub atoi { + my $t; + foreach my $d (split(//, shift())) { + $t = $t * 16 + $d; + } + return $t; +} +#----------------------------------------------------------------------- +sub createUnassignedTable{ + ($inFH,$outfile) = @_; + $outFH = IO::File->new($outfile,"a") + or die "could not open the file $outfile for writing: $! 
\n"; + $comment = "# This table contains code points from Table A.1 from RFC 3454\n"; + readPrint($inFH,$outFH, $comment, "A"); + close($outFH); +} +#----------------------------------------------------------------------- +sub createMapToNothing{ + ($inFH,$outfile) = @_; + $outFH = IO::File->new($outfile,"a") + or die "could not open the file $outfile for writing: $! \n"; + $comment = "# This table contains code points from Table B.1 from RFC 3454\n"; + readPrint($inFH,$outFH,$comment, "B.1"); + close($outFH); +} +#----------------------------------------------------------------------- +sub createCaseMapNorm{ + ($inFH,$outfile) = @_; + $outFH = IO::File->new($outfile,"a") + or die "could not open the file $outfile for writing: $! \n"; + $comment = $warning."# This table contains code points from Table B.2 from RFC 3454\n"; + readPrint($inFH,$outFH,$comment, "B.2"); + close($outFH); +} +#----------------------------------------------------------------------- +sub createCaseMapNoNorm{ + ($inFH,$outfile) = @_; + $outFH = IO::File->new($outfile,"a") + or die "could not open the file $outfile for writing: $! \n"; + $comment = $warning."# This table contains code points from Table B.3 from RFC 3454\n"; + readPrint($inFH,$outFH,$comment, "B.3"); + close($outFH); +} +#----------------------------------------------------------------------- +sub createProhibitedTable{ + ($inFH,$outfile,$line) = @_; + $line =~ s/Start//; + $line =~ s/-//g; + $comment = "# code points from $line"; + + $outFH = IO::File->new($outfile, "a") + or die "could not open the file $outfile for writing: $! \n"; + readPrint($inFH,$outFH,$comment, "C"); + close($outFH); +} + +#----------------------------------------------------------------------- +sub create_iSCSIExtraProhibitedTable{ + ($inFH,$outfile,$line) = @_; + $comment ="# Additional prohibitions from iSCSI profile (rfc3722.txt)\n\n"; + + $outFH = IO::File->new($outfile, "a") + or die "could not open the file $outfile for writing: $! 
\n"; + print $outFH $comment; + print $outFH "0021..002C; ; PROHIBITED\n"; + print $outFH "002F; ; PROHIBITED\n"; + print $outFH "003B..0040; ; PROHIBITED\n"; + print $outFH "005B..0060; ; PROHIBITED\n"; + print $outFH "007B..007E; ; PROHIBITED\n"; + print $outFH "3002; ; PROHIBITED\n"; + print $outFH "\n# Total code points 30\n"; + close($outFH); +} +#----------------------------------------------------------------------- +sub create_XMPPNodeExtraProhibitedTable{ + ($inFH,$outfile,$line) = @_; + $comment ="# Additional prohibitions from XMPP Nodeprep profile (rfc3920.txt)\n\n"; + + $outFH = IO::File->new($outfile, "a") + or die "could not open the file $outfile for writing: $! \n"; + print $outFH $comment; + print $outFH "0022; ; PROHIBITED\n"; + print $outFH "0026; ; PROHIBITED\n"; + print $outFH "0027; ; PROHIBITED\n"; + print $outFH "002F; ; PROHIBITED\n"; + print $outFH "003A; ; PROHIBITED\n"; + print $outFH "003C; ; PROHIBITED\n"; + print $outFH "003E; ; PROHIBITED\n"; + print $outFH "0040; ; PROHIBITED\n"; + print $outFH "\n# Total code points 8\n"; + close($outFH); +} +#----------------------------------------------------------------------- +sub create_SASLMapTable{ + ($inFH,$outfile,$line) = @_; + $comment ="# Map table for SASL profile (rfc4013.txt)\n\n"; + + $outFH = IO::File->new($outfile, "a") + or die "could not open the file $outfile for writing: $! 
\n"; + print $outFH $comment; + # non-ASCII space characters [C.1.2] to SPACE + print $outFH "00A0; 0020; MAP\n"; + print $outFH "1680; 0020; MAP\n"; + print $outFH "2000; 0020; MAP\n"; + print $outFH "2001; 0020; MAP\n"; + print $outFH "2002; 0020; MAP\n"; + print $outFH "2003; 0020; MAP\n"; + print $outFH "2004; 0020; MAP\n"; + print $outFH "2005; 0020; MAP\n"; + print $outFH "2006; 0020; MAP\n"; + print $outFH "2007; 0020; MAP\n"; + print $outFH "2008; 0020; MAP\n"; + print $outFH "2009; 0020; MAP\n"; + print $outFH "200A; 0020; MAP\n"; + print $outFH "200B; 0020; MAP\n"; + print $outFH "202F; 0020; MAP\n"; + print $outFH "205F; 0020; MAP\n"; + print $outFH "3000; 0020; MAP\n"; + + # commonly mapped to nothing characters except U+200B to nothing + print $outFH "00AD; ; MAP\n"; + print $outFH "034F; ; MAP\n"; + print $outFH "1806; ; MAP\n"; + print $outFH "180B; ; MAP\n"; + print $outFH "180C; ; MAP\n"; + print $outFH "180D; ; MAP\n"; + print $outFH "200C; ; MAP\n"; + print $outFH "200D; ; MAP\n"; + print $outFH "2060; ; MAP\n"; + print $outFH "FE00; ; MAP\n"; + print $outFH "FE01; ; MAP\n"; + print $outFH "FE02; ; MAP\n"; + print $outFH "FE03; ; MAP\n"; + print $outFH "FE04; ; MAP\n"; + print $outFH "FE05; ; MAP\n"; + print $outFH "FE06; ; MAP\n"; + print $outFH "FE07; ; MAP\n"; + print $outFH "FE08; ; MAP\n"; + print $outFH "FE09; ; MAP\n"; + print $outFH "FE0A; ; MAP\n"; + print $outFH "FE0B; ; MAP\n"; + print $outFH "FE0C; ; MAP\n"; + print $outFH "FE0D; ; MAP\n"; + print $outFH "FE0E; ; MAP\n"; + print $outFH "FE0F; ; MAP\n"; + print $outFH "FEFF; ; MAP\n"; + print $outFH "\n# Total code points 43\n"; + close($outFH); +} +#----------------------------------------------------------------------- +sub create_LDAPMapTable{ + ($inFH,$outfile,$line) = @_; + $comment ="# Map table for LDAP profile (rfc4518.txt)\n\n"; + + $outFH = IO::File->new($outfile, "a") + or die "could not open the file $outfile for writing: $! 
\n"; + print $outFH $comment; + + # SOFT HYPHEN (U+00AD) and MONGOLIAN TODO SOFT HYPHEN (U+1806) code + # points are mapped to nothing. COMBINING GRAPHEME JOINER (U+034F) and + # VARIATION SELECTORs (U+180B-180D, FF00-FE0F) code points are also + # mapped to nothing. The OBJECT REPLACEMENT CHARACTER (U+FFFC) is + # mapped to nothing. + + print $outFH "00AD; ; MAP\n"; + print $outFH "034F; ; MAP\n"; + print $outFH "1806; ; MAP\n"; + print $outFH "180B; ; MAP\n"; + print $outFH "180C; ; MAP\n"; + print $outFH "180D; ; MAP\n"; + print $outFH "FE00; ; MAP\n"; + print $outFH "FE01; ; MAP\n"; + print $outFH "FE02; ; MAP\n"; + print $outFH "FE03; ; MAP\n"; + print $outFH "FE04; ; MAP\n"; + print $outFH "FE05; ; MAP\n"; + print $outFH "FE06; ; MAP\n"; + print $outFH "FE07; ; MAP\n"; + print $outFH "FE08; ; MAP\n"; + print $outFH "FE09; ; MAP\n"; + print $outFH "FE0A; ; MAP\n"; + print $outFH "FE0B; ; MAP\n"; + print $outFH "FE0C; ; MAP\n"; + print $outFH "FE0D; ; MAP\n"; + print $outFH "FE0E; ; MAP\n"; + print $outFH "FE0F; ; MAP\n"; + print $outFH "FFFC; ; MAP\n"; + +# CHARACTER TABULATION (U+0009), LINE FEED (LF) (U+000A), LINE +# TABULATION (U+000B), FORM FEED (FF) (U+000C), CARRIAGE RETURN (CR) +# (U+000D), and NEXT LINE (NEL) (U+0085) are mapped to SPACE (U+0020). + + print $outFH "0009; 0020; MAP\n"; + print $outFH "000A; 0020; MAP\n"; + print $outFH "000B; 0020; MAP\n"; + print $outFH "000C; 0020; MAP\n"; + print $outFH "000D; 0020; MAP\n"; + print $outFH "0085; 0020; MAP\n"; + + # All other control code (e.g., Cc) points or code points with a + # control function (e.g., Cf) are mapped to nothing. The following is + # a complete list of these code points: U+0000-0008, 000E-001F, 007F- + # 0084, 0086-009F, 06DD, 070F, 180E, 200C-200F, 202A-202E, 2060-2063, + # 206A-206F, FEFF, FFF9-FFFB, 1D173-1D17A, E0001, E0020-E007F. 
+ + print $outFH "0000; ; MAP\n"; + print $outFH "0001; ; MAP\n"; + print $outFH "0002; ; MAP\n"; + print $outFH "0003; ; MAP\n"; + print $outFH "0004; ; MAP\n"; + print $outFH "0005; ; MAP\n"; + print $outFH "0006; ; MAP\n"; + print $outFH "0007; ; MAP\n"; + print $outFH "0008; ; MAP\n"; + print $outFH "000E; ; MAP\n"; + print $outFH "000F; ; MAP\n"; + print $outFH "0010; ; MAP\n"; + print $outFH "0011; ; MAP\n"; + print $outFH "0012; ; MAP\n"; + print $outFH "0013; ; MAP\n"; + print $outFH "0014; ; MAP\n"; + print $outFH "0015; ; MAP\n"; + print $outFH "0016; ; MAP\n"; + print $outFH "0017; ; MAP\n"; + print $outFH "0018; ; MAP\n"; + print $outFH "0019; ; MAP\n"; + print $outFH "001A; ; MAP\n"; + print $outFH "001B; ; MAP\n"; + print $outFH "001C; ; MAP\n"; + print $outFH "001D; ; MAP\n"; + print $outFH "001E; ; MAP\n"; + print $outFH "001F; ; MAP\n"; + print $outFH "007F; ; MAP\n"; + print $outFH "0080; ; MAP\n"; + print $outFH "0081; ; MAP\n"; + print $outFH "0082; ; MAP\n"; + print $outFH "0083; ; MAP\n"; + print $outFH "0084; ; MAP\n"; + print $outFH "0086; ; MAP\n"; + print $outFH "0087; ; MAP\n"; + print $outFH "0088; ; MAP\n"; + print $outFH "0089; ; MAP\n"; + print $outFH "008A; ; MAP\n"; + print $outFH "008B; ; MAP\n"; + print $outFH "008C; ; MAP\n"; + print $outFH "008D; ; MAP\n"; + print $outFH "008E; ; MAP\n"; + print $outFH "008F; ; MAP\n"; + print $outFH "0090; ; MAP\n"; + print $outFH "0091; ; MAP\n"; + print $outFH "0092; ; MAP\n"; + print $outFH "0093; ; MAP\n"; + print $outFH "0094; ; MAP\n"; + print $outFH "0095; ; MAP\n"; + print $outFH "0096; ; MAP\n"; + print $outFH "0097; ; MAP\n"; + print $outFH "0098; ; MAP\n"; + print $outFH "0099; ; MAP\n"; + print $outFH "009A; ; MAP\n"; + print $outFH "009B; ; MAP\n"; + print $outFH "009C; ; MAP\n"; + print $outFH "009D; ; MAP\n"; + print $outFH "009E; ; MAP\n"; + print $outFH "009F; ; MAP\n"; + print $outFH "06DD; ; MAP\n"; + print $outFH "070F; ; MAP\n"; + print $outFH "180E; ; MAP\n"; + print 
$outFH "200C; ; MAP\n"; + print $outFH "200D; ; MAP\n"; + print $outFH "200E; ; MAP\n"; + print $outFH "200F; ; MAP\n"; + print $outFH "202A; ; MAP\n"; + print $outFH "202B; ; MAP\n"; + print $outFH "202C; ; MAP\n"; + print $outFH "202D; ; MAP\n"; + print $outFH "202E; ; MAP\n"; + print $outFH "2060; ; MAP\n"; + print $outFH "2061; ; MAP\n"; + print $outFH "2062; ; MAP\n"; + print $outFH "2063; ; MAP\n"; + print $outFH "206A; ; MAP\n"; + print $outFH "206B; ; MAP\n"; + print $outFH "206C; ; MAP\n"; + print $outFH "206D; ; MAP\n"; + print $outFH "206E; ; MAP\n"; + print $outFH "206F; ; MAP\n"; + print $outFH "FEFF; ; MAP\n"; + print $outFH "FFF9; ; MAP\n"; + print $outFH "FFFA; ; MAP\n"; + print $outFH "FFFB; ; MAP\n"; + print $outFH "1D173; ; MAP\n"; + print $outFH "1D174; ; MAP\n"; + print $outFH "1D175; ; MAP\n"; + print $outFH "1D176; ; MAP\n"; + print $outFH "1D177; ; MAP\n"; + print $outFH "1D178; ; MAP\n"; + print $outFH "1D179; ; MAP\n"; + print $outFH "1D17A; ; MAP\n"; + print $outFH "E0001; ; MAP\n"; + print $outFH "E0020; ; MAP\n"; + print $outFH "E0021; ; MAP\n"; + print $outFH "E0022; ; MAP\n"; + print $outFH "E0023; ; MAP\n"; + print $outFH "E0024; ; MAP\n"; + print $outFH "E0025; ; MAP\n"; + print $outFH "E0026; ; MAP\n"; + print $outFH "E0027; ; MAP\n"; + print $outFH "E0028; ; MAP\n"; + print $outFH "E0029; ; MAP\n"; + print $outFH "E002A; ; MAP\n"; + print $outFH "E002B; ; MAP\n"; + print $outFH "E002C; ; MAP\n"; + print $outFH "E002D; ; MAP\n"; + print $outFH "E002E; ; MAP\n"; + print $outFH "E002F; ; MAP\n"; + print $outFH "E0030; ; MAP\n"; + print $outFH "E0031; ; MAP\n"; + print $outFH "E0032; ; MAP\n"; + print $outFH "E0033; ; MAP\n"; + print $outFH "E0034; ; MAP\n"; + print $outFH "E0035; ; MAP\n"; + print $outFH "E0036; ; MAP\n"; + print $outFH "E0037; ; MAP\n"; + print $outFH "E0038; ; MAP\n"; + print $outFH "E0039; ; MAP\n"; + print $outFH "E003A; ; MAP\n"; + print $outFH "E003B; ; MAP\n"; + print $outFH "E003C; ; MAP\n"; + print $outFH 
"E003D; ; MAP\n"; + print $outFH "E003E; ; MAP\n"; + print $outFH "E003F; ; MAP\n"; + print $outFH "E0040; ; MAP\n"; + print $outFH "E0041; ; MAP\n"; + print $outFH "E0042; ; MAP\n"; + print $outFH "E0043; ; MAP\n"; + print $outFH "E0044; ; MAP\n"; + print $outFH "E0045; ; MAP\n"; + print $outFH "E0046; ; MAP\n"; + print $outFH "E0047; ; MAP\n"; + print $outFH "E0048; ; MAP\n"; + print $outFH "E0049; ; MAP\n"; + print $outFH "E004A; ; MAP\n"; + print $outFH "E004B; ; MAP\n"; + print $outFH "E004C; ; MAP\n"; + print $outFH "E004D; ; MAP\n"; + print $outFH "E004E; ; MAP\n"; + print $outFH "E004F; ; MAP\n"; + print $outFH "E0050; ; MAP\n"; + print $outFH "E0051; ; MAP\n"; + print $outFH "E0052; ; MAP\n"; + print $outFH "E0053; ; MAP\n"; + print $outFH "E0054; ; MAP\n"; + print $outFH "E0055; ; MAP\n"; + print $outFH "E0056; ; MAP\n"; + print $outFH "E0057; ; MAP\n"; + print $outFH "E0058; ; MAP\n"; + print $outFH "E0059; ; MAP\n"; + print $outFH "E005A; ; MAP\n"; + print $outFH "E005B; ; MAP\n"; + print $outFH "E005C; ; MAP\n"; + print $outFH "E005D; ; MAP\n"; + print $outFH "E005E; ; MAP\n"; + print $outFH "E005F; ; MAP\n"; + print $outFH "E0060; ; MAP\n"; + print $outFH "E0061; ; MAP\n"; + print $outFH "E0062; ; MAP\n"; + print $outFH "E0063; ; MAP\n"; + print $outFH "E0064; ; MAP\n"; + print $outFH "E0065; ; MAP\n"; + print $outFH "E0066; ; MAP\n"; + print $outFH "E0067; ; MAP\n"; + print $outFH "E0068; ; MAP\n"; + print $outFH "E0069; ; MAP\n"; + print $outFH "E006A; ; MAP\n"; + print $outFH "E006B; ; MAP\n"; + print $outFH "E006C; ; MAP\n"; + print $outFH "E006D; ; MAP\n"; + print $outFH "E006E; ; MAP\n"; + print $outFH "E006F; ; MAP\n"; + print $outFH "E0070; ; MAP\n"; + print $outFH "E0071; ; MAP\n"; + print $outFH "E0072; ; MAP\n"; + print $outFH "E0073; ; MAP\n"; + print $outFH "E0074; ; MAP\n"; + print $outFH "E0075; ; MAP\n"; + print $outFH "E0076; ; MAP\n"; + print $outFH "E0077; ; MAP\n"; + print $outFH "E0078; ; MAP\n"; + print $outFH "E0079; ; MAP\n"; + 
print $outFH "E007A; ; MAP\n"; + print $outFH "E007B; ; MAP\n"; + print $outFH "E007C; ; MAP\n"; + print $outFH "E007D; ; MAP\n"; + print $outFH "E007E; ; MAP\n"; + print $outFH "E007F; ; MAP\n"; + + # ZERO WIDTH SPACE (U+200B) is mapped to nothing. All other code + # points with Separator (space, line, or paragraph) property (e.g., Zs, + # Zl, or Zp) are mapped to SPACE (U+0020). The following is a complete + # list of these code points: U+0020, 00A0, 1680, 2000-200A, 2028-2029, + # 202F, 205F, 3000. + + print $outFH "200B; ; MAP\n"; + print $outFH "00A0; 0020; MAP\n"; + print $outFH "1680; 0020; MAP\n"; + print $outFH "2000; 0020; MAP\n"; + print $outFH "2001; 0020; MAP\n"; + print $outFH "2002; 0020; MAP\n"; + print $outFH "2003; 0020; MAP\n"; + print $outFH "2004; 0020; MAP\n"; + print $outFH "2005; 0020; MAP\n"; + print $outFH "2006; 0020; MAP\n"; + print $outFH "2007; 0020; MAP\n"; + print $outFH "2008; 0020; MAP\n"; + print $outFH "2009; 0020; MAP\n"; + print $outFH "200A; 0020; MAP\n"; + print $outFH "2028; 0020; MAP\n"; + print $outFH "2029; 0020; MAP\n"; + print $outFH "202F; 0020; MAP\n"; + print $outFH "205F; 0020; MAP\n"; + print $outFH "3000; 0020; MAP\n"; + + print $outFH "\n# Total code points 238\n"; + close($outFH); +} +#----------------------------------------------------------------------- +sub usage { + print << "END"; +Usage: +filterRFC3454.pl +Options: + --sourcedir=<directory> + --destdir=<directory> + --src-filename=<name of RFC file> + --dest-filename=<name of destination file> + --A1 Generate data for table A.1 + --B1 Generate data for table B.1 + --B2 Generate data for table B.2 + --B3 Generate data for table B.3 + --C11 Generate data for table C.1.1 + --C12 Generate data for table C.1.2 + --C21 Generate data for table C.2.1 + --C22 Generate data for table C.2.2 + --C3 Generate data for table C.3 + --C4 Generate data for table C.4 + --C5 Generate data for table C.5 + --C6 Generate data for table C.6 + --C7 Generate data for table C.7 + 
--C8 Generate data for table C.8 + --C9 Generate data for table C.9 + --iscsi Generate data for iSCSI extra prohibited table + --xmpp-node Generate data for XMPP extra prohibited table + --sasl Generate data for SASL map table + --ldap Generate data for LDAP map table + --normalize Embed the normalization directive in the output file + --check-bidi Embed the check bidi directove in the output file + +Note, --B2 and --B3 are mutually exclusive. + +e.g.: filterRFC3454.pl --sourcedir=. --destdir=./output --src-filename=rfc3454.txt --dest-filename=NamePrepProfile.txt --A1 --B1 --B2 --C12 --C22 --C3 --C4 --C5 --C6 --C7 --C8 --C9 --normalize --check-bidi + +filterRFC3454.pl filters the RFC file and creates String prep table files. +The RFC text can be downloaded from ftp://ftp.rfc-editor.org/in-notes/rfc3454.txt + +END + exit(0); +} + + diff --git a/intl/icu/source/tools/gensprep/gensprep.8.in b/intl/icu/source/tools/gensprep/gensprep.8.in new file mode 100644 index 0000000000..e1e9fb32e2 --- /dev/null +++ b/intl/icu/source/tools/gensprep/gensprep.8.in @@ -0,0 +1,104 @@ +.\" Hey, Emacs! This is -*-nroff-*- you know... +.\" +.\" gensprep.8: manual page for the gensprep utility +.\" +.\" Copyright (C) 2016 and later: Unicode, Inc. and others. +.\" License & terms of use: http://www.unicode.org/copyright.html +.\" Copyright (C) 2003 IBM, Inc. and others. +.\" +.TH gensprep 8 "18 March 2003" "ICU MANPAGE" "ICU @VERSION@ Manual" +.SH NAME +.B gensprep +\- compile StringPrep data from files filtered by filterRFC3454.pl +.SH SYNOPSIS +.B gensprep +[ +.BR "\-h\fP, \fB\-?\fP, \fB\-\-help" +] +[ +.BR "\-v\fP, \fB\-\-verbose" +] +[ +.BI "\-c\fP, \fB\-\-copyright" +] +[ +.BI "\-s\fP, \fB\-\-sourcedir" " source" +] +[ +.BI "\-d\fP, \fB\-\-destdir" " destination" +] +.SH DESCRIPTION +.B gensprep +reads filtered RFC 3454 files and compiles their +information into a binary form. 
+The resulting file, +.BR <name>.icu , +can then be read directly by ICU, or used by +.BR pkgdata (8) +for incorporation into a larger archive or library. +.LP +The files read by +.B gensprep +are described in the +.B FILES +section. +.SH OPTIONS +.TP +.BR "\-h\fP, \fB\-?\fP, \fB\-\-help" +Print help about usage and exit. +.TP +.BR "\-v\fP, \fB\-\-verbose" +Display extra informative messages during execution. +.TP +.BI "\-c\fP, \fB\-\-copyright" +Include a copyright notice into the binary data. +.TP +.BI "\-s\fP, \fB\-\-sourcedir" " source" +Set the source directory to +.IR source . +The default source directory is specified by the environment variable +.BR ICU_DATA . +.TP +.BI "\-d\fP, \fB\-\-destdir" " destination" +Set the destination directory to +.IR destination . +The default destination directory is specified by the environment variable +.BR ICU_DATA . +.SH ENVIRONMENT +.TP 10 +.B ICU_DATA +Specifies the directory containing ICU data. Defaults to +.BR @thepkgicudatadir@/@PACKAGE@/@VERSION@/ . +Some tools in ICU depend on the presence of the trailing slash. It is thus +important to make sure that it is present if +.B ICU_DATA +is set. +.SH FILES +The following files are read by +.B gensprep +and are looked for in the +.I source +/misc for rfc3454_*.txt files and in +.I source +/unidata for NormalizationCorrections.txt. +.TP 20 +.B rfc3454_A_1.txt +Contains the list of unassigned codepoints in Unicode version 3.2.0.\|.\|.. +.TP +.B rfc3454_B_1.txt +Contains the list of code points that are commonly mapped to nothing.\|.\|.. +.TP +.B rfc3454_B_2.txt +Contains the list of mappings for casefolding of code points when Normalization form NFKC is specified.\|.\|.. +.TP +.B rfc3454_C_X.txt +Contains the list of code points that are prohibited for IDNA. +.TP +.B NormalizationCorrections.txt +Contains the list of code points whose normalization has changed since Unicode Version 3.2.0. +.SH VERSION +@VERSION@ +.SH COPYRIGHT +Copyright (C) 2000-2002 IBM, Inc. and others. 
+.SH SEE ALSO +.BR pkgdata (8) diff --git a/intl/icu/source/tools/gensprep/gensprep.c b/intl/icu/source/tools/gensprep/gensprep.c new file mode 100644 index 0000000000..10b0e45390 --- /dev/null +++ b/intl/icu/source/tools/gensprep/gensprep.c @@ -0,0 +1,460 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2003-2016, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: gensprep.c +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2003-02-06 +* created by: Ram Viswanadha +* +* This program reads the Profile.txt files, +* parses them, and extracts the data for StringPrep profile. +* It then preprocesses it and writes a binary file for efficient use +* in various StringPrep conversion processes. 
+*/ + +#define USPREP_TYPE_NAMES_ARRAY 1 + +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> + +#include "cmemory.h" +#include "cstring.h" +#include "toolutil.h" +#include "unewdata.h" +#include "uoptions.h" +#include "uparse.h" +#include "sprpimpl.h" + +#include "unicode/uclean.h" +#include "unicode/udata.h" +#include "unicode/utypes.h" +#include "unicode/putil.h" + + +U_CDECL_BEGIN +#include "gensprep.h" +U_CDECL_END + +UBool beVerbose=false, haveCopyright=true; + +#define NORM_CORRECTIONS_FILE_NAME "NormalizationCorrections.txt" + +#define NORMALIZE_DIRECTIVE "normalize" +#define NORMALIZE_DIRECTIVE_LEN 9 +#define CHECK_BIDI_DIRECTIVE "check-bidi" +#define CHECK_BIDI_DIRECTIVE_LEN 10 + +/* prototypes --------------------------------------------------------------- */ + +static void +parseMappings(const char *filename, UBool reportError, UErrorCode *pErrorCode); + +static void +parseNormalizationCorrections(const char *filename, UErrorCode *pErrorCode); + + +/* -------------------------------------------------------------------------- */ + +static UOption options[]={ + UOPTION_HELP_H, + UOPTION_HELP_QUESTION_MARK, + UOPTION_VERBOSE, + UOPTION_COPYRIGHT, + UOPTION_DESTDIR, + UOPTION_SOURCEDIR, + UOPTION_ICUDATADIR, + UOPTION_BUNDLE_NAME, + { "normalization", NULL, NULL, NULL, 'n', UOPT_REQUIRES_ARG, 0 }, + { "norm-correction", NULL, NULL, NULL, 'm', UOPT_REQUIRES_ARG, 0 }, + { "check-bidi", NULL, NULL, NULL, 'k', UOPT_NO_ARG, 0}, + { "unicode", NULL, NULL, NULL, 'u', UOPT_REQUIRES_ARG, 0 }, +}; + +enum{ + HELP, + HELP_QUESTION_MARK, + VERBOSE, + COPYRIGHT, + DESTDIR, + SOURCEDIR, + ICUDATADIR, + BUNDLE_NAME, + NORMALIZE, + NORM_CORRECTION_DIR, + CHECK_BIDI, + UNICODE_VERSION +}; + +static int printHelp(int argc, char* argv[]){ + /* + * Broken into chucks because the C89 standard says the minimum + * required supported string length is 509 bytes. 
+ */ + fprintf(stderr, + "Usage: %s [-options] [file_name]\n" + "\n" + "Read the files specified and\n" + "create a binary file [package-name]_[bundle-name]." DATA_TYPE " with the StringPrep profile data\n" + "\n", + argv[0]); + fprintf(stderr, + "Options:\n" + "\t-h or -? or --help print this usage text\n" + "\t-v or --verbose verbose output\n" + "\t-c or --copyright include a copyright notice\n"); + fprintf(stderr, + "\t-d or --destdir destination directory, followed by the path\n" + "\t-s or --sourcedir source directory of ICU data, followed by the path\n" + "\t-b or --bundle-name generate the output data file with the name specified\n" + "\t-i or --icudatadir directory for locating any needed intermediate data files,\n" + "\t followed by path, defaults to %s\n", + u_getDataDirectory()); + fprintf(stderr, + "\t-n or --normalize turn on the option for normalization and include mappings\n" + "\t from NormalizationCorrections.txt from the given path,\n" + "\t e.g: /test/icu/source/data/unidata\n"); + fprintf(stderr, + "\t-m or --norm-correction use NormalizationCorrections.txt from the given path\n" + "\t when the input file contains a normalization directive.\n" + "\t unlike -n/--normalize, this option does not force the\n" + "\t normalization.\n"); + fprintf(stderr, + "\t-k or --check-bidi turn on the option for checking for BiDi in the profile\n" + "\t-u or --unicode version of Unicode to be used with this profile followed by the version\n" + ); + return argc<0 ? 
U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; +} + + +extern int +main(int argc, char* argv[]) { +#if !UCONFIG_NO_IDNA + char* filename = NULL; +#endif + const char *srcDir=NULL, *destDir=NULL, *icuUniDataDir=NULL; + const char *bundleName=NULL, *inputFileName = NULL; + char *basename=NULL; + int32_t sprepOptions = 0; + + UErrorCode errorCode=U_ZERO_ERROR; + + U_MAIN_INIT_ARGS(argc, argv); + + /* preset then read command line options */ + options[DESTDIR].value=u_getDataDirectory(); + options[SOURCEDIR].value=""; + options[UNICODE_VERSION].value="0"; /* don't assume the unicode version */ + options[BUNDLE_NAME].value = DATA_NAME; + options[NORMALIZE].value = ""; + + argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options); + + /* error handling, printing usage message */ + if(argc<0) { + fprintf(stderr, + "error in command line argument \"%s\"\n", + argv[-argc]); + } + if(argc<0 || options[HELP].doesOccur || options[HELP_QUESTION_MARK].doesOccur) { + return printHelp(argc, argv); + + } + + /* get the options values */ + beVerbose=options[VERBOSE].doesOccur; + haveCopyright=options[COPYRIGHT].doesOccur; + srcDir=options[SOURCEDIR].value; + destDir=options[DESTDIR].value; + bundleName = options[BUNDLE_NAME].value; + if(options[NORMALIZE].doesOccur) { + icuUniDataDir = options[NORMALIZE].value; + } else { + icuUniDataDir = options[NORM_CORRECTION_DIR].value; + } + + if(argc<2) { + /* print the help message */ + return printHelp(argc, argv); + } else { + inputFileName = argv[1]; + } + if(!options[UNICODE_VERSION].doesOccur){ + return printHelp(argc, argv); + } + if(options[ICUDATADIR].doesOccur) { + u_setDataDirectory(options[ICUDATADIR].value); + } +#if UCONFIG_NO_IDNA + + fprintf(stderr, + "gensprep writes dummy " U_ICUDATA_NAME "_" DATA_NAME "." 
DATA_TYPE + " because UCONFIG_NO_IDNA is set, \n" + "see icu/source/common/unicode/uconfig.h\n"); + generateData(destDir, bundleName); + +#else + + setUnicodeVersion(options[UNICODE_VERSION].value); + filename = (char* ) uprv_malloc(uprv_strlen(srcDir) + uprv_strlen(inputFileName) + (icuUniDataDir == NULL ? 0 : uprv_strlen(icuUniDataDir)) + 40); /* hopefully this should be enough */ + + /* prepare the filename beginning with the source dir */ + if(uprv_strchr(srcDir,U_FILE_SEP_CHAR) == NULL && uprv_strchr(srcDir,U_FILE_ALT_SEP_CHAR) == NULL){ + filename[0] = '.'; + filename[1] = U_FILE_SEP_CHAR; + uprv_strcpy(filename+2,srcDir); + }else{ + uprv_strcpy(filename, srcDir); + } + + basename=filename+uprv_strlen(filename); + if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) { + *basename++=U_FILE_SEP_CHAR; + } + + /* initialize */ + init(); + + /* process the file */ + uprv_strcpy(basename,inputFileName); + parseMappings(filename,false, &errorCode); + if(U_FAILURE(errorCode)) { + fprintf(stderr, "Could not open file %s for reading. 
Error: %s \n", filename, u_errorName(errorCode)); + return errorCode; + } + + if(options[NORMALIZE].doesOccur){ /* this option might be set by @normalize;; in the source file */ + /* set up directory for NormalizationCorrections.txt */ + uprv_strcpy(filename,icuUniDataDir); + basename=filename+uprv_strlen(filename); + if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) { + *basename++=U_FILE_SEP_CHAR; + } + + *basename++=U_FILE_SEP_CHAR; + uprv_strcpy(basename,NORM_CORRECTIONS_FILE_NAME); + + parseNormalizationCorrections(filename,&errorCode); + if(U_FAILURE(errorCode)){ + fprintf(stderr,"Could not open file %s for reading \n", filename); + return errorCode; + } + sprepOptions |= _SPREP_NORMALIZATION_ON; + } + + if(options[CHECK_BIDI].doesOccur){ /* this option might be set by @check-bidi;; in the source file */ + sprepOptions |= _SPREP_CHECK_BIDI_ON; + } + + setOptions(sprepOptions); + + /* process parsed data */ + if(U_SUCCESS(errorCode)) { + /* write the data file */ + generateData(destDir, bundleName); + + cleanUpData(); + } + + uprv_free(filename); + + u_cleanup(); + +#endif + + return errorCode; +} + +#if !UCONFIG_NO_IDNA + +static void U_CALLCONV +normalizationCorrectionsLineFn(void *context, + char *fields[][2], int32_t fieldCount, + UErrorCode *pErrorCode) { + (void)context; // suppress compiler warnings about unused variable + (void)fieldCount; // suppress compiler warnings about unused variable + uint32_t mapping[40]; + char *end, *s; + uint32_t code; + int32_t length; + UVersionInfo version; + UVersionInfo thisVersion; + + /* get the character code, field 0 */ + code=(uint32_t)uprv_strtoul(fields[0][0], &end, 16); + if(U_FAILURE(*pErrorCode)) { + fprintf(stderr, "gensprep: error parsing NormalizationCorrections.txt mapping at %s\n", fields[0][0]); + exit(*pErrorCode); + } + /* Original (erroneous) decomposition */ + s = fields[1][0]; + + /* parse the mapping string */ + length=u_parseCodePoints(s, mapping, sizeof(mapping)/4, pErrorCode); + + /* 
ignore corrected decomposition */ + + u_versionFromString(version,fields[3][0] ); + u_versionFromString(thisVersion, "3.2.0"); + + + + if(U_FAILURE(*pErrorCode)) { + fprintf(stderr, "gensprep error parsing NormalizationCorrections.txt of U+%04lx - %s\n", + (long)code, u_errorName(*pErrorCode)); + exit(*pErrorCode); + } + + /* store the mapping */ + if( version[0] > thisVersion[0] || + ((version[0]==thisVersion[0]) && (version[1] > thisVersion[1])) + ){ + storeMapping(code,mapping, length, USPREP_MAP, pErrorCode); + } + setUnicodeVersionNC(version); +} + +static void +parseNormalizationCorrections(const char *filename, UErrorCode *pErrorCode) { + char *fields[4][2]; + + if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { + return; + } + + u_parseDelimitedFile(filename, ';', fields, 4, normalizationCorrectionsLineFn, NULL, pErrorCode); + + /* fprintf(stdout,"Number of code points that have NormalizationCorrections mapping with length >1 : %i\n",len); */ + + if(U_FAILURE(*pErrorCode) && ( *pErrorCode!=U_FILE_ACCESS_ERROR)) { + fprintf(stderr, "gensprep error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode)); + exit(*pErrorCode); + } +} + +static void U_CALLCONV +strprepProfileLineFn(void *context, + char *fields[][2], int32_t fieldCount, + UErrorCode *pErrorCode) { + (void)fieldCount; // suppress compiler warnings about unused variable + uint32_t mapping[40]; + char *end, *map; + uint32_t code; + int32_t length; + /*UBool* mapWithNorm = (UBool*) context;*/ + const char* typeName; + uint32_t rangeStart=0,rangeEnd =0; + const char* filename = (const char*) context; + const char *s; + + s = u_skipWhitespace(fields[0][0]); + if (*s == '@') { + /* special directive */ + s++; + length = (int32_t)(fields[0][1] - s); + if (length >= NORMALIZE_DIRECTIVE_LEN + && uprv_strncmp(s, NORMALIZE_DIRECTIVE, NORMALIZE_DIRECTIVE_LEN) == 0) { + options[NORMALIZE].doesOccur = true; + return; + } + else if (length >= CHECK_BIDI_DIRECTIVE_LEN + && uprv_strncmp(s, 
CHECK_BIDI_DIRECTIVE, CHECK_BIDI_DIRECTIVE_LEN) == 0) { + options[CHECK_BIDI].doesOccur = true; + return; + } + else { + fprintf(stderr, "gensprep error parsing a directive %s.", fields[0][0]); + } + } + + typeName = fields[2][0]; + map = fields[1][0]; + + if(uprv_strstr(typeName, usprepTypeNames[USPREP_UNASSIGNED])!=NULL){ + + u_parseCodePointRange(s, &rangeStart,&rangeEnd, pErrorCode); + if(U_FAILURE(*pErrorCode)){ + fprintf(stderr, "Could not parse code point range. Error: %s\n",u_errorName(*pErrorCode)); + return; + } + + /* store the range */ + storeRange(rangeStart,rangeEnd,USPREP_UNASSIGNED, pErrorCode); + + }else if(uprv_strstr(typeName, usprepTypeNames[USPREP_PROHIBITED])!=NULL){ + + u_parseCodePointRange(s, &rangeStart,&rangeEnd, pErrorCode); + if(U_FAILURE(*pErrorCode)){ + fprintf(stderr, "Could not parse code point range. Error: %s\n",u_errorName(*pErrorCode)); + return; + } + + /* store the range */ + storeRange(rangeStart,rangeEnd,USPREP_PROHIBITED, pErrorCode); + + }else if(uprv_strstr(typeName, usprepTypeNames[USPREP_MAP])!=NULL){ + + /* get the character code, field 0 */ + code=(uint32_t)uprv_strtoul(s, &end, 16); + if(end<=s || end!=fields[0][1]) { + fprintf(stderr, "gensprep: syntax error in field 0 at %s\n", fields[0][0]); + *pErrorCode=U_PARSE_ERROR; + exit(U_PARSE_ERROR); + } + + /* parse the mapping string */ + length=u_parseCodePoints(map, mapping, sizeof(mapping)/4, pErrorCode); + + /* store the mapping */ + storeMapping(code,mapping, length,USPREP_MAP, pErrorCode); + + }else{ + *pErrorCode = U_INVALID_FORMAT_ERROR; + } + + if(U_FAILURE(*pErrorCode)) { + fprintf(stderr, "gensprep error parsing %s line %s at %s. 
Error: %s\n",filename, + fields[0][0],fields[2][0],u_errorName(*pErrorCode)); + exit(*pErrorCode); + } + +} + +static void +parseMappings(const char *filename, UBool reportError, UErrorCode *pErrorCode) { + char *fields[3][2]; + + if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { + return; + } + + u_parseDelimitedFile(filename, ';', fields, 3, strprepProfileLineFn, (void*)filename, pErrorCode); + + /*fprintf(stdout,"Number of code points that have mappings with length >1 : %i\n",len);*/ + + if(U_FAILURE(*pErrorCode) && (reportError || *pErrorCode!=U_FILE_ACCESS_ERROR)) { + fprintf(stderr, "gensprep error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode)); + exit(*pErrorCode); + } +} + + +#endif /* #if !UCONFIG_NO_IDNA */ + +/* + * Hey, Emacs, please set the following: + * + * Local Variables: + * indent-tabs-mode: nil + * End: + * + */ diff --git a/intl/icu/source/tools/gensprep/gensprep.h b/intl/icu/source/tools/gensprep/gensprep.h new file mode 100644 index 0000000000..a2e5e61f9a --- /dev/null +++ b/intl/icu/source/tools/gensprep/gensprep.h @@ -0,0 +1,83 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 1999-2006, International Business Machines +* Corporation and others. All Rights Reserved. 
+* +******************************************************************************* +* file name: gensprep.h +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2003-02-06 +* created by: Ram Viswanadha +*/ + +#ifndef __GENIDN_H__ +#define __GENIDN_H__ + +#include "unicode/utypes.h" +#include "sprpimpl.h" + +/* file definitions */ +#define DATA_NAME "sprep" +#define DATA_TYPE "spp" + +/* + * data structure that holds the IDN properties for one or more + * code point(s) at build time + */ + + +/* global flags */ +extern UBool beVerbose, haveCopyright; + +/* prototypes */ + +extern void +setUnicodeVersion(const char *v); + +extern void +setUnicodeVersionNC(UVersionInfo version); + +extern void +init(void); + +#if !UCONFIG_NO_IDNA +extern void +storeMapping(uint32_t codepoint, uint32_t* mapping,int32_t length, UStringPrepType type, UErrorCode* status); +extern void +storeRange(uint32_t start, uint32_t end, UStringPrepType type,UErrorCode* status); +#endif + +extern void +generateData(const char *dataDir, const char* bundleName); + +extern void +setOptions(int32_t options); + +extern void +cleanUpData(void); + +/* +extern void +storeIDN(uint32_t code, IDN *idn); + +extern void +processData(void); + + +*/ +#endif + +/* + * Hey, Emacs, please set the following: + * + * Local Variables: + * indent-tabs-mode: nil + * End: + * + */ diff --git a/intl/icu/source/tools/gensprep/gensprep.vcxproj b/intl/icu/source/tools/gensprep/gensprep.vcxproj new file mode 100644 index 0000000000..c6f7bbd861 --- /dev/null +++ b/intl/icu/source/tools/gensprep/gensprep.vcxproj @@ -0,0 +1,84 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <PropertyGroup Label="Globals"> + <ProjectGuid>{631C23CE-6C1D-4875-88F0-85E0A42B36EA}</ProjectGuid> + </PropertyGroup> + <PropertyGroup Label="Configuration"> + <ConfigurationType>Application</ConfigurationType> + 
<UseOfMfc>false</UseOfMfc> + <CharacterSet>MultiByte</CharacterSet> + </PropertyGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> + <!-- The following import will include the 'default' configuration options for VS projects. --> + <Import Project="..\..\allinone\Build.Windows.ProjectConfiguration.props" /> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> + <ImportGroup Label="ExtensionSettings"> + </ImportGroup> + <PropertyGroup Label="UserMacros" /> + <PropertyGroup> + <_ProjectFileVersion>10.0.30319.1</_ProjectFileVersion> + <OutDir>.\$(Platform)\$(Configuration)\</OutDir> + <IntDir>.\$(Platform)\$(Configuration)\</IntDir> + <!-- The ICU projects use "Win32" to mean "x86", so we need to special case it. --> + <OutDir Condition="'$(Platform)'=='Win32'">.\x86\$(Configuration)\</OutDir> + <IntDir Condition="'$(Platform)'=='Win32'">.\x86\$(Configuration)\</IntDir> + <!-- Disable Incremental Linking for Release builds as it prevents Link-time Code Generation --> + <LinkIncremental Condition="'$(Configuration)'=='Debug'">true</LinkIncremental> + <LinkIncremental Condition="'$(Configuration)'=='Release'">false</LinkIncremental> + </PropertyGroup> + <!-- Options that are common to *all* configurations --> + <ItemDefinitionGroup> + <Midl> + <TypeLibraryName>$(OutDir)\gensprep.tlb</TypeLibraryName> + </Midl> + <ClCompile> + <WarningLevel>Level3</WarningLevel> + <CompileAs>Default</CompileAs> + <DisableLanguageExtensions>false</DisableLanguageExtensions> + <AdditionalIncludeDirectories>..\..\common;..\toolutil;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> + <PrecompiledHeaderOutputFile>$(OutDir)\gensprep.pch</PrecompiledHeaderOutputFile> + <AssemblerListingLocation>$(OutDir)/</AssemblerListingLocation> + <ObjectFileName>$(OutDir)/</ObjectFileName> + <ProgramDataBaseFileName>$(OutDir)\gensprep.pdb</ProgramDataBaseFileName> + </ClCompile> + <Link> + <SubSystem>Console</SubSystem> + 
<OutputFile>$(OutDir)\gensprep.exe</OutputFile> + <AdditionalLibraryDirectories>..\..\..\$(IcuLibOutputDir);%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories> + </Link> + <CustomBuildStep> + <Command>copy "$(TargetPath)" ..\..\..\$(IcuBinOutputDir)</Command> + <Outputs>..\..\..\$(IcuBinOutputDir)\$(TargetFileName);%(Outputs)</Outputs> + </CustomBuildStep> + </ItemDefinitionGroup> + <!-- Options that are common to all 'Debug' project configurations --> + <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'"> + <ClCompile> + <BrowseInformation>true</BrowseInformation> + <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary> + </ClCompile> + <Link> + <AdditionalDependencies>icuucd.lib;icutud.lib;%(AdditionalDependencies)</AdditionalDependencies> + </Link> + </ItemDefinitionGroup> + <!-- Options that are common to all 'Release' project configurations --> + <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'"> + <ClCompile> + <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary> + <FunctionLevelLinking>true</FunctionLevelLinking> + </ClCompile> + <Link> + <AdditionalDependencies>icuuc.lib;icutu.lib;%(AdditionalDependencies)</AdditionalDependencies> + </Link> + </ItemDefinitionGroup> + <ItemGroup> + <ClCompile Include="gensprep.c" /> + <ClCompile Include="store.c" /> + </ItemGroup> + <ItemGroup> + <ClInclude Include="gensprep.h" /> + </ItemGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> + <ImportGroup Label="ExtensionTargets"> + </ImportGroup> +</Project>
\ No newline at end of file diff --git a/intl/icu/source/tools/gensprep/gensprep.vcxproj.filters b/intl/icu/source/tools/gensprep/gensprep.vcxproj.filters new file mode 100644 index 0000000000..2791b3aa6a --- /dev/null +++ b/intl/icu/source/tools/gensprep/gensprep.vcxproj.filters @@ -0,0 +1,30 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <ItemGroup> + <Filter Include="Source Files"> + <UniqueIdentifier>{bb521e6b-d70a-4efd-9399-408729059da6}</UniqueIdentifier> + <Extensions>cpp;c;cxx;rc;def;r;odl;idl;hpj;bat</Extensions> + </Filter> + <Filter Include="Header Files"> + <UniqueIdentifier>{837c7f4e-341d-4455-aa1e-f6ff7a03b065}</UniqueIdentifier> + <Extensions>h;hpp;hxx;hm;inl</Extensions> + </Filter> + <Filter Include="Resource Files"> + <UniqueIdentifier>{a80f327a-7fb8-4737-8bd9-0f4b26c2c344}</UniqueIdentifier> + <Extensions>ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe</Extensions> + </Filter> + </ItemGroup> + <ItemGroup> + <ClCompile Include="gensprep.c"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="store.c"> + <Filter>Source Files</Filter> + </ClCompile> + </ItemGroup> + <ItemGroup> + <ClInclude Include="gensprep.h"> + <Filter>Header Files</Filter> + </ClInclude> + </ItemGroup> +</Project>
\ No newline at end of file diff --git a/intl/icu/source/tools/gensprep/sources.txt b/intl/icu/source/tools/gensprep/sources.txt new file mode 100644 index 0000000000..c369456cb3 --- /dev/null +++ b/intl/icu/source/tools/gensprep/sources.txt @@ -0,0 +1,2 @@ +gensprep.c +store.c diff --git a/intl/icu/source/tools/gensprep/store.c b/intl/icu/source/tools/gensprep/store.c new file mode 100644 index 0000000000..c3712febb4 --- /dev/null +++ b/intl/icu/source/tools/gensprep/store.c @@ -0,0 +1,653 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 1999-2014, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: store.c +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2003-02-06 +* created by: Ram Viswanadha +* +*/ + +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include "unicode/utypes.h" +#include "cmemory.h" +#include "cstring.h" +#include "filestrm.h" +#include "toolutil.h" +#include "unicode/udata.h" +#include "unicode/utf16.h" +#include "utrie.h" +#include "unewdata.h" +#include "gensprep.h" +#include "uhash.h" + + +#define DO_DEBUG_OUT 0 + + +/* + * StringPrep profile file format ------------------------------------ + * + * The file format prepared and written here contains a 16-bit trie and a mapping table. + * + * Before the data contents described below, there are the headers required by + * the udata API for loading ICU data. Especially, a UDataInfo structure + * precedes the actual data. It contains platform properties values and the + * file format version. + * + * The following is a description of format version 2. 
+ * + * Data contents: + * + * The contents are a parsed, binary form of RFC3454 and possibly + * NormalizationCorrections.txt depending on the options specified on the profile. + * + * Any Unicode code point from 0 to 0x10ffff can be looked up to get + * the trie-word, if any, for that code point. This means that the inputs + * to the lookup are 21-bit unsigned integers, with not all of the + * 21-bit range used. + * + * *.spp files customarily begin with a UDataInfo structure, see udata.h and .c. + * After that there are the following structures: + * + * int32_t indexes[_SPREP_INDEX_TOP]; -- _SPREP_INDEX_TOP=16, see enum in sprpimpl.h file + * + * UTrie stringPrepTrie; -- size in bytes=indexes[_SPREP_INDEX_TRIE_SIZE] + * + * uint16_t mappingTable[]; -- Contains the sequence of code units that the code point maps to + * size in bytes = indexes[_SPREP_INDEX_MAPPING_DATA_SIZE] + * + * The indexes array contains the following values: + * indexes[_SPREP_INDEX_TRIE_SIZE] -- The size of the StringPrep trie in bytes + * indexes[_SPREP_INDEX_MAPPING_DATA_SIZE] -- The size of the mappingTable in bytes + * indexes[_SPREP_NORM_CORRECTNS_LAST_UNI_VERSION] -- The index of Unicode version of last entry in NormalizationCorrections.txt + * indexes[_SPREP_ONE_UCHAR_MAPPING_INDEX_START] -- The starting index of 1 UChar mapping index in the mapping table + * indexes[_SPREP_TWO_UCHARS_MAPPING_INDEX_START] -- The starting index of 2 UChars mapping index in the mapping table + * indexes[_SPREP_THREE_UCHARS_MAPPING_INDEX_START] -- The starting index of 3 UChars mapping index in the mapping table + * indexes[_SPREP_FOUR_UCHARS_MAPPING_INDEX_START] -- The starting index of 4 UChars mapping index in the mapping table + * indexes[_SPREP_OPTIONS] -- Bit set of options to turn on in the profile, e.g: USPREP_NORMALIZATION_ON, USPREP_CHECK_BIDI_ON + * + * + * StringPrep Trie : + * + * The StringPrep trie is a 16-bit trie that contains data for the profile. 
+ * Each code point is associated with a value (trie-word) in the trie. + * + * - structure of data words from the trie + * + * i) A value greater than or equal to _SPREP_TYPE_THRESHOLD (0xFFF0) + * represents the type associated with the code point + * if(trieWord >= _SPREP_TYPE_THRESHOLD){ + * type = trieWord - 0xFFF0; + * } + * The type can be : + * USPREP_UNASSIGNED + * USPREP_PROHIBITED + * USPREP_DELETE + * + * ii) A value less than _SPREP_TYPE_THRESHOLD means the type is USPREP_MAP and + * contains the bit distribution described below + * + * 0 - ON : The code point is prohibited (USPREP_PROHIBITED). This is to allow for code points that are both prohibited and mapped. + * 1 - ON : The value in the next 14 bits is an index into the mapping table + * OFF: The value in the next 14 bits is a delta value from the code point + * 2..15 - Contains data as described by bit 1. If all bits are set + * (value = _SPREP_MAX_INDEX_VALUE) then the type is USPREP_DELETE + * + * + * Mapping Table: + * The data in mapping table is sorted according to the length of the mapping sequence. 
+ * If the type of the code point is USPREP_MAP and value in trie word is an index, the index + * is compared with start indexes of sequence length start to figure out the length according to + * the following algorithm: + * + * if( index >= indexes[_SPREP_ONE_UCHAR_MAPPING_INDEX_START] && + * index < indexes[_SPREP_TWO_UCHARS_MAPPING_INDEX_START]){ + * length = 1; + * }else if(index >= indexes[_SPREP_TWO_UCHARS_MAPPING_INDEX_START] && + * index < indexes[_SPREP_THREE_UCHARS_MAPPING_INDEX_START]){ + * length = 2; + * }else if(index >= indexes[_SPREP_THREE_UCHARS_MAPPING_INDEX_START] && + * index < indexes[_SPREP_FOUR_UCHARS_MAPPING_INDEX_START]){ + * length = 3; + * }else{ + * // The first position in the mapping table contains the length + * // of the sequence + * length = mappingTable[index++]; + * + * } + * + */ + +/* file data ---------------------------------------------------------------- */ +/* indexes[] value names */ + +#if UCONFIG_NO_IDNA + +/* dummy UDataInfo cf. udata.h */ +static UDataInfo dataInfo = { + sizeof(UDataInfo), + 0, + + U_IS_BIG_ENDIAN, + U_CHARSET_FAMILY, + U_SIZEOF_UCHAR, + 0, + + { 0, 0, 0, 0 }, /* dummy dataFormat */ + { 0, 0, 0, 0 }, /* dummy formatVersion */ + { 0, 0, 0, 0 } /* dummy dataVersion */ +}; + +#else + +static int32_t indexes[_SPREP_INDEX_TOP]={ 0 }; + +static uint16_t* mappingData= NULL; +static int32_t mappingDataCapacity = 0; /* we skip the first index in mapping data */ +static int16_t currentIndex = 0; /* the current index into the data trie */ +static int32_t maxLength = 0; /* maximum length of mapping string */ + + +/* UDataInfo cf. 
udata.h */ +static UDataInfo dataInfo={ + sizeof(UDataInfo), + 0, + + U_IS_BIG_ENDIAN, + U_CHARSET_FAMILY, + U_SIZEOF_UCHAR, + 0, + + { 0x53, 0x50, 0x52, 0x50 }, /* dataFormat="SPRP" */ + { 3, 2, UTRIE_SHIFT, UTRIE_INDEX_SHIFT }, /* formatVersion */ + { 3, 2, 0, 0 } /* dataVersion (Unicode version) */ +}; +void +setUnicodeVersion(const char *v) { + UVersionInfo version; + u_versionFromString(version, v); + uprv_memcpy(dataInfo.dataVersion, version, 4); +} + +void +setUnicodeVersionNC(UVersionInfo version){ + uint32_t univer = version[0] << 24; + univer += version[1] << 16; + univer += version[2] << 8; + univer += version[3]; + indexes[_SPREP_NORM_CORRECTNS_LAST_UNI_VERSION] = univer; +} +static UNewTrie *sprepTrie; + +#define MAX_DATA_LENGTH 11500 + + +#define SPREP_DELTA_RANGE_POSITIVE_LIMIT 8191 +#define SPREP_DELTA_RANGE_NEGATIVE_LIMIT -8192 + + +extern void +init() { + + sprepTrie = (UNewTrie *)uprv_calloc(1, sizeof(UNewTrie)); + + /* initialize the two tries */ + if(NULL==utrie_open(sprepTrie, NULL, MAX_DATA_LENGTH, 0, 0, false)) { + fprintf(stderr, "error: failed to initialize tries\n"); + exit(U_MEMORY_ALLOCATION_ERROR); + } +} + +static UHashtable* hashTable = NULL; + + +typedef struct ValueStruct { + UChar* mapping; + int16_t length; + UStringPrepType type; +} ValueStruct; + +/* Callback for deleting the value from the hashtable */ +static void U_CALLCONV valueDeleter(void* obj){ + ValueStruct* value = (ValueStruct*) obj; + uprv_free(value->mapping); + uprv_free(value); +} + +/* Callback for hashing the entry */ +static int32_t U_CALLCONV hashEntry(const UHashTok parm) { + return parm.integer; +} + +/* Callback for comparing two entries */ +static UBool U_CALLCONV compareEntries(const UHashTok p1, const UHashTok p2) { + return (UBool)(p1.integer != p2.integer); +} + + +static void +storeMappingData(void){ + + int32_t pos = UHASH_FIRST; + const UHashElement* element = NULL; + ValueStruct* value = NULL; + int32_t codepoint = 0; + int32_t elementCount = 0; + 
int32_t writtenElementCount = 0; + int32_t mappingLength = 1; /* minimum mapping length */ + int32_t oldMappingLength = 0; + uint16_t trieWord =0; + int32_t limitIndex = 0; + + if (hashTable == NULL) { + return; + } + elementCount = uhash_count(hashTable); + + /*initialize the mapping data */ + mappingData = (uint16_t*) uprv_calloc(mappingDataCapacity, U_SIZEOF_UCHAR); + + while(writtenElementCount < elementCount){ + + while( (element = uhash_nextElement(hashTable, &pos))!=NULL){ + + codepoint = element->key.integer; + value = (ValueStruct*)element->value.pointer; + + /* store the start of indexes */ + if(oldMappingLength != mappingLength){ + /* Assume that index[] is used according to the enums defined */ + if(oldMappingLength <=_SPREP_MAX_INDEX_TOP_LENGTH){ + indexes[_SPREP_NORM_CORRECTNS_LAST_UNI_VERSION+mappingLength] = currentIndex; + } + if(oldMappingLength <= _SPREP_MAX_INDEX_TOP_LENGTH && + mappingLength == _SPREP_MAX_INDEX_TOP_LENGTH +1){ + + limitIndex = currentIndex; + + } + oldMappingLength = mappingLength; + } + + if(value->length == mappingLength){ + uint32_t savedTrieWord = 0; + trieWord = currentIndex << 2; + /* turn on the 2nd bit to signal that the following bits contain an index */ + trieWord += 0x02; + + if(trieWord > _SPREP_TYPE_THRESHOLD){ + fprintf(stderr,"trieWord cannot contain value greater than 0x%04X.\n",_SPREP_TYPE_THRESHOLD); + exit(U_ILLEGAL_CHAR_FOUND); + } + /* figure out if the code point has type already stored */ + savedTrieWord= utrie_get32(sprepTrie,codepoint,NULL); + if(savedTrieWord!=0){ + if((savedTrieWord- _SPREP_TYPE_THRESHOLD) == USPREP_PROHIBITED){ + /* turn on the first bit in trie word */ + trieWord += 0x01; + }else{ + /* + * the codepoint has value something other than prohibited + * and a mapping .. error! 
+ */ + fprintf(stderr,"Type for codepoint \\U%08X already set!.\n", (int)codepoint); + exit(U_ILLEGAL_ARGUMENT_ERROR); + } + } + + /* now set the value in the trie */ + if(!utrie_set32(sprepTrie,codepoint,trieWord)){ + fprintf(stderr,"Could not set the value for code point.\n"); + exit(U_ILLEGAL_ARGUMENT_ERROR); + } + + /* written the trie word for the codepoint... increment the count*/ + writtenElementCount++; + + /* sanity check are we exceeding the max number allowed */ + if(currentIndex+value->length+1 > _SPREP_MAX_INDEX_VALUE){ + fprintf(stderr, "Too many entries in the mapping table %i. Maximum allowed is %i\n", + currentIndex+value->length, _SPREP_MAX_INDEX_VALUE); + exit(U_INDEX_OUTOFBOUNDS_ERROR); + } + + /* copy the mapping data */ + /* write the length */ + if(mappingLength > _SPREP_MAX_INDEX_TOP_LENGTH ){ + /* the cast here is safe since we donot expect the length to be > 65535 */ + mappingData[currentIndex++] = (uint16_t) mappingLength; + } + /* copy the contents to mappindData array */ + u_memmove(mappingData+currentIndex, value->mapping, value->length); + currentIndex += value->length; + if (currentIndex > mappingDataCapacity) { + /* If this happens there is a bug in the computation of the mapping data size in storeMapping() */ + fprintf(stderr, "gensprep, fatal error at %s, %d. 
Aborting.\n", __FILE__, __LINE__); + exit(U_INTERNAL_PROGRAM_ERROR); + } + } + } + mappingLength++; + pos = -1; + } + /* set the last length for range check */ + if(mappingLength <= _SPREP_MAX_INDEX_TOP_LENGTH){ + indexes[_SPREP_NORM_CORRECTNS_LAST_UNI_VERSION+mappingLength] = currentIndex+1; + }else{ + indexes[_SPREP_FOUR_UCHARS_MAPPING_INDEX_START] = limitIndex; + } + +} + +extern void setOptions(int32_t options){ + indexes[_SPREP_OPTIONS] = options; +} +extern void +storeMapping(uint32_t codepoint, uint32_t* mapping,int32_t length, + UStringPrepType type, UErrorCode* status){ + + + UChar* map = NULL; + int16_t adjustedLen=0, i, j; + uint16_t trieWord = 0; + ValueStruct *value = NULL; + uint32_t savedTrieWord = 0; + + /* initialize the hashtable */ + if(hashTable==NULL){ + hashTable = uhash_open(hashEntry, compareEntries, NULL, status); + uhash_setValueDeleter(hashTable, valueDeleter); + } + + /* figure out if the code point has type already stored */ + savedTrieWord= utrie_get32(sprepTrie,codepoint,NULL); + if(savedTrieWord!=0){ + if((savedTrieWord- _SPREP_TYPE_THRESHOLD) == USPREP_PROHIBITED){ + /* turn on the first bit in trie word */ + trieWord += 0x01; + }else{ + /* + * the codepoint has value something other than prohibited + * and a mapping .. error! 
+ */ + fprintf(stderr,"Type for codepoint \\U%08X already set!.\n", (int)codepoint); + exit(U_ILLEGAL_ARGUMENT_ERROR); + } + } + + /* figure out the real length */ + for(i=0; i<length; i++){ + adjustedLen += U16_LENGTH(mapping[i]); + } + + if(adjustedLen == 0){ + trieWord = (uint16_t)(_SPREP_MAX_INDEX_VALUE << 2); + /* make sure that the value of trieWord is less than the threshold */ + if(trieWord < _SPREP_TYPE_THRESHOLD){ + /* now set the value in the trie */ + if(!utrie_set32(sprepTrie,codepoint,trieWord)){ + fprintf(stderr,"Could not set the value for code point.\n"); + exit(U_ILLEGAL_ARGUMENT_ERROR); + } + /* value is set so just return */ + return; + }else{ + fprintf(stderr,"trieWord cannot contain value greater than threshold 0x%04X.\n",_SPREP_TYPE_THRESHOLD); + exit(U_ILLEGAL_CHAR_FOUND); + } + } + + if(adjustedLen == 1){ + /* calculate the delta */ + int16_t delta = (int16_t)((int32_t)codepoint - (int16_t) mapping[0]); + if(delta >= SPREP_DELTA_RANGE_NEGATIVE_LIMIT && delta <= SPREP_DELTA_RANGE_POSITIVE_LIMIT){ + + trieWord = delta; + trieWord <<= 2; + + + /* make sure that the second bit is OFF */ + if((trieWord & 0x02) != 0 ){ + fprintf(stderr,"The second bit in the trie word is not zero while storing a delta.\n"); + exit(U_INTERNAL_PROGRAM_ERROR); + } + /* make sure that the value of trieWord is less than the threshold */ + if(trieWord < _SPREP_TYPE_THRESHOLD){ + /* now set the value in the trie */ + if(!utrie_set32(sprepTrie,codepoint,trieWord)){ + fprintf(stderr,"Could not set the value for code point.\n"); + exit(U_ILLEGAL_ARGUMENT_ERROR); + } + /* value is set so just return */ + return; + } + } + /* + * if the delta is not in the given range or if the trieWord is larger than the threshold + * just fall through for storing the mapping in the mapping table + */ + } + + map = (UChar*) uprv_calloc(adjustedLen + 1, U_SIZEOF_UCHAR); + + for (i=0, j=0; i<length; i++) { + U16_APPEND_UNSAFE(map, j, mapping[i]); + } + + value = (ValueStruct*) 
uprv_malloc(sizeof(ValueStruct)); + value->mapping = map; + value->type = type; + value->length = adjustedLen; + if(value->length > _SPREP_MAX_INDEX_TOP_LENGTH){ + mappingDataCapacity++; + } + if(maxLength < value->length){ + maxLength = value->length; + } + uhash_iput(hashTable,codepoint,value,status); + mappingDataCapacity += adjustedLen; + + if(U_FAILURE(*status)){ + fprintf(stderr, "Failed to put entries into the hash table. Error: %s\n", u_errorName(*status)); + exit(*status); + } +} + + +extern void +storeRange(uint32_t start, uint32_t end, UStringPrepType type, UErrorCode* status){ + (void)status; // suppress compiler warnings about unused variable + uint16_t trieWord = 0; + + if((int)(_SPREP_TYPE_THRESHOLD + type) > 0xFFFF){ + fprintf(stderr,"trieWord cannot contain value greater than 0xFFFF.\n"); + exit(U_ILLEGAL_CHAR_FOUND); + } + trieWord = (_SPREP_TYPE_THRESHOLD + type); /* the top 4 bits contain the value */ + if(start == end){ + uint32_t savedTrieWord = utrie_get32(sprepTrie, start, NULL); + if(savedTrieWord>0){ + if(savedTrieWord < _SPREP_TYPE_THRESHOLD && type == USPREP_PROHIBITED){ + /* + * A mapping is stored in the trie word + * and the only other possible type that a + * code point can have is USPREP_PROHIBITED + * + */ + + /* turn on the 0th bit in the savedTrieWord */ + savedTrieWord += 0x01; + + /* the downcast is safe since we only save 16 bit values */ + trieWord = (uint16_t)savedTrieWord; + + /* make sure that the value of trieWord is less than the threshold */ + if(trieWord < _SPREP_TYPE_THRESHOLD){ + /* now set the value in the trie */ + if(!utrie_set32(sprepTrie,start,trieWord)){ + fprintf(stderr,"Could not set the value for code point.\n"); + exit(U_ILLEGAL_ARGUMENT_ERROR); + } + /* value is set so just return */ + return; + }else{ + fprintf(stderr,"trieWord cannot contain value greater than threshold 0x%04X.\n",_SPREP_TYPE_THRESHOLD); + exit(U_ILLEGAL_CHAR_FOUND); + } + + }else if(savedTrieWord != trieWord){ + fprintf(stderr,"Value 
for codepoint \\U%08X already set!.\n", (int)start); + exit(U_ILLEGAL_ARGUMENT_ERROR); + } + /* if savedTrieWord == trieWord .. fall through and set the value */ + } + if(!utrie_set32(sprepTrie,start,trieWord)){ + fprintf(stderr,"Could not set the value for code point \\U%08X.\n", (int)start); + exit(U_ILLEGAL_ARGUMENT_ERROR); + } + }else{ + if(!utrie_setRange32(sprepTrie, start, end+1, trieWord, false)){ + fprintf(stderr,"Value for certain codepoint already set.\n"); + exit(U_ILLEGAL_CHAR_FOUND); + } + } + +} + +/* folding value: just store the offset (16 bits) if there is any non-0 entry */ +static uint32_t U_CALLCONV +getFoldedValue(UNewTrie *trie, UChar32 start, int32_t offset) { + uint32_t value; + UChar32 limit=0; + UBool inBlockZero; + + limit=start+0x400; + while(start<limit) { + value=utrie_get32(trie, start, &inBlockZero); + if(inBlockZero) { + start+=UTRIE_DATA_BLOCK_LENGTH; + } else if(value!=0) { + return (uint32_t)offset; + } else { + ++start; + } + } + return 0; + +} + +#endif /* #if !UCONFIG_NO_IDNA */ + +extern void +generateData(const char *dataDir, const char* bundleName) { + static uint8_t sprepTrieBlock[100000]; + + UNewDataMemory *pData; + UErrorCode errorCode=U_ZERO_ERROR; + int32_t size, dataLength; + char* fileName = (char*) uprv_malloc(uprv_strlen(bundleName) +100); + +#if UCONFIG_NO_IDNA + + size=0; + +#else + + int32_t sprepTrieSize; + + /* sort and add mapping data */ + storeMappingData(); + + sprepTrieSize=utrie_serialize(sprepTrie, sprepTrieBlock, sizeof(sprepTrieBlock), getFoldedValue, true, &errorCode); + if(U_FAILURE(errorCode)) { + fprintf(stderr, "error: utrie_serialize(sprep trie) failed, %s\n", u_errorName(errorCode)); + exit(errorCode); + } + + size = sprepTrieSize + mappingDataCapacity*U_SIZEOF_UCHAR + sizeof(indexes); + if(beVerbose) { + printf("size of sprep trie %5u bytes\n", (int)sprepTrieSize); + printf("size of " U_ICUDATA_NAME "_%s." 
DATA_TYPE " contents: %ld bytes\n", bundleName,(long)size); + printf("size of mapping data array %5u bytes\n",(int)mappingDataCapacity * U_SIZEOF_UCHAR); + printf("Number of code units in mappingData (currentIndex) are: %i \n", currentIndex); + printf("Maximum length of the mapping string is : %i \n", (int)maxLength); + } + +#endif + + fileName[0]=0; + uprv_strcat(fileName,bundleName); + /* write the data */ + pData=udata_create(dataDir, DATA_TYPE, fileName, &dataInfo, + haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode); + if(U_FAILURE(errorCode)) { + fprintf(stderr, "gensprep: unable to create the output file, error %d\n", errorCode); + exit(errorCode); + } + +#if !UCONFIG_NO_IDNA + + indexes[_SPREP_INDEX_TRIE_SIZE]=sprepTrieSize; + indexes[_SPREP_INDEX_MAPPING_DATA_SIZE]=mappingDataCapacity*U_SIZEOF_UCHAR; + + udata_writeBlock(pData, indexes, sizeof(indexes)); + udata_writeBlock(pData, sprepTrieBlock, sprepTrieSize); + udata_writeBlock(pData, mappingData, indexes[_SPREP_INDEX_MAPPING_DATA_SIZE]); + + +#endif + + /* finish up */ + dataLength=udata_finish(pData, &errorCode); + if(U_FAILURE(errorCode)) { + fprintf(stderr, "gensprep: error %d writing the output file\n", errorCode); + exit(errorCode); + } + + if(dataLength!=size) { + fprintf(stderr, "gensprep error: data length %ld != calculated size %ld\n", + (long)dataLength, (long)size); + exit(U_INTERNAL_PROGRAM_ERROR); + } + +#if !UCONFIG_NO_IDNA + /* done with writing the data .. 
close the hashtable */ + if (hashTable != NULL) { + uhash_close(hashTable); + } +#endif + + uprv_free(fileName); +} + +#if !UCONFIG_NO_IDNA + +extern void +cleanUpData(void) { + uprv_free(mappingData); + utrie_close(sprepTrie); + uprv_free(sprepTrie); +} + +#endif /* #if !UCONFIG_NO_IDNA */ + +/* + * Hey, Emacs, please set the following: + * + * Local Variables: + * indent-tabs-mode: nil + * End: + * + */ diff --git a/intl/icu/source/tools/gentest/Makefile.in b/intl/icu/source/tools/gentest/Makefile.in new file mode 100644 index 0000000000..4aba2b6edd --- /dev/null +++ b/intl/icu/source/tools/gentest/Makefile.in @@ -0,0 +1,79 @@ +## Makefile.in for ICU - tools/gentest +## Copyright (C) 2016 and later: Unicode, Inc. and others. +## License & terms of use: http://www.unicode.org/copyright.html +## Copyright (c) 1999-2011, International Business Machines Corporation and +## others. All Rights Reserved. +## Madhu Katragadda + +## Source directory information +srcdir = @srcdir@ +top_srcdir = @top_srcdir@ + +top_builddir = ../.. 
+ +include $(top_builddir)/icudefs.mk + +## Build directory information +subdir = tools/gentest + +## Extra files to remove for 'make clean' +CLEANFILES = *~ $(DEPS) + +## Target information +TARGET = gentest$(EXEEXT) + +CPPFLAGS += -I$(top_srcdir)/common -I$(srcdir)/../toolutil -I$(top_srcdir)/tools/ctestfw +CPPFLAGS+= -I$(top_srcdir)/i18n +LIBS = $(LIBCTESTFW) $(LIBICUTOOLUTIL) $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS) $(LIB_M) + +SOURCES = $(shell cat $(srcdir)/sources.txt) +OBJECTS = $(SOURCES:.c=.o) + +DEPS = $(OBJECTS:.o=.d) + +## List of phony targets +.PHONY : all all-local install install-local clean clean-local \ +distclean distclean-local dist dist-local check check-local + +## Clear suffix list +.SUFFIXES : + +## List of standard targets +all: all-local +install: install-local +clean: clean-local +distclean : distclean-local +dist: dist-local +check: all check-local + +all-local: $(TARGET) + +install-local: all-local + +dist-local: + +clean-local: + test -z "$(CLEANFILES)" || $(RMV) $(CLEANFILES) + $(RMV) $(TARGET) $(OBJECTS) + +distclean-local: clean-local + $(RMV) Makefile + +check-local: all-local + +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + cd $(top_builddir) \ + && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status + +$(TARGET) : $(OBJECTS) + $(LINK.cc) $(OUTOPT)$@ $^ $(LIBS) + $(POST_BUILD_STEP) + +ifeq (,$(MAKECMDGOALS)) +-include $(DEPS) +else +ifneq ($(patsubst %clean,,$(MAKECMDGOALS)),) +-include $(DEPS) +endif +endif + diff --git a/intl/icu/source/tools/gentest/genres32.c b/intl/icu/source/tools/gentest/genres32.c new file mode 100644 index 0000000000..64171559e3 --- /dev/null +++ b/intl/icu/source/tools/gentest/genres32.c @@ -0,0 +1,104 @@ +// © 2016 and later: Unicode, Inc. and others. 
+// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2003-2006, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: genres32.c +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2003sep10 +* created by: Markus W. Scherer +* +* Write an ICU resource bundle with a table whose +* number of key characters and number of items both exceed 64k. +* Writing it as the root table tests also that +* the new table type is recognized for the root resource by the reader code. +*/ +#include <stdio.h> +#include "unicode/putil.h" +#include "cstring.h" +#include "gentest.h" + +static void +incKey(char *key, char *limit) { + char c; + + while(limit>key) { + c=*--limit; + if(c=='o') { + *limit='1'; + break; + } else { + *limit='o'; + } + } +} + +U_CFUNC int +genres32(const char *prog, const char *path) { + /* + * key string, gets incremented binary numbers + * letter 'o'=0 and digit '1'=1 so that data swapping can be tested + * with reordering (ASCII: '1'<'o' EBCDIC: '1'>'o') + * + * need 17 digits for >64k unique items + */ + char key[20]="ooooooooooooooooo"; + char *limit; + int i; + char file[512]; + FILE *out; + + uprv_strcpy(file,path); + if(file[strlen(file)-1]!=U_FILE_SEP_CHAR) { + uprv_strcat(file,U_FILE_SEP_STRING); + } + uprv_strcat(file,"testtable32.txt"); + out = fopen(file, "w"); + /*puts(file);*/ + puts("Generating testtable32.txt"); + if(out == NULL) { + fprintf(stderr, "%s: Couldn't create resource test file %s\n", + prog, file); + return 1; + } + + /* find the limit of the key string */ + for(limit=key; *limit!=0; ++limit) { + } + + /* output the beginning of the bundle */ + fputs( + "testtable32 {", out + ); + + /* output the table entries */ + for(i=0; i<66000; ++i) { + if(i%10==0) { + /* + * every 
10th entry contains a string with + * the entry index as its code point + */ + fprintf(out, "%s{\"\\U%08x\"}\n", key, i); + } else { + /* other entries contain their index as an integer */ + fprintf(out, "%s:int{%d}\n", key, i); + } + + incKey(key, limit); + } + + /* output the end of the bundle */ + fputs( + "}", out + ); + + fclose(out); + return 0; +} diff --git a/intl/icu/source/tools/gentest/gentest.c b/intl/icu/source/tools/gentest/gentest.c new file mode 100644 index 0000000000..77076e9369 --- /dev/null +++ b/intl/icu/source/tools/gentest/gentest.c @@ -0,0 +1,229 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 1999-2016, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: gentest.c +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2000mar03 +* created by: Madhu Katragadda +* +* This program writes a little data file for testing the udata API. +*/ + +#include <stdio.h> +#include <stdlib.h> +#include "unicode/utypes.h" +#include "unicode/putil.h" +#include "unicode/uclean.h" +#include "unicode/udata.h" +#include "udbgutil.h" +#include "unewdata.h" +#include "cmemory.h" +#include "cstring.h" +#include "uoptions.h" +#include "gentest.h" + +#define DATA_NAME "test" +#define DATA_TYPE "icu" + +/* UDataInfo cf. 
udata.h */ +static const UDataInfo dataInfo={ + sizeof(UDataInfo), + 0, + + U_IS_BIG_ENDIAN, + U_CHARSET_FAMILY, + sizeof(UChar), + 0, + + {0x54, 0x65, 0x73, 0x74}, /* dataFormat="Test" */ + {1, 0, 0, 0}, /* formatVersion */ + {1, 0, 0, 0} /* dataVersion */ +}; + +static void createData(const char*, UErrorCode *); + +static int outputJavaStuff(const char * progname, const char *outputDir); + +static UOption options[]={ + /*0*/ UOPTION_HELP_H, + /*1*/ UOPTION_HELP_QUESTION_MARK, + /*2*/ UOPTION_DESTDIR, + /*3*/ UOPTION_DEF("genres", 'r', UOPT_NO_ARG), + /*4*/ UOPTION_DEF("javastuff", 'j', UOPT_NO_ARG), +}; + +extern int +main(int argc, char* argv[]) { + UErrorCode errorCode = U_ZERO_ERROR; + + /* preset then read command line options */ + options[2].value=u_getDataDirectory(); + argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options); + + /* error handling, printing usage message */ + if(argc<0) { + fprintf(stderr, + "error in command line argument \"%s\"\n", + argv[-argc]); + } + if(argc<0 || options[0].doesOccur || options[1].doesOccur) { + fprintf(stderr, + "usage: %s [-options]\n" + "\tcreate the test file " DATA_NAME "." DATA_TYPE " unless the -r option is given.\n" + "\toptions:\n" + "\t\t-h or -? or --help this usage text\n" + "\t\t-d or --destdir destination directory, followed by the path\n" + "\t\t-r or --genres generate resource file testtable32.txt instead of UData test \n" + "\t\t-j or --javastuff generate Java source for DebugUtilities. \n", + argv[0]); + return argc<0 ? 
U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; + } + + if( options[4].doesOccur ) { + return outputJavaStuff( argv[0], options[2].value ); + } else if ( options[3].doesOccur ) { + return genres32( argv[0], options[2].value ); + } else { + /* printf("Generating the test memory mapped file\n"); */ + createData(options[2].value, &errorCode); + } + return U_FAILURE(errorCode); +} + +/* Create data file ----------------------------------------------------- */ +static void +createData(const char* outputDirectory, UErrorCode *errorCode) { + UNewDataMemory *pData; + char stringValue[]={'Y', 'E', 'A', 'R', '\0'}; + uint16_t intValue=2000; + + long dataLength; + uint32_t size; + + pData=udata_create(outputDirectory, DATA_TYPE, DATA_NAME, &dataInfo, + U_COPYRIGHT_STRING, errorCode); + if(U_FAILURE(*errorCode)) { + fprintf(stderr, "gentest: unable to create data memory, %s\n", u_errorName(*errorCode)); + exit(*errorCode); + } + + /* write the data to the file */ + /* a 16 bit value and a String*/ + udata_write16(pData, intValue); + udata_writeString(pData, stringValue, sizeof(stringValue)); + + /* finish up */ + dataLength=udata_finish(pData, errorCode); + if(U_FAILURE(*errorCode)) { + fprintf(stderr, "gentest: error %d writing the output file\n", *errorCode); + exit(*errorCode); + } + size=sizeof(stringValue) + sizeof(intValue); + + + if(dataLength!=(long)size) { + fprintf(stderr, "gentest: data length %ld != calculated size %lu\n", + dataLength, (unsigned long)size); + exit(U_INTERNAL_PROGRAM_ERROR); + } +} + +/* Create Java file ----------------------------------------------------- */ + +static int +outputJavaStuff(const char* progname, const char *outputDir) { + int32_t i,t,count; + char file[512]; + FILE *out; + + uprv_strcpy(file,outputDir); + if(*outputDir && /* don't put a trailing slash if outputDir is empty */ + file[strlen(file)-1]!=U_FILE_SEP_CHAR) { + uprv_strcat(file,U_FILE_SEP_STRING); + } + uprv_strcat(file,"DebugUtilitiesData.java"); + out = fopen(file, "w"); + 
/*puts(file);*/ + printf("%s: Generating %s\n", progname, file); + if(out == NULL) { + fprintf(stderr, "%s: Couldn't create resource test file %s\n", + progname, file); + return 1; + } + + fprintf(out, "// Copyright (C) 2016 and later: Unicode, Inc. and others.\n"); + fprintf(out, "// License & terms of use: http://www.unicode.org/copyright.html\n\n"); + fprintf(out, "/** Copyright (C) 2007-2016, International Business Machines Corporation and Others. All Rights Reserved. **/\n\n"); + fprintf(out, "/* NOTE: this file is AUTOMATICALLY GENERATED by gentest.\n" + " * See: {ICU4C}/source/data/icu4j-readme.txt for more information. \n" + " **/\n\n"); + fprintf(out, "package com.ibm.icu.dev.test.util;\n\n"); + fprintf(out, "public class DebugUtilitiesData extends Object {\n"); + fprintf(out, " public static final String ICU4C_VERSION=\"%s\";\n", U_ICU_VERSION); + for(t=0;t<UDBG_ENUM_COUNT;t++) { + fprintf(out, " public static final int %s = %d;\n", udbg_enumName(UDBG_UDebugEnumType,t), t); + } + fprintf(out, " public static final String [] TYPES = { \n"); + for(t=0;t<UDBG_ENUM_COUNT;t++) { + fprintf(out, " \"%s\", /* %d */\n", udbg_enumName(UDBG_UDebugEnumType,t), t); + } + fprintf(out, " };\n\n"); + + fprintf(out, " public static final String [][] NAMES = { \n"); + for(t=0;t<UDBG_ENUM_COUNT;t++) { + count = udbg_enumCount((UDebugEnumType)t); + fprintf(out, " /* %s, %d */\n", udbg_enumName(UDBG_UDebugEnumType,t), t); + fprintf(out, " { \n"); + for(i=0;i<count;i++) { + fprintf(out, + " \"%s\", /* %d */ \n", udbg_enumName((UDebugEnumType)t,i), i); + } + fprintf(out, " },\n"); + } + fprintf(out, " };\n\n"); + + fprintf(out, " public static final int [][] VALUES = { \n"); + for(t=0;t<UDBG_ENUM_COUNT;t++) { + count = udbg_enumCount((UDebugEnumType)t); + fprintf(out, " /* %s, %d */\n", udbg_enumName(UDBG_UDebugEnumType,t), t); + fprintf(out, " { \n"); + for(i=0;i<count;i++) { + fprintf(out, + " "); + switch(t) { +#if !UCONFIG_NO_FORMATTING + case UDBG_UCalendarDateFields: + 
case UDBG_UCalendarMonths: + /* Temporary workaround for IS_LEAP_MONTH #6051 */ + if (t == UDBG_UCalendarDateFields && i == 22) { + fprintf(out, "com.ibm.icu.util.ChineseCalendar.%s, /* %d */", udbg_enumName((UDebugEnumType)t,i), i); + } else { + fprintf(out, "com.ibm.icu.util.Calendar.%s, /* %d */", udbg_enumName((UDebugEnumType)t,i), i); + } + break; +#endif + case UDBG_UDebugEnumType: + default: + fprintf(out,"%d, /* %s */", i, udbg_enumName((UDebugEnumType)t,i)); + } + fprintf(out,"\n"); + } + fprintf(out, " },\n"); + } + fprintf(out, " };\n\n"); + fprintf(out, "}\n"); + + fclose(out); + + return 0; + +} diff --git a/intl/icu/source/tools/gentest/gentest.h b/intl/icu/source/tools/gentest/gentest.h new file mode 100644 index 0000000000..adb08c7ebc --- /dev/null +++ b/intl/icu/source/tools/gentest/gentest.h @@ -0,0 +1,16 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2003-2004, International Business Machines +* Corporation and others. All Rights Reserved. 
+* +******************************************************************************* +*/ + +#include "unicode/utypes.h" + +U_CFUNC int genres32(const char *prog, const char *path); + + diff --git a/intl/icu/source/tools/gentest/gentest.vcxproj b/intl/icu/source/tools/gentest/gentest.vcxproj new file mode 100644 index 0000000000..79155c11aa --- /dev/null +++ b/intl/icu/source/tools/gentest/gentest.vcxproj @@ -0,0 +1,82 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <PropertyGroup Label="Globals"> + <ProjectGuid>{77C78066-746F-4EA6-B3FE-B8C8A4A97891}</ProjectGuid> + <RootNamespace>gentest</RootNamespace> + </PropertyGroup> + <PropertyGroup Label="Configuration"> + <ConfigurationType>Application</ConfigurationType> + <UseOfMfc>false</UseOfMfc> + <CharacterSet>MultiByte</CharacterSet> + </PropertyGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> + <!-- The following import will include the 'default' configuration options for VS projects. --> + <Import Project="..\..\allinone\Build.Windows.ProjectConfiguration.props" /> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> + <ImportGroup Label="ExtensionSettings"> + </ImportGroup> + <PropertyGroup Label="UserMacros" /> + <PropertyGroup> + <_ProjectFileVersion>10.0.30319.1</_ProjectFileVersion> + <OutDir>.\$(Platform)\$(Configuration)\</OutDir> + <IntDir>.\$(Platform)\$(Configuration)\</IntDir> + <!-- The ICU projects use "Win32" to mean "x86", so we need to special case it. 
--> + <OutDir Condition="'$(Platform)'=='Win32'">.\x86\$(Configuration)\</OutDir> + <IntDir Condition="'$(Platform)'=='Win32'">.\x86\$(Configuration)\</IntDir> + <!-- Disable Incremental Linking for Release builds as it prevents Link-time Code Generation --> + <LinkIncremental Condition="'$(Configuration)'=='Debug'">true</LinkIncremental> + <LinkIncremental Condition="'$(Configuration)'=='Release'">false</LinkIncremental> + </PropertyGroup> + <!-- Options that are common to *all* configurations --> + <ItemDefinitionGroup> + <Midl> + <TypeLibraryName>$(OutDir)/gentest.tlb</TypeLibraryName> + </Midl> + <ClCompile> + <WarningLevel>Level3</WarningLevel> + <CompileAs>Default</CompileAs> + <DisableLanguageExtensions>true</DisableLanguageExtensions> + <AdditionalIncludeDirectories>..\..\..\include;..\..\common;..\toolutil;..\ctestfw;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> + <PrecompiledHeaderOutputFile>$(OutDir)/gentest.pch</PrecompiledHeaderOutputFile> + <AssemblerListingLocation>$(OutDir)/</AssemblerListingLocation> + <ObjectFileName>$(OutDir)/</ObjectFileName> + <ProgramDataBaseFileName>$(OutDir)/gentest.pdb</ProgramDataBaseFileName> + </ClCompile> + <Link> + <SubSystem>Console</SubSystem> + <OutputFile>$(OutDir)/gentest.exe</OutputFile> + <AdditionalLibraryDirectories>..\..\..\$(IcuLibOutputDir);%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories> + <ProgramDatabaseFile>$(OutDir)/gentest.pdb</ProgramDatabaseFile> + </Link> + </ItemDefinitionGroup> + <!-- Options that are common to all 'Debug' project configurations --> + <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'"> + <ClCompile> + <BrowseInformation>true</BrowseInformation> + <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary> + </ClCompile> + <Link> + <AdditionalDependencies>icuucd.lib;icutestd.lib;icutud.lib;%(AdditionalDependencies)</AdditionalDependencies> + </Link> + </ItemDefinitionGroup> + <!-- Options that are common to all 'Release' project configurations 
--> + <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'"> + <ClCompile> + <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary> + <FunctionLevelLinking>true</FunctionLevelLinking> + </ClCompile> + <Link> + <AdditionalDependencies>icuuc.lib;icutest.lib;icutu.lib;%(AdditionalDependencies)</AdditionalDependencies> + </Link> + </ItemDefinitionGroup> + <ItemGroup> + <ClCompile Include="genres32.c" /> + <ClCompile Include="gentest.c" /> + </ItemGroup> + <ItemGroup> + <ClInclude Include="gentest.h" /> + </ItemGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> + <ImportGroup Label="ExtensionTargets"> + </ImportGroup> +</Project>
\ No newline at end of file diff --git a/intl/icu/source/tools/gentest/gentest.vcxproj.filters b/intl/icu/source/tools/gentest/gentest.vcxproj.filters new file mode 100644 index 0000000000..8d6187a6a6 --- /dev/null +++ b/intl/icu/source/tools/gentest/gentest.vcxproj.filters @@ -0,0 +1,30 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <ItemGroup> + <Filter Include="Source Files"> + <UniqueIdentifier>{e447c064-3b41-421f-8e6f-ecf661554c49}</UniqueIdentifier> + <Extensions>cpp;c;cxx;rc;def;r;odl;idl;hpj;bat</Extensions> + </Filter> + <Filter Include="Header Files"> + <UniqueIdentifier>{1a629c8b-9a21-4677-969f-6b262e4d56d4}</UniqueIdentifier> + <Extensions>h;hpp;hxx;hm;inl</Extensions> + </Filter> + <Filter Include="Resource Files"> + <UniqueIdentifier>{f452beb5-882e-4c16-a7a6-479e858063d0}</UniqueIdentifier> + <Extensions>ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe</Extensions> + </Filter> + </ItemGroup> + <ItemGroup> + <ClCompile Include="genres32.c"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="gentest.c"> + <Filter>Source Files</Filter> + </ClCompile> + </ItemGroup> + <ItemGroup> + <ClInclude Include="gentest.h"> + <Filter>Header Files</Filter> + </ClInclude> + </ItemGroup> +</Project>
\ No newline at end of file diff --git a/intl/icu/source/tools/gentest/sources.txt b/intl/icu/source/tools/gentest/sources.txt new file mode 100644 index 0000000000..b6810849d0 --- /dev/null +++ b/intl/icu/source/tools/gentest/sources.txt @@ -0,0 +1,2 @@ +genres32.c +gentest.c diff --git a/intl/icu/source/tools/icuexportdata/Makefile.in b/intl/icu/source/tools/icuexportdata/Makefile.in new file mode 100644 index 0000000000..6899d74d1b --- /dev/null +++ b/intl/icu/source/tools/icuexportdata/Makefile.in @@ -0,0 +1,94 @@ +## Makefile.in for ICU - tools/icuexportdata +## Copyright (C) 2021 and later: Unicode, Inc. and others. +## License & terms of use: http://www.unicode.org/copyright.html + +## Source directory information +srcdir = @srcdir@ +top_srcdir = @top_srcdir@ + +top_builddir = ../.. + +include $(top_builddir)/icudefs.mk + +## Build directory information +subdir = tools/icuexportdata + +TARGET_STUB_NAME = icuexportdata + +SECTION = 1 + +MAN_FILES = $(TARGET_STUB_NAME).$(SECTION) + +## Extra files to remove for 'make clean' +CLEANFILES = *~ $(DEPS) $(MAN_FILES) + +## Target information +TARGET = $(BINDIR)/$(TARGET_STUB_NAME)$(EXEEXT) + +CPPFLAGS += -I$(srcdir) -I$(top_srcdir)/common -I$(srcdir)/../toolutil +LIBS = $(LIBICUTOOLUTIL) $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS) $(LIB_M) + +SOURCES = $(shell cat $(srcdir)/sources.txt) +OBJECTS = $(patsubst %.cpp,%.o,$(patsubst %.c,%.o, $(SOURCES))) + +DEPS = $(OBJECTS:.o=.d) + +## List of phony targets +.PHONY : all all-local install install-local clean clean-local \ +distclean distclean-local dist dist-local check check-local install-man + +## Clear suffix list +.SUFFIXES : + +## List of standard targets +all: all-local +install: install-local +clean: clean-local +distclean : distclean-local +dist: dist-local +check: all check-local + +all-local: $(TARGET) $(MAN_FILES) + +install-local: all-local install-man + $(MKINSTALLDIRS) $(DESTDIR)$(bindir) + $(INSTALL) $(TARGET) $(DESTDIR)$(bindir) + +install-man: 
$(MAN_FILES) + $(MKINSTALLDIRS) $(DESTDIR)$(mandir)/man$(SECTION) + $(INSTALL_DATA) $? $(DESTDIR)$(mandir)/man$(SECTION) + + +dist-local: + +clean-local: + test -z "$(CLEANFILES)" || $(RMV) $(CLEANFILES) + $(RMV) $(TARGET) $(OBJECTS) + +distclean-local: clean-local + $(RMV) Makefile + +check-local: all-local + +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + cd $(top_builddir) \ + && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status + +$(TARGET) : $(OBJECTS) + $(LINK.cc) $(OUTOPT)$@ $^ $(LIBS) + $(POST_BUILD_STEP) + + +%.$(SECTION): $(srcdir)/%.$(SECTION).in + cd $(top_builddir) \ + && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status + + +ifeq (,$(MAKECMDGOALS)) +-include $(DEPS) +else +ifneq ($(patsubst %clean,,$(MAKECMDGOALS)),) +-include $(DEPS) +endif +endif + diff --git a/intl/icu/source/tools/icuexportdata/icuexportdata.1.in b/intl/icu/source/tools/icuexportdata/icuexportdata.1.in new file mode 100644 index 0000000000..71243ef053 --- /dev/null +++ b/intl/icu/source/tools/icuexportdata/icuexportdata.1.in @@ -0,0 +1,13 @@ +.\" Hey, Emacs! This is -*-nroff-*- you know... +.\" +.\" icuexportdata.1: manual page for the icuexportdata utility +.\" +.\" Copyright (C) 2016 and later: Unicode, Inc. and others. +.\" License & terms of use: http://www.unicode.org/copyright.html +.\" +.\" Manual page by Shane Carr <shane@unicode.org>. +.\" +.TH ICUEXPORTDATA 1 "12 June 2021" "ICU MANPAGE" "ICU @VERSION@ Manual" +.SH NAME +.B icuexportdata +\- Writes text files with Unicode properties data from ICU. diff --git a/intl/icu/source/tools/icuexportdata/icuexportdata.cpp b/intl/icu/source/tools/icuexportdata/icuexportdata.cpp new file mode 100644 index 0000000000..a286040eef --- /dev/null +++ b/intl/icu/source/tools/icuexportdata/icuexportdata.cpp @@ -0,0 +1,1566 @@ +// © 2016 and later: Unicode, Inc. and others. 
+// License & terms of use: http://www.unicode.org/copyright.html + +#include <cstddef> +#include <cstdint> +#include <cstdio> +#include <iostream> +#include <unicode/localpointer.h> +#include <unicode/umachine.h> +#include <unicode/unistr.h> +#include <unicode/urename.h> +#include <unicode/uset.h> +#include <vector> +#include <algorithm> +#include "toolutil.h" +#include "uoptions.h" +#include "cmemory.h" +#include "charstr.h" +#include "cstring.h" +#include "unicode/uchar.h" +#include "unicode/errorcode.h" +#include "unicode/uniset.h" +#include "unicode/uscript.h" +#include "unicode/putil.h" +#include "unicode/umutablecptrie.h" +#include "unicode/ucharstriebuilder.h" +#include "ucase.h" +#include "unicode/normalizer2.h" +#include "normalizer2impl.h" +#include "writesrc.h" + +U_NAMESPACE_USE + +/* + * Global - verbosity + */ +UBool VERBOSE = false; +UBool QUIET = false; + +UBool haveCopyright = true; +UCPTrieType trieType = UCPTRIE_TYPE_SMALL; +const char* destdir = ""; + +// Mask constants for modified values in the Script CodePointTrie, values are logically 12-bits. 
+int16_t DATAEXPORT_SCRIPT_X_WITH_COMMON = 0x0400; +int16_t DATAEXPORT_SCRIPT_X_WITH_INHERITED = 0x0800; +int16_t DATAEXPORT_SCRIPT_X_WITH_OTHER = 0x0c00; + +// TODO(ICU-21821): Replace this with a call to a library function +int32_t scxCodePoints[] = { + 7415, 7377, 7380, 7387, 7390, 7391, 7394, 7395, 7396, 7397, + 7398, 7399, 7400, 7403, 7404, 7406, 7407, 7408, 7409, 113824, + 113825, 113826, 113827, 834, 837, 7616, 7617, 12294, 12350, 12351, + 12688, 12689, 12690, 12691, 12692, 12693, 12694, 12695, 12696, 12697, + 12698, 12699, 12700, 12701, 12702, 12703, 12736, 12737, 12738, 12739, + 12740, 12741, 12742, 12743, 12744, 12745, 12746, 12747, 12748, 12749, + 12750, 12751, 12752, 12753, 12754, 12755, 12756, 12757, 12758, 12759, + 12760, 12761, 12762, 12763, 12764, 12765, 12766, 12767, 12768, 12769, + 12770, 12771, 12832, 12833, 12834, 12835, 12836, 12837, 12838, 12839, + 12840, 12841, 12842, 12843, 12844, 12845, 12846, 12847, 12848, 12849, + 12850, 12851, 12852, 12853, 12854, 12855, 12856, 12857, 12858, 12859, + 12860, 12861, 12862, 12863, 12864, 12865, 12866, 12867, 12868, 12869, + 12870, 12871, 12928, 12929, 12930, 12931, 12932, 12933, 12934, 12935, + 12936, 12937, 12938, 12939, 12940, 12941, 12942, 12943, 12944, 12945, + 12946, 12947, 12948, 12949, 12950, 12951, 12952, 12953, 12954, 12955, + 12956, 12957, 12958, 12959, 12960, 12961, 12962, 12963, 12964, 12965, + 12966, 12967, 12968, 12969, 12970, 12971, 12972, 12973, 12974, 12975, + 12976, 12992, 12993, 12994, 12995, 12996, 12997, 12998, 12999, 13000, + 13001, 13002, 13003, 13055, 13144, 13145, 13146, 13147, 13148, 13149, + 13150, 13151, 13152, 13153, 13154, 13155, 13156, 13157, 13158, 13159, + 13160, 13161, 13162, 13163, 13164, 13165, 13166, 13167, 13168, 13179, + 13180, 13181, 13182, 13183, 13280, 13281, 13282, 13283, 13284, 13285, + 13286, 13287, 13288, 13289, 13290, 13291, 13292, 13293, 13294, 13295, + 13296, 13297, 13298, 13299, 13300, 13301, 13302, 13303, 13304, 13305, + 13306, 13307, 13308, 13309, 13310, 
119648, 119649, 119650, 119651, 119652, + 119653, 119654, 119655, 119656, 119657, 119658, 119659, 119660, 119661, 119662, + 119663, 119664, 119665, 127568, 127569, 867, 868, 869, 870, 871, + 872, 873, 874, 875, 876, 877, 878, 879, 7418, 7674, + 66272, 66273, 66274, 66275, 66276, 66277, 66278, 66279, 66280, 66281, + 66282, 66283, 66284, 66285, 66286, 66287, 66288, 66289, 66290, 66291, + 66292, 66293, 66294, 66295, 66296, 66297, 66298, 66299, 1748, 64830, + 64831, 1611, 1612, 1613, 1614, 1615, 1616, 1617, 1618, 1619, + 1620, 1621, 1648, 65010, 65021, 7381, 7382, 7384, 7393, 7402, + 7405, 7413, 7414, 43249, 12330, 12331, 12332, 12333, 43471, 65794, + 65847, 65848, 65849, 65850, 65851, 65852, 65853, 65854, 65855, 1156, + 1159, 11843, 42607, 1157, 1158, 1155, 7672, 7379, 7411, 7416, + 7417, 7401, 7383, 7385, 7388, 7389, 7392, 43251, 4347, 3046, + 3047, 3048, 3049, 3050, 3051, 3052, 3053, 3054, 3055, 3056, + 3057, 3058, 3059, 70401, 70403, 70459, 70460, 73680, 73681, 73683, + 2790, 2791, 2792, 2793, 2794, 2795, 2796, 2797, 2798, 2799, + 2662, 2663, 2664, 2665, 2666, 2667, 2668, 2669, 2670, 2671, + 42752, 42753, 42754, 42755, 42756, 42757, 42758, 42759, 12337, 12338, + 12339, 12340, 12341, 12441, 12442, 12443, 12444, 12448, 12540, 65392, + 65438, 65439, 3302, 3303, 3304, 3305, 3306, 3307, 3308, 3309, + 3310, 3311, 8239, 68338, 6146, 6147, 6149, 1564, 1632, 1633, + 1634, 1635, 1636, 1637, 1638, 1639, 1640, 1641, 2534, 2535, + 2536, 2537, 2538, 2539, 2540, 2541, 2542, 2543, 4160, 4161, + 4162, 4163, 4164, 4165, 4166, 4167, 4168, 4169, 65792, 65793, + 65799, 65800, 65801, 65802, 65803, 65804, 65805, 65806, 65807, 65808, + 65809, 65810, 65811, 65812, 65813, 65814, 65815, 65816, 65817, 65818, + 65819, 65820, 65821, 65822, 65823, 65824, 65825, 65826, 65827, 65828, + 65829, 65830, 65831, 65832, 65833, 65834, 65835, 65836, 65837, 65838, + 65839, 65840, 65841, 65842, 65843, 7412, 8432, 12348, 12349, 43310, + 7376, 7378, 5941, 5942, 2406, 2407, 2408, 2409, 2410, 2411, + 2412, 2413, 
2414, 2415, 12291, 12307, 12316, 12317, 12318, 12319, + 12336, 12343, 65093, 65094, 1548, 1563, 12289, 12290, 12296, 12297, + 12298, 12299, 12300, 12301, 12302, 12303, 12304, 12305, 12308, 12309, + 12310, 12311, 12312, 12313, 12314, 12315, 12539, 65377, 65378, 65379, + 65380, 65381, 7386, 1567, 7410, 1600, 43062, 43063, 43064, 43065, + 2386, 2385, 43059, 43060, 43061, 43056, 43057, 43058, 2404, 2405 + }; + +void handleError(ErrorCode& status, const char* context) { + if (status.isFailure()) { + std::cerr << "Error: " << context << ": " << status.errorName() << std::endl; + exit(status.reset()); + } +} + +class PropertyValueNameGetter : public ValueNameGetter { +public: + PropertyValueNameGetter(UProperty prop) : property(prop) {} + ~PropertyValueNameGetter() override; + const char *getName(uint32_t value) override { + return u_getPropertyValueName(property, value, U_SHORT_PROPERTY_NAME); + } + +private: + UProperty property; +}; + +PropertyValueNameGetter::~PropertyValueNameGetter() {} + +// Dump an aliases = [...] 
key for properties with aliases +void dumpPropertyAliases(UProperty uproperty, FILE* f) { + int i = U_LONG_PROPERTY_NAME + 1; + + while(true) { + // The API works by having extra names after U_LONG_PROPERTY_NAME, sequentially, + // and returning null after that + const char* alias = u_getPropertyName(uproperty, (UPropertyNameChoice) i); + if (!alias) { + break; + } + if (i == U_LONG_PROPERTY_NAME + 1) { + fprintf(f, "aliases = [\"%s\"", alias); + } else { + fprintf(f, ", \"%s\"", alias); + } + i++; + } + if (i != U_LONG_PROPERTY_NAME + 1) { + fprintf(f, "]\n"); + } +} + +void dumpBinaryProperty(UProperty uproperty, FILE* f) { + IcuToolErrorCode status("icuexportdata: dumpBinaryProperty"); + const char* fullPropName = u_getPropertyName(uproperty, U_LONG_PROPERTY_NAME); + const char* shortPropName = u_getPropertyName(uproperty, U_SHORT_PROPERTY_NAME); + const USet* uset = u_getBinaryPropertySet(uproperty, status); + handleError(status, fullPropName); + + fputs("[[binary_property]]\n", f); + fprintf(f, "long_name = \"%s\"\n", fullPropName); + if (shortPropName) fprintf(f, "short_name = \"%s\"\n", shortPropName); + fprintf(f, "uproperty_discr = 0x%X\n", uproperty); + dumpPropertyAliases(uproperty, f); + usrc_writeUnicodeSet(f, uset, UPRV_TARGET_SYNTAX_TOML); +} + +// If the value exists, dump an indented entry of the format +// `" {discr = <discriminant>, long = <longname>, short = <shortname>, aliases = [<aliases>]},"` +void dumpValueEntry(UProperty uproperty, int v, bool is_mask, FILE* f) { + const char* fullValueName = u_getPropertyValueName(uproperty, v, U_LONG_PROPERTY_NAME); + const char* shortValueName = u_getPropertyValueName(uproperty, v, U_SHORT_PROPERTY_NAME); + if (!fullValueName) { + return; + } + if (is_mask) { + fprintf(f, " {discr = 0x%X", v); + } else { + fprintf(f, " {discr = %i", v); + } + fprintf(f, ", long = \"%s\"", fullValueName); + if (shortValueName) { + fprintf(f, ", short = \"%s\"", shortValueName); + } + int i = U_LONG_PROPERTY_NAME + 1; + 
while(true) { + // The API works by having extra names after U_LONG_PROPERTY_NAME, sequentially, + // and returning null after that + const char* alias = u_getPropertyValueName(uproperty, v, (UPropertyNameChoice) i); + if (!alias) { + break; + } + if (i == U_LONG_PROPERTY_NAME + 1) { + fprintf(f, ", aliases = [\"%s\"", alias); + } else { + fprintf(f, ", \"%s\"", alias); + } + i++; + } + if (i != U_LONG_PROPERTY_NAME + 1) { + fprintf(f, "]"); + } + fprintf(f, "},\n"); +} + +void dumpEnumeratedProperty(UProperty uproperty, FILE* f) { + IcuToolErrorCode status("icuexportdata: dumpEnumeratedProperty"); + const char* fullPropName = u_getPropertyName(uproperty, U_LONG_PROPERTY_NAME); + const char* shortPropName = u_getPropertyName(uproperty, U_SHORT_PROPERTY_NAME); + const UCPMap* umap = u_getIntPropertyMap(uproperty, status); + handleError(status, fullPropName); + + fputs("[[enum_property]]\n", f); + fprintf(f, "long_name = \"%s\"\n", fullPropName); + if (shortPropName) fprintf(f, "short_name = \"%s\"\n", shortPropName); + fprintf(f, "uproperty_discr = 0x%X\n", uproperty); + dumpPropertyAliases(uproperty, f); + + int32_t minValue = u_getIntPropertyMinValue(uproperty); + U_ASSERT(minValue >= 0); + int32_t maxValue = u_getIntPropertyMaxValue(uproperty); + U_ASSERT(maxValue >= 0); + + fprintf(f, "values = [\n"); + for (int v = minValue; v <= maxValue; v++) { + dumpValueEntry(uproperty, v, false, f); + } + fprintf(f, "]\n"); + + PropertyValueNameGetter valueNameGetter(uproperty); + usrc_writeUCPMap(f, umap, &valueNameGetter, UPRV_TARGET_SYNTAX_TOML); + fputs("\n", f); + + + UCPTrieValueWidth width = UCPTRIE_VALUE_BITS_32; + if (maxValue <= 0xff) { + width = UCPTRIE_VALUE_BITS_8; + } else if (maxValue <= 0xffff) { + width = UCPTRIE_VALUE_BITS_16; + } + LocalUMutableCPTriePointer builder(umutablecptrie_fromUCPMap(umap, status)); + LocalUCPTriePointer utrie(umutablecptrie_buildImmutable( + builder.getAlias(), + trieType, + width, + status)); + handleError(status, 
fullPropName); + + fputs("[enum_property.code_point_trie]\n", f); + usrc_writeUCPTrie(f, shortPropName, utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML); +} + +/* +* Export Bidi_Mirroring_Glyph values (code points) in a similar way to how enumerated +* properties are dumped to file. +* Note: the data will store 0 for code points without a value defined for +* Bidi_Mirroring_Glyph. +*/ +void dumpBidiMirroringGlyph(FILE* f) { + UProperty uproperty = UCHAR_BIDI_MIRRORING_GLYPH; + IcuToolErrorCode status("icuexportdata: dumpBidiMirroringGlyph"); + const char* fullPropName = u_getPropertyName(uproperty, U_LONG_PROPERTY_NAME); + const char* shortPropName = u_getPropertyName(uproperty, U_SHORT_PROPERTY_NAME); + handleError(status, fullPropName); + + // Store 21-bit code point as is + UCPTrieValueWidth width = UCPTRIE_VALUE_BITS_32; + + // note: unlike dumpEnumeratedProperty, which can get inversion map data using + // u_getIntPropertyMap(uproperty), the only reliable way to get Bidi_Mirroring_Glyph + // is to use u_charMirror(cp) over the code point space. + LocalUMutableCPTriePointer builder(umutablecptrie_open(0, 0, status)); + for(UChar32 c = UCHAR_MIN_VALUE; c <= UCHAR_MAX_VALUE; c++) { + UChar32 mirroringGlyph = u_charMirror(c); + // The trie builder code throws an error when it cannot compress the data sufficiently. + // Therefore, when the value is undefined for a code point, keep a 0 in the trie + // instead of the ICU API behavior of returning the code point value. Using 0 + // results in a relatively significant space savings by not including redundant data. 
+ if (c != mirroringGlyph) { + umutablecptrie_set(builder.getAlias(), c, mirroringGlyph, status); + } + } + + LocalUCPTriePointer utrie(umutablecptrie_buildImmutable( + builder.getAlias(), + trieType, + width, + status)); + handleError(status, fullPropName); + + // currently a trie and inversion map are the same (as relied upon in characterproperties.cpp) + const UCPMap* umap = reinterpret_cast<UCPMap *>(utrie.getAlias()); + + fputs("[[enum_property]]\n", f); + fprintf(f, "long_name = \"%s\"\n", fullPropName); + if (shortPropName) { + fprintf(f, "short_name = \"%s\"\n", shortPropName); + } + fprintf(f, "uproperty_discr = 0x%X\n", uproperty); + dumpPropertyAliases(uproperty, f); + + usrc_writeUCPMap(f, umap, nullptr, UPRV_TARGET_SYNTAX_TOML); + fputs("\n", f); + + fputs("[enum_property.code_point_trie]\n", f); + usrc_writeUCPTrie(f, shortPropName, utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML); +} + +// After printing property value `v`, print `mask` if and only if `mask` comes immediately +// after the property in the listing +void maybeDumpMaskValue(UProperty uproperty, uint32_t v, uint32_t mask, FILE* f) { + if (U_MASK(v) < mask && U_MASK(v + 1) > mask) + dumpValueEntry(uproperty, mask, true, f); +} + +void dumpGeneralCategoryMask(FILE* f) { + IcuToolErrorCode status("icuexportdata: dumpGeneralCategoryMask"); + UProperty uproperty = UCHAR_GENERAL_CATEGORY_MASK; + + fputs("[[mask_property]]\n", f); + const char* fullPropName = u_getPropertyName(uproperty, U_LONG_PROPERTY_NAME); + const char* shortPropName = u_getPropertyName(uproperty, U_SHORT_PROPERTY_NAME); + fprintf(f, "long_name = \"%s\"\n", fullPropName); + if (shortPropName) fprintf(f, "short_name = \"%s\"\n", shortPropName); + fprintf(f, "uproperty_discr = 0x%X\n", uproperty); + dumpPropertyAliases(uproperty, f); + + + fprintf(f, "mask_for = \"General_Category\"\n"); + uint32_t minValue = u_getIntPropertyMinValue(UCHAR_GENERAL_CATEGORY); + U_ASSERT(minValue >= 0); + uint32_t maxValue = 
u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY); + U_ASSERT(maxValue >= 0); + + fprintf(f, "values = [\n"); + for (uint32_t v = minValue; v <= maxValue; v++) { + dumpValueEntry(uproperty, U_MASK(v), true, f); + + // We want to dump these masks "in order", which means they + // should come immediately after every property they contain + maybeDumpMaskValue(uproperty, v, U_GC_L_MASK, f); + maybeDumpMaskValue(uproperty, v, U_GC_LC_MASK, f); + maybeDumpMaskValue(uproperty, v, U_GC_M_MASK, f); + maybeDumpMaskValue(uproperty, v, U_GC_N_MASK, f); + maybeDumpMaskValue(uproperty, v, U_GC_Z_MASK, f); + maybeDumpMaskValue(uproperty, v, U_GC_C_MASK, f); + maybeDumpMaskValue(uproperty, v, U_GC_P_MASK, f); + maybeDumpMaskValue(uproperty, v, U_GC_S_MASK, f); + } + fprintf(f, "]\n"); +} + +void dumpScriptExtensions(FILE* f) { + IcuToolErrorCode status("icuexportdata: dumpScriptExtensions"); + + fputs("[[script_extensions]]\n", f); + const char* scxFullPropName = u_getPropertyName(UCHAR_SCRIPT_EXTENSIONS, U_LONG_PROPERTY_NAME); + const char* scxShortPropName = u_getPropertyName(UCHAR_SCRIPT_EXTENSIONS, U_SHORT_PROPERTY_NAME); + fprintf(f, "long_name = \"%s\"\n", scxFullPropName); + if (scxShortPropName) fprintf(f, "short_name = \"%s\"\n", scxShortPropName); + fprintf(f, "uproperty_discr = 0x%X\n", UCHAR_SCRIPT_EXTENSIONS); + dumpPropertyAliases(UCHAR_SCRIPT_EXTENSIONS, f); + + // We want to use 16 bits for our exported trie of sc/scx data because we + // need 12 bits to match the 12 bits of data stored for sc/scx in the trie + // in the uprops.icu data file. + UCPTrieValueWidth scWidth = UCPTRIE_VALUE_BITS_16; + + // Create a mutable UCPTrie builder populated with Script property values data. + const UCPMap* scInvMap = u_getIntPropertyMap(UCHAR_SCRIPT, status); + handleError(status, scxFullPropName); + LocalUMutableCPTriePointer builder(umutablecptrie_fromUCPMap(scInvMap, status)); + handleError(status, scxFullPropName); + + // The values for the output scx companion array. 
+ // Invariant is that all subvectors are distinct. + std::vector< std::vector<uint16_t> > outputDedupVec; + + // The sc/scx companion array is an array of arrays (of script codes) + fputs("script_code_array = [\n", f); + for(const UChar32 cp : scxCodePoints) { + // Get the Script value + uint32_t scVal = umutablecptrie_get(builder.getAlias(), cp); + // Get the Script_Extensions value (array of Script codes) + const int32_t SCX_ARRAY_CAPACITY = 32; + UScriptCode scxValArray[SCX_ARRAY_CAPACITY]; + int32_t numScripts = uscript_getScriptExtensions(cp, scxValArray, SCX_ARRAY_CAPACITY, status); + handleError(status, scxFullPropName); + + // Convert the scx array into a vector + std::vector<uint16_t> scxValVec; + for(int i = 0; i < numScripts; i++) { + scxValVec.push_back(scxValArray[i]); + } + // Ensure that it is sorted + std::sort(scxValVec.begin(), scxValVec.end()); + // Copy the Script value into the first position of the scx array only + // if we have the "other" case (Script value is not Common nor Inherited). + // This offers faster access when users want only the Script value. + if (scVal != USCRIPT_COMMON && scVal != USCRIPT_INHERITED) { + scxValVec.insert(scxValVec.begin(), scVal); + } + + // See if there is already an scx value array matching the newly built one. + // If there is, then use its index. + // If not, then append the new value array. 
+ bool isScxValUnique = true; + size_t outputIndex = 0; + for (outputIndex = 0; outputIndex < outputDedupVec.size(); outputIndex++) { + if (outputDedupVec[outputIndex] == scxValVec) { + isScxValUnique = false; + break; + } + } + + if (isScxValUnique) { + outputDedupVec.push_back(scxValVec); + usrc_writeArray(f, " [", scxValVec.data(), 16, scxValVec.size(), " ", "],\n"); + } + + // We must update the value in the UCPTrie for the code point to contain: + // 9..0 the Script code in the lower 10 bits when 11..10 is 0, else it is + // the index into the companion array + // 11..10 the same higher-order 2 bits in the trie in uprops.icu indicating whether + // 3: other + // 2: Script=Inherited + // 1: Script=Common + // 0: Script=value in 9..0 (N/A because we are in this loop to create the companion array for non-0 cases) + uint16_t mask = 0; + if (scVal == USCRIPT_COMMON) { + mask = DATAEXPORT_SCRIPT_X_WITH_COMMON; + } else if (scVal == USCRIPT_INHERITED) { + mask = DATAEXPORT_SCRIPT_X_WITH_INHERITED; + } else { + mask = DATAEXPORT_SCRIPT_X_WITH_OTHER; + } + + // The new trie value is the index into the new array with the high order bits set + uint32_t newScVal = outputIndex | mask; + + // Update the code point in the mutable trie builder with the trie value + umutablecptrie_set(builder.getAlias(), cp, newScVal, status); + handleError(status, scxFullPropName); + } + fputs("]\n\n", f); // Print the TOML close delimiter for the outer array. + + // Convert from mutable trie builder to immutable trie. 
+ LocalUCPTriePointer utrie(umutablecptrie_buildImmutable( + builder.getAlias(), + trieType, + scWidth, + status)); + handleError(status, scxFullPropName); + + fputs("[script_extensions.code_point_trie]\n", f); + usrc_writeUCPTrie(f, scxShortPropName, utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML); +} + +FILE* prepareOutputFile(const char* basename) { + IcuToolErrorCode status("icuexportdata"); + CharString outFileName; + if (destdir != nullptr && *destdir != 0) { + outFileName.append(destdir, status).ensureEndsWithFileSeparator(status); + } + outFileName.append(basename, status); + outFileName.append(".toml", status); + handleError(status, basename); + + FILE* f = fopen(outFileName.data(), "w"); + if (f == nullptr) { + std::cerr << "Unable to open file: " << outFileName.data() << std::endl; + exit(U_FILE_ACCESS_ERROR); + } + if (!QUIET) { + std::cout << "Writing to: " << outFileName.data() << std::endl; + } + + if (haveCopyright) { + usrc_writeCopyrightHeader(f, "#", 2021); + } + usrc_writeFileNameGeneratedBy(f, "#", basename, "icuexportdata.cpp"); + + return f; +} + +#if !UCONFIG_NO_NORMALIZATION + +struct PendingDescriptor { + UChar32 scalar; + uint32_t descriptor; + UBool supplementary; +}; + +void writeCanonicalCompositions(USet* backwardCombiningStarters) { + IcuToolErrorCode status("icuexportdata: computeCanonicalCompositions"); + const char* basename = "compositions"; + FILE* f = prepareOutputFile(basename); + + LocalPointer<UCharsTrieBuilder> backwardBuilder(new UCharsTrieBuilder(status), status); + + const int32_t DECOMPOSITION_BUFFER_SIZE = 20; + UChar32 utf32[DECOMPOSITION_BUFFER_SIZE]; + + const Normalizer2* nfc = Normalizer2::getNFCInstance(status); + for (UChar32 c = 0; c <= 0x10FFFF; ++c) { + if (c >= 0xD800 && c < 0xE000) { + // Surrogate + continue; + } + UnicodeString decomposition; + if (!nfc->getRawDecomposition(c, decomposition)) { + continue; + } + int32_t len = decomposition.toUTF32(utf32, DECOMPOSITION_BUFFER_SIZE, status); + if (len != 2) { + 
continue; + } + UChar32 starter = utf32[0]; + UChar32 second = utf32[1]; + UChar32 composite = nfc->composePair(starter, second); + if (composite < 0) { + continue; + } + if (c != composite) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + if (!u_getCombiningClass(second)) { + uset_add(backwardCombiningStarters, second); + } + if (composite >= 0xAC00 && composite <= 0xD7A3) { + // Hangul syllable + continue; + } + + UnicodeString backward; + backward.append(second); + backward.append(starter); + backwardBuilder->add(backward, int32_t(composite), status); + } + UnicodeString canonicalCompositionTrie; + backwardBuilder->buildUnicodeString(USTRINGTRIE_BUILD_SMALL, canonicalCompositionTrie, status); + + usrc_writeArray(f, "compositions = [\n ", canonicalCompositionTrie.getBuffer(), 16, canonicalCompositionTrie.length(), " ", "\n]\n"); + fclose(f); + handleError(status, basename); +} + +void writeDecompositionTables(const char* basename, const uint16_t* ptr16, size_t len16, const uint32_t* ptr32, size_t len32) { + FILE* f = prepareOutputFile(basename); + usrc_writeArray(f, "scalars16 = [\n ", ptr16, 16, len16, " ", "\n]\n"); + usrc_writeArray(f, "scalars32 = [\n ", ptr32, 32, len32, " ", "\n]\n"); + fclose(f); +} + +void writeDecompositionData(const char* basename, uint32_t baseSize16, uint32_t baseSize32, uint32_t supplementSize16, USet* uset, USet* reference, const std::vector<PendingDescriptor>& pendingTrieInsertions, char16_t passthroughCap) { + IcuToolErrorCode status("icuexportdata: writeDecompositionData"); + FILE* f = prepareOutputFile(basename); + + // Zero is a magic number that means the character decomposes to itself. + LocalUMutableCPTriePointer builder(umutablecptrie_open(0, 0, status)); + + // Iterate backwards to insert lower code points in the trie first in case it matters + // for trie block allocation. 
+ for (int32_t i = pendingTrieInsertions.size() - 1; i >= 0; --i) { + const PendingDescriptor& pending = pendingTrieInsertions[i]; + uint32_t additional = 0; + if (!(pending.descriptor & 0xFFFE0000)) { + uint32_t offset = pending.descriptor & 0xFFF; + if (!pending.supplementary) { + if (offset >= baseSize16) { + // This is a offset to supplementary 16-bit data. We have + // 16-bit base data and 32-bit base data before. However, + // the 16-bit base data length is already part of offset. + additional = baseSize32; + } + } else { + if (offset >= baseSize32) { + // This is an offset to supplementary 32-bit data. We have 16-bit + // base data, 32-bit base data, and 16-bit supplementary data before. + // However, the 32-bit base data length is already part + // of offset. + additional = baseSize16 + supplementSize16; + } else { + // This is an offset to 32-bit base data. We have 16-bit + // base data before. + additional = baseSize16; + } + } + if (offset + additional > 0xFFF) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + } + // It turns out it's better to swap the halves compared to the initial + // idea in order to put special marker values close to zero so that + // an important marker value becomes 1, so it's efficient to compare + // "1 or 0". Unfortunately, going through all the code to swap + // things is too error prone, so let's do the swapping here in one + // place. + uint32_t oldTrieValue = pending.descriptor + additional; + uint32_t swappedTrieValue = (oldTrieValue >> 16) | (oldTrieValue << 16); + umutablecptrie_set(builder.getAlias(), pending.scalar, swappedTrieValue, status); + } + LocalUCPTriePointer utrie(umutablecptrie_buildImmutable( + builder.getAlias(), + trieType, + UCPTRIE_VALUE_BITS_32, + status)); + handleError(status, basename); + + if (reference) { + if (uset_contains(reference, 0xFF9E) || uset_contains(reference, 0xFF9F) || !uset_contains(reference, 0x0345)) { + // NFD expectations don't hold. 
The set must not contain the half-width + // kana voicing marks and must contain iota subscript. + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + + USet* halfWidthVoicing = uset_openEmpty(); + uset_add(halfWidthVoicing, 0xFF9E); + uset_add(halfWidthVoicing, 0xFF9F); + + USet* iotaSubscript = uset_openEmpty(); + uset_add(iotaSubscript, 0x0345); + + uint8_t flags = 0; + + USet* halfWidthCheck = uset_cloneAsThawed(uset); + uset_removeAll(halfWidthCheck, reference); + if (uset_equals(halfWidthCheck, halfWidthVoicing)) { + flags |= 1; + } else if (!uset_isEmpty(halfWidthCheck)) { + // The result was neither empty nor contained exactly + // the two half-width voicing marks. The ICU4X + // normalizer doesn't know how to deal with this case. + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + uset_close(halfWidthCheck); + + USet* iotaCheck = uset_cloneAsThawed(reference); + uset_removeAll(iotaCheck, uset); + if (!(uset_equals(iotaCheck, iotaSubscript)) && !uset_isEmpty(iotaCheck)) { + // The result was neither empty nor contained exactly + // the iota subscript. The ICU4X normalizer doesn't + // know how to deal with this case. + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + uset_close(halfWidthCheck); + + uset_close(iotaSubscript); + uset_close(halfWidthVoicing); + + fprintf(f, "flags = 0x%X\n", flags); + fprintf(f, "cap = 0x%X\n", passthroughCap); + } + fprintf(f, "[trie]\n"); + usrc_writeUCPTrie(f, "trie", utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML); + fclose(f); + handleError(status, basename); +} + +// Special marker for the NFKD form of U+FDFA +const int32_t FDFA_MARKER = 3; + +// Special marker for characters whose decomposition starts with a non-starter +// and the decomposition isn't the character itself. 
+const int32_t SPECIAL_NON_STARTER_DECOMPOSITION_MARKER = 2; + +// Special marker for starters that decompose to themselves but that may +// combine backwards under canonical composition +const int32_t BACKWARD_COMBINING_STARTER_MARKER = 1; + +/// Marker that a complex decomposition isn't round-trippable +/// under re-composition. +const uint32_t NON_ROUND_TRIP_MARKER = 1; + +UBool permissibleBmpPair(UBool knownToRoundTrip, UChar32 c, UChar32 second) { + if (knownToRoundTrip) { + return true; + } + // Nuktas, Hebrew presentation forms and polytonic Greek with oxia + // are special-cased in ICU4X. + if (c >= 0xFB1D && c <= 0xFB4E) { + // Hebrew presentation forms + return true; + } + if (c >= 0x1F71 && c <= 0x1FFB) { + // Polytonic Greek with oxia + return true; + } + if ((second & 0x7F) == 0x3C && second >= 0x0900 && second <= 0x0BFF) { + // Nukta + return true; + } + // To avoid more branchiness, 4 characters that decompose to + // a BMP starter followed by a BMP non-starter are excluded + // from being encoded directly into the trie value and are + // handled as complex decompositions instead. 
These are: + // U+0F76 TIBETAN VOWEL SIGN VOCALIC R + // U+0F78 TIBETAN VOWEL SIGN VOCALIC L + // U+212B ANGSTROM SIGN + // U+2ADC FORKING + return false; +} + +// Computes data for canonical decompositions +void computeDecompositions(const char* basename, + const USet* backwardCombiningStarters, + std::vector<uint16_t>& storage16, + std::vector<uint32_t>& storage32, + USet* decompositionStartsWithNonStarter, + USet* decompositionStartsWithBackwardCombiningStarter, + std::vector<PendingDescriptor>& pendingTrieInsertions, + UChar32& decompositionPassthroughBound, + UChar32& compositionPassthroughBound) { + IcuToolErrorCode status("icuexportdata: computeDecompositions"); + const Normalizer2* mainNormalizer; + const Normalizer2* nfdNormalizer = Normalizer2::getNFDInstance(status); + const Normalizer2* nfcNormalizer = Normalizer2::getNFCInstance(status); + FILE* f = nullptr; + std::vector<uint32_t> nonRecursive32; + LocalUMutableCPTriePointer nonRecursiveBuilder(umutablecptrie_open(0, 0, status)); + + if (uprv_strcmp(basename, "nfkd") == 0) { + mainNormalizer = Normalizer2::getNFKDInstance(status); + } else if (uprv_strcmp(basename, "uts46d") == 0) { + mainNormalizer = Normalizer2::getInstance(nullptr, "uts46", UNORM2_COMPOSE, status); + } else { + mainNormalizer = nfdNormalizer; + f = prepareOutputFile("decompositionex"); + } + + // Max length as of Unicode 14 is 4 for NFD. For NFKD the max + // is 18 (U+FDFA; special-cased), and the next longest is 8 (U+FDFB). + const int32_t LONGEST_ENCODABLE_LENGTH_16 = 9; + const int32_t LONGEST_ENCODABLE_LENGTH_32 = 8; + const int32_t DECOMPOSITION_BUFFER_SIZE = 20; + UChar32 utf32[DECOMPOSITION_BUFFER_SIZE]; + const int32_t RAW_DECOMPOSITION_BUFFER_SIZE = 2; + UChar32 rawUtf32[RAW_DECOMPOSITION_BUFFER_SIZE]; + + // Iterate over all scalar values excluding Hangul syllables. + // + // We go backwards in order to better find overlapping decompositions. 
+ // + // As of Unicode 14: + // Iterate forward without overlap search: + // nfd: 16 size: 896, 32 size: 173 + // nfkd: 16 size: 3854, 32 size: 179 + // + // Iterate forward with overlap search: + // nfd: 16 size: 888, 32 size: 173 + // nfkd: 16 size: 3266, 32 size: 179 + // + // Iterate backward with overlap search: + // nfd: 16 size: 776, 32 size: 173 + // nfkd: 16 size: 2941, 32 size: 179 + // + // UChar32 is signed! + for (UChar32 c = 0x10FFFF; c >= 0; --c) { + if (c >= 0xAC00 && c <= 0xD7A3) { + // Hangul syllable + continue; + } + if (c >= 0xD800 && c < 0xE000) { + // Surrogate + continue; + } + UnicodeString src; + UnicodeString dst; + // True if we're building non-NFD or we're building NFD but + // the `c` round trips to NFC. + // False if we're building NFD and `c` does not round trip to NFC. + UBool nonNfdOrRoundTrips = true; + src.append(c); + if (mainNormalizer != nfdNormalizer) { + UnicodeString inter; + mainNormalizer->normalize(src, inter, status); + nfdNormalizer->normalize(inter, dst, status); + } else { + nfdNormalizer->normalize(src, dst, status); + UnicodeString nfc; + nfcNormalizer->normalize(dst, nfc, status); + nonNfdOrRoundTrips = (src == nfc); + } + int32_t len = dst.toUTF32(utf32, DECOMPOSITION_BUFFER_SIZE, status); + if (!len || (len == 1 && utf32[0] == 0xFFFD && c != 0xFFFD)) { + // Characters that normalize to nothing or to U+FFFD (without the + // input being U+FFFD) in ICU4C's UTS 46 normalization normalize + // as in NFD in ICU4X's UTF 46 normalization in the interest + // of data size and ICU4X's normalizer being unable to handle + // normalizing to nothing. + // When UTS 46 is implemented on top of ICU4X, a preprocessing + // step is supposed to remove these characters before the + // normalization step. 
+ if (uprv_strcmp(basename, "uts46d") != 0) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + nfdNormalizer->normalize(src, dst, status); + len = dst.toUTF32(utf32, DECOMPOSITION_BUFFER_SIZE, status); + if (!len || (len == 1 && utf32[0] == 0xFFFD && c != 0xFFFD)) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + } + if (len > DECOMPOSITION_BUFFER_SIZE) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + uint8_t firstCombiningClass = u_getCombiningClass(utf32[0]); + bool specialNonStarterDecomposition = false; + bool startsWithBackwardCombiningStarter = false; + if (firstCombiningClass) { + decompositionPassthroughBound = c; + compositionPassthroughBound = c; + uset_add(decompositionStartsWithNonStarter, c); + if (src != dst) { + if (c == 0x0340 || c == 0x0341 || c == 0x0343 || c == 0x0344 || c == 0x0F73 || c == 0x0F75 || c == 0x0F81 || c == 0xFF9E || c == 0xFF9F) { + specialNonStarterDecomposition = true; + } else { + // A character whose decomposition starts with a non-starter and isn't the same as the character itself and isn't already hard-coded into ICU4X. 
+ status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + } + } else if (uset_contains(backwardCombiningStarters, utf32[0])) { + compositionPassthroughBound = c; + startsWithBackwardCombiningStarter = true; + uset_add(decompositionStartsWithBackwardCombiningStarter, c); + } + if (c != BACKWARD_COMBINING_STARTER_MARKER && len == 1 && utf32[0] == BACKWARD_COMBINING_STARTER_MARKER) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + if (c != SPECIAL_NON_STARTER_DECOMPOSITION_MARKER && len == 1 && utf32[0] == SPECIAL_NON_STARTER_DECOMPOSITION_MARKER) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + if (c != FDFA_MARKER && len == 1 && utf32[0] == FDFA_MARKER) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + if (mainNormalizer != nfdNormalizer) { + UnicodeString nfd; + nfdNormalizer->normalize(src, nfd, status); + if (dst == nfd) { + continue; + } + decompositionPassthroughBound = c; + compositionPassthroughBound = c; + } else if (firstCombiningClass) { + len = 1; + if (specialNonStarterDecomposition) { + utf32[0] = SPECIAL_NON_STARTER_DECOMPOSITION_MARKER; // magic value + } else { + // Use the surrogate range to store the canonical combining class + utf32[0] = 0xD800 | UChar32(firstCombiningClass); + } + } else { + if (src == dst) { + if (startsWithBackwardCombiningStarter) { + pendingTrieInsertions.push_back({c, BACKWARD_COMBINING_STARTER_MARKER << 16, false}); + } + continue; + } + decompositionPassthroughBound = c; + // ICU4X hard-codes ANGSTROM SIGN + if (c != 0x212B) { + UnicodeString raw; + if (!nfdNormalizer->getRawDecomposition(c, raw)) { + // We're always supposed to have a non-recursive decomposition + // if we had a recursive one. 
+ status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + // In addition to actual difference, put the whole range that contains characters + // with oxia into the non-recursive trie in order to catch cases where characters + // with oxia have singleton decompositions to corresponding characters with tonos. + // This way, the run-time decision to fall through can be done on the range + // without checking for individual characters inside the range. + if (raw != dst || (c >= 0x1F71 && c <= 0x1FFB)) { + int32_t rawLen = raw.toUTF32(rawUtf32, RAW_DECOMPOSITION_BUFFER_SIZE, status); + if (!rawLen) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + if (rawLen == 1) { + if (c >= 0xFFFF) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + umutablecptrie_set(nonRecursiveBuilder.getAlias(), c, uint32_t(rawUtf32[0]), status); + } else if (rawUtf32[0] <= 0xFFFF && rawUtf32[1] <= 0xFFFF) { + if (!rawUtf32[0] || !rawUtf32[1]) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + // Swapped for consistency with the primary trie + uint32_t bmpPair = uint32_t(rawUtf32[1]) << 16 | uint32_t(rawUtf32[0]); + umutablecptrie_set(nonRecursiveBuilder.getAlias(), c, bmpPair, status); + } else { + // Let's add 1 to index to make it always non-zero to distinguish + // it from the default zero. 
+ uint32_t index = nonRecursive32.size() + 1; + nonRecursive32.push_back(uint32_t(rawUtf32[0])); + nonRecursive32.push_back(uint32_t(rawUtf32[1])); + if (index > 0xFFFF) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + umutablecptrie_set(nonRecursiveBuilder.getAlias(), c, index << 16, status); + } + } + } + } + if (!nonNfdOrRoundTrips) { + compositionPassthroughBound = c; + } + if (len == 1 && utf32[0] <= 0xFFFF) { + if (startsWithBackwardCombiningStarter) { + if (mainNormalizer == nfdNormalizer) { + // Not supposed to happen in NFD + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } else if (!((utf32[0] >= 0x1161 && utf32[0] <= 0x1175) || (utf32[0] >= 0x11A8 && utf32[0] <= 0x11C2))) { + // Other than conjoining jamo vowels and trails + // unsupported for non-NFD. + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + } + pendingTrieInsertions.push_back({c, uint32_t(utf32[0]) << 16, false}); + } else if (len == 2 && + utf32[0] <= 0xFFFF && + utf32[1] <= 0xFFFF && + !u_getCombiningClass(utf32[0]) && + u_getCombiningClass(utf32[1]) && + permissibleBmpPair(nonNfdOrRoundTrips, c, utf32[1])) { + for (int32_t i = 0; i < len; ++i) { + if (((utf32[i] == 0x0345) && (uprv_strcmp(basename, "uts46d") == 0)) || utf32[i] == 0xFF9E || utf32[i] == 0xFF9F) { + // Assert that iota subscript and half-width voicing marks never occur in these + // expansions in the normalization forms where they are special. 
+ status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + } + if (startsWithBackwardCombiningStarter) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + pendingTrieInsertions.push_back({c, (uint32_t(utf32[0]) << 16) | uint32_t(utf32[1]), false}); + } else { + if (startsWithBackwardCombiningStarter) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + + UBool supplementary = false; + UBool nonInitialStarter = false; + for (int32_t i = 0; i < len; ++i) { + if (((utf32[i] == 0x0345) && (uprv_strcmp(basename, "uts46d") == 0)) || utf32[i] == 0xFF9E || utf32[i] == 0xFF9F) { + // Assert that iota subscript and half-width voicing marks never occur in these + // expansions in the normalization forms where they are special. + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + + if (utf32[i] > 0xFFFF) { + supplementary = true; + } + if (utf32[i] == 0) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + if (i != 0 && !u_getCombiningClass(utf32[i])) { + nonInitialStarter = true; + } + } + if (!supplementary) { + if (len > LONGEST_ENCODABLE_LENGTH_16 || !len || len == 1) { + if (len == 18 && c == 0xFDFA) { + // Special marker for the one character whose decomposition + // is too long. + pendingTrieInsertions.push_back({c, FDFA_MARKER << 16, supplementary}); + continue; + } else { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + } + } else if (len > LONGEST_ENCODABLE_LENGTH_32 || !len) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + // Complex decomposition + // Format for 16-bit value: + // 15..13: length minus two for 16-bit case and length minus one for + // the 32-bit case. Length 8 needs to fit in three bits in + // the 16-bit case, and this way the value is future-proofed + // up to 9 in the 16-bit case. 
Zero is unused and length one + // in the 16-bit case goes directly into the trie. + // 12: 1 if all trailing characters are guaranteed non-starters, + // 0 if no guarantees about non-starterness. + // Note: The bit choice is this way around to allow for + // dynamically falling back to not having this but instead + // having one more bit for length by merely choosing + // different masks. + // 11..0: Start offset in storage. The offset is to the logical + // sequence of scalars16, scalars32, supplementary_scalars16, + // supplementary_scalars32. + uint32_t descriptor = uint32_t(!nonInitialStarter) << 12; + if (!supplementary) { + descriptor |= (uint32_t(len) - 2) << 13; + } else { + descriptor |= (uint32_t(len) - 1) << 13; + } + if (descriptor & 0xFFF) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + size_t index = 0; + bool writeToStorage = false; + // Sadly, C++ lacks break and continue by label, so using goto in the + // inner loops to break or continue the outer loop. + if (!supplementary) { + outer16: for (;;) { + if (index == storage16.size()) { + writeToStorage = true; + break; + } + if (storage16[index] == utf32[0]) { + for (int32_t i = 1; i < len; ++i) { + if (storage16[index + i] != uint32_t(utf32[i])) { + ++index; + // continue outer + goto outer16; + } + } + // break outer + goto after; + } + ++index; + } + } else { + outer32: for (;;) { + if (index == storage32.size()) { + writeToStorage = true; + break; + } + if (storage32[index] == uint32_t(utf32[0])) { + for (int32_t i = 1; i < len; ++i) { + if (storage32[index + i] != uint32_t(utf32[i])) { + ++index; + // continue outer + goto outer32; + } + } + // break outer + goto after; + } + ++index; + } + } + after: + if (index > 0xFFF) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + descriptor |= uint32_t(index); + if (!descriptor || descriptor > 0xFFFF) { + // > 0xFFFF should never happen if the code above is correct. 
+ // == 0 should not happen due to the nature of the data. + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + if (writeToStorage) { + if (!supplementary) { + for (int32_t i = 0; i < len; ++i) { + storage16.push_back(uint16_t(utf32[i])); + } + } else { + for (int32_t i = 0; i < len; ++i) { + storage32.push_back(uint32_t(utf32[i])); + } + } + } + + uint32_t nonRoundTripMarker = 0; + if (!nonNfdOrRoundTrips) { + nonRoundTripMarker = (NON_ROUND_TRIP_MARKER << 16); + } + pendingTrieInsertions.push_back({c, descriptor | nonRoundTripMarker, supplementary}); + } + } + if (storage16.size() + storage32.size() > 0xFFF) { + status.set(U_INTERNAL_PROGRAM_ERROR); + } + if (f) { + usrc_writeArray(f, "scalars32 = [\n ", nonRecursive32.data(), 32, nonRecursive32.size(), " ", "\n]\n"); + + LocalUCPTriePointer utrie(umutablecptrie_buildImmutable( + nonRecursiveBuilder.getAlias(), + trieType, + UCPTRIE_VALUE_BITS_32, + status)); + handleError(status, basename); + + fprintf(f, "[trie]\n"); + usrc_writeUCPTrie(f, "trie", utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML); + + fclose(f); + } + handleError(status, basename); +} + +#endif // !UCONFIG_NO_NORMALIZATION + +enum { + OPT_HELP_H, + OPT_HELP_QUESTION_MARK, + OPT_MODE, + OPT_TRIE_TYPE, + OPT_VERSION, + OPT_DESTDIR, + OPT_ALL, + OPT_INDEX, + OPT_COPYRIGHT, + OPT_VERBOSE, + OPT_QUIET, + + OPT_COUNT +}; + +#define UOPTION_MODE UOPTION_DEF("mode", 'm', UOPT_REQUIRES_ARG) +#define UOPTION_TRIE_TYPE UOPTION_DEF("trie-type", '\1', UOPT_REQUIRES_ARG) +#define UOPTION_ALL UOPTION_DEF("all", '\1', UOPT_NO_ARG) +#define UOPTION_INDEX UOPTION_DEF("index", '\1', UOPT_NO_ARG) + +static UOption options[]={ + UOPTION_HELP_H, + UOPTION_HELP_QUESTION_MARK, + UOPTION_MODE, + UOPTION_TRIE_TYPE, + UOPTION_VERSION, + UOPTION_DESTDIR, + UOPTION_ALL, + UOPTION_INDEX, + UOPTION_COPYRIGHT, + UOPTION_VERBOSE, + UOPTION_QUIET, +}; + +void printHelp(FILE* stdfile, const char* program) { + fprintf(stdfile, + "usage: %s -m mode [-options] 
[--all | properties...]\n" + "\tdump Unicode property data to .toml files\n" + "options:\n" + "\t-h or -? or --help this usage text\n" + "\t-V or --version show a version message\n" + "\t-m or --mode mode: currently only 'uprops', 'ucase', and 'norm', but more may be added\n" + "\t --trie-type set the trie type (small or fast, default small)\n" + "\t-d or --destdir destination directory, followed by the path\n" + "\t --all write out all properties known to icuexportdata\n" + "\t --index write an _index.toml summarizing all data exported\n" + "\t-c or --copyright include a copyright notice\n" + "\t-v or --verbose Turn on verbose output\n" + "\t-q or --quiet do not display warnings and progress\n", + program); +} + +int exportUprops(int argc, char* argv[]) { + // Load list of Unicode properties + std::vector<const char*> propNames; + for (int i=1; i<argc; i++) { + propNames.push_back(argv[i]); + } + if (options[OPT_ALL].doesOccur) { + int i = UCHAR_BINARY_START; + while (true) { + if (i == UCHAR_BINARY_LIMIT) { + i = UCHAR_INT_START; + } + if (i == UCHAR_INT_LIMIT) { + i = UCHAR_GENERAL_CATEGORY_MASK; + } + if (i == UCHAR_GENERAL_CATEGORY_MASK + 1) { + i = UCHAR_BIDI_MIRRORING_GLYPH; + } + if (i == UCHAR_BIDI_MIRRORING_GLYPH + 1) { + i = UCHAR_SCRIPT_EXTENSIONS; + } + if (i == UCHAR_SCRIPT_EXTENSIONS + 1) { + break; + } + UProperty uprop = static_cast<UProperty>(i); + const char* propName = u_getPropertyName(uprop, U_SHORT_PROPERTY_NAME); + if (propName == nullptr) { + propName = u_getPropertyName(uprop, U_LONG_PROPERTY_NAME); + if (propName != nullptr && VERBOSE) { + std::cerr << "Note: falling back to long name for: " << propName << std::endl; + } + } + if (propName != nullptr) { + propNames.push_back(propName); + } else { + std::cerr << "Warning: Could not find name for: " << uprop << std::endl; + } + i++; + } + } + + if (propNames.empty() + || options[OPT_HELP_H].doesOccur + || options[OPT_HELP_QUESTION_MARK].doesOccur + || !options[OPT_MODE].doesOccur) { + FILE 
*stdfile=argc<0 ? stderr : stdout; + fprintf(stdfile, + "usage: %s -m uprops [-options] [--all | properties...]\n" + "\tdump Unicode property data to .toml files\n" + "options:\n" + "\t-h or -? or --help this usage text\n" + "\t-V or --version show a version message\n" + "\t-m or --mode mode: currently only 'uprops', but more may be added\n" + "\t --trie-type set the trie type (small or fast, default small)\n" + "\t-d or --destdir destination directory, followed by the path\n" + "\t --all write out all properties known to icuexportdata\n" + "\t --index write an _index.toml summarizing all data exported\n" + "\t-c or --copyright include a copyright notice\n" + "\t-v or --verbose Turn on verbose output\n" + "\t-q or --quiet do not display warnings and progress\n", + argv[0]); + return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; + } + + const char* mode = options[OPT_MODE].value; + if (uprv_strcmp(mode, "uprops") != 0) { + fprintf(stderr, "Invalid option for --mode (must be uprops)\n"); + return U_ILLEGAL_ARGUMENT_ERROR; + } + + if (options[OPT_TRIE_TYPE].doesOccur) { + if (uprv_strcmp(options[OPT_TRIE_TYPE].value, "fast") == 0) { + trieType = UCPTRIE_TYPE_FAST; + } else if (uprv_strcmp(options[OPT_TRIE_TYPE].value, "small") == 0) { + trieType = UCPTRIE_TYPE_SMALL; + } else { + fprintf(stderr, "Invalid option for --trie-type (must be small or fast)\n"); + return U_ILLEGAL_ARGUMENT_ERROR; + } + } + + for (const char* propName : propNames) { + UProperty propEnum = u_getPropertyEnum(propName); + if (propEnum == UCHAR_INVALID_CODE) { + std::cerr << "Error: Invalid property alias: " << propName << std::endl; + return U_ILLEGAL_ARGUMENT_ERROR; + } + + FILE* f = prepareOutputFile(propName); + + UVersionInfo versionInfo; + u_getUnicodeVersion(versionInfo); + char uvbuf[U_MAX_VERSION_STRING_LENGTH]; + u_versionToString(versionInfo, uvbuf); + fprintf(f, "icu_version = \"%s\"\nunicode_version = \"%s\"\n\n", + U_ICU_VERSION, + uvbuf); + + if (propEnum < UCHAR_BINARY_LIMIT) 
{ + dumpBinaryProperty(propEnum, f); + } else if (UCHAR_INT_START <= propEnum && propEnum <= UCHAR_INT_LIMIT) { + dumpEnumeratedProperty(propEnum, f); + } else if (propEnum == UCHAR_GENERAL_CATEGORY_MASK) { + dumpGeneralCategoryMask(f); + } else if (propEnum == UCHAR_BIDI_MIRRORING_GLYPH) { + dumpBidiMirroringGlyph(f); + } else if (propEnum == UCHAR_SCRIPT_EXTENSIONS) { + dumpScriptExtensions(f); + } else { + std::cerr << "Don't know how to write property: " << propEnum << std::endl; + return U_INTERNAL_PROGRAM_ERROR; + } + + fclose(f); + } + + if (options[OPT_INDEX].doesOccur) { + FILE* f = prepareOutputFile("_index"); + fprintf(f, "index = [\n"); + for (const char* propName : propNames) { + // At this point, propName is a valid property name, so it should be alphanum ASCII + fprintf(f, " { filename=\"%s.toml\" },\n", propName); + } + fprintf(f, "]\n"); + fclose(f); + } + + return 0; +} + +struct AddRangeHelper { + UMutableCPTrie* ucptrie; +}; + +static UBool U_CALLCONV +addRangeToUCPTrie(const void* context, UChar32 start, UChar32 end, uint32_t value) { + IcuToolErrorCode status("addRangeToUCPTrie"); + UMutableCPTrie* ucptrie = ((const AddRangeHelper*) context)->ucptrie; + umutablecptrie_setRange(ucptrie, start, end, value, status); + handleError(status, "setRange"); + + return true; +} + +int exportCase(int argc, char* argv[]) { + if (argc > 1) { + fprintf(stderr, "ucase mode does not expect additional arguments\n"); + return U_ILLEGAL_ARGUMENT_ERROR; + } + (void) argv; // Suppress unused variable warning + + IcuToolErrorCode status("icuexportdata"); + LocalUMutableCPTriePointer builder(umutablecptrie_open(0, 0, status)); + handleError(status, "exportCase"); + + int32_t exceptionsLength, unfoldLength; + const UCaseProps *caseProps = ucase_getSingleton(&exceptionsLength, &unfoldLength); + const UTrie2* caseTrie = &caseProps->trie; + + AddRangeHelper helper = { builder.getAlias() }; + utrie2_enum(caseTrie, nullptr, addRangeToUCPTrie, &helper); + + 
UCPTrieValueWidth width = UCPTRIE_VALUE_BITS_16; + LocalUCPTriePointer utrie(umutablecptrie_buildImmutable( + builder.getAlias(), + trieType, + width, + status)); + handleError(status, "exportCase"); + + FILE* f = prepareOutputFile("ucase"); + + UVersionInfo versionInfo; + u_getUnicodeVersion(versionInfo); + char uvbuf[U_MAX_VERSION_STRING_LENGTH]; + u_versionToString(versionInfo, uvbuf); + fprintf(f, "icu_version = \"%s\"\nunicode_version = \"%s\"\n\n", + U_ICU_VERSION, + uvbuf); + + fputs("[ucase.code_point_trie]\n", f); + usrc_writeUCPTrie(f, "case_trie", utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML); + fputs("\n", f); + + const char* indent = " "; + const char* suffix = "\n]\n"; + + fputs("[ucase.exceptions]\n", f); + const char* exceptionsPrefix = "exceptions = [\n "; + int32_t exceptionsWidth = 16; + usrc_writeArray(f, exceptionsPrefix, caseProps->exceptions, exceptionsWidth, + exceptionsLength, indent, suffix); + fputs("\n", f); + + fputs("[ucase.unfold]\n", f); + const char* unfoldPrefix = "unfold = [\n "; + int32_t unfoldWidth = 16; + usrc_writeArray(f, unfoldPrefix, caseProps->unfold, unfoldWidth, + unfoldLength, indent, suffix); + + return 0; +} + +#if !UCONFIG_NO_NORMALIZATION + +int exportNorm() { + IcuToolErrorCode status("icuexportdata: exportNorm"); + USet* backwardCombiningStarters = uset_openEmpty(); + writeCanonicalCompositions(backwardCombiningStarters); + + std::vector<uint16_t> storage16; + std::vector<uint32_t> storage32; + + // Note: the USets are not exported. They are only used to check that a new + // Unicode version doesn't violate expectations that are hard-coded in ICU4X. 
+ USet* nfdDecompositionStartsWithNonStarter = uset_openEmpty(); + USet* nfdDecompositionStartsWithBackwardCombiningStarter = uset_openEmpty(); + std::vector<PendingDescriptor> nfdPendingTrieInsertions; + UChar32 nfdBound = 0x10FFFF; + UChar32 nfcBound = 0x10FFFF; + computeDecompositions("nfd", + backwardCombiningStarters, + storage16, + storage32, + nfdDecompositionStartsWithNonStarter, + nfdDecompositionStartsWithBackwardCombiningStarter, + nfdPendingTrieInsertions, + nfdBound, + nfcBound); + if (!(nfdBound == 0xC0 && nfcBound == 0x300)) { + // Unexpected bounds for NFD/NFC. + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, "exportNorm"); + } + + uint32_t baseSize16 = storage16.size(); + uint32_t baseSize32 = storage32.size(); + + USet* nfkdDecompositionStartsWithNonStarter = uset_openEmpty(); + USet* nfkdDecompositionStartsWithBackwardCombiningStarter = uset_openEmpty(); + std::vector<PendingDescriptor> nfkdPendingTrieInsertions; + UChar32 nfkdBound = 0x10FFFF; + UChar32 nfkcBound = 0x10FFFF; + computeDecompositions("nfkd", + backwardCombiningStarters, + storage16, + storage32, + nfkdDecompositionStartsWithNonStarter, + nfkdDecompositionStartsWithBackwardCombiningStarter, + nfkdPendingTrieInsertions, + nfkdBound, + nfkcBound); + if (!(nfkdBound <= 0xC0 && nfkcBound <= 0x300)) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, "exportNorm"); + } + if (nfkcBound > 0xC0) { + if (nfkdBound != 0xC0) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, "exportNorm"); + } + } else { + if (nfkdBound != nfkcBound) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, "exportNorm"); + } + } + + USet* uts46DecompositionStartsWithNonStarter = uset_openEmpty(); + USet* uts46DecompositionStartsWithBackwardCombiningStarter = uset_openEmpty(); + std::vector<PendingDescriptor> uts46PendingTrieInsertions; + UChar32 uts46dBound = 0x10FFFF; + UChar32 uts46Bound = 0x10FFFF; + computeDecompositions("uts46d", + backwardCombiningStarters, + 
storage16, + storage32, + uts46DecompositionStartsWithNonStarter, + uts46DecompositionStartsWithBackwardCombiningStarter, + uts46PendingTrieInsertions, + uts46dBound, + uts46Bound); + if (!(uts46dBound <= 0xC0 && uts46Bound <= 0x300)) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, "exportNorm"); + } + if (uts46Bound > 0xC0) { + if (uts46dBound != 0xC0) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, "exportNorm"); + } + } else { + if (uts46dBound != uts46Bound) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, "exportNorm"); + } + } + + uint32_t supplementSize16 = storage16.size() - baseSize16; + uint32_t supplementSize32 = storage32.size() - baseSize32; + + writeDecompositionData("nfd", baseSize16, baseSize32, supplementSize16, nfdDecompositionStartsWithNonStarter, nullptr, nfdPendingTrieInsertions, char16_t(nfcBound)); + writeDecompositionData("nfkd", baseSize16, baseSize32, supplementSize16, nfkdDecompositionStartsWithNonStarter, nfdDecompositionStartsWithNonStarter, nfkdPendingTrieInsertions, char16_t(nfkcBound)); + writeDecompositionData("uts46d", baseSize16, baseSize32, supplementSize16, uts46DecompositionStartsWithNonStarter, nfdDecompositionStartsWithNonStarter, uts46PendingTrieInsertions, char16_t(uts46Bound)); + + writeDecompositionTables("nfdex", storage16.data(), baseSize16, storage32.data(), baseSize32); + writeDecompositionTables("nfkdex", storage16.data() + baseSize16, supplementSize16, storage32.data() + baseSize32, supplementSize32); + + uset_close(nfdDecompositionStartsWithNonStarter); + uset_close(nfkdDecompositionStartsWithNonStarter); + uset_close(uts46DecompositionStartsWithNonStarter); + + uset_close(nfdDecompositionStartsWithBackwardCombiningStarter); + uset_close(nfkdDecompositionStartsWithBackwardCombiningStarter); + uset_close(uts46DecompositionStartsWithBackwardCombiningStarter); + + uset_close(backwardCombiningStarters); + handleError(status, "exportNorm"); + return 0; +} + +#endif // 
!UCONFIG_NO_NORMALIZATION + +int main(int argc, char* argv[]) { + U_MAIN_INIT_ARGS(argc, argv); + + /* preset then read command line options */ + options[OPT_DESTDIR].value=u_getDataDirectory(); + argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options); + + if(options[OPT_VERSION].doesOccur) { + printf("icuexportdata version %s, ICU tool to dump data files for external consumers\n", + U_ICU_DATA_VERSION); + printf("%s\n", U_COPYRIGHT_STRING); + exit(0); + } + + /* error handling, printing usage message */ + if(argc<0) { + fprintf(stderr, + "error in command line argument \"%s\"\n", + argv[-argc]); + } + + if (argc < 0 + || options[OPT_HELP_H].doesOccur + || options[OPT_HELP_QUESTION_MARK].doesOccur + || !options[OPT_MODE].doesOccur) { + FILE *stdfile=argc<0 ? stderr : stdout; + printHelp(stdfile, argv[0]); + return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; + } + + /* get the options values */ + haveCopyright = options[OPT_COPYRIGHT].doesOccur; + destdir = options[OPT_DESTDIR].value; + VERBOSE = options[OPT_VERBOSE].doesOccur; + QUIET = options[OPT_QUIET].doesOccur; + + if (options[OPT_TRIE_TYPE].doesOccur) { + if (uprv_strcmp(options[OPT_TRIE_TYPE].value, "fast") == 0) { + trieType = UCPTRIE_TYPE_FAST; + } else if (uprv_strcmp(options[OPT_TRIE_TYPE].value, "small") == 0) { + trieType = UCPTRIE_TYPE_SMALL; + } else { + fprintf(stderr, "Invalid option for --trie-type (must be small or fast)\n"); + return U_ILLEGAL_ARGUMENT_ERROR; + } + } + + const char* mode = options[OPT_MODE].value; + if (uprv_strcmp(mode, "norm") == 0) { +#if !UCONFIG_NO_NORMALIZATION + return exportNorm(); +#else + fprintf(stderr, "Exporting normalization data not supported when compiling without normalization support.\n"); + return U_ILLEGAL_ARGUMENT_ERROR; +#endif + } + if (uprv_strcmp(mode, "uprops") == 0) { + return exportUprops(argc, argv); + } else if (uprv_strcmp(mode, "ucase") == 0) { + return exportCase(argc, argv); + } + + fprintf(stderr, "Invalid option for --mode (must 
be uprops, ucase, or norm)\n"); + return U_ILLEGAL_ARGUMENT_ERROR; +} diff --git a/intl/icu/source/tools/icuexportdata/icuexportdata.vcxproj b/intl/icu/source/tools/icuexportdata/icuexportdata.vcxproj new file mode 100644 index 0000000000..48b4c23cf8 --- /dev/null +++ b/intl/icu/source/tools/icuexportdata/icuexportdata.vcxproj @@ -0,0 +1,80 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <PropertyGroup Label="Globals"> + <ProjectGuid>{C5185F6D-BC0A-4DF7-A63C-B107D1C9C82F}</ProjectGuid> + </PropertyGroup> + <PropertyGroup Label="Configuration"> + <ConfigurationType>Application</ConfigurationType> + <UseOfMfc>false</UseOfMfc> + <CharacterSet>MultiByte</CharacterSet> + </PropertyGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> + <!-- The following import will include the 'default' configuration options for VS projects. --> + <Import Project="..\..\allinone\Build.Windows.ProjectConfiguration.props" /> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> + <ImportGroup Label="ExtensionSettings"> + </ImportGroup> + <PropertyGroup Label="UserMacros" /> + <PropertyGroup> + <_ProjectFileVersion>10.0.30319.1</_ProjectFileVersion> + <OutDir>.\$(Platform)\$(Configuration)\</OutDir> + <IntDir>.\$(Platform)\$(Configuration)\</IntDir> + <!-- The ICU projects use "Win32" to mean "x86", so we need to special case it. 
--> + <OutDir Condition="'$(Platform)'=='Win32'">.\x86\$(Configuration)\</OutDir> + <IntDir Condition="'$(Platform)'=='Win32'">.\x86\$(Configuration)\</IntDir> + <!-- Disable Incremental Linking for Release builds as it prevents Link-time Code Generation --> + <LinkIncremental Condition="'$(Configuration)'=='Debug'">true</LinkIncremental> + <LinkIncremental Condition="'$(Configuration)'=='Release'">false</LinkIncremental> + </PropertyGroup> + <!-- Options that are common to *all* configurations --> + <ItemDefinitionGroup> + <Midl> + <TypeLibraryName>$(OutDir)/icuexportdata.tlb</TypeLibraryName> + </Midl> + <ClCompile> + <WarningLevel>Level3</WarningLevel> + <CompileAs>Default</CompileAs> + <DisableLanguageExtensions>false</DisableLanguageExtensions> + <AdditionalIncludeDirectories>..\..\..\include;..\..\common;..\toolutil;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> + <PrecompiledHeaderOutputFile>$(OutDir)/icuexportdata.pch</PrecompiledHeaderOutputFile> + <AssemblerListingLocation>$(OutDir)/</AssemblerListingLocation> + <ObjectFileName>$(OutDir)/</ObjectFileName> + <ProgramDataBaseFileName>$(OutDir)/icuexportdata.pdb</ProgramDataBaseFileName> + </ClCompile> + <Link> + <SubSystem>Console</SubSystem> + <OutputFile>$(OutDir)/icuexportdata.exe</OutputFile> + <AdditionalLibraryDirectories>..\..\..\$(IcuLibOutputDir);%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories> + </Link> + <CustomBuildStep> + <Command>copy "$(TargetPath)" ..\..\..\$(IcuBinOutputDir)</Command> + <Outputs>..\..\..\$(IcuBinOutputDir)\$(TargetFileName);%(Outputs)</Outputs> + </CustomBuildStep> + </ItemDefinitionGroup> + <!-- Options that are common to all 'Debug' project configurations --> + <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'"> + <ClCompile> + <BrowseInformation>true</BrowseInformation> + <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary> + </ClCompile> + <Link> + 
<AdditionalDependencies>icuucd.lib;icutud.lib;%(AdditionalDependencies)</AdditionalDependencies> + </Link> + </ItemDefinitionGroup> + <!-- Options that are common to all 'Release' project configurations --> + <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'"> + <ClCompile> + <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary> + <FunctionLevelLinking>true</FunctionLevelLinking> + </ClCompile> + <Link> + <AdditionalDependencies>icuuc.lib;icutu.lib;%(AdditionalDependencies)</AdditionalDependencies> + </Link> + </ItemDefinitionGroup> + <ItemGroup> + <ClCompile Include="icuexportdata.cpp" /> + </ItemGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> + <ImportGroup Label="ExtensionTargets"> + </ImportGroup> +</Project>
\ No newline at end of file diff --git a/intl/icu/source/tools/icuexportdata/icuexportdata.vcxproj.filters b/intl/icu/source/tools/icuexportdata/icuexportdata.vcxproj.filters new file mode 100644 index 0000000000..02b3257ba3 --- /dev/null +++ b/intl/icu/source/tools/icuexportdata/icuexportdata.vcxproj.filters @@ -0,0 +1,22 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <ItemGroup> + <Filter Include="Source Files"> + <UniqueIdentifier>{7641b9da-f313-4ee0-8c60-2c8050c87e45}</UniqueIdentifier> + <Extensions>cpp;c;cxx;rc;def;r;odl;idl;hpj;bat</Extensions> + </Filter> + <Filter Include="Header Files"> + <UniqueIdentifier>{0333a61f-f79b-490c-9761-a4e5966f3ff0}</UniqueIdentifier> + <Extensions>h;hpp;hxx;hm;inl</Extensions> + </Filter> + <Filter Include="Resource Files"> + <UniqueIdentifier>{05869d75-29f4-43d9-bebc-9973e550d958}</UniqueIdentifier> + <Extensions>ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe</Extensions> + </Filter> + </ItemGroup> + <ItemGroup> + <ClCompile Include="icuexportdata.cpp"> + <Filter>Source Files</Filter> + </ClCompile> + </ItemGroup> +</Project>
\ No newline at end of file diff --git a/intl/icu/source/tools/icuexportdata/sources.txt b/intl/icu/source/tools/icuexportdata/sources.txt new file mode 100644 index 0000000000..13520ecb2f --- /dev/null +++ b/intl/icu/source/tools/icuexportdata/sources.txt @@ -0,0 +1 @@ +icuexportdata.cpp diff --git a/intl/icu/source/tools/icuinfo/Makefile.in b/intl/icu/source/tools/icuinfo/Makefile.in new file mode 100644 index 0000000000..68eeb084bf --- /dev/null +++ b/intl/icu/source/tools/icuinfo/Makefile.in @@ -0,0 +1,117 @@ +## Makefile.in for ICU - tools/icuinfo +## Copyright (C) 2016 and later: Unicode, Inc. and others. +## License & terms of use: http://www.unicode.org/copyright.html +## Copyright (c) 1999-2015, International Business Machines Corporation and +## others. All Rights Reserved. +## Madhu Katragadda + +## Source directory information +srcdir = @srcdir@ +top_srcdir = @top_srcdir@ + +top_builddir = ../.. + +include $(top_builddir)/icudefs.mk + +## Build directory information +subdir = tools/icuinfo + +## Extra files to remove for 'make clean' +CLEANFILES = *~ $(DEPS) $(PLUGIN_OBJECTS) $(PLUGINFILE) $(PLUGIN) + +## Target information +TARGET = icuinfo$(EXEEXT) + +CPPFLAGS += -I$(top_srcdir)/common -I$(srcdir)/../toolutil -I$(top_srcdir)/tools/ctestfw +CPPFLAGS+= -I$(top_srcdir)/i18n +LIBS = $(LIBICUTOOLUTIL) $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS) $(LIB_M) + +SOURCES = $(shell cat $(srcdir)/sources.txt) +OBJECTS = $(SOURCES:.cpp=.o) +@PLUGINS_TRUE@PLUGIN_SOURCES = $(shell cat $(srcdir)/plugin_sources.txt) +@PLUGINS_TRUE@PLUGIN_OBJECTS = $(@PLUGINS_TRUE@PLUGIN_SOURCES:.c=.o) + +DEPS = $(OBJECTS:.o=.d) + +# pass some information + +ICUINFO_OPTS=-i ../../data/out/build/$(ICUDATA_PLATFORM_NAME) -x $(top_builddir)/config/icuinfo.xml -v -K + +## List of phony targets +.PHONY : all all-local install install-local clean clean-local \ +distclean distclean-local dist dist-local check check-local plugin-check + +## Clear suffix list +.SUFFIXES : + +## List of standard 
targets +all: all-local +install: install-local +clean: clean-local +distclean : distclean-local +dist: dist-local +check: all check-local + +all-local: $(TARGET) + +install-local: all-local + $(MKINSTALLDIRS) $(DESTDIR)$(bindir) + $(INSTALL) $(TARGET) $(DESTDIR)$(bindir) + +dist-local: + +clean-local: + test -z "$(CLEANFILES)" || $(RMV) $(CLEANFILES) + $(RMV) $(TARGET) $(OBJECTS) + +distclean-local: clean-local + $(RMV) Makefile + +check-local: $(TARGET) + $(INVOKE) ./$(TARGET) $(ICUINFO_OPTS) + +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + cd $(top_builddir) \ + && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status +-include Makefile.local + +$(TARGET) : $(OBJECTS) + $(LINK.cc) $(OUTOPT)$@ $^ $(LIBS) + $(POST_BUILD_STEP) + +PLUGIN=$(LIBPREFIX)plugin.$(SO) +SO_TARGET=$(PLUGIN) + +PLUGINDIR=$(shell pwd) + +PLUGINFILE=$(PLUGINDIR)/icuplugins$(SO_TARGET_VERSION_MAJOR).txt + +CFLAGS+=$(SHAREDLIBCFLAGS) + +@PLUGINS_TRUE@HAVE_PLUGINS=yes + +ifeq ($(HAVE_PLUGINS),yes) +$(PLUGINFILE): Makefile + echo "$(CURR_FULL_DIR)/$(PLUGIN) myPlugin x=4" > $@ + + +$(PLUGIN): $(PLUGIN_OBJECTS) + $(SHLIB.cc) $(SHAREDLIBCFLAGS) $(LD_SONAME) $(OUTOPT)$@ $^ $(LIBS) + +plugin: $(PLUGIN) + +plugin-check: $(PLUGIN) $(PLUGINFILE) + $(INVOKE) ICU_PLUGINS="$(CURR_FULL_DIR)" ./$(TARGET) -v -L +else +plugin plugin-check $(PLUGIN): + @echo "Plugins are disabled (use --enable-plugins to enable)" +endif + +ifeq (,$(MAKECMDGOALS)) +-include $(DEPS) +else +ifneq ($(patsubst %clean,,$(MAKECMDGOALS)),) +-include $(DEPS) +endif +endif + diff --git a/intl/icu/source/tools/icuinfo/icuinfo.cpp b/intl/icu/source/tools/icuinfo/icuinfo.cpp new file mode 100644 index 0000000000..39dc6ab890 --- /dev/null +++ b/intl/icu/source/tools/icuinfo/icuinfo.cpp @@ -0,0 +1,307 @@ +// © 2016 and later: Unicode, Inc. and others. 
+// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 1999-2016, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: icuinfo.cpp +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2009-2010 +* created by: Steven R. Loomis +* +* This program shows some basic info about the current ICU. +*/ + +#include <stdio.h> +#include <stdlib.h> +#include "unicode/utypes.h" +#include "unicode/putil.h" +#include "unicode/uclean.h" +#include "udbgutil.h" +#include "unewdata.h" +#include "cmemory.h" +#include "cstring.h" +#include "uoptions.h" +#include "toolutil.h" +#include "icuplugimp.h" +#include <unicode/uloc.h> +#include <unicode/ucnv.h> +#include "unicode/ucal.h" +#include <unicode/ulocdata.h> +#include "putilimp.h" +#include "unicode/uchar.h" + +static UOption options[]={ + /*0*/ UOPTION_HELP_H, + /*1*/ UOPTION_HELP_QUESTION_MARK, + /*2*/ UOPTION_ICUDATADIR, + /*3*/ UOPTION_VERBOSE, + /*4*/ UOPTION_DEF("list-plugins", 'L', UOPT_NO_ARG), // may be a no-op if disabled + /*5*/ UOPTION_DEF("milisecond-time", 'm', UOPT_NO_ARG), + /*6*/ UOPTION_DEF("cleanup", 'K', UOPT_NO_ARG), + /*7*/ UOPTION_DEF("xml", 'x', UOPT_REQUIRES_ARG), +}; + +static UErrorCode initStatus = U_ZERO_ERROR; +static UBool icuInitted = false; + +static void do_init() { + if(!icuInitted) { + u_init(&initStatus); + icuInitted = true; + } +} + +static void do_cleanup() { + if (icuInitted) { + u_cleanup(); + icuInitted = false; + } +} + +void cmd_millis() +{ + printf("Milliseconds since Epoch: %.0f\n", uprv_getUTCtime()); +} + +void cmd_version(UBool /* noLoad */, UErrorCode &errorCode) +{ + + do_init(); + + udbg_writeIcuInfo(stdout); /* print the XML format */ + + union { + uint8_t byte; + uint16_t word; + } u; + u.word=0x0100; + 
if(U_IS_BIG_ENDIAN==u.byte) { + //printf("U_IS_BIG_ENDIAN: %d\n", U_IS_BIG_ENDIAN); + } else { + fprintf(stderr, " error: U_IS_BIG_ENDIAN=%d != %d=actual 'is big endian'\n", + U_IS_BIG_ENDIAN, u.byte); + errorCode=U_INTERNAL_PROGRAM_ERROR; + } + +#if defined(_MSC_VER) +// Ignore warning 4127, conditional expression is constant. This is intentional below. +#pragma warning(push) +#pragma warning(disable: 4127) +#endif + + if(U_SIZEOF_WCHAR_T==sizeof(wchar_t)) { + //printf("U_SIZEOF_WCHAR_T: %d\n", U_SIZEOF_WCHAR_T); + } else { + fprintf(stderr, " error: U_SIZEOF_WCHAR_T=%d != %d=sizeof(wchar_t)\n", + U_SIZEOF_WCHAR_T, (int)sizeof(wchar_t)); + errorCode=U_INTERNAL_PROGRAM_ERROR; + } + + int charsetFamily; + if('A'==0x41) { + charsetFamily=U_ASCII_FAMILY; + } else if('A'==0xc1) { + charsetFamily=U_EBCDIC_FAMILY; + } else { + charsetFamily=-1; // unknown + } + if(U_CHARSET_FAMILY==charsetFamily) { + //printf("U_CHARSET_FAMILY: %d\n", U_CHARSET_FAMILY); + } else { + fprintf(stderr, " error: U_CHARSET_FAMILY=%d != %d=actual charset family\n", + U_CHARSET_FAMILY, charsetFamily); + errorCode=U_INTERNAL_PROGRAM_ERROR; + } + +#if defined(_MSC_VER) +#pragma warning(pop) +#endif + + printf("\n\nICU Initialization returned: %s\n", u_errorName(initStatus)); + + +#if UCONFIG_ENABLE_PLUGINS +#if U_ENABLE_DYLOAD + const char *pluginFile = uplug_getPluginFile(); + printf("Plugin file is: %s\n", (pluginFile&&*pluginFile)?pluginFile:"(not set. try setting ICU_PLUGINS to a directory.)"); +#else + fprintf(stderr, "Dynamic Loading: is disabled. 
No plugins will be loaded at start-up.\n"); +#endif +#else + fprintf(stderr, "Plugins are disabled.\n"); +#endif +} + +void cmd_cleanup() +{ + u_cleanup(); + fprintf(stdout, "ICU u_cleanup() called.\n"); +} + + +void cmd_listplugins() { +#if UCONFIG_ENABLE_PLUGINS + int32_t i; + UPlugData *plug; + + do_init(); + printf("ICU Initialized: u_init() returned %s\n", u_errorName(initStatus)); + + printf("Plugins: \n"); + printf( "# %6s %s \n", + "Level", + "Name" ); + printf( " %10s:%-10s\n", + "Library", + "Symbol" + ); + + + printf( " config| (configuration string)\n"); + printf( " >>> Error | Explanation \n"); + printf( "-----------------------------------\n"); + + for(i=0;(plug=uplug_getPlugInternal(i))!=nullptr;i++) { + UErrorCode libStatus = U_ZERO_ERROR; + const char *name = uplug_getPlugName(plug); + const char *sym = uplug_getSymbolName(plug); + const char *lib = uplug_getLibraryName(plug, &libStatus); + const char *config = uplug_getConfiguration(plug); + UErrorCode loadStatus = uplug_getPlugLoadStatus(plug); + const char *message = nullptr; + + printf("\n#%d %-6s %s \n", + i+1, + udbg_enumName(UDBG_UPlugLevel,(int32_t)uplug_getPlugLevel(plug)), + name!=nullptr?(*name?name:"this plugin did not call uplug_setPlugName()"):"(null)" + ); + printf(" plugin| %10s:%-10s\n", + (U_SUCCESS(libStatus)?(lib!=nullptr?lib:"(null)"):u_errorName(libStatus)), + sym!=nullptr?sym:"(null)" + ); + + if(config!=nullptr&&*config) { + printf(" config| %s\n", config); + } + + switch(loadStatus) { + case U_PLUGIN_CHANGED_LEVEL_WARNING: + message = "Note: This plugin changed the system level (by allocating memory or calling something which does). Later plugins may not load."; + break; + + case U_PLUGIN_DIDNT_SET_LEVEL: + message = "Error: This plugin did not call uplug_setPlugLevel during QUERY."; + break; + + case U_PLUGIN_TOO_HIGH: + message = "Error: This plugin couldn't load because the system level was too high. 
Try loading this plugin earlier."; + break; + + case U_ZERO_ERROR: + message = nullptr; /* no message */ + break; + default: + if(U_FAILURE(loadStatus)) { + message = "error loading:"; + } else { + message = "warning during load:"; + } + } + + if(message!=nullptr) { + printf("\\\\\\ status| %s\n" + "/// %s\n", u_errorName(loadStatus), message); + } + + } + if(i==0) { + printf("No plugins loaded.\n"); + } +#endif +} + + + +extern int +main(int argc, char* argv[]) { + UErrorCode errorCode = U_ZERO_ERROR; + UBool didSomething = false; + + /* preset then read command line options */ + argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options); + + /* error handling, printing usage message */ + if(argc<0) { + fprintf(stderr, + "error in command line argument \"%s\"\n", + argv[-argc]); + } + if( options[0].doesOccur || options[1].doesOccur) { + fprintf(stderr, "%s: Output information about the current ICU\n", argv[0]); + fprintf(stderr, "Options:\n" + " -h or --help - Print this help message.\n" + " -m or --millisecond-time - Print the current UTC time in milliseconds.\n" + " -d <dir> or --icudatadir <dir> - Set the ICU Data Directory\n" + " -v - Print version and configuration information about ICU\n" +#if UCONFIG_ENABLE_PLUGINS + " -L or --list-plugins - List and diagnose issues with ICU Plugins\n" +#endif + " -K or --cleanup - Call u_cleanup() before exiting (will attempt to unload plugins)\n" + "\n" + "If no arguments are given, the tool will print ICU version and configuration information.\n" + ); + fprintf(stderr, "International Components for Unicode %s\n%s\n", U_ICU_VERSION, U_COPYRIGHT_STRING ); + return argc<0 ? 
U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; + } + + if(options[2].doesOccur) { + u_setDataDirectory(options[2].value); + } + + if(options[5].doesOccur) { + cmd_millis(); + didSomething=true; + } + if(options[4].doesOccur) { + cmd_listplugins(); + didSomething = true; + } + + if(options[3].doesOccur) { + cmd_version(false, errorCode); + didSomething = true; + } + + if(options[7].doesOccur) { /* 2nd part of version: cleanup */ + FILE *out = fopen(options[7].value, "w"); + if(out==nullptr) { + fprintf(stderr,"ERR: can't write to XML file %s\n", options[7].value); + return 1; + } + /* todo: API for writing DTD? */ + fprintf(out, "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n"); + udbg_writeIcuInfo(out); + fclose(out); + didSomething = true; + } + + if(options[6].doesOccur) { /* 2nd part of version: cleanup */ + cmd_cleanup(); + didSomething = true; + } + + if(!didSomething) { + cmd_version(false, errorCode); /* at least print the version # */ + } + + do_cleanup(); + + return U_FAILURE(errorCode); +} diff --git a/intl/icu/source/tools/icuinfo/icuinfo.vcxproj b/intl/icu/source/tools/icuinfo/icuinfo.vcxproj new file mode 100644 index 0000000000..ac8bfa88eb --- /dev/null +++ b/intl/icu/source/tools/icuinfo/icuinfo.vcxproj @@ -0,0 +1,83 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <PropertyGroup Label="Globals"> + <ProjectGuid>{E7611F49-F088-4175-9446-6111444E72C8}</ProjectGuid> + </PropertyGroup> + <PropertyGroup Label="Configuration"> + <ConfigurationType>Application</ConfigurationType> + <UseOfMfc>false</UseOfMfc> + <CharacterSet>MultiByte</CharacterSet> + </PropertyGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> + <!-- The following import will include the 'default' configuration options for VS projects. 
--> + <Import Project="..\..\allinone\Build.Windows.ProjectConfiguration.props" /> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> + <ImportGroup Label="ExtensionSettings"> + </ImportGroup> + <PropertyGroup Label="UserMacros" /> + <PropertyGroup> + <_ProjectFileVersion>10.0.30319.1</_ProjectFileVersion> + <OutDir>.\$(Platform)\$(Configuration)\</OutDir> + <IntDir>.\$(Platform)\$(Configuration)\</IntDir> + <!-- The ICU projects use "Win32" to mean "x86", so we need to special case it. --> + <OutDir Condition="'$(Platform)'=='Win32'">.\x86\$(Configuration)\</OutDir> + <IntDir Condition="'$(Platform)'=='Win32'">.\x86\$(Configuration)\</IntDir> + <!-- Disable Incremental Linking for Release builds as it prevents Link-time Code Generation --> + <LinkIncremental Condition="'$(Configuration)'=='Debug'">true</LinkIncremental> + <LinkIncremental Condition="'$(Configuration)'=='Release'">false</LinkIncremental> + </PropertyGroup> + <!-- Options that are common to *all* configurations --> + <ItemDefinitionGroup> + <Midl> + <TypeLibraryName>$(OutDir)\icuinfo.tlb</TypeLibraryName> + </Midl> + <ClCompile> + <WarningLevel>Level4</WarningLevel> + <CompileAs>Default</CompileAs> + <DisableLanguageExtensions>false</DisableLanguageExtensions> + <AdditionalIncludeDirectories>..\..\common;..\..\i18n;..\toolutil;..\ctestfw;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> + <PrecompiledHeaderOutputFile>$(OutDir)\icuinfo.pch</PrecompiledHeaderOutputFile> + <AssemblerListingLocation>$(OutDir)/</AssemblerListingLocation> + <ObjectFileName>$(OutDir)/</ObjectFileName> + <ProgramDataBaseFileName>$(OutDir)\icuinfo.pdb</ProgramDataBaseFileName> + </ClCompile> + <Link> + <SubSystem>Console</SubSystem> + <OutputFile>$(OutDir)\icuinfo.exe</OutputFile> + <AdditionalLibraryDirectories>..\..\..\$(IcuLibOutputDir);%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories> + </Link> + <CustomBuildStep> + <Command>copy "$(TargetPath)" ..\..\..\$(IcuBinOutputDir)</Command> + 
<Outputs>..\..\..\$(IcuBinOutputDir)\$(TargetFileName);%(Outputs)</Outputs> + </CustomBuildStep> + </ItemDefinitionGroup> + <!-- Options that are common to all 'Debug' project configurations --> + <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'"> + <ClCompile> + <BrowseInformation>true</BrowseInformation> + <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary> + </ClCompile> + <Link> + <AdditionalDependencies>icuucd.lib;icuind.lib;icutud.lib;icutestd.lib;%(AdditionalDependencies)</AdditionalDependencies> + </Link> + </ItemDefinitionGroup> + <!-- Options that are common to all 'Release' project configurations --> + <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'"> + <ClCompile> + <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary> + <FunctionLevelLinking>true</FunctionLevelLinking> + </ClCompile> + <Link> + <AdditionalDependencies>icuuc.lib;icuin.lib;icutu.lib;icutest.lib;%(AdditionalDependencies)</AdditionalDependencies> + </Link> + </ItemDefinitionGroup> + <ItemGroup> + <ClCompile Include="icuinfo.cpp" /> + </ItemGroup> + <ItemGroup> + <None Include="icuplugins_windows_sample.txt" /> + </ItemGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> + <ImportGroup Label="ExtensionTargets"> + </ImportGroup> +</Project>
\ No newline at end of file diff --git a/intl/icu/source/tools/icuinfo/icuplugins_windows_sample.txt b/intl/icu/source/tools/icuinfo/icuplugins_windows_sample.txt new file mode 100644 index 0000000000..936e917b63 --- /dev/null +++ b/intl/icu/source/tools/icuinfo/icuplugins_windows_sample.txt @@ -0,0 +1,59 @@ +# Copyright (C) 2016 and later: Unicode, Inc. and others. +# License & terms of use: http://www.unicode.org/copyright.html +# Copyright (C) 2009-2010 IBM Corporation and Others. All Rights Reserved. +# +# This is a sample ICU Plugins control file for Windows. +# It's also an example control file for any platform. +# +# This file can be copied to, for example, C:\SOMEDIRECTORY\icuplugins##.txt +# where ## is the major and minor ICU versions (i.e. just 96 for version 9.6.3) +# and C:\SOMEDIRECTORY is any directory. +# +# Then, set the variable ICU_PLUGINS to C:\SOMEDIRECTORY +# +# Then, ICU will load the test plugin from either the debug or non-debug +# plugin DLL (depending on whether ICU is in debug or non-debug state). +# +# To see the results, run the command "icuinfo -v -L" +# +# The format of this file is pretty simple. +# These lines are comments. +# +# Non-comment lines have two or three elements in them, and look like this: +# +# LIBRARYNAME ENTRYPOINT [ CONFIGURATION .. ] +# +# Tabs or spaces separate the three items. +# +# LIBRARYNAME is the name of a shared library, either a short name if it is on the PATH, +# or a full pathname. +# +# ENTRYPOINT is the short (undecorated) symbol name of the plugin's entrypoint. +# see unicode/icuplug.h for information. +# +# CONFIGURATION is the entire rest of the line. It's passed as-is to the plugin. +# +# +# This sample file tries to load 'myPlugin'. +# It is in the testplug project. (You will need to rebuild either the debug or release version of this DLL.) +# The configuration string isn't used, but is just an example + +## A high level test plugin that does nothing. 
+testplug.dll myPlugin hello=world + +## A "bad" plugin that is low level but performs a malloc. +## Sometimes this is desired, but, note that it may cause +## later plugins to fail to load. +#testplug.dll myPluginBad hello=world + +## A "high-level" plugin that does nothing. +## It will be loaded after the low level plugins. +#testplug.dll myPluginHigh + +## A "low-level" plugin that does nothing. +## It will be loaded before the high level plugins. +#testplug.dll myPluginLow + +## A low level plugin that just prints a message when uprv_malloc and related functions are called +## Note, it cannot be unloaded. +#testplug.dll debugMemoryPlugin diff --git a/intl/icu/source/tools/icuinfo/plugin_sources.txt b/intl/icu/source/tools/icuinfo/plugin_sources.txt new file mode 100644 index 0000000000..7d5e663382 --- /dev/null +++ b/intl/icu/source/tools/icuinfo/plugin_sources.txt @@ -0,0 +1 @@ +testplug.c diff --git a/intl/icu/source/tools/icuinfo/sources.txt b/intl/icu/source/tools/icuinfo/sources.txt new file mode 100644 index 0000000000..67b9aa2df8 --- /dev/null +++ b/intl/icu/source/tools/icuinfo/sources.txt @@ -0,0 +1 @@ +icuinfo.cpp diff --git a/intl/icu/source/tools/icuinfo/testplug.c b/intl/icu/source/tools/icuinfo/testplug.c new file mode 100644 index 0000000000..8b48bc66d4 --- /dev/null +++ b/intl/icu/source/tools/icuinfo/testplug.c @@ -0,0 +1,212 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +****************************************************************************** +* +* Copyright (C) 2009-2015, International Business Machines +* Corporation and others. All Rights Reserved. +* +****************************************************************************** +* +* FILE NAME : testplug.c +* +* Date Name Description +* 10/29/2009 srl New. +****************************************************************************** +* +* +* This file implements a number of example ICU plugins. 
+* +*/ + +#include "unicode/icuplug.h" + +#if UCONFIG_ENABLE_PLUGINS +/* This file isn't usually compiled except on Windows. Guard it. */ + +#include <stdbool.h> +#include <stdio.h> /* for fprintf */ +#include <stdlib.h> /* for malloc */ +#include "udbgutil.h" +#include "unicode/uclean.h" +#include "cmemory.h" + +/** + * Prototypes + */ +#define DECLARE_PLUGIN(x) U_CAPI UPlugTokenReturn U_EXPORT2 x (UPlugData *data, UPlugReason reason, UErrorCode *status) + +DECLARE_PLUGIN(myPlugin); +DECLARE_PLUGIN(myPluginLow); +DECLARE_PLUGIN(myPluginFailQuery); +DECLARE_PLUGIN(myPluginFailToken); +DECLARE_PLUGIN(myPluginBad); +DECLARE_PLUGIN(myPluginHigh); +DECLARE_PLUGIN(debugMemoryPlugin); + +/** + * A simple, trivial plugin. + */ + +U_CAPI +UPlugTokenReturn U_EXPORT2 myPlugin ( + UPlugData *data, + UPlugReason reason, + UErrorCode *status) { + /* Just print this for debugging */ + fprintf(stderr,"MyPlugin: data=%p, reason=%s, status=%s\n", (void*)data, udbg_enumName(UDBG_UPlugReason,(int32_t)reason), u_errorName(*status)); + + if(reason==UPLUG_REASON_QUERY) { + uplug_setPlugName(data, "Just a Test High-Level Plugin"); /* This call is optional in response to UPLUG_REASON_QUERY, but is a good idea. */ + uplug_setPlugLevel(data, UPLUG_LEVEL_HIGH); /* This call is Mandatory in response to UPLUG_REASON_QUERY */ + } + + return UPLUG_TOKEN; /* This must always be returned, to indicate that the entrypoint was actually a plugin. */ +} + + +U_CAPI +UPlugTokenReturn U_EXPORT2 myPluginLow ( + UPlugData *data, + UPlugReason reason, + UErrorCode *status) { + fprintf(stderr,"MyPluginLow: data=%p, reason=%s, status=%s\n", (void*)data, udbg_enumName(UDBG_UPlugReason,(int32_t)reason), u_errorName(*status)); + + if(reason==UPLUG_REASON_QUERY) { + uplug_setPlugName(data, "Low Plugin"); + uplug_setPlugLevel(data, UPLUG_LEVEL_LOW); + } + + return UPLUG_TOKEN; +} + +/** + * Doesn't respond to QUERY properly. 
+ */ +U_CAPI +UPlugTokenReturn U_EXPORT2 myPluginFailQuery ( + UPlugData *data, + UPlugReason reason, + UErrorCode *status) { + fprintf(stderr,"MyPluginFailQuery: data=%p, reason=%s, status=%s\n", (void*)data, udbg_enumName(UDBG_UPlugReason,(int32_t)reason), u_errorName(*status)); + + /* Should respond to UPLUG_REASON_QUERY here. */ + + return UPLUG_TOKEN; +} + +/** + * Doesn't return the proper token. + */ +U_CAPI +UPlugTokenReturn U_EXPORT2 myPluginFailToken ( + UPlugData *data, + UPlugReason reason, + UErrorCode *status) { + fprintf(stderr,"MyPluginFailToken: data=%p, reason=%s, status=%s\n", (void*)data, udbg_enumName(UDBG_UPlugReason,(int32_t)reason), u_errorName(*status)); + + if(reason==UPLUG_REASON_QUERY) { + uplug_setPlugName(data, "myPluginFailToken Plugin"); + uplug_setPlugLevel(data, UPLUG_LEVEL_LOW); + } + + return 0; /* Wrong. */ +} + + + +/** + * Says it's low, but isn't: it calls uprv_malloc() on LOAD and frees that context on UNLOAD. + */ +U_CAPI +UPlugTokenReturn U_EXPORT2 myPluginBad ( + UPlugData *data, + UPlugReason reason, + UErrorCode *status) { + fprintf(stderr,"MyPluginBad: data=%p, reason=%s, status=%s\n", (void*)data, udbg_enumName(UDBG_UPlugReason,(int32_t)reason), u_errorName(*status)); /* tag names this entrypoint so its trace is distinguishable from myPluginLow's */ + + if(reason==UPLUG_REASON_QUERY) { + uplug_setPlugName(data, "Bad Plugin"); + uplug_setPlugLevel(data, UPLUG_LEVEL_LOW); + } else if(reason == UPLUG_REASON_LOAD) { + void *ctx = uprv_malloc(12345); + + uplug_setContext(data, ctx); + fprintf(stderr,"I'm %p and I did a bad thing and malloced %p\n", (void*)data, (void*)ctx); + } else if(reason == UPLUG_REASON_UNLOAD) { + void * ctx = uplug_getContext(data); + + uprv_free(ctx); + } + + + return UPLUG_TOKEN; +} + +U_CAPI +UPlugTokenReturn U_EXPORT2 myPluginHigh ( + UPlugData *data, + UPlugReason reason, + UErrorCode *status) { + fprintf(stderr,"MyPluginHigh: data=%p, reason=%s, status=%s\n", (void*)data, udbg_enumName(UDBG_UPlugReason,(int32_t)reason), u_errorName(*status)); + + if(reason==UPLUG_REASON_QUERY) { + uplug_setPlugName(data, "High Plugin"); +
uplug_setPlugLevel(data, UPLUG_LEVEL_HIGH); + } + + return UPLUG_TOKEN; +} + + +/* Debug Memory Plugin (see hpmufn.c): malloc/realloc/free wrappers that log every call to stderr */ +static void * U_CALLCONV myMemAlloc(const void *context, size_t size) { + void *retPtr = (void *)malloc(size); + (void)context; /* unused */ + fprintf(stderr, "MEM: malloc(%d) = %p\n", (int32_t)size, retPtr); + return retPtr; +} + +static void U_CALLCONV myMemFree(const void *context, void *mem) { + (void)context; /* unused */ + + free(mem); + fprintf(stderr, "MEM: free(%p)\n", mem); +} + +static void * U_CALLCONV myMemRealloc(const void *context, void *mem, size_t size) { + void *retPtr; + (void)context; /* unused */ + + + if(mem==NULL) { + retPtr = malloc(size); /* a NULL block means "allocate fresh", as with realloc(NULL, size); returning NULL here would look like an out-of-memory failure */ + } else { + retPtr = realloc(mem, size); + } + fprintf(stderr, "MEM: realloc(%p, %d) = %p\n", mem, (int32_t)size, retPtr); + return retPtr; +} + +U_CAPI +UPlugTokenReturn U_EXPORT2 debugMemoryPlugin ( + UPlugData *data, + UPlugReason reason, + UErrorCode *status) { + fprintf(stderr,"debugMemoryPlugin: data=%p, reason=%s, status=%s\n", (void*)data, udbg_enumName(UDBG_UPlugReason,(int32_t)reason), u_errorName(*status)); + + if(reason==UPLUG_REASON_QUERY) { + uplug_setPlugLevel(data, UPLUG_LEVEL_LOW); + uplug_setPlugName(data, "Memory Plugin"); + } else if(reason==UPLUG_REASON_LOAD) { + u_setMemoryFunctions(uplug_getContext(data), &myMemAlloc, &myMemRealloc, &myMemFree, status); + fprintf(stderr, "MEM: status now %s\n", u_errorName(*status)); + } else if(reason==UPLUG_REASON_UNLOAD) { + fprintf(stderr, "MEM: not possible to unload this plugin (no way to reset memory functions)...\n"); + uplug_setPlugNoUnload(data, true); + } + + return UPLUG_TOKEN; +} + +#endif diff --git a/intl/icu/source/tools/icuinfo/testplug.vcxproj b/intl/icu/source/tools/icuinfo/testplug.vcxproj new file mode 100644 index 0000000000..dac99beb4e --- /dev/null +++ b/intl/icu/source/tools/icuinfo/testplug.vcxproj @@ -0,0 +1,83 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project DefaultTargets="Build" ToolsVersion="14.0"
xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <PropertyGroup Label="Globals"> + <ProjectGuid>{659D0C08-D4ED-4BF3-B02B-2D8D4B5A7A7A}</ProjectGuid> + </PropertyGroup> + <PropertyGroup Label="Configuration"> + <ConfigurationType>DynamicLibrary</ConfigurationType> + <UseOfMfc>false</UseOfMfc> + <CharacterSet>MultiByte</CharacterSet> + </PropertyGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> + <!-- The following import will include the 'default' configuration options for VS projects. --> + <Import Project="..\..\allinone\Build.Windows.ProjectConfiguration.props" /> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> + <ImportGroup Label="ExtensionSettings"> + </ImportGroup> + <PropertyGroup Label="UserMacros" /> + <PropertyGroup> + <_ProjectFileVersion>10.0.30319.1</_ProjectFileVersion> + <OutDir>.\$(Platform)\$(Configuration)\</OutDir> + <IntDir>.\$(Platform)\$(Configuration)\</IntDir> + <!-- The ICU projects use "Win32" to mean "x86", so we need to special case it. 
--> + <OutDir Condition="'$(Platform)'=='Win32'">.\x86\$(Configuration)\</OutDir> + <IntDir Condition="'$(Platform)'=='Win32'">.\x86\$(Configuration)\</IntDir> + <!-- Disable Incremental Linking for Release builds as it prevents Link-time Code Generation --> + <LinkIncremental Condition="'$(Configuration)'=='Debug'">true</LinkIncremental> + <LinkIncremental Condition="'$(Configuration)'=='Release'">false</LinkIncremental> + </PropertyGroup> + <!-- Options that are common to *all* project configurations --> + <ItemDefinitionGroup> + <Midl> + <TypeLibraryName>$(OutDir)\testplug.tlb</TypeLibraryName> + </Midl> + <ClCompile> + <AdditionalIncludeDirectories>..\..\..\include;..\..\common;..\toolutil;..\ctestfw;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> + <PreprocessorDefinitions>T_CTEST_IMPLEMENTATION;%(PreprocessorDefinitions)</PreprocessorDefinitions> + <DisableLanguageExtensions>true</DisableLanguageExtensions> + <WarningLevel>Level3</WarningLevel> + <CompileAs>Default</CompileAs> + <DebugInformationFormat>ProgramDatabase</DebugInformationFormat> + <PrecompiledHeaderOutputFile>$(OutDir)\testplug.pch</PrecompiledHeaderOutputFile> + <AssemblerListingLocation>$(OutDir)/</AssemblerListingLocation> + <ObjectFileName>$(OutDir)/</ObjectFileName> + <ProgramDataBaseFileName>$(OutDir)\testplug.pdb</ProgramDataBaseFileName> + </ClCompile> + <Link> + <AdditionalLibraryDirectories>..\..\..\$(IcuLibOutputDir);%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories> + </Link> + </ItemDefinitionGroup> + <!-- Options that are common to all 'Debug' project configurations --> + <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'"> + <ClCompile> + <BrowseInformation>true</BrowseInformation> + <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary> + </ClCompile> + <Link> + <GenerateDebugInformation>true</GenerateDebugInformation> + <OutputFile>..\..\..\$(IcuBinOutputDir)\testplugd.dll</OutputFile> + 
<ProgramDatabaseFile>..\..\..\$(IcuLibOutputDir)\testplugd.pdb</ProgramDatabaseFile> + <ImportLibrary>..\..\..\$(IcuLibOutputDir)\testplugd.lib</ImportLibrary> + <AdditionalDependencies>icuucd.lib;icutud.lib;%(AdditionalDependencies)</AdditionalDependencies> + </Link> + </ItemDefinitionGroup> + <!-- Options that are common to all 'Release' project configurations --> + <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'"> + <ClCompile> + <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary> + <FunctionLevelLinking>true</FunctionLevelLinking> + </ClCompile> + <Link> + <OutputFile>..\..\..\$(IcuBinOutputDir)\testplug.dll</OutputFile> + <ProgramDatabaseFile>..\..\..\$(IcuLibOutputDir)\testplug.pdb</ProgramDatabaseFile> + <ImportLibrary>..\..\..\$(IcuLibOutputDir)\testplug.lib</ImportLibrary> + <AdditionalDependencies>icuuc.lib;icutu.lib;%(AdditionalDependencies)</AdditionalDependencies> + </Link> + </ItemDefinitionGroup> + <ItemGroup> + <ClCompile Include="testplug.c" /> + </ItemGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> + <ImportGroup Label="ExtensionTargets"> + </ImportGroup> +</Project>
\ No newline at end of file diff --git a/intl/icu/source/tools/icuinfo/testplug.vcxproj.filters b/intl/icu/source/tools/icuinfo/testplug.vcxproj.filters new file mode 100644 index 0000000000..5c9125e5dd --- /dev/null +++ b/intl/icu/source/tools/icuinfo/testplug.vcxproj.filters @@ -0,0 +1,22 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <ItemGroup> + <Filter Include="Source Files"> + <UniqueIdentifier>{a83708c0-2dc6-44b5-96e7-01cb39fcc0fe}</UniqueIdentifier> + <Extensions>cpp;c;cxx;rc;def;r;odl;idl;hpj;bat</Extensions> + </Filter> + <Filter Include="Header Files"> + <UniqueIdentifier>{f2528795-0c58-475a-a156-75756f9246eb}</UniqueIdentifier> + <Extensions>h;hpp;hxx;hm;inl</Extensions> + </Filter> + <Filter Include="Resource Files"> + <UniqueIdentifier>{7c635351-2ada-418e-b675-5fbd534925b5}</UniqueIdentifier> + <Extensions>ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe</Extensions> + </Filter> + </ItemGroup> + <ItemGroup> + <ClCompile Include="testplug.c"> + <Filter>Source Files</Filter> + </ClCompile> + </ItemGroup> +</Project>
\ No newline at end of file diff --git a/intl/icu/source/tools/icupkg/Makefile.in b/intl/icu/source/tools/icupkg/Makefile.in new file mode 100644 index 0000000000..45f0b33f83 --- /dev/null +++ b/intl/icu/source/tools/icupkg/Makefile.in @@ -0,0 +1,97 @@ +## Makefile.in for ICU - tools/icupkg +## Copyright (C) 2016 and later: Unicode, Inc. and others. +## License & terms of use: http://www.unicode.org/copyright.html +## Copyright (c) 1999-2011, International Business Machines Corporation and +## others. All Rights Reserved. +## Steven R. Loomis + +## Source directory information +srcdir = @srcdir@ +top_srcdir = @top_srcdir@ + +top_builddir = ../.. + +include $(top_builddir)/icudefs.mk + +## Build directory information +subdir = tools/icupkg + +TARGET_STUB_NAME = icupkg + +SECTION = 8 + +MAN_FILES = $(TARGET_STUB_NAME).$(SECTION) + +## Extra files to remove for 'make clean' +CLEANFILES = *~ $(DEPS) $(MAN_FILES) + +## Target information +TARGET = $(BINDIR)/$(TARGET_STUB_NAME)$(EXEEXT) + +CPPFLAGS += -I$(top_srcdir)/common -I$(srcdir)/../toolutil +LIBS = $(LIBICUTOOLUTIL) $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS) $(LIB_M) + +SOURCES = $(shell cat $(srcdir)/sources.txt) +OBJECTS = $(SOURCES:.cpp=.o) + +DEPS = $(OBJECTS:.o=.d) + +## List of phony targets +.PHONY : all all-local install install-local clean clean-local \ +distclean distclean-local dist dist-local check check-local install-man + +## Clear suffix list +.SUFFIXES : + +## List of standard targets +all: all-local +install: install-local +clean: clean-local +distclean : distclean-local +dist: dist-local +check: all check-local + +all-local: $(TARGET) $(MAN_FILES) + +install-local: all-local install-man + $(MKINSTALLDIRS) $(DESTDIR)$(sbindir) + $(INSTALL) $(TARGET) $(DESTDIR)$(sbindir) + +install-man: $(MAN_FILES) + $(MKINSTALLDIRS) $(DESTDIR)$(mandir)/man$(SECTION) + $(INSTALL_DATA) $? 
$(DESTDIR)$(mandir)/man$(SECTION) + + +dist-local: + +clean-local: + test -z "$(CLEANFILES)" || $(RMV) $(CLEANFILES) + $(RMV) $(TARGET) $(OBJECTS) + +distclean-local: clean-local + $(RMV) Makefile + +check-local: all-local + +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + cd $(top_builddir) \ + && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status + +$(TARGET) : $(OBJECTS) + $(LINK.cc) $(OUTOPT)$@ $^ $(LIBS) + $(POST_BUILD_STEP) + + +%.$(SECTION): $(srcdir)/%.$(SECTION).in + cd $(top_builddir) \ + && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status + + +ifeq (,$(MAKECMDGOALS)) +-include $(DEPS) +else +ifneq ($(patsubst %clean,,$(MAKECMDGOALS)),) +-include $(DEPS) +endif +endif + diff --git a/intl/icu/source/tools/icupkg/icupkg.8.in b/intl/icu/source/tools/icupkg/icupkg.8.in new file mode 100644 index 0000000000..6160ffffa8 --- /dev/null +++ b/intl/icu/source/tools/icupkg/icupkg.8.in @@ -0,0 +1,206 @@ +.\" Hey, Emacs! This is -*-nroff-*- you know... +.\" +.\" icupkg.8: manual page for the icupkg utility +.\" +.\" Copyright (C) 2016 and later: Unicode, Inc. and others. +.\" License & terms of use: http://www.unicode.org/copyright.html +.\" Copyright (C) 2000-2006 IBM, Inc. and others. 
+.\" +.TH ICUPKG 8 "18 August 2006" "ICU MANPAGE" "ICU @VERSION@ Manual" +.SH NAME +.B icupkg +\- extract or modify an ICU +.B .dat +archive +.SH SYNOPSIS +.B icupkg +[ +.BR "\-h\fP, \fB\-?\fP, \fB\-\-help" +] +[ +.BR "\-tl\fP, \fB\-\-type" " l" +| +.BR "\-tb\fP, \fB\-\-type" " b" +| +.BR "\-te\fP, \fB\-\-type" " e" +] +[ +.BR "\-c\fP, \fB\-\-copyright" +| +.BI "\-C\fP, \fB\-\-comment" " comment" +] +[ +.BI "\-a\fP, \fB\-\-add" " list" +] +[ +.BI "\-r\fP, \fB\-\-remove" " list" +] +[ +.BI "\-x\fP, \fB\-\-extract" " list" +] +[ +.BI "\-l\fP, \fB\-\-list" +] +[ +.BI "\-s\fP, \fB\-\-sourcedir" " source" +] +[ +.BI "\-d\fP, \fB\-\-destdir" " destination" +] +[ +.BI "\-w\fP, \fB\-\-writepkg" +] +[ +.BI "\-m\fP, \fB\-\-matchmode" " mode" +] +.IR infilename +[ +.BI "outfilename" +] +.SH DESCRIPTION +.B icupkg +reads the input ICU +.B .dat +package file, modifies it according to the options, +swaps it to the desired platform properties (charset & endianness), +and optionally writes the resulting ICU +.B .dat +package to the output file. +Items are removed, then added, then extracted and listed. +An ICU +.B .dat +package is written if items are removed or added, +or if the input and output filenames differ, +or if the +.BR "\-w\fP, \fB\-\-writepkg" +option is set. +.PP +If the input filename is "new" then an empty package is created. +If the output filename is missing, then it is automatically generated +from the input filename. If the input filename ends with an l, b, or e +matching its platform properties, then the output filename will +contain the letter from the +.BI "\-t\fP, \fB\-\-type" +option. +.PP +This tool can also be used to just swap a single ICU data file, replacing the +former icuswap tool. For this mode, provide the infilename (and optional +outfilename) for a non-package ICU data file. +Allowed options include +.BI "\-t\fP, \fB\-w\fP, \fB\-s\fP" +and +.BI \-d +. +The filenames can be absolute, or relative to the source/dest dir paths. 
+Other options are not allowed in this mode. +.SH OPTIONS +.TP +.BR "\-h\fP, \fB\-?\fP, \fB\-\-help" +Print help about usage and exit. +.TP +.BI "\-tl\fP, \fB\-\-type" " l" +Output for little-endian/ASCII charset family. +The output type defaults to the input type. +.TP +.BI "\-tb\fP, \fB\-\-type" " b" +Output for big-endian/ASCII charset family. +The output type defaults to the input type. +.TP +.BI "\-te\fP, \fB\-\-type" " e" +Output for big-endian/EBCDIC charset family. +The output type defaults to the input type. +.TP +.BR \-c\fP, \fB\-\-copyright +Include the ICU copyright notice in the resulting data. +.TP +.BI "\-C\fP, \fB\-\-comment" " comment" +Include the specified +.I comment +in the resulting data instead of the ICU copyright notice. +.TP +.BI "\-a\fP, \fB\-\-add" " list" +Add items from the +.I list +to the package. The list can be a single filename with a +.B .txt +file extension containing a list of item filenames, or an ICU +.B .dat +package filename. +.TP +.BI "\-r\fP, \fB\-\-remove" " list" +Remove items from the +.I list +from the package. The list can be a single filename with a +.B .txt +file extension containing a list of item filenames, or an ICU +.B .dat +package filename. +.TP +.BI "\-x\fP, \fB\-\-extract" " list" +Extract items from the +.I list +from the package. The list can be a single filename with a +.B .txt +file extension containing a list of item filenames, or an ICU +.B .dat +package filename. +.TP +.BI "\-m\fP, \fB\-\-matchmode" " mode" +Set the matching mode for item names with wildcards. +.TP +.BI "\-s\fP, \fB\-\-sourcedir" " source" +Set the source directory to +.IR source . +The default source directory is the current directory. +.TP +.BI "\-d\fP, \fB\-\-destdir" " destination" +Set the destination directory to +.IR destination . +The default destination directory is the current directory. +.TP +.BI "\-l\fP, \fB\-\-list" +List the package items to stdout (after modifying the package). 
+.SH LIST FILE SYNTAX +Items are listed on one or more lines and separated by whitespace (space+tab). +Comments begin with +.B # +and are ignored. Empty lines are ignored. Lines where the first non-whitespace +character is one of "%&'()*+,-./:;<=>?_ are also ignored +to reserve for future syntax. +.PP +Items for removal or extraction may contain a single +.B * +wildcard character. The +.B * +matches zero or more characters. If +.BI "\-m\fP, \fB\-\-matchmode" " noslash" +is set, then the +.B * +character does not match the +.B / +character. +.PP +Items must be listed relative to the package, and the +.B "\fB\-\-sourcedir" +or the +.B "\fB\-\-destdir" +path will be prepended. The paths are only prepended to item +filenames while adding or extracting items, not to ICU .dat package or list +filenames. +.PP +Paths may contain +.B / +instead of the platform's file separator character and are converted as +appropriate. +.SH AUTHORS +Markus Scherer +.br +George Rhoten +.SH VERSION +1.0 +.SH COPYRIGHT +Copyright (C) 2006 IBM, Inc. and others. +.SH SEE ALSO +.BR pkgdata (1) +.BR genrb (1) + diff --git a/intl/icu/source/tools/icupkg/icupkg.cpp b/intl/icu/source/tools/icupkg/icupkg.cpp new file mode 100644 index 0000000000..392ed58899 --- /dev/null +++ b/intl/icu/source/tools/icupkg/icupkg.cpp @@ -0,0 +1,563 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2005-2014, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: icupkg.cpp +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2005jul29 +* created by: Markus W. Scherer +* +* This tool operates on ICU data (.dat package) files. 
+* It takes one as input, or creates an empty one, and can remove, add, and +* extract data pieces according to command-line options. +* At the same time, it swaps each piece to a consistent set of platform +* properties as desired. +* Useful as an install-time tool for shipping only one flavor of ICU data +* and preparing data files for the target platform. +* Also for customizing ICU data (pruning, augmenting, replacing) and for +* taking it apart. +* Subsumes functionality and implementation code from +* gencmn, decmn, and icuswap tools. +* Will not work with data DLLs (shared libraries). +*/ + +#include "unicode/utypes.h" +#include "unicode/putil.h" +#include "cstring.h" +#include "toolutil.h" +#include "uoptions.h" +#include "uparse.h" +#include "filestrm.h" +#include "package.h" +#include "pkg_icu.h" + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +U_NAMESPACE_USE + +// TODO: add --matchmode=regex for using the ICU regex engine for item name pattern matching? + +// general definitions ----------------------------------------------------- *** + +// main() ------------------------------------------------------------------ *** + +static void +printUsage(const char *pname, UBool isHelp) { + FILE *where=isHelp ? stdout : stderr; + + fprintf(where, + "%csage: %s [-h|-?|--help ] [-tl|-tb|-te] [-c] [-C comment]\n" + "\t[-a list] [-r list] [-x list] [-l [-o outputListFileName]]\n" + "\t[-s path] [-d path] [-w] [-m mode]\n" + "\t[--ignore-deps]\n" + "\t[--auto_toc_prefix] [--auto_toc_prefix_with_type] [--toc_prefix]\n" + "\tinfilename [outfilename]\n", + isHelp ? 
'U' : 'u', pname); + if(isHelp) { + fprintf(where, + "\n" + "Read the input ICU .dat package file, modify it according to the options,\n" + "swap it to the desired platform properties (charset & endianness),\n" + "and optionally write the resulting ICU .dat package to the output file.\n" + "Items are removed, then added, then extracted and listed.\n" + "An ICU .dat package is written if items are removed or added,\n" + "or if the input and output filenames differ,\n" + "or if the --writepkg (-w) option is set.\n"); + fprintf(where, + "\n" + "If the input filename is \"new\" then an empty package is created.\n" + "If the output filename is missing, then it is automatically generated\n" + "from the input filename: If the input filename ends with an l, b, or e\n" + "matching its platform properties, then the output filename will\n" + "contain the letter from the -t (--type) option.\n"); + fprintf(where, + "\n" + "This tool can also be used to just swap a single ICU data file, replacing the\n" + "former icuswap tool. For this mode, provide the infilename (and optional\n" + "outfilename) for a non-package ICU data file.\n" + "Allowed options include -t, -w, -s and -d.\n" + "The filenames can be absolute, or relative to the source/dest dir paths.\n" + "Other options are not allowed in this mode.\n"); + fprintf(where, + "\n" + "Options:\n" + "\t(Only the last occurrence of an option is used.)\n" + "\n" + "\t-h or -? 
or --help print this message and exit\n"); + fprintf(where, + "\n" + "\t-tl or --type l output for little-endian/ASCII charset family\n" + "\t-tb or --type b output for big-endian/ASCII charset family\n" + "\t-te or --type e output for big-endian/EBCDIC charset family\n" + "\t The output type defaults to the input type.\n" + "\n" + "\t-c or --copyright include the ICU copyright notice\n" + "\t-C comment or --comment comment include a comment string\n"); + fprintf(where, + "\n" + "\t-a list or --add list add items to the package\n" + "\t-r list or --remove list remove items from the package\n" + "\t-x list or --extract list extract items from the package\n" + "\tThe list can be a single item's filename,\n" + "\tor a .txt filename with a list of item filenames,\n" + "\tor an ICU .dat package filename.\n"); + fprintf(where, + "\n" + "\t-w or --writepkg write the output package even if no items are removed\n" + "\t or added (e.g., for only swapping the data)\n"); + fprintf(where, + "\n" + "\t-m mode or --matchmode mode set the matching mode for item names with\n" + "\t wildcards\n" + "\t noslash: the '*' wildcard does not match the '/' tree separator\n"); + fprintf(where, + "\n" + "\t--ignore-deps Do not fail if not all resource dependencies are met. Use this\n" + "\t option if the missing resources come from another source."); + fprintf(where, + "\n" + "\tIn the .dat package, the Table of Contents (ToC) contains an entry\n" + "\tfor each item of the form prefix/tree/itemname .\n" + "\tThe prefix normally matches the package basename, and icupkg checks that,\n" + "\tbut this is not necessary when ICU need not find and load the package by filename.\n" + "\tICU package names end with the platform type letter, and thus differ\n" + "\tbetween platform types. 
This is not required for user data packages.\n"); + fprintf(where, + "\n" + "\t--auto_toc_prefix automatic ToC entries prefix\n" + "\t Uses the prefix of the first entry of the\n" + "\t input package, rather than its basename.\n" + "\t Requires a non-empty input package.\n" + "\t--auto_toc_prefix_with_type auto_toc_prefix + adjust platform type\n" + "\t Same as auto_toc_prefix but also checks that\n" + "\t the prefix ends with the input platform\n" + "\t type letter, and modifies it to the output\n" + "\t platform type letter.\n" + "\t At most one of the auto_toc_prefix options\n" + "\t can be used at a time.\n" + "\t--toc_prefix prefix ToC prefix to be used in the output package\n" + "\t Overrides the package basename\n" + "\t and --auto_toc_prefix.\n" + "\t Cannot be combined with --auto_toc_prefix_with_type.\n"); + /* + * Usage text columns, starting after the initial TAB. + * 1 2 3 4 5 6 7 8 + * 901234567890123456789012345678901234567890123456789012345678901234567890 + */ + fprintf(where, + "\n" + "\tList file syntax: Items are listed on one or more lines and separated\n" + "\tby whitespace (space+tab).\n" + "\tComments begin with # and are ignored. Empty lines are ignored.\n" + "\tLines where the first non-whitespace character is one of %s\n" + "\tare also ignored, to reserve for future syntax.\n", + U_PKG_RESERVED_CHARS); + fprintf(where, + "\tItems for removal or extraction may contain a single '*' wildcard\n" + "\tcharacter. 
The '*' matches zero or more characters.\n" + "\tIf --matchmode noslash (-m noslash) is set, then the '*'\n" + "\tdoes not match '/'.\n"); + fprintf(where, + "\n" + "\tItems must be listed relative to the package, and the --sourcedir or\n" + "\tthe --destdir path will be prepended.\n" + "\tThe paths are only prepended to item filenames while adding or\n" + "\textracting items, not to ICU .dat package or list filenames.\n" + "\t\n" + "\tPaths may contain '/' instead of the platform's\n" + "\tfile separator character, and are converted as appropriate.\n"); + fprintf(where, + "\n" + "\t-s path or --sourcedir path directory for the --add items\n" + "\t-d path or --destdir path directory for the --extract items\n" + "\n" + "\t-l or --list list the package items\n" + "\t (after modifying the package)\n" + "\t to stdout or to output list file\n" + "\t-o path or --outlist path path/filename for the --list output\n"); + } +} + +static UOption options[]={ + UOPTION_HELP_H, + UOPTION_HELP_QUESTION_MARK, + UOPTION_DEF("type", 't', UOPT_REQUIRES_ARG), + + UOPTION_COPYRIGHT, + UOPTION_DEF("comment", 'C', UOPT_REQUIRES_ARG), + + UOPTION_SOURCEDIR, + UOPTION_DESTDIR, + + UOPTION_DEF("writepkg", 'w', UOPT_NO_ARG), + + UOPTION_DEF("matchmode", 'm', UOPT_REQUIRES_ARG), + + UOPTION_DEF("ignore-deps", '\1', UOPT_NO_ARG), + + UOPTION_DEF("add", 'a', UOPT_REQUIRES_ARG), + UOPTION_DEF("remove", 'r', UOPT_REQUIRES_ARG), + UOPTION_DEF("extract", 'x', UOPT_REQUIRES_ARG), + + UOPTION_DEF("list", 'l', UOPT_NO_ARG), + UOPTION_DEF("outlist", 'o', UOPT_REQUIRES_ARG), + + UOPTION_DEF("auto_toc_prefix", '\1', UOPT_NO_ARG), + UOPTION_DEF("auto_toc_prefix_with_type", '\1', UOPT_NO_ARG), + UOPTION_DEF("toc_prefix", '\1', UOPT_REQUIRES_ARG) +}; + +enum { + OPT_HELP_H, + OPT_HELP_QUESTION_MARK, + OPT_OUT_TYPE, + + OPT_COPYRIGHT, + OPT_COMMENT, + + OPT_SOURCEDIR, + OPT_DESTDIR, + + OPT_WRITEPKG, + + OPT_MATCHMODE, + + OPT_IGNORE_DEPS, + + OPT_ADD_LIST, + OPT_REMOVE_LIST, + OPT_EXTRACT_LIST, + + 
OPT_LIST_ITEMS, + OPT_LIST_FILE, + + OPT_AUTO_TOC_PREFIX, + OPT_AUTO_TOC_PREFIX_WITH_TYPE, + OPT_TOC_PREFIX, + + OPT_COUNT +}; + +static UBool +isPackageName(const char *filename) { + int32_t len; + + len=(int32_t)strlen(filename)-4; /* -4: subtract the length of ".dat" */ + return (UBool)(len>0 && 0==strcmp(filename+len, ".dat")); +} +/* +This line is required by MinGW because it incorrectly globs the arguments. +So when \* is used, it turns into a list of files instead of a literal "*" +*/ +int _CRT_glob = 0; + +extern int +main(int argc, char *argv[]) { + const char *pname, *sourcePath, *destPath, *inFilename, *outFilename, *outComment; + char outType; + UBool isHelp, isModified, isPackage; + int result = 0; + + Package *pkg, *listPkg, *addListPkg; + + U_MAIN_INIT_ARGS(argc, argv); + + /* get the program basename */ + pname=findBasename(argv[0]); + + argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options); + isHelp=options[OPT_HELP_H].doesOccur || options[OPT_HELP_QUESTION_MARK].doesOccur; + if(isHelp) { + printUsage(pname, true); + return U_ZERO_ERROR; + } + + pkg=new Package; + if(pkg==nullptr) { + fprintf(stderr, "icupkg: not enough memory\n"); + return U_MEMORY_ALLOCATION_ERROR; + } + isModified=false; + + int autoPrefix=0; + if(options[OPT_AUTO_TOC_PREFIX].doesOccur) { + pkg->setAutoPrefix(); + ++autoPrefix; + } + if(options[OPT_AUTO_TOC_PREFIX_WITH_TYPE].doesOccur) { + if(options[OPT_TOC_PREFIX].doesOccur) { + fprintf(stderr, "icupkg: --auto_toc_prefix_with_type and also --toc_prefix\n"); + printUsage(pname, false); + return U_ILLEGAL_ARGUMENT_ERROR; + } + pkg->setAutoPrefixWithType(); + ++autoPrefix; + } + if(argc<2 || 3<argc || autoPrefix>1) { + printUsage(pname, false); + return U_ILLEGAL_ARGUMENT_ERROR; + } + + if(options[OPT_SOURCEDIR].doesOccur) { + sourcePath=options[OPT_SOURCEDIR].value; + } else { + // work relative to the current working directory + sourcePath=nullptr; + } + if(options[OPT_DESTDIR].doesOccur) { + 
destPath=options[OPT_DESTDIR].value; + } else { + // work relative to the current working directory + destPath=nullptr; + } + + if(0==strcmp(argv[1], "new")) { + if(autoPrefix) { + fprintf(stderr, "icupkg: --auto_toc_prefix[_with_type] but no input package\n"); + printUsage(pname, false); + return U_ILLEGAL_ARGUMENT_ERROR; + } + inFilename=nullptr; + isPackage=true; + } else { + inFilename=argv[1]; + if(isPackageName(inFilename)) { + pkg->readPackage(inFilename); + isPackage=true; + } else { + /* swap a single file (icuswap replacement) rather than work on a package */ + pkg->addFile(sourcePath, inFilename); + isPackage=false; + } + } + + if(argc>=3) { + outFilename=argv[2]; + if(0!=strcmp(argv[1], argv[2])) { + isModified=true; + } + } else if(isPackage) { + outFilename=nullptr; + } else /* !isPackage */ { + outFilename=inFilename; + isModified=(UBool)(sourcePath!=destPath); + } + + /* parse the output type option */ + if(options[OPT_OUT_TYPE].doesOccur) { + const char *type=options[OPT_OUT_TYPE].value; + if(type[0]==0 || type[1]!=0) { + /* the type must be exactly one letter */ + printUsage(pname, false); + return U_ILLEGAL_ARGUMENT_ERROR; + } + outType=type[0]; + switch(outType) { + case 'l': + case 'b': + case 'e': + break; + default: + printUsage(pname, false); + return U_ILLEGAL_ARGUMENT_ERROR; + } + + /* + * Set the isModified flag if the output type differs from the + * input package type. + * If we swap a single file, just assume that we are modifying it. + * The Package class does not give us access to the item and its type. + */ + isModified|=(UBool)(!isPackage || outType!=pkg->getInType()); + } else if(isPackage) { + outType=pkg->getInType(); // default to input type + } else /* !isPackage: swap single file */ { + outType=0; /* tells extractItem() to not swap */ + } + + if(options[OPT_WRITEPKG].doesOccur) { + isModified=true; + } + + if(!isPackage) { + /* + * icuswap tool replacement: Only swap a single file. 
+ * Check that irrelevant options are not set. + */ + if( options[OPT_COMMENT].doesOccur || + options[OPT_COPYRIGHT].doesOccur || + options[OPT_MATCHMODE].doesOccur || + options[OPT_REMOVE_LIST].doesOccur || + options[OPT_ADD_LIST].doesOccur || + options[OPT_EXTRACT_LIST].doesOccur || + options[OPT_LIST_ITEMS].doesOccur + ) { + printUsage(pname, false); + return U_ILLEGAL_ARGUMENT_ERROR; + } + if(isModified) { + pkg->extractItem(destPath, outFilename, 0, outType); + } + + delete pkg; + return result; + } + + /* Work with a package. */ + + if(options[OPT_COMMENT].doesOccur) { + outComment=options[OPT_COMMENT].value; + } else if(options[OPT_COPYRIGHT].doesOccur) { + outComment=U_COPYRIGHT_STRING; + } else { + outComment=nullptr; + } + + if(options[OPT_MATCHMODE].doesOccur) { + if(0==strcmp(options[OPT_MATCHMODE].value, "noslash")) { + pkg->setMatchMode(Package::MATCH_NOSLASH); + } else { + printUsage(pname, false); + return U_ILLEGAL_ARGUMENT_ERROR; + } + } + + /* remove items */ + if(options[OPT_REMOVE_LIST].doesOccur) { + listPkg=new Package(); + if(listPkg==nullptr) { + fprintf(stderr, "icupkg: not enough memory\n"); + exit(U_MEMORY_ALLOCATION_ERROR); + } + if(readList(nullptr, options[OPT_REMOVE_LIST].value, false, listPkg)) { + pkg->removeItems(*listPkg); + delete listPkg; + isModified=true; + } else { + printUsage(pname, false); + return U_ILLEGAL_ARGUMENT_ERROR; + } + } + + /* + * add items + * use a separate Package so that its memory and items stay around + * as long as the main Package + */ + addListPkg=nullptr; + if(options[OPT_ADD_LIST].doesOccur) { + addListPkg=new Package(); + if(addListPkg==nullptr) { + fprintf(stderr, "icupkg: not enough memory\n"); + exit(U_MEMORY_ALLOCATION_ERROR); + } + if(readList(sourcePath, options[OPT_ADD_LIST].value, true, addListPkg)) { + pkg->addItems(*addListPkg); + // delete addListPkg; deferred until after writePackage() + isModified=true; + } else { + printUsage(pname, false); + return U_ILLEGAL_ARGUMENT_ERROR; + } + } + 
+ /* extract items */ + if(options[OPT_EXTRACT_LIST].doesOccur) { + listPkg=new Package(); + if(listPkg==nullptr) { + fprintf(stderr, "icupkg: not enough memory\n"); + exit(U_MEMORY_ALLOCATION_ERROR); + } + if(readList(nullptr, options[OPT_EXTRACT_LIST].value, false, listPkg)) { + pkg->extractItems(destPath, *listPkg, outType); + delete listPkg; + } else { + printUsage(pname, false); + return U_ILLEGAL_ARGUMENT_ERROR; + } + } + + /* list items */ + if(options[OPT_LIST_ITEMS].doesOccur) { + int32_t i; + if (options[OPT_LIST_FILE].doesOccur) { + FileStream *out; + out = T_FileStream_open(options[OPT_LIST_FILE].value, "w"); + if (out != nullptr) { + for(i=0; i<pkg->getItemCount(); ++i) { + T_FileStream_writeLine(out, pkg->getItem(i)->name); + T_FileStream_writeLine(out, "\n"); + } + T_FileStream_close(out); + } else { + return U_ILLEGAL_ARGUMENT_ERROR; + } + } else { + for(i=0; i<pkg->getItemCount(); ++i) { + fprintf(stdout, "%s\n", pkg->getItem(i)->name); + } + } + } + + /* check dependencies between items */ + if(!options[OPT_IGNORE_DEPS].doesOccur && !pkg->checkDependencies()) { + /* some dependencies are not fulfilled */ + return U_MISSING_RESOURCE_ERROR; + } + + /* write the output .dat package if there are any modifications */ + if(isModified) { + char outFilenameBuffer[1024]; // for auto-generated output filename, if necessary + + if(outFilename==nullptr || outFilename[0]==0) { + if(inFilename==nullptr || inFilename[0]==0) { + fprintf(stderr, "icupkg: unable to auto-generate an output filename if there is no input filename\n"); + exit(U_ILLEGAL_ARGUMENT_ERROR); + } + + /* + * auto-generate a filename: + * copy the inFilename, + * and if the last basename character matches the input file's type, + * then replace it with the output file's type + */ + char suffix[6]="?.dat"; + char *s; + + suffix[0]=pkg->getInType(); + /* inFilename comes from the command line: refuse names that do not fit the fixed-size buffer */ + if(strlen(inFilename)>=sizeof(outFilenameBuffer)) { + fprintf(stderr, "icupkg: input filename too long to auto-generate an output filename\n"); + exit(U_ILLEGAL_ARGUMENT_ERROR); + } + strcpy(outFilenameBuffer, inFilename); + s=strchr(outFilenameBuffer, 0); + if((s-outFilenameBuffer)>5 && 0==memcmp(s-5, suffix, 5)) { + 
*(s-5)=outType; + } + outFilename=outFilenameBuffer; + } + if(options[OPT_TOC_PREFIX].doesOccur) { + pkg->setPrefix(options[OPT_TOC_PREFIX].value); + } + result = writePackageDatFile(outFilename, outComment, nullptr, nullptr, pkg, outType); + } + + delete addListPkg; + delete pkg; + return result; +} + +/* + * Hey, Emacs, please set the following: + * + * Local Variables: + * indent-tabs-mode: nil + * End: + * + */ diff --git a/intl/icu/source/tools/icupkg/icupkg.vcxproj b/intl/icu/source/tools/icupkg/icupkg.vcxproj new file mode 100644 index 0000000000..7b9cf58491 --- /dev/null +++ b/intl/icu/source/tools/icupkg/icupkg.vcxproj @@ -0,0 +1,80 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <PropertyGroup Label="Globals"> + <ProjectGuid>{62D4B15D-7A90-4ECB-BA19-5E021D6A21BC}</ProjectGuid> + </PropertyGroup> + <PropertyGroup Label="Configuration"> + <ConfigurationType>Application</ConfigurationType> + <UseOfMfc>false</UseOfMfc> + <CharacterSet>MultiByte</CharacterSet> + </PropertyGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> + <!-- The following import will include the 'default' configuration options for VS projects. --> + <Import Project="..\..\allinone\Build.Windows.ProjectConfiguration.props" /> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> + <ImportGroup Label="ExtensionSettings"> + </ImportGroup> + <PropertyGroup Label="UserMacros" /> + <PropertyGroup> + <_ProjectFileVersion>10.0.30319.1</_ProjectFileVersion> + <OutDir>.\$(Platform)\$(Configuration)\</OutDir> + <IntDir>.\$(Platform)\$(Configuration)\</IntDir> + <!-- The ICU projects use "Win32" to mean "x86", so we need to special case it. 
--> + <OutDir Condition="'$(Platform)'=='Win32'">.\x86\$(Configuration)\</OutDir> + <IntDir Condition="'$(Platform)'=='Win32'">.\x86\$(Configuration)\</IntDir> + <!-- Disable Incremental Linking for Release builds as it prevents Link-time Code Generation --> + <LinkIncremental Condition="'$(Configuration)'=='Debug'">true</LinkIncremental> + <LinkIncremental Condition="'$(Configuration)'=='Release'">false</LinkIncremental> + </PropertyGroup> + <!-- Options that are common to *all* configurations --> + <ItemDefinitionGroup> + <Midl> + <TypeLibraryName>$(OutDir)\icupkg.tlb</TypeLibraryName> + </Midl> + <ClCompile> + <WarningLevel>Level4</WarningLevel> + <CompileAs>Default</CompileAs> + <DisableLanguageExtensions>true</DisableLanguageExtensions> + <AdditionalIncludeDirectories>..\..\common;..\toolutil;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> + <PrecompiledHeaderOutputFile>$(OutDir)\icupkg.pch</PrecompiledHeaderOutputFile> + <AssemblerListingLocation>$(OutDir)/</AssemblerListingLocation> + <ObjectFileName>$(OutDir)/</ObjectFileName> + <ProgramDataBaseFileName>$(OutDir)\icupkg.pdb</ProgramDataBaseFileName> + </ClCompile> + <Link> + <SubSystem>Console</SubSystem> + <OutputFile>$(OutDir)\icupkg.exe</OutputFile> + <AdditionalLibraryDirectories>..\..\..\$(IcuLibOutputDir);%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories> + </Link> + <CustomBuildStep> + <Command>copy "$(TargetPath)" ..\..\..\$(IcuBinOutputDir)</Command> + <Outputs>..\..\..\$(IcuBinOutputDir)\$(TargetFileName);%(Outputs)</Outputs> + </CustomBuildStep> + </ItemDefinitionGroup> + <!-- Options that are common to all 'Debug' project configurations --> + <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'"> + <ClCompile> + <BrowseInformation>true</BrowseInformation> + <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary> + </ClCompile> + <Link> + <AdditionalDependencies>icuucd.lib;icutud.lib;%(AdditionalDependencies)</AdditionalDependencies> + </Link> + 
</ItemDefinitionGroup> + <!-- Options that are common to all 'Release' project configurations --> + <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'"> + <ClCompile> + <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary> + <FunctionLevelLinking>true</FunctionLevelLinking> + </ClCompile> + <Link> + <AdditionalDependencies>icuuc.lib;icutu.lib;%(AdditionalDependencies)</AdditionalDependencies> + </Link> + </ItemDefinitionGroup> + <ItemGroup> + <ClCompile Include="icupkg.cpp" /> + </ItemGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> + <ImportGroup Label="ExtensionTargets"> + </ImportGroup> +</Project>
\ No newline at end of file diff --git a/intl/icu/source/tools/icupkg/sources.txt b/intl/icu/source/tools/icupkg/sources.txt new file mode 100644 index 0000000000..b4406e6226 --- /dev/null +++ b/intl/icu/source/tools/icupkg/sources.txt @@ -0,0 +1 @@ +icupkg.cpp diff --git a/intl/icu/source/tools/icuswap/Makefile.in b/intl/icu/source/tools/icuswap/Makefile.in new file mode 100644 index 0000000000..18b04e8b3c --- /dev/null +++ b/intl/icu/source/tools/icuswap/Makefile.in @@ -0,0 +1,98 @@ +## Makefile.in for ICU - tools/icuswap +## Copyright (C) 2016 and later: Unicode, Inc. and others. +## License & terms of use: http://www.unicode.org/copyright.html +## Copyright (c) 1999-2011, International Business Machines Corporation and +## others. All Rights Reserved. +## Steven R. Loomis + +## Source directory information +srcdir = @srcdir@ +top_srcdir = @top_srcdir@ + +top_builddir = ../.. + +include $(top_builddir)/icudefs.mk + +## Build directory information +subdir = tools/icuswap + +TARGET_STUB_NAME = icuswap + +SECTION = 8 + +#MAN_FILES = $(TARGET_STUB_NAME).$(SECTION) + +## Extra files to remove for 'make clean' +CLEANFILES = *~ $(DEPS) $(MAN_FILES) + +## Target information +TARGET = $(BINDIR)/$(TARGET_STUB_NAME)$(EXEEXT) + +CPPFLAGS += -I$(top_srcdir)/common -I$(srcdir)/../toolutil +LIBS = $(LIBICUTOOLUTIL) $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS) $(LIB_M) + +SOURCES = $(shell cat $(srcdir)/sources.txt) +OBJECTS = $(SOURCES:.cpp=.o) + +DEPS = $(OBJECTS:.o=.d) + +## List of phony targets +.PHONY : all all-local install install-local clean clean-local \ +distclean distclean-local dist dist-local check check-local install-man + +## Clear suffix list +.SUFFIXES : + +## List of standard targets +all: all-local +install: install-local +clean: clean-local +distclean : distclean-local +dist: dist-local +check: all check-local + +all-local: $(TARGET) $(MAN_FILES) + +install-local: all-local install-man + $(MKINSTALLDIRS) $(DESTDIR)$(sbindir) + $(INSTALL) $(TARGET) 
$(DESTDIR)$(sbindir) + +install-man: $(MAN_FILES) +# $(MKINSTALLDIRS) $(DESTDIR)$(mandir)/man$(SECTION) +# $(INSTALL_DATA) $? $(DESTDIR)$(mandir)/man$(SECTION) + + +dist-local: + +clean-local: + test -z "$(CLEANFILES)" || $(RMV) $(CLEANFILES) + $(RMV) $(TARGET) $(OBJECTS) + +distclean-local: clean-local + $(RMV) Makefile + +check-local: all-local + +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + cd $(top_builddir) \ + && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status + +$(TARGET) : $(OBJECTS) + echo Note: icuswap is obsolete - use icupkg instead. + $(LINK.cc) $(OUTOPT)$@ $^ $(LIBS) + $(POST_BUILD_STEP) + + +%.$(SECTION): $(srcdir)/%.$(SECTION).in + cd $(top_builddir) \ + && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status + + +ifeq (,$(MAKECMDGOALS)) +-include $(DEPS) +else +ifneq ($(patsubst %clean,,$(MAKECMDGOALS)),) +-include $(DEPS) +endif +endif + diff --git a/intl/icu/source/tools/icuswap/icuswap.cpp b/intl/icu/source/tools/icuswap/icuswap.cpp new file mode 100644 index 0000000000..db1eafc025 --- /dev/null +++ b/intl/icu/source/tools/icuswap/icuswap.cpp @@ -0,0 +1,649 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2003-2014, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: icuswap.cpp +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2003aug08 +* created by: Markus W. Scherer +* +* This tool takes an ICU data file and "swaps" it, that is, changes its +* platform properties between big-/little-endianness and ASCII/EBCDIC charset +* families. +* The modified data file is written to a new file. 
+* Useful as an install-time tool for shipping only one flavor of ICU data +* and preparing data files for the target platform. +* Will not work with data DLLs (shared libraries). +*/ + +#include "unicode/utypes.h" +#include "unicode/putil.h" +#include "unicode/udata.h" +#include "cmemory.h" +#include "cstring.h" +#include "uinvchar.h" +#include "uarrsort.h" +#include "ucmndata.h" +#include "udataswp.h" +#include "swapimpl.h" +#include "toolutil.h" +#include "uoptions.h" + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +/* definitions */ + +#define DEFAULT_PADDING_LENGTH 15 + +static UOption options[]={ + UOPTION_HELP_H, + UOPTION_HELP_QUESTION_MARK, + UOPTION_DEF("type", 't', UOPT_REQUIRES_ARG) +}; + +enum { + OPT_HELP_H, + OPT_HELP_QUESTION_MARK, + OPT_OUT_TYPE +}; + +static int32_t +fileSize(FILE *f) { + int32_t size; + + fseek(f, 0, SEEK_END); + size=(int32_t)ftell(f); + fseek(f, 0, SEEK_SET); + return size; +} + +/** + * Swap an ICU .dat package, including swapping of enclosed items. + */ +U_CFUNC int32_t U_CALLCONV +udata_swapPackage(const char *inFilename, const char *outFilename, + const UDataSwapper *ds, + const void *inData, int32_t length, void *outData, + UErrorCode *pErrorCode); + +U_CDECL_BEGIN +static void U_CALLCONV +printError(void *context, const char *fmt, va_list args) { + vfprintf((FILE *)context, fmt, args); +} +U_CDECL_END + +static int +printUsage(const char *pname, UBool ishelp) { + fprintf(stderr, + "%csage: %s [ -h, -?, --help ] -tl|-tb|-te|--type=b|... infilename outfilename\n", + ishelp ? 
'U' : 'u', pname); + if(ishelp) { + fprintf(stderr, + "\nOptions: -h, -?, --help print this message and exit\n" + " Read the input file, swap its platform properties according\n" + " to the -t or --type option, and write the result to the output file.\n" + " -tl change to little-endian/ASCII charset family\n" + " -tb change to big-endian/ASCII charset family\n" + " -te change to big-endian/EBCDIC charset family\n"); + } + + return !ishelp; +} + +extern int +main(int argc, char *argv[]) { + FILE *in, *out; + const char *pname; + char *data; + int32_t length; + UBool ishelp; + int rc; + + UDataSwapper *ds; + const UDataInfo *pInfo; + UErrorCode errorCode; + uint8_t outCharset; + UBool outIsBigEndian; + + U_MAIN_INIT_ARGS(argc, argv); + + fprintf(stderr, "Warning: icuswap is an obsolete tool and it will be removed in the next ICU release.\nPlease use the icupkg tool instead.\n"); + + /* get the program basename */ + pname=strrchr(argv[0], U_FILE_SEP_CHAR); + if(pname==nullptr) { + pname=strrchr(argv[0], '/'); + } + if(pname!=nullptr) { + ++pname; + } else { + pname=argv[0]; + } + + argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options); + ishelp=options[OPT_HELP_H].doesOccur || options[OPT_HELP_QUESTION_MARK].doesOccur; + if(ishelp || argc!=3) { + return printUsage(pname, ishelp); + } + + /* parse the output type option */ + data=(char *)options[OPT_OUT_TYPE].value; + if(data==nullptr || data[0]==0 || data[1]!=0) { + /* the type option is required (no value means it was not given) and must be exactly one letter */ + return printUsage(pname, false); + } + switch(data[0]) { + case 'l': + outIsBigEndian=false; + outCharset=U_ASCII_FAMILY; + break; + case 'b': + outIsBigEndian=true; + outCharset=U_ASCII_FAMILY; + break; + case 'e': + outIsBigEndian=true; + outCharset=U_EBCDIC_FAMILY; + break; + default: + return printUsage(pname, false); + } + + in=out=nullptr; + data=nullptr; + + /* open the input file, get its length, allocate memory for it, read the file */ + in=fopen(argv[1], "rb"); + if(in==nullptr) { + fprintf(stderr, "%s: unable 
to open input file \"%s\"\n", pname, argv[1]); + rc=2; + goto done; + } + + length=fileSize(in); + if(length<DEFAULT_PADDING_LENGTH) { + fprintf(stderr, "%s: empty input file \"%s\"\n", pname, argv[1]); + rc=2; + goto done; + } + + /* + * +15: udata_swapPackage() may need to add a few padding bytes to the + * last item if charset swapping is done, + * because the last item may be resorted into the middle and then needs + * additional padding bytes + */ + data=(char *)malloc(length+DEFAULT_PADDING_LENGTH); + if(data==nullptr) { + fprintf(stderr, "%s: error allocating memory for \"%s\"\n", pname, argv[1]); + rc=2; + goto done; + } + + /* set the 15 extra bytes after the file contents to the usual padding byte, see udata_swapPackage() */ + uprv_memset(data+length, 0xaa, DEFAULT_PADDING_LENGTH); + + if(length!=(int32_t)fread(data, 1, length, in)) { + fprintf(stderr, "%s: error reading \"%s\"\n", pname, argv[1]); + rc=3; + goto done; + } + + fclose(in); + in=nullptr; + + /* swap the data in-place */ + errorCode=U_ZERO_ERROR; + ds=udata_openSwapperForInputData(data, length, outIsBigEndian, outCharset, &errorCode); + if(U_FAILURE(errorCode)) { + fprintf(stderr, "%s: udata_openSwapperForInputData(\"%s\") failed - %s\n", + pname, argv[1], u_errorName(errorCode)); + rc=4; + goto done; + } + + ds->printError=printError; + ds->printErrorContext=stderr; + + /* speculative cast, protected by the following length check */ + pInfo=(const UDataInfo *)((const char *)data+4); + + if( length>=20 && + pInfo->dataFormat[0]==0x43 && /* dataFormat="CmnD" */ + pInfo->dataFormat[1]==0x6d && + pInfo->dataFormat[2]==0x6e && + pInfo->dataFormat[3]==0x44 + ) { + /* + * swap the .dat package + * udata_swapPackage() needs to rename ToC name entries from the old package + * name to the new one. + * We pass it the filenames, and udata_swapPackage() will extract the + * package names. 
+ */ + length=udata_swapPackage(argv[1], argv[2], ds, data, length, data, &errorCode); + udata_closeSwapper(ds); + if(U_FAILURE(errorCode)) { + fprintf(stderr, "%s: udata_swapPackage(\"%s\") failed - %s\n", + pname, argv[1], u_errorName(errorCode)); + rc=4; + goto done; + } + } else { + /* swap the data, which is not a .dat package */ + length=udata_swap(ds, data, length, data, &errorCode); + udata_closeSwapper(ds); + if(U_FAILURE(errorCode)) { + fprintf(stderr, "%s: udata_swap(\"%s\") failed - %s\n", + pname, argv[1], u_errorName(errorCode)); + rc=4; + goto done; + } + } + + out=fopen(argv[2], "wb"); + if(out==nullptr) { + fprintf(stderr, "%s: unable to open output file \"%s\"\n", pname, argv[2]); + rc=5; + goto done; + } + + if(length!=(int32_t)fwrite(data, 1, length, out)) { + fprintf(stderr, "%s: error writing \"%s\"\n", pname, argv[2]); + rc=6; + goto done; + } + + fclose(out); + out=nullptr; + + /* all done */ + rc=0; + +done: + if(in!=nullptr) { + fclose(in); + } + if(out!=nullptr) { + fclose(out); + } + if(data!=nullptr) { + free(data); + } + return rc; +} + +/* swap .dat package files -------------------------------------------------- */ + +static int32_t +extractPackageName(const UDataSwapper *ds, const char *filename, + char pkg[], int32_t capacity, + UErrorCode *pErrorCode) { + const char *basename; + int32_t len; + + if(U_FAILURE(*pErrorCode)) { + return 0; + } + + basename=findBasename(filename); + len=(int32_t)uprv_strlen(basename)-4; /* -4: subtract the length of ".dat" */ + + if(len<=0 || 0!=uprv_strcmp(basename+len, ".dat")) { + udata_printError(ds, "udata_swapPackage(): \"%s\" is not recognized as a package filename (must end with .dat)\n", + basename); + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + + if(len>=capacity) { + udata_printError(ds, "udata_swapPackage(): the package name \"%s\" is too long (>=%ld)\n", + (long)capacity); + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + + uprv_memcpy(pkg, basename, len); + pkg[len]=0; 
+ return len; +} + +struct ToCEntry { + uint32_t nameOffset, inOffset, outOffset, length; +}; + +U_CDECL_BEGIN +static int32_t U_CALLCONV +compareToCEntries(const void *context, const void *left, const void *right) { + const char *chars=(const char *)context; + return (int32_t)uprv_strcmp(chars+((const ToCEntry *)left)->nameOffset, + chars+((const ToCEntry *)right)->nameOffset); +} +U_CDECL_END + +U_CFUNC int32_t U_CALLCONV +udata_swapPackage(const char *inFilename, const char *outFilename, + const UDataSwapper *ds, + const void *inData, int32_t length, void *outData, + UErrorCode *pErrorCode) { + const UDataInfo *pInfo; + int32_t headerSize; + + const uint8_t *inBytes; + uint8_t *outBytes; + + uint32_t itemCount, offset, i; + int32_t itemLength; + + const UDataOffsetTOCEntry *inEntries; + UDataOffsetTOCEntry *outEntries; + + ToCEntry *table; + + char inPkgName[32], outPkgName[32]; + int32_t inPkgNameLength, outPkgNameLength; + + /* udata_swapDataHeader checks the arguments */ + headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode); + if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) { + return 0; + } + + /* check data format and format version */ + pInfo=(const UDataInfo *)((const char *)inData+4); + if(!( + pInfo->dataFormat[0]==0x43 && /* dataFormat="CmnD" */ + pInfo->dataFormat[1]==0x6d && + pInfo->dataFormat[2]==0x6e && + pInfo->dataFormat[3]==0x44 && + pInfo->formatVersion[0]==1 + )) { + udata_printError(ds, "udata_swapPackage(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as an ICU .dat package\n", + pInfo->dataFormat[0], pInfo->dataFormat[1], + pInfo->dataFormat[2], pInfo->dataFormat[3], + pInfo->formatVersion[0]); + *pErrorCode=U_UNSUPPORTED_ERROR; + return 0; + } + + /* + * We need to change the ToC name entries so that they have the correct + * package name prefix. + * Extract the package names from the in/out filenames. 
+ */ + inPkgNameLength=extractPackageName( + ds, inFilename, + inPkgName, (int32_t)sizeof(inPkgName), + pErrorCode); + outPkgNameLength=extractPackageName( + ds, outFilename, + outPkgName, (int32_t)sizeof(outPkgName), + pErrorCode); + if(U_FAILURE(*pErrorCode)) { + return 0; + } + + /* + * It is possible to work with inPkgNameLength!=outPkgNameLength, + * but then the length of the data file would change more significantly, + * which we are not currently prepared for. + */ + if(inPkgNameLength!=outPkgNameLength) { + udata_printError(ds, "udata_swapPackage(): the package names \"%s\" and \"%s\" must have the same length\n", + inPkgName, outPkgName); + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + + inBytes=(const uint8_t *)inData+headerSize; + inEntries=(const UDataOffsetTOCEntry *)(inBytes+4); + + if(length<0) { + /* preflighting */ + itemCount=ds->readUInt32(*(const uint32_t *)inBytes); + if(itemCount==0) { + /* no items: count only the item count and return */ + return headerSize+4; + } + + /* read the last item's offset and preflight it */ + offset=ds->readUInt32(inEntries[itemCount-1].dataOffset); + itemLength=udata_swap(ds, inBytes+offset, -1, nullptr, pErrorCode); + + if(U_SUCCESS(*pErrorCode)) { + return headerSize+offset+(uint32_t)itemLength; + } else { + return 0; + } + } else { + /* check that the itemCount fits, then the ToC table, then at least the header of the last item */ + length-=headerSize; + if(length<4) { + /* itemCount does not fit */ + offset=0xffffffff; + itemCount=0; /* make compilers happy */ + } else { + itemCount=ds->readUInt32(*(const uint32_t *)inBytes); + if(itemCount==0) { + offset=4; + } else if((uint32_t)length<(4+8*itemCount)) { + /* ToC table does not fit */ + offset=0xffffffff; + } else { + /* offset of the last item plus at least 20 bytes for its header */ + offset=20+ds->readUInt32(inEntries[itemCount-1].dataOffset); + } + } + if((uint32_t)length<offset) { + udata_printError(ds, "udata_swapPackage(): too few bytes (%d 
after header) for a .dat package\n", + length); + *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; + return 0; + } + + outBytes=(uint8_t *)outData+headerSize; + + /* swap the item count */ + ds->swapArray32(ds, inBytes, 4, outBytes, pErrorCode); + + if(itemCount==0) { + /* no items: just return now */ + return headerSize+4; + } + + /* swap the item name strings */ + offset=4+8*itemCount; + itemLength=(int32_t)(ds->readUInt32(inEntries[0].dataOffset)-offset); + udata_swapInvStringBlock(ds, inBytes+offset, itemLength, outBytes+offset, pErrorCode); + if(U_FAILURE(*pErrorCode)) { + udata_printError(ds, "udata_swapPackage() failed to swap the data item name strings\n"); + return 0; + } + /* keep offset and itemLength in case we allocate and copy the strings below */ + + /* swap the package names into the output charset */ + if(ds->outCharset!=U_CHARSET_FAMILY) { + UDataSwapper *ds2; + ds2=udata_openSwapper(true, U_CHARSET_FAMILY, true, ds->outCharset, pErrorCode); + ds2->swapInvChars(ds2, inPkgName, inPkgNameLength, inPkgName, pErrorCode); + ds2->swapInvChars(ds2, outPkgName, outPkgNameLength, outPkgName, pErrorCode); + udata_closeSwapper(ds2); + if(U_FAILURE(*pErrorCode)) { + udata_printError(ds, "udata_swapPackage() failed to swap the input/output package names\n"); + } + } + + /* change the prefix of each ToC entry name from the old to the new package name */ + { + char *entryName; + + for(i=0; i<itemCount; ++i) { + entryName=(char *)inBytes+ds->readUInt32(inEntries[i].nameOffset); + + if(0==uprv_memcmp(entryName, inPkgName, inPkgNameLength)) { + uprv_memcpy(entryName, outPkgName, inPkgNameLength); + } else { + udata_printError(ds, "udata_swapPackage() failed: ToC item %ld does not have the input package name as a prefix\n", + (long)i); + *pErrorCode=U_INVALID_FORMAT_ERROR; + return 0; + } + } + } + + /* + * Allocate the ToC table and, if necessary, a temporary buffer for + * pseudo-in-place swapping. + * + * We cannot swap in-place because: + * + * 1. 
If the swapping of an item fails mid-way, then in-place swapping + * has destroyed its data. + * Out-of-place swapping allows us to then copy its original data. + * + * 2. If swapping changes the charset family, then we must resort + * not only the ToC table but also the data items themselves. + * This requires a permutation and is best done with separate in/out + * buffers. + * + * We swapped the strings above to avoid the malloc below if string swapping fails. + */ + if(inData==outData) { + /* +15: prepare for extra padding of a newly-last item */ + table=(ToCEntry *)uprv_malloc(itemCount*sizeof(ToCEntry)+length+DEFAULT_PADDING_LENGTH); + if(table!=nullptr) { + outBytes=(uint8_t *)(table+itemCount); + + /* copy the item count and the swapped strings */ + uprv_memcpy(outBytes, inBytes, 4); + uprv_memcpy(outBytes+offset, inBytes+offset, itemLength); + } + } else { + table=(ToCEntry *)uprv_malloc(itemCount*sizeof(ToCEntry)); + } + if(table==nullptr) { + udata_printError(ds, "udata_swapPackage(): out of memory allocating %d bytes\n", + inData==outData ? 
+ itemCount*sizeof(ToCEntry)+length+DEFAULT_PADDING_LENGTH : + itemCount*sizeof(ToCEntry)); + *pErrorCode=U_MEMORY_ALLOCATION_ERROR; + return 0; + } + outEntries=(UDataOffsetTOCEntry *)(outBytes+4); + + /* read the ToC table */ + for(i=0; i<itemCount; ++i) { + table[i].nameOffset=ds->readUInt32(inEntries[i].nameOffset); + table[i].inOffset=ds->readUInt32(inEntries[i].dataOffset); + if(i>0) { + table[i-1].length=table[i].inOffset-table[i-1].inOffset; + } + } + table[itemCount-1].length=(uint32_t)length-table[itemCount-1].inOffset; + + if(ds->inCharset==ds->outCharset) { + /* no charset swapping, no resorting: keep item offsets the same */ + for(i=0; i<itemCount; ++i) { + table[i].outOffset=table[i].inOffset; + } + } else { + /* charset swapping: resort items by their swapped names */ + + /* + * Before the actual sorting, we need to make sure that each item + * has a length that is a multiple of 16 bytes so that all items + * are 16-aligned. + * Only the old last item may be missing up to 15 padding bytes. + * Add padding bytes for it. + * Since the icuswap main() function has already allocated enough + * input buffer space and set the last 15 bytes there to 0xaa, + * we only need to increase the total data length and the length + * of the last item here. + */ + if((length&0xf)!=0) { + int32_t delta=16-(length&0xf); + length+=delta; + table[itemCount-1].length+=(uint32_t)delta; + } + + /* Save the offset before we sort the TOC. */ + offset=table[0].inOffset; + /* sort the TOC entries */ + uprv_sortArray(table, (int32_t)itemCount, (int32_t)sizeof(ToCEntry), + compareToCEntries, outBytes, false, pErrorCode); + + /* + * Note: Before sorting, the inOffset values were in order. + * Now the outOffset values are in order. 
+ */ + + /* assign outOffset values */ + for(i=0; i<itemCount; ++i) { + table[i].outOffset=offset; + offset+=table[i].length; + } + } + + /* write the output ToC table */ + for(i=0; i<itemCount; ++i) { + ds->writeUInt32(&outEntries[i].nameOffset, table[i].nameOffset); + ds->writeUInt32(&outEntries[i].dataOffset, table[i].outOffset); + } + + /* swap each data item */ + for(i=0; i<itemCount; ++i) { + /* first copy the item bytes to make sure that unreachable bytes are copied */ + uprv_memcpy(outBytes+table[i].outOffset, inBytes+table[i].inOffset, table[i].length); + + /* swap the item */ + udata_swap(ds, inBytes+table[i].inOffset, (int32_t)table[i].length, + outBytes+table[i].outOffset, pErrorCode); + + if(U_FAILURE(*pErrorCode)) { + if(ds->outCharset==U_CHARSET_FAMILY) { + udata_printError(ds, "warning: udata_swapPackage() failed to swap item \"%s\"\n" + " at inOffset 0x%x length 0x%x - %s\n" + " the data item will be copied, not swapped\n\n", + (char *)outBytes+table[i].nameOffset, + table[i].inOffset, table[i].length, u_errorName(*pErrorCode)); + } else { + udata_printError(ds, "warning: udata_swapPackage() failed to swap an item\n" + " at inOffset 0x%x length 0x%x - %s\n" + " the data item will be copied, not swapped\n\n", + table[i].inOffset, table[i].length, u_errorName(*pErrorCode)); + } + /* reset the error code, copy the data item, and continue */ + *pErrorCode=U_ZERO_ERROR; + uprv_memcpy(outBytes+table[i].outOffset, inBytes+table[i].inOffset, table[i].length); + } + } + + if(inData==outData) { + /* copy the data from the temporary buffer to the in-place buffer */ + uprv_memcpy((uint8_t *)outData+headerSize, outBytes, length); + } + uprv_free(table); + + return headerSize+length; + } +} + +/* + * Hey, Emacs, please set the following: + * + * Local Variables: + * indent-tabs-mode: nil + * End: + * + */ diff --git a/intl/icu/source/tools/icuswap/icuswap.vcxproj b/intl/icu/source/tools/icuswap/icuswap.vcxproj new file mode 100644 index 0000000000..3c60511b42 
--- /dev/null +++ b/intl/icu/source/tools/icuswap/icuswap.vcxproj @@ -0,0 +1,223 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <ItemGroup Label="ProjectConfigurations"> + <ProjectConfiguration Include="Debug|Win32"> + <Configuration>Debug</Configuration> + <Platform>Win32</Platform> + </ProjectConfiguration> + <ProjectConfiguration Include="Debug|x64"> + <Configuration>Debug</Configuration> + <Platform>x64</Platform> + </ProjectConfiguration> + <ProjectConfiguration Include="Release|Win32"> + <Configuration>Release</Configuration> + <Platform>Win32</Platform> + </ProjectConfiguration> + <ProjectConfiguration Include="Release|x64"> + <Configuration>Release</Configuration> + <Platform>x64</Platform> + </ProjectConfiguration> + </ItemGroup> + <PropertyGroup Label="Globals"> + <ProjectGuid>{39690C2A-AD89-45E4-893A-899496B85785}</ProjectGuid> + <WindowsTargetPlatformVersion>8.1</WindowsTargetPlatformVersion> + </PropertyGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> + <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration"> + <ConfigurationType>Application</ConfigurationType> + <UseOfMfc>false</UseOfMfc> + <CharacterSet>MultiByte</CharacterSet> + </PropertyGroup> + <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration"> + <ConfigurationType>Application</ConfigurationType> + <UseOfMfc>false</UseOfMfc> + <CharacterSet>MultiByte</CharacterSet> + </PropertyGroup> + <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration"> + <ConfigurationType>Application</ConfigurationType> + <UseOfMfc>false</UseOfMfc> + <CharacterSet>MultiByte</CharacterSet> + </PropertyGroup> + <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration"> + <ConfigurationType>Application</ConfigurationType> + 
<UseOfMfc>false</UseOfMfc> + <CharacterSet>MultiByte</CharacterSet> + </PropertyGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> + <ImportGroup Label="ExtensionSettings"> + </ImportGroup> + <PropertyGroup Label="UserMacros" /> + <PropertyGroup> + <_ProjectFileVersion>10.0.40219.1</_ProjectFileVersion> + <OutDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">.\x86\Release\</OutDir> + <IntDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">.\x86\Release\</IntDir> + <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">false</LinkIncremental> + <OutDir Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">.\x86\Debug\</OutDir> + <IntDir Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">.\x86\Debug\</IntDir> + <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</LinkIncremental> + <OutDir Condition="'$(Configuration)|$(Platform)'=='Release|x64'">.\x64\Release\</OutDir> + <IntDir Condition="'$(Configuration)|$(Platform)'=='Release|x64'">.\x64\Release\</IntDir> + <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</LinkIncremental> + <OutDir Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">.\x64\Debug\</OutDir> + <IntDir Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">.\x64\Debug\</IntDir> + <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</LinkIncremental> + </PropertyGroup> + <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'"> + <CustomBuildStep> + <Command>copy "$(TargetPath)" ..\..\..\bin +</Command> + <Outputs>..\..\..\bin\$(TargetFileName);%(Outputs)</Outputs> + </CustomBuildStep> + <Midl> + <TypeLibraryName>.\x86\Release/icuswap.tlb</TypeLibraryName> + </Midl> + <ClCompile> + <AdditionalIncludeDirectories>..\..\common;..\toolutil;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> + 
<PreprocessorDefinitions>WIN32;NDEBUG;_CRT_SECURE_NO_DEPRECATE;%(PreprocessorDefinitions)</PreprocessorDefinitions> + <StringPooling>true</StringPooling> + <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary> + <FunctionLevelLinking>true</FunctionLevelLinking> + <DisableLanguageExtensions>true</DisableLanguageExtensions> + <PrecompiledHeaderOutputFile>.\x86\Release/icuswap.pch</PrecompiledHeaderOutputFile> + <AssemblerListingLocation>.\x86\Release/</AssemblerListingLocation> + <ObjectFileName>.\x86\Release/</ObjectFileName> + <ProgramDataBaseFileName>.\x86\Release/</ProgramDataBaseFileName> + <WarningLevel>Level4</WarningLevel> + <SuppressStartupBanner>true</SuppressStartupBanner> + <CompileAs>Default</CompileAs> + </ClCompile> + <ResourceCompile> + <PreprocessorDefinitions>NDEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions> + <Culture>0x0409</Culture> + </ResourceCompile> + <Link> + <OutputFile>.\x86\Release/icuswap.exe</OutputFile> + <SuppressStartupBanner>true</SuppressStartupBanner> + <ProgramDatabaseFile>.\x86\Release/icuswap.pdb</ProgramDatabaseFile> + <SubSystem>Console</SubSystem> + </Link> + </ItemDefinitionGroup> + <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> + <CustomBuildStep> + <Command>copy "$(TargetPath)" ..\..\..\bin +</Command> + <Outputs>..\..\..\bin\$(TargetFileName);%(Outputs)</Outputs> + </CustomBuildStep> + <Midl> + <TypeLibraryName>.\x86\Debug/icuswap.tlb</TypeLibraryName> + </Midl> + <ClCompile> + <Optimization>Disabled</Optimization> + <AdditionalIncludeDirectories>..\..\common;..\toolutil;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> + <PreprocessorDefinitions>WIN32;_DEBUG;_CRT_SECURE_NO_DEPRECATE;%(PreprocessorDefinitions)</PreprocessorDefinitions> + <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks> + <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary> + <BufferSecurityCheck>true</BufferSecurityCheck> + <DisableLanguageExtensions>true</DisableLanguageExtensions> + 
<PrecompiledHeaderOutputFile>.\x86\Debug/icuswap.pch</PrecompiledHeaderOutputFile> + <AssemblerListingLocation>.\x86\Debug/</AssemblerListingLocation> + <ObjectFileName>.\x86\Debug/</ObjectFileName> + <ProgramDataBaseFileName>.\x86\Debug/</ProgramDataBaseFileName> + <BrowseInformation>true</BrowseInformation> + <WarningLevel>Level4</WarningLevel> + <SuppressStartupBanner>true</SuppressStartupBanner> + <DebugInformationFormat>EditAndContinue</DebugInformationFormat> + <CompileAs>Default</CompileAs> + </ClCompile> + <ResourceCompile> + <PreprocessorDefinitions>_DEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions> + <Culture>0x0409</Culture> + </ResourceCompile> + <Link> + <OutputFile>.\x86\Debug/icuswap.exe</OutputFile> + <SuppressStartupBanner>true</SuppressStartupBanner> + <GenerateDebugInformation>true</GenerateDebugInformation> + <ProgramDatabaseFile>.\x86\Debug/icuswap.pdb</ProgramDatabaseFile> + <SubSystem>Console</SubSystem> + </Link> + </ItemDefinitionGroup> + <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> + <CustomBuildStep> + <Command>copy "$(TargetPath)" ..\..\..\bin64 +</Command> + <Outputs>..\..\..\bin64\$(TargetFileName);%(Outputs)</Outputs> + </CustomBuildStep> + <Midl> + <TargetEnvironment>X64</TargetEnvironment> + <TypeLibraryName>.\x64\Release/icuswap.tlb</TypeLibraryName> + </Midl> + <ClCompile> + <AdditionalIncludeDirectories>..\..\common;..\toolutil;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> + <PreprocessorDefinitions>WIN64;WIN32;NDEBUG;_CRT_SECURE_NO_DEPRECATE;%(PreprocessorDefinitions)</PreprocessorDefinitions> + <StringPooling>true</StringPooling> + <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary> + <FunctionLevelLinking>true</FunctionLevelLinking> + <DisableLanguageExtensions>true</DisableLanguageExtensions> + <PrecompiledHeaderOutputFile>.\x64\Release/icuswap.pch</PrecompiledHeaderOutputFile> + <AssemblerListingLocation>.\x64\Release/</AssemblerListingLocation> + 
<ObjectFileName>.\x64\Release/</ObjectFileName> + <ProgramDataBaseFileName>.\x64\Release/</ProgramDataBaseFileName> + <WarningLevel>Level4</WarningLevel> + <SuppressStartupBanner>true</SuppressStartupBanner> + <CompileAs>Default</CompileAs> + </ClCompile> + <ResourceCompile> + <PreprocessorDefinitions>NDEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions> + <Culture>0x0409</Culture> + </ResourceCompile> + <Link> + <OutputFile>.\x64\Release/icuswap.exe</OutputFile> + <SuppressStartupBanner>true</SuppressStartupBanner> + <ProgramDatabaseFile>.\x64\Release/icuswap.pdb</ProgramDatabaseFile> + <SubSystem>Console</SubSystem> + <TargetMachine>MachineX64</TargetMachine> + </Link> + </ItemDefinitionGroup> + <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> + <CustomBuildStep> + <Command>copy "$(TargetPath)" ..\..\..\bin64 +</Command> + <Outputs>..\..\..\bin64\$(TargetFileName);%(Outputs)</Outputs> + </CustomBuildStep> + <Midl> + <TargetEnvironment>X64</TargetEnvironment> + <TypeLibraryName>.\x64\Debug/icuswap.tlb</TypeLibraryName> + </Midl> + <ClCompile> + <Optimization>Disabled</Optimization> + <AdditionalIncludeDirectories>..\..\common;..\toolutil;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> + <PreprocessorDefinitions>WIN64;WIN32;_DEBUG;_CRT_SECURE_NO_DEPRECATE;%(PreprocessorDefinitions)</PreprocessorDefinitions> + <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks> + <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary> + <BufferSecurityCheck>true</BufferSecurityCheck> + <DisableLanguageExtensions>true</DisableLanguageExtensions> + <PrecompiledHeaderOutputFile>.\x64\Debug/icuswap.pch</PrecompiledHeaderOutputFile> + <AssemblerListingLocation>.\x64\Debug/</AssemblerListingLocation> + <ObjectFileName>.\x64\Debug/</ObjectFileName> + <ProgramDataBaseFileName>.\x64\Debug/</ProgramDataBaseFileName> + <BrowseInformation>true</BrowseInformation> + <WarningLevel>Level4</WarningLevel> + 
<SuppressStartupBanner>true</SuppressStartupBanner> + <DebugInformationFormat>ProgramDatabase</DebugInformationFormat> + <CompileAs>Default</CompileAs> + </ClCompile> + <ResourceCompile> + <PreprocessorDefinitions>_DEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions> + <Culture>0x0409</Culture> + </ResourceCompile> + <Link> + <OutputFile>.\x64\Debug/icuswap.exe</OutputFile> + <SuppressStartupBanner>true</SuppressStartupBanner> + <GenerateDebugInformation>true</GenerateDebugInformation> + <ProgramDatabaseFile>.\x64\Debug/icuswap.pdb</ProgramDatabaseFile> + <SubSystem>Console</SubSystem> + <TargetMachine>MachineX64</TargetMachine> + </Link> + </ItemDefinitionGroup> + <ItemGroup> + <ClCompile Include="icuswap.cpp" /> + </ItemGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> + <ImportGroup Label="ExtensionTargets"> + </ImportGroup> +</Project> diff --git a/intl/icu/source/tools/icuswap/sources.txt b/intl/icu/source/tools/icuswap/sources.txt new file mode 100644 index 0000000000..6e73ffbf1e --- /dev/null +++ b/intl/icu/source/tools/icuswap/sources.txt @@ -0,0 +1 @@ +icuswap.cpp diff --git a/intl/icu/source/tools/makeconv/Makefile.in b/intl/icu/source/tools/makeconv/Makefile.in new file mode 100644 index 0000000000..061d54c488 --- /dev/null +++ b/intl/icu/source/tools/makeconv/Makefile.in @@ -0,0 +1,97 @@ +## Makefile.in for ICU - tools/makeconv +## Copyright (C) 2016 and later: Unicode, Inc. and others. +## License & terms of use: http://www.unicode.org/copyright.html +## Copyright (c) 1999-2011, International Business Machines Corporation and +## others. All Rights Reserved. +## Stephen F. Booth + +## Source directory information +srcdir = @srcdir@ +top_srcdir = @top_srcdir@ + +top_builddir = ../.. 
+ +include $(top_builddir)/icudefs.mk + +## Build directory information +subdir = tools/makeconv + +TARGET_STUB_NAME = makeconv + +SECTION = 1 + +MAN_FILES = $(TARGET_STUB_NAME).$(SECTION) + +## Extra files to remove for 'make clean' +CLEANFILES = *~ $(DEPS) $(MAN_FILES) + +## Target information +TARGET = $(BINDIR)/$(TARGET_STUB_NAME)$(EXEEXT) + +CPPFLAGS += -I$(srcdir) -I$(top_srcdir)/common -I$(srcdir)/../toolutil +LIBS = $(LIBICUTOOLUTIL) $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS) $(LIB_M) + +SOURCES = $(shell cat $(srcdir)/sources.txt) +OBJECTS = $(patsubst %.cpp,%.o,$(patsubst %.c,%.o, $(SOURCES))) + +DEPS = $(OBJECTS:.o=.d) + +## List of phony targets +.PHONY : all all-local install install-local clean clean-local \ +distclean distclean-local dist dist-local check check-local install-man + +## Clear suffix list +.SUFFIXES : + +## List of standard targets +all: all-local +install: install-local +clean: clean-local +distclean : distclean-local +dist: dist-local +check: all check-local + +all-local: $(TARGET) $(MAN_FILES) + +install-local: all-local install-man + $(MKINSTALLDIRS) $(DESTDIR)$(bindir) + $(INSTALL) $(TARGET) $(DESTDIR)$(bindir) + +install-man: $(MAN_FILES) + $(MKINSTALLDIRS) $(DESTDIR)$(mandir)/man$(SECTION) + $(INSTALL_DATA) $? 
$(DESTDIR)$(mandir)/man$(SECTION) + + +dist-local: + +clean-local: + test -z "$(CLEANFILES)" || $(RMV) $(CLEANFILES) + $(RMV) $(TARGET) $(OBJECTS) + +distclean-local: clean-local + $(RMV) Makefile + +check-local: all-local + +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + cd $(top_builddir) \ + && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status + +$(TARGET) : $(OBJECTS) + $(LINK.cc) $(OUTOPT)$@ $^ $(LIBS) + $(POST_BUILD_STEP) + + +%.$(SECTION): $(srcdir)/%.$(SECTION).in + cd $(top_builddir) \ + && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status + + +ifeq (,$(MAKECMDGOALS)) +-include $(DEPS) +else +ifneq ($(patsubst %clean,,$(MAKECMDGOALS)),) +-include $(DEPS) +endif +endif + diff --git a/intl/icu/source/tools/makeconv/gencnvex.c b/intl/icu/source/tools/makeconv/gencnvex.c new file mode 100644 index 0000000000..837a2d2c50 --- /dev/null +++ b/intl/icu/source/tools/makeconv/gencnvex.c @@ -0,0 +1,1084 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2003-2014, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: gencnvex.c +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2003oct12 +* created by: Markus W. 
Scherer +*/ + +#include <stdbool.h> +#include <stdio.h> +#include "unicode/utypes.h" +#include "unicode/ustring.h" +#include "cstring.h" +#include "cmemory.h" +#include "ucnv_cnv.h" +#include "ucnvmbcs.h" +#include "toolutil.h" +#include "unewdata.h" +#include "ucm.h" +#include "makeconv.h" +#include "genmbcs.h" + +static void +CnvExtClose(NewConverter *cnvData); + +static UBool +CnvExtIsValid(NewConverter *cnvData, + const uint8_t *bytes, int32_t length); + +static UBool +CnvExtAddTable(NewConverter *cnvData, UCMTable *table, UConverterStaticData *staticData); + +static uint32_t +CnvExtWrite(NewConverter *cnvData, const UConverterStaticData *staticData, + UNewDataMemory *pData, int32_t tableType); + +typedef struct CnvExtData { + NewConverter newConverter; + + UCMFile *ucm; + + /* toUnicode (state table in ucm->states) */ + UToolMemory *toUTable, *toUUChars; + + /* fromUnicode */ + UToolMemory *fromUTableUChars, *fromUTableValues, *fromUBytes; + + uint16_t stage1[MBCS_STAGE_1_SIZE]; + uint16_t stage2[MBCS_STAGE_2_SIZE]; + uint16_t stage3[0x10000<<UCNV_EXT_STAGE_2_LEFT_SHIFT]; /* 0x10000 because of 16-bit stage 2/3 indexes */ + uint32_t stage3b[0x10000]; + + int32_t stage1Top, stage2Top, stage3Top, stage3bTop; + + /* for stage3 compaction of <subchar1> |2 mappings */ + uint16_t stage3Sub1Block; + + /* statistics */ + int32_t + maxInBytes, maxOutBytes, maxBytesPerUChar, + maxInUChars, maxOutUChars, maxUCharsPerByte; +} CnvExtData; + +NewConverter * +CnvExtOpen(UCMFile *ucm) { + CnvExtData *extData; + + extData=(CnvExtData *)uprv_malloc(sizeof(CnvExtData)); + if(extData==NULL) { + printf("out of memory\n"); + exit(U_MEMORY_ALLOCATION_ERROR); + } + uprv_memset(extData, 0, sizeof(CnvExtData)); + + extData->ucm=ucm; /* aliased, not owned */ + + extData->newConverter.close=CnvExtClose; + extData->newConverter.isValid=CnvExtIsValid; + extData->newConverter.addTable=CnvExtAddTable; + extData->newConverter.write=CnvExtWrite; + return &extData->newConverter; +} + +static 
void +CnvExtClose(NewConverter *cnvData) { + CnvExtData *extData=(CnvExtData *)cnvData; + if(extData!=NULL) { + utm_close(extData->toUTable); + utm_close(extData->toUUChars); + utm_close(extData->fromUTableUChars); + utm_close(extData->fromUTableValues); + utm_close(extData->fromUBytes); + uprv_free(extData); + } +} + +/* we do not expect this to be called */ +static UBool +CnvExtIsValid(NewConverter *cnvData, + const uint8_t *bytes, int32_t length) { + // suppress compiler warnings about unused variables + (void)cnvData; + (void)bytes; + (void)length; + return false; +} + +static uint32_t +CnvExtWrite(NewConverter *cnvData, const UConverterStaticData *staticData, + UNewDataMemory *pData, int32_t tableType) { + (void) staticData; // suppress compiler warnings about unused variable + CnvExtData *extData=(CnvExtData *)cnvData; + int32_t length, top, headerSize; + + int32_t indexes[UCNV_EXT_INDEXES_MIN_LENGTH]={ 0 }; + + if(tableType&TABLE_BASE) { + headerSize=0; + } else { + _MBCSHeader header={ { 0, 0, 0, 0 }, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + + /* write the header and base table name for an extension-only table */ + length=(int32_t)uprv_strlen(extData->ucm->baseName)+1; + while(length&3) { + /* add padding */ + extData->ucm->baseName[length++]=0; + } + + headerSize=MBCS_HEADER_V4_LENGTH*4+length; + + /* fill the header */ + header.version[0]=4; + header.version[1]=2; + header.flags=(uint32_t)((headerSize<<8)|MBCS_OUTPUT_EXT_ONLY); + + /* write the header and the base table name */ + udata_writeBlock(pData, &header, MBCS_HEADER_V4_LENGTH*4); + udata_writeBlock(pData, extData->ucm->baseName, length); + } + + /* fill indexes[] - offsets/indexes are in units of the target array */ + top=0; + + indexes[UCNV_EXT_INDEXES_LENGTH]=length=UCNV_EXT_INDEXES_MIN_LENGTH; + top+=length*4; + + indexes[UCNV_EXT_TO_U_INDEX]=top; + indexes[UCNV_EXT_TO_U_LENGTH]=length=utm_countItems(extData->toUTable); + top+=length*4; + + indexes[UCNV_EXT_TO_U_UCHARS_INDEX]=top; + 
indexes[UCNV_EXT_TO_U_UCHARS_LENGTH]=length=utm_countItems(extData->toUUChars); + top+=length*2; + + indexes[UCNV_EXT_FROM_U_UCHARS_INDEX]=top; + length=utm_countItems(extData->fromUTableUChars); + top+=length*2; + + if(top&3) { + /* add padding */ + *((UChar *)utm_alloc(extData->fromUTableUChars))=0; + *((uint32_t *)utm_alloc(extData->fromUTableValues))=0; + ++length; + top+=2; + } + indexes[UCNV_EXT_FROM_U_LENGTH]=length; + + indexes[UCNV_EXT_FROM_U_VALUES_INDEX]=top; + top+=length*4; + + indexes[UCNV_EXT_FROM_U_BYTES_INDEX]=top; + length=utm_countItems(extData->fromUBytes); + top+=length; + + if(top&1) { + /* add padding */ + *((uint8_t *)utm_alloc(extData->fromUBytes))=0; + ++length; + ++top; + } + indexes[UCNV_EXT_FROM_U_BYTES_LENGTH]=length; + + indexes[UCNV_EXT_FROM_U_STAGE_12_INDEX]=top; + indexes[UCNV_EXT_FROM_U_STAGE_1_LENGTH]=length=extData->stage1Top; + indexes[UCNV_EXT_FROM_U_STAGE_12_LENGTH]=length+=extData->stage2Top; + top+=length*2; + + indexes[UCNV_EXT_FROM_U_STAGE_3_INDEX]=top; + length=extData->stage3Top; + top+=length*2; + + if(top&3) { + /* add padding */ + extData->stage3[extData->stage3Top++]=0; + ++length; + top+=2; + } + indexes[UCNV_EXT_FROM_U_STAGE_3_LENGTH]=length; + + indexes[UCNV_EXT_FROM_U_STAGE_3B_INDEX]=top; + indexes[UCNV_EXT_FROM_U_STAGE_3B_LENGTH]=length=extData->stage3bTop; + top+=length*4; + + indexes[UCNV_EXT_SIZE]=top; + + /* statistics */ + indexes[UCNV_EXT_COUNT_BYTES]= + (extData->maxInBytes<<16)| + (extData->maxOutBytes<<8)| + extData->maxBytesPerUChar; + indexes[UCNV_EXT_COUNT_UCHARS]= + (extData->maxInUChars<<16)| + (extData->maxOutUChars<<8)| + extData->maxUCharsPerByte; + + indexes[UCNV_EXT_FLAGS]=extData->ucm->ext->unicodeMask; + + /* write the extension data */ + udata_writeBlock(pData, indexes, sizeof(indexes)); + udata_writeBlock(pData, utm_getStart(extData->toUTable), indexes[UCNV_EXT_TO_U_LENGTH]*4); + udata_writeBlock(pData, utm_getStart(extData->toUUChars), indexes[UCNV_EXT_TO_U_UCHARS_LENGTH]*2); + + 
udata_writeBlock(pData, utm_getStart(extData->fromUTableUChars), indexes[UCNV_EXT_FROM_U_LENGTH]*2); + udata_writeBlock(pData, utm_getStart(extData->fromUTableValues), indexes[UCNV_EXT_FROM_U_LENGTH]*4); + udata_writeBlock(pData, utm_getStart(extData->fromUBytes), indexes[UCNV_EXT_FROM_U_BYTES_LENGTH]); + + udata_writeBlock(pData, extData->stage1, extData->stage1Top*2); + udata_writeBlock(pData, extData->stage2, extData->stage2Top*2); + udata_writeBlock(pData, extData->stage3, extData->stage3Top*2); + udata_writeBlock(pData, extData->stage3b, extData->stage3bTop*4); + +#if 0 + { + int32_t i, j; + + length=extData->stage1Top; + printf("\nstage1[%x]:\n", length); + + for(i=0; i<length; ++i) { + if(extData->stage1[i]!=length) { + printf("stage1[%04x]=%04x\n", i, extData->stage1[i]); + } + } + + j=length; + length=extData->stage2Top; + printf("\nstage2[%x]:\n", length); + + for(i=0; i<length; ++j, ++i) { + if(extData->stage2[i]!=0) { + printf("stage12[%04x]=%04x\n", j, extData->stage2[i]); + } + } + + length=extData->stage3Top; + printf("\nstage3[%x]:\n", length); + + for(i=0; i<length; ++i) { + if(extData->stage3[i]!=0) { + printf("stage3[%04x]=%04x\n", i, extData->stage3[i]); + } + } + + length=extData->stage3bTop; + printf("\nstage3b[%x]:\n", length); + + for(i=0; i<length; ++i) { + if(extData->stage3b[i]!=0) { + printf("stage3b[%04x]=%08x\n", i, extData->stage3b[i]); + } + } + } +#endif + + if(VERBOSE) { + printf("size of extension data: %ld\n", (long)top); + } + + /* return the number of bytes that should have been written */ + return (uint32_t)(headerSize+top); +} + +/* to Unicode --------------------------------------------------------------- */ + +/* + * Remove fromUnicode fallbacks and SUB mappings which are irrelevant for + * the toUnicode table. + * This includes mappings with MBCS_FROM_U_EXT_FLAG which were suitable + * for the base toUnicode table but not for the base fromUnicode table. + * The table must be sorted. 
+ * Modifies previous data in the reverseMap. + */ +static int32_t +reduceToUMappings(UCMTable *table) { + UCMapping *mappings; + int32_t *map; + int32_t i, j, count; + int8_t flag; + + mappings=table->mappings; + map=table->reverseMap; + count=table->mappingsLength; + + /* leave the map alone for the initial mappings with desired flags */ + for(i=j=0; i<count; ++i) { + flag=mappings[map[i]].f; + if(flag!=0 && flag!=3) { + break; + } + } + + /* reduce from here to the rest */ + for(j=i; i<count; ++i) { + flag=mappings[map[i]].f; + if(flag==0 || flag==3) { + map[j++]=map[i]; + } + } + + return j; +} + +static uint32_t +getToUnicodeValue(CnvExtData *extData, UCMTable *table, UCMapping *m) { + UChar32 *u32; + UChar *u; + uint32_t value; + int32_t u16Length, ratio; + UErrorCode errorCode; + + /* write the Unicode result code point or string index */ + if(m->uLen==1) { + u16Length=U16_LENGTH(m->u); + value=(uint32_t)(UCNV_EXT_TO_U_MIN_CODE_POINT+m->u); + } else { + /* the parser enforces m->uLen<=UCNV_EXT_MAX_UCHARS */ + + /* get the result code point string and its 16-bit string length */ + u32=UCM_GET_CODE_POINTS(table, m); + errorCode=U_ZERO_ERROR; + u_strFromUTF32(NULL, 0, &u16Length, u32, m->uLen, &errorCode); + if(U_FAILURE(errorCode) && errorCode!=U_BUFFER_OVERFLOW_ERROR) { + exit(errorCode); + } + + /* allocate it and put its length and index into the value */ + value= + (((uint32_t)u16Length+UCNV_EXT_TO_U_LENGTH_OFFSET)<<UCNV_EXT_TO_U_LENGTH_SHIFT)| + ((uint32_t)utm_countItems(extData->toUUChars)); + u=utm_allocN(extData->toUUChars, u16Length); + + /* write the result 16-bit string */ + errorCode=U_ZERO_ERROR; + u_strFromUTF32(u, u16Length, NULL, u32, m->uLen, &errorCode); + if(U_FAILURE(errorCode) && errorCode!=U_BUFFER_OVERFLOW_ERROR) { + exit(errorCode); + } + } + if(m->f==0) { + value|=UCNV_EXT_TO_U_ROUNDTRIP_FLAG; + } + + /* update statistics */ + if(m->bLen>extData->maxInBytes) { + extData->maxInBytes=m->bLen; + } + if(u16Length>extData->maxOutUChars) { + 
extData->maxOutUChars=u16Length; + } + + ratio=(u16Length+(m->bLen-1))/m->bLen; + if(ratio>extData->maxUCharsPerByte) { + extData->maxUCharsPerByte=ratio; + } + + return value; +} + +/* + * Recursive toUTable generator core function. + * Preconditions: + * - start<limit (There is at least one mapping.) + * - The mappings are sorted lexically. (Access is through the reverseMap.) + * - All mappings between start and limit have input sequences that share + * the same prefix of unitIndex length, and therefore all of these sequences + * are at least unitIndex+1 long. + * - There are only relevant mappings available through the reverseMap, + * see reduceToUMappings(). + * + * One function invocation generates one section table. + * + * Steps: + * 1. Count the number of unique unit values and get the low/high unit values + * that occur at unitIndex. + * 2. Allocate the section table with possible optimization for linear access. + * 3. Write temporary version of the section table with start indexes of + * subsections, each corresponding to one unit value at unitIndex. + * 4. 
Iterate through the table once more, and depending on the subsection length:
 *    0: write 0 as a result value (unused byte in linear-access section table)
 *   >0: if there is one mapping with an input unit sequence of unitIndex+1
 *       then defaultValue=compute the mapping result for this whole sequence
 *       else defaultValue=0
 *
 *       recurse into the subsection
 *
 * Returns false (after printing an error to stderr) if a section would need
 * 0x100 or more entries, or if two mappings have the same input bytes.
 */
static UBool
generateToUTable(CnvExtData *extData, UCMTable *table,
                 int32_t start, int32_t limit, int32_t unitIndex,
                 uint32_t defaultValue) {
    UCMapping *mappings, *m;
    int32_t *map;
    int32_t i, j, uniqueCount, count, subStart, subLimit;

    uint8_t *bytes;
    int32_t low, high, prev;

    uint32_t *section;

    mappings=table->mappings;
    map=table->reverseMap;

    /* step 1: examine the input units; set low, high, uniqueCount */
    m=mappings+map[start];
    bytes=UCM_GET_BYTES(table, m);
    low=bytes[unitIndex];
    uniqueCount=1;

    prev=high=low;
    for(i=start+1; i<limit; ++i) {
        m=mappings+map[i];
        bytes=UCM_GET_BYTES(table, m);
        high=bytes[unitIndex];

        if(high!=prev) {
            prev=high;
            ++uniqueCount;
        }
    }

    /* step 2: allocate the section; set count, section */
    count=(high-low)+1;
    if(count<0x100 && (unitIndex==0 || uniqueCount>=(3*count)/4)) {
        /*
         * for the root table and for fairly full tables:
         * allocate for direct, linear array access
         * by keeping count, to write an entry for each unit value
         * from low to high
         * exception: use a compact table if count==0x100 because
         * that cannot be encoded in the length byte
         */
    } else {
        count=uniqueCount;
    }

    if(count>=0x100) {
        fprintf(stderr, "error: toUnicode extension table section overflow: %ld section entries\n", (long)count);
        return false;
    }

    /* allocate the section: 1 entry for the header + count for the items */
    section=(uint32_t *)utm_allocN(extData->toUTable, 1+count);

    /* write the section header: entry count in the byte field, defaultValue in the value field */
    *section++=((uint32_t)count<<UCNV_EXT_TO_U_BYTE_SHIFT)|defaultValue;

    /* step 3: write temporary section table with subsection starts */
    prev=low-1; /* just before low to prevent empty subsections before low */
    j=0; /* section table index */
    for(i=start; i<limit; ++i) {
        m=mappings+map[i];
        bytes=UCM_GET_BYTES(table, m);
        high=bytes[unitIndex];

        if(high!=prev) {
            /* start of a new subsection for unit high */
            if(count>uniqueCount) {
                /* write empty subsections for unused units in a linear table */
                while(++prev<high) {
                    section[j++]=((uint32_t)prev<<UCNV_EXT_TO_U_BYTE_SHIFT)|(uint32_t)i;
                }
            } else {
                prev=high;
            }

            /* write the entry with the subsection start */
            section[j++]=((uint32_t)high<<UCNV_EXT_TO_U_BYTE_SHIFT)|(uint32_t)i;
        }
    }
    /* assert(j==count) */

    /* step 4: recurse and write results */
    subLimit=UCNV_EXT_TO_U_GET_VALUE(section[0]);
    for(j=0; j<count; ++j) {
        /* this subsection is [subStart, subLimit) in the reverseMap */
        subStart=subLimit;
        subLimit= (j+1)<count ? UCNV_EXT_TO_U_GET_VALUE(section[j+1]) : limit;

        /* remove the subStart temporary value */
        section[j]&=~UCNV_EXT_TO_U_VALUE_MASK;

        if(subStart==subLimit) {
            /* leave the value zero: empty subsection for unused unit in a linear table */
            continue;
        }

        /* see if there is exactly one input unit sequence of length unitIndex+1 */
        defaultValue=0;
        m=mappings+map[subStart];
        if(m->bLen==unitIndex+1) {
            /* do not include this in generateToUTable() */
            ++subStart;

            if(subStart<subLimit && mappings[map[subStart]].bLen==unitIndex+1) {
                /* print error for multiple same-input-sequence mappings */
                fprintf(stderr, "error: multiple mappings from same bytes\n");
                ucm_printMapping(table, m, stderr);
                ucm_printMapping(table, mappings+map[subStart], stderr);
                return false;
            }

            defaultValue=getToUnicodeValue(extData, table, m);
        }

        if(subStart==subLimit) {
            /* write the result for the input sequence ending here */
            section[j]|=defaultValue;
        } else {
            /* write the index to the subsection table */
            section[j]|=(uint32_t)utm_countItems(extData->toUTable);

            /* recurse */
            if(!generateToUTable(extData, table, subStart, subLimit, unitIndex+1, defaultValue)) {
                return false;
            }
        }
    }
    return true;
}

/*
 * Generate the toUTable and toUUChars from the input table.
 * The input table must be sorted, and all precision flags must be 0..3.
 * This function will modify the table's reverseMap.
 * Returns the result of generateToUTable() for the root section.
 */
static UBool
makeToUTable(CnvExtData *extData, UCMTable *table) {
    int32_t toUCount;

    /* keep only the mappings relevant for toUnicode in the reverseMap */
    toUCount=reduceToUMappings(table);

    extData->toUTable=utm_open("cnv extension toUTable", 0x10000, UCNV_EXT_TO_U_MIN_CODE_POINT, 4);
    extData->toUUChars=utm_open("cnv extension toUUChars", 0x10000, UCNV_EXT_TO_U_INDEX_MASK+1, 2);

    /* root section: all relevant mappings, input unit index 0, no default value */
    return generateToUTable(extData, table, 0, toUCount, 0, 0);
}

/* from Unicode ------------------------------------------------------------- */

/*
 * preprocessing:
 * rebuild reverseMap with mapping indexes for mappings relevant for from Unicode
 * change each Unicode string to encode all but the first code point in 16-bit form
 *
 * generation:
 * for each unique code point
 *   write an entry in the 3-stage trie
 *   check that there is only one single-code point sequence
 *   start recursion for following 16-bit input units
 */

/*
 * Remove toUnicode fallbacks and non-<subchar1> SUB mappings
 * which are irrelevant for the fromUnicode extension table.
 * Remove MBCS_FROM_U_EXT_FLAG bits.
 * Overwrite the reverseMap with an index array to the relevant mappings.
 * Modify the code point sequences to a generator-friendly format where
 * the first code point remains unchanged but the following are recoded
 * into 16-bit Unicode string form.
 * The table must be sorted.
 * Destroys previous data in the reverseMap.
 */
static int32_t
prepareFromUMappings(UCMTable *table) {
    UCMapping *mappings, *m;
    int32_t *map;
    int32_t i, j, count;
    int8_t flag;

    mappings=table->mappings;
    map=table->reverseMap;
    count=table->mappingsLength;

    /*
     * we do not go through the map on input because the mappings are
     * sorted lexically
     */
    m=mappings;

    for(i=j=0; i<count; ++m, ++i) {
        flag=m->f;
        if(flag>=0) {
            /* mask the non-negative flag and store the masked value back */
            flag&=MBCS_FROM_U_EXT_MASK;
            m->f=flag;
        }
        /* keep flags 0, 1 and 4, and flag 2 only for single-byte results */
        if(flag==0 || flag==1 || (flag==2 && m->bLen==1) || flag==4) {
            map[j++]=i;

            if(m->uLen>1) {
                /* recode all but the first code point to 16-bit Unicode */
                UChar32 *u32;
                UChar *u;
                UChar32 c;
                int32_t q, r;

                u32=UCM_GET_CODE_POINTS(table, m);
                u=(UChar *)u32; /* destructive in-place recoding */
                for(r=2, q=1; q<m->uLen; ++q) {
                    c=u32[q];
                    U16_APPEND_UNSAFE(u, r, c);
                }

                /* counts the first code point always at 2 - the first 16-bit unit is at 16-bit index 2 */
                m->uLen=(int8_t)r;
            }
        }
    }

    /* number of entries written to the reverseMap */
    return j;
}

/*
 * Compute the fromUnicode result word for mapping m and update the
 * fromUnicode statistics (maxInUChars, maxOutBytes, maxBytesPerUChar).
 * Results of 1..3 bytes are stored in the value itself; longer results are
 * appended to fromUBytes and the value holds their index.
 * For m->f==2 the function returns UCNV_EXT_FROM_U_SUBCHAR1 without
 * touching the statistics.
 */
static uint32_t
getFromUBytesValue(CnvExtData *extData, UCMTable *table, UCMapping *m) {
    uint8_t *bytes, *resultBytes;
    uint32_t value;
    int32_t u16Length, ratio;

    if(m->f==2) {
        /*
         * no mapping, <subchar1> preferred
         *
         * no need to count in statistics because the subchars are already
         * counted for maxOutBytes and maxBytesPerUChar in UConverterStaticData,
         * and this non-mapping does not count for maxInUChars which are always
         * trivially at least two if counting unmappable supplementary code points
         */
        return UCNV_EXT_FROM_U_SUBCHAR1;
    }

    bytes=UCM_GET_BYTES(table, m);
    value=0;
    switch(m->bLen) {
        /* 1..3: store the bytes in the value word */
    case 3:
        value=((uint32_t)*bytes++)<<16;
        /* intentional fall through to add the next byte */
    case 2:
        value|=((uint32_t)*bytes++)<<8;
        /* intentional fall through to add the last byte */
    case 1:
        value|=*bytes;
        break;
    default:
        /* the parser enforces m->bLen<=UCNV_EXT_MAX_BYTES */
        /* store the bytes in fromUBytes[] and the index in the value word */
        value=(uint32_t)utm_countItems(extData->fromUBytes);
        resultBytes=utm_allocN(extData->fromUBytes, m->bLen);
        uprv_memcpy(resultBytes, bytes, m->bLen);
        break;
    }
    value|=(uint32_t)m->bLen<<UCNV_EXT_FROM_U_LENGTH_SHIFT;
    if(m->f==0) {
        value|=UCNV_EXT_FROM_U_ROUNDTRIP_FLAG;
    } else if(m->f==4) {
        value|=UCNV_EXT_FROM_U_GOOD_ONE_WAY_FLAG;
    }

    /* calculate the real UTF-16 length (see recoding in prepareFromUMappings()) */
    if(m->uLen==1) {
        u16Length=U16_LENGTH(m->u);
    } else {
        /* recoded uLen counts the first code point as 2 units; replace that by its real UTF-16 length */
        u16Length=U16_LENGTH(UCM_GET_CODE_POINTS(table, m)[0])+(m->uLen-2);
    }

    /* update statistics */
    if(u16Length>extData->maxInUChars) {
        extData->maxInUChars=u16Length;
    }
    if(m->bLen>extData->maxOutBytes) {
        extData->maxOutBytes=m->bLen;
    }

    /* bytes per input UChar, rounded up */
    ratio=(m->bLen+(u16Length-1))/u16Length;
    if(ratio>extData->maxBytesPerUChar) {
        extData->maxBytesPerUChar=ratio;
    }

    return value;
}

/*
 * works like generateToUTable(), except that the
 * output section consists of two arrays, one for input UChars and one
 * for result values
 *
 * also, fromUTable sections are always stored in a compact form for
 * access via binary search
 *
 * Returns false (after printing an error to stderr) if two mappings
 * have the same Unicode input sequence.
 */
static UBool
generateFromUTable(CnvExtData *extData, UCMTable *table,
                   int32_t start, int32_t limit, int32_t unitIndex,
                   uint32_t defaultValue) {
    UCMapping *mappings, *m;
    int32_t *map;
    int32_t i, j, uniqueCount, count, subStart, subLimit;

    UChar *uchars;
    UChar32 low, high, prev;

    UChar *sectionUChars;
    uint32_t *sectionValues;

    mappings=table->mappings;
    map=table->reverseMap;

    /* step 1: examine the input units; set low, high, uniqueCount */
    m=mappings+map[start];
    uchars=(UChar *)UCM_GET_CODE_POINTS(table, m);
    low=uchars[unitIndex];
    uniqueCount=1;

    prev=high=low;
    for(i=start+1; i<limit; ++i) {
        m=mappings+map[i];
        uchars=(UChar *)UCM_GET_CODE_POINTS(table, m);
        high=uchars[unitIndex];

        if(high!=prev) {
            prev=high;
            ++uniqueCount;
        }
    }

    /* step 2: allocate the section; set count, section */
    /* the fromUTable always stores for access via binary search */
    count=uniqueCount;

    /* allocate the section: 1 entry for the header + count for the items */
    sectionUChars=(UChar *)utm_allocN(extData->fromUTableUChars, 1+count);
    sectionValues=(uint32_t *)utm_allocN(extData->fromUTableValues, 1+count);

    /* write the section header */
    *sectionUChars++=(UChar)count;
    *sectionValues++=defaultValue;

    /* step 3: write temporary section table with subsection starts */
    prev=low-1; /* just before low to prevent empty subsections before low */
    j=0; /* section table index */
    for(i=start; i<limit; ++i) {
        m=mappings+map[i];
        uchars=(UChar *)UCM_GET_CODE_POINTS(table, m);
        high=uchars[unitIndex];

        if(high!=prev) {
            /* start of a new subsection for unit high */
            prev=high;

            /* write the entry with the subsection start */
            sectionUChars[j]=(UChar)high;
            sectionValues[j]=(uint32_t)i;
            ++j;
        }
    }
    /* assert(j==count) */

    /* step 4: recurse and write results */
    subLimit=(int32_t)(sectionValues[0]);
    for(j=0; j<count; ++j) {
        /* this subsection is [subStart, subLimit) in the reverseMap */
        subStart=subLimit;
        subLimit= (j+1)<count ? (int32_t)(sectionValues[j+1]) : limit;

        /* see if there is exactly one input unit sequence of length unitIndex+1 */
        defaultValue=0;
        m=mappings+map[subStart];
        if(m->uLen==unitIndex+1) {
            /* do not include this in generateFromUTable() */
            ++subStart;

            if(subStart<subLimit && mappings[map[subStart]].uLen==unitIndex+1) {
                /* print error for multiple same-input-sequence mappings */
                fprintf(stderr, "error: multiple mappings from same Unicode code points\n");
                ucm_printMapping(table, m, stderr);
                ucm_printMapping(table, mappings+map[subStart], stderr);
                return false;
            }

            defaultValue=getFromUBytesValue(extData, table, m);
        }

        if(subStart==subLimit) {
            /* write the result for the input sequence ending here */
            sectionValues[j]=defaultValue;
        } else {
            /* write the index to the subsection table */
            sectionValues[j]=(uint32_t)utm_countItems(extData->fromUTableValues);

            /* recurse */
            if(!generateFromUTable(extData, table, subStart, subLimit, unitIndex+1, defaultValue)) {
                return false;
            }
        }
    }
    return true;
}

/*
 * add entries to the fromUnicode trie,
 * assume to be called with code points in ascending order
 * and use that to build the trie in precompacted form
 *
 * A value of 0 is ignored (nothing is stored for c).
 * Exits the process with an error message if stage 2, stage 3 or
 * stage 3b would overflow.
 */
static void
addFromUTrieEntry(CnvExtData *extData, UChar32 c, uint32_t value) {
    int32_t i1, i2, i3, i3b, nextOffset, min, newBlock;

    if(value==0) {
        return;
    }

    /*
     * compute the index for each stage,
     * allocate a stage block if necessary,
     * and write the stage value
     */
    i1=c>>10;
    if(i1>=extData->stage1Top) {
        extData->stage1Top=i1+1;
    }

    /* index of c within its stage 2 block */
    nextOffset=(c>>4)&0x3f;

    if(extData->stage1[i1]==0) {
        /* allocate another block in stage 2; overlap with the previous block */
        newBlock=extData->stage2Top;
        min=newBlock-nextOffset; /* minimum block start with overlap */
        while(min<newBlock && extData->stage2[newBlock-1]==0) {
            --newBlock;
        }

        extData->stage1[i1]=(uint16_t)newBlock;
        extData->stage2Top=newBlock+MBCS_STAGE_2_BLOCK_SIZE;
        if(extData->stage2Top>UPRV_LENGTHOF(extData->stage2)) {
            fprintf(stderr, "error: too many stage 2 entries at U+%04x\n", (int)c);
            exit(U_MEMORY_ALLOCATION_ERROR);
        }
    }

    i2=extData->stage1[i1]+nextOffset;
    /* index of c within its stage 3 block */
    nextOffset=c&0xf;

    if(extData->stage2[i2]==0) {
        /* allocate another block in stage 3; overlap with the previous block */
        newBlock=extData->stage3Top;
        min=newBlock-nextOffset; /* minimum block start with overlap */
        while(min<newBlock && extData->stage3[newBlock-1]==0) {
            --newBlock;
        }

        /* round up to a multiple of stage 3 granularity >1 (similar to utrie.c) */
        newBlock=(newBlock+(UCNV_EXT_STAGE_3_GRANULARITY-1))&~(UCNV_EXT_STAGE_3_GRANULARITY-1);
        extData->stage2[i2]=(uint16_t)(newBlock>>UCNV_EXT_STAGE_2_LEFT_SHIFT);

        extData->stage3Top=newBlock+MBCS_STAGE_3_BLOCK_SIZE;
        if(extData->stage3Top>UPRV_LENGTHOF(extData->stage3)) {
            fprintf(stderr, "error: too many stage 3 entries at U+%04x\n", (int)c);
            exit(U_MEMORY_ALLOCATION_ERROR);
        }
    }

    i3=((int32_t)extData->stage2[i2]<<UCNV_EXT_STAGE_2_LEFT_SHIFT)+nextOffset;
    /*
     * assume extData->stage3[i3]==0 because we get
     * code points in strictly ascending order
     */

    if(value==UCNV_EXT_FROM_U_SUBCHAR1) {
        /* <subchar1> SUB mapping, see getFromUBytesValue() and prepareFromUMappings() */
        extData->stage3[i3]=1;

        /*
         * precompaction is not optimal for <subchar1> |2 mappings because
         * stage3 values for them are all the same, unlike for other mappings
         * which all have unique values;
         * use a simple compaction of reusing a whole block filled with these
         * mappings
         */

        /* is the entire block filled with <subchar1> |2 mappings? */
        if(nextOffset==MBCS_STAGE_3_BLOCK_SIZE-1) {
            /* scan from the block start up to, but not including, the entry just written */
            for(min=i3-nextOffset;
                min<i3 && extData->stage3[min]==1;
                ++min) {}

            if(min==i3) {
                /* the entire block is filled with these mappings */
                if(extData->stage3Sub1Block!=0) {
                    /* point to the previous such block and remove this block from stage3 */
                    extData->stage2[i2]=extData->stage3Sub1Block;
                    extData->stage3Top-=MBCS_STAGE_3_BLOCK_SIZE;
                    uprv_memset(extData->stage3+extData->stage3Top, 0, MBCS_STAGE_3_BLOCK_SIZE*2);
                } else {
                    /* remember this block's stage2 entry */
                    extData->stage3Sub1Block=extData->stage2[i2];
                }
            }
        }
    } else {
        /* take the next stage 3b slot; the overflow check uses the slot index before the increment */
        if((i3b=extData->stage3bTop++)>=UPRV_LENGTHOF(extData->stage3b)) {
            fprintf(stderr, "error: too many stage 3b entries at U+%04x\n", (int)c);
            exit(U_MEMORY_ALLOCATION_ERROR);
        }

        /* roundtrip or fallback mapping */
        extData->stage3[i3]=(uint16_t)i3b;
        extData->stage3b[i3b]=value;
    }
}

/*
 * Walk the first mapLength reverseMap entries, grouped by initial code point.
 * Each initial code point gets a trie entry via addFromUTrieEntry(): either the
 * result value of its single-code point mapping, or the index of a fromUTable
 * section generated for the longer sequences that start with it.
 * Returns true for an empty map; returns false if generateFromUTable() fails
 * or if two mappings have the same single code point as input.
 */
static UBool
generateFromUTrie(CnvExtData *extData, UCMTable *table, int32_t mapLength) {
    UCMapping *mappings, *m;
    int32_t *map;
    uint32_t value;
    int32_t subStart, subLimit;

    UChar32 *codePoints;
    UChar32 c, next;

    if(mapLength==0) {
        return true;
    }

    mappings=table->mappings;
    map=table->reverseMap;

    /*
     * iterate over same-initial-code point mappings,
     * enter the initial code point into the trie,
     * and start a recursion on the corresponding mappings section
     * with generateFromUTable()
     */
    m=mappings+map[0];
    codePoints=UCM_GET_CODE_POINTS(table, m);
    next=codePoints[0];
    subLimit=0;
    while(subLimit<mapLength) {
        /* get a new subsection of mappings starting with the same code point */
        subStart=subLimit;
        c=next;
        while(next==c && ++subLimit<mapLength) {
            m=mappings+map[subLimit];
            codePoints=UCM_GET_CODE_POINTS(table, m);
            next=codePoints[0];
        }

        /*
         * compute the value for this code point;
         * if there is a mapping for this code point alone, it is at subStart
         * because the table is sorted lexically
         */
        value=0;
        m=mappings+map[subStart];
        codePoints=UCM_GET_CODE_POINTS(table, m);
        if(m->uLen==1) {
            /* do not include this in generateFromUTable() */
            ++subStart;

            if(subStart<subLimit && mappings[map[subStart]].uLen==1) {
                /* print error for multiple same-input-sequence mappings */
                fprintf(stderr, "error: multiple mappings from same Unicode code points\n");
                ucm_printMapping(table, m, stderr);
                ucm_printMapping(table, mappings+map[subStart], stderr);
                return false;
            }

            value=getFromUBytesValue(extData, table, m);
        }

        if(subStart==subLimit) {
            /* write the result for this one code point */
            addFromUTrieEntry(extData, c, value);
        } else {
            /* write the index to the subsection table */
            addFromUTrieEntry(extData, c, (uint32_t)utm_countItems(extData->fromUTableValues));

            /* recurse, starting from 16-bit-unit index 2, the first 16-bit unit after c */
            if(!generateFromUTable(extData, table, subStart, subLimit, 2, value)) {
                return false;
            }
        }
    }
    return true;
}

/*
 * Generate the fromU data structures from the input table.
 * The input table must be sorted, and all precision flags must be 0..3.
 * This function will modify the table's reverseMap.
+ */ +static UBool +makeFromUTable(CnvExtData *extData, UCMTable *table) { + uint16_t *stage1; + int32_t i, stage1Top, fromUCount; + + fromUCount=prepareFromUMappings(table); + + extData->fromUTableUChars=utm_open("cnv extension fromUTableUChars", 0x10000, UCNV_EXT_FROM_U_DATA_MASK+1, 2); + extData->fromUTableValues=utm_open("cnv extension fromUTableValues", 0x10000, UCNV_EXT_FROM_U_DATA_MASK+1, 4); + extData->fromUBytes=utm_open("cnv extension fromUBytes", 0x10000, UCNV_EXT_FROM_U_DATA_MASK+1, 1); + + /* allocate all-unassigned stage blocks */ + extData->stage2Top=MBCS_STAGE_2_FIRST_ASSIGNED; + extData->stage3Top=MBCS_STAGE_3_FIRST_ASSIGNED; + + /* + * stage 3b stores only unique values, and in + * index 0: 0 for "no mapping" + * index 1: "no mapping" with preference for <subchar1> rather than <subchar> + */ + extData->stage3b[1]=UCNV_EXT_FROM_U_SUBCHAR1; + extData->stage3bTop=2; + + /* allocate the first entry in the fromUTable because index 0 means "no result" */ + utm_alloc(extData->fromUTableUChars); + utm_alloc(extData->fromUTableValues); + + if(!generateFromUTrie(extData, table, fromUCount)) { + return false; + } + + /* + * offset the stage 1 trie entries by stage1Top because they will + * be stored in a single array + */ + stage1=extData->stage1; + stage1Top=extData->stage1Top; + for(i=0; i<stage1Top; ++i) { + stage1[i]=(uint16_t)(stage1[i]+stage1Top); + } + + return true; +} + +/* -------------------------------------------------------------------------- */ + +static UBool +CnvExtAddTable(NewConverter *cnvData, UCMTable *table, UConverterStaticData *staticData) { + CnvExtData *extData; + + if(table->unicodeMask&UCNV_HAS_SURROGATES) { + fprintf(stderr, "error: contains mappings for surrogate code points\n"); + return false; + } + + staticData->conversionType=UCNV_MBCS; + + extData=(CnvExtData *)cnvData; + + /* + * assume that the table is sorted + * + * call the functions in this order because + * makeToUTable() modifies the original reverseMap, + * 
makeFromUTable() writes a whole new mapping into reverseMap + */ + return + makeToUTable(extData, table) && + makeFromUTable(extData, table); +} diff --git a/intl/icu/source/tools/makeconv/genmbcs.cpp b/intl/icu/source/tools/makeconv/genmbcs.cpp new file mode 100644 index 0000000000..43b96d814f --- /dev/null +++ b/intl/icu/source/tools/makeconv/genmbcs.cpp @@ -0,0 +1,1576 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2000-2016, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: genmbcs.cpp +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2000jul06 +* created by: Markus W. Scherer +*/ + +#include <stdio.h> +#include "unicode/utypes.h" +#include "cstring.h" +#include "cmemory.h" +#include "unewdata.h" +#include "ucnv_cnv.h" +#include "ucnvmbcs.h" +#include "ucm.h" +#include "makeconv.h" +#include "genmbcs.h" +#include "toolutil.h" + +/* + * TODO: Split this file into toUnicode, SBCSFromUnicode and MBCSFromUnicode files. + * Reduce tests for maxCharLength. + */ + +struct MBCSData { + NewConverter newConverter; + + UCMFile *ucm; + + /* toUnicode (state table in ucm->states) */ + _MBCSToUFallback toUFallbacks[MBCS_MAX_FALLBACK_COUNT]; + int32_t countToUFallbacks; + uint16_t *unicodeCodeUnits; + + /* fromUnicode */ + uint16_t stage1[MBCS_STAGE_1_SIZE]; + uint16_t stage2Single[MBCS_STAGE_2_SIZE]; /* stage 2 for single-byte codepages */ + uint32_t stage2[MBCS_STAGE_2_SIZE]; /* stage 2 for MBCS */ + uint8_t *fromUBytes; + uint32_t stage2Top, stage3Top; + + /* fromUTF8 */ + uint16_t stageUTF8[0x10000>>MBCS_UTF8_STAGE_SHIFT]; /* allow for utf8Max=0xffff */ + + /* + * Maximum UTF-8-friendly code point. 
+ * 0 if !utf8Friendly, otherwise 0x01ff..0xffff in steps of 0x100. + * If utf8Friendly, utf8Max is normally either MBCS_UTF8_MAX or 0xffff. + */ + uint16_t utf8Max; + + UBool utf8Friendly; + UBool omitFromU; +}; + +/* prototypes */ +U_CDECL_BEGIN +static void +MBCSClose(NewConverter *cnvData); + +static UBool +MBCSStartMappings(MBCSData *mbcsData); + +static UBool +MBCSAddToUnicode(MBCSData *mbcsData, + const uint8_t *bytes, int32_t length, + UChar32 c, + int8_t flag); + +static UBool +MBCSIsValid(NewConverter *cnvData, + const uint8_t *bytes, int32_t length); + +static UBool +MBCSSingleAddFromUnicode(MBCSData *mbcsData, + const uint8_t *bytes, int32_t length, + UChar32 c, + int8_t flag); + +static UBool +MBCSAddFromUnicode(MBCSData *mbcsData, + const uint8_t *bytes, int32_t length, + UChar32 c, + int8_t flag); + +static void +MBCSPostprocess(MBCSData *mbcsData, const UConverterStaticData *staticData); + +static UBool +MBCSAddTable(NewConverter *cnvData, UCMTable *table, UConverterStaticData *staticData); + +static uint32_t +MBCSWrite(NewConverter *cnvData, const UConverterStaticData *staticData, + UNewDataMemory *pData, int32_t tableType); +U_CDECL_END + +/* helper ------------------------------------------------------------------- */ + +static inline char +hexDigit(uint8_t digit) { + return digit<=9 ? (char)('0'+digit) : (char)('a'-10+digit); +} + +static inline char * +printBytes(char *buffer, const uint8_t *bytes, int32_t length) { + char *s=buffer; + while(length>0) { + *s++=hexDigit((uint8_t)(*bytes>>4)); + *s++=hexDigit((uint8_t)(*bytes&0xf)); + ++bytes; + --length; + } + + *s=0; + return buffer; +} + +/* implementation ----------------------------------------------------------- */ + +static MBCSData gDummy; + + +U_CFUNC const MBCSData * +MBCSGetDummy() { + uprv_memset(&gDummy, 0, sizeof(MBCSData)); + + /* + * Set "pessimistic" values which may sometimes move too many + * mappings to the extension table (but never too few). 
+ * These values cause MBCSOkForBaseFromUnicode() to return false for the + * largest set of mappings. + * Assume maxCharLength>1. + */ + gDummy.utf8Friendly=true; + if(SMALL) { + gDummy.utf8Max=0xffff; + gDummy.omitFromU=true; + } else { + gDummy.utf8Max=MBCS_UTF8_MAX; + } + return &gDummy; +} + +static void +MBCSInit(MBCSData *mbcsData, UCMFile *ucm) { + uprv_memset(mbcsData, 0, sizeof(MBCSData)); + + mbcsData->ucm=ucm; /* aliased, not owned */ + + mbcsData->newConverter.close=MBCSClose; + mbcsData->newConverter.isValid=MBCSIsValid; + mbcsData->newConverter.addTable=MBCSAddTable; + mbcsData->newConverter.write=MBCSWrite; +} + +U_CFUNC NewConverter * +MBCSOpen(UCMFile *ucm) { + MBCSData *mbcsData=(MBCSData *)uprv_malloc(sizeof(MBCSData)); + if(mbcsData==nullptr) { + printf("out of memory\n"); + exit(U_MEMORY_ALLOCATION_ERROR); + } + + MBCSInit(mbcsData, ucm); + return &mbcsData->newConverter; +} + +static void +MBCSDestruct(MBCSData *mbcsData) { + uprv_free(mbcsData->unicodeCodeUnits); + uprv_free(mbcsData->fromUBytes); +} + +U_CDECL_BEGIN +static void +MBCSClose(NewConverter *cnvData) { + MBCSData *mbcsData=(MBCSData *)cnvData; + if(mbcsData!=nullptr) { + MBCSDestruct(mbcsData); + uprv_free(mbcsData); + } +} +U_CDECL_END + +static UBool +MBCSStartMappings(MBCSData *mbcsData) { + int32_t i, sum, maxCharLength, + stage2NullLength, stage2AllocLength, + stage3NullLength, stage3AllocLength; + + /* toUnicode */ + + /* allocate the code unit array and prefill it with "unassigned" values */ + sum=mbcsData->ucm->states.countToUCodeUnits; + if(VERBOSE) { + printf("the total number of offsets is 0x%lx=%ld\n", (long)sum, (long)sum); + } + + if(sum>0) { + mbcsData->unicodeCodeUnits=(uint16_t *)uprv_malloc(sum*sizeof(uint16_t)); + if(mbcsData->unicodeCodeUnits==nullptr) { + fprintf(stderr, "error: out of memory allocating %ld 16-bit code units\n", + (long)sum); + return false; + } + for(i=0; i<sum; ++i) { + mbcsData->unicodeCodeUnits[i]=0xfffe; + } + } + + /* fromUnicode */ + 
maxCharLength=mbcsData->ucm->states.maxCharLength; + + /* allocate the codepage mappings and preset the first 16 characters to 0 */ + if(maxCharLength==1) { + /* allocate 64k 16-bit results for single-byte codepages */ + sum=0x20000; + } else { + /* allocate 1M * maxCharLength bytes for at most 1M mappings */ + sum=0x100000*maxCharLength; + } + mbcsData->fromUBytes=(uint8_t *)uprv_malloc(sum); + if(mbcsData->fromUBytes==nullptr) { + fprintf(stderr, "error: out of memory allocating %ld B for target mappings\n", (long)sum); + return false; + } + uprv_memset(mbcsData->fromUBytes, 0, sum); + + /* + * UTF-8-friendly fromUnicode tries: allocate multiple blocks at a time. + * See ucnvmbcs.h for details. + * + * There is code, for example in ucnv_MBCSGetUnicodeSetForUnicode(), which + * assumes that the initial stage 2/3 blocks are the all-unassigned ones. + * Therefore, we refine the data structure while maintaining this placement + * even though it would be convenient to allocate the ASCII block at the + * beginning of stage 3, for example. + * + * UTF-8-friendly fromUnicode tries work from sorted tables and are built + * pre-compacted, overlapping adjacent stage 2/3 blocks. + * This is necessary because the block allocation and compaction changes + * at SBCS_UTF8_MAX or MBCS_UTF8_MAX, and for MBCS tables the additional + * stage table uses direct indexes into stage 3, without a multiplier and + * thus with a smaller reach. + * + * Non-UTF-8-friendly fromUnicode tries work from unsorted tables + * (because implicit precision is used), and are compacted + * in post-processing. + * + * Preallocation for UTF-8-friendly fromUnicode tries: + * + * Stage 3: + * 64-entry all-unassigned first block followed by ASCII (128 entries). + * + * Stage 2: + * 64-entry all-unassigned first block followed by preallocated + * 64-block for ASCII. + */ + + /* Preallocate ASCII as a linear 128-entry stage 3 block. 
*/ + stage2NullLength=MBCS_STAGE_2_BLOCK_SIZE; + stage2AllocLength=MBCS_STAGE_2_BLOCK_SIZE; + + stage3NullLength=MBCS_UTF8_STAGE_3_BLOCK_SIZE; + stage3AllocLength=128; /* ASCII U+0000..U+007f */ + + /* Initialize stage 1 for the preallocated blocks. */ + sum=stage2NullLength; + for(i=0; i<(stage2AllocLength>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT); ++i) { + mbcsData->stage1[i]=sum; + sum+=MBCS_STAGE_2_BLOCK_SIZE; + } + mbcsData->stage2Top=stage2NullLength+stage2AllocLength; /* ==sum */ + + /* + * Stage 2 indexes count 16-blocks in stage 3 as follows: + * SBCS: directly, indexes increment by 16 + * MBCS: indexes need to be multiplied by 16*maxCharLength, indexes increment by 1 + * MBCS UTF-8: directly, indexes increment by 16 + */ + if(maxCharLength==1) { + sum=stage3NullLength; + for(i=0; i<(stage3AllocLength/MBCS_STAGE_3_BLOCK_SIZE); ++i) { + mbcsData->stage2Single[mbcsData->stage1[0]+i]=sum; + sum+=MBCS_STAGE_3_BLOCK_SIZE; + } + } else { + sum=stage3NullLength/MBCS_STAGE_3_GRANULARITY; + for(i=0; i<(stage3AllocLength/MBCS_STAGE_3_BLOCK_SIZE); ++i) { + mbcsData->stage2[mbcsData->stage1[0]+i]=sum; + sum+=MBCS_STAGE_3_BLOCK_SIZE/MBCS_STAGE_3_GRANULARITY; + } + } + + sum=stage3NullLength; + for(i=0; i<(stage3AllocLength/MBCS_UTF8_STAGE_3_BLOCK_SIZE); ++i) { + mbcsData->stageUTF8[i]=sum; + sum+=MBCS_UTF8_STAGE_3_BLOCK_SIZE; + } + + /* + * Allocate a 64-entry all-unassigned first stage 3 block, + * for UTF-8-friendly lookup with a trail byte, + * plus 128 entries for ASCII. 
+ */ + mbcsData->stage3Top=(stage3NullLength+stage3AllocLength)*maxCharLength; /* ==sum*maxCharLength */ + + return true; +} + +/* return true for success */ +static UBool +setFallback(MBCSData *mbcsData, uint32_t offset, UChar32 c) { + int32_t i=ucm_findFallback(mbcsData->toUFallbacks, mbcsData->countToUFallbacks, offset); + if(i>=0) { + /* if there is already a fallback for this offset, then overwrite it */ + mbcsData->toUFallbacks[i].codePoint=c; + return true; + } else { + /* if there is no fallback for this offset, then add one */ + i=mbcsData->countToUFallbacks; + if(i>=MBCS_MAX_FALLBACK_COUNT) { + fprintf(stderr, "error: too many toUnicode fallbacks, currently at: U+%x\n", (int)c); + return false; + } else { + mbcsData->toUFallbacks[i].offset=offset; + mbcsData->toUFallbacks[i].codePoint=c; + mbcsData->countToUFallbacks=i+1; + return true; + } + } +} + +/* remove fallback if there is one with this offset; return the code point if there was such a fallback, otherwise -1 */ +static int32_t +removeFallback(MBCSData *mbcsData, uint32_t offset) { + int32_t i=ucm_findFallback(mbcsData->toUFallbacks, mbcsData->countToUFallbacks, offset); + if(i>=0) { + _MBCSToUFallback *toUFallbacks; + int32_t limit, old; + + toUFallbacks=mbcsData->toUFallbacks; + limit=mbcsData->countToUFallbacks; + old=(int32_t)toUFallbacks[i].codePoint; + + /* copy the last fallback entry here to keep the list contiguous */ + toUFallbacks[i].offset=toUFallbacks[limit-1].offset; + toUFallbacks[i].codePoint=toUFallbacks[limit-1].codePoint; + mbcsData->countToUFallbacks=limit-1; + return old; + } else { + return -1; + } +} + +/* + * isFallback is almost a boolean: + * 1 (true) this is a fallback mapping + * 0 (false) this is a precise mapping + * -1 the precision of this mapping is not specified + */ +static UBool +MBCSAddToUnicode(MBCSData *mbcsData, + const uint8_t *bytes, int32_t length, + UChar32 c, + int8_t flag) { + char buffer[10]; + uint32_t offset=0; + int32_t i=0, entry, old; + uint8_t 
state=0; + + if(mbcsData->ucm->states.countStates==0) { + fprintf(stderr, "error: there is no state information!\n"); + return false; + } + + /* for SI/SO (like EBCDIC-stateful), double-byte sequences start in state 1 */ + if(length==2 && mbcsData->ucm->states.outputType==MBCS_OUTPUT_2_SISO) { + state=1; + } + + /* + * Walk down the state table like in conversion, + * much like getNextUChar(). + * We assume that c<=0x10ffff. + */ + for(i=0;;) { + entry=mbcsData->ucm->states.stateTable[state][bytes[i++]]; + if(MBCS_ENTRY_IS_TRANSITION(entry)) { + if(i==length) { + fprintf(stderr, "error: byte sequence too short, ends in non-final state %hu: 0x%s (U+%x)\n", + (short)state, printBytes(buffer, bytes, length), (int)c); + return false; + } + state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry); + offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry); + } else { + if(i<length) { + fprintf(stderr, "error: byte sequence too long by %d bytes, final state %u: 0x%s (U+%x)\n", + (int)(length-i), state, printBytes(buffer, bytes, length), (int)c); + return false; + } + switch(MBCS_ENTRY_FINAL_ACTION(entry)) { + case MBCS_STATE_ILLEGAL: + fprintf(stderr, "error: byte sequence ends in illegal state at U+%04x<->0x%s\n", + (int)c, printBytes(buffer, bytes, length)); + return false; + case MBCS_STATE_CHANGE_ONLY: + fprintf(stderr, "error: byte sequence ends in state-change-only at U+%04x<->0x%s\n", + (int)c, printBytes(buffer, bytes, length)); + return false; + case MBCS_STATE_UNASSIGNED: + fprintf(stderr, "error: byte sequence ends in unassigned state at U+%04x<->0x%s\n", + (int)c, printBytes(buffer, bytes, length)); + return false; + case MBCS_STATE_FALLBACK_DIRECT_16: + case MBCS_STATE_VALID_DIRECT_16: + case MBCS_STATE_FALLBACK_DIRECT_20: + case MBCS_STATE_VALID_DIRECT_20: + if(MBCS_ENTRY_SET_STATE(entry, 0)!=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, 0xfffe)) { + /* the "direct" action's value is not "valid-direct-16-unassigned" any more */ + 
if(MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_DIRECT_16 || MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_FALLBACK_DIRECT_16) { + old=MBCS_ENTRY_FINAL_VALUE(entry); + } else { + old=0x10000+MBCS_ENTRY_FINAL_VALUE(entry); + } + if(flag>=0) { + fprintf(stderr, "error: duplicate codepage byte sequence at U+%04x<->0x%s see U+%04x\n", + (int)c, printBytes(buffer, bytes, length), (int)old); + return false; + } else if(VERBOSE) { + fprintf(stderr, "duplicate codepage byte sequence at U+%04x<->0x%s see U+%04x\n", + (int)c, printBytes(buffer, bytes, length), (int)old); + } + /* + * Continue after the above warning + * if the precision of the mapping is unspecified. + */ + } + /* reassign the correct action code */ + entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, (MBCS_STATE_VALID_DIRECT_16+(flag==3 ? 2 : 0)+(c>=0x10000 ? 1 : 0))); + + /* put the code point into bits 22..7 for BMP, c-0x10000 into 26..7 for others */ + if(c<=0xffff) { + entry=MBCS_ENTRY_FINAL_SET_VALUE(entry, c); + } else { + entry=MBCS_ENTRY_FINAL_SET_VALUE(entry, c-0x10000); + } + mbcsData->ucm->states.stateTable[state][bytes[i-1]]=entry; + break; + case MBCS_STATE_VALID_16: + /* bits 26..16 are not used, 0 */ + /* bits 15..7 contain the final offset delta to one 16-bit code unit */ + offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); + /* check that this byte sequence is still unassigned */ + if((old=mbcsData->unicodeCodeUnits[offset])!=0xfffe || (old=removeFallback(mbcsData, offset))!=-1) { + if(flag>=0) { + fprintf(stderr, "error: duplicate codepage byte sequence at U+%04x<->0x%s see U+%04x\n", + (int)c, printBytes(buffer, bytes, length), (int)old); + return false; + } else if(VERBOSE) { + fprintf(stderr, "duplicate codepage byte sequence at U+%04x<->0x%s see U+%04x\n", + (int)c, printBytes(buffer, bytes, length), (int)old); + } + } + if(c>=0x10000) { + fprintf(stderr, "error: code point does not fit into valid-16-bit state at U+%04x<->0x%s\n", + (int)c, printBytes(buffer, bytes, length)); + return false; + } + 
if(flag>0) { + /* assign only if there is no precise mapping */ + if(mbcsData->unicodeCodeUnits[offset]==0xfffe) { + return setFallback(mbcsData, offset, c); + } + } else { + mbcsData->unicodeCodeUnits[offset]=(uint16_t)c; + } + break; + case MBCS_STATE_VALID_16_PAIR: + /* bits 26..16 are not used, 0 */ + /* bits 15..7 contain the final offset delta to two 16-bit code units */ + offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); + /* check that this byte sequence is still unassigned */ + old=mbcsData->unicodeCodeUnits[offset]; + if(old<0xfffe) { + int32_t real; + if(old<0xd800) { + real=old; + } else if(old<=0xdfff) { + real=0x10000+((old&0x3ff)<<10)+((mbcsData->unicodeCodeUnits[offset+1])&0x3ff); + } else /* old<=0xe001 */ { + real=mbcsData->unicodeCodeUnits[offset+1]; + } + if(flag>=0) { + fprintf(stderr, "error: duplicate codepage byte sequence at U+%04x<->0x%s see U+%04x\n", + (int)c, printBytes(buffer, bytes, length), (int)real); + return false; + } else if(VERBOSE) { + fprintf(stderr, "duplicate codepage byte sequence at U+%04x<->0x%s see U+%04x\n", + (int)c, printBytes(buffer, bytes, length), (int)real); + } + } + if(flag>0) { + /* assign only if there is no precise mapping */ + if(old<=0xdbff || old==0xe000) { + /* do nothing */ + } else if(c<=0xffff) { + /* set a BMP fallback code point as a pair with 0xe001 */ + mbcsData->unicodeCodeUnits[offset++]=0xe001; + mbcsData->unicodeCodeUnits[offset]=(uint16_t)c; + } else { + /* set a fallback surrogate pair with two second surrogates */ + mbcsData->unicodeCodeUnits[offset++]=(uint16_t)(0xdbc0+(c>>10)); + mbcsData->unicodeCodeUnits[offset]=(uint16_t)(0xdc00+(c&0x3ff)); + } + } else { + if(c<0xd800) { + /* set a BMP code point */ + mbcsData->unicodeCodeUnits[offset]=(uint16_t)c; + } else if(c<=0xffff) { + /* set a BMP code point above 0xd800 as a pair with 0xe000 */ + mbcsData->unicodeCodeUnits[offset++]=0xe000; + mbcsData->unicodeCodeUnits[offset]=(uint16_t)c; + } else { + /* set a surrogate pair */ + 
mbcsData->unicodeCodeUnits[offset++]=(uint16_t)(0xd7c0+(c>>10)); + mbcsData->unicodeCodeUnits[offset]=(uint16_t)(0xdc00+(c&0x3ff)); + } + } + break; + default: + /* reserved, must never occur */ + fprintf(stderr, "internal error: byte sequence reached reserved action code, entry 0x%02x: 0x%s (U+%x)\n", + (int)entry, printBytes(buffer, bytes, length), (int)c); + return false; + } + + return true; + } + } +} + +U_CDECL_BEGIN +/* is this byte sequence valid? (this is almost the same as MBCSAddToUnicode()) */ +static UBool +MBCSIsValid(NewConverter *cnvData, + const uint8_t *bytes, int32_t length) { + MBCSData *mbcsData=(MBCSData *)cnvData; + + return (UBool)(1==ucm_countChars(&mbcsData->ucm->states, bytes, length)); +} +U_CDECL_END +static UBool +MBCSSingleAddFromUnicode(MBCSData *mbcsData, + const uint8_t *bytes, int32_t /*length*/, + UChar32 c, + int8_t flag) { + uint16_t *stage3, *p; + uint32_t idx; + uint16_t old; + uint8_t b; + + uint32_t blockSize, newTop, i, nextOffset, newBlock, min; + + /* ignore |2 SUB mappings */ + if(flag==2) { + return true; + } + + /* + * Walk down the triple-stage compact array ("trie") and + * allocate parts as necessary. + * Note that the first stage 2 and 3 blocks are reserved for all-unassigned mappings. + * We assume that length<=maxCharLength and that c<=0x10ffff. 
+ */ + stage3=(uint16_t *)mbcsData->fromUBytes; + b=*bytes; + + /* inspect stage 1 */ + idx=c>>MBCS_STAGE_1_SHIFT; + if(mbcsData->utf8Friendly && c<=SBCS_UTF8_MAX) { + nextOffset=(c>>MBCS_STAGE_2_SHIFT)&MBCS_STAGE_2_BLOCK_MASK&~(MBCS_UTF8_STAGE_3_BLOCKS-1); + } else { + nextOffset=(c>>MBCS_STAGE_2_SHIFT)&MBCS_STAGE_2_BLOCK_MASK; + } + if(mbcsData->stage1[idx]==MBCS_STAGE_2_ALL_UNASSIGNED_INDEX) { + /* allocate another block in stage 2 */ + newBlock=mbcsData->stage2Top; + if(mbcsData->utf8Friendly) { + min=newBlock-nextOffset; /* minimum block start with overlap */ + while(min<newBlock && mbcsData->stage2Single[newBlock-1]==0) { + --newBlock; + } + } + newTop=newBlock+MBCS_STAGE_2_BLOCK_SIZE; + + if(newTop>MBCS_MAX_STAGE_2_TOP) { + fprintf(stderr, "error: too many stage 2 entries at U+%04x<->0x%02x\n", (int)c, b); + return false; + } + + /* + * each stage 2 block contains 64 16-bit words: + * 6 code point bits 9..4 with 1 stage 3 index + */ + mbcsData->stage1[idx]=(uint16_t)newBlock; + mbcsData->stage2Top=newTop; + } + + /* inspect stage 2 */ + idx=mbcsData->stage1[idx]+nextOffset; + if(mbcsData->utf8Friendly && c<=SBCS_UTF8_MAX) { + /* allocate 64-entry blocks for UTF-8-friendly lookup */ + blockSize=MBCS_UTF8_STAGE_3_BLOCK_SIZE; + nextOffset=c&MBCS_UTF8_STAGE_3_BLOCK_MASK; + } else { + blockSize=MBCS_STAGE_3_BLOCK_SIZE; + nextOffset=c&MBCS_STAGE_3_BLOCK_MASK; + } + if(mbcsData->stage2Single[idx]==0) { + /* allocate another block in stage 3 */ + newBlock=mbcsData->stage3Top; + if(mbcsData->utf8Friendly) { + min=newBlock-nextOffset; /* minimum block start with overlap */ + while(min<newBlock && stage3[newBlock-1]==0) { + --newBlock; + } + } + newTop=newBlock+blockSize; + + if(newTop>MBCS_STAGE_3_SBCS_SIZE) { + fprintf(stderr, "error: too many code points at U+%04x<->0x%02x\n", (int)c, b); + return false; + } + /* each block has 16 uint16_t entries */ + i=idx; + while(newBlock<newTop) { + mbcsData->stage2Single[i++]=(uint16_t)newBlock; + 
newBlock+=MBCS_STAGE_3_BLOCK_SIZE; + } + mbcsData->stage3Top=newTop; /* ==newBlock */ + } + + /* write the codepage entry into stage 3 and get the previous entry */ + p=stage3+mbcsData->stage2Single[idx]+nextOffset; + old=*p; + if(flag<=0) { + *p=(uint16_t)(0xf00|b); + } else if(IS_PRIVATE_USE(c)) { + *p=(uint16_t)(0xc00|b); + } else { + *p=(uint16_t)(0x800|b); + } + + /* check that this Unicode code point was still unassigned */ + if(old>=0x100) { + if(flag>=0) { + fprintf(stderr, "error: duplicate Unicode code point at U+%04x<->0x%02x see 0x%02x\n", + (int)c, b, old&0xff); + return false; + } else if(VERBOSE) { + fprintf(stderr, "duplicate Unicode code point at U+%04x<->0x%02x see 0x%02x\n", + (int)c, b, old&0xff); + } + /* continue after the above warning if the precision of the mapping is unspecified */ + } + + return true; +} + +static UBool +MBCSAddFromUnicode(MBCSData *mbcsData, + const uint8_t *bytes, int32_t length, + UChar32 c, + int8_t flag) { + char buffer[10]; + const uint8_t *pb; + uint8_t *stage3, *p; + uint32_t idx, b, old, stage3Index; + int32_t maxCharLength; + + uint32_t blockSize, newTop, i, nextOffset, newBlock, min, overlap, maxOverlap; + + maxCharLength=mbcsData->ucm->states.maxCharLength; + + if( mbcsData->ucm->states.outputType==MBCS_OUTPUT_2_SISO && + (!IGNORE_SISO_CHECK && (*bytes==0xe || *bytes==0xf)) + ) { + fprintf(stderr, "error: illegal mapping to SI or SO for SI/SO codepage: U+%04x<->0x%s\n", + (int)c, printBytes(buffer, bytes, length)); + return false; + } + + if(flag==1 && length==1 && *bytes==0) { + fprintf(stderr, "error: unable to encode a |1 fallback from U+%04x to 0x%02x\n", + (int)c, *bytes); + return false; + } + + /* + * Walk down the triple-stage compact array ("trie") and + * allocate parts as necessary. + * Note that the first stage 2 and 3 blocks are reserved for + * all-unassigned mappings. + * We assume that length<=maxCharLength and that c<=0x10ffff. 
+ */ + stage3=mbcsData->fromUBytes; + + /* inspect stage 1 */ + idx=c>>MBCS_STAGE_1_SHIFT; + if(mbcsData->utf8Friendly && c<=mbcsData->utf8Max) { + nextOffset=(c>>MBCS_STAGE_2_SHIFT)&MBCS_STAGE_2_BLOCK_MASK&~(MBCS_UTF8_STAGE_3_BLOCKS-1); + } else { + nextOffset=(c>>MBCS_STAGE_2_SHIFT)&MBCS_STAGE_2_BLOCK_MASK; + } + if(mbcsData->stage1[idx]==MBCS_STAGE_2_ALL_UNASSIGNED_INDEX) { + /* allocate another block in stage 2 */ + newBlock=mbcsData->stage2Top; + if(mbcsData->utf8Friendly) { + min=newBlock-nextOffset; /* minimum block start with overlap */ + while(min<newBlock && mbcsData->stage2[newBlock-1]==0) { + --newBlock; + } + } + newTop=newBlock+MBCS_STAGE_2_BLOCK_SIZE; + + if(newTop>MBCS_MAX_STAGE_2_TOP) { + fprintf(stderr, "error: too many stage 2 entries at U+%04x<->0x%s\n", + (int)c, printBytes(buffer, bytes, length)); + return false; + } + + /* + * each stage 2 block contains 64 32-bit words: + * 6 code point bits 9..4 with value with bits 31..16 "assigned" flags and bits 15..0 stage 3 index + */ + i=idx; + while(newBlock<newTop) { + mbcsData->stage1[i++]=(uint16_t)newBlock; + newBlock+=MBCS_STAGE_2_BLOCK_SIZE; + } + mbcsData->stage2Top=newTop; /* ==newBlock */ + } + + /* inspect stage 2 */ + idx=mbcsData->stage1[idx]+nextOffset; + if(mbcsData->utf8Friendly && c<=mbcsData->utf8Max) { + /* allocate 64-entry blocks for UTF-8-friendly lookup */ + blockSize=MBCS_UTF8_STAGE_3_BLOCK_SIZE*maxCharLength; + nextOffset=c&MBCS_UTF8_STAGE_3_BLOCK_MASK; + } else { + blockSize=MBCS_STAGE_3_BLOCK_SIZE*maxCharLength; + nextOffset=c&MBCS_STAGE_3_BLOCK_MASK; + } + if(mbcsData->stage2[idx]==0) { + /* allocate another block in stage 3 */ + newBlock=mbcsData->stage3Top; + if(mbcsData->utf8Friendly && nextOffset>=MBCS_STAGE_3_GRANULARITY) { + /* + * Overlap stage 3 blocks only in multiples of 16-entry blocks + * because of the indexing granularity in stage 2. 
+ */ + maxOverlap=(nextOffset&~(MBCS_STAGE_3_GRANULARITY-1))*maxCharLength; + for(overlap=0; + overlap<maxOverlap && stage3[newBlock-overlap-1]==0; + ++overlap) {} + + overlap=(overlap/MBCS_STAGE_3_GRANULARITY)/maxCharLength; + overlap=(overlap*MBCS_STAGE_3_GRANULARITY)*maxCharLength; + + newBlock-=overlap; + } + newTop=newBlock+blockSize; + + if(newTop>MBCS_STAGE_3_MBCS_SIZE*(uint32_t)maxCharLength) { + fprintf(stderr, "error: too many code points at U+%04x<->0x%s\n", + (int)c, printBytes(buffer, bytes, length)); + return false; + } + /* each block has 16*maxCharLength bytes */ + i=idx; + while(newBlock<newTop) { + mbcsData->stage2[i++]=(newBlock/MBCS_STAGE_3_GRANULARITY)/maxCharLength; + newBlock+=MBCS_STAGE_3_BLOCK_SIZE*maxCharLength; + } + mbcsData->stage3Top=newTop; /* ==newBlock */ + } + + stage3Index=MBCS_STAGE_3_GRANULARITY*(uint32_t)(uint16_t)mbcsData->stage2[idx]; + + /* Build an alternate, UTF-8-friendly stage table as well. */ + if(mbcsData->utf8Friendly && c<=mbcsData->utf8Max) { + /* Overflow for uint16_t entries in stageUTF8? */ + if(stage3Index>0xffff) { + /* + * This can occur only if the mapping table is nearly perfectly filled and if + * utf8Max==0xffff. + * (There is no known charset like this. GB 18030 does not map + * surrogate code points and LMBCS does not map 256 PUA code points.) + * + * Otherwise, stage3Index<=MBCS_UTF8_LIMIT<0xffff + * (stage3Index can at most reach exactly MBCS_UTF8_LIMIT) + * because we have a sorted table and there are at most MBCS_UTF8_LIMIT + * mappings with 0<=c<MBCS_UTF8_LIMIT, and there is only also + * the initial all-unassigned block in stage3. + * + * Solution for the overflow: Reduce utf8Max to the next lower value, 0xfeff. + * + * (See svn revision 20866 of the markus/ucnvutf8 feature branch for + * code that causes MBCSAddTable() to rebuild the table not utf8Friendly + * in case of overflow. That code was not tested.) 
+ */ + mbcsData->utf8Max=0xfeff; + } else { + /* + * The stage 3 block has been assigned for the regular trie. + * Just copy its index into stageUTF8[], without the granularity. + */ + mbcsData->stageUTF8[c>>MBCS_UTF8_STAGE_SHIFT]=(uint16_t)stage3Index; + } + } + + /* write the codepage bytes into stage 3 and get the previous bytes */ + + /* assemble the bytes into a single integer */ + pb=bytes; + b=0; + switch(length) { + case 4: + b=*pb++; + U_FALLTHROUGH; + case 3: + b=(b<<8)|*pb++; + U_FALLTHROUGH; + case 2: + b=(b<<8)|*pb++; + U_FALLTHROUGH; + case 1: + default: + b=(b<<8)|*pb++; + break; + } + + old=0; + p=stage3+(stage3Index+nextOffset)*maxCharLength; + switch(maxCharLength) { + case 2: + old=*(uint16_t *)p; + *(uint16_t *)p=(uint16_t)b; + break; + case 3: + old=(uint32_t)*p<<16; + *p++=(uint8_t)(b>>16); + old|=(uint32_t)*p<<8; + *p++=(uint8_t)(b>>8); + old|=*p; + *p=(uint8_t)b; + break; + case 4: + old=*(uint32_t *)p; + *(uint32_t *)p=b; + break; + default: + /* will never occur */ + break; + } + + /* check that this Unicode code point was still unassigned */ + if((mbcsData->stage2[idx+(nextOffset>>MBCS_STAGE_2_SHIFT)]&(1UL<<(16+(c&0xf))))!=0 || old!=0) { + if(flag>=0) { + fprintf(stderr, "error: duplicate Unicode code point at U+%04x<->0x%s see 0x%02x\n", + (int)c, printBytes(buffer, bytes, length), (int)old); + return false; + } else if(VERBOSE) { + fprintf(stderr, "duplicate Unicode code point at U+%04x<->0x%s see 0x%02x\n", + (int)c, printBytes(buffer, bytes, length), (int)old); + } + /* continue after the above warning if the precision of the mapping is + unspecified */ + } + if(flag<=0) { + /* set the roundtrip flag */ + mbcsData->stage2[idx+(nextOffset>>4)]|=(1UL<<(16+(c&0xf))); + } + + return true; +} + +U_CFUNC UBool +MBCSOkForBaseFromUnicode(const MBCSData *mbcsData, + const uint8_t *bytes, int32_t length, + UChar32 c, int8_t flag) { + /* + * A 1:1 mapping does not fit into the MBCS base table's fromUnicode table under + * the following 
conditions: + * + * - a |2 SUB mapping for <subchar1> (no base table data structure for them) + * - a |1 fallback to 0x00 (result value 0, indistinguishable from unmappable entry) + * - a multi-byte mapping with leading 0x00 bytes (no explicit length field) + * + * Some of these tests are redundant with ucm_mappingType(). + */ + if( (flag==2 && length==1) || + (flag==1 && bytes[0]==0) || /* testing length==1 would be redundant with the next test */ + (flag<=1 && length>1 && bytes[0]==0) + ) { + return false; + } + + /* + * Additional restrictions for UTF-8-friendly fromUnicode tables, + * for code points up to the maximum optimized one: + * + * - any mapping to 0x00 (result value 0, indistinguishable from unmappable entry) + * - any |1 fallback (no roundtrip flags in the optimized table) + */ + if(mbcsData->utf8Friendly && flag<=1 && c<=mbcsData->utf8Max && (bytes[0]==0 || flag==1)) { + return false; + } + + /* + * If we omit the fromUnicode data, we can only store roundtrips there + * because only they are recoverable from the toUnicode data. + * Fallbacks must go into the extension table. + */ + if(mbcsData->omitFromU && flag!=0) { + return false; + } + + /* All other mappings do fit into the base table. 
*/ + return true; +} + +U_CDECL_BEGIN +/* we can assume that the table only contains 1:1 mappings with <=4 bytes each */ +static UBool +MBCSAddTable(NewConverter *cnvData, UCMTable *table, UConverterStaticData *staticData) { + MBCSData *mbcsData; + UCMapping *m; + UChar32 c; + int32_t i, maxCharLength; + int8_t f; + UBool isOK, utf8Friendly; + + staticData->unicodeMask=table->unicodeMask; + if(staticData->unicodeMask==3) { + fprintf(stderr, "error: contains mappings for both supplementary and surrogate code points\n"); + return false; + } + + staticData->conversionType=UCNV_MBCS; + + mbcsData=(MBCSData *)cnvData; + maxCharLength=mbcsData->ucm->states.maxCharLength; + + /* + * Generation of UTF-8-friendly data requires + * a sorted table, which makeconv generates when explicit precision + * indicators are used. + */ + mbcsData->utf8Friendly=utf8Friendly=(UBool)((table->flagsType&UCM_FLAGS_EXPLICIT)!=0); + if(utf8Friendly) { + mbcsData->utf8Max=MBCS_UTF8_MAX; + if(SMALL && maxCharLength>1) { + mbcsData->omitFromU=true; + } + } else { + mbcsData->utf8Max=0; + if(SMALL && maxCharLength>1) { + fprintf(stderr, + "makeconv warning: --small not available for .ucm files without |0 etc.\n"); + } + } + + if(!MBCSStartMappings(mbcsData)) { + return false; + } + + staticData->hasFromUnicodeFallback=false; + staticData->hasToUnicodeFallback=false; + + isOK=true; + + m=table->mappings; + for(i=0; i<table->mappingsLength; ++m, ++i) { + c=m->u; + f=m->f; + + /* + * Small optimization for --small .cnv files: + * + * If there are fromUnicode mappings above MBCS_UTF8_MAX, + * then the file size will be smaller if we make utf8Max larger + * because the size increase in stageUTF8 will be more than balanced by + * how much less of stage2 needs to be stored. + * + * There is no point in doing this incrementally because stageUTF8 + * uses so much less space per block than stage2, + * so we immediately increase utf8Max to 0xffff. 
+ * + * Do not increase utf8Max if it is already at 0xfeff because MBCSAddFromUnicode() + * sets it to that value when stageUTF8 overflows. + */ + if( mbcsData->omitFromU && f<=1 && + mbcsData->utf8Max<c && c<=0xffff && + mbcsData->utf8Max<0xfeff + ) { + mbcsData->utf8Max=0xffff; + } + + switch(f) { + case -1: + /* there was no precision/fallback indicator */ + /* fall through to set the mappings */ + U_FALLTHROUGH; + case 0: + /* set roundtrip mappings */ + isOK&=MBCSAddToUnicode(mbcsData, m->b.bytes, m->bLen, c, f); + + if(maxCharLength==1) { + isOK&=MBCSSingleAddFromUnicode(mbcsData, m->b.bytes, m->bLen, c, f); + } else if(MBCSOkForBaseFromUnicode(mbcsData, m->b.bytes, m->bLen, c, f)) { + isOK&=MBCSAddFromUnicode(mbcsData, m->b.bytes, m->bLen, c, f); + } else { + m->f|=MBCS_FROM_U_EXT_FLAG; + m->moveFlag=UCM_MOVE_TO_EXT; + } + break; + case 1: + /* set only a fallback mapping from Unicode to codepage */ + if(maxCharLength==1) { + staticData->hasFromUnicodeFallback=true; + isOK&=MBCSSingleAddFromUnicode(mbcsData, m->b.bytes, m->bLen, c, f); + } else if(MBCSOkForBaseFromUnicode(mbcsData, m->b.bytes, m->bLen, c, f)) { + staticData->hasFromUnicodeFallback=true; + isOK&=MBCSAddFromUnicode(mbcsData, m->b.bytes, m->bLen, c, f); + } else { + m->f|=MBCS_FROM_U_EXT_FLAG; + m->moveFlag=UCM_MOVE_TO_EXT; + } + break; + case 2: + /* ignore |2 SUB mappings, except to move <subchar1> mappings to the extension table */ + if(maxCharLength>1 && m->bLen==1) { + m->f|=MBCS_FROM_U_EXT_FLAG; + m->moveFlag=UCM_MOVE_TO_EXT; + } + break; + case 3: + /* set only a fallback mapping from codepage to Unicode */ + staticData->hasToUnicodeFallback=true; + isOK&=MBCSAddToUnicode(mbcsData, m->b.bytes, m->bLen, c, f); + break; + case 4: + /* move "good one-way" mappings to the extension table */ + m->f|=MBCS_FROM_U_EXT_FLAG; + m->moveFlag=UCM_MOVE_TO_EXT; + break; + default: + /* will not occur because the parser checked it already */ + fprintf(stderr, "error: illegal fallback indicator %d\n", 
f); + return false; + } + } + + MBCSPostprocess(mbcsData, staticData); + + return isOK; +} +U_CDECL_END +static UBool +transformEUC(MBCSData *mbcsData) { + uint8_t *p8; + uint32_t i, value, oldLength, old3Top; + uint8_t b; + + oldLength=mbcsData->ucm->states.maxCharLength; + if(oldLength<3) { + return false; + } + + old3Top=mbcsData->stage3Top; + + /* careful: 2-byte and 4-byte codes are stored in platform endianness! */ + + /* test if all first bytes are in {0, 0x8e, 0x8f} */ + p8=mbcsData->fromUBytes; + +#if !U_IS_BIG_ENDIAN + if(oldLength==4) { + p8+=3; + } +#endif + + for(i=0; i<old3Top; i+=oldLength) { + b=p8[i]; + if(b!=0 && b!=0x8e && b!=0x8f) { + /* some first byte does not fit the EUC pattern, nothing to be done */ + return false; + } + } + /* restore p if it was modified above */ + p8=mbcsData->fromUBytes; + + /* modify outputType and adjust stage3Top */ + mbcsData->ucm->states.outputType=(int8_t)(MBCS_OUTPUT_3_EUC+oldLength-3); + mbcsData->stage3Top=(old3Top*(oldLength-1))/oldLength; + + /* + * EUC-encode all byte sequences; + * see "CJKV Information Processing" (1st ed. 1999) from Ken Lunde, O'Reilly, + * p. 161 in chapter 4 "Encoding Methods" + * + * This also must reverse the byte order if the platform is little-endian! 
+ */ + if(oldLength==3) { + uint16_t *q=(uint16_t *)p8; + for(i=0; i<old3Top; i+=oldLength) { + b=*p8; + if(b==0) { + /* short sequences are stored directly */ + /* code set 0 or 1 */ + (*q++)=(uint16_t)((p8[1]<<8)|p8[2]); + } else if(b==0x8e) { + /* code set 2 */ + (*q++)=(uint16_t)(((p8[1]&0x7f)<<8)|p8[2]); + } else /* b==0x8f */ { + /* code set 3 */ + (*q++)=(uint16_t)((p8[1]<<8)|(p8[2]&0x7f)); + } + p8+=3; + } + } else /* oldLength==4 */ { + uint8_t *q=p8; + uint32_t *p32=(uint32_t *)p8; + for(i=0; i<old3Top; i+=4) { + value=(*p32++); + if(value<=0xffffff) { + /* short sequences are stored directly */ + /* code set 0 or 1 */ + (*q++)=(uint8_t)(value>>16); + (*q++)=(uint8_t)(value>>8); + (*q++)=(uint8_t)value; + } else if(value<=0x8effffff) { + /* code set 2 */ + (*q++)=(uint8_t)((value>>16)&0x7f); + (*q++)=(uint8_t)(value>>8); + (*q++)=(uint8_t)value; + } else /* first byte is 0x8f */ { + /* code set 3 */ + (*q++)=(uint8_t)(value>>16); + (*q++)=(uint8_t)((value>>8)&0x7f); + (*q++)=(uint8_t)value; + } + } + } + + return true; +} + +/* + * Compact stage 2 for SBCS by overlapping adjacent stage 2 blocks as far + * as possible. Overlapping is done on unassigned head and tail + * parts of blocks in steps of MBCS_STAGE_2_MULTIPLIER. + * Stage 1 indexes need to be adjusted accordingly. + * This function is very similar to genprops/store.c/compactStage(). 
+ */ +static void +singleCompactStage2(MBCSData *mbcsData) { + /* this array maps the ordinal number of a stage 2 block to its new stage 1 index */ + uint16_t map[MBCS_STAGE_2_MAX_BLOCKS]; + uint16_t i, start, prevEnd, newStart; + + /* enter the all-unassigned first stage 2 block into the map */ + map[0]=MBCS_STAGE_2_ALL_UNASSIGNED_INDEX; + + /* begin with the first block after the all-unassigned one */ + start=newStart=MBCS_STAGE_2_FIRST_ASSIGNED; + while(start<mbcsData->stage2Top) { + prevEnd=(uint16_t)(newStart-1); + + /* find the size of the overlap */ + for(i=0; i<MBCS_STAGE_2_BLOCK_SIZE && mbcsData->stage2Single[start+i]==0 && mbcsData->stage2Single[prevEnd-i]==0; ++i) {} + + if(i>0) { + map[start>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT]=(uint16_t)(newStart-i); + + /* move the non-overlapping indexes to their new positions */ + start+=i; + for(i=(uint16_t)(MBCS_STAGE_2_BLOCK_SIZE-i); i>0; --i) { + mbcsData->stage2Single[newStart++]=mbcsData->stage2Single[start++]; + } + } else if(newStart<start) { + /* move the indexes to their new positions */ + map[start>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT]=newStart; + for(i=MBCS_STAGE_2_BLOCK_SIZE; i>0; --i) { + mbcsData->stage2Single[newStart++]=mbcsData->stage2Single[start++]; + } + } else /* no overlap && newStart==start */ { + map[start>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT]=start; + start=newStart+=MBCS_STAGE_2_BLOCK_SIZE; + } + } + + /* adjust stage2Top */ + if(VERBOSE && newStart<mbcsData->stage2Top) { + printf("compacting stage 2 from stage2Top=0x%lx to 0x%lx, saving %ld bytes\n", + (unsigned long)mbcsData->stage2Top, (unsigned long)newStart, + (long)(mbcsData->stage2Top-newStart)*2); + } + mbcsData->stage2Top=newStart; + + /* now adjust stage 1 */ + for(i=0; i<MBCS_STAGE_1_SIZE; ++i) { + mbcsData->stage1[i]=map[mbcsData->stage1[i]>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT]; + } +} + +/* Compact stage 3 for SBCS - same algorithm as above. 
*/ +static void +singleCompactStage3(MBCSData *mbcsData) { + uint16_t *stage3=(uint16_t *)mbcsData->fromUBytes; + + /* this array maps the ordinal number of a stage 3 block to its new stage 2 index */ + uint16_t map[0x1000]; + uint16_t i, start, prevEnd, newStart; + + /* enter the all-unassigned first stage 3 block into the map */ + map[0]=0; + + /* begin with the first block after the all-unassigned one */ + start=newStart=16; + while(start<mbcsData->stage3Top) { + prevEnd=(uint16_t)(newStart-1); + + /* find the size of the overlap */ + for(i=0; i<16 && stage3[start+i]==0 && stage3[prevEnd-i]==0; ++i) {} + + if(i>0) { + map[start>>4]=(uint16_t)(newStart-i); + + /* move the non-overlapping indexes to their new positions */ + start+=i; + for(i=(uint16_t)(16-i); i>0; --i) { + stage3[newStart++]=stage3[start++]; + } + } else if(newStart<start) { + /* move the indexes to their new positions */ + map[start>>4]=newStart; + for(i=16; i>0; --i) { + stage3[newStart++]=stage3[start++]; + } + } else /* no overlap && newStart==start */ { + map[start>>4]=start; + start=newStart+=16; + } + } + + /* adjust stage3Top */ + if(VERBOSE && newStart<mbcsData->stage3Top) { + printf("compacting stage 3 from stage3Top=0x%lx to 0x%lx, saving %ld bytes\n", + (unsigned long)mbcsData->stage3Top, (unsigned long)newStart, + (long)(mbcsData->stage3Top-newStart)*2); + } + mbcsData->stage3Top=newStart; + + /* now adjust stage 2 */ + for(i=0; i<mbcsData->stage2Top; ++i) { + mbcsData->stage2Single[i]=map[mbcsData->stage2Single[i]>>4]; + } +} + +/* + * Compact stage 2 by overlapping adjacent stage 2 blocks as far + * as possible. Overlapping is done on unassigned head and tail + * parts of blocks in steps of MBCS_STAGE_2_MULTIPLIER. + * Stage 1 indexes need to be adjusted accordingly. + * This function is very similar to genprops/store.c/compactStage(). 
+ */ +static void +compactStage2(MBCSData *mbcsData) { + /* this array maps the ordinal number of a stage 2 block to its new stage 1 index */ + uint16_t map[MBCS_STAGE_2_MAX_BLOCKS]; + uint16_t i, start, prevEnd, newStart; + + /* enter the all-unassigned first stage 2 block into the map */ + map[0]=MBCS_STAGE_2_ALL_UNASSIGNED_INDEX; + + /* begin with the first block after the all-unassigned one */ + start=newStart=MBCS_STAGE_2_FIRST_ASSIGNED; + while(start<mbcsData->stage2Top) { + prevEnd=(uint16_t)(newStart-1); + + /* find the size of the overlap */ + for(i=0; i<MBCS_STAGE_2_BLOCK_SIZE && mbcsData->stage2[start+i]==0 && mbcsData->stage2[prevEnd-i]==0; ++i) {} + + if(i>0) { + map[start>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT]=(uint16_t)(newStart-i); + + /* move the non-overlapping indexes to their new positions */ + start+=i; + for(i=(uint16_t)(MBCS_STAGE_2_BLOCK_SIZE-i); i>0; --i) { + mbcsData->stage2[newStart++]=mbcsData->stage2[start++]; + } + } else if(newStart<start) { + /* move the indexes to their new positions */ + map[start>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT]=newStart; + for(i=MBCS_STAGE_2_BLOCK_SIZE; i>0; --i) { + mbcsData->stage2[newStart++]=mbcsData->stage2[start++]; + } + } else /* no overlap && newStart==start */ { + map[start>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT]=start; + start=newStart+=MBCS_STAGE_2_BLOCK_SIZE; + } + } + + /* adjust stage2Top */ + if(VERBOSE && newStart<mbcsData->stage2Top) { + printf("compacting stage 2 from stage2Top=0x%lx to 0x%lx, saving %ld bytes\n", + (unsigned long)mbcsData->stage2Top, (unsigned long)newStart, + (long)(mbcsData->stage2Top-newStart)*4); + } + mbcsData->stage2Top=newStart; + + /* now adjust stage 1 */ + for(i=0; i<MBCS_STAGE_1_SIZE; ++i) { + mbcsData->stage1[i]=map[mbcsData->stage1[i]>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT]; + } +} + +static void +MBCSPostprocess(MBCSData *mbcsData, const UConverterStaticData * /*staticData*/) { + UCMStates *states; + int32_t maxCharLength, stage3Width; + + states=&mbcsData->ucm->states; + 
stage3Width=maxCharLength=states->maxCharLength; + + ucm_optimizeStates(states, + &mbcsData->unicodeCodeUnits, + mbcsData->toUFallbacks, mbcsData->countToUFallbacks, + VERBOSE); + + /* try to compact the fromUnicode tables */ + if(transformEUC(mbcsData)) { + --stage3Width; + } + + /* + * UTF-8-friendly tries are built precompacted, to cope with variable + * stage 3 allocation block sizes. + * + * Tables without precision indicators cannot be built that way, + * because if a block was overlapped with a previous one, then a smaller + * code point for the same block would not fit. + * Therefore, such tables are not marked UTF-8-friendly and must be + * compacted after all mappings are entered. + */ + if(!mbcsData->utf8Friendly) { + if(maxCharLength==1) { + singleCompactStage3(mbcsData); + singleCompactStage2(mbcsData); + } else { + compactStage2(mbcsData); + } + } + + if(VERBOSE) { + /*uint32_t c, i1, i2, i2Limit, i3;*/ + + printf("fromUnicode number of uint%s_t in stage 2: 0x%lx=%lu\n", + maxCharLength==1 ? 
"16" : "32", + (unsigned long)mbcsData->stage2Top, + (unsigned long)mbcsData->stage2Top); + printf("fromUnicode number of %d-byte stage 3 mapping entries: 0x%lx=%lu\n", + (int)stage3Width, + (unsigned long)mbcsData->stage3Top/stage3Width, + (unsigned long)mbcsData->stage3Top/stage3Width); +#if 0 + c=0; + for(i1=0; i1<MBCS_STAGE_1_SIZE; ++i1) { + i2=mbcsData->stage1[i1]; + if(i2==0) { + c+=MBCS_STAGE_2_BLOCK_SIZE*MBCS_STAGE_3_BLOCK_SIZE; + continue; + } + for(i2Limit=i2+MBCS_STAGE_2_BLOCK_SIZE; i2<i2Limit; ++i2) { + if(maxCharLength==1) { + i3=mbcsData->stage2Single[i2]; + } else { + i3=(uint16_t)mbcsData->stage2[i2]; + } + if(i3==0) { + c+=MBCS_STAGE_3_BLOCK_SIZE; + continue; + } + printf("U+%04lx i1=0x%02lx i2=0x%04lx i3=0x%04lx\n", + (unsigned long)c, + (unsigned long)i1, + (unsigned long)i2, + (unsigned long)i3); + c+=MBCS_STAGE_3_BLOCK_SIZE; + } + } +#endif + } +} + +U_CDECL_BEGIN +static uint32_t +MBCSWrite(NewConverter *cnvData, const UConverterStaticData *staticData, + UNewDataMemory *pData, int32_t tableType) { + MBCSData *mbcsData=(MBCSData *)cnvData; + uint32_t stage2Start, stage2Length; + uint32_t top, stageUTF8Length=0; + int32_t i, stage1Top; + uint32_t headerLength; + + _MBCSHeader header=UCNV_MBCS_HEADER_INITIALIZER; + + stage2Length=mbcsData->stage2Top; + if(mbcsData->omitFromU) { + /* find how much of stage2 can be omitted */ + int32_t utf8Limit=(int32_t)mbcsData->utf8Max+1; + uint32_t st2=0; /*initialized it to avoid compiler warnings */ + + i=utf8Limit>>MBCS_STAGE_1_SHIFT; + if((utf8Limit&((1<<MBCS_STAGE_1_SHIFT)-1))!=0 && (st2=mbcsData->stage1[i])!=0) { + /* utf8Limit is in the middle of an existing stage 2 block */ + stage2Start=st2+((utf8Limit>>MBCS_STAGE_2_SHIFT)&MBCS_STAGE_2_BLOCK_MASK); + } else { + /* find the last stage2 block with mappings before utf8Limit */ + while(i>0 && (st2=mbcsData->stage1[--i])==0) {} + /* stage2 up to the end of this block corresponds to stageUTF8 */ + stage2Start=st2+MBCS_STAGE_2_BLOCK_SIZE; + } + 
header.options|=MBCS_OPT_NO_FROM_U; + header.fullStage2Length=stage2Length; + stage2Length-=stage2Start; + if(VERBOSE) { + printf("+ omitting %lu out of %lu stage2 entries and %lu fromUBytes\n", + (unsigned long)stage2Start, + (unsigned long)mbcsData->stage2Top, + (unsigned long)mbcsData->stage3Top); + printf("+ total size savings: %lu bytes\n", (unsigned long)stage2Start*4+mbcsData->stage3Top); + } + } else { + stage2Start=0; + } + + if(staticData->unicodeMask&UCNV_HAS_SUPPLEMENTARY) { + stage1Top=MBCS_STAGE_1_SIZE; /* 0x440==1088 */ + } else { + stage1Top=0x40; /* 0x40==64 */ + } + + /* adjust stage 1 entries to include the size of stage 1 in the offsets to stage 2 */ + if(mbcsData->ucm->states.maxCharLength==1) { + for(i=0; i<stage1Top; ++i) { + mbcsData->stage1[i]+=(uint16_t)stage1Top; + } + + /* stage2Top/Length have counted 16-bit results, now we need to count bytes */ + /* also round up to a multiple of 4 bytes */ + stage2Length=(stage2Length*2+1)&~1; + + /* stage3Top has counted 16-bit results, now we need to count bytes */ + mbcsData->stage3Top*=2; + + if(mbcsData->utf8Friendly) { + header.version[2]=(uint8_t)(SBCS_UTF8_MAX>>8); /* store 0x1f for max==0x1fff */ + } + } else { + for(i=0; i<stage1Top; ++i) { + mbcsData->stage1[i]+=(uint16_t)stage1Top/2; /* stage 2 contains 32-bit entries, stage 1 16-bit entries */ + } + + /* stage2Top/Length have counted 32-bit results, now we need to count bytes */ + stage2Length*=4; + /* leave stage2Start counting 32-bit units */ + + if(mbcsData->utf8Friendly) { + stageUTF8Length=(mbcsData->utf8Max+1)>>MBCS_UTF8_STAGE_SHIFT; + header.version[2]=(uint8_t)(mbcsData->utf8Max>>8); /* store 0xd7 for max==0xd7ff */ + } + + /* stage3Top has already counted bytes */ + } + + /* round up stage3Top so that the sizes of all data blocks are multiples of 4 */ + mbcsData->stage3Top=(mbcsData->stage3Top+3)&~3; + + /* fill the header */ + if(header.options&MBCS_OPT_INCOMPATIBLE_MASK) { + header.version[0]=5; + 
if(header.options&MBCS_OPT_NO_FROM_U) { + headerLength=10; /* include fullStage2Length */ + } else { + headerLength=MBCS_HEADER_V5_MIN_LENGTH; /* 9 */ + } + } else { + header.version[0]=4; + headerLength=MBCS_HEADER_V4_LENGTH; /* 8 */ + } + header.version[1]=4; + /* header.version[2] set above for utf8Friendly data */ + + header.options|=(uint32_t)headerLength; + + header.countStates=mbcsData->ucm->states.countStates; + header.countToUFallbacks=mbcsData->countToUFallbacks; + + header.offsetToUCodeUnits= + headerLength*4+ + mbcsData->ucm->states.countStates*1024+ + mbcsData->countToUFallbacks*sizeof(_MBCSToUFallback); + header.offsetFromUTable= + header.offsetToUCodeUnits+ + mbcsData->ucm->states.countToUCodeUnits*2; + header.offsetFromUBytes= + header.offsetFromUTable+ + stage1Top*2+ + stage2Length; + header.fromUBytesLength=mbcsData->stage3Top; + + top=header.offsetFromUBytes+stageUTF8Length*2; + if(!(header.options&MBCS_OPT_NO_FROM_U)) { + top+=header.fromUBytesLength; + } + + header.flags=(uint8_t)(mbcsData->ucm->states.outputType); + + if(tableType&TABLE_EXT) { + if(top>0xffffff) { + fprintf(stderr, "error: offset 0x%lx to extension table exceeds 0xffffff\n", (long)top); + return 0; + } + + header.flags|=top<<8; + } + + /* write the MBCS data */ + udata_writeBlock(pData, &header, headerLength*4); + udata_writeBlock(pData, mbcsData->ucm->states.stateTable, header.countStates*1024); + udata_writeBlock(pData, mbcsData->toUFallbacks, mbcsData->countToUFallbacks*sizeof(_MBCSToUFallback)); + udata_writeBlock(pData, mbcsData->unicodeCodeUnits, mbcsData->ucm->states.countToUCodeUnits*2); + udata_writeBlock(pData, mbcsData->stage1, stage1Top*2); + if(mbcsData->ucm->states.maxCharLength==1) { + udata_writeBlock(pData, mbcsData->stage2Single+stage2Start, stage2Length); + } else { + udata_writeBlock(pData, mbcsData->stage2+stage2Start, stage2Length); + } + if(!(header.options&MBCS_OPT_NO_FROM_U)) { + udata_writeBlock(pData, mbcsData->fromUBytes, mbcsData->stage3Top); + } + 
+ if(stageUTF8Length>0) { + udata_writeBlock(pData, mbcsData->stageUTF8, stageUTF8Length*2); + } + + /* return the number of bytes that should have been written */ + return top; +} +U_CDECL_END diff --git a/intl/icu/source/tools/makeconv/genmbcs.h b/intl/icu/source/tools/makeconv/genmbcs.h new file mode 100644 index 0000000000..9ff1c77633 --- /dev/null +++ b/intl/icu/source/tools/makeconv/genmbcs.h @@ -0,0 +1,126 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2000-2008, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: genmbcs.h +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2000jul10 +* created by: Markus W. Scherer +*/ + +#ifndef __GENMBCS_H__ +#define __GENMBCS_H__ + +#include "makeconv.h" + +enum { + /* + * TODO: Consider using ucnvmbcs.h constants. + * However, not all values need to be exactly the same, for example + * the xxx_UTF8_MAX values may be different. (Especially SBCS_UTF8_MAX + * may be higher in makeconv than in the runtime code because that + * affects only a small number of .cnv files [if any] but all + * runtime UConverterSharedData objects. 
+ */ + MBCS_STAGE_2_SHIFT=4, + MBCS_STAGE_2_BLOCK_SIZE=0x40, /* =64=1<<6 for 6 bits in stage 2 */ + MBCS_STAGE_2_BLOCK_SIZE_SHIFT=6, /* log2(MBCS_STAGE_2_BLOCK_SIZE) */ + MBCS_STAGE_2_BLOCK_MASK=0x3f, /* for after shifting by MBCS_STAGE_2_SHIFT */ + MBCS_STAGE_1_SHIFT=10, + MBCS_STAGE_1_BMP_SIZE=0x40, /* 0x10000>>MBCS_STAGE_1_SHIFT, or 16 for one entry per 1k code points on the BMP */ + MBCS_STAGE_1_SIZE=0x440, /* 0x110000>>MBCS_STAGE_1_SHIFT, or 17*64 for one entry per 1k code points */ + MBCS_STAGE_2_SIZE=0xfbc0, /* 0x10000-MBCS_STAGE_1_SIZE: stages 1 & 2 share a 16-bit-indexed array */ + MBCS_MAX_STAGE_2_TOP=MBCS_STAGE_2_SIZE, + MBCS_STAGE_2_MAX_BLOCKS=MBCS_STAGE_2_SIZE>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT, + + MBCS_STAGE_2_ALL_UNASSIGNED_INDEX=0, /* stage 1 entry for the all-unassigned stage 2 block */ + MBCS_STAGE_2_FIRST_ASSIGNED=MBCS_STAGE_2_BLOCK_SIZE, /* start of the first stage 2 block after the all-unassigned one */ + + MBCS_STAGE_3_BLOCK_SIZE=16, /* =16=1<<4 for 4 bits in stage 3 */ + MBCS_STAGE_3_BLOCK_MASK=0xf, + MBCS_STAGE_3_FIRST_ASSIGNED=MBCS_STAGE_3_BLOCK_SIZE, /* start of the first stage 3 block after the all-unassigned one */ + + MBCS_STAGE_3_GRANULARITY=16, /* =1<<4: MBCS stage 2 indexes are shifted left 4 */ + MBCS_STAGE_3_SBCS_SIZE=0x10000, /* max 64k mappings for SBCS */ + MBCS_STAGE_3_MBCS_SIZE=0x10000*MBCS_STAGE_3_GRANULARITY, /* max mappings for MBCS */ + + /* + * SBCS_UTF8_MAX: Maximum code point with UTF-8-friendly SBCS data structures. + * Possible values are 0x01ff..0xffff, in steps of 0x100. + * + * Unlike for MBCS, this constant only affects the stage 3 block allocation size; + * there is no additional stage 1/2 table stored in the .cnv file. + * The max value should be at least 0x7ff to cover 2-byte UTF-8. + * 0xfff also covers a number other small scripts which have legacy charsets + * (like Thai). 
+ * Higher values up to 0x1fff are harmless and potentially useful because + * that covers small-script blocks which usually have either dense mappings + * or no mappings at all. + * Starting at U+2000, there are mostly symbols and format characters + * with a low density of SBCS mappings, which would result in more wasted + * stage 3 entries with the larger block size. + */ + SBCS_UTF8_MAX=0x1fff, + + /* + * MBCS_UTF8_MAX: Maximum code point with UTF-8-friendly MBCS data structures. + * Possible values are 0x01ff..0xffff, in steps of 0x100. + * + * Note that with 0xffff, MBCSAddFromUnicode() may overflow the additional UTF-8 stage table + * with extreme input data. The function checks for this overflow. + * + * 0xd7ff is chosen for the majority of common characters including Unihan and Hangul. + * At U+d800 there are mostly surrogates, private use codes, compatibility characters, etc. + * Larger values cause slightly larger MBCS .cnv files. + */ + MBCS_UTF8_MAX=0xd7ff, + MBCS_UTF8_LIMIT=MBCS_UTF8_MAX+1, /* =0xd800 */ + + MBCS_UTF8_STAGE_SHIFT=6, + MBCS_UTF8_STAGE_3_BLOCK_SIZE=0x40, /* =64=1<<6 for 6 bits from last trail byte */ + MBCS_UTF8_STAGE_3_BLOCK_MASK=0x3f, + + /* size of the single-stage table for up to U+d7ff (used instead of stage1/2) */ + MBCS_UTF8_STAGE_SIZE=MBCS_UTF8_LIMIT>>MBCS_UTF8_STAGE_SHIFT, /* =0x360 */ + + MBCS_FROM_U_EXT_FLAG=0x10, /* UCMapping.f bit for base table mappings that fit into the base toU table */ + MBCS_FROM_U_EXT_MASK=0x0f, /* but need to go into the extension fromU table */ + + /* =4 number of regular stage 3 blocks for final UTF-8 trail byte */ + MBCS_UTF8_STAGE_3_BLOCKS=MBCS_UTF8_STAGE_3_BLOCK_SIZE/MBCS_STAGE_3_BLOCK_SIZE, + + MBCS_MAX_FALLBACK_COUNT=8192 +}; + +U_CFUNC NewConverter * +MBCSOpen(UCMFile *ucm); + +struct MBCSData; +typedef struct MBCSData MBCSData; + +/* + * Get a dummy MBCSData for use with MBCSOkForBaseFromUnicode() + * for creating an extension-only file. + * Assume maxCharLength>1. 
+ */ +U_CFUNC const MBCSData * +MBCSGetDummy(void); + +/* Test if a 1:1 mapping fits into the MBCS base table's fromUnicode structure. */ +U_CFUNC UBool +MBCSOkForBaseFromUnicode(const MBCSData *mbcsData, + const uint8_t *bytes, int32_t length, + UChar32 c, int8_t flag); + +U_CFUNC NewConverter * +CnvExtOpen(UCMFile *ucm); + +#endif /* __GENMBCS_H__ */ diff --git a/intl/icu/source/tools/makeconv/makeconv.1.in b/intl/icu/source/tools/makeconv/makeconv.1.in new file mode 100644 index 0000000000..e42d5127b5 --- /dev/null +++ b/intl/icu/source/tools/makeconv/makeconv.1.in @@ -0,0 +1,114 @@ +.\" Hey, Emacs! This is -*-nroff-*- you know... +.\" +.\" makeconv.1: manual page for the makeconv utility +.\" +.\" Copyright (C) 2016 and later: Unicode, Inc. and others. +.\" License & terms of use: http://www.unicode.org/copyright.html +.\" Copyright (C) 2000-2002 IBM, Inc. and others. +.\" +.\" Manual page by Yves Arrouye <yves@realnames.com>. +.\" +.TH MAKECONV 1 "16 April 2002" "ICU MANPAGE" "ICU @VERSION@ Manual" +.SH NAME +.B makeconv +\- compile a converter table +.SH SYNOPSIS +.B makeconv +[ +.BR "\-h\fP, \fB\-?\fP, \fB\-\-help" +] +[ +.BR "\-c\fP, \fB\-\-copyright" +] +[ +.BR "\-v\fP, \fB\-\-verbose" +] +[ +.BI "\-d\fP, \fB\-\-destdir" " destination" +] +.IR convertertable " .\|.\|." +.SH DESCRIPTION +.B makeconv +converts the ICU converter table +.I convertertable +into a binary file. The binary file has the same base name as +.I convertertable +but has a +.B .cnv +extension (instead of the typical +.B .ucm +extension of the +.I convertertable +file). +This binary file can then be read directly by ICU, or used by +.BR pkgdata (1) +for incorporation into a larger archive or library. +.PP +The +.I convertertable +must be in the ICU ucm (Unicode Codepage Mapping) format in order to +be understood by +.BR makeconv . +The ICU ucm format is similar to the IBM NLTC upmap/tpmap/rpmap files. +Comments in the +.I convertertable +are handled as follows. 
If a comment (starting with a `#' sign) that +is after some text does contain the fallback indicator `|' then only +the text starting with the `#' sign, and ending before the `|' sign, +is ignored. +Otherwise, or if the comment is the first thing on the line, +the comment runs up to the end of the line. This special +handling of comments is to accommodate the practice of putting fallback +information in comments in the strict IBM NLTC ucmap format. +.PP +Note that new converters will be automatically found by ICU after their +installation in ICU's data directory. They do not need to +be listed in the +.BR convrtrs.txt (5) +converters aliases file in order to be available to applications using ICU. +They do need to be listed there if one wants to give them aliases, or +tags, though. +.SH OPTIONS +.TP +.BR "\-h\fP, \fB\-?\fP, \fB\-\-help" +Print help about usage and exit. +.TP +.BR "\-c\fP, \fB\-\-copyright" +Include a copyright notice in the binary data. +.TP +.BR "\-v\fP, \fB\-\-verbose" +Display extra informative messages during execution. +.TP +.BI "\-d\fP, \fB\-\-destdir" " destination" +Set the destination directory to +.IR destination . +The default destination directory is specified by the environment variable +.BR ICU_DATA . +.SH CAVEATS +If an existing converter table is changed and recompiled using +.BR makeconv , +the resulting binary file must be packaged in the same way that it was +packaged initially. For example, if converters were grouped together in +an archive or a library with +.BR pkgdata (1), +then the archive or library must be rebuilt with the new binary file. +A standalone binary converter file will not take precedence over a +packaged one. +.SH ENVIRONMENT +.TP 10 +.B ICU_DATA +Specifies the directory containing ICU data. Defaults to +.BR @thepkgicudatadir@/@PACKAGE@/@VERSION@/ . +Some tools in ICU depend on the presence of the trailing slash. It is thus +important to make sure that it is present if +.B ICU_DATA +is set. 
+.SH VERSION +@VERSION@ +.SH COPYRIGHT +Copyright (C) 2000 IBM, Inc. and others. +.SH SEE ALSO +.BR convrtrs.txt (5) +.br +.BR pkgdata (1) + diff --git a/intl/icu/source/tools/makeconv/makeconv.cpp b/intl/icu/source/tools/makeconv/makeconv.cpp new file mode 100644 index 0000000000..b14b4316f3 --- /dev/null +++ b/intl/icu/source/tools/makeconv/makeconv.cpp @@ -0,0 +1,862 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* + ******************************************************************************** + * + * Copyright (C) 1998-2015, International Business Machines + * Corporation and others. All Rights Reserved. + * + ******************************************************************************** + * + * + * makeconv.cpp: + * tool creating a binary (compressed) representation of the conversion mapping + * table (IBM NLTC ucmap format). + * + * 05/04/2000 helena Added fallback mapping into the picture... + * 06/29/2000 helena Major rewrite of the callback APIs. 
+ */ + +#include <stdio.h> +#include "unicode/putil.h" +#include "unicode/ucnv_err.h" +#include "charstr.h" +#include "ucnv_bld.h" +#include "ucnv_imp.h" +#include "ucnv_cnv.h" +#include "cstring.h" +#include "cmemory.h" +#include "uinvchar.h" +#include "filestrm.h" +#include "toolutil.h" +#include "uoptions.h" +#include "unicode/udata.h" +#include "unewdata.h" +#include "uparse.h" +#include "ucm.h" +#include "makeconv.h" +#include "genmbcs.h" + +#define DEBUG 0 + +typedef struct ConvData { + UCMFile *ucm; + NewConverter *cnvData, *extData; + UConverterSharedData sharedData; + UConverterStaticData staticData; +} ConvData; + +static void +initConvData(ConvData *data) { + uprv_memset(data, 0, sizeof(ConvData)); + data->sharedData.structSize=sizeof(UConverterSharedData); + data->staticData.structSize=sizeof(UConverterStaticData); + data->sharedData.staticData=&data->staticData; +} + +static void +cleanupConvData(ConvData *data) { + if(data!=nullptr) { + if(data->cnvData!=nullptr) { + data->cnvData->close(data->cnvData); + data->cnvData=nullptr; + } + if(data->extData!=nullptr) { + data->extData->close(data->extData); + data->extData=nullptr; + } + ucm_close(data->ucm); + data->ucm=nullptr; + } +} + +/* + * from ucnvstat.c - static prototypes of data-based converters + */ +U_CAPI const UConverterStaticData * ucnv_converterStaticData[UCNV_NUMBER_OF_SUPPORTED_CONVERTER_TYPES]; + +/* + * Global - verbosity + */ +UBool VERBOSE = false; +UBool QUIET = false; +UBool SMALL = false; +UBool IGNORE_SISO_CHECK = false; + +static void +createConverter(ConvData *data, const char* converterName, UErrorCode *pErrorCode); + +/* + * Set up the UNewData and write the converter.. 
+ */ +static void +writeConverterData(ConvData *data, const char *cnvName, const char *cnvDir, UErrorCode *status); + +UBool haveCopyright=true; + +static UDataInfo dataInfo={ + sizeof(UDataInfo), + 0, + + U_IS_BIG_ENDIAN, + U_CHARSET_FAMILY, + sizeof(char16_t), + 0, + + {0x63, 0x6e, 0x76, 0x74}, /* dataFormat="cnvt" */ + {6, 2, 0, 0}, /* formatVersion */ + {0, 0, 0, 0} /* dataVersion (calculated at runtime) */ +}; + +static void +writeConverterData(ConvData *data, const char *cnvName, const char *cnvDir, UErrorCode *status) +{ + UNewDataMemory *mem = nullptr; + uint32_t sz2; + uint32_t size = 0; + int32_t tableType; + + if(U_FAILURE(*status)) + { + return; + } + + tableType=TABLE_NONE; + if(data->cnvData!=nullptr) { + tableType|=TABLE_BASE; + } + if(data->extData!=nullptr) { + tableType|=TABLE_EXT; + } + + mem = udata_create(cnvDir, "cnv", cnvName, &dataInfo, haveCopyright ? U_COPYRIGHT_STRING : nullptr, status); + + if(U_FAILURE(*status)) + { + fprintf(stderr, "Couldn't create the udata %s.%s: %s\n", + cnvName, + "cnv", + u_errorName(*status)); + return; + } + + if(VERBOSE) + { + printf("- Opened udata %s.%s\n", cnvName, "cnv"); + } + + + /* all read only, clean, platform independent data. Mmmm. 
:) */ + udata_writeBlock(mem, &data->staticData, sizeof(UConverterStaticData)); + size += sizeof(UConverterStaticData); /* Is 4-aligned - by size */ + /* Now, write the table */ + if(tableType&TABLE_BASE) { + size += data->cnvData->write(data->cnvData, &data->staticData, mem, tableType); + } + if(tableType&TABLE_EXT) { + size += data->extData->write(data->extData, &data->staticData, mem, tableType); + } + + sz2 = udata_finish(mem, status); + if(size != sz2) + { + fprintf(stderr, "error: wrote %u bytes to the .cnv file but counted %u bytes\n", (int)sz2, (int)size); + *status=U_INTERNAL_PROGRAM_ERROR; + } + if(VERBOSE) + { + printf("- Wrote %u bytes to the udata.\n", (int)sz2); + } +} + +enum { + OPT_HELP_H, + OPT_HELP_QUESTION_MARK, + OPT_COPYRIGHT, + OPT_VERSION, + OPT_DESTDIR, + OPT_VERBOSE, + OPT_SMALL, + OPT_IGNORE_SISO_CHECK, + OPT_QUIET, + OPT_SOURCEDIR, + + OPT_COUNT +}; + +static UOption options[]={ + UOPTION_HELP_H, + UOPTION_HELP_QUESTION_MARK, + UOPTION_COPYRIGHT, + UOPTION_VERSION, + UOPTION_DESTDIR, + UOPTION_VERBOSE, + { "small", nullptr, nullptr, nullptr, '\1', UOPT_NO_ARG, 0 }, + { "ignore-siso-check", nullptr, nullptr, nullptr, '\1', UOPT_NO_ARG, 0 }, + UOPTION_QUIET, + UOPTION_SOURCEDIR, +}; + +int main(int argc, char* argv[]) +{ + ConvData data; + char cnvName[UCNV_MAX_FULL_FILE_NAME_LENGTH]; + + U_MAIN_INIT_ARGS(argc, argv); + + /* Set up the ICU version number */ + UVersionInfo icuVersion; + u_getVersion(icuVersion); + uprv_memcpy(&dataInfo.dataVersion, &icuVersion, sizeof(UVersionInfo)); + + /* preset then read command line options */ + options[OPT_DESTDIR].value=u_getDataDirectory(); + argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options); + + if(options[OPT_VERSION].doesOccur) { + printf("makeconv version %u.%u, ICU tool to read .ucm codepage mapping files and write .cnv files\n", + dataInfo.formatVersion[0], dataInfo.formatVersion[1]); + printf("%s\n", U_COPYRIGHT_STRING); + exit(0); + } + + /* error handling, printing usage message 
*/ + if(argc<0) { + fprintf(stderr, + "error in command line argument \"%s\"\n", + argv[-argc]); + } else if(argc<2) { + argc=-1; + } + if(argc<0 || options[OPT_HELP_H].doesOccur || options[OPT_HELP_QUESTION_MARK].doesOccur) { + FILE *stdfile=argc<0 ? stderr : stdout; + fprintf(stdfile, + "usage: %s [-options] files...\n" + "\tread .ucm codepage mapping files and write .cnv files\n" + "options:\n" + "\t-h or -? or --help this usage text\n" + "\t-V or --version show a version message\n" + "\t-c or --copyright include a copyright notice\n" + "\t-d or --destdir destination directory, followed by the path\n" + "\t-v or --verbose Turn on verbose output\n" + "\t-q or --quiet do not display warnings and progress\n" + "\t-s or --sourcedir source directory, followed by the path\n", + argv[0]); + fprintf(stdfile, + "\t --small Generate smaller .cnv files. They will be\n" + "\t significantly smaller but may not be compatible with\n" + "\t older versions of ICU and will require heap memory\n" + "\t allocation when loaded.\n" + "\t --ignore-siso-check Use SI/SO other than 0xf/0xe.\n"); + return argc<0 ? 
U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; + } + + /* get the options values */ + haveCopyright = options[OPT_COPYRIGHT].doesOccur; + const char *destdir = options[OPT_DESTDIR].value; + VERBOSE = options[OPT_VERBOSE].doesOccur; + QUIET = options[OPT_QUIET].doesOccur; + SMALL = options[OPT_SMALL].doesOccur; + + if (options[OPT_IGNORE_SISO_CHECK].doesOccur) { + IGNORE_SISO_CHECK = true; + } + + icu::CharString outFileName; + UErrorCode err = U_ZERO_ERROR; + if (destdir != nullptr && *destdir != 0) { + outFileName.append(destdir, err).ensureEndsWithFileSeparator(err); + if (U_FAILURE(err)) { + return err; + } + } + int32_t outBasenameStart = outFileName.length(); + +#if DEBUG + { + int i; + printf("makeconv: processing %d files...\n", argc - 1); + for(i=1; i<argc; ++i) { + printf("%s ", argv[i]); + } + printf("\n"); + fflush(stdout); + } +#endif + + UBool printFilename = (UBool) (argc > 2 || VERBOSE); + icu::CharString pathBuf; + for (++argv; --argc; ++argv) + { + UErrorCode localError = U_ZERO_ERROR; + const char *arg = getLongPathname(*argv); + + const char* sourcedir = options[OPT_SOURCEDIR].value; + if (sourcedir != nullptr && *sourcedir != 0 && uprv_strcmp(sourcedir, ".") != 0) { + pathBuf.clear(); + pathBuf.appendPathPart(sourcedir, localError); + pathBuf.appendPathPart(arg, localError); + arg = pathBuf.data(); + } + + /*produces the right destination path for display*/ + outFileName.truncate(outBasenameStart); + if (outBasenameStart != 0) + { + /* find the last file sepator */ + const char *basename = findBasename(arg); + outFileName.append(basename, localError); + } + else + { + outFileName.append(arg, localError); + } + if (U_FAILURE(localError)) { + return localError; + } + + /*removes the extension if any is found*/ + int32_t lastDotIndex = outFileName.lastIndexOf('.'); + if (lastDotIndex >= outBasenameStart) { + outFileName.truncate(lastDotIndex); + } + + /* the basename without extension is the converter name */ + if ((outFileName.length() - 
outBasenameStart) >= UPRV_LENGTHOF(cnvName)) { + fprintf(stderr, "converter name %s too long\n", outFileName.data() + outBasenameStart); + return U_BUFFER_OVERFLOW_ERROR; + } + uprv_strcpy(cnvName, outFileName.data() + outBasenameStart); + + /*Adds the target extension*/ + outFileName.append(CONVERTER_FILE_EXTENSION, localError); + if (U_FAILURE(localError)) { + return localError; + } + +#if DEBUG + printf("makeconv: processing %s ...\n", arg); + fflush(stdout); +#endif + initConvData(&data); + createConverter(&data, arg, &localError); + + if (U_FAILURE(localError)) + { + /* if an error is found, print out an error msg and keep going */ + fprintf(stderr, "Error creating converter for \"%s\" file for \"%s\" (%s)\n", + outFileName.data(), arg, u_errorName(localError)); + if(U_SUCCESS(err)) { + err = localError; + } + } + else + { + /* Insure the static data name matches the file name */ + /* Changed to ignore directory and only compare base name + LDH 1/2/08*/ + char *p; + p = strrchr(cnvName, U_FILE_SEP_CHAR); /* Find last file separator */ + + if(p == nullptr) /* OK, try alternate */ + { + p = strrchr(cnvName, U_FILE_ALT_SEP_CHAR); + if(p == nullptr) + { + p=cnvName; /* If no separators, no problem */ + } + } + else + { + p++; /* If found separator, don't include it in compare */ + } + if(uprv_stricmp(p,data.staticData.name) && !QUIET) + { + fprintf(stderr, "Warning: %s%s claims to be '%s'\n", + cnvName, CONVERTER_FILE_EXTENSION, + data.staticData.name); + } + + uprv_strcpy((char*)data.staticData.name, cnvName); + + if(!uprv_isInvariantString((char*)data.staticData.name, -1)) { + fprintf(stderr, + "Error: A converter name must contain only invariant characters.\n" + "%s is not a valid converter name.\n", + data.staticData.name); + if(U_SUCCESS(err)) { + err = U_INVALID_TABLE_FORMAT; + } + } + + localError = U_ZERO_ERROR; + writeConverterData(&data, cnvName, destdir, &localError); + + if(U_FAILURE(localError)) + { + /* if an error is found, print out an error msg 
and keep going*/ + fprintf(stderr, "Error writing \"%s\" file for \"%s\" (%s)\n", outFileName.data(), arg, + u_errorName(localError)); + if(U_SUCCESS(err)) { + err = localError; + } + } + else if (printFilename) + { + puts(outFileName.data() + outBasenameStart); + } + } + fflush(stdout); + fflush(stderr); + + cleanupConvData(&data); + } + + return err; +} + +static void +getPlatformAndCCSIDFromName(const char *name, int8_t *pPlatform, int32_t *pCCSID) { + if( (name[0]=='i' || name[0]=='I') && + (name[1]=='b' || name[1]=='B') && + (name[2]=='m' || name[2]=='M') + ) { + name+=3; + if(*name=='-') { + ++name; + } + *pPlatform=UCNV_IBM; + *pCCSID=(int32_t)uprv_strtoul(name, nullptr, 10); + } else { + *pPlatform=UCNV_UNKNOWN; + *pCCSID=0; + } +} + +static void +readHeader(ConvData *data, + FileStream* convFile, + UErrorCode *pErrorCode) { + char line[1024]; + char *s, *key, *value; + const UConverterStaticData *prototype; + UConverterStaticData *staticData; + + if(U_FAILURE(*pErrorCode)) { + return; + } + + staticData=&data->staticData; + staticData->platform=UCNV_IBM; + staticData->subCharLen=0; + + while(T_FileStream_readLine(convFile, line, sizeof(line))) { + /* basic parsing and handling of state-related items */ + if(ucm_parseHeaderLine(data->ucm, line, &key, &value)) { + continue; + } + + /* stop at the beginning of the mapping section */ + if(uprv_strcmp(line, "CHARMAP")==0) { + break; + } + + /* collect the information from the header field, ignore unknown keys */ + if(uprv_strcmp(key, "code_set_name")==0) { + if(*value!=0) { + uprv_strcpy((char *)staticData->name, value); + getPlatformAndCCSIDFromName(value, &staticData->platform, &staticData->codepage); + } + } else if(uprv_strcmp(key, "subchar")==0) { + uint8_t bytes[UCNV_EXT_MAX_BYTES]; + int8_t length; + + s=value; + length=ucm_parseBytes(bytes, line, (const char **)&s); + if(1<=length && length<=4 && *s==0) { + staticData->subCharLen=length; + uprv_memcpy(staticData->subChar, bytes, length); + } else { + 
fprintf(stderr, "error: illegal <subchar> %s\n", value); + *pErrorCode=U_INVALID_TABLE_FORMAT; + return; + } + } else if(uprv_strcmp(key, "subchar1")==0) { + uint8_t bytes[UCNV_EXT_MAX_BYTES]; + + s=value; + if(1==ucm_parseBytes(bytes, line, (const char **)&s) && *s==0) { + staticData->subChar1=bytes[0]; + } else { + fprintf(stderr, "error: illegal <subchar1> %s\n", value); + *pErrorCode=U_INVALID_TABLE_FORMAT; + return; + } + } + } + + /* copy values from the UCMFile to the static data */ + staticData->maxBytesPerChar=(int8_t)data->ucm->states.maxCharLength; + staticData->minBytesPerChar=(int8_t)data->ucm->states.minCharLength; + staticData->conversionType=data->ucm->states.conversionType; + + if(staticData->conversionType==UCNV_UNSUPPORTED_CONVERTER) { + fprintf(stderr, "ucm error: missing conversion type (<uconv_class>)\n"); + *pErrorCode=U_INVALID_TABLE_FORMAT; + return; + } + + /* + * Now that we know the type, copy any 'default' values from the table. + * We need not check the type any further because the parser only + * recognizes what we have prototypes for. + * + * For delta (extension-only) tables, copy values from the base file + * instead, see createConverter(). 
+ */ + if(data->ucm->baseName[0]==0) { + prototype=ucnv_converterStaticData[staticData->conversionType]; + if(prototype!=nullptr) { + if(staticData->name[0]==0) { + uprv_strcpy((char *)staticData->name, prototype->name); + } + + if(staticData->codepage==0) { + staticData->codepage=prototype->codepage; + } + + if(staticData->platform==0) { + staticData->platform=prototype->platform; + } + + if(staticData->minBytesPerChar==0) { + staticData->minBytesPerChar=prototype->minBytesPerChar; + } + + if(staticData->maxBytesPerChar==0) { + staticData->maxBytesPerChar=prototype->maxBytesPerChar; + } + + if(staticData->subCharLen==0) { + staticData->subCharLen=prototype->subCharLen; + if(prototype->subCharLen>0) { + uprv_memcpy(staticData->subChar, prototype->subChar, prototype->subCharLen); + } + } + } + } + + if(data->ucm->states.outputType<0) { + data->ucm->states.outputType=(int8_t)data->ucm->states.maxCharLength-1; + } + + if( staticData->subChar1!=0 && + (staticData->minBytesPerChar>1 || + (staticData->conversionType!=UCNV_MBCS && + staticData->conversionType!=UCNV_EBCDIC_STATEFUL)) + ) { + fprintf(stderr, "error: <subchar1> defined for a type other than MBCS or EBCDIC_STATEFUL\n"); + *pErrorCode=U_INVALID_TABLE_FORMAT; + } +} + +/* return true if a base table was read, false for an extension table */ +static UBool +readFile(ConvData *data, const char* converterName, + UErrorCode *pErrorCode) { + char line[1024]; + char *end; + FileStream *convFile; + + UCMStates *baseStates; + UBool dataIsBase; + + if(U_FAILURE(*pErrorCode)) { + return false; + } + + data->ucm=ucm_open(); + + convFile=T_FileStream_open(converterName, "r"); + if(convFile==nullptr) { + *pErrorCode=U_FILE_ACCESS_ERROR; + return false; + } + + readHeader(data, convFile, pErrorCode); + if(U_FAILURE(*pErrorCode)) { + return false; + } + + if(data->ucm->baseName[0]==0) { + dataIsBase=true; + baseStates=&data->ucm->states; + ucm_processStates(baseStates, IGNORE_SISO_CHECK); + } else { + dataIsBase=false; + 
baseStates=nullptr; + } + + /* read the base table */ + ucm_readTable(data->ucm, convFile, dataIsBase, baseStates, pErrorCode); + if(U_FAILURE(*pErrorCode)) { + return false; + } + + /* read an extension table if there is one */ + while(T_FileStream_readLine(convFile, line, sizeof(line))) { + end=uprv_strchr(line, 0); + while(line<end && + (*(end-1)=='\n' || *(end-1)=='\r' || *(end-1)==' ' || *(end-1)=='\t')) { + --end; + } + *end=0; + + if(line[0]=='#' || u_skipWhitespace(line)==end) { + continue; /* ignore empty and comment lines */ + } + + if(0==uprv_strcmp(line, "CHARMAP")) { + /* read the extension table */ + ucm_readTable(data->ucm, convFile, false, baseStates, pErrorCode); + } else { + fprintf(stderr, "unexpected text after the base mapping table\n"); + } + break; + } + + T_FileStream_close(convFile); + + if(data->ucm->base->flagsType==UCM_FLAGS_MIXED || data->ucm->ext->flagsType==UCM_FLAGS_MIXED) { + fprintf(stderr, "error: some entries have the mapping precision (with '|'), some do not\n"); + *pErrorCode=U_INVALID_TABLE_FORMAT; + } + + return dataIsBase; +} + +static void +createConverter(ConvData *data, const char *converterName, UErrorCode *pErrorCode) { + ConvData baseData; + UBool dataIsBase; + + UConverterStaticData *staticData; + UCMStates *states, *baseStates; + + if(U_FAILURE(*pErrorCode)) { + return; + } + + initConvData(data); + + dataIsBase=readFile(data, converterName, pErrorCode); + if(U_FAILURE(*pErrorCode)) { + return; + } + + staticData=&data->staticData; + states=&data->ucm->states; + + if(dataIsBase) { + /* + * Build a normal .cnv file with a base table + * and an optional extension table. 
+ */ + data->cnvData=MBCSOpen(data->ucm); + if(data->cnvData==nullptr) { + *pErrorCode=U_MEMORY_ALLOCATION_ERROR; + + } else if(!data->cnvData->isValid(data->cnvData, + staticData->subChar, staticData->subCharLen) + ) { + fprintf(stderr, " the substitution character byte sequence is illegal in this codepage structure!\n"); + *pErrorCode=U_INVALID_TABLE_FORMAT; + + } else if(staticData->subChar1!=0 && + !data->cnvData->isValid(data->cnvData, &staticData->subChar1, 1) + ) { + fprintf(stderr, " the subchar1 byte is illegal in this codepage structure!\n"); + *pErrorCode=U_INVALID_TABLE_FORMAT; + + } else if( + data->ucm->ext->mappingsLength>0 && + !ucm_checkBaseExt(states, data->ucm->base, data->ucm->ext, data->ucm->ext, false) + ) { + *pErrorCode=U_INVALID_TABLE_FORMAT; + } else if(data->ucm->base->flagsType&UCM_FLAGS_EXPLICIT) { + /* sort the table so that it can be turned into UTF-8-friendly data */ + ucm_sortTable(data->ucm->base); + } + + if(U_SUCCESS(*pErrorCode)) { + if( + /* add the base table after ucm_checkBaseExt()! */ + !data->cnvData->addTable(data->cnvData, data->ucm->base, &data->staticData) + ) { + *pErrorCode=U_INVALID_TABLE_FORMAT; + } else { + /* + * addTable() may have requested moving more mappings to the extension table + * if they fit into the base toUnicode table but not into the + * base fromUnicode table. + * (Especially for UTF-8-friendly fromUnicode tables.) + * Such mappings will have the MBCS_FROM_U_EXT_FLAG set, which causes them + * to be excluded from the extension toUnicode data. + * See MBCSOkForBaseFromUnicode() for which mappings do not fit into + * the base fromUnicode table. 
+ */ + ucm_moveMappings(data->ucm->base, data->ucm->ext); + ucm_sortTable(data->ucm->ext); + if(data->ucm->ext->mappingsLength>0) { + /* prepare the extension table, if there is one */ + data->extData=CnvExtOpen(data->ucm); + if(data->extData==nullptr) { + *pErrorCode=U_MEMORY_ALLOCATION_ERROR; + } else if( + !data->extData->addTable(data->extData, data->ucm->ext, &data->staticData) + ) { + *pErrorCode=U_INVALID_TABLE_FORMAT; + } + } + } + } + } else { + /* Build an extension-only .cnv file. */ + char baseFilename[500]; + char *basename; + + initConvData(&baseData); + + /* assemble a path/filename for data->ucm->baseName */ + uprv_strcpy(baseFilename, converterName); + basename=(char *)findBasename(baseFilename); + uprv_strcpy(basename, data->ucm->baseName); + uprv_strcat(basename, ".ucm"); + + /* read the base table */ + dataIsBase=readFile(&baseData, baseFilename, pErrorCode); + if(U_FAILURE(*pErrorCode)) { + return; + } else if(!dataIsBase) { + fprintf(stderr, "error: the <icu:base> file \"%s\" is not a base table file\n", baseFilename); + *pErrorCode=U_INVALID_TABLE_FORMAT; + } else { + /* prepare the extension table */ + data->extData=CnvExtOpen(data->ucm); + if(data->extData==nullptr) { + *pErrorCode=U_MEMORY_ALLOCATION_ERROR; + } else { + /* fill in gaps in extension file header fields */ + UCMapping *m, *mLimit; + uint8_t fallbackFlags; + + baseStates=&baseData.ucm->states; + if(states->conversionType==UCNV_DBCS) { + staticData->minBytesPerChar=(int8_t)(states->minCharLength=2); + } else if(states->minCharLength==0) { + staticData->minBytesPerChar=(int8_t)(states->minCharLength=baseStates->minCharLength); + } + if(states->maxCharLength<states->minCharLength) { + staticData->maxBytesPerChar=(int8_t)(states->maxCharLength=baseStates->maxCharLength); + } + + if(staticData->subCharLen==0) { + uprv_memcpy(staticData->subChar, baseData.staticData.subChar, 4); + staticData->subCharLen=baseData.staticData.subCharLen; + } + /* + * do not copy subChar1 - + * only use 
what is explicitly specified + * because it cannot be unset in the extension file header + */ + + /* get the fallback flags */ + fallbackFlags=0; + for(m=baseData.ucm->base->mappings, mLimit=m+baseData.ucm->base->mappingsLength; + m<mLimit && fallbackFlags!=3; + ++m + ) { + if(m->f==1) { + fallbackFlags|=1; + } else if(m->f==3) { + fallbackFlags|=2; + } + } + + if(fallbackFlags&1) { + staticData->hasFromUnicodeFallback=true; + } + if(fallbackFlags&2) { + staticData->hasToUnicodeFallback=true; + } + + if(1!=ucm_countChars(baseStates, staticData->subChar, staticData->subCharLen)) { + fprintf(stderr, " the substitution character byte sequence is illegal in this codepage structure!\n"); + *pErrorCode=U_INVALID_TABLE_FORMAT; + + } else if(staticData->subChar1!=0 && 1!=ucm_countChars(baseStates, &staticData->subChar1, 1)) { + fprintf(stderr, " the subchar1 byte is illegal in this codepage structure!\n"); + *pErrorCode=U_INVALID_TABLE_FORMAT; + + } else if( + !ucm_checkValidity(data->ucm->ext, baseStates) || + !ucm_checkBaseExt(baseStates, baseData.ucm->base, data->ucm->ext, data->ucm->ext, false) + ) { + *pErrorCode=U_INVALID_TABLE_FORMAT; + } else { + if(states->maxCharLength>1) { + /* + * When building a normal .cnv file with a base table + * for an MBCS (not SBCS) table with explicit precision flags, + * the MBCSAddTable() function marks some mappings for moving + * to the extension table. + * They fit into the base toUnicode table but not into the + * base fromUnicode table. + * (Note: We do have explicit precision flags because they are + * required for extension table generation, and + * ucm_checkBaseExt() verified it.) + * + * We do not call MBCSAddTable() here (we probably could) + * so we need to do the analysis before building the extension table. + * We assume that MBCSAddTable() will build a UTF-8-friendly table. + * Redundant mappings in the extension table are ok except they cost some size. + * + * Do this after ucm_checkBaseExt(). 
+ */ + const MBCSData *mbcsData=MBCSGetDummy(); + int32_t needsMove=0; + for(m=baseData.ucm->base->mappings, mLimit=m+baseData.ucm->base->mappingsLength; + m<mLimit; + ++m + ) { + if(!MBCSOkForBaseFromUnicode(mbcsData, m->b.bytes, m->bLen, m->u, m->f)) { + m->f|=MBCS_FROM_U_EXT_FLAG; + m->moveFlag=UCM_MOVE_TO_EXT; + ++needsMove; + } + } + + if(needsMove!=0) { + ucm_moveMappings(baseData.ucm->base, data->ucm->ext); + ucm_sortTable(data->ucm->ext); + } + } + if(!data->extData->addTable(data->extData, data->ucm->ext, &data->staticData)) { + *pErrorCode=U_INVALID_TABLE_FORMAT; + } + } + } + } + + cleanupConvData(&baseData); + } +} + +/* + * Hey, Emacs, please set the following: + * + * Local Variables: + * indent-tabs-mode: nil + * End: + * + */ diff --git a/intl/icu/source/tools/makeconv/makeconv.h b/intl/icu/source/tools/makeconv/makeconv.h new file mode 100644 index 0000000000..addc2cb3e7 --- /dev/null +++ b/intl/icu/source/tools/makeconv/makeconv.h @@ -0,0 +1,62 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2000-2010, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: makeconv.h +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2000nov01 +* created by: Markus W. 
Scherer +*/ + +#ifndef __MAKECONV_H__ +#define __MAKECONV_H__ + +#include "unicode/utypes.h" +#include "ucnv_bld.h" +#include "unewdata.h" +#include "ucm.h" + +/* exports from makeconv.c */ +U_CFUNC UBool VERBOSE; +U_CFUNC UBool SMALL; +U_CFUNC UBool IGNORE_SISO_CHECK; + +/* converter table type for writing */ +enum { + TABLE_NONE, + TABLE_BASE, + TABLE_EXT, + TABLE_BASE_AND_EXT +}; + +/* abstract converter generator struct, C++ - style */ +struct NewConverter; +typedef struct NewConverter NewConverter; + +U_CDECL_BEGIN +struct NewConverter { + void + (* U_CALLCONV_FPTR close)(NewConverter *cnvData); + + /** is this byte sequence valid? */ + UBool + (*U_CALLCONV_FPTR isValid)(NewConverter *cnvData, + const uint8_t *bytes, int32_t length); + + UBool + (*U_CALLCONV_FPTR addTable)(NewConverter *cnvData, UCMTable *table, UConverterStaticData *staticData); + + uint32_t + (*U_CALLCONV_FPTR write)(NewConverter *cnvData, const UConverterStaticData *staticData, + UNewDataMemory *pData, int32_t tableType); +}; +U_CDECL_END +#endif /* __MAKECONV_H__ */ diff --git a/intl/icu/source/tools/makeconv/makeconv.vcxproj b/intl/icu/source/tools/makeconv/makeconv.vcxproj new file mode 100644 index 0000000000..1ec8ec709e --- /dev/null +++ b/intl/icu/source/tools/makeconv/makeconv.vcxproj @@ -0,0 +1,87 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <PropertyGroup Label="Globals"> + <ProjectGuid>{F5AD9738-1A3D-4906-B9C4-A7D9CE33DC2C}</ProjectGuid> + </PropertyGroup> + <PropertyGroup Label="Configuration"> + <ConfigurationType>Application</ConfigurationType> + <UseOfMfc>false</UseOfMfc> + <CharacterSet>MultiByte</CharacterSet> + </PropertyGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> + <!-- The following import will include the 'default' configuration options for VS projects. 
--> + <Import Project="..\..\allinone\Build.Windows.ProjectConfiguration.props" /> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> + <ImportGroup Label="ExtensionSettings"> + </ImportGroup> + <PropertyGroup Label="UserMacros" /> + <PropertyGroup> + <_ProjectFileVersion>10.0.30319.1</_ProjectFileVersion> + <OutDir>.\$(Platform)\$(Configuration)\</OutDir> + <IntDir>.\$(Platform)\$(Configuration)\</IntDir> + <!-- The ICU projects use "Win32" to mean "x86", so we need to special case it. --> + <OutDir Condition="'$(Platform)'=='Win32'">.\x86\$(Configuration)\</OutDir> + <IntDir Condition="'$(Platform)'=='Win32'">.\x86\$(Configuration)\</IntDir> + <!-- Disable Incremental Linking for Release builds as it prevents Link-time Code Generation --> + <LinkIncremental Condition="'$(Configuration)'=='Debug'">true</LinkIncremental> + <LinkIncremental Condition="'$(Configuration)'=='Release'">false</LinkIncremental> + </PropertyGroup> + <!-- Options that are common to *all* configurations --> + <ItemDefinitionGroup> + <Midl> + <TypeLibraryName>$(OutDir)/makeconv.tlb</TypeLibraryName> + </Midl> + <ClCompile> + <WarningLevel>Level3</WarningLevel> + <CompileAs>Default</CompileAs> + <DisableLanguageExtensions>false</DisableLanguageExtensions> + <AdditionalIncludeDirectories>..\..\..\include;..\..\common;..\toolutil;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> + <PrecompiledHeaderOutputFile>$(OutDir)/makeconv.pch</PrecompiledHeaderOutputFile> + <AssemblerListingLocation>$(OutDir)/</AssemblerListingLocation> + <ObjectFileName>$(OutDir)/</ObjectFileName> + <ProgramDataBaseFileName>$(OutDir)/makeconv.pdb</ProgramDataBaseFileName> + </ClCompile> + <Link> + <SubSystem>Console</SubSystem> + <OutputFile>$(OutDir)/makeconv.exe</OutputFile> + <AdditionalLibraryDirectories>..\..\..\$(IcuLibOutputDir);%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories> + </Link> + <CustomBuildStep> + <Command>copy "$(TargetPath)" ..\..\..\$(IcuBinOutputDir)</Command> + 
<Outputs>..\..\..\$(IcuBinOutputDir)\$(TargetFileName);%(Outputs)</Outputs> + </CustomBuildStep> + </ItemDefinitionGroup> + <!-- Options that are common to all 'Debug' project configurations --> + <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'"> + <ClCompile> + <BrowseInformation>true</BrowseInformation> + <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary> + </ClCompile> + <Link> + <AdditionalDependencies>icuucd.lib;icutud.lib;%(AdditionalDependencies)</AdditionalDependencies> + </Link> + </ItemDefinitionGroup> + <!-- Options that are common to all 'Release' project configurations --> + <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'"> + <ClCompile> + <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary> + <FunctionLevelLinking>true</FunctionLevelLinking> + </ClCompile> + <Link> + <AdditionalDependencies>icuuc.lib;icutu.lib;%(AdditionalDependencies)</AdditionalDependencies> + </Link> + </ItemDefinitionGroup> + <ItemGroup> + <ClCompile Include="gencnvex.c" /> + <ClCompile Include="genmbcs.cpp" /> + <ClCompile Include="makeconv.cpp" /> + <ClCompile Include="ucnvstat.c" /> + </ItemGroup> + <ItemGroup> + <ClInclude Include="genmbcs.h" /> + <ClInclude Include="makeconv.h" /> + </ItemGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> + <ImportGroup Label="ExtensionTargets"> + </ImportGroup> +</Project>
\ No newline at end of file diff --git a/intl/icu/source/tools/makeconv/makeconv.vcxproj.filters b/intl/icu/source/tools/makeconv/makeconv.vcxproj.filters new file mode 100644 index 0000000000..b5232c7c7d --- /dev/null +++ b/intl/icu/source/tools/makeconv/makeconv.vcxproj.filters @@ -0,0 +1,39 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <ItemGroup> + <Filter Include="Source Files"> + <UniqueIdentifier>{de2dc8b2-bfcb-4516-bc0b-851f2bddd695}</UniqueIdentifier> + <Extensions>cpp;c;cxx;rc;def;r;odl;idl;hpj;bat</Extensions> + </Filter> + <Filter Include="Header Files"> + <UniqueIdentifier>{0638fe1b-842e-4db0-b609-7da558bbad33}</UniqueIdentifier> + <Extensions>h;hpp;hxx;hm;inl</Extensions> + </Filter> + <Filter Include="Resource Files"> + <UniqueIdentifier>{c192904c-2a84-40cd-8829-c5a00d5a15fb}</UniqueIdentifier> + <Extensions>ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe</Extensions> + </Filter> + </ItemGroup> + <ItemGroup> + <ClCompile Include="gencnvex.c"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="genmbcs.cpp"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="makeconv.cpp"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="ucnvstat.c"> + <Filter>Source Files</Filter> + </ClCompile> + </ItemGroup> + <ItemGroup> + <ClInclude Include="genmbcs.h"> + <Filter>Header Files</Filter> + </ClInclude> + <ClInclude Include="makeconv.h"> + <Filter>Header Files</Filter> + </ClInclude> + </ItemGroup> +</Project>
\ No newline at end of file diff --git a/intl/icu/source/tools/makeconv/sources.txt b/intl/icu/source/tools/makeconv/sources.txt new file mode 100644 index 0000000000..4283ea9c49 --- /dev/null +++ b/intl/icu/source/tools/makeconv/sources.txt @@ -0,0 +1,4 @@ +gencnvex.c +genmbcs.cpp +makeconv.cpp +ucnvstat.c diff --git a/intl/icu/source/tools/makeconv/ucnvstat.c b/intl/icu/source/tools/makeconv/ucnvstat.c new file mode 100644 index 0000000000..2140bc263f --- /dev/null +++ b/intl/icu/source/tools/makeconv/ucnvstat.c @@ -0,0 +1,72 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* + ****************************************************************************** + * + * Copyright (C) 1998-2006, International Business Machines + * Corporation and others. All Rights Reserved. + * + ****************************************************************************** + * + * + * ucnvstat.c: + * UConverterStaticData prototypes for data based converters + */ + +#include <stdbool.h> + +#include "unicode/utypes.h" +#include "unicode/ucnv.h" +#include "toolutil.h" +#include "ucnv_bld.h" + + +static const UConverterStaticData _SBCSStaticData={ + sizeof(UConverterStaticData), + "SBCS", + 0, UCNV_IBM, UCNV_SBCS, 1, 1, + { 0x1a, 0, 0, 0 }, 1, false, false, + 0, + 0, + { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ +}; + + +static const UConverterStaticData _DBCSStaticData={ + sizeof(UConverterStaticData), + "DBCS", + 0, UCNV_IBM, UCNV_DBCS, 2, 2, + { 0, 0, 0, 0 },0, false, false, /* subchar */ + 0, + 0, + { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ +}; + +static const UConverterStaticData _MBCSStaticData={ + sizeof(UConverterStaticData), + "MBCS", + 0, UCNV_IBM, UCNV_MBCS, 1, 1, + { 0x1a, 0, 0, 0 }, 1, false, false, + 0, + 0, + { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ +}; + +static const UConverterStaticData _EBCDICStatefulStaticData={ + sizeof(UConverterStaticData), + 
"EBCDICStateful", + 0, UCNV_IBM, UCNV_EBCDIC_STATEFUL, 1, 1, + { 0, 0, 0, 0 },0, false, false, + 0, + 0, + { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ +}; + +/* NULLs for algorithmic types, their tables live in ucnv_bld.c */ +const UConverterStaticData *ucnv_converterStaticData[UCNV_NUMBER_OF_SUPPORTED_CONVERTER_TYPES]={ + &_SBCSStaticData, &_DBCSStaticData, &_MBCSStaticData, NULL/*Lat1*/, + NULL/*UTF8*/, NULL/*UTF16be*/, NULL/*UTF16LE*/, NULL/*UTF32be*/, NULL/*UTF32LE*/, &_EBCDICStatefulStaticData, + NULL/*ISO2022*/, + /* LMBCS */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL +}; + diff --git a/intl/icu/source/tools/memcheck/ICUMemCheck.pl b/intl/icu/source/tools/memcheck/ICUMemCheck.pl new file mode 100755 index 0000000000..019f7ecd78 --- /dev/null +++ b/intl/icu/source/tools/memcheck/ICUMemCheck.pl @@ -0,0 +1,64 @@ +# Copyright (C) 2016 and later: Unicode, Inc. and others. +# License & terms of use: http://www.unicode.org/copyright.html +# *********************************************************************** +# * COPYRIGHT: +# * Copyright (c) 2004-2006, International Business Machines Corporation +# * and others. All Rights Reserved. +# *********************************************************************** +# +# This perl script checks for correct memory function usage in ICU library code. +# It works with Linux builds of ICU using clang or gcc. +# +# To run it, +# 1. Build ICU +# 2. cd icu/source +# 3. perl tools/memcheck/ICUMemCheck.pl +# +# All object files containing direct references to C or C++ runtime library memory +# functions will be listed in the output. +# +# For ICU 58, the expected output is +# common/uniset.o U operator delete(void*) +# common/unifilt.o U operator delete(void*) +# common/cmemory.o U malloc +# common/cmemory.o U free +# i18n/strrepl.o U operator delete(void*) +# +# cmemory.c Expected failures from uprv_malloc, uprv_free implementation. 
+# uniset.cpp Fails because of SymbolTable::~SymbolTable() +# unifilt.cpp Fails because of UnicodeMatcher::~UnicodeMatcher() +# strrepl.cpp Fails because of UnicodeReplacer::~UnicodeReplacer() +# +# To verify that no additional problems exist in the .cpp files, #ifdef out the +# offending destructors, rebuild icu, and re-run the tool. The problems should +# be gone. +# +# The problem destructors all are for mix-in style interface classes. +# These classes can not derive from UObject or UMemory because of multiple-inheritance +# problems, so they don't get the ICU memory functions. The delete code +# in the destructors will never be called because stand-alone instances of +# the classes cannot exist. +# +$fileNames = `find common i18n io -name "*.o" -print`; +foreach $f (split('\n', $fileNames)) { + $symbols = `nm -u -C $f`; + if ($symbols =~ /U +operator delete\(void\*\)/) { + print "$f $&\n"; + } + if ($symbols =~ /U +operator delete\[\]\(void\*\)/) { + print "$f $&\n"; + } + if ($symbols =~ /U +operator new\(unsigned int\)/) { + print "$f $&\n"; + } + if ($symbols =~ /U +operator new\[\]\(unsigned int\)/) { + print "$f $&\n"; + } + if ($symbols =~ /U +malloc.*/) { + print "$f $&\n"; + } + if ($symbols =~ /(?m:U +free$)/) { + print "$f $&\n"; + } + +} diff --git a/intl/icu/source/tools/pkgdata/Makefile.in b/intl/icu/source/tools/pkgdata/Makefile.in new file mode 100644 index 0000000000..4777998852 --- /dev/null +++ b/intl/icu/source/tools/pkgdata/Makefile.in @@ -0,0 +1,102 @@ +## Makefile.in for ICU - tools/pkgdata +## Copyright (C) 2016 and later: Unicode, Inc. and others. +## License & terms of use: http://www.unicode.org/copyright.html +## Copyright (c) 1999-2011, International Business Machines Corporation and +## others. All Rights Reserved. +## Steven R. Loomis + +## Source directory information +srcdir = @srcdir@ +top_srcdir = @top_srcdir@ + +top_builddir = ../.. 
+ +include $(top_builddir)/icudefs.mk + +## Build directory information +subdir = tools/pkgdata + +TARGET_STUB_NAME = pkgdata + +SECTION = 1 + +MAN_FILES = $(TARGET_STUB_NAME).$(SECTION) + +## Extra files to remove for 'make clean' +CLEANFILES = *~ $(DEPS) $(MAN_FILES) + +ifneq ($(PKGDATA_DEFS),) +DEFS += $(PKGDATA_DEFS) +endif + +## Target information +TARGET = $(BINDIR)/$(TARGET_STUB_NAME)$(EXEEXT) + +CPPFLAGS += -I$(srcdir) -I$(top_srcdir)/common -I$(srcdir)/../toolutil +DEFS += -DUDATA_SO_SUFFIX=\".$(SO)\" -DSTATIC_O=\"$(STATIC_O)\" +LIBS = $(LIBICUTOOLUTIL) $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS) $(LIB_M) + +SOURCES = $(shell cat $(srcdir)/sources.txt) +OBJECTS = $(patsubst %.cpp,%.o,$(patsubst %.c,%.o, $(SOURCES))) + +DEPS = $(OBJECTS:.o=.d) + +## List of phony targets +.PHONY : all all-local install install-local clean clean-local \ +distclean distclean-local dist dist-local check check-local install-man + +## Clear suffix list +.SUFFIXES : + +## List of standard targets +all: all-local +install: install-local +clean: clean-local +distclean : distclean-local +dist: dist-local +check: all check-local + +all-local: $(TARGET) $(MAN_FILES) + +install-local: all-local install-man + $(MKINSTALLDIRS) $(DESTDIR)$(bindir) + $(INSTALL) $(TARGET) $(DESTDIR)$(bindir) + +install-man: $(MAN_FILES) + $(MKINSTALLDIRS) $(DESTDIR)$(mandir)/man$(SECTION) + $(INSTALL_DATA) $? 
$(DESTDIR)$(mandir)/man$(SECTION) + + +dist-local: + +clean-local: + test -z "$(CLEANFILES)" || $(RMV) $(CLEANFILES) + $(RMV) $(TARGET) $(OBJECTS) + +distclean-local: clean-local + $(RMV) Makefile + +check-local: all-local + +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + cd $(top_builddir) \ + && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status + +$(TARGET) : $(OBJECTS) + $(LINK.cc) $(OUTOPT)$@ $^ $(LIBS) + $(POST_BUILD_STEP) + + +%.$(SECTION): $(srcdir)/%.$(SECTION).in + cd $(top_builddir) \ + && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status + + +ifeq (,$(MAKECMDGOALS)) +-include $(DEPS) +else +ifneq ($(patsubst %clean,,$(MAKECMDGOALS)),) +-include $(DEPS) +endif +endif + diff --git a/intl/icu/source/tools/pkgdata/pkgdata.1.in b/intl/icu/source/tools/pkgdata/pkgdata.1.in new file mode 100644 index 0000000000..cee92cab94 --- /dev/null +++ b/intl/icu/source/tools/pkgdata/pkgdata.1.in @@ -0,0 +1,260 @@ +.\" Hey, Emacs! This is -*-nroff-*- you know... +.\" +.\" pkgdata.1: manual page for the pkgdata utility +.\" +.\" Copyright (C) 2016 and later: Unicode, Inc. and others. +.\" License & terms of use: http://www.unicode.org/copyright.html +.\" Copyright (C) 2000-2009 IBM, Inc. and others. +.\" +.\" Manual page by Yves Arrouye <yves@realnames.com>. +.\" Modified by Michael Ow <mow@us.ibm.com>. 
+.\" +.TH PKGDATA 1 "6 February 2009" "ICU MANPAGE" "ICU @VERSION@ Manual" +.SH NAME +.B pkgdata +\- package data for use by ICU +.SH SYNOPSIS +.B pkgdata +[ +.BR "\-h\fP, \fB\-?\fP, \fB\-\-help" +] +[ +.BI "\-v\fP, \fB\-\-verbose" +] +[ +.BR "\-c\fP, \fB\-\-copyright" +| +.BI "\-C\fP, \fB\-\-comment" " comment" +] +[ +.BI "\-m\fP, \fB\-\-mode" " mode" +] +.BI "\-p\fP, \fB\-\-name" " name" +.BI "\-O\fP, \fB\-\-bldopt" " options" +[ +.BI "\-e\fP, \fB\-\-entrypoint" " name" +] +[ +.BI "\-r\fP, \fB\-\-revision" " version" +] +[ +.BI "\-F\fP, \fB\-\-rebuild" +] +[ +.BI "\-I\fP, \fB\-\-install" +] +[ +.BI "\-s\fP, \fB\-\-sourcedir" " source" +] +[ +.BI "\-d\fP, \fB\-\-destdir" " destination" +] +[ +.BI "\-T\fP, \fB\-\-tempdir" " directory" +] +[ +.IR file " .\|.\|." +] +.SH DESCRIPTION +.B pkgdata +takes a set of data files and packages them for use by ICU or +applications that use ICU. The typical reason to package files using +.B pkgdata +is to make their distribution easier and their loading by ICU faster +and less consuming of limited system resources such as file +descriptors. +Packaged data also allow applications to be distributed with fewer +resource files, or even with none at all if they link against the +packaged data directly. +.PP +.B pkgdata +supports a few different methods of packaging data that serve +different purposes. +.PP +The default packaging +.I mode +is +.BR common , +or +.BR archive . +In this mode, the different data files are bundled together as an +architecture-dependent file that can later be memory mapped for use by +ICU. Data packaged using this mode will be looked up under the ICU +data directory. Such packaging is easy to use for applications resource +bundles, for example, as long as the application can install the +packaged file in the ICU data directory. +.PP +Another packaging mode is the +.BR dll , +or +.BR library , +mode, where the data files are compiled into a shared library. 
ICU +used to be able to dynamically load these shared libraries, but as of +ICU 2.0, such support has been removed. This mode is still useful for +two main purposes: to build ICU itself, as the ICU data is packaged as +a shared library by default; and to build resource bundles that are +linked to the application that uses them. Such resource bundles can +then be placed anywhere where the system's dynamic linker will be +looking for shared libraries, instead of being forced to live inside +the ICU data directory. +.PP +The +.BR static +packaging mode is similar to the shared library one except that it +produces a static library. +.\" Note that many platforms are not able to +.\" dynamically load symbols from static object files, so for this reason +.\" .BR udata_setAppData() +.\" must be called +.\" to install this data. As a convenience, pkgdata will build a C source file +.\" and a header file. Given a data package named +.\" .IR name, in the output +.\" directory will be created +.\" .IR name .c +.\" and +.\" .IR name .h with the single +.\" function +.\" .BR "udata_install_\fIcname\fB(UErrorCode *err)" , +.\" where +.\" .I cname +.\" is +.\" .I name +.\" turned into a valid C identifier. +.\" The application needs to call this function once. The error code returned +.\" is that of +.\" .BR udata_setAppData() . +.\" .PP +.\" Data packaged in a library, whether shared or static, +.\" Subsequently, the application can access this data by passing +.\" .I name for the +.\" .I path +.\" argument to functions such as +.\" .BR ures_open() . +.PP +Finally, +.B pkgdata +supports a +.B files +mode which simply copies the data files instead of packaging +them as a single file or library. This mode is mainly intended to +provide support for building ICU before it is packaged as separate +small packages for distribution with operating systems such as Debian +GNU/Linux for example. 
Please refer to the packaging documentation in +the ICU source distribution for further information on the use of this +mode. +.PP +.B pkgdata +builds, packages, installs, or cleans the appropriate data based on the options given +without the need to call GNU +.BR make +anymore. +.SH OPTIONS +.TP +.BR "\-h\fP, \fB\-?\fP, \fB\-\-help" +Print help about usage and exit. +.TP +.BR "\-v\fP, \fB\-\-verbose" +Display extra informative messages during execution. +.TP +.BR "\-c\fP, \fB\-\-copyright" +Include a copyright notice in the binary data. +.TP +.BI "\-C\fP, \fB\-\-comment" " comment" +Include the specified +.I comment +in the resulting data instead of the ICU copyright notice. +.TP +.BI "\-m\fP, \fB\-\-mode" " mode" +Set the packaging +.I mode +to be used by +.BR pkgdata . +The different modes and their meaning are explained in the +.B DESCRIPTION +section above. The valid mode names are +.BR common +(or +.BR archive ), +.BR dll +(or +.BR library ), +.BR static , +and +.BR files . +.TP +.BI "\-O\fP, \fB\-\-bldopt" " options" +Specify options for the builder. The builder is used internally by +.B pkgdata +to generate the correct packaged file. Such options include, but are +not limited to, setting variables used by +.BR make (1) +during the build of the packaged file. Note: If +.BR icu-config +is available, then this option is not needed. +.TP +.BI "\-p\fP, \fB\-\-name" " name" +Set the packaged file name to +.IR name . +This name is also used as the default entry point name after having +been turned into a valid C identifier. +.TP +.BI "\-e\fP, \fB\-\-entrypoint" " name" +Set the data entry point (used for linking against the data in a +shared library form) to +.IR name . +The default entry point name is the name set by the +.BI "\-p\fP, \fB\-\-name" +option. +.TP +.BI "\-r\fP, \fB\-\-revision" " version" +Enable versioning of the shared library produced in +.BR dll , +or +.BR library , +mode.
The version number has the format +.I major\fP.\fIminor\fP.\fIpatchlevel +and all parts except for +.I major +are optional. If only +.I major +is supplied then the version is +assumed to be +.IR major .0 +for versioning purposes. +.TP +.BI "\-F\fP, \fB\-\-rebuild" +Force the rebuilding of all data and their repackaging. +.TP +.BI "\-I\fP, \fB\-\-install" +Install the packaged file (or all the files in the +.B files +mode). If the variable +.B DESTDIR +is set it will be used for installation. +.TP +.BI "\-s\fP, \fB\-\-sourcedir" " source" +Set the source directory to +.IR source . +The default source directory is the current directory. +.TP +.BI "\-d\fP, \fB\-\-destdir" " destination" +Set the destination directory to +.IR destination . +The default destination directory is the current directory. +.TP +.BI "\-T\fP, \fB\-\-tempdir" " directory" +Set the directory used to generate temporary files to +.IR directory . +The default temporary directory is the same as the destination +directory +as set by the +.BI "\-d\fP, \fB\-\-destdir" +option. +.SH AUTHORS +Steven Loomis +.br +Yves Arrouye +.SH VERSION +@VERSION@ +.SH COPYRIGHT +Copyright (C) 2000-2009 IBM, Inc. and others. + diff --git a/intl/icu/source/tools/pkgdata/pkgdata.cpp b/intl/icu/source/tools/pkgdata/pkgdata.cpp new file mode 100644 index 0000000000..c2ac112f6e --- /dev/null +++ b/intl/icu/source/tools/pkgdata/pkgdata.cpp @@ -0,0 +1,2293 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/****************************************************************************** + * Copyright (C) 2000-2016, International Business Machines + * Corporation and others. All Rights Reserved. 
+ ******************************************************************************* + * file name: pkgdata.cpp + * encoding: ANSI X3.4 (1968) + * tab size: 8 (not used) + * indentation:4 + * + * created on: 2000may15 + * created by: Steven \u24C7 Loomis + * + * This program packages the ICU data into different forms + * (DLL, common data, etc.) + */ + +// Defines _XOPEN_SOURCE for access to POSIX functions. +// Must be before any other #includes. +#include "uposixdefs.h" + +#include "unicode/utypes.h" + +#include "unicode/putil.h" +#include "putilimp.h" + +#if U_HAVE_POPEN +#if (U_PF_MINGW <= U_PLATFORM && U_PLATFORM <= U_PF_CYGWIN) && defined(__STRICT_ANSI__) +/* popen/pclose aren't defined in strict ANSI on Cygwin and MinGW */ +#undef __STRICT_ANSI__ +#endif +#endif + +#include "cmemory.h" +#include "cstring.h" +#include "filestrm.h" +#include "toolutil.h" +#include "unicode/uclean.h" +#include "unewdata.h" +#include "uoptions.h" +#include "package.h" +#include "pkg_icu.h" +#include "pkg_genc.h" +#include "pkg_gencmn.h" +#include "flagparser.h" +#include "filetools.h" +#include "charstr.h" +#include "uassert.h" + +#if U_HAVE_POPEN +# include <unistd.h> +#endif + +#include <stdio.h> +#include <stdlib.h> + +U_CDECL_BEGIN +#include "pkgtypes.h" +U_CDECL_END + +#if U_HAVE_POPEN + +using icu::LocalPointerBase; + +U_DEFINE_LOCAL_OPEN_POINTER(LocalPipeFilePointer, FILE, pclose); + +#endif + +using icu::LocalMemory; + +static void loadLists(UPKGOptions *o, UErrorCode *status); + +static int32_t pkg_executeOptions(UPKGOptions *o); + +#ifdef WINDOWS_WITH_MSVC +static int32_t pkg_createWindowsDLL(const char mode, const char *gencFilePath, UPKGOptions *o); +#endif +static int32_t pkg_createSymLinks(const char *targetDir, UBool specialHandling=false); +static int32_t pkg_installLibrary(const char *installDir, const char *dir, UBool noVersion); +static int32_t pkg_installFileMode(const char *installDir, const char *srcDir, const char *fileListName); +static int32_t 
pkg_installCommonMode(const char *installDir, const char *fileName); + +#ifdef BUILD_DATA_WITHOUT_ASSEMBLY +static int32_t pkg_createWithoutAssemblyCode(UPKGOptions *o, const char *targetDir, const char mode); +#endif + +#ifdef CAN_WRITE_OBJ_CODE +static void pkg_createOptMatchArch(char *optMatchArch); +static void pkg_destroyOptMatchArch(char *optMatchArch); +#endif + +static int32_t pkg_createWithAssemblyCode(const char *targetDir, const char mode, const char *gencFilePath); +static int32_t pkg_generateLibraryFile(const char *targetDir, const char mode, const char *objectFile, char *command = nullptr, UBool specialHandling=false); +static int32_t pkg_archiveLibrary(const char *targetDir, const char *version, UBool reverseExt); +static void createFileNames(UPKGOptions *o, const char mode, const char *version_major, const char *version, const char *libName, const UBool reverseExt, UBool noVersion); +static int32_t initializePkgDataFlags(UPKGOptions *o); + +static int32_t pkg_getPkgDataPath(UBool verbose, UOption *option); +static int runCommand(const char* command, UBool specialHandling=false); + +#define IN_COMMON_MODE(mode) (mode == 'a' || mode == 'c') +#define IN_DLL_MODE(mode) (mode == 'd' || mode == 'l') +#define IN_STATIC_MODE(mode) (mode == 's') +#define IN_FILES_MODE(mode) (mode == 'f') + +enum { + NAME, + BLDOPT, + MODE, + HELP, + HELP_QUESTION_MARK, + VERBOSE, + COPYRIGHT, + COMMENT, + DESTDIR, + REBUILD, + TEMPDIR, + INSTALL, + SOURCEDIR, + ENTRYPOINT, + REVISION, + FORCE_PREFIX, + LIBNAME, + QUIET, + WITHOUT_ASSEMBLY, + PDS_BUILD, + WIN_UWP_BUILD, + WIN_DLL_ARCH, + WIN_DYNAMICBASE +}; + +/* This sets the modes that are available */ +static struct { + const char *name, *alt_name; + const char *desc; +} modes[] = { + { "files", 0, "Uses raw data files (no effect). Installation copies all files to the target location." 
}, +#if U_PLATFORM_HAS_WIN32_API + { "dll", "library", "Generates one common data file and one shared library, <package>.dll"}, + { "common", "archive", "Generates just the common file, <package>.dat"}, + { "static", "static", "Generates one statically linked library, " LIB_PREFIX "<package>" UDATA_LIB_SUFFIX } +#else +#ifdef UDATA_SO_SUFFIX + { "dll", "library", "Generates one shared library, <package>" UDATA_SO_SUFFIX }, +#endif + { "common", "archive", "Generates one common data file, <package>.dat" }, + { "static", "static", "Generates one statically linked library, " LIB_PREFIX "<package>" UDATA_LIB_SUFFIX } +#endif +}; + +/* Command-line option table. main() indexes it with the option-index enum (NAME ... WIN_DYNAMICBASE) and prints options_help[i] next to options[i], so the entry order here must match both; the numeric comments give each entry's index. */ static UOption options[]={ + /*00*/ UOPTION_DEF( "name", 'p', UOPT_REQUIRES_ARG), + /*01*/ UOPTION_DEF( "bldopt", 'O', UOPT_REQUIRES_ARG), /* on Win32 it is release or debug */ + /*02*/ UOPTION_DEF( "mode", 'm', UOPT_REQUIRES_ARG), + /*03*/ UOPTION_HELP_H, /* -h */ + /*04*/ UOPTION_HELP_QUESTION_MARK, /* -? */ + /*05*/ UOPTION_VERBOSE, /* -v */ + /*06*/ UOPTION_COPYRIGHT, /* -c */ + /*07*/ UOPTION_DEF( "comment", 'C', UOPT_REQUIRES_ARG), + /*08*/ UOPTION_DESTDIR, /* -d */ + /*09*/ UOPTION_DEF( "rebuild", 'F', UOPT_NO_ARG), + /*10*/ UOPTION_DEF( "tempdir", 'T', UOPT_REQUIRES_ARG), + /*11*/ UOPTION_DEF( "install", 'I', UOPT_REQUIRES_ARG), + /*12*/ UOPTION_SOURCEDIR , + /*13*/ UOPTION_DEF( "entrypoint", 'e', UOPT_REQUIRES_ARG), + /*14*/ UOPTION_DEF( "revision", 'r', UOPT_REQUIRES_ARG), + /*15*/ UOPTION_DEF( "force-prefix", 'f', UOPT_NO_ARG), + /*16*/ UOPTION_DEF( "libname", 'L', UOPT_REQUIRES_ARG), + /*17*/ UOPTION_DEF( "quiet", 'q', UOPT_NO_ARG), + /*18*/ UOPTION_DEF( "without-assembly", 'w', UOPT_NO_ARG), + /*19*/ UOPTION_DEF("zos-pds-build", 'z', UOPT_NO_ARG), + /*20*/ UOPTION_DEF("windows-uwp-build", 'u', UOPT_NO_ARG), + /*21*/ UOPTION_DEF("windows-DLL-arch", 'a', UOPT_REQUIRES_ARG), + /*22*/ UOPTION_DEF("windows-dynamicbase", 'b', UOPT_NO_ARG), +}; + +/* This enum and the following char array should be kept in sync.
*/ +enum { + GENCCODE_ASSEMBLY_TYPE, + SO_EXT, + SOBJ_EXT, + A_EXT, + LIBPREFIX, + LIB_EXT_ORDER, + COMPILER, + LIBFLAGS, + GENLIB, + LDICUDTFLAGS, + LD_SONAME, + RPATH_FLAGS, + BIR_FLAGS, + AR, + ARFLAGS, + RANLIB, + INSTALL_CMD, + PKGDATA_FLAGS_SIZE +}; +static const char* FLAG_NAMES[PKGDATA_FLAGS_SIZE] = { + "GENCCODE_ASSEMBLY_TYPE", + "SO", + "SOBJ", + "A", + "LIBPREFIX", + "LIB_EXT_ORDER", + "COMPILE", + "LIBFLAGS", + "GENLIB", + "LDICUDTFLAGS", + "LD_SONAME", + "RPATH_FLAGS", + "BIR_LDFLAGS", + "AR", + "ARFLAGS", + "RANLIB", + "INSTALL_CMD" +}; +static char **pkgDataFlags = nullptr; + +enum { + LIB_FILE, + LIB_FILE_VERSION_MAJOR, + LIB_FILE_VERSION, + LIB_FILE_VERSION_TMP, +#if U_PLATFORM == U_PF_CYGWIN + LIB_FILE_CYGWIN, + LIB_FILE_CYGWIN_VERSION, +#elif U_PLATFORM == U_PF_MINGW + LIB_FILE_MINGW, +#elif U_PLATFORM == U_PF_OS390 + LIB_FILE_OS390BATCH_MAJOR, + LIB_FILE_OS390BATCH_VERSION, +#endif + LIB_FILENAMES_SIZE +}; +static char libFileNames[LIB_FILENAMES_SIZE][256]; + +static UPKGOptions *pkg_checkFlag(UPKGOptions *o); + +const char options_help[][320]={ + "Set the data name", +#ifdef U_MAKE_IS_NMAKE + "The directory where the ICU is located (e.g. <ICUROOT> which contains the bin directory)", +#else + "Specify options for the builder.", +#endif + "Specify the mode of building (see below; default: common)", + "This usage text", + "This usage text", + "Make the output verbose", + "Use the standard ICU copyright", + "Use a custom comment (instead of the copyright)", + "Specify the destination directory for files", + "Force rebuilding of all data", + "Specify temporary dir (default: output dir)", + "Install the data (specify target)", + "Specify a custom source directory", + "Specify a custom entrypoint name (default: short name)", + "Specify a version when packaging in dll or static mode", + "Add package to all file names if not present", + "Library name to build (if different than package name)", + "Quiet mode. (e.g. 
Do not output a readme file for static libraries)", + "Build the data without assembly code", + "Build PDS dataset (zOS build only)", + "Build for Universal Windows Platform (Windows build only)", + "Specify the DLL machine architecture for LINK.exe (Windows build only)", + "Ignored. Enable DYNAMICBASE on the DLL. This is now the default. (Windows build only)", +}; + +const char *progname = "PKGDATA"; + +int +main(int argc, char* argv[]) { + int result = 0; + /* FileStream *out; */ + UPKGOptions o; + CharList *tail; + UBool needsHelp = false; + UErrorCode status = U_ZERO_ERROR; + /* char tmp[1024]; */ + uint32_t i; + int32_t n; + + U_MAIN_INIT_ARGS(argc, argv); + + progname = argv[0]; + + options[MODE].value = "common"; + + /* read command line options */ + argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options); + + /* error handling, printing usage message */ + /* I've decided to simply print an error and quit. This tool has too + many options to just display them all of the time. */ + + if(options[HELP].doesOccur || options[HELP_QUESTION_MARK].doesOccur) { + needsHelp = true; + } + else { + if(!needsHelp && argc<0) { + fprintf(stderr, + "%s: error in command line argument \"%s\"\n", + progname, + argv[-argc]); + fprintf(stderr, "Run '%s --help' for help.\n", progname); + return 1; + } + + +#if !defined(WINDOWS_WITH_MSVC) || defined(USING_CYGWIN) + if(!options[BLDOPT].doesOccur && uprv_strcmp(options[MODE].value, "common") != 0) { + if (pkg_getPkgDataPath(options[VERBOSE].doesOccur, &options[BLDOPT]) != 0) { + fprintf(stderr, " required parameter is missing: -O is required for static and shared builds.\n"); + fprintf(stderr, "Run '%s --help' for help.\n", progname); + return 1; + } + } +#else + if(options[BLDOPT].doesOccur) { + fprintf(stdout, "Warning: You are using the -O option which is not needed for MSVC build on Windows.\n"); + } +#endif + + if(!options[NAME].doesOccur) /* -O we already have - don't report it. 
*/ + { + fprintf(stderr, " required parameter -p is missing \n"); + fprintf(stderr, "Run '%s --help' for help.\n", progname); + return 1; + } + + if(argc == 1) { + fprintf(stderr, + "No input files specified.\n" + "Run '%s --help' for help.\n", progname); + return 1; + } + } /* end !needsHelp */ + + if(argc<0 || needsHelp ) { + fprintf(stderr, + "usage: %s [-options] [-] [packageFile] \n" + "\tProduce packaged ICU data from the given list(s) of files.\n" + "\t'-' by itself means to read from stdin.\n" + "\tpackageFile is a text file containing the list of files to package.\n", + progname); + + fprintf(stderr, "\n options:\n"); + for(i=0;i<UPRV_LENGTHOF(options);i++) { + fprintf(stderr, "%-5s -%c %s%-10s %s\n", + (i<1?"[REQ]":""), + options[i].shortName, + options[i].longName ? "or --" : " ", + options[i].longName ? options[i].longName : "", + options_help[i]); + } + + fprintf(stderr, "modes: (-m option)\n"); + for(i=0;i<UPRV_LENGTHOF(modes);i++) { + fprintf(stderr, " %-9s ", modes[i].name); + if (modes[i].alt_name) { + fprintf(stderr, "/ %-9s", modes[i].alt_name); + } else { + fprintf(stderr, " "); + } + fprintf(stderr, " %s\n", modes[i].desc); + } + return 1; + } + + /* OK, fill in the options struct */ + uprv_memset(&o, 0, sizeof(o)); + + o.mode = options[MODE].value; + o.version = options[REVISION].doesOccur ? options[REVISION].value : 0; + + o.shortName = options[NAME].value; + { + int32_t len = (int32_t)uprv_strlen(o.shortName); + char *csname, *cp; + const char *sp; + + cp = csname = (char *) uprv_malloc((len + 1 + 1) * sizeof(*o.cShortName)); + if (*(sp = o.shortName)) { + *cp++ = isalpha(*sp) ? * sp : '_'; + for (++sp; *sp; ++sp) { + *cp++ = isalnum(*sp) ? 
*sp : '_'; + } + } + *cp = 0; + + o.cShortName = csname; + } + + if(options[LIBNAME].doesOccur) { /* get libname from shortname, or explicit -L parameter */ + o.libName = options[LIBNAME].value; + } else { + o.libName = o.shortName; + } + + if(options[QUIET].doesOccur) { + o.quiet = true; + } else { + o.quiet = false; + } + + if(options[PDS_BUILD].doesOccur) { +#if U_PLATFORM == U_PF_OS390 + o.pdsbuild = true; +#else + o.pdsbuild = false; + fprintf(stdout, "Warning: You are using the -z option which only works on z/OS.\n"); + +#endif + } else { + o.pdsbuild = false; + } + + o.verbose = options[VERBOSE].doesOccur; + + +#if !defined(WINDOWS_WITH_MSVC) || defined(USING_CYGWIN) /* on UNIX, we'll just include the file... */ + if (options[BLDOPT].doesOccur) { + o.options = options[BLDOPT].value; + } else { + o.options = nullptr; + } +#endif + if(options[COPYRIGHT].doesOccur) { + o.comment = U_COPYRIGHT_STRING; + } else if (options[COMMENT].doesOccur) { + o.comment = options[COMMENT].value; + } + + if( options[DESTDIR].doesOccur ) { + o.targetDir = options[DESTDIR].value; + } else { + o.targetDir = "."; /* cwd */ + } + + o.rebuild = options[REBUILD].doesOccur; + + if( options[TEMPDIR].doesOccur ) { + o.tmpDir = options[TEMPDIR].value; + } else { + o.tmpDir = o.targetDir; + } + + if( options[INSTALL].doesOccur ) { + o.install = options[INSTALL].value; + } else { + o.install = nullptr; + } + + if( options[SOURCEDIR].doesOccur ) { + o.srcDir = options[SOURCEDIR].value; + } else { + o.srcDir = "."; + } + + if( options[ENTRYPOINT].doesOccur ) { + o.entryName = options[ENTRYPOINT].value; + } else { + o.entryName = o.cShortName; + } + + o.withoutAssembly = false; + if (options[WITHOUT_ASSEMBLY].doesOccur) { +#ifndef BUILD_DATA_WITHOUT_ASSEMBLY + fprintf(stdout, "Warning: You are using the option to build without assembly code which is not supported on this platform.\n"); + fprintf(stdout, "Warning: This option will be ignored.\n"); +#else + o.withoutAssembly = true; +#endif + } 
+ + if (options[WIN_DYNAMICBASE].doesOccur) { + fprintf(stdout, "Note: Ignoring option -b (windows-dynamicbase).\n"); + } + + /* OK options are set up. Now the file lists. */ + tail = nullptr; + for( n=1; n<argc; n++) { + o.fileListFiles = pkg_appendToList(o.fileListFiles, &tail, uprv_strdup(argv[n])); + } + + /* load the files */ + loadLists(&o, &status); + if( U_FAILURE(status) ) { + fprintf(stderr, "error loading input file lists: %s\n", u_errorName(status)); + return 2; + } + + result = pkg_executeOptions(&o); + + if (pkgDataFlags != nullptr) { + for (n = 0; n < PKGDATA_FLAGS_SIZE; n++) { + if (pkgDataFlags[n] != nullptr) { + uprv_free(pkgDataFlags[n]); + } + } + uprv_free(pkgDataFlags); + } + + if (o.cShortName != nullptr) { + uprv_free((char *)o.cShortName); + } + if (o.fileListFiles != nullptr) { + pkg_deleteList(o.fileListFiles); + } + if (o.filePaths != nullptr) { + pkg_deleteList(o.filePaths); + } + if (o.files != nullptr) { + pkg_deleteList(o.files); + } + return result; +} + +static int runCommand(const char* command, UBool specialHandling) { + char *cmd = nullptr; + char cmdBuffer[SMALL_BUFFER_MAX_SIZE]; + int32_t len = static_cast<int32_t>(strlen(command)); + + if (len == 0) { + return 0; + } + + if (!specialHandling) { +#if defined(USING_CYGWIN) || U_PLATFORM == U_PF_MINGW || U_PLATFORM == U_PF_OS400 + int32_t buff_len; + if ((len + BUFFER_PADDING_SIZE) >= SMALL_BUFFER_MAX_SIZE) { + cmd = (char *)uprv_malloc(len + BUFFER_PADDING_SIZE); + buff_len = len + BUFFER_PADDING_SIZE; + } else { + cmd = cmdBuffer; + buff_len = SMALL_BUFFER_MAX_SIZE; + } +#if defined(USING_CYGWIN) || U_PLATFORM == U_PF_MINGW + snprintf(cmd, buff_len, "bash -c \"%s\"", command); + +#elif U_PLATFORM == U_PF_OS400 + snprintf(cmd, buff_len "QSH CMD('%s')", command); +#endif +#else + goto normal_command_mode; +#endif + } else { +#if !(defined(USING_CYGWIN) || U_PLATFORM == U_PF_MINGW || U_PLATFORM == U_PF_OS400) +normal_command_mode: +#endif + cmd = (char *)command; + } + + 
printf("pkgdata: %s\n", cmd); + int result = system(cmd); + if (result != 0) { + fprintf(stderr, "-- return status = %d\n", result); + result = 1; // system() result code is platform specific. + } + + if (cmd != cmdBuffer && cmd != command) { + uprv_free(cmd); + } + + return result; +} + +#define LN_CMD "ln -s" +#define RM_CMD "rm -f" + +static int32_t pkg_executeOptions(UPKGOptions *o) { + int32_t result = 0; + + const char mode = o->mode[0]; + char targetDir[SMALL_BUFFER_MAX_SIZE] = ""; + char tmpDir[SMALL_BUFFER_MAX_SIZE] = ""; + char datFileName[SMALL_BUFFER_MAX_SIZE] = ""; + char datFileNamePath[LARGE_BUFFER_MAX_SIZE] = ""; + char checkLibFile[LARGE_BUFFER_MAX_SIZE] = ""; + + initializePkgDataFlags(o); + + if (IN_FILES_MODE(mode)) { + /* Copy the raw data to the installation directory. */ + if (o->install != nullptr) { + uprv_strcpy(targetDir, o->install); + if (o->shortName != nullptr) { + uprv_strcat(targetDir, PKGDATA_FILE_SEP_STRING); + uprv_strcat(targetDir, o->shortName); + } + + if(o->verbose) { + fprintf(stdout, "# Install: Files mode, copying files to %s..\n", targetDir); + } + result = pkg_installFileMode(targetDir, o->srcDir, o->fileListFiles->str); + } + return result; + } else /* if (IN_COMMON_MODE(mode) || IN_DLL_MODE(mode) || IN_STATIC_MODE(mode)) */ { + UBool noVersion = false; + + uprv_strcpy(targetDir, o->targetDir); + uprv_strcat(targetDir, PKGDATA_FILE_SEP_STRING); + + uprv_strcpy(tmpDir, o->tmpDir); + uprv_strcat(tmpDir, PKGDATA_FILE_SEP_STRING); + + uprv_strcpy(datFileNamePath, tmpDir); + + uprv_strcpy(datFileName, o->shortName); + uprv_strcat(datFileName, UDATA_CMN_SUFFIX); + + uprv_strcat(datFileNamePath, datFileName); + + if(o->verbose) { + fprintf(stdout, "# Writing package file %s ..\n", datFileNamePath); + } + result = writePackageDatFile(datFileNamePath, o->comment, o->srcDir, o->fileListFiles->str, nullptr, U_CHARSET_FAMILY ? 'e' : U_IS_BIG_ENDIAN ? 
'b' : 'l'); + if (result != 0) { + fprintf(stderr,"Error writing package dat file.\n"); + return result; + } + + if (IN_COMMON_MODE(mode)) { + char targetFileNamePath[LARGE_BUFFER_MAX_SIZE] = ""; + + uprv_strcpy(targetFileNamePath, targetDir); + uprv_strcat(targetFileNamePath, datFileName); + + /* Move the dat file created to the target directory. */ + if (uprv_strcmp(datFileNamePath, targetFileNamePath) != 0) { + if (T_FileStream_file_exists(targetFileNamePath)) { + if ((result = remove(targetFileNamePath)) != 0) { + fprintf(stderr, "Unable to remove old dat file: %s\n", + targetFileNamePath); + return result; + } + } + + result = rename(datFileNamePath, targetFileNamePath); + + if (o->verbose) { + fprintf(stdout, "# Moving package file to %s ..\n", + targetFileNamePath); + } + if (result != 0) { + fprintf( + stderr, + "Unable to move dat file (%s) to target location (%s).\n", + datFileNamePath, targetFileNamePath); + return result; + } + } + + if (o->install != nullptr) { + result = pkg_installCommonMode(o->install, targetFileNamePath); + } + + return result; + } else /* if (IN_STATIC_MODE(mode) || IN_DLL_MODE(mode)) */ { + char gencFilePath[SMALL_BUFFER_MAX_SIZE] = ""; + char version_major[10] = ""; + UBool reverseExt = false; + +#if !defined(WINDOWS_WITH_MSVC) || defined(USING_CYGWIN) + /* Get the version major number. */ + if (o->version != nullptr) { + for (uint32_t i = 0;i < sizeof(version_major);i++) { + if (o->version[i] == '.') { + version_major[i] = 0; + break; + } + version_major[i] = o->version[i]; + } + } else { + noVersion = true; + if (IN_DLL_MODE(mode)) { + fprintf(stdout, "Warning: Providing a revision number with the -r option is recommended when packaging data in the current mode.\n"); + } + } + +#if U_PLATFORM != U_PF_OS400 + /* Certain platforms have different library extension ordering. (e.g. libicudata.##.so vs libicudata.so.##) + * reverseExt is false if the suffix should be the version number. 
+ */ + if (pkgDataFlags[LIB_EXT_ORDER][uprv_strlen(pkgDataFlags[LIB_EXT_ORDER])-1] == pkgDataFlags[SO_EXT][uprv_strlen(pkgDataFlags[SO_EXT])-1]) { + reverseExt = true; + } +#endif + /* Using the base libName and version number, generate the library file names. */ + createFileNames(o, mode, version_major, o->version == nullptr ? "" : o->version, o->libName, reverseExt, noVersion); + + if ((o->version!=nullptr || IN_STATIC_MODE(mode)) && o->rebuild == false && o->pdsbuild == false) { + /* Check to see if a previous built data library file exists and check if it is the latest. */ + snprintf(checkLibFile, sizeof(checkLibFile), "%s%s", targetDir, libFileNames[LIB_FILE_VERSION]); + if (T_FileStream_file_exists(checkLibFile)) { + if (isFileModTimeLater(checkLibFile, o->srcDir, true) && isFileModTimeLater(checkLibFile, o->options)) { + if (o->install != nullptr) { + if(o->verbose) { + fprintf(stdout, "# Installing already-built library into %s\n", o->install); + } + result = pkg_installLibrary(o->install, targetDir, noVersion); + } else { + if(o->verbose) { + printf("# Not rebuilding %s - up to date.\n", checkLibFile); + } + } + return result; + } else if (o->verbose && (o->install!=nullptr)) { + fprintf(stdout, "# Not installing up-to-date library %s into %s\n", checkLibFile, o->install); + } + } else if(o->verbose && (o->install!=nullptr)) { + fprintf(stdout, "# Not installing missing %s into %s\n", checkLibFile, o->install); + } + } + + if (pkg_checkFlag(o) == nullptr) { + /* Error occurred. 
*/ + return result; + } +#endif + + if (!o->withoutAssembly && pkgDataFlags[GENCCODE_ASSEMBLY_TYPE][0] != 0) { + const char* genccodeAssembly = pkgDataFlags[GENCCODE_ASSEMBLY_TYPE]; + + if(o->verbose) { + fprintf(stdout, "# Generating assembly code %s of type %s ..\n", gencFilePath, genccodeAssembly); + } + + /* Offset genccodeAssembly by 3 because "-a " */ + if (genccodeAssembly && + (uprv_strlen(genccodeAssembly)>3) && + checkAssemblyHeaderName(genccodeAssembly+3)) { + writeAssemblyCode( + datFileNamePath, + o->tmpDir, + o->entryName, + nullptr, + gencFilePath, + sizeof(gencFilePath)); + + result = pkg_createWithAssemblyCode(targetDir, mode, gencFilePath); + if (result != 0) { + fprintf(stderr, "Error generating assembly code for data.\n"); + return result; + } else if (IN_STATIC_MODE(mode)) { + if(o->install != nullptr) { + if(o->verbose) { + fprintf(stdout, "# Installing static library into %s\n", o->install); + } + result = pkg_installLibrary(o->install, targetDir, noVersion); + } + return result; + } + } else { + fprintf(stderr,"Assembly type \"%s\" is unknown.\n", genccodeAssembly); + return -1; + } + } else { + if(o->verbose) { + fprintf(stdout, "# Writing object code to %s ..\n", gencFilePath); + } + if (o->withoutAssembly) { +#ifdef BUILD_DATA_WITHOUT_ASSEMBLY + result = pkg_createWithoutAssemblyCode(o, targetDir, mode); +#else + /* This error should not occur. */ + fprintf(stderr, "Error- BUILD_DATA_WITHOUT_ASSEMBLY is not defined. Internal error.\n"); +#endif + } else { +#ifdef CAN_WRITE_OBJ_CODE + /* Try to detect the arch type, use nullptr if unsuccessful */ + char optMatchArch[10] = { 0 }; + pkg_createOptMatchArch(optMatchArch); + writeObjectCode( + datFileNamePath, + o->tmpDir, + o->entryName, + (optMatchArch[0] == 0 ? 
nullptr : optMatchArch), + nullptr, + gencFilePath, + sizeof(gencFilePath), + true); + pkg_destroyOptMatchArch(optMatchArch); +#if U_PLATFORM_IS_LINUX_BASED + result = pkg_generateLibraryFile(targetDir, mode, gencFilePath); +#elif defined(WINDOWS_WITH_MSVC) + result = pkg_createWindowsDLL(mode, gencFilePath, o); +#endif +#elif defined(BUILD_DATA_WITHOUT_ASSEMBLY) + result = pkg_createWithoutAssemblyCode(o, targetDir, mode); +#else + fprintf(stderr, "Error- neither CAN_WRITE_OBJ_CODE nor BUILD_DATA_WITHOUT_ASSEMBLY are defined. Internal error.\n"); + return 1; +#endif + } + + if (result != 0) { + fprintf(stderr, "Error generating package data.\n"); + return result; + } + } +#if !U_PLATFORM_USES_ONLY_WIN32_API + if(!IN_STATIC_MODE(mode)) { + /* Certain platforms uses archive library. (e.g. AIX) */ + if(o->verbose) { + fprintf(stdout, "# Creating data archive library file ..\n"); + } + result = pkg_archiveLibrary(targetDir, o->version, reverseExt); + if (result != 0) { + fprintf(stderr, "Error creating data archive library file.\n"); + return result; + } +#if U_PLATFORM != U_PF_OS400 + if (!noVersion) { + /* Create symbolic links for the final library file. */ +#if U_PLATFORM == U_PF_OS390 + result = pkg_createSymLinks(targetDir, o->pdsbuild); +#else + result = pkg_createSymLinks(targetDir, noVersion); +#endif + if (result != 0) { + fprintf(stderr, "Error creating symbolic links of the data library file.\n"); + return result; + } + } +#endif + } /* !IN_STATIC_MODE */ +#endif + +#if !U_PLATFORM_USES_ONLY_WIN32_API + /* Install the libraries if option was set. */ + if (o->install != nullptr) { + if(o->verbose) { + fprintf(stdout, "# Installing library file to %s ..\n", o->install); + } + result = pkg_installLibrary(o->install, targetDir, noVersion); + if (result != 0) { + fprintf(stderr, "Error installing the data library.\n"); + return result; + } + } +#endif + } + } + return result; +} + +/* Initialize the pkgDataFlags with the option file given. 
*/ +static int32_t initializePkgDataFlags(UPKGOptions *o) { + UErrorCode status = U_ZERO_ERROR; + int32_t result = 0; + int32_t currentBufferSize = SMALL_BUFFER_MAX_SIZE; + int32_t tmpResult = 0; + + /* Initialize pkgdataFlags */ + pkgDataFlags = (char**)uprv_malloc(sizeof(char*) * PKGDATA_FLAGS_SIZE); + + /* If we run out of space, allocate more */ +#if !defined(WINDOWS_WITH_MSVC) || defined(USING_CYGWIN) + do { +#endif + if (pkgDataFlags != nullptr) { + for (int32_t i = 0; i < PKGDATA_FLAGS_SIZE; i++) { + pkgDataFlags[i] = (char*)uprv_malloc(sizeof(char) * currentBufferSize); + if (pkgDataFlags[i] != nullptr) { + pkgDataFlags[i][0] = 0; + } else { + fprintf(stderr,"Error allocating memory for pkgDataFlags.\n"); + /* If an error occurs, ensure that the rest of the array is nullptr */ + for (int32_t n = i + 1; n < PKGDATA_FLAGS_SIZE; n++) { + pkgDataFlags[n] = nullptr; + } + return -1; + } + } + } else { + fprintf(stderr,"Error allocating memory for pkgDataFlags.\n"); + return -1; + } + + if (o->options == nullptr) { + return result; + } + +#if !defined(WINDOWS_WITH_MSVC) || defined(USING_CYGWIN) + /* Read in options file. */ + if(o->verbose) { + fprintf(stdout, "# Reading options file %s\n", o->options); + } + status = U_ZERO_ERROR; + tmpResult = parseFlagsFile(o->options, pkgDataFlags, currentBufferSize, FLAG_NAMES, (int32_t)PKGDATA_FLAGS_SIZE, &status); + if (status == U_BUFFER_OVERFLOW_ERROR) { + for (int32_t i = 0; i < PKGDATA_FLAGS_SIZE; i++) { + if (pkgDataFlags[i]) { + uprv_free(pkgDataFlags[i]); + pkgDataFlags[i] = nullptr; + } + } + currentBufferSize = tmpResult; + } else if (U_FAILURE(status)) { + fprintf(stderr,"Unable to open or read \"%s\" option file. 
status = %s\n", o->options, u_errorName(status)); + return -1; + } +#endif + if(o->verbose) { + fprintf(stdout, "# pkgDataFlags=\n"); + for(int32_t i=0;i<PKGDATA_FLAGS_SIZE;i++) { + fprintf(stdout, " [%d] %s: %s\n", i, FLAG_NAMES[i], pkgDataFlags[i]); + } + fprintf(stdout, "\n"); + } +#if !defined(WINDOWS_WITH_MSVC) || defined(USING_CYGWIN) + } while (status == U_BUFFER_OVERFLOW_ERROR); +#endif + + return result; +} + + +/* + * Given the base libName and version numbers, generate the library file names and store it in libFileNames. + * Depending on the configuration, the library name may either end with version number or shared object suffix. + */ +static void createFileNames(UPKGOptions *o, const char mode, const char *version_major, const char *version, const char *libName, UBool reverseExt, UBool noVersion) { + const char* FILE_EXTENSION_SEP = uprv_strlen(pkgDataFlags[SO_EXT]) == 0 ? "" : "."; + const char* FILE_SUFFIX = pkgDataFlags[LIB_EXT_ORDER][0] == '.' ? "." : ""; + +#if U_PLATFORM == U_PF_MINGW + /* MinGW does not need the library prefix when building in dll mode. */ + if (IN_DLL_MODE(mode)) { + snprintf(libFileNames[LIB_FILE], sizeof(libFileNames[LIB_FILE]), "%s", libName); + } else { + snprintf(libFileNames[LIB_FILE], sizeof(libFileNames[LIB_FILE]), "%s%s%s", + (strstr(libName, "icudt") ? 
"lib" : ""), + pkgDataFlags[LIBPREFIX], + libName); + } +#else + snprintf(libFileNames[LIB_FILE], sizeof(libFileNames[LIB_FILE]), "%s%s", + pkgDataFlags[LIBPREFIX], + libName); +#endif + + if(o->verbose) { + fprintf(stdout, "# libFileName[LIB_FILE] = %s\n", libFileNames[LIB_FILE]); + } + +#if U_PLATFORM == U_PF_MINGW + // Name the import library lib*.dll.a + snprintf(libFileNames[LIB_FILE_MINGW], sizeof(libFileNames[LIB_FILE_MINGW]), "lib%s.dll.a", libName); +#elif U_PLATFORM == U_PF_CYGWIN + snprintf(libFileNames[LIB_FILE_CYGWIN], sizeof(libFileNames[LIB_FILE_CYGWIN]), "cyg%s%s%s", + libName, + FILE_EXTENSION_SEP, + pkgDataFlags[SO_EXT]); + snprintf(libFileNames[LIB_FILE_CYGWIN_VERSION], sizeof(libFileNames[LIB_FILE_CYGWIN_VERSION]), "cyg%s%s%s%s", + libName, + version_major, + FILE_EXTENSION_SEP, + pkgDataFlags[SO_EXT]); + + uprv_strcat(pkgDataFlags[SO_EXT], "."); + uprv_strcat(pkgDataFlags[SO_EXT], pkgDataFlags[A_EXT]); +#elif U_PLATFORM == U_PF_OS400 || defined(_AIX) + snprintf(libFileNames[LIB_FILE_VERSION_TMP], sizeof(libFileNames[LIB_FILE_VERSION_TMP]), "%s%s%s", + libFileNames[LIB_FILE], + FILE_EXTENSION_SEP, + pkgDataFlags[SOBJ_EXT]); +#elif U_PLATFORM == U_PF_OS390 + snprintf(libFileNames[LIB_FILE_VERSION_TMP], sizeof(libFileNames[LIB_FILE_VERSION_TMP]), "%s%s%s%s%s", + libFileNames[LIB_FILE], + pkgDataFlags[LIB_EXT_ORDER][0] == '.' ? "." : "", + reverseExt ? version : pkgDataFlags[SOBJ_EXT], + FILE_EXTENSION_SEP, + reverseExt ? 
pkgDataFlags[SOBJ_EXT] : version); + + snprintf(libFileNames[LIB_FILE_OS390BATCH_VERSION], sizeof(libFileNames[LIB_FILE_OS390BATCH_VERSION]), "%s%s.x", + libFileNames[LIB_FILE], + version); + snprintf(libFileNames[LIB_FILE_OS390BATCH_MAJOR], sizeof(libFileNames[LIB_FILE_OS390BATCH_MAJOR]), "%s%s.x", + libFileNames[LIB_FILE], + version_major); +#else + if (noVersion && !reverseExt) { + snprintf(libFileNames[LIB_FILE_VERSION_TMP], sizeof(libFileNames[LIB_FILE_VERSION_TMP]), "%s%s%s", + libFileNames[LIB_FILE], + FILE_SUFFIX, + pkgDataFlags[SOBJ_EXT]); + } else { + snprintf(libFileNames[LIB_FILE_VERSION_TMP], sizeof(libFileNames[LIB_FILE_VERSION_TMP]), "%s%s%s%s%s", + libFileNames[LIB_FILE], + FILE_SUFFIX, + reverseExt ? version : pkgDataFlags[SOBJ_EXT], + FILE_EXTENSION_SEP, + reverseExt ? pkgDataFlags[SOBJ_EXT] : version); + } +#endif + if (noVersion && !reverseExt) { + snprintf(libFileNames[LIB_FILE_VERSION_MAJOR], sizeof(libFileNames[LIB_FILE_VERSION_TMP]), "%s%s%s", + libFileNames[LIB_FILE], + FILE_SUFFIX, + pkgDataFlags[SO_EXT]); + + snprintf(libFileNames[LIB_FILE_VERSION], sizeof(libFileNames[LIB_FILE_VERSION]), "%s%s%s", + libFileNames[LIB_FILE], + FILE_SUFFIX, + pkgDataFlags[SO_EXT]); + } else { + snprintf(libFileNames[LIB_FILE_VERSION_MAJOR], sizeof(libFileNames[LIB_FILE_VERSION_MAJOR]), "%s%s%s%s%s", + libFileNames[LIB_FILE], + FILE_SUFFIX, + reverseExt ? version_major : pkgDataFlags[SO_EXT], + FILE_EXTENSION_SEP, + reverseExt ? pkgDataFlags[SO_EXT] : version_major); + + snprintf(libFileNames[LIB_FILE_VERSION], sizeof(libFileNames[LIB_FILE_VERSION]), "%s%s%s%s%s", + libFileNames[LIB_FILE], + FILE_SUFFIX, + reverseExt ? version : pkgDataFlags[SO_EXT], + FILE_EXTENSION_SEP, + reverseExt ? 
pkgDataFlags[SO_EXT] : version); + } + + if(o->verbose) { + fprintf(stdout, "# libFileName[LIB_FILE_VERSION] = %s\n", libFileNames[LIB_FILE_VERSION]); + } + +#if U_PF_MINGW <= U_PLATFORM && U_PLATFORM <= U_PF_CYGWIN + /* Cygwin and MinGW only deals with the version major number. */ + uprv_strcpy(libFileNames[LIB_FILE_VERSION_TMP], libFileNames[LIB_FILE_VERSION_MAJOR]); +#endif + + if(IN_STATIC_MODE(mode)) { + snprintf(libFileNames[LIB_FILE_VERSION], sizeof(libFileNames[LIB_FILE_VERSION]), "%s.%s", libFileNames[LIB_FILE], pkgDataFlags[A_EXT]); + libFileNames[LIB_FILE_VERSION_MAJOR][0]=0; + if(o->verbose) { + fprintf(stdout, "# libFileName[LIB_FILE_VERSION] = %s (static)\n", libFileNames[LIB_FILE_VERSION]); + } + } +} + +/* Create the symbolic links for the final library file. */ +static int32_t pkg_createSymLinks(const char *targetDir, UBool specialHandling) { + int32_t result = 0; + char cmd[LARGE_BUFFER_MAX_SIZE]; + char name1[SMALL_BUFFER_MAX_SIZE]; /* symlink file name */ + char name2[SMALL_BUFFER_MAX_SIZE]; /* file name to symlink */ + const char* FILE_EXTENSION_SEP = uprv_strlen(pkgDataFlags[SO_EXT]) == 0 ? "" : "."; + +#if U_PLATFORM != U_PF_CYGWIN + /* No symbolic link to make. */ + if (uprv_strlen(libFileNames[LIB_FILE_VERSION]) == 0 || uprv_strlen(libFileNames[LIB_FILE_VERSION_MAJOR]) == 0 || + uprv_strcmp(libFileNames[LIB_FILE_VERSION], libFileNames[LIB_FILE_VERSION_MAJOR]) == 0) { + return result; + } + + snprintf(cmd, sizeof(cmd), "cd %s && %s %s && %s %s %s", + targetDir, + RM_CMD, + libFileNames[LIB_FILE_VERSION_MAJOR], + LN_CMD, + libFileNames[LIB_FILE_VERSION], + libFileNames[LIB_FILE_VERSION_MAJOR]); + result = runCommand(cmd); + if (result != 0) { + fprintf(stderr, "Error creating symbolic links. 
Failed command: %s\n", cmd); + return result; + } +#endif + + if (specialHandling) { +#if U_PLATFORM == U_PF_CYGWIN + snprintf(name1, sizeof(name1), "%s", libFileNames[LIB_FILE_CYGWIN]); + snprintf(name2, sizeof(name2), "%s", libFileNames[LIB_FILE_CYGWIN_VERSION]); +#elif U_PLATFORM == U_PF_OS390 + /* Create the symbolic links for the import data */ + /* Use the cmd buffer to store path to import data file to check its existence */ + snprintf(cmd, sizeof(cmd), "%s/%s", targetDir, libFileNames[LIB_FILE_OS390BATCH_VERSION]); + if (T_FileStream_file_exists(cmd)) { + snprintf(cmd, sizeof(cmd), "cd %s && %s %s && %s %s %s", + targetDir, + RM_CMD, + libFileNames[LIB_FILE_OS390BATCH_MAJOR], + LN_CMD, + libFileNames[LIB_FILE_OS390BATCH_VERSION], + libFileNames[LIB_FILE_OS390BATCH_MAJOR]); + result = runCommand(cmd); + if (result != 0) { + fprintf(stderr, "Error creating symbolic links. Failed command: %s\n", cmd); + return result; + } + + snprintf(cmd, sizeof(cmd), "cd %s && %s %s.x && %s %s %s.x", + targetDir, + RM_CMD, + libFileNames[LIB_FILE], + LN_CMD, + libFileNames[LIB_FILE_OS390BATCH_VERSION], + libFileNames[LIB_FILE]); + result = runCommand(cmd); + if (result != 0) { + fprintf(stderr, "Error creating symbolic links. 
Failed command: %s\n", cmd); + return result; + } + } + + /* Needs to be set here because special handling skips it */ + snprintf(name1, sizeof(name1), "%s%s%s", libFileNames[LIB_FILE], FILE_EXTENSION_SEP, pkgDataFlags[SO_EXT]); + snprintf(name2, sizeof(name2), "%s", libFileNames[LIB_FILE_VERSION]); +#else + goto normal_symlink_mode; +#endif + } else { +#if U_PLATFORM != U_PF_CYGWIN +normal_symlink_mode: +#endif + snprintf(name1, sizeof(name1), "%s%s%s", libFileNames[LIB_FILE], FILE_EXTENSION_SEP, pkgDataFlags[SO_EXT]); + snprintf(name2, sizeof(name2), "%s", libFileNames[LIB_FILE_VERSION]); + } + + snprintf(cmd, sizeof(cmd), "cd %s && %s %s && %s %s %s", + targetDir, + RM_CMD, + name1, + LN_CMD, + name2, + name1); + + result = runCommand(cmd); + + return result; +} + +static int32_t pkg_installLibrary(const char *installDir, const char *targetDir, UBool noVersion) { + int32_t result = 0; + char cmd[SMALL_BUFFER_MAX_SIZE]; + + auto ret = snprintf(cmd, + sizeof(cmd), + "cd %s && %s %s %s%s%s", + targetDir, + pkgDataFlags[INSTALL_CMD], + libFileNames[LIB_FILE_VERSION], + installDir, PKGDATA_FILE_SEP_STRING, libFileNames[LIB_FILE_VERSION]); + (void)ret; + U_ASSERT(0 <= ret && ret < SMALL_BUFFER_MAX_SIZE); + + result = runCommand(cmd); + + if (result != 0) { + fprintf(stderr, "Error installing library. Failed command: %s\n", cmd); + return result; + } + +#ifdef CYGWINMSVC + snprintf(cmd, sizeof(cmd), "cd %s && %s %s.lib %s", + targetDir, + pkgDataFlags[INSTALL_CMD], + libFileNames[LIB_FILE], + installDir + ); + result = runCommand(cmd); + + if (result != 0) { + fprintf(stderr, "Error installing library. Failed command: %s\n", cmd); + return result; + } +#elif U_PLATFORM == U_PF_CYGWIN + snprintf(cmd, sizeof(cmd), "cd %s && %s %s %s", + targetDir, + pkgDataFlags[INSTALL_CMD], + libFileNames[LIB_FILE_CYGWIN_VERSION], + installDir + ); + result = runCommand(cmd); + + if (result != 0) { + fprintf(stderr, "Error installing library. 
Failed command: %s\n", cmd); + return result; + } + +#elif U_PLATFORM == U_PF_OS390 + if (T_FileStream_file_exists(libFileNames[LIB_FILE_OS390BATCH_VERSION])) { + snprintf(cmd, sizeof(cmd), "%s %s %s", + pkgDataFlags[INSTALL_CMD], + libFileNames[LIB_FILE_OS390BATCH_VERSION], + installDir + ); + result = runCommand(cmd); + + if (result != 0) { + fprintf(stderr, "Error installing library. Failed command: %s\n", cmd); + return result; + } + } +#endif + + if (noVersion) { + return result; + } else { + return pkg_createSymLinks(installDir, true); + } +} + +static int32_t pkg_installCommonMode(const char *installDir, const char *fileName) { + int32_t result = 0; + char cmd[SMALL_BUFFER_MAX_SIZE] = ""; + + if (!T_FileStream_file_exists(installDir)) { + UErrorCode status = U_ZERO_ERROR; + + uprv_mkdir(installDir, &status); + if (U_FAILURE(status)) { + fprintf(stderr, "Error creating installation directory: %s\n", installDir); + return -1; + } + } +#ifndef U_WINDOWS_WITH_MSVC + snprintf(cmd, sizeof(cmd), "%s %s %s", pkgDataFlags[INSTALL_CMD], fileName, installDir); +#else + snprintf(cmd, sizeof(cmd), "%s %s %s %s", WIN_INSTALL_CMD, fileName, installDir, WIN_INSTALL_CMD_FLAGS); +#endif + + result = runCommand(cmd); + if (result != 0) { + fprintf(stderr, "Failed to install data file with command: %s\n", cmd); + } + + return result; +} + +#ifdef U_WINDOWS_MSVC +/* Copy commands for installing the raw data files on Windows. 
*/ +#define WIN_INSTALL_CMD "xcopy" +#define WIN_INSTALL_CMD_FLAGS "/E /Y /K" +#endif +static int32_t pkg_installFileMode(const char *installDir, const char *srcDir, const char *fileListName) { + int32_t result = 0; + char cmd[SMALL_BUFFER_MAX_SIZE] = ""; + + if (!T_FileStream_file_exists(installDir)) { + UErrorCode status = U_ZERO_ERROR; + + uprv_mkdir(installDir, &status); + if (U_FAILURE(status)) { + fprintf(stderr, "Error creating installation directory: %s\n", installDir); + return -1; + } + } +#ifndef U_WINDOWS_WITH_MSVC + char buffer[SMALL_BUFFER_MAX_SIZE] = ""; + int32_t bufferLength = 0; + + FileStream *f = T_FileStream_open(fileListName, "r"); + if (f != nullptr) { + for(;;) { + if (T_FileStream_readLine(f, buffer, SMALL_BUFFER_MAX_SIZE) != nullptr) { + bufferLength = static_cast<int32_t>(uprv_strlen(buffer)); + /* Remove new line character. */ + if (bufferLength > 0) { + buffer[bufferLength-1] = 0; + } + + auto ret = snprintf(cmd, + sizeof(cmd), + "%s %s%s%s %s%s%s", + pkgDataFlags[INSTALL_CMD], + srcDir, PKGDATA_FILE_SEP_STRING, buffer, + installDir, PKGDATA_FILE_SEP_STRING, buffer); + (void)ret; + U_ASSERT(0 <= ret && ret < SMALL_BUFFER_MAX_SIZE); + + result = runCommand(cmd); + if (result != 0) { + fprintf(stderr, "Failed to install data file with command: %s\n", cmd); + break; + } + } else { + if (!T_FileStream_eof(f)) { + fprintf(stderr, "Failed to read line from file: %s\n", fileListName); + result = -1; + } + break; + } + } + T_FileStream_close(f); + } else { + result = -1; + fprintf(stderr, "Unable to open list file: %s\n", fileListName); + } +#else + snprintf(cmd, sizeof(cmd), "%s %s %s %s", WIN_INSTALL_CMD, srcDir, installDir, WIN_INSTALL_CMD_FLAGS); + result = runCommand(cmd); + if (result != 0) { + fprintf(stderr, "Failed to install data file with command: %s\n", cmd); + } +#endif + + return result; +} + +/* Archiving of the library file may be needed depending on the platform and options given. 
+ * If archiving is not needed, copy over the library file name. + */ +static int32_t pkg_archiveLibrary(const char *targetDir, const char *version, UBool reverseExt) { + int32_t result = 0; + char cmd[LARGE_BUFFER_MAX_SIZE]; + + /* If the shared object suffix and the final object suffix is different and the final object suffix and the + * archive file suffix is the same, then the final library needs to be archived. + */ + if (uprv_strcmp(pkgDataFlags[SOBJ_EXT], pkgDataFlags[SO_EXT]) != 0 && uprv_strcmp(pkgDataFlags[A_EXT], pkgDataFlags[SO_EXT]) == 0) { + snprintf(libFileNames[LIB_FILE_VERSION], sizeof(libFileNames[LIB_FILE_VERSION]), "%s%s%s.%s", + libFileNames[LIB_FILE], + pkgDataFlags[LIB_EXT_ORDER][0] == '.' ? "." : "", + reverseExt ? version : pkgDataFlags[SO_EXT], + reverseExt ? pkgDataFlags[SO_EXT] : version); + + snprintf(cmd, sizeof(cmd), "%s %s %s%s %s%s", + pkgDataFlags[AR], + pkgDataFlags[ARFLAGS], + targetDir, + libFileNames[LIB_FILE_VERSION], + targetDir, + libFileNames[LIB_FILE_VERSION_TMP]); + + result = runCommand(cmd); + if (result != 0) { + fprintf(stderr, "Error creating archive library. Failed command: %s\n", cmd); + return result; + } + + snprintf(cmd, sizeof(cmd), "%s %s%s", + pkgDataFlags[RANLIB], + targetDir, + libFileNames[LIB_FILE_VERSION]); + + result = runCommand(cmd); + if (result != 0) { + fprintf(stderr, "Error creating archive library. Failed command: %s\n", cmd); + return result; + } + + /* Remove unneeded library file. */ + snprintf(cmd, sizeof(cmd), "%s %s%s", + RM_CMD, + targetDir, + libFileNames[LIB_FILE_VERSION_TMP]); + + result = runCommand(cmd); + if (result != 0) { + fprintf(stderr, "Error creating archive library. Failed command: %s\n", cmd); + return result; + } + + } else { + uprv_strcpy(libFileNames[LIB_FILE_VERSION], libFileNames[LIB_FILE_VERSION_TMP]); + } + + return result; +} + +/* + * Using the compiler information from the configuration file set by -O option, generate the library file. 
+ * command may be given to allow for a larger buffer for cmd. + */ +static int32_t pkg_generateLibraryFile(const char *targetDir, const char mode, const char *objectFile, char *command, UBool specialHandling) { + int32_t result = 0; + char *cmd = nullptr; + UBool freeCmd = false; + int32_t length = 0; + + (void)specialHandling; // Suppress unused variable compiler warnings on platforms where all usage + // of this parameter is #ifdefed out. + + /* This is necessary because if packaging is done without assembly code, objectFile might be extremely large + * containing many object files and so the calling function should supply a command buffer that is large + * enough to handle this. Otherwise, use the default size. + */ + if (command != nullptr) { + cmd = command; + } + + if (IN_STATIC_MODE(mode)) { + if (cmd == nullptr) { + length = static_cast<int32_t>(uprv_strlen(pkgDataFlags[AR]) + uprv_strlen(pkgDataFlags[ARFLAGS]) + uprv_strlen(targetDir) + + uprv_strlen(libFileNames[LIB_FILE_VERSION]) + uprv_strlen(objectFile) + uprv_strlen(pkgDataFlags[RANLIB]) + BUFFER_PADDING_SIZE); + if ((cmd = (char *)uprv_malloc(sizeof(char) * length)) == nullptr) { + fprintf(stderr, "Unable to allocate memory for command.\n"); + return -1; + } + freeCmd = true; + } + sprintf(cmd, "%s %s %s%s %s", + pkgDataFlags[AR], + pkgDataFlags[ARFLAGS], + targetDir, + libFileNames[LIB_FILE_VERSION], + objectFile); + + result = runCommand(cmd); + if (result == 0) { + sprintf(cmd, "%s %s%s", + pkgDataFlags[RANLIB], + targetDir, + libFileNames[LIB_FILE_VERSION]); + + result = runCommand(cmd); + } + } else /* if (IN_DLL_MODE(mode)) */ { + if (cmd == nullptr) { + length = static_cast<int32_t>(uprv_strlen(pkgDataFlags[GENLIB]) + uprv_strlen(pkgDataFlags[LDICUDTFLAGS]) + + ((uprv_strlen(targetDir) + uprv_strlen(libFileNames[LIB_FILE_VERSION_TMP])) * 2) + + uprv_strlen(objectFile) + uprv_strlen(pkgDataFlags[LD_SONAME]) + + uprv_strlen(pkgDataFlags[LD_SONAME][0] == 0 ? 
"" : libFileNames[LIB_FILE_VERSION_MAJOR]) + + uprv_strlen(pkgDataFlags[RPATH_FLAGS]) + uprv_strlen(pkgDataFlags[BIR_FLAGS]) + BUFFER_PADDING_SIZE); +#if U_PLATFORM == U_PF_CYGWIN + length += static_cast<int32_t>(uprv_strlen(targetDir) + uprv_strlen(libFileNames[LIB_FILE_CYGWIN_VERSION])); +#elif U_PLATFORM == U_PF_MINGW + length += static_cast<int32_t>(uprv_strlen(targetDir) + uprv_strlen(libFileNames[LIB_FILE_MINGW])); +#endif + if ((cmd = (char *)uprv_malloc(sizeof(char) * length)) == nullptr) { + fprintf(stderr, "Unable to allocate memory for command.\n"); + return -1; + } + freeCmd = true; + } +#if U_PLATFORM == U_PF_MINGW + sprintf(cmd, "%s%s%s %s -o %s%s %s %s%s %s %s", + pkgDataFlags[GENLIB], + targetDir, + libFileNames[LIB_FILE_MINGW], + pkgDataFlags[LDICUDTFLAGS], + targetDir, + libFileNames[LIB_FILE_VERSION_TMP], +#elif U_PLATFORM == U_PF_CYGWIN + sprintf(cmd, "%s%s%s %s -o %s%s %s %s%s %s %s", + pkgDataFlags[GENLIB], + targetDir, + libFileNames[LIB_FILE_VERSION_TMP], + pkgDataFlags[LDICUDTFLAGS], + targetDir, + libFileNames[LIB_FILE_CYGWIN_VERSION], +#elif U_PLATFORM == U_PF_AIX + sprintf(cmd, "%s %s%s;%s %s -o %s%s %s %s%s %s %s", + RM_CMD, + targetDir, + libFileNames[LIB_FILE_VERSION_TMP], + pkgDataFlags[GENLIB], + pkgDataFlags[LDICUDTFLAGS], + targetDir, + libFileNames[LIB_FILE_VERSION_TMP], +#else + sprintf(cmd, "%s %s -o %s%s %s %s%s %s %s", + pkgDataFlags[GENLIB], + pkgDataFlags[LDICUDTFLAGS], + targetDir, + libFileNames[LIB_FILE_VERSION_TMP], +#endif + objectFile, + pkgDataFlags[LD_SONAME], + pkgDataFlags[LD_SONAME][0] == 0 ? "" : libFileNames[LIB_FILE_VERSION_MAJOR], + pkgDataFlags[RPATH_FLAGS], + pkgDataFlags[BIR_FLAGS]); + + /* Generate the library file. 
*/ + result = runCommand(cmd); + +#if U_PLATFORM == U_PF_OS390 + char *env_tmp; + char PDS_LibName[512]; + char PDS_Name[512]; + + PDS_Name[0] = 0; + PDS_LibName[0] = 0; + if (specialHandling && uprv_strcmp(libFileNames[LIB_FILE],"libicudata") == 0) { + if (env_tmp = getenv("ICU_PDS_NAME")) { + sprintf(PDS_Name, "%s%s", + env_tmp, + "DA"); + strcat(PDS_Name, getenv("ICU_PDS_NAME_SUFFIX")); + } else if (env_tmp = getenv("PDS_NAME_PREFIX")) { + sprintf(PDS_Name, "%s%s", + env_tmp, + U_ICU_VERSION_SHORT "DA"); + } else { + sprintf(PDS_Name, "%s%s", + "IXMI", + U_ICU_VERSION_SHORT "DA"); + } + } else if (!specialHandling && uprv_strcmp(libFileNames[LIB_FILE],"libicudata_stub") == 0) { + if (env_tmp = getenv("ICU_PDS_NAME")) { + sprintf(PDS_Name, "%s%s", + env_tmp, + "D1"); + strcat(PDS_Name, getenv("ICU_PDS_NAME_SUFFIX")); + } else if (env_tmp = getenv("PDS_NAME_PREFIX")) { + sprintf(PDS_Name, "%s%s", + env_tmp, + U_ICU_VERSION_SHORT "D1"); + } else { + sprintf(PDS_Name, "%s%s", + "IXMI", + U_ICU_VERSION_SHORT "D1"); + } + } + + if (PDS_Name[0]) { + sprintf(PDS_LibName,"%s%s%s%s%s", + "\"//'", + getenv("LOADMOD"), + "(", + PDS_Name, + ")'\""); + sprintf(cmd, "%s %s -o %s %s %s%s %s %s", + pkgDataFlags[GENLIB], + pkgDataFlags[LDICUDTFLAGS], + PDS_LibName, + objectFile, + pkgDataFlags[LD_SONAME], + pkgDataFlags[LD_SONAME][0] == 0 ? "" : libFileNames[LIB_FILE_VERSION_MAJOR], + pkgDataFlags[RPATH_FLAGS], + pkgDataFlags[BIR_FLAGS]); + + result = runCommand(cmd); + } +#endif + } + + if (result != 0) { + fprintf(stderr, "Error generating library file. Failed command: %s\n", cmd); + } + + if (freeCmd) { + uprv_free(cmd); + } + + return result; +} + +static int32_t pkg_createWithAssemblyCode(const char *targetDir, const char mode, const char *gencFilePath) { + char tempObjectFile[SMALL_BUFFER_MAX_SIZE] = ""; + int32_t result = 0; + int32_t length = 0; + + /* Remove the ending .s and replace it with .o for the new object file. 
*/ + uprv_strcpy(tempObjectFile, gencFilePath); + tempObjectFile[uprv_strlen(tempObjectFile)-1] = 'o'; + + length = static_cast<int32_t>(uprv_strlen(pkgDataFlags[COMPILER]) + uprv_strlen(pkgDataFlags[LIBFLAGS]) + + uprv_strlen(tempObjectFile) + uprv_strlen(gencFilePath) + BUFFER_PADDING_SIZE); + + LocalMemory<char> cmd((char *)uprv_malloc(sizeof(char) * length)); + if (cmd.isNull()) { + return -1; + } + + /* Generate the object file. */ + snprintf(cmd.getAlias(), length, "%s %s -o %s %s", + pkgDataFlags[COMPILER], + pkgDataFlags[LIBFLAGS], + tempObjectFile, + gencFilePath); + + result = runCommand(cmd.getAlias()); + + if (result != 0) { + fprintf(stderr, "Error creating with assembly code. Failed command: %s\n", cmd.getAlias()); + return result; + } + + return pkg_generateLibraryFile(targetDir, mode, tempObjectFile); +} + +#ifdef BUILD_DATA_WITHOUT_ASSEMBLY +/* + * Generation of the data library without assembly code needs to compile each data file + * individually and then link it all together. + * Note: Any update to the directory structure of the data needs to be reflected here. 
+ */ +enum { + DATA_PREFIX_BRKITR, + DATA_PREFIX_COLL, + DATA_PREFIX_CURR, + DATA_PREFIX_LANG, + DATA_PREFIX_RBNF, + DATA_PREFIX_REGION, + DATA_PREFIX_TRANSLIT, + DATA_PREFIX_ZONE, + DATA_PREFIX_UNIT, + DATA_PREFIX_LENGTH +}; + +const static char DATA_PREFIX[DATA_PREFIX_LENGTH][10] = { + "brkitr", + "coll", + "curr", + "lang", + "rbnf", + "region", + "translit", + "zone", + "unit" +}; + +static int32_t pkg_createWithoutAssemblyCode(UPKGOptions *o, const char *targetDir, const char mode) { + int32_t result = 0; + CharList *list = o->filePaths; + CharList *listNames = o->files; + int32_t listSize = pkg_countCharList(list); + char *buffer; + char *cmd; + char gencmnFile[SMALL_BUFFER_MAX_SIZE] = ""; + char tempObjectFile[SMALL_BUFFER_MAX_SIZE] = ""; +#ifdef USE_SINGLE_CCODE_FILE + char icudtAll[SMALL_BUFFER_MAX_SIZE] = ""; + FileStream *icudtAllFile = nullptr; + + snprintf(icudtAll, sizeof(icudtAll), "%s%s%sall.c", + o->tmpDir, + PKGDATA_FILE_SEP_STRING, + libFileNames[LIB_FILE]); + /* Remove previous icudtall.c file. */ + if (T_FileStream_file_exists(icudtAll) && (result = remove(icudtAll)) != 0) { + fprintf(stderr, "Unable to remove old icudtall file: %s\n", icudtAll); + return result; + } + + if((icudtAllFile = T_FileStream_open(icudtAll, "w"))==nullptr) { + fprintf(stderr, "Unable to write to icudtall file: %s\n", icudtAll); + return result; + } +#endif + + if (list == nullptr || listNames == nullptr) { + /* list and listNames should never be nullptr since we are looping through the CharList with + * the given size. 
+ */ + return -1; + } + + if ((cmd = (char *)uprv_malloc((listSize + 2) * SMALL_BUFFER_MAX_SIZE)) == nullptr) { + fprintf(stderr, "Unable to allocate memory for cmd.\n"); + return -1; + } else if ((buffer = (char *)uprv_malloc((listSize + 1) * SMALL_BUFFER_MAX_SIZE)) == nullptr) { + fprintf(stderr, "Unable to allocate memory for buffer.\n"); + uprv_free(cmd); + return -1; + } + + for (int32_t i = 0; i < (listSize + 1); i++) { + const char *file ; + const char *name; + + if (i == 0) { + /* The first iteration calls the gencmn function and initializes the buffer. */ + createCommonDataFile(o->tmpDir, o->shortName, o->entryName, nullptr, o->srcDir, o->comment, o->fileListFiles->str, 0, true, o->verbose, gencmnFile); + buffer[0] = 0; +#ifdef USE_SINGLE_CCODE_FILE + uprv_strcpy(tempObjectFile, gencmnFile); + tempObjectFile[uprv_strlen(tempObjectFile) - 1] = 'o'; + + sprintf(cmd, "%s %s -o %s %s", + pkgDataFlags[COMPILER], + pkgDataFlags[LIBFLAGS], + tempObjectFile, + gencmnFile); + + result = runCommand(cmd); + if (result != 0) { + break; + } + + sprintf(buffer, "%s",tempObjectFile); +#endif + } else { + char newName[SMALL_BUFFER_MAX_SIZE]; + char dataName[SMALL_BUFFER_MAX_SIZE]; + char dataDirName[SMALL_BUFFER_MAX_SIZE]; + const char *pSubstring; + file = list->str; + name = listNames->str; + + newName[0] = dataName[0] = 0; + for (int32_t n = 0; n < DATA_PREFIX_LENGTH; n++) { + dataDirName[0] = 0; + sprintf(dataDirName, "%s%s", DATA_PREFIX[n], PKGDATA_FILE_SEP_STRING); + /* If the name contains a prefix (indicating directory), alter the new name accordingly. 
*/ + pSubstring = uprv_strstr(name, dataDirName); + if (pSubstring != nullptr) { + char newNameTmp[SMALL_BUFFER_MAX_SIZE] = ""; + const char *p = name + uprv_strlen(dataDirName); + for (int32_t i = 0;;i++) { + if (p[i] == '.') { + newNameTmp[i] = '_'; + continue; + } + newNameTmp[i] = p[i]; + if (p[i] == 0) { + break; + } + } + auto ret = snprintf(newName, + sizeof(newName), + "%s_%s", + DATA_PREFIX[n], + newNameTmp); + (void)ret; + U_ASSERT(0 <= ret && ret < SMALL_BUFFER_MAX_SIZE); + ret = snprintf(dataName, + sizeof(dataName), + "%s_%s", + o->shortName, + DATA_PREFIX[n]); + (void)ret; + U_ASSERT(0 <= ret && ret < SMALL_BUFFER_MAX_SIZE); + } + if (newName[0] != 0) { + break; + } + } + + if(o->verbose) { + printf("# Generating %s \n", gencmnFile); + } + + writeCCode( + file, + o->tmpDir, + nullptr, + dataName[0] != 0 ? dataName : o->shortName, + newName[0] != 0 ? newName : nullptr, + gencmnFile, + sizeof(gencmnFile)); + +#ifdef USE_SINGLE_CCODE_FILE + sprintf(cmd, "#include \"%s\"\n", gencmnFile); + T_FileStream_writeLine(icudtAllFile, cmd); + /* don't delete the file */ +#endif + } + +#ifndef USE_SINGLE_CCODE_FILE + uprv_strcpy(tempObjectFile, gencmnFile); + tempObjectFile[uprv_strlen(tempObjectFile) - 1] = 'o'; + + sprintf(cmd, "%s %s -o %s %s", + pkgDataFlags[COMPILER], + pkgDataFlags[LIBFLAGS], + tempObjectFile, + gencmnFile); + result = runCommand(cmd); + if (result != 0) { + fprintf(stderr, "Error creating library without assembly code. Failed command: %s\n", cmd); + break; + } + + uprv_strcat(buffer, " "); + uprv_strcat(buffer, tempObjectFile); + +#endif + + if (i > 0) { + list = list->next; + listNames = listNames->next; + } + } + +#ifdef USE_SINGLE_CCODE_FILE + T_FileStream_close(icudtAllFile); + uprv_strcpy(tempObjectFile, icudtAll); + tempObjectFile[uprv_strlen(tempObjectFile) - 1] = 'o'; + + sprintf(cmd, "%s %s -I. 
-o %s %s", + pkgDataFlags[COMPILER], + pkgDataFlags[LIBFLAGS], + tempObjectFile, + icudtAll); + + result = runCommand(cmd); + if (result == 0) { + uprv_strcat(buffer, " "); + uprv_strcat(buffer, tempObjectFile); + } else { + fprintf(stderr, "Error creating library without assembly code. Failed command: %s\n", cmd); + } +#endif + + if (result == 0) { + /* Generate the library file. */ +#if U_PLATFORM == U_PF_OS390 + result = pkg_generateLibraryFile(targetDir, mode, buffer, cmd, (o->pdsbuild && IN_DLL_MODE(mode))); +#else + result = pkg_generateLibraryFile(targetDir,mode, buffer, cmd); +#endif + } + + uprv_free(buffer); + uprv_free(cmd); + + return result; +} +#endif + +#ifdef WINDOWS_WITH_MSVC +#define LINK_CMD "link.exe /nologo /release /out:" +#define LINK_FLAGS "/NXCOMPAT /DYNAMICBASE /DLL /NOENTRY /MANIFEST:NO /implib:" + +#define LINK_EXTRA_UWP_FLAGS "/APPCONTAINER " +#define LINK_EXTRA_UWP_FLAGS_X86_ONLY "/SAFESEH " + +#define LINK_EXTRA_FLAGS_MACHINE "/MACHINE:" +#define LIB_CMD "LIB.exe /nologo /out:" +#define LIB_FILE "icudt.lib" +#define LIB_EXT UDATA_LIB_SUFFIX +#define DLL_EXT UDATA_SO_SUFFIX + +static int32_t pkg_createWindowsDLL(const char mode, const char *gencFilePath, UPKGOptions *o) { + int32_t result = 0; + char cmd[LARGE_BUFFER_MAX_SIZE]; + if (IN_STATIC_MODE(mode)) { + char staticLibFilePath[SMALL_BUFFER_MAX_SIZE] = ""; + +#ifdef CYGWINMSVC + snprintf(staticLibFilePath, sizeof(staticLibFilePath), "%s%s%s%s%s", + o->targetDir, + PKGDATA_FILE_SEP_STRING, + pkgDataFlags[LIBPREFIX], + o->libName, + LIB_EXT); +#else + snprintf(staticLibFilePath, sizeof(staticLibFilePath), "%s%s%s%s%s", + o->targetDir, + PKGDATA_FILE_SEP_STRING, + (strstr(o->libName, "icudt") ? 
"s" : ""), + o->libName, + LIB_EXT); +#endif + + snprintf(cmd, sizeof(cmd), "%s\"%s\" \"%s\"", + LIB_CMD, + staticLibFilePath, + gencFilePath); + } else if (IN_DLL_MODE(mode)) { + char dllFilePath[SMALL_BUFFER_MAX_SIZE] = ""; + char libFilePath[SMALL_BUFFER_MAX_SIZE] = ""; + char resFilePath[SMALL_BUFFER_MAX_SIZE] = ""; + char tmpResFilePath[SMALL_BUFFER_MAX_SIZE] = ""; + +#ifdef CYGWINMSVC + uprv_strcpy(dllFilePath, o->targetDir); +#else + uprv_strcpy(dllFilePath, o->srcDir); +#endif + uprv_strcat(dllFilePath, PKGDATA_FILE_SEP_STRING); + uprv_strcpy(libFilePath, dllFilePath); + +#ifdef CYGWINMSVC + uprv_strcat(libFilePath, o->libName); + uprv_strcat(libFilePath, ".lib"); + + uprv_strcat(dllFilePath, o->libName); + uprv_strcat(dllFilePath, o->version); +#else + if (strstr(o->libName, "icudt")) { + uprv_strcat(libFilePath, LIB_FILE); + } else { + uprv_strcat(libFilePath, o->libName); + uprv_strcat(libFilePath, ".lib"); + } + uprv_strcat(dllFilePath, o->entryName); +#endif + uprv_strcat(dllFilePath, DLL_EXT); + + uprv_strcpy(tmpResFilePath, o->tmpDir); + uprv_strcat(tmpResFilePath, PKGDATA_FILE_SEP_STRING); + uprv_strcat(tmpResFilePath, ICUDATA_RES_FILE); + + if (T_FileStream_file_exists(tmpResFilePath)) { + snprintf(resFilePath, sizeof(resFilePath), "\"%s\"", tmpResFilePath); + } + + /* Check if dll file and lib file exists and that it is not newer than genc file. 
*/ + if (!o->rebuild && (T_FileStream_file_exists(dllFilePath) && isFileModTimeLater(dllFilePath, gencFilePath)) && + (T_FileStream_file_exists(libFilePath) && isFileModTimeLater(libFilePath, gencFilePath))) { + if(o->verbose) { + printf("# Not rebuilding %s - up to date.\n", gencFilePath); + } + return 0; + } + + char extraFlags[SMALL_BUFFER_MAX_SIZE] = ""; +#ifdef WINDOWS_WITH_MSVC + if (options[WIN_UWP_BUILD].doesOccur) { + uprv_strcat(extraFlags, LINK_EXTRA_UWP_FLAGS); + + if (options[WIN_DLL_ARCH].doesOccur) { + if (uprv_strcmp(options[WIN_DLL_ARCH].value, "X86") == 0) { + uprv_strcat(extraFlags, LINK_EXTRA_UWP_FLAGS_X86_ONLY); + } + } + } + + if (options[WIN_DLL_ARCH].doesOccur) { + uprv_strcat(extraFlags, LINK_EXTRA_FLAGS_MACHINE); + uprv_strcat(extraFlags, options[WIN_DLL_ARCH].value); + } + +#endif + snprintf(cmd, sizeof(cmd), "%s\"%s\" %s %s\"%s\" \"%s\" %s", + LINK_CMD, + dllFilePath, + extraFlags, + LINK_FLAGS, + libFilePath, + gencFilePath, + resFilePath + ); + } + + result = runCommand(cmd, true); + if (result != 0) { + fprintf(stderr, "Error creating Windows DLL library. Failed command: %s\n", cmd); + } + + return result; +} +#endif + +static UPKGOptions *pkg_checkFlag(UPKGOptions *o) { +#if U_PLATFORM == U_PF_AIX + /* AIX needs a map file. */ + char *flag = nullptr; + int32_t length = 0; + char tmpbuffer[SMALL_BUFFER_MAX_SIZE]; + const char MAP_FILE_EXT[] = ".map"; + FileStream *f = nullptr; + char mapFile[SMALL_BUFFER_MAX_SIZE] = ""; + int32_t start = -1; + uint32_t count = 0; + const char rm_cmd[] = "rm -f all ;"; + + flag = pkgDataFlags[GENLIB]; + + /* This portion of the code removes 'rm -f all' in the GENLIB. + * Only occurs in AIX. 
+ */ + if (uprv_strstr(flag, rm_cmd) != nullptr) { + char *tmpGenlibFlagBuffer = nullptr; + int32_t i, offset; + + length = static_cast<int32_t>(uprv_strlen(flag) + 1); + tmpGenlibFlagBuffer = (char *)uprv_malloc(length); + if (tmpGenlibFlagBuffer == nullptr) { + /* Memory allocation error */ + fprintf(stderr,"Unable to allocate buffer of size: %d.\n", length); + return nullptr; + } + + uprv_strcpy(tmpGenlibFlagBuffer, flag); + + offset = static_cast<int32_t>(uprv_strlen(rm_cmd)); + + for (i = 0; i < (length - offset); i++) { + flag[i] = tmpGenlibFlagBuffer[offset + i]; + } + + /* Zero terminate the string */ + flag[i] = 0; + + uprv_free(tmpGenlibFlagBuffer); + } + + flag = pkgDataFlags[BIR_FLAGS]; + length = static_cast<int32_t>(uprv_strlen(pkgDataFlags[BIR_FLAGS])); + + for (int32_t i = 0; i < length; i++) { + if (flag[i] == MAP_FILE_EXT[count]) { + if (count == 0) { + start = i; + } + count++; + } else { + count = 0; + } + + if (count == uprv_strlen(MAP_FILE_EXT)) { + break; + } + } + + if (start >= 0) { + int32_t index = 0; + for (int32_t i = 0;;i++) { + if (i == start) { + for (int32_t n = 0;;n++) { + if (o->shortName[n] == 0) { + break; + } + tmpbuffer[index++] = o->shortName[n]; + } + } + + tmpbuffer[index++] = flag[i]; + + if (flag[i] == 0) { + break; + } + } + + uprv_memset(flag, 0, length); + uprv_strcpy(flag, tmpbuffer); + + uprv_strcpy(mapFile, o->shortName); + uprv_strcat(mapFile, MAP_FILE_EXT); + + f = T_FileStream_open(mapFile, "w"); + if (f == nullptr) { + fprintf(stderr,"Unable to create map file: %s.\n", mapFile); + return nullptr; + } else { + snprintf(tmpbuffer, sizeof(tmpbuffer), "%s%s ", o->entryName, UDATA_CMN_INTERMEDIATE_SUFFIX); + + T_FileStream_writeLine(f, tmpbuffer); + + T_FileStream_close(f); + } + } +#elif U_PLATFORM == U_PF_CYGWIN || U_PLATFORM == U_PF_MINGW + /* Cygwin needs to change flag options. 
*/ + char *flag = nullptr; + int32_t length = 0; + + flag = pkgDataFlags[GENLIB]; + length = static_cast<int32_t>(uprv_strlen(pkgDataFlags[GENLIB])); + + int32_t position = length - 1; + + for(;position >= 0;position--) { + if (flag[position] == '=') { + position++; + break; + } + } + + uprv_memset(flag + position, 0, length - position); +#elif U_PLATFORM == U_PF_OS400 + /* OS/400 needs to fix the ld options (swap single quote with double quote) */ + char *flag = nullptr; + int32_t length = 0; + + flag = pkgDataFlags[GENLIB]; + length = static_cast<int32_t>(uprv_strlen(pkgDataFlags[GENLIB])); + + int32_t position = length - 1; + + for(int32_t i = 0; i < length; i++) { + if (flag[i] == '\'') { + flag[i] = '\"'; + } + } +#endif + // Don't really need a return value, just need to stop compiler warnings about + // the unused parameter 'o' on platforms where it is not otherwise used. + return o; +} + +static void loadLists(UPKGOptions *o, UErrorCode *status) +{ + CharList *l, *tail = nullptr, *tail2 = nullptr; + FileStream *in; + char line[16384]; + char *linePtr, *lineNext; + const uint32_t lineMax = 16300; + char *tmp; + int32_t tmpLength = 0; + char *s; + int32_t ln=0; /* line number */ + + for(l = o->fileListFiles; l; l = l->next) { + if(o->verbose) { + fprintf(stdout, "# pkgdata: Reading %s..\n", l->str); + } + /* TODO: stdin */ + in = T_FileStream_open(l->str, "r"); /* open files list */ + + if(!in) { + fprintf(stderr, "Error opening <%s>.\n", l->str); + *status = U_FILE_ACCESS_ERROR; + return; + } + + while(T_FileStream_readLine(in, line, sizeof(line))!=nullptr) { /* for each line */ + ln++; + if(uprv_strlen(line)>lineMax) { + fprintf(stderr, "%s:%d - line too long (over %d chars)\n", l->str, (int)ln, (int)lineMax); + exit(1); + } + /* remove spaces at the beginning */ + linePtr = line; + /* On z/OS, disable call to isspace (#9996). 
Investigate using uprv_isspace instead (#9999) */ +#if U_PLATFORM != U_PF_OS390 + while(isspace(*linePtr)) { + linePtr++; + } +#endif + s=linePtr; + /* remove trailing newline characters */ + while(*s!=0) { + if(*s=='\r' || *s=='\n') { + *s=0; + break; + } + ++s; + } + if((*linePtr == 0) || (*linePtr == '#')) { + continue; /* comment or empty line */ + } + + /* Now, process the line */ + lineNext = nullptr; + + while(linePtr && *linePtr) { /* process space-separated items */ + while(*linePtr == ' ') { + linePtr++; + } + /* Find the next quote */ + if(linePtr[0] == '"') + { + lineNext = uprv_strchr(linePtr+1, '"'); + if(lineNext == nullptr) { + fprintf(stderr, "%s:%d - missing trailing double quote (\")\n", + l->str, (int)ln); + exit(1); + } else { + lineNext++; + if(*lineNext) { + if(*lineNext != ' ') { + fprintf(stderr, "%s:%d - malformed quoted line at position %d, expected ' ' got '%c'\n", + l->str, (int)ln, (int)(lineNext-line), (*lineNext)?*lineNext:'0'); + exit(1); + } + *lineNext = 0; + lineNext++; + } + } + } else { + lineNext = uprv_strchr(linePtr, ' '); + if(lineNext) { + *lineNext = 0; /* terminate at space */ + lineNext++; + } + } + + /* add the file */ + s = (char*)getLongPathname(linePtr); + + /* normal mode.. o->files is just the bare list without package names */ + o->files = pkg_appendToList(o->files, &tail, uprv_strdup(linePtr)); + if(uprv_pathIsAbsolute(s) || s[0] == '.') { + fprintf(stderr, "pkgdata: Error: absolute path encountered. Old style paths are not supported. 
Use relative paths such as 'fur.res' or 'translit%cfur.res'.\n\tBad path: '%s'\n", U_FILE_SEP_CHAR, s); + exit(U_ILLEGAL_ARGUMENT_ERROR); + } + /* The +5 is to add a little extra space for, among other things, PKGDATA_FILE_SEP_STRING */ + tmpLength = static_cast<int32_t>(uprv_strlen(o->srcDir) + uprv_strlen(s) + 5); + if((tmp = (char *)uprv_malloc(tmpLength)) == nullptr) { + fprintf(stderr, "pkgdata: Error: Unable to allocate tmp buffer size: %d\n", tmpLength); + exit(U_MEMORY_ALLOCATION_ERROR); + } + uprv_strcpy(tmp, o->srcDir); + uprv_strcat(tmp, o->srcDir[uprv_strlen(o->srcDir)-1] == U_FILE_SEP_CHAR ? "" : PKGDATA_FILE_SEP_STRING); + uprv_strcat(tmp, s); + o->filePaths = pkg_appendToList(o->filePaths, &tail2, tmp); + linePtr = lineNext; + } /* for each entry on line */ + } /* for each line */ + T_FileStream_close(in); + } /* for each file list file */ +} + +/* Helper for pkg_getPkgDataPath() */ +#if U_HAVE_POPEN +static UBool getPkgDataPath(const char *cmd, UBool verbose, char *buf, size_t items) { + icu::CharString cmdBuf; + UErrorCode status = U_ZERO_ERROR; + LocalPipeFilePointer p; + size_t n; + + cmdBuf.append(cmd, status); + if (verbose) { + fprintf(stdout, "# Calling: %s\n", cmdBuf.data()); + } + p.adoptInstead( popen(cmdBuf.data(), "r") ); + + if (p.isNull() || (n = fread(buf, 1, items-1, p.getAlias())) <= 0) { + fprintf(stderr, "%s: Error calling '%s'\n", progname, cmd); + *buf = 0; + return false; + } + + return true; +} +#endif + +/* Get path to pkgdata.inc. Try pkg-config first, falling back to icu-config. 
*/ +static int32_t pkg_getPkgDataPath(UBool verbose, UOption *option) { +#if U_HAVE_POPEN + static char buf[512] = ""; + UBool pkgconfigIsValid = true; + const char *pkgconfigCmd = "pkg-config --variable=pkglibdir icu-uc"; + const char *icuconfigCmd = "icu-config --incpkgdatafile"; + const char *pkgdata = "pkgdata.inc"; + + if (!getPkgDataPath(pkgconfigCmd, verbose, buf, UPRV_LENGTHOF(buf))) { + if (!getPkgDataPath(icuconfigCmd, verbose, buf, UPRV_LENGTHOF(buf))) { + fprintf(stderr, "%s: icu-config not found. Fix PATH or specify -O option\n", progname); + return -1; + } + + pkgconfigIsValid = false; + } + + for (int32_t length = strlen(buf) - 1; length >= 0; length--) { + if (buf[length] == '\n' || buf[length] == ' ') { + buf[length] = 0; + } else { + break; + } + } + + if (!*buf) { + fprintf(stderr, "%s: Unable to locate pkgdata.inc. Unable to parse the results of '%s'. Check paths or use the -O option to specify the path to pkgdata.inc.\n", progname, pkgconfigIsValid ? pkgconfigCmd : icuconfigCmd); + return -1; + } + + if (pkgconfigIsValid) { + uprv_strcat(buf, U_FILE_SEP_STRING); + uprv_strcat(buf, pkgdata); + } + + buf[strlen(buf)] = 0; + + option->value = buf; + option->doesOccur = true; + + return 0; +#else + return -1; +#endif +} + +#ifdef CAN_WRITE_OBJ_CODE + /* Create optMatchArch for genccode architecture detection */ +static void pkg_createOptMatchArch(char *optMatchArch) { +#if !defined(WINDOWS_WITH_MSVC) || defined(USING_CYGWIN) + const char* code = "void oma(){}"; + const char* source = "oma.c"; + const char* obj = "oma.obj"; + FileStream* stream = nullptr; + + stream = T_FileStream_open(source,"w"); + if (stream != nullptr) { + T_FileStream_writeLine(stream, code); + T_FileStream_close(stream); + + char cmd[LARGE_BUFFER_MAX_SIZE]; + snprintf(cmd, sizeof(cmd), "%s %s -o %s", + pkgDataFlags[COMPILER], + source, + obj); + + if (runCommand(cmd) == 0){ + sprintf(optMatchArch, "%s", obj); + } + else { + fprintf(stderr, "Failed to compile %s\n", source); + 
} + if(!T_FileStream_remove(source)){ + fprintf(stderr, "T_FileStream_remove failed to delete %s\n", source); + } + } + else { + fprintf(stderr, "T_FileStream_open failed to open %s for writing\n", source); + } +#endif +} +static void pkg_destroyOptMatchArch(char *optMatchArch) { + if(T_FileStream_file_exists(optMatchArch) && !T_FileStream_remove(optMatchArch)){ + fprintf(stderr, "T_FileStream_remove failed to delete %s\n", optMatchArch); + } +} +#endif diff --git a/intl/icu/source/tools/pkgdata/pkgdata.vcxproj b/intl/icu/source/tools/pkgdata/pkgdata.vcxproj new file mode 100644 index 0000000000..0975456bda --- /dev/null +++ b/intl/icu/source/tools/pkgdata/pkgdata.vcxproj @@ -0,0 +1,84 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <PropertyGroup Label="Globals"> + <ProjectGuid>{4C8454FE-81D3-4CA3-9927-29BA96F03DAC}</ProjectGuid> + </PropertyGroup> + <PropertyGroup Label="Configuration"> + <ConfigurationType>Application</ConfigurationType> + <UseOfMfc>false</UseOfMfc> + <CharacterSet>MultiByte</CharacterSet> + </PropertyGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> + <!-- The following import will include the 'default' configuration options for VS projects. --> + <Import Project="..\..\allinone\Build.Windows.ProjectConfiguration.props" /> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> + <ImportGroup Label="ExtensionSettings"> + </ImportGroup> + <PropertyGroup Label="UserMacros" /> + <PropertyGroup> + <_ProjectFileVersion>10.0.30319.1</_ProjectFileVersion> + <OutDir>.\$(Platform)\$(Configuration)\</OutDir> + <IntDir>.\$(Platform)\$(Configuration)\</IntDir> + <!-- The ICU projects use "Win32" to mean "x86", so we need to special case it. 
--> + <OutDir Condition="'$(Platform)'=='Win32'">.\x86\$(Configuration)\</OutDir> + <IntDir Condition="'$(Platform)'=='Win32'">.\x86\$(Configuration)\</IntDir> + <!-- Disable Incremental Linking for Release builds as it prevents Link-time Code Generation --> + <LinkIncremental Condition="'$(Configuration)'=='Debug'">true</LinkIncremental> + <LinkIncremental Condition="'$(Configuration)'=='Release'">false</LinkIncremental> + </PropertyGroup> + <!-- Options that are common to *all* configurations --> + <ItemDefinitionGroup> + <Midl> + <TypeLibraryName>$(OutDir)/pkgdata.tlb</TypeLibraryName> + </Midl> + <ClCompile> + <WarningLevel>Level3</WarningLevel> + <CompileAs>Default</CompileAs> + <DisableLanguageExtensions>true</DisableLanguageExtensions> + <AdditionalIncludeDirectories>../../../include;../../common;../toolutil;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> + <PrecompiledHeaderOutputFile>$(OutDir)/pkgdata.pch</PrecompiledHeaderOutputFile> + <AssemblerListingLocation>$(OutDir)/</AssemblerListingLocation> + <ObjectFileName>$(OutDir)/</ObjectFileName> + <ProgramDataBaseFileName>$(OutDir)/pkgdata.pdb</ProgramDataBaseFileName> + </ClCompile> + <Link> + <SubSystem>Console</SubSystem> + <OutputFile>$(OutDir)/pkgdata.exe</OutputFile> + <AdditionalLibraryDirectories>..\..\..\$(IcuLibOutputDir);%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories> + </Link> + <CustomBuildStep> + <Command>copy "$(TargetPath)" ..\..\..\$(IcuBinOutputDir)</Command> + <Outputs>..\..\..\$(IcuBinOutputDir)\$(TargetFileName);%(Outputs)</Outputs> + </CustomBuildStep> + </ItemDefinitionGroup> + <!-- Options that are common to all 'Debug' project configurations --> + <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'"> + <ClCompile> + <BrowseInformation>true</BrowseInformation> + <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary> + </ClCompile> + <Link> + <AdditionalDependencies>icuucd.lib;icutud.lib;%(AdditionalDependencies)</AdditionalDependencies> + </Link> 
+ </ItemDefinitionGroup> + <!-- Options that are common to all 'Release' project configurations --> + <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'"> + <ClCompile> + <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary> + <FunctionLevelLinking>true</FunctionLevelLinking> + </ClCompile> + <Link> + <AdditionalDependencies>icuuc.lib;icutu.lib;%(AdditionalDependencies)</AdditionalDependencies> + </Link> + </ItemDefinitionGroup> + <ItemGroup> + <ClCompile Include="pkgdata.cpp" /> + <ClCompile Include="pkgtypes.c" /> + </ItemGroup> + <ItemGroup> + <ClInclude Include="pkgtypes.h" /> + </ItemGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> + <ImportGroup Label="ExtensionTargets"> + </ImportGroup> +</Project>
\ No newline at end of file diff --git a/intl/icu/source/tools/pkgdata/pkgdata.vcxproj.filters b/intl/icu/source/tools/pkgdata/pkgdata.vcxproj.filters new file mode 100644 index 0000000000..dc530aba11 --- /dev/null +++ b/intl/icu/source/tools/pkgdata/pkgdata.vcxproj.filters @@ -0,0 +1,30 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <ItemGroup> + <Filter Include="Source Files"> + <UniqueIdentifier>{73901add-40cd-4d05-a0ba-d336fa136062}</UniqueIdentifier> + <Extensions>cpp;c;cxx;rc;def;r;odl;idl;hpj;bat</Extensions> + </Filter> + <Filter Include="Header Files"> + <UniqueIdentifier>{0dcaab2b-a92a-430e-8185-de2a2ababd2a}</UniqueIdentifier> + <Extensions>h;hpp;hxx;hm;inl</Extensions> + </Filter> + <Filter Include="Resource Files"> + <UniqueIdentifier>{4a7309b1-40ce-4bb5-b80d-2cc01b68e8d0}</UniqueIdentifier> + <Extensions>ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe</Extensions> + </Filter> + </ItemGroup> + <ItemGroup> + <ClCompile Include="pkgdata.cpp"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="pkgtypes.c"> + <Filter>Source Files</Filter> + </ClCompile> + </ItemGroup> + <ItemGroup> + <ClInclude Include="pkgtypes.h"> + <Filter>Header Files</Filter> + </ClInclude> + </ItemGroup> +</Project>
\ No newline at end of file diff --git a/intl/icu/source/tools/pkgdata/pkgtypes.c b/intl/icu/source/tools/pkgdata/pkgtypes.c new file mode 100644 index 0000000000..26bd945df7 --- /dev/null +++ b/intl/icu/source/tools/pkgdata/pkgtypes.c @@ -0,0 +1,303 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/************************************************************************** +* +* Copyright (C) 2000-2016, International Business Machines +* Corporation and others. All Rights Reserved. +* +*************************************************************************** +* file name: pkgdata.c +* encoding: ANSI X3.4 (1968) +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2000may16 +* created by: Steven \u24C7 Loomis +* +* common types for pkgdata +*/ + +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include "unicode/utypes.h" +#include "unicode/putil.h" +#include "cmemory.h" +#include "cstring.h" +#include "pkgtypes.h" +#include "putilimp.h" + +const char *pkg_writeCharListWrap(FileStream *s, CharList *l, const char *delim, const char *brk, int32_t quote) +{ + int32_t ln = 0; + char buffer[1024]; + while(l != NULL) + { + if(l->str) + { + uprv_strncpy(buffer, l->str, 1020); + buffer[1019]=0; + + if(quote < 0) { /* remove quotes */ + if(buffer[uprv_strlen(buffer)-1] == '"') { + buffer[uprv_strlen(buffer)-1] = '\0'; + } + if(buffer[0] == '"') { + uprv_strcpy(buffer, buffer+1); + } + } else if(quote > 0) { /* add quotes */ + if(l->str[0] != '"') { + uprv_strcpy(buffer, "\""); + uprv_strncat(buffer, l->str,1020); + } + if(l->str[uprv_strlen(l->str)-1] != '"') { + uprv_strcat(buffer, "\""); + } + } + T_FileStream_write(s, buffer, (int32_t)uprv_strlen(buffer)); + + ln += (int32_t)uprv_strlen(l->str); + } + + if(l->next && delim) + { + if(ln > 60 && brk) { + ln = 0; + T_FileStream_write(s, brk, (int32_t)uprv_strlen(brk)); + } + T_FileStream_write(s, delim, (int32_t)uprv_strlen(delim)); 
+ } + l = l->next; + } + return NULL; +} + + +const char *pkg_writeCharList(FileStream *s, CharList *l, const char *delim, int32_t quote) +{ + char buffer[1024]; + while(l != NULL) + { + if(l->str) + { + uprv_strncpy(buffer, l->str, 1023); + buffer[1023]=0; + if(uprv_strlen(l->str) >= 1023) + { + fprintf(stderr, "%s:%d: Internal error, line too long (greater than 1023 chars)\n", + __FILE__, __LINE__); + exit(0); + } + if(quote < 0) { /* remove quotes */ + if(buffer[uprv_strlen(buffer)-1] == '"') { + buffer[uprv_strlen(buffer)-1] = '\0'; + } + if(buffer[0] == '"') { + uprv_strcpy(buffer, buffer+1); + } + } else if(quote > 0) { /* add quotes */ + if(l->str[0] != '"') { + uprv_strcpy(buffer, "\""); + uprv_strcat(buffer, l->str); + } + if(l->str[uprv_strlen(l->str)-1] != '"') { + uprv_strcat(buffer, "\""); + } + } + T_FileStream_write(s, buffer, (int32_t)uprv_strlen(buffer)); + } + + if(l->next && delim) + { + T_FileStream_write(s, delim, (int32_t)uprv_strlen(delim)); + } + l = l->next; + } + return NULL; +} + + +/* + * Count items . 0 if null + */ +uint32_t pkg_countCharList(CharList *l) +{ + uint32_t c = 0; + while(l != NULL) + { + c++; + l = l->next; + } + + return c; +} + +/* + * Prepend string to CharList + */ +CharList *pkg_prependToList(CharList *l, const char *str) +{ + CharList *newList; + newList = uprv_malloc(sizeof(CharList)); + + /* test for NULL */ + if(newList == NULL) { + return NULL; + } + + newList->str = str; + newList->next = l; + return newList; +} + +/* + * append string to CharList. *end or even end can be null if you don't + * know it.[slow] + * Str is adopted! 
+ */ +CharList *pkg_appendToList(CharList *l, CharList** end, const char *str) +{ + CharList *endptr = NULL, *tmp; + + if(end == NULL) + { + end = &endptr; + } + + /* FIND the end */ + if((*end == NULL) && (l != NULL)) + { + tmp = l; + while(tmp->next) + { + tmp = tmp->next; + } + + *end = tmp; + } + + /* Create a new empty list and append it */ + if(l == NULL) + { + l = pkg_prependToList(NULL, str); + } + else + { + (*end)->next = pkg_prependToList(NULL, str); + } + + /* Move the end pointer. */ + if(*end) + { + (*end) = (*end)->next; + } + else + { + *end = l; + } + + return l; +} + +char * convertToNativePathSeparators(char *path) { +#if defined(U_MAKE_IS_NMAKE) + char *itr; + while ((itr = uprv_strchr(path, U_FILE_ALT_SEP_CHAR))) { + *itr = U_FILE_SEP_CHAR; + } +#endif + return path; +} + +CharList *pkg_appendUniqueDirToList(CharList *l, CharList** end, const char *strAlias) { + char aBuf[1024]; + char *rPtr; + rPtr = uprv_strrchr(strAlias, U_FILE_SEP_CHAR); +#if (U_FILE_SEP_CHAR != U_FILE_ALT_SEP_CHAR) + { + char *aPtr = uprv_strrchr(strAlias, U_FILE_ALT_SEP_CHAR); + if(!rPtr || /* regular char wasn't found or.. 
*/ + (aPtr && (aPtr > rPtr))) + { /* alt ptr exists and is to the right of r ptr */ + rPtr = aPtr; /* may copy NULL which is OK */ + } + } +#endif + if(!rPtr) { + return l; /* no dir path */ + } + if((rPtr-strAlias) >= UPRV_LENGTHOF(aBuf)) { + fprintf(stderr, "## ERR: Path too long [%d chars]: %s\n", (int)sizeof(aBuf), strAlias); + return l; + } + strncpy(aBuf, strAlias,(rPtr-strAlias)); + aBuf[rPtr-strAlias]=0; /* no trailing slash */ + convertToNativePathSeparators(aBuf); + + if(!pkg_listContains(l, aBuf)) { + return pkg_appendToList(l, end, uprv_strdup(aBuf)); + } else { + return l; /* already found */ + } +} + +#if 0 +static CharList * +pkg_appendFromStrings(CharList *l, CharList** end, const char *s, int32_t len) +{ + CharList *endptr = NULL; + const char *p; + char *t; + const char *targ; + if(end == NULL) { + end = &endptr; + } + + if(len==-1) { + len = uprv_strlen(s); + } + targ = s+len; + + while(*s && s<targ) { + while(s<targ&&isspace(*s)) s++; + for(p=s;s<targ&&!isspace(*p);p++); + if(p!=s) { + t = uprv_malloc(p-s+1); + uprv_strncpy(t,s,p-s); + t[p-s]=0; + l=pkg_appendToList(l,end,t); + fprintf(stderr, " P %s\n", t); + } + s=p; + } + + return l; +} +#endif + + +/* + * Delete list + */ +void pkg_deleteList(CharList *l) +{ + CharList *tmp; + while(l != NULL) + { + uprv_free((void*)l->str); + tmp = l; + l = l->next; + uprv_free(tmp); + } +} + +UBool pkg_listContains(CharList *l, const char *str) +{ + for(;l;l=l->next){ + if(!uprv_strcmp(l->str, str)) { + return true; + } + } + + return false; +} diff --git a/intl/icu/source/tools/pkgdata/pkgtypes.h b/intl/icu/source/tools/pkgdata/pkgtypes.h new file mode 100644 index 0000000000..51c11e0a14 --- /dev/null +++ b/intl/icu/source/tools/pkgdata/pkgtypes.h @@ -0,0 +1,172 @@ +// © 2016 and later: Unicode, Inc. and others. 
+// License & terms of use: http://www.unicode.org/copyright.html +/************************************************************************** +* +* Copyright (C) 2000-2012, International Business Machines +* Corporation and others. All Rights Reserved. +* +*************************************************************************** +* file name: pkgdata.c +* encoding: ANSI X3.4 (1968) +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2000may16 +* created by: Steven \u24C7 Loomis +* +* common types for pkgdata +*/ + +#ifndef _PKGTYPES +#define _PKGTYPES + +/* headers */ +#include "unicode/utypes.h" +#include "filestrm.h" + +/* linked list */ +struct _CharList; + +typedef struct _CharList +{ + const char *str; + struct _CharList *next; +} CharList; + + + +/* + * write CharList 'l' into stream 's' using delimiter 'delim' (delim can be nullptr). quoted: -1 remove, 0 as is, 1 add quotes + */ +const char *pkg_writeCharList(FileStream *s, CharList *l, const char *delim, int32_t quoted); + +/* + * Same, but use line breaks. quoted: -1 remove, 0 as is, 1 add quotes + */ +const char *pkg_writeCharListWrap(FileStream *s, CharList *l, const char *delim, const char *brk, int32_t quoted); + + +/* + * Count items . 0 if null + */ +uint32_t pkg_countCharList(CharList *l); + +/* + * Prepend string to CharList. Str is adopted! + */ +CharList *pkg_prependToList(CharList *l, const char *str); + +/* + * append string to CharList. *end or even end can be null if you don't + * know it.[slow] + * Str is adopted! + */ +CharList *pkg_appendToList(CharList *l, CharList** end, const char *str); + +/* + * strAlias is an alias to a full or relative path to a FILE. This function + * will search strAlias for the directory name (with strrchr). Then, it will + * determine if that directory is already in list l. If not, it will add it + * with strdup(strAlias). + * @param l list to append to , or nullptr + * @param end end pointer-to-pointer. Can point to null, or be null. 
+ * @param strAlias alias to full path string + * @return new list + */ +CharList *pkg_appendUniqueDirToList(CharList *l, CharList** end, const char *strAlias); + +/* + * does list contain string? Returns: t/f + */ +UBool pkg_listContains(CharList *l, const char *str); + +/* + * Delete list + */ +void pkg_deleteList(CharList *l); + +/* + * Mode package function + */ +struct UPKGOptions_; +typedef void (UPKGMODE)(struct UPKGOptions_ *, FileStream *s, UErrorCode *status); + +/* + * Static mode - write the readme file + * @param opt UPKGOptions + * @param libName Name of the .lib, etc file + * @param status ICU error code + */ +void pkg_sttc_writeReadme(struct UPKGOptions_ *opt, const char *libName, UErrorCode *status); + +/* + * Options to be passed throughout the program + */ + +typedef struct UPKGOptions_ +{ + CharList *fileListFiles; /* list of files containing files for inclusion in the package */ + CharList *filePaths; /* All the files, with long paths */ + CharList *files; /* All the files */ + CharList *outFiles; /* output files [full paths] */ + + const char *shortName; /* name of what we're building */ + const char *cShortName; /* name of what we're building as a C identifier */ + const char *entryName; /* special entrypoint name */ + const char *targetDir; /* dir for packaged data to go */ + const char *dataDir; /* parent of dir for package (default: tmpdir) */ + const char *tmpDir; + const char *srcDir; + const char *options; /* Options arg */ + const char *mode; /* Mode of building */ + const char *version; /* Library version */ + const char *comment; /* comment string */ + const char *install; /* Where to install to (nullptr = don't install) */ + const char *icuroot; /* where does ICU lives */ + const char *libName; /* name for library (default: shortName) */ + UBool rebuild; + UBool verbose; + UBool quiet; + UBool withoutAssembly; + UBool pdsbuild; /* for building PDS in z/OS */ +} UPKGOptions; + +char * convertToNativePathSeparators(char *path); + + 
+/* set up common defines for library naming */ + +#if U_PLATFORM_HAS_WIN32_API +# ifndef UDATA_SO_SUFFIX +# define UDATA_SO_SUFFIX ".dll" +# endif +# define LIB_PREFIX "" +# define LIB_STATIC_PREFIX "" +# define OBJ_SUFFIX ".obj" +# define UDATA_LIB_SUFFIX ".lib" + +#elif U_PLATFORM == U_PF_CYGWIN +# define LIB_PREFIX "cyg" +# define LIB_STATIC_PREFIX "lib" +# define OBJ_SUFFIX ".o" +# define UDATA_LIB_SUFFIX ".a" + +#else /* POSIX? */ +# define LIB_PREFIX "lib" +# define LIB_STATIC_PREFIX "lib" +# define OBJ_SUFFIX ".o" +# define UDATA_LIB_SUFFIX ".a" +#endif + +#define ASM_SUFFIX ".s" + +/* defines for common file names */ +#define UDATA_CMN_PREFIX "" +#define UDATA_CMN_SUFFIX ".dat" +#define UDATA_CMN_INTERMEDIATE_SUFFIX "_dat" + +#define ICUDATA_RES_FILE "icudata.res" + +#define PKGDATA_DERIVED_PATH '\t' + +#endif diff --git a/intl/icu/source/tools/pkgdata/sources.txt b/intl/icu/source/tools/pkgdata/sources.txt new file mode 100644 index 0000000000..8a5b202fa9 --- /dev/null +++ b/intl/icu/source/tools/pkgdata/sources.txt @@ -0,0 +1,2 @@ +pkgdata.cpp +pkgtypes.c diff --git a/intl/icu/source/tools/toolutil/BUILD.bazel b/intl/icu/source/tools/toolutil/BUILD.bazel new file mode 100644 index 0000000000..276c857f12 --- /dev/null +++ b/intl/icu/source/tools/toolutil/BUILD.bazel @@ -0,0 +1,126 @@ +# © 2021 and later: Unicode, Inc. and others. +# License & terms of use: http://www.unicode.org/copyright.html + +# This Bazel build file defines targets that are dependencies for building +# the gennorm2 and genprops binaries. 
+ +load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library") + +package( + default_visibility = ["//visibility:public"], +) + +cc_library( + name = "toolutil", + includes = ["."], + hdrs = ["toolutil.h"], + srcs = ["toolutil.cpp"], + local_defines = [ + "U_TOOLUTIL_IMPLEMENTATION", + ], + deps = ["//icu4c/source/common:platform"], +) + +cc_library( + name = "unewdata", + includes = ["."], + hdrs = ["unewdata.h"], + srcs = ["unewdata.cpp"], + local_defines = [ + "U_TOOLUTIL_IMPLEMENTATION", + ], + deps = [ + ":filestrm", + "//icu4c/source/common:platform", + ], +) + +cc_library( + name = "uoptions", + includes = ["."], + hdrs = ["uoptions.h"], + srcs = ["uoptions.cpp"], + local_defines = [ + "U_TOOLUTIL_IMPLEMENTATION", + ], + deps = ["//icu4c/source/common:platform"], +) + +cc_library( + name = "writesrc", + includes = ["."], + hdrs = ["writesrc.h"], + srcs = ["writesrc.cpp"], + local_defines = [ + "U_TOOLUTIL_IMPLEMENTATION", + ], + deps = [ + "//icu4c/source/common:bytestream", + "//icu4c/source/common:platform", + "//icu4c/source/common:uniset_core", + ], +) + +cc_library( + name = "uparse", + includes = ["."], + hdrs = ["uparse.h"], + srcs = ["uparse.cpp"], + local_defines = [ + "U_TOOLUTIL_IMPLEMENTATION", + ], + deps = [ + ":filestrm", + "//icu4c/source/common:platform", + ], +) + +cc_library( + name = "filestrm", + includes = ["."], + hdrs = ["filestrm.h"], + srcs = ["filestrm.cpp"], + local_defines = [ + "U_TOOLUTIL_IMPLEMENTATION", + ], + deps = ["//icu4c/source/common:platform"], +) + +cc_library( + name = "ppucd", + includes = ["."], + hdrs = ["ppucd.h"], + srcs = ["ppucd.cpp"], + local_defines = [ + "U_TOOLUTIL_IMPLEMENTATION", + ], + deps = [ + ":uparse", + "//icu4c/source/common:platform", + ], +) + +cc_library( + name = "denseranges", + includes = ["."], + hdrs = ["denseranges.h"], + srcs = ["denseranges.cpp"], + local_defines = [ + "U_TOOLUTIL_IMPLEMENTATION", + ], + deps = ["//icu4c/source/common:platform"], +) + +cc_library( + name = 
"collationinfo", + includes = ["."], + hdrs = ["collationinfo.h"], + srcs = ["collationinfo.cpp"], + local_defines = [ + "U_TOOLUTIL_IMPLEMENTATION", + ], + deps = [ + "//icu4c/source/common:platform", + "//icu4c/source/i18n:headers", + ], +) diff --git a/intl/icu/source/tools/toolutil/Makefile.in b/intl/icu/source/tools/toolutil/Makefile.in new file mode 100644 index 0000000000..c9fd89b0f0 --- /dev/null +++ b/intl/icu/source/tools/toolutil/Makefile.in @@ -0,0 +1,155 @@ +# Copyright (C) 2016 and later: Unicode, Inc. and others. +# License & terms of use: http://www.unicode.org/copyright.html +#****************************************************************************** +# +# Copyright (C) 1999-2014, International Business Machines +# Corporation and others. All Rights Reserved. +# +#****************************************************************************** +## Makefile.in for ICU - tools/toolutil +## Steven R. Loomis + +## Source directory information +srcdir = @srcdir@ +top_srcdir = @top_srcdir@ + +top_builddir = ../.. + +## All the flags and other definitions are included here. 
+include $(top_builddir)/icudefs.mk + +## Build directory information +subdir = tools/toolutil + +## Extra files to remove for 'make clean' +CLEANFILES = *~ $(DEPS) $(IMPORT_LIB) $(MIDDLE_IMPORT_LIB) $(FINAL_IMPORT_LIB) + +## Target information + +TARGET_STUBNAME=$(TOOLUTIL_STUBNAME) + +ifneq ($(ENABLE_STATIC),) +TARGET = $(LIBDIR)/$(LIBSICU)$(TARGET_STUBNAME)$(ICULIBSUFFIX).$(A) +endif + +ifneq ($(ENABLE_SHARED),) +SO_TARGET = $(LIBDIR)/$(LIBICU)$(TARGET_STUBNAME)$(ICULIBSUFFIX).$(SO) +ALL_SO_TARGETS = $(SO_TARGET) $(MIDDLE_SO_TARGET) $(FINAL_SO_TARGET) $(SHARED_OBJECT) +endif + +ALL_TARGETS = $(TARGET) $(ALL_SO_TARGETS) + +DYNAMICCPPFLAGS = $(SHAREDLIBCPPFLAGS) +DYNAMICCFLAGS = $(SHAREDLIBCFLAGS) +DYNAMICCXXFLAGS = $(SHAREDLIBCXXFLAGS) +CFLAGS += $(LIBCFLAGS) +CXXFLAGS += $(LIBCXXFLAGS) + +CPPFLAGS += -I$(srcdir) -I$(top_srcdir)/common -I$(top_srcdir)/i18n $(LIBCPPFLAGS) + +# from icuinfo +CPPFLAGS+= "-DU_BUILD=\"@build@\"" "-DU_HOST=\"@host@\"" "-DU_CC=\"@CC@\"" "-DU_CXX=\"@CXX@\"" +CPPFLAGS += -DUNISTR_FROM_CHAR_EXPLICIT=explicit -DUNISTR_FROM_STRING_EXPLICIT=explicit + +DEFS += -DU_TOOLUTIL_IMPLEMENTATION +LDFLAGS += $(LDFLAGSICUTOOLUTIL) +LIBS = $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS) + +SOURCES = $(shell cat $(srcdir)/sources.txt) +OBJECTS = $(SOURCES:.cpp=.o) + +STATIC_OBJECTS = $(OBJECTS:.o=.$(STATIC_O)) + +DEPS = $(OBJECTS:.o=.d) + +-include Makefile.local + +## List of phony targets +.PHONY : all all-local install install-local clean clean-local \ +distclean distclean-local install-library dist \ +dist-local check check-local + +## Clear suffix list +.SUFFIXES : + +## List of standard targets +all: all-local +install: install-local +clean: clean-local +distclean : distclean-local +dist: dist-local +check: all check-local + +all-local: $(ALL_TARGETS) + +install-local: install-library + +install-library: all-local + $(MKINSTALLDIRS) $(DESTDIR)$(libdir) +ifneq ($(ENABLE_STATIC),) + $(INSTALL-L) $(TARGET) $(DESTDIR)$(libdir) +endif +ifneq 
($(ENABLE_SHARED),) +# For MinGW, do we want the DLL to go in the bin location? +ifeq ($(MINGW_MOVEDLLSTOBINDIR),YES) + $(MKINSTALLDIRS) $(DESTDIR)$(bindir) + $(INSTALL-L) $(FINAL_SO_TARGET) $(DESTDIR)$(bindir) +else + $(INSTALL-L) $(FINAL_SO_TARGET) $(DESTDIR)$(libdir) +ifneq ($(FINAL_SO_TARGET),$(SO_TARGET)) + cd $(DESTDIR)$(libdir) && $(RM) $(notdir $(SO_TARGET)) && ln -s $(notdir $(FINAL_SO_TARGET)) $(notdir $(SO_TARGET)) +ifneq ($(FINAL_SO_TARGET),$(MIDDLE_SO_TARGET)) + cd $(DESTDIR)$(libdir) && $(RM) $(notdir $(MIDDLE_SO_TARGET)) && ln -s $(notdir $(FINAL_SO_TARGET)) $(notdir $(MIDDLE_SO_TARGET)) +endif +endif +endif +ifneq ($(IMPORT_LIB_EXT),) + $(INSTALL-L) $(FINAL_IMPORT_LIB) $(DESTDIR)$(libdir) +ifneq ($(IMPORT_LIB),$(FINAL_IMPORT_LIB)) + cd $(DESTDIR)$(libdir) && $(RM) $(notdir $(IMPORT_LIB)) && ln -s $(notdir $(FINAL_IMPORT_LIB)) $(notdir $(IMPORT_LIB)) +endif +ifneq ($(MIDDLE_IMPORT_LIB),$(FINAL_IMPORT_LIB)) + cd $(DESTDIR)$(libdir) && $(RM) $(notdir $(MIDDLE_IMPORT_LIB)) && ln -s $(notdir $(FINAL_IMPORT_LIB)) $(notdir $(MIDDLE_IMPORT_LIB)) +endif +endif +endif + +dist-local: + +clean-local: + test -z "$(CLEANFILES)" || $(RMV) $(CLEANFILES) + $(RMV) $(OBJECTS) $(STATIC_OBJECTS) $(ALL_TARGETS) + +distclean-local: clean-local + $(RMV) Makefile + +check-local: all-local + +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + cd $(top_builddir) \ + && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status + +ifneq ($(ENABLE_STATIC),) +$(TARGET): $(STATIC_OBJECTS) + $(AR) $(ARFLAGS) $(AR_OUTOPT)$@ $^ + $(RANLIB) $@ +endif + +ifneq ($(ENABLE_SHARED),) +$(SHARED_OBJECT): $(OBJECTS) + $(SHLIB.cc) $(LD_SONAME) $(OUTOPT)$@ $^ $(LIBS) +ifeq ($(ENABLE_RPATH),YES) +ifneq ($(wildcard $(libdir)/$(MIDDLE_SO_TARGET)),) + $(warning RPATH warning: --enable-rpath means test programs may use existing $(libdir)/$(MIDDLE_SO_TARGET)) +endif +endif +endif + +ifeq (,$(MAKECMDGOALS)) +-include $(DEPS) +else +ifneq ($(patsubst %clean,,$(MAKECMDGOALS)),) 
+-include $(DEPS) +endif +endif + diff --git a/intl/icu/source/tools/toolutil/collationinfo.cpp b/intl/icu/source/tools/toolutil/collationinfo.cpp new file mode 100644 index 0000000000..6bad90e133 --- /dev/null +++ b/intl/icu/source/tools/toolutil/collationinfo.cpp @@ -0,0 +1,152 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* Copyright (C) 2013-2015, International Business Machines +* Corporation and others. All Rights Reserved. +******************************************************************************* +* collationinfo.cpp +* +* created on: 2013aug05 +* created by: Markus W. Scherer +*/ + +#include <stdio.h> +#include <string.h> + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_COLLATION + +#include "collationdata.h" +#include "collationdatareader.h" +#include "collationinfo.h" +#include "uassert.h" +#include "uvectr32.h" + +U_NAMESPACE_BEGIN + +void +CollationInfo::printSizes(int32_t sizeWithHeader, const int32_t indexes[]) { + int32_t totalSize = indexes[CollationDataReader::IX_TOTAL_SIZE]; + if(sizeWithHeader > totalSize) { + printf(" header size: %6ld\n", (long)(sizeWithHeader - totalSize)); + } + + int32_t length = indexes[CollationDataReader::IX_INDEXES_LENGTH]; + printf(" indexes: %6ld *4 = %6ld\n", (long)length, (long)length * 4); + + length = getDataLength(indexes, CollationDataReader::IX_REORDER_CODES_OFFSET); + if(length != 0) { + printf(" reorder codes: %6ld *4 = %6ld\n", (long)length / 4, (long)length); + } + + length = getDataLength(indexes, CollationDataReader::IX_REORDER_TABLE_OFFSET); + if(length != 0) { + U_ASSERT(length >= 256); + printf(" reorder table: %6ld\n", (long)length); + } + + length = getDataLength(indexes, CollationDataReader::IX_TRIE_OFFSET); + if(length != 0) { + printf(" trie size: %6ld\n", (long)length); + } + + length = getDataLength(indexes, 
CollationDataReader::IX_RESERVED8_OFFSET); + if(length != 0) { + printf(" reserved (offset 8): %6ld\n", (long)length); + } + + length = getDataLength(indexes, CollationDataReader::IX_CES_OFFSET); + if(length != 0) { + printf(" CEs: %6ld *8 = %6ld\n", (long)length / 8, (long)length); + } + + length = getDataLength(indexes, CollationDataReader::IX_RESERVED10_OFFSET); + if(length != 0) { + printf(" reserved (offset 10): %6ld\n", (long)length); + } + + length = getDataLength(indexes, CollationDataReader::IX_CE32S_OFFSET); + if(length != 0) { + printf(" CE32s: %6ld *4 = %6ld\n", (long)length / 4, (long)length); + } + + length = getDataLength(indexes, CollationDataReader::IX_ROOT_ELEMENTS_OFFSET); + if(length != 0) { + printf(" rootElements: %6ld *4 = %6ld\n", (long)length / 4, (long)length); + } + + length = getDataLength(indexes, CollationDataReader::IX_CONTEXTS_OFFSET); + if(length != 0) { + printf(" contexts: %6ld *2 = %6ld\n", (long)length / 2, (long)length); + } + + length = getDataLength(indexes, CollationDataReader::IX_UNSAFE_BWD_OFFSET); + if(length != 0) { + printf(" unsafeBwdSet: %6ld *2 = %6ld\n", (long)length / 2, (long)length); + } + + length = getDataLength(indexes, CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET); + if(length != 0) { + printf(" fastLatin table: %6ld *2 = %6ld\n", (long)length / 2, (long)length); + } + + length = getDataLength(indexes, CollationDataReader::IX_SCRIPTS_OFFSET); + if(length != 0) { + printf(" scripts data: %6ld *2 = %6ld\n", (long)length / 2, (long)length); + } + + length = getDataLength(indexes, CollationDataReader::IX_COMPRESSIBLE_BYTES_OFFSET); + if(length != 0) { + U_ASSERT(length >= 256); + printf(" compressibleBytes: %6ld\n", (long)length); + } + + length = getDataLength(indexes, CollationDataReader::IX_RESERVED18_OFFSET); + if(length != 0) { + printf(" reserved (offset 18): %6ld\n", (long)length); + } + + printf(" collator binary total size: %6ld\n", (long)sizeWithHeader); +} + +int32_t 
+CollationInfo::getDataLength(const int32_t indexes[], int32_t startIndex) { + return indexes[startIndex + 1] - indexes[startIndex]; +} + +void +CollationInfo::printReorderRanges(const CollationData &data, const int32_t *codes, int32_t length) { + UErrorCode errorCode = U_ZERO_ERROR; + UVector32 ranges(errorCode); + data.makeReorderRanges(codes, length, ranges, errorCode); + if(U_FAILURE(errorCode)) { + printf(" error building reorder ranges: %s\n", u_errorName(errorCode)); + return; + } + + int32_t start = 0; + for(int32_t i = 0; i < ranges.size(); ++i) { + int32_t pair = ranges.elementAti(i); + int32_t limit = (pair >> 16) & 0xffff; + int16_t offset = (int16_t)pair; + if(offset == 0) { + // [inclusive-start, exclusive-limit[ + printf(" [%04x, %04x[\n", start, limit); + } else if(offset > 0) { + printf(" reorder [%04x, %04x[ by offset %02x to [%04x, %04x[\n", + start, limit, offset, + start + (offset << 8), limit + (offset << 8)); + } else /* offset < 0 */ { + printf(" reorder [%04x, %04x[ by offset -%02x to [%04x, %04x[\n", + start, limit, -offset, + start + (offset << 8), limit + (offset << 8)); + } + start = limit; + } +} + +U_NAMESPACE_END + +#endif // !UCONFIG_NO_COLLATION diff --git a/intl/icu/source/tools/toolutil/collationinfo.h b/intl/icu/source/tools/toolutil/collationinfo.h new file mode 100644 index 0000000000..815b89d40d --- /dev/null +++ b/intl/icu/source/tools/toolutil/collationinfo.h @@ -0,0 +1,42 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* Copyright (C) 2013-2015, International Business Machines +* Corporation and others. All Rights Reserved. +******************************************************************************* +* collationinfo.h +* +* created on: 2013aug05 +* created by: Markus W. 
Scherer +*/ + +#ifndef __COLLATIONINFO_H__ +#define __COLLATIONINFO_H__ + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_COLLATION + +U_NAMESPACE_BEGIN + +struct CollationData; + +/** + * Collation-related code for tools & demos. + * All members are static and the only constructor is declared private. + * printSizes() reports, via printf, the length of each section of a collation binary, + * where a section length is the difference of two consecutive entries of indexes[]. + * printReorderRanges() reports the ranges built by CollationData::makeReorderRanges() + * for the given reorder codes. + * getDataLength() returns indexes[startIndex + 1] - indexes[startIndex]. + */ +class U_TOOLUTIL_API CollationInfo /* all static */ { +public: + static void printSizes(int32_t sizeWithHeader, const int32_t indexes[]); + static void printReorderRanges(const CollationData &data, const int32_t *codes, int32_t length); + +private: + CollationInfo(); // no constructor + + static int32_t getDataLength(const int32_t indexes[], int32_t startIndex); +}; + +U_NAMESPACE_END + +#endif // !UCONFIG_NO_COLLATION +#endif // __COLLATIONINFO_H__ diff --git a/intl/icu/source/tools/toolutil/dbgutil.cpp b/intl/icu/source/tools/toolutil/dbgutil.cpp new file mode 100644 index 0000000000..d42b267f73 --- /dev/null +++ b/intl/icu/source/tools/toolutil/dbgutil.cpp @@ -0,0 +1,160 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/******************************************************************** + * COPYRIGHT: + * Copyright (c) 2007-2012, International Business Machines Corporation and + * others. All Rights Reserved. 
+ ********************************************************************/ + +#include "udbgutil.h" +#include "dbgutil.h" + +#if !UCONFIG_NO_FORMATTING + +#include "unicode/unistr.h" +#include "unicode/ustring.h" +#include "util.h" +#include "ucln.h" + +#include <stdio.h> +#include <string.h> +#include <stdlib.h> + +U_NAMESPACE_USE + +static UnicodeString **strs = nullptr; + +static const UnicodeString& _fieldString(UDebugEnumType type, int32_t field, UnicodeString& fillin) { + const char *str = udbg_enumName(type, field); + if(str == nullptr) { + return fillin.remove(); + } else { + return fillin = UnicodeString(str, -1, US_INV); + } +} + +U_CDECL_BEGIN +static void udbg_cleanup() { + if(strs != nullptr) { + for(int t=0;t<=UDBG_ENUM_COUNT;t++) { + delete [] strs[t]; + } + delete[] strs; + strs = nullptr; + } +} + +static UBool tu_cleanup() +{ + udbg_cleanup(); + return true; +} + +static void udbg_register_cleanup() { + ucln_registerCleanup(UCLN_TOOLUTIL, tu_cleanup); +} +U_CDECL_END + +static void udbg_setup() { + if(strs == nullptr) { + udbg_register_cleanup(); + //fprintf(stderr,"Initializing string cache..\n"); + //fflush(stderr); + UnicodeString **newStrs = new UnicodeString*[UDBG_ENUM_COUNT+1]; + for(int t=0;t<UDBG_ENUM_COUNT;t++) { + int32_t c = udbg_enumCount((UDebugEnumType)t); + newStrs[t] = new UnicodeString[c+1]; + for(int f=0;f<=c;f++) { + _fieldString((UDebugEnumType)t, f, newStrs[t][f]); + } + } + newStrs[UDBG_ENUM_COUNT] = new UnicodeString[1]; // empty string + + strs = newStrs; + } +} + + + +U_TOOLUTIL_API const UnicodeString& U_EXPORT2 udbg_enumString(UDebugEnumType type, int32_t field) { + if(strs == nullptr ) { + udbg_setup(); + } + if(type<0||type>=UDBG_ENUM_COUNT) { + // use UDBG_ENUM_COUNT,0 to mean an empty string + //fprintf(stderr, "** returning out of range on %d\n",type); + //fflush(stderr); + return strs[UDBG_ENUM_COUNT][0]; + } + int32_t count = udbg_enumCount(type); + //fprintf(stderr, "enumString [%d,%d]: typecount %d, fieldcount 
%d\n", type,field,UDBG_ENUM_COUNT,count); + //fflush(stderr); + if(field<0 || field > count) { + return strs[type][count]; + } else { return strs[type][field]; + } +} + +U_CAPI int32_t U_EXPORT2 udbg_enumByString(UDebugEnumType type, const UnicodeString& string) { + if(type<0||type>=UDBG_ENUM_COUNT) { + return -1; + } + // initialize array + udbg_enumString(type,0); + // search + /// printf("type=%d\n", type); fflush(stdout); + for(int i=0;i<udbg_enumCount(type);i++) { +// printf("i=%d/%d\n", i, udbg_enumCount(type)); fflush(stdout); + if(string == (strs[type][i])) { + return i; + } + } + return -1; +} + +// from DataMap::utoi +U_CAPI int32_t +udbg_stoi(const UnicodeString &s) +{ + char ch[256]; + const char16_t *u = toUCharPtr(s.getBuffer()); + int32_t len = s.length(); + u_UCharsToChars(u, ch, len); + ch[len] = 0; /* include terminating \0 */ + return atoi(ch); +} + + +U_CAPI double +udbg_stod(const UnicodeString &s) +{ + char ch[256]; + const char16_t *u = toUCharPtr(s.getBuffer()); + int32_t len = s.length(); + u_UCharsToChars(u, ch, len); + ch[len] = 0; /* include terminating \0 */ + return atof(ch); +} + +U_CAPI UnicodeString * +udbg_escape(const UnicodeString &src, UnicodeString *dst) +{ + dst->remove(); + for (int32_t i = 0; i < src.length(); ++i) { + char16_t c = src[i]; + if(ICU_Utility::isUnprintable(c)) { + *dst += UnicodeString("["); + ICU_Utility::escapeUnprintable(*dst, c); + *dst += UnicodeString("]"); + } + else { + *dst += c; + } + } + + return dst; +} + + + +#endif diff --git a/intl/icu/source/tools/toolutil/dbgutil.h b/intl/icu/source/tools/toolutil/dbgutil.h new file mode 100644 index 0000000000..43fe2171b4 --- /dev/null +++ b/intl/icu/source/tools/toolutil/dbgutil.h @@ -0,0 +1,45 @@ +// © 2016 and later: Unicode, Inc. and others. 
+// License & terms of use: http://www.unicode.org/copyright.html + +/* +************************************************************************ +* Copyright (c) 2007-2012, International Business Machines +* Corporation and others. All Rights Reserved. +************************************************************************ +*/ + +/** C++ Utilities to aid in debugging **/ + +#ifndef _DBGUTIL_H +#define _DBGUTIL_H + +#include "unicode/utypes.h" +#include "udbgutil.h" +#include "unicode/unistr.h" + +#if !UCONFIG_NO_FORMATTING + +/** + * Returns the name of enum value `field` of enum `type` from a lazily built cache. + * @return the name; the empty string if type is out of range; + * the cache entry at index udbg_enumCount(type) if field is out of range + */ +U_TOOLUTIL_API const icu::UnicodeString& U_EXPORT2 +udbg_enumString(UDebugEnumType type, int32_t field); + +/** + * @return enum offset, or UDBG_INVALID_ENUM on error + */ +U_CAPI int32_t U_EXPORT2 +udbg_enumByString(UDebugEnumType type, const icu::UnicodeString& string); + +/** + * Convert a UnicodeString (with ascii digits) into a number. + * @param s string + * @return numerical value, or 0 on error + */ +U_CAPI int32_t U_EXPORT2 udbg_stoi(const icu::UnicodeString &s); + +/** + * Convert a UnicodeString into a double; the conversion is done by atof(). + * @param s string + * @return numerical value as returned by atof() + */ +U_CAPI double U_EXPORT2 udbg_stod(const icu::UnicodeString &s); + +/** + * Copy s into dst, replacing each code unit that ICU_Utility::isUnprintable() reports + * by its escape sequence enclosed in square brackets. dst is cleared first. + * @return dst + */ +U_CAPI icu::UnicodeString * U_EXPORT2 +udbg_escape(const icu::UnicodeString &s, icu::UnicodeString *dst); + +#endif + +#endif diff --git a/intl/icu/source/tools/toolutil/denseranges.cpp b/intl/icu/source/tools/toolutil/denseranges.cpp new file mode 100644 index 0000000000..f5e52b1bbb --- /dev/null +++ b/intl/icu/source/tools/toolutil/denseranges.cpp @@ -0,0 +1,160 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* Copyright (C) 2010, International Business Machines +* Corporation and others. All Rights Reserved. +******************************************************************************* +* file name: denseranges.cpp +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2010sep25 +* created by: Markus W. 
Scherer +* +* Helper code for finding a small number of dense ranges. +*/ + +#include "unicode/utypes.h" +#include "denseranges.h" + +// Definitions in the anonymous namespace are invisible outside this file. +namespace { + +/** + * Collect up to 15 range gaps and sort them by ascending gap size. + */ +class LargestGaps { +public: + LargestGaps(int32_t max) : maxLength(max<=kCapacity ? max : kCapacity), length(0) {} + + void add(int32_t gapStart, int64_t gapLength) { + int32_t i=length; + while(i>0 && gapLength>gapLengths[i-1]) { + --i; + } + if(i<maxLength) { + // The new gap is now one of the maxLength largest. + // Insert the new gap, moving up smaller ones of the previous + // length largest. + int32_t j= length<maxLength ? length++ : maxLength-1; + while(j>i) { + gapStarts[j]=gapStarts[j-1]; + gapLengths[j]=gapLengths[j-1]; + --j; + } + gapStarts[i]=gapStart; + gapLengths[i]=gapLength; + } + } + + void truncate(int32_t newLength) { + if(newLength<length) { + length=newLength; + } + } + + int32_t count() const { return length; } + int32_t gapStart(int32_t i) const { return gapStarts[i]; } + int64_t gapLength(int32_t i) const { return gapLengths[i]; } + + int32_t firstAfter(int32_t value) const { + if(length==0) { + return -1; + } + int32_t minValue=0; + int32_t minIndex=-1; + for(int32_t i=0; i<length; ++i) { + if(value<gapStarts[i] && (minIndex<0 || gapStarts[i]<minValue)) { + minValue=gapStarts[i]; + minIndex=i; + } + } + return minIndex; + } + +private: + static const int32_t kCapacity=15; + + int32_t maxLength; + int32_t length; + int32_t gapStarts[kCapacity]; + int64_t gapLengths[kCapacity]; +}; + +} // namespace + +/** + * Does it make sense to write 1..capacity ranges? + * Returns 0 if not, otherwise the number of ranges. + * @param values Sorted array of signed-integer values. + * @param length Number of values. + * @param density Minimum average range density, in 256th. (0x100=100%=perfectly dense.) + * Should be 0x80..0x100, must be 1..0x100. 
+ * @param ranges Output ranges array. + * @param capacity Maximum number of ranges. + * @return Minimum number of ranges (at most capacity) that have the desired density, + * or 0 if that density cannot be achieved. + */ +U_CAPI int32_t U_EXPORT2 +uprv_makeDenseRanges(const int32_t values[], int32_t length, + int32_t density, + int32_t ranges[][2], int32_t capacity) { + if(length<=2) { + return 0; + } + int32_t minValue=values[0]; + int32_t maxValue=values[length-1]; // Assume minValue<=maxValue. + // Use int64_t variables for intermediate-value precision and to avoid + // signed-int32_t overflow of maxValue-minValue. + int64_t maxLength=(int64_t)maxValue-(int64_t)minValue+1; + if(length>=(density*maxLength)/0x100) { + // Use one range. + ranges[0][0]=minValue; + ranges[0][1]=maxValue; + return 1; + } + if(length<=4) { + return 0; + } + // See if we can split [minValue, maxValue] into 2..capacity ranges, + // divided by the 1..(capacity-1) largest gaps. + LargestGaps gaps(capacity-1); + int32_t i; + int32_t expectedValue=minValue; + for(i=1; i<length; ++i) { + ++expectedValue; + int32_t actualValue=values[i]; + if(expectedValue!=actualValue) { + gaps.add(expectedValue, (int64_t)actualValue-(int64_t)expectedValue); + expectedValue=actualValue; + } + } + // We know gaps.count()>=1 because we have fewer values (length) than + // the length of the [minValue..maxValue] range (maxLength). + // (Otherwise we would have returned with the one range above.) + int32_t num; + for(i=0, num=2;; ++i, ++num) { + if(i>=gaps.count()) { + // The values are too sparse for capacity or fewer ranges + // of the requested density. + return 0; + } + maxLength-=gaps.gapLength(i); + if(length>num*2 && length>=(density*maxLength)/0x100) { + break; + } + } + // Use the num ranges with the num-1 largest gaps. 
+ gaps.truncate(num-1); + ranges[0][0]=minValue; + for(i=0; i<=num-2; ++i) { + int32_t gapIndex=gaps.firstAfter(minValue); + int32_t gapStart=gaps.gapStart(gapIndex); + ranges[i][1]=gapStart-1; + ranges[i+1][0]=minValue=(int32_t)(gapStart+gaps.gapLength(gapIndex)); + } + ranges[num-1][1]=maxValue; + return num; +} diff --git a/intl/icu/source/tools/toolutil/denseranges.h b/intl/icu/source/tools/toolutil/denseranges.h new file mode 100644 index 0000000000..c489ca47d8 --- /dev/null +++ b/intl/icu/source/tools/toolutil/denseranges.h @@ -0,0 +1,41 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* Copyright (C) 2010, International Business Machines +* Corporation and others. All Rights Reserved. +******************************************************************************* +* file name: denseranges.h +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2010sep25 +* created by: Markus W. Scherer +* +* Helper code for finding a small number of dense ranges. +*/ + +#ifndef __DENSERANGES_H__ +#define __DENSERANGES_H__ + +#include "unicode/utypes.h" + +/** + * Does it make sense to write 1..capacity ranges? + * Returns 0 if not, otherwise the number of ranges. + * @param values Sorted array of signed-integer values. + * @param length Number of values. + * @param density Minimum average range density, in 256th. (0x100=100%=perfectly dense.) + * Should be 0x80..0x100, must be 1..0x100. + * @param ranges Output ranges array. + * @param capacity Maximum number of ranges. + * @return Minimum number of ranges (at most capacity) that have the desired density, + * or 0 if that density cannot be achieved. 
+ */ +U_CAPI int32_t U_EXPORT2 +uprv_makeDenseRanges(const int32_t values[], int32_t length, + int32_t density, + int32_t ranges[][2], int32_t capacity); + +#endif // __DENSERANGES_H__ diff --git a/intl/icu/source/tools/toolutil/filestrm.cpp b/intl/icu/source/tools/toolutil/filestrm.cpp new file mode 100644 index 0000000000..9a2695197a --- /dev/null +++ b/intl/icu/source/tools/toolutil/filestrm.cpp @@ -0,0 +1,227 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +****************************************************************************** +* +* Copyright (C) 1997-2011, International Business Machines +* Corporation and others. All Rights Reserved. +* +****************************************************************************** +* +* File FILESTRM.C +* +* @author Glenn Marcy +* +* Modification History: +* +* Date Name Description +* 5/8/98 gm Created +* 03/02/99 stephen Reordered params in ungetc to match stdio +* Added wopen +* 3/29/99 helena Merged Stephen and Bertrand's changes. +* +****************************************************************************** +*/ + +#include "filestrm.h" + +#include "cmemory.h" + +#include <stdio.h> + +U_CAPI FileStream* U_EXPORT2 +T_FileStream_open(const char* filename, const char* mode) +{ + if(filename != nullptr && *filename != 0 && mode != nullptr && *mode != 0) { + FILE *file = fopen(filename, mode); + return (FileStream*)file; + } else { + return nullptr; + } +} + +/* +U_CAPI FileStream* U_EXPORT2 +T_FileStream_wopen(const wchar_t* filename, const wchar_t* mode) +{ + // TBD: _wfopen is believed to be MS-specific? 
+#if U_PLATFORM_USES_ONLY_WIN32_API + FILE* result = _wfopen(filename, mode); + return (FileStream*)result; +#else + size_t fnMbsSize, mdMbsSize; + char *fn, *md; + FILE *result; + + // convert from wchar_t to char + fnMbsSize = wcstombs(nullptr, filename, ((size_t)-1) >> 1); + fn = (char*)uprv_malloc(fnMbsSize+2); + wcstombs(fn, filename, fnMbsSize); + fn[fnMbsSize] = 0; + + mdMbsSize = wcstombs(nullptr, mode, ((size_t)-1) >> 1); + md = (char*)uprv_malloc(mdMbsSize+2); + wcstombs(md, mode, mdMbsSize); + md[mdMbsSize] = 0; + + result = fopen(fn, md); + uprv_free(fn); + uprv_free(md); + return (FileStream*)result; +#endif +} +*/ +U_CAPI void U_EXPORT2 +T_FileStream_close(FileStream* fileStream) +{ + if (fileStream != 0) + fclose((FILE*)fileStream); +} + +U_CAPI UBool U_EXPORT2 +T_FileStream_file_exists(const char* filename) +{ + FILE* temp = fopen(filename, "r"); + if (temp) { + fclose(temp); + return true; + } else + return false; +} + +/*static const int32_t kEOF; +const int32_t FileStream::kEOF = EOF;*/ + +/* +U_CAPI FileStream* +T_FileStream_tmpfile() +{ + FILE* file = tmpfile(); + return (FileStream*)file; +} +*/ + +U_CAPI int32_t U_EXPORT2 +T_FileStream_read(FileStream* fileStream, void* addr, int32_t len) +{ + return static_cast<int32_t>(fread(addr, 1, len, (FILE*)fileStream)); +} + +U_CAPI int32_t U_EXPORT2 +T_FileStream_write(FileStream* fileStream, const void* addr, int32_t len) +{ + + return static_cast<int32_t>(fwrite(addr, 1, len, (FILE*)fileStream)); +} + +U_CAPI void U_EXPORT2 +T_FileStream_rewind(FileStream* fileStream) +{ + rewind((FILE*)fileStream); +} + +U_CAPI int32_t U_EXPORT2 +T_FileStream_putc(FileStream* fileStream, int32_t ch) +{ + int32_t c = fputc(ch, (FILE*)fileStream); + return c; +} + +U_CAPI int U_EXPORT2 +T_FileStream_getc(FileStream* fileStream) +{ + int c = fgetc((FILE*)fileStream); + return c; +} + +U_CAPI int32_t U_EXPORT2 +T_FileStream_ungetc(int32_t ch, FileStream* fileStream) +{ + + int32_t c = ungetc(ch, (FILE*)fileStream); + 
return c; +} + +U_CAPI int32_t U_EXPORT2 +T_FileStream_peek(FileStream* fileStream) +{ + int32_t c = fgetc((FILE*)fileStream); + return ungetc(c, (FILE*)fileStream); +} + +U_CAPI char* U_EXPORT2 +T_FileStream_readLine(FileStream* fileStream, char* buffer, int32_t length) +{ + return fgets(buffer, length, (FILE*)fileStream); +} + +U_CAPI int32_t U_EXPORT2 +T_FileStream_writeLine(FileStream* fileStream, const char* buffer) +{ + return fputs(buffer, (FILE*)fileStream); +} + +U_CAPI int32_t U_EXPORT2 +T_FileStream_size(FileStream* fileStream) +{ + int32_t savedPos = ftell((FILE*)fileStream); + int32_t size = 0; + + /*Changes by Bertrand A. D. doesn't affect the current position + goes to the end of the file before ftell*/ + fseek((FILE*)fileStream, 0, SEEK_END); + size = (int32_t)ftell((FILE*)fileStream); + fseek((FILE*)fileStream, savedPos, SEEK_SET); + return size; +} + +U_CAPI int U_EXPORT2 +T_FileStream_eof(FileStream* fileStream) +{ + return feof((FILE*)fileStream); +} + +/* + Warning + This function may not work consistently on all platforms + (e.g. HP-UX, FreeBSD and MacOSX don't return an error when + putc is used on a file opened as readonly) +*/ +U_CAPI int U_EXPORT2 +T_FileStream_error(FileStream* fileStream) +{ + return (fileStream == 0 || ferror((FILE*)fileStream)); +} + +/* This function doesn't work. 
*/ +/* force the stream to set its error flag*/ +/*U_CAPI void U_EXPORT2 +T_FileStream_setError(FileStream* fileStream) +{ + fseek((FILE*)fileStream, 99999, SEEK_SET); +} +*/ + +U_CAPI FileStream* U_EXPORT2 +T_FileStream_stdin() +{ + return (FileStream*)stdin; +} + +U_CAPI FileStream* U_EXPORT2 +T_FileStream_stdout() +{ + return (FileStream*)stdout; +} + + +U_CAPI FileStream* U_EXPORT2 +T_FileStream_stderr() +{ + return (FileStream*)stderr; +} + +U_CAPI UBool U_EXPORT2 +T_FileStream_remove(const char* fileName){ + return (remove(fileName) == 0); +} diff --git a/intl/icu/source/tools/toolutil/filestrm.h b/intl/icu/source/tools/toolutil/filestrm.h new file mode 100644 index 0000000000..86fac3063f --- /dev/null +++ b/intl/icu/source/tools/toolutil/filestrm.h @@ -0,0 +1,106 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +****************************************************************************** +* +* Copyright (C) 1997-2005, International Business Machines +* Corporation and others. All Rights Reserved. +* +****************************************************************************** +* +* File FILESTRM.H +* +* Contains FileStream interface +* +* @author Glenn Marcy +* +* Modification History: +* +* Date Name Description +* 5/8/98 gm Created. 
+* 03/02/99 stephen Reordered params in ungetc to match stdio +* Added wopen +* +****************************************************************************** +*/ + +#ifndef FILESTRM_H +#define FILESTRM_H + +#include "unicode/utypes.h" + +/* Opaque stream handle; the implementation casts it to and from a stdio FILE*. */ +typedef struct _FileStream FileStream; + +/* Opens filename with fopen(filename, mode); returns a null pointer if either argument is null or empty, or if fopen() fails. */ +U_CAPI FileStream* U_EXPORT2 +T_FileStream_open(const char* filename, const char* mode); + +/* +U_CAPI FileStream* U_EXPORT2 +T_FileStream_wopen(const wchar_t* filename, const wchar_t* mode); +*/ +/* Closes the stream with fclose(); a null stream is ignored. */ +U_CAPI void U_EXPORT2 +T_FileStream_close(FileStream* fileStream); + +/* Returns true if the file can be opened for reading. */ +U_CAPI UBool U_EXPORT2 +T_FileStream_file_exists(const char* filename); + +/* +U_CAPI FileStream* U_EXPORT2 +T_FileStream_tmpfile(void); +*/ + +/* Reads up to len bytes into addr with fread(); returns the number of bytes read. */ +U_CAPI int32_t U_EXPORT2 +T_FileStream_read(FileStream* fileStream, void* addr, int32_t len); + +/* Writes len bytes from addr with fwrite(); returns the number of bytes written. */ +U_CAPI int32_t U_EXPORT2 +T_FileStream_write(FileStream* fileStream, const void* addr, int32_t len); + +U_CAPI void U_EXPORT2 +T_FileStream_rewind(FileStream* fileStream); + +/*Added by Bertrand A. D. */ +U_CAPI char * U_EXPORT2 +T_FileStream_readLine(FileStream* fileStream, char* buffer, int32_t length); + +U_CAPI int32_t U_EXPORT2 +T_FileStream_writeLine(FileStream* fileStream, const char* buffer); + +U_CAPI int32_t U_EXPORT2 +T_FileStream_putc(FileStream* fileStream, int32_t ch); + +U_CAPI int U_EXPORT2 +T_FileStream_getc(FileStream* fileStream); + +U_CAPI int32_t U_EXPORT2 +T_FileStream_ungetc(int32_t ch, FileStream *fileStream); + +/* Reads one character with fgetc() and pushes it back with ungetc(); returns the result of ungetc(). */ +U_CAPI int32_t U_EXPORT2 +T_FileStream_peek(FileStream* fileStream); + +/* Returns the position reported by ftell() after seeking to the end of the stream; the previous position is then restored. */ +U_CAPI int32_t U_EXPORT2 +T_FileStream_size(FileStream* fileStream); + +U_CAPI int U_EXPORT2 +T_FileStream_eof(FileStream* fileStream); + +/* Returns nonzero if fileStream is null or ferror() reports an error. */ +U_CAPI int U_EXPORT2 +T_FileStream_error(FileStream* fileStream); + +/* +U_CAPI void U_EXPORT2 +T_FileStream_setError(FileStream* fileStream); +*/ + +U_CAPI FileStream* U_EXPORT2 +T_FileStream_stdin(void); + +U_CAPI FileStream* U_EXPORT2 +T_FileStream_stdout(void); + +U_CAPI FileStream* U_EXPORT2 +T_FileStream_stderr(void); + +U_CAPI UBool U_EXPORT2 
+T_FileStream_remove(const char* fileName); + +#endif /* _FILESTRM*/ diff --git a/intl/icu/source/tools/toolutil/filetools.cpp b/intl/icu/source/tools/toolutil/filetools.cpp new file mode 100644 index 0000000000..994d8e31f0 --- /dev/null +++ b/intl/icu/source/tools/toolutil/filetools.cpp @@ -0,0 +1,140 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/****************************************************************************** + * Copyright (C) 2009-2013, International Business Machines + * Corporation and others. All Rights Reserved. + ******************************************************************************* + */ + +#include "unicode/platform.h" +#if U_PLATFORM == U_PF_MINGW +// *cough* - for struct stat +#ifdef __STRICT_ANSI__ +#undef __STRICT_ANSI__ +#endif +#endif + +#include "filetools.h" +#include "filestrm.h" +#include "charstr.h" +#include "cstring.h" +#include "unicode/putil.h" +#include "putilimp.h" + +#include <stdio.h> +#include <stdlib.h> +#include <sys/stat.h> +#include <time.h> +#include <string.h> + +#if U_HAVE_DIRENT_H +#include <dirent.h> +typedef struct dirent DIRENT; + +#define SKIP1 "." +#define SKIP2 ".." +#endif + +static int32_t whichFileModTimeIsLater(const char *file1, const char *file2); + +/* + * Goes through the given directory recursive to compare each file's modification time with that of the file given. + * Also can be given just one file to check against. Default value for isDir is false. 
+ */ +U_CAPI UBool U_EXPORT2 +isFileModTimeLater(const char *filePath, const char *checkAgainst, UBool isDir) { + UBool isLatest = true; + + if (filePath == nullptr || checkAgainst == nullptr) { + return false; + } + + if (isDir == true) { +#if U_HAVE_DIRENT_H + DIR *pDir = nullptr; + if ((pDir= opendir(checkAgainst)) != nullptr) { + DIR *subDirp = nullptr; + DIRENT *dirEntry = nullptr; + + while ((dirEntry = readdir(pDir)) != nullptr) { + if (uprv_strcmp(dirEntry->d_name, SKIP1) != 0 && uprv_strcmp(dirEntry->d_name, SKIP2) != 0) { + UErrorCode status = U_ZERO_ERROR; + icu::CharString newpath(checkAgainst, -1, status); + newpath.append(U_FILE_SEP_STRING, -1, status); + newpath.append(dirEntry->d_name, -1, status); + if (U_FAILURE(status)) { + fprintf(stderr, "%s:%d: %s\n", __FILE__, __LINE__, u_errorName(status)); + return false; + } + + if ((subDirp = opendir(newpath.data())) != nullptr) { + /* If this new path is a directory, make a recursive call with the newpath. */ + closedir(subDirp); + isLatest = isFileModTimeLater(filePath, newpath.data(), isDir); + if (!isLatest) { + break; + } + } else { + int32_t latest = whichFileModTimeIsLater(filePath, newpath.data()); + if (latest < 0 || latest == 2) { + isLatest = false; + break; + } + } + + } + } + closedir(pDir); + } else { + fprintf(stderr, "Unable to open directory: %s\n", checkAgainst); + return false; + } +#endif + } else { + if (T_FileStream_file_exists(checkAgainst)) { + int32_t latest = whichFileModTimeIsLater(filePath, checkAgainst); + if (latest < 0 || latest == 2) { + isLatest = false; + } + } else { + isLatest = false; + } + } + + return isLatest; +} + +/* Compares the mod time of both files returning a number indicating which one is later. -1 if error ocurs. 
*/ +static int32_t whichFileModTimeIsLater(const char *file1, const char *file2) { + int32_t result = 0; + struct stat stbuf1, stbuf2; + + if (stat(file1, &stbuf1) == 0 && stat(file2, &stbuf2) == 0) { + time_t modtime1, modtime2; + double diff; + + modtime1 = stbuf1.st_mtime; + modtime2 = stbuf2.st_mtime; + + diff = difftime(modtime1, modtime2); + if (diff < 0.0) { + result = 2; + } else if (diff > 0.0) { + result = 1; + } + + } else { + fprintf(stderr, "Unable to get stats from file: %s or %s\n", file1, file2); + result = -1; + } + + return result; +} + +/* Swap the file separater character given with the new one in the file path. */ +U_CAPI void U_EXPORT2 +swapFileSepChar(char *filePath, const char oldFileSepChar, const char newFileSepChar) { + for (int32_t i = 0, length = static_cast<int32_t>(uprv_strlen(filePath)); i < length; i++) { + filePath[i] = (filePath[i] == oldFileSepChar ) ? newFileSepChar : filePath[i]; + } +} diff --git a/intl/icu/source/tools/toolutil/filetools.h b/intl/icu/source/tools/toolutil/filetools.h new file mode 100644 index 0000000000..40a606a7d4 --- /dev/null +++ b/intl/icu/source/tools/toolutil/filetools.h @@ -0,0 +1,34 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2009, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: filetools.h +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2009jan09 +* created by: Michael Ow +* +* Contains various functions to handle files. +* Not suitable for production use. Not supported. +* Not conformant. Not efficient. 
+*/ + +#ifndef __FILETOOLS_H__ +#define __FILETOOLS_H__ + +#include "unicode/utypes.h" + +/* + * Returns true if the modification time of filePath is at least as recent as that of checkAgainst. + * When isDir is true, checkAgainst is a directory and every file found recursively under it is compared. + * Returns false on null arguments, when a file cannot be examined, or when any compared file is newer. + */ +U_CAPI UBool U_EXPORT2 +isFileModTimeLater(const char *filePath, const char *checkAgainst, UBool isDir=false); + +/* Replaces, in place, every occurrence of oldFileSepChar in filePath with newFileSepChar. */ +U_CAPI void U_EXPORT2 +swapFileSepChar(char *filePath, const char oldFileSepChar, const char newFileSepChar); + +#endif diff --git a/intl/icu/source/tools/toolutil/flagparser.cpp b/intl/icu/source/tools/toolutil/flagparser.cpp new file mode 100644 index 0000000000..8bbceb4f73 --- /dev/null +++ b/intl/icu/source/tools/toolutil/flagparser.cpp @@ -0,0 +1,180 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/****************************************************************************** + * Copyright (C) 2009-2015, International Business Machines + * Corporation and others. All Rights Reserved. + ******************************************************************************* + */ + +#include "flagparser.h" +#include "filestrm.h" +#include "cstring.h" +#include "cmemory.h" + +#define DEFAULT_BUFFER_SIZE 512 + +static int32_t currentBufferSize = DEFAULT_BUFFER_SIZE; + +static int32_t extractFlag(char* buffer, int32_t bufferSize, char* flag, int32_t flagSize, const char ** flagNames, int32_t numOfFlags, UErrorCode *status); +static int32_t getFlagOffset(const char *buffer, int32_t bufferSize); + +/* + * Opens the given fileName and reads in the information storing the data in flagBuffer. 
+ */ +U_CAPI int32_t U_EXPORT2 +parseFlagsFile(const char *fileName, char **flagBuffer, int32_t flagBufferSize, const char ** flagNames, int32_t numOfFlags, UErrorCode *status) { + char* buffer = nullptr; + char* tmpFlagBuffer = nullptr; + UBool allocateMoreSpace = false; + int32_t idx, i; + int32_t result = 0; + + FileStream *f = T_FileStream_open(fileName, "r"); + if (f == nullptr) { + *status = U_FILE_ACCESS_ERROR; + goto parseFlagsFile_cleanup; + } + + buffer = (char *)uprv_malloc(sizeof(char) * currentBufferSize); + tmpFlagBuffer = (char *)uprv_malloc(sizeof(char) * flagBufferSize); + + if (buffer == nullptr || tmpFlagBuffer == nullptr) { + *status = U_MEMORY_ALLOCATION_ERROR; + goto parseFlagsFile_cleanup; + } + + do { + if (allocateMoreSpace) { + allocateMoreSpace = false; + currentBufferSize *= 2; + uprv_free(buffer); + buffer = (char *)uprv_malloc(sizeof(char) * currentBufferSize); + if (buffer == nullptr) { + *status = U_MEMORY_ALLOCATION_ERROR; + goto parseFlagsFile_cleanup; + } + } + for (i = 0; i < numOfFlags;) { + if (T_FileStream_readLine(f, buffer, currentBufferSize) == nullptr) { + /* End of file reached. */ + break; + } + if (buffer[0] == '#') { + continue; + } + + if ((int32_t)uprv_strlen(buffer) == (currentBufferSize - 1) && buffer[currentBufferSize-2] != '\n') { + /* Allocate more space for buffer if it did not read the entire line */ + allocateMoreSpace = true; + T_FileStream_rewind(f); + break; + } else { + idx = extractFlag(buffer, currentBufferSize, tmpFlagBuffer, flagBufferSize, flagNames, numOfFlags, status); + if (U_FAILURE(*status)) { + if (*status == U_BUFFER_OVERFLOW_ERROR) { + result = currentBufferSize; + } else { + result = -1; + } + break; + } else { + if (flagNames != nullptr) { + if (idx >= 0) { + uprv_strcpy(flagBuffer[idx], tmpFlagBuffer); + } else { + /* No match found. Skip it. 
*/ + continue; + } + } else { + uprv_strcpy(flagBuffer[i++], tmpFlagBuffer); + } + } + } + } + } while (allocateMoreSpace && U_SUCCESS(*status)); + +parseFlagsFile_cleanup: + uprv_free(tmpFlagBuffer); + uprv_free(buffer); + + T_FileStream_close(f); + + if (U_FAILURE(*status) && *status != U_BUFFER_OVERFLOW_ERROR) { + return -1; + } + + if (U_SUCCESS(*status) && result == 0) { + currentBufferSize = DEFAULT_BUFFER_SIZE; + } + + return result; +} + + +/* + * Extract the setting after the '=' and store it in flag excluding the newline character. + */ +static int32_t extractFlag(char* buffer, int32_t bufferSize, char* flag, int32_t flagSize, const char **flagNames, int32_t numOfFlags, UErrorCode *status) { + int32_t i, idx = -1; + char *pBuffer; + int32_t offset=0; + UBool bufferWritten = false; + + if (buffer[0] != 0) { + /* Get the offset (i.e. position after the '=') */ + offset = getFlagOffset(buffer, bufferSize); + pBuffer = buffer+offset; + for(i = 0;;i++) { + if (i >= flagSize) { + *status = U_BUFFER_OVERFLOW_ERROR; + return -1; + } + if (pBuffer[i+1] == 0) { + /* Indicates a new line character. End here. */ + flag[i] = 0; + break; + } + + flag[i] = pBuffer[i]; + if (i == 0) { + bufferWritten = true; + } + } + } + + if (!bufferWritten) { + flag[0] = 0; + } + + if (flagNames != nullptr && offset>0) { + offset--; /* Move offset back 1 because of '='*/ + for (i = 0; i < numOfFlags; i++) { + if (uprv_strncmp(buffer, flagNames[i], offset) == 0) { + idx = i; + break; + } + } + } + + return idx; +} + +/* + * Get the position after the '=' character. 
+ */ +static int32_t getFlagOffset(const char *buffer, int32_t bufferSize) { + int32_t offset = 0; + + for (offset = 0; offset < bufferSize;offset++) { + if (buffer[offset] == '=') { + offset++; + break; + } + } + + if (offset == bufferSize || (offset - 1) == bufferSize) { + offset = 0; + } + + return offset; +} diff --git a/intl/icu/source/tools/toolutil/flagparser.h b/intl/icu/source/tools/toolutil/flagparser.h new file mode 100644 index 0000000000..aa42547164 --- /dev/null +++ b/intl/icu/source/tools/toolutil/flagparser.h @@ -0,0 +1,32 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2009-2011, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: flagparser.h +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2009jan08 +* created by: Michael Ow +* +* Tiny flag file parser using ICU and intended for use in ICU tests and in build tools. +* Not suitable for production use. Not supported. +* Not conformant. Not efficient. +* But very small. +*/ + +#ifndef __FLAGPARSER_H__ +#define __FLAGPARSER_H__ + +#include "unicode/utypes.h" + +U_CAPI int32_t U_EXPORT2 +parseFlagsFile(const char *fileName, char **flagBuffer, int32_t flagBufferSize, const char ** flagNames, int32_t numOfFlags, UErrorCode *status); + +#endif diff --git a/intl/icu/source/tools/toolutil/package.cpp b/intl/icu/source/tools/toolutil/package.cpp new file mode 100644 index 0000000000..3098f5d57d --- /dev/null +++ b/intl/icu/source/tools/toolutil/package.cpp @@ -0,0 +1,1311 @@ +// © 2016 and later: Unicode, Inc. and others. 
+// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 1999-2015, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: package.cpp +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2005aug25 +* created by: Markus W. Scherer +* +* Read, modify, and write ICU .dat data package files. +* This is an integral part of the icupkg tool, moved to the toolutil library +* because parts of tool implementations tend to be later shared by +* other tools. +* Subsumes functionality and implementation code from +* gencmn, decmn, and icuswap tools. +*/ + +#include "unicode/utypes.h" +#include "unicode/putil.h" +#include "unicode/udata.h" +#include "cstring.h" +#include "uarrsort.h" +#include "ucmndata.h" +#include "udataswp.h" +#include "swapimpl.h" +#include "toolutil.h" +#include "package.h" +#include "cmemory.h" + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + + +static const int32_t kItemsChunk = 256; /* How much to increase the filesarray by each time */ + +// general definitions ----------------------------------------------------- *** + +/* UDataInfo cf. 
udata.h */ +static const UDataInfo dataInfo={ + (uint16_t)sizeof(UDataInfo), + 0, + + U_IS_BIG_ENDIAN, + U_CHARSET_FAMILY, + (uint8_t)sizeof(char16_t), + 0, + + {0x43, 0x6d, 0x6e, 0x44}, /* dataFormat="CmnD" */ + {1, 0, 0, 0}, /* formatVersion */ + {3, 0, 0, 0} /* dataVersion */ +}; + +U_CDECL_BEGIN +static void U_CALLCONV +printPackageError(void *context, const char *fmt, va_list args) { + vfprintf((FILE *)context, fmt, args); +} +U_CDECL_END + +static uint16_t +readSwapUInt16(uint16_t x) { + return (uint16_t)((x<<8)|(x>>8)); +} + +// platform types ---------------------------------------------------------- *** + +static const char *types="lb?e"; + +enum { TYPE_L, TYPE_B, TYPE_LE, TYPE_E, TYPE_COUNT }; + +static inline int32_t +makeTypeEnum(uint8_t charset, UBool isBigEndian) { + return 2*(int32_t)charset+isBigEndian; +} + +static inline int32_t +makeTypeEnum(char type) { + return + type == 'l' ? TYPE_L : + type == 'b' ? TYPE_B : + type == 'e' ? TYPE_E : + -1; +} + +static inline char +makeTypeLetter(uint8_t charset, UBool isBigEndian) { + return types[makeTypeEnum(charset, isBigEndian)]; +} + +static inline char +makeTypeLetter(int32_t typeEnum) { + return types[typeEnum]; +} + +static void +makeTypeProps(char type, uint8_t &charset, UBool &isBigEndian) { + int32_t typeEnum=makeTypeEnum(type); + charset=(uint8_t)(typeEnum>>1); + isBigEndian=(UBool)(typeEnum&1); +} + +U_CFUNC const UDataInfo * +getDataInfo(const uint8_t *data, int32_t length, + int32_t &infoLength, int32_t &headerLength, + UErrorCode *pErrorCode) { + const DataHeader *pHeader; + const UDataInfo *pInfo; + + if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) { + return nullptr; + } + if( data==nullptr || + (length>=0 && length<(int32_t)sizeof(DataHeader)) + ) { + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; + return nullptr; + } + + pHeader=(const DataHeader *)data; + pInfo=&pHeader->info; + if( (length>=0 && length<(int32_t)sizeof(DataHeader)) || + pHeader->dataHeader.magic1!=0xda || + 
pHeader->dataHeader.magic2!=0x27 || + pInfo->sizeofUChar!=2 + ) { + *pErrorCode=U_UNSUPPORTED_ERROR; + return nullptr; + } + + if(pInfo->isBigEndian==U_IS_BIG_ENDIAN) { + headerLength=pHeader->dataHeader.headerSize; + infoLength=pInfo->size; + } else { + headerLength=readSwapUInt16(pHeader->dataHeader.headerSize); + infoLength=readSwapUInt16(pInfo->size); + } + + if( headerLength<(int32_t)sizeof(DataHeader) || + infoLength<(int32_t)sizeof(UDataInfo) || + headerLength<(int32_t)(sizeof(pHeader->dataHeader)+infoLength) || + (length>=0 && length<headerLength) + ) { + *pErrorCode=U_UNSUPPORTED_ERROR; + return nullptr; + } + + return pInfo; +} + +static int32_t +getTypeEnumForInputData(const uint8_t *data, int32_t length, + UErrorCode *pErrorCode) { + const UDataInfo *pInfo; + int32_t infoLength, headerLength; + + /* getDataInfo() checks for illegal arguments */ + pInfo=getDataInfo(data, length, infoLength, headerLength, pErrorCode); + if(pInfo==nullptr) { + return -1; + } + + return makeTypeEnum(pInfo->charsetFamily, (UBool)pInfo->isBigEndian); +} + +// file handling ----------------------------------------------------------- *** + +static void +extractPackageName(const char *filename, + char pkg[], int32_t capacity) { + const char *basename; + int32_t len; + + basename=findBasename(filename); + len=(int32_t)strlen(basename)-4; /* -4: subtract the length of ".dat" */ + + if(len<=0 || 0!=strcmp(basename+len, ".dat")) { + fprintf(stderr, "icupkg: \"%s\" is not recognized as a package filename (must end with .dat)\n", + basename); + exit(U_ILLEGAL_ARGUMENT_ERROR); + } + + if(len>=capacity) { + fprintf(stderr, "icupkg: the package name \"%s\" is too long (>=%ld)\n", + basename, (long)capacity); + exit(U_ILLEGAL_ARGUMENT_ERROR); + } + + memcpy(pkg, basename, len); + pkg[len]=0; +} + +static int32_t +getFileLength(FILE *f) { + int32_t length; + + fseek(f, 0, SEEK_END); + length=(int32_t)ftell(f); + fseek(f, 0, SEEK_SET); + return length; +} + +/* + * Turn tree separators and 
alternate file separators into normal file separators. + */ +#if U_TREE_ENTRY_SEP_CHAR==U_FILE_SEP_CHAR && U_FILE_ALT_SEP_CHAR==U_FILE_SEP_CHAR +#define treeToPath(s) +#else +static void +treeToPath(char *s) { + char *t; + + for(t=s; *t!=0; ++t) { + if(*t==U_TREE_ENTRY_SEP_CHAR || *t==U_FILE_ALT_SEP_CHAR) { + *t=U_FILE_SEP_CHAR; + } + } +} +#endif + +/* + * Turn file separators into tree separators. + */ +#if U_TREE_ENTRY_SEP_CHAR==U_FILE_SEP_CHAR && U_FILE_ALT_SEP_CHAR==U_FILE_SEP_CHAR +#define pathToTree(s) +#else +static void +pathToTree(char *s) { + char *t; + + for(t=s; *t!=0; ++t) { + if(*t==U_FILE_SEP_CHAR || *t==U_FILE_ALT_SEP_CHAR) { + *t=U_TREE_ENTRY_SEP_CHAR; + } + } +} +#endif + +/* + * Prepend the path (if any) to the name and run the name through treeToPath(). + */ +static void +makeFullFilename(const char *path, const char *name, + char *filename, int32_t capacity) { + char *s; + + // prepend the path unless nullptr or empty + if(path!=nullptr && path[0]!=0) { + if((int32_t)(strlen(path)+1)>=capacity) { + fprintf(stderr, "pathname too long: \"%s\"\n", path); + exit(U_BUFFER_OVERFLOW_ERROR); + } + strcpy(filename, path); + + // make sure the path ends with a file separator + s=strchr(filename, 0); + if(*(s-1)!=U_FILE_SEP_CHAR && *(s-1)!=U_FILE_ALT_SEP_CHAR) { + *s++=U_FILE_SEP_CHAR; + } + } else { + s=filename; + } + + // turn the name into a filename, turn tree separators into file separators + if((int32_t)((s-filename)+strlen(name))>=capacity) { + fprintf(stderr, "path/filename too long: \"%s%s\"\n", filename, name); + exit(U_BUFFER_OVERFLOW_ERROR); + } + strcpy(s, name); + treeToPath(s); +} + +static void +makeFullFilenameAndDirs(const char *path, const char *name, + char *filename, int32_t capacity) { + char *sep; + UErrorCode errorCode; + + makeFullFilename(path, name, filename, capacity); + + // make tree directories + errorCode=U_ZERO_ERROR; + sep=strchr(filename, 0)-strlen(name); + while((sep=strchr(sep, U_FILE_SEP_CHAR))!=nullptr) { + 
if(sep!=filename) { + *sep=0; // truncate temporarily + uprv_mkdir(filename, &errorCode); + if(U_FAILURE(errorCode)) { + fprintf(stderr, "icupkg: unable to create tree directory \"%s\"\n", filename); + exit(U_FILE_ACCESS_ERROR); + } + } + *sep++=U_FILE_SEP_CHAR; // restore file separator character + } +} + +static uint8_t * +readFile(const char *path, const char *name, int32_t &length, char &type) { + char filename[1024]; + FILE *file; + UErrorCode errorCode; + int32_t fileLength, typeEnum; + + makeFullFilename(path, name, filename, (int32_t)sizeof(filename)); + + /* open the input file, get its length, allocate memory for it, read the file */ + file=fopen(filename, "rb"); + if(file==nullptr) { + fprintf(stderr, "icupkg: unable to open input file \"%s\"\n", filename); + exit(U_FILE_ACCESS_ERROR); + } + + /* get the file length */ + fileLength=getFileLength(file); + if(ferror(file) || fileLength<=0) { + fprintf(stderr, "icupkg: empty input file \"%s\"\n", filename); + fclose(file); + exit(U_FILE_ACCESS_ERROR); + } + + /* allocate the buffer, pad to multiple of 16 */ + length=(fileLength+0xf)&~0xf; + icu::LocalMemory<uint8_t> data((uint8_t *)uprv_malloc(length)); + if(data.isNull()) { + fclose(file); + fprintf(stderr, "icupkg: malloc error allocating %d bytes.\n", (int)length); + exit(U_MEMORY_ALLOCATION_ERROR); + } + + /* read the file */ + if(fileLength!=(int32_t)fread(data.getAlias(), 1, fileLength, file)) { + fprintf(stderr, "icupkg: error reading \"%s\"\n", filename); + fclose(file); + exit(U_FILE_ACCESS_ERROR); + } + + /* pad the file to a multiple of 16 using the usual padding byte */ + if(fileLength<length) { + memset(data.getAlias()+fileLength, 0xaa, length-fileLength); + } + + fclose(file); + + // minimum check for ICU-format data + errorCode=U_ZERO_ERROR; + typeEnum=getTypeEnumForInputData(data.getAlias(), length, &errorCode); + if(typeEnum<0 || U_FAILURE(errorCode)) { + fprintf(stderr, "icupkg: not an ICU data file: \"%s\"\n", filename); +#if 
!UCONFIG_NO_LEGACY_CONVERSION + exit(U_INVALID_FORMAT_ERROR); +#else + fprintf(stderr, "U_INVALID_FORMAT_ERROR occurred but UCONFIG_NO_LEGACY_CONVERSION is on so this is expected.\n"); + exit(0); +#endif + } + type=makeTypeLetter(typeEnum); + + return data.orphan(); +} + +// .dat package file representation ---------------------------------------- *** + +U_CDECL_BEGIN + +static int32_t U_CALLCONV +compareItems(const void * /*context*/, const void *left, const void *right) { + U_NAMESPACE_USE + + return (int32_t)strcmp(((Item *)left)->name, ((Item *)right)->name); +} + +U_CDECL_END + +U_NAMESPACE_BEGIN + +Package::Package() + : doAutoPrefix(false), prefixEndsWithType(false) { + inPkgName[0]=0; + pkgPrefix[0]=0; + inData=nullptr; + inLength=0; + inCharset=U_CHARSET_FAMILY; + inIsBigEndian=U_IS_BIG_ENDIAN; + + itemCount=0; + itemMax=0; + items=nullptr; + + inStringTop=outStringTop=0; + + matchMode=0; + findPrefix=findSuffix=nullptr; + findPrefixLength=findSuffixLength=0; + findNextIndex=-1; + + // create a header for an empty package + DataHeader *pHeader; + pHeader=(DataHeader *)header; + pHeader->dataHeader.magic1=0xda; + pHeader->dataHeader.magic2=0x27; + memcpy(&pHeader->info, &dataInfo, sizeof(dataInfo)); + headerLength=(int32_t)(4+sizeof(dataInfo)); + if(headerLength&0xf) { + /* NUL-pad the header to a multiple of 16 */ + int32_t length=(headerLength+0xf)&~0xf; + memset(header+headerLength, 0, length-headerLength); + headerLength=length; + } + pHeader->dataHeader.headerSize=(uint16_t)headerLength; +} + +Package::~Package() { + int32_t idx; + + uprv_free(inData); + + for(idx=0; idx<itemCount; ++idx) { + if(items[idx].isDataOwned) { + uprv_free(items[idx].data); + } + } + + uprv_free((void*)items); +} + +void +Package::setPrefix(const char *p) { + if(strlen(p)>=sizeof(pkgPrefix)) { + fprintf(stderr, "icupkg: --toc_prefix %s too long\n", p); + exit(U_ILLEGAL_ARGUMENT_ERROR); + } + strcpy(pkgPrefix, p); +} + +void +Package::readPackage(const char *filename) { + 
UDataSwapper *ds; + const UDataInfo *pInfo; + UErrorCode errorCode; + + const uint8_t *inBytes; + + int32_t length, offset, i; + int32_t itemLength, typeEnum; + char type; + + const UDataOffsetTOCEntry *inEntries; + + extractPackageName(filename, inPkgName, (int32_t)sizeof(inPkgName)); + + /* read the file */ + inData=readFile(nullptr, filename, inLength, type); + length=inLength; + + /* + * swap the header - even if the swapping itself is a no-op + * because it tells us the header length + */ + errorCode=U_ZERO_ERROR; + makeTypeProps(type, inCharset, inIsBigEndian); + ds=udata_openSwapper(inIsBigEndian, inCharset, U_IS_BIG_ENDIAN, U_CHARSET_FAMILY, &errorCode); + if(U_FAILURE(errorCode)) { + fprintf(stderr, "icupkg: udata_openSwapper(\"%s\") failed - %s\n", + filename, u_errorName(errorCode)); + exit(errorCode); + } + + ds->printError=printPackageError; + ds->printErrorContext=stderr; + + headerLength=sizeof(header); + if(length<headerLength) { + headerLength=length; + } + headerLength=udata_swapDataHeader(ds, inData, headerLength, header, &errorCode); + if(U_FAILURE(errorCode)) { + exit(errorCode); + } + + /* check data format and format version */ + pInfo=(const UDataInfo *)((const char *)inData+4); + if(!( + pInfo->dataFormat[0]==0x43 && /* dataFormat="CmnD" */ + pInfo->dataFormat[1]==0x6d && + pInfo->dataFormat[2]==0x6e && + pInfo->dataFormat[3]==0x44 && + pInfo->formatVersion[0]==1 + )) { + fprintf(stderr, "icupkg: data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as an ICU .dat package\n", + pInfo->dataFormat[0], pInfo->dataFormat[1], + pInfo->dataFormat[2], pInfo->dataFormat[3], + pInfo->formatVersion[0]); + exit(U_UNSUPPORTED_ERROR); + } + inIsBigEndian=(UBool)pInfo->isBigEndian; + inCharset=pInfo->charsetFamily; + + inBytes=(const uint8_t *)inData+headerLength; + inEntries=(const UDataOffsetTOCEntry *)(inBytes+4); + + /* check that the itemCount fits, then the ToC table, then at least the header of the last item */ + 
length-=headerLength; + if(length<4) { + /* itemCount does not fit */ + offset=0x7fffffff; + } else { + itemCount=udata_readInt32(ds, *(const int32_t *)inBytes); + setItemCapacity(itemCount); /* resize so there's space */ + if(itemCount==0) { + offset=4; + } else if(length<(4+8*itemCount)) { + /* ToC table does not fit */ + offset=0x7fffffff; + } else { + /* offset of the last item plus at least 20 bytes for its header */ + offset=20+(int32_t)ds->readUInt32(inEntries[itemCount-1].dataOffset); + } + } + if(length<offset) { + fprintf(stderr, "icupkg: too few bytes (%ld after header) for a .dat package\n", + (long)length); + exit(U_INDEX_OUTOFBOUNDS_ERROR); + } + /* do not modify the package length variable until the last item's length is set */ + + if(itemCount<=0) { + if(doAutoPrefix) { + fprintf(stderr, "icupkg: --auto_toc_prefix[_with_type] but the input package is empty\n"); + exit(U_INVALID_FORMAT_ERROR); + } + } else { + char prefix[MAX_PKG_NAME_LENGTH+4]; + char *s, *inItemStrings; + + if(itemCount>itemMax) { + fprintf(stderr, "icupkg: too many items, maximum is %d\n", itemMax); + exit(U_BUFFER_OVERFLOW_ERROR); + } + + /* swap the item name strings */ + int32_t stringsOffset=4+8*itemCount; + itemLength=(int32_t)(ds->readUInt32(inEntries[0].dataOffset))-stringsOffset; + + // don't include padding bytes at the end of the item names + while(itemLength>0 && inBytes[stringsOffset+itemLength-1]!=0) { + --itemLength; + } + + if((inStringTop+itemLength)>STRING_STORE_SIZE) { + fprintf(stderr, "icupkg: total length of item name strings too long\n"); + exit(U_BUFFER_OVERFLOW_ERROR); + } + + inItemStrings=inStrings+inStringTop; + ds->swapInvChars(ds, inBytes+stringsOffset, itemLength, inItemStrings, &errorCode); + if(U_FAILURE(errorCode)) { + fprintf(stderr, "icupkg failed to swap the input .dat package item name strings\n"); + exit(U_INVALID_FORMAT_ERROR); + } + inStringTop+=itemLength; + + // reset the Item entries + memset(items, 0, itemCount*sizeof(Item)); + + /* + * 
Get the common prefix of the items. + * New-style ICU .dat packages use tree separators ('/') between package names, + * tree names, and item names, + * while old-style ICU .dat packages (before multi-tree support) + * use an underscore ('_') between package and item names. + */ + offset=(int32_t)ds->readUInt32(inEntries[0].nameOffset)-stringsOffset; + s=inItemStrings+offset; // name of the first entry + int32_t prefixLength; + if(doAutoPrefix) { + // Use the first entry's prefix. Must be a new-style package. + const char *prefixLimit=strchr(s, U_TREE_ENTRY_SEP_CHAR); + if(prefixLimit==nullptr) { + fprintf(stderr, + "icupkg: --auto_toc_prefix[_with_type] but " + "the first entry \"%s\" does not contain a '%c'\n", + s, U_TREE_ENTRY_SEP_CHAR); + exit(U_INVALID_FORMAT_ERROR); + } + prefixLength=(int32_t)(prefixLimit-s); + if(prefixLength==0 || prefixLength>=UPRV_LENGTHOF(pkgPrefix)) { + fprintf(stderr, + "icupkg: --auto_toc_prefix[_with_type] but " + "the prefix of the first entry \"%s\" is empty or too long\n", + s); + exit(U_INVALID_FORMAT_ERROR); + } + if(prefixEndsWithType && s[prefixLength-1]!=type) { + fprintf(stderr, + "icupkg: --auto_toc_prefix_with_type but " + "the prefix of the first entry \"%s\" does not end with '%c'\n", + s, type); + exit(U_INVALID_FORMAT_ERROR); + } + memcpy(pkgPrefix, s, prefixLength); + pkgPrefix[prefixLength]=0; + memcpy(prefix, s, ++prefixLength); // include the / + } else { + // Use the package basename as prefix. 
+ int32_t inPkgNameLength= static_cast<int32_t>(strlen(inPkgName)); + memcpy(prefix, inPkgName, inPkgNameLength); + prefixLength=inPkgNameLength; + + if( (int32_t)strlen(s)>=(inPkgNameLength+2) && + 0==memcmp(s, inPkgName, inPkgNameLength) && + s[inPkgNameLength]=='_' + ) { + // old-style .dat package + prefix[prefixLength++]='_'; + } else { + // new-style .dat package + prefix[prefixLength++]=U_TREE_ENTRY_SEP_CHAR; + // if it turns out to not contain U_TREE_ENTRY_SEP_CHAR + // then the test in the loop below will fail + } + } + prefix[prefixLength]=0; + + /* read the ToC table */ + for(i=0; i<itemCount; ++i) { + // skip the package part of the item name, error if it does not match the actual package name + // or if nothing follows the package name + offset=(int32_t)ds->readUInt32(inEntries[i].nameOffset)-stringsOffset; + s=inItemStrings+offset; + if(0!=strncmp(s, prefix, prefixLength) || s[prefixLength]==0) { + fprintf(stderr, "icupkg: input .dat item name \"%s\" does not start with \"%s\"\n", + s, prefix); + exit(U_INVALID_FORMAT_ERROR); + } + items[i].name=s+prefixLength; + + // set the item's data + items[i].data=(uint8_t *)inBytes+ds->readUInt32(inEntries[i].dataOffset); + if(i>0) { + items[i-1].length=(int32_t)(items[i].data-items[i-1].data); + + // set the previous item's platform type + typeEnum=getTypeEnumForInputData(items[i-1].data, items[i-1].length, &errorCode); + if(typeEnum<0 || U_FAILURE(errorCode)) { + fprintf(stderr, "icupkg: not an ICU data file: item \"%s\" in \"%s\"\n", items[i-1].name, filename); + exit(U_INVALID_FORMAT_ERROR); + } + items[i-1].type=makeTypeLetter(typeEnum); + } + items[i].isDataOwned=false; + } + // set the last item's length + items[itemCount-1].length=length-ds->readUInt32(inEntries[itemCount-1].dataOffset); + + // set the last item's platform type + typeEnum=getTypeEnumForInputData(items[itemCount-1].data, items[itemCount-1].length, &errorCode); + if(typeEnum<0 || U_FAILURE(errorCode)) { + fprintf(stderr, "icupkg: not an 
ICU data file: item \"%s\" in \"%s\"\n", items[itemCount-1].name, filename); + exit(U_INVALID_FORMAT_ERROR); + } + items[itemCount-1].type=makeTypeLetter(typeEnum); + + if(type!=U_ICUDATA_TYPE_LETTER[0]) { + // sort the item names for the local charset + sortItems(); + } + } + + udata_closeSwapper(ds); +} + +char +Package::getInType() { + return makeTypeLetter(inCharset, inIsBigEndian); +} + +void +Package::writePackage(const char *filename, char outType, const char *comment) { + char prefix[MAX_PKG_NAME_LENGTH+4]; + UDataOffsetTOCEntry entry; + UDataSwapper *dsLocalToOut, *ds[TYPE_COUNT]; + FILE *file; + Item *pItem; + char *name; + UErrorCode errorCode; + int32_t i, length, prefixLength, maxItemLength, basenameOffset, offset, outInt32; + uint8_t outCharset; + UBool outIsBigEndian; + + extractPackageName(filename, prefix, MAX_PKG_NAME_LENGTH); + + // if there is an explicit comment, then use it, else use what's in the current header + if(comment!=nullptr) { + /* get the header size minus the current comment */ + DataHeader *pHeader; + int32_t length; + + pHeader=(DataHeader *)header; + headerLength=4+pHeader->info.size; + length=(int32_t)strlen(comment); + if((int32_t)(headerLength+length)>=(int32_t)sizeof(header)) { + fprintf(stderr, "icupkg: comment too long\n"); + exit(U_BUFFER_OVERFLOW_ERROR); + } + memcpy(header+headerLength, comment, length+1); + headerLength+=length; + if(headerLength&0xf) { + /* NUL-pad the header to a multiple of 16 */ + length=(headerLength+0xf)&~0xf; + memset(header+headerLength, 0, length-headerLength); + headerLength=length; + } + pHeader->dataHeader.headerSize=(uint16_t)headerLength; + } + + makeTypeProps(outType, outCharset, outIsBigEndian); + + // open (TYPE_COUNT-2) swappers + // one is a no-op for local type==outType + // one type (TYPE_LE) is bogus + errorCode=U_ZERO_ERROR; + i=makeTypeEnum(outType); + ds[TYPE_B]= i==TYPE_B ? 
nullptr : udata_openSwapper(true, U_ASCII_FAMILY, outIsBigEndian, outCharset, &errorCode); + ds[TYPE_L]= i==TYPE_L ? nullptr : udata_openSwapper(false, U_ASCII_FAMILY, outIsBigEndian, outCharset, &errorCode); + ds[TYPE_LE]=nullptr; + ds[TYPE_E]= i==TYPE_E ? nullptr : udata_openSwapper(true, U_EBCDIC_FAMILY, outIsBigEndian, outCharset, &errorCode); + if(U_FAILURE(errorCode)) { + fprintf(stderr, "icupkg: udata_openSwapper() failed - %s\n", u_errorName(errorCode)); + exit(errorCode); + } + for(i=0; i<TYPE_COUNT; ++i) { + if(ds[i]!=nullptr) { + ds[i]->printError=printPackageError; + ds[i]->printErrorContext=stderr; + } + } + + dsLocalToOut=ds[makeTypeEnum(U_CHARSET_FAMILY, U_IS_BIG_ENDIAN)]; + + // create the file and write its contents + file=fopen(filename, "wb"); + if(file==nullptr) { + fprintf(stderr, "icupkg: unable to create file \"%s\"\n", filename); + exit(U_FILE_ACCESS_ERROR); + } + + // swap and write the header + if(dsLocalToOut!=nullptr) { + udata_swapDataHeader(dsLocalToOut, header, headerLength, header, &errorCode); + if(U_FAILURE(errorCode)) { + fprintf(stderr, "icupkg: udata_swapDataHeader(local to out) failed - %s\n", u_errorName(errorCode)); + exit(errorCode); + } + } + length=(int32_t)fwrite(header, 1, headerLength, file); + if(length!=headerLength) { + fprintf(stderr, "icupkg: unable to write complete header to file \"%s\"\n", filename); + exit(U_FILE_ACCESS_ERROR); + } + + // prepare and swap the package name with a tree separator + // for prepending to item names + if(pkgPrefix[0]==0) { + prefixLength=(int32_t)strlen(prefix); + } else { + prefixLength=(int32_t)strlen(pkgPrefix); + memcpy(prefix, pkgPrefix, prefixLength); + if(prefixEndsWithType) { + prefix[prefixLength-1]=outType; + } + } + prefix[prefixLength++]=U_TREE_ENTRY_SEP_CHAR; + prefix[prefixLength]=0; + if(dsLocalToOut!=nullptr) { + dsLocalToOut->swapInvChars(dsLocalToOut, prefix, prefixLength, prefix, &errorCode); + if(U_FAILURE(errorCode)) { + fprintf(stderr, "icupkg: 
swapInvChars(output package name) failed - %s\n", u_errorName(errorCode)); + exit(errorCode); + } + + // swap and sort the item names (sorting needs to be done in the output charset) + dsLocalToOut->swapInvChars(dsLocalToOut, inStrings, inStringTop, inStrings, &errorCode); + if(U_FAILURE(errorCode)) { + fprintf(stderr, "icupkg: swapInvChars(item names) failed - %s\n", u_errorName(errorCode)); + exit(errorCode); + } + sortItems(); + } + + // create the output item names in sorted order, with the package name prepended to each + for(i=0; i<itemCount; ++i) { + length=(int32_t)strlen(items[i].name); + name=allocString(false, length+prefixLength); + memcpy(name, prefix, prefixLength); + memcpy(name+prefixLength, items[i].name, length+1); + items[i].name=name; + } + + // calculate offsets for item names and items, pad to 16-align items + // align only the first item; each item's length is a multiple of 16 + basenameOffset=4+8*itemCount; + offset=basenameOffset+outStringTop; + if((length=(offset&15))!=0) { + length=16-length; + memset(allocString(false, length-1), 0xaa, length); + offset+=length; + } + + // write the table of contents + // first the itemCount + outInt32=itemCount; + if(dsLocalToOut!=nullptr) { + dsLocalToOut->swapArray32(dsLocalToOut, &outInt32, 4, &outInt32, &errorCode); + if(U_FAILURE(errorCode)) { + fprintf(stderr, "icupkg: swapArray32(item count) failed - %s\n", u_errorName(errorCode)); + exit(errorCode); + } + } + length=(int32_t)fwrite(&outInt32, 1, 4, file); + if(length!=4) { + fprintf(stderr, "icupkg: unable to write complete item count to file \"%s\"\n", filename); + exit(U_FILE_ACCESS_ERROR); + } + + // then write the item entries (and collect the maxItemLength) + maxItemLength=0; + for(i=0; i<itemCount; ++i) { + entry.nameOffset=(uint32_t)(basenameOffset+(items[i].name-outStrings)); + entry.dataOffset=(uint32_t)offset; + if(dsLocalToOut!=nullptr) { + dsLocalToOut->swapArray32(dsLocalToOut, &entry, 8, &entry, &errorCode); + 
if(U_FAILURE(errorCode)) { + fprintf(stderr, "icupkg: swapArray32(item entry %ld) failed - %s\n", (long)i, u_errorName(errorCode)); + exit(errorCode); + } + } + length=(int32_t)fwrite(&entry, 1, 8, file); + if(length!=8) { + fprintf(stderr, "icupkg: unable to write complete item entry %ld to file \"%s\"\n", (long)i, filename); + exit(U_FILE_ACCESS_ERROR); + } + + length=items[i].length; + if(length>maxItemLength) { + maxItemLength=length; + } + offset+=length; + } + + // write the item names + length=(int32_t)fwrite(outStrings, 1, outStringTop, file); + if(length!=outStringTop) { + fprintf(stderr, "icupkg: unable to write complete item names to file \"%s\"\n", filename); + exit(U_FILE_ACCESS_ERROR); + } + + // write the items + for(pItem=items, i=0; i<itemCount; ++pItem, ++i) { + int32_t type=makeTypeEnum(pItem->type); + if(ds[type]!=nullptr) { + // swap each item from its platform properties to the desired ones + udata_swap( + ds[type], + pItem->data, pItem->length, pItem->data, + &errorCode); + if(U_FAILURE(errorCode)) { + fprintf(stderr, "icupkg: udata_swap(item %ld) failed - %s\n", (long)i, u_errorName(errorCode)); + exit(errorCode); + } + } + length=(int32_t)fwrite(pItem->data, 1, pItem->length, file); + if(length!=pItem->length) { + fprintf(stderr, "icupkg: unable to write complete item %ld to file \"%s\"\n", (long)i, filename); + exit(U_FILE_ACCESS_ERROR); + } + } + + if(ferror(file)) { + fprintf(stderr, "icupkg: unable to write complete file \"%s\"\n", filename); + exit(U_FILE_ACCESS_ERROR); + } + + fclose(file); + for(i=0; i<TYPE_COUNT; ++i) { + udata_closeSwapper(ds[i]); + } +} + +int32_t +Package::findItem(const char *name, int32_t length) const { + int32_t i, start, limit; + int result; + + /* do a binary search for the string */ + start=0; + limit=itemCount; + while(start<limit) { + i=(start+limit)/2; + if(length>=0) { + result=strncmp(name, items[i].name, length); + } else { + result=strcmp(name, items[i].name); + } + + if(result==0) { + /* found */ + 
if(length>=0) { + /* + * if we compared just prefixes, then we may need to back up + * to the first item with this prefix + */ + while(i>0 && 0==strncmp(name, items[i-1].name, length)) { + --i; + } + } + return i; + } else if(result<0) { + limit=i; + } else /* result>0 */ { + start=i+1; + } + } + + return ~start; /* not found, return binary-not of the insertion point */ +} + +void +Package::findItems(const char *pattern) { + const char *wild; + + if(pattern==nullptr || *pattern==0) { + findNextIndex=-1; + return; + } + + findPrefix=pattern; + findSuffix=nullptr; + findSuffixLength=0; + + wild=strchr(pattern, '*'); + if(wild==nullptr) { + // no wildcard + findPrefixLength=(int32_t)strlen(pattern); + } else { + // one wildcard + findPrefixLength=(int32_t)(wild-pattern); + findSuffix=wild+1; + findSuffixLength=(int32_t)strlen(findSuffix); + if(nullptr!=strchr(findSuffix, '*')) { + // two or more wildcards + fprintf(stderr, "icupkg: syntax error (more than one '*') in item pattern \"%s\"\n", pattern); + exit(U_PARSE_ERROR); + } + } + + if(findPrefixLength==0) { + findNextIndex=0; + } else { + findNextIndex=findItem(findPrefix, findPrefixLength); + } +} + +int32_t +Package::findNextItem() { + const char *name, *middle, *treeSep; + int32_t idx, nameLength, middleLength; + + if(findNextIndex<0) { + return -1; + } + + while(findNextIndex<itemCount) { + idx=findNextIndex++; + name=items[idx].name; + nameLength=(int32_t)strlen(name); + if(nameLength<(findPrefixLength+findSuffixLength)) { + // item name too short for prefix & suffix + continue; + } + if(findPrefixLength>0 && 0!=memcmp(findPrefix, name, findPrefixLength)) { + // left the range of names with this prefix + break; + } + middle=name+findPrefixLength; + middleLength=nameLength-findPrefixLength-findSuffixLength; + if(findSuffixLength>0 && 0!=memcmp(findSuffix, name+(nameLength-findSuffixLength), findSuffixLength)) { + // suffix does not match + continue; + } + // prefix & suffix match + + if(matchMode&MATCH_NOSLASH) 
{ + treeSep=strchr(middle, U_TREE_ENTRY_SEP_CHAR); + if(treeSep!=nullptr && (treeSep-middle)<middleLength) { + // the middle (matching the * wildcard) contains a tree separator / + continue; + } + } + + // found a matching item + return idx; + } + + // no more items + findNextIndex=-1; + return -1; +} + +void +Package::setMatchMode(uint32_t mode) { + matchMode=mode; +} + +void +Package::addItem(const char *name) { + addItem(name, nullptr, 0, false, U_ICUDATA_TYPE_LETTER[0]); +} + +void +Package::addItem(const char *name, uint8_t *data, int32_t length, UBool isDataOwned, char type) { + int32_t idx; + + idx=findItem(name); + if(idx<0) { + // new item, make space at the insertion point + ensureItemCapacity(); + // move the following items down + idx=~idx; + if(idx<itemCount) { + memmove(items+idx+1, items+idx, (itemCount-idx)*sizeof(Item)); + } + ++itemCount; + + // reset this Item entry + memset(items+idx, 0, sizeof(Item)); + + // copy the item's name + items[idx].name=allocString(true, static_cast<int32_t>(strlen(name))); + strcpy(items[idx].name, name); + pathToTree(items[idx].name); + } else { + // same-name item found, replace it + if(items[idx].isDataOwned) { + uprv_free(items[idx].data); + } + + // keep the item's name since it is the same + } + + // set the item's data + items[idx].data=data; + items[idx].length=length; + items[idx].isDataOwned=isDataOwned; + items[idx].type=type; +} + +void +Package::addFile(const char *filesPath, const char *name) { + uint8_t *data; + int32_t length; + char type; + + data=readFile(filesPath, name, length, type); + // readFile() exits the tool if it fails + addItem(name, data, length, true, type); +} + +void +Package::addItems(const Package &listPkg) { + const Item *pItem; + int32_t i; + + for(pItem=listPkg.items, i=0; i<listPkg.itemCount; ++pItem, ++i) { + addItem(pItem->name, pItem->data, pItem->length, false, pItem->type); + } +} + +void +Package::removeItem(int32_t idx) { + if(idx>=0) { + // remove the item + 
if(items[idx].isDataOwned) { + uprv_free(items[idx].data); + } + + // move the following items up + if((idx+1)<itemCount) { + memmove(items+idx, items+idx+1, (itemCount-(idx+1))*sizeof(Item)); + } + --itemCount; + + if(idx<=findNextIndex) { + --findNextIndex; + } + } +} + +void +Package::removeItems(const char *pattern) { + int32_t idx; + + findItems(pattern); + while((idx=findNextItem())>=0) { + removeItem(idx); + } +} + +void +Package::removeItems(const Package &listPkg) { + const Item *pItem; + int32_t i; + + for(pItem=listPkg.items, i=0; i<listPkg.itemCount; ++pItem, ++i) { + removeItems(pItem->name); + } +} + +void +Package::extractItem(const char *filesPath, const char *outName, int32_t idx, char outType) { + char filename[1024]; + UDataSwapper *ds; + FILE *file; + Item *pItem; + int32_t fileLength; + uint8_t itemCharset, outCharset; + UBool itemIsBigEndian, outIsBigEndian; + + if(idx<0 || itemCount<=idx) { + return; + } + pItem=items+idx; + + // swap the data to the outType + // outType==0: don't swap + if(outType!=0 && pItem->type!=outType) { + // open the swapper + UErrorCode errorCode=U_ZERO_ERROR; + makeTypeProps(pItem->type, itemCharset, itemIsBigEndian); + makeTypeProps(outType, outCharset, outIsBigEndian); + ds=udata_openSwapper(itemIsBigEndian, itemCharset, outIsBigEndian, outCharset, &errorCode); + if(U_FAILURE(errorCode)) { + fprintf(stderr, "icupkg: udata_openSwapper(item %ld) failed - %s\n", + (long)idx, u_errorName(errorCode)); + exit(errorCode); + } + + ds->printError=printPackageError; + ds->printErrorContext=stderr; + + // swap the item from its platform properties to the desired ones + udata_swap(ds, pItem->data, pItem->length, pItem->data, &errorCode); + if(U_FAILURE(errorCode)) { + fprintf(stderr, "icupkg: udata_swap(item %ld) failed - %s\n", (long)idx, u_errorName(errorCode)); + exit(errorCode); + } + udata_closeSwapper(ds); + pItem->type=outType; + } + + // create the file and write its contents + makeFullFilenameAndDirs(filesPath, 
outName, filename, (int32_t)sizeof(filename)); + file=fopen(filename, "wb"); + if(file==nullptr) { + fprintf(stderr, "icupkg: unable to create file \"%s\"\n", filename); + exit(U_FILE_ACCESS_ERROR); + } + fileLength=(int32_t)fwrite(pItem->data, 1, pItem->length, file); + + if(ferror(file) || fileLength!=pItem->length) { + fprintf(stderr, "icupkg: unable to write complete file \"%s\"\n", filename); + exit(U_FILE_ACCESS_ERROR); + } + fclose(file); +} + +void +Package::extractItem(const char *filesPath, int32_t idx, char outType) { + extractItem(filesPath, items[idx].name, idx, outType); +} + +void +Package::extractItems(const char *filesPath, const char *pattern, char outType) { + int32_t idx; + + findItems(pattern); + while((idx=findNextItem())>=0) { + extractItem(filesPath, idx, outType); + } +} + +void +Package::extractItems(const char *filesPath, const Package &listPkg, char outType) { + const Item *pItem; + int32_t i; + + for(pItem=listPkg.items, i=0; i<listPkg.itemCount; ++pItem, ++i) { + extractItems(filesPath, pItem->name, outType); + } +} + +int32_t +Package::getItemCount() const { + return itemCount; +} + +const Item * +Package::getItem(int32_t idx) const { + if (0 <= idx && idx < itemCount) { + return &items[idx]; + } + return nullptr; +} + +void +Package::checkDependency(void *context, const char *itemName, const char *targetName) { + // check dependency: make sure the target item is in the package + Package *me=(Package *)context; + if(me->findItem(targetName)<0) { + me->isMissingItems=true; + fprintf(stderr, "Item %s depends on missing item %s\n", itemName, targetName); + } +} + +UBool +Package::checkDependencies() { + isMissingItems=false; + enumDependencies(this, checkDependency); + return (UBool)!isMissingItems; +} + +void +Package::enumDependencies(void *context, CheckDependency check) { + int32_t i; + + for(i=0; i<itemCount; ++i) { + enumDependencies(items+i, context, check); + } +} + +char * +Package::allocString(UBool in, int32_t length) { + char 
*p; + int32_t top; + + if(in) { + top=inStringTop; + p=inStrings+top; + } else { + top=outStringTop; + p=outStrings+top; + } + top+=length+1; + + if(top>STRING_STORE_SIZE) { + fprintf(stderr, "icupkg: string storage overflow\n"); + exit(U_BUFFER_OVERFLOW_ERROR); + } + if(in) { + inStringTop=top; + } else { + outStringTop=top; + } + return p; +} + +void +Package::sortItems() { + UErrorCode errorCode=U_ZERO_ERROR; + uprv_sortArray(items, itemCount, (int32_t)sizeof(Item), compareItems, nullptr, false, &errorCode); + if(U_FAILURE(errorCode)) { + fprintf(stderr, "icupkg: sorting item names failed - %s\n", u_errorName(errorCode)); + exit(errorCode); + } +} + +void Package::setItemCapacity(int32_t max) +{ + if(max<=itemMax) { + return; + } + Item *newItems = (Item*)uprv_malloc(max * sizeof(items[0])); + Item *oldItems = items; + if(newItems == nullptr) { + fprintf(stderr, "icupkg: Out of memory trying to allocate %lu bytes for %d items\n", + (unsigned long)(max*sizeof(items[0])), max); + exit(U_MEMORY_ALLOCATION_ERROR); + } + if(items && itemCount>0) { + uprv_memcpy(newItems, items, (size_t)itemCount*sizeof(items[0])); + } + itemMax = max; + items = newItems; + uprv_free(oldItems); +} + +void Package::ensureItemCapacity() +{ + if((itemCount+1)>itemMax) { + setItemCapacity(itemCount+kItemsChunk); + } +} + +U_NAMESPACE_END diff --git a/intl/icu/source/tools/toolutil/package.h b/intl/icu/source/tools/toolutil/package.h new file mode 100644 index 0000000000..ea60c13a74 --- /dev/null +++ b/intl/icu/source/tools/toolutil/package.h @@ -0,0 +1,203 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2005-2014, International Business Machines +* Corporation and others. All Rights Reserved. 
+* +******************************************************************************* +* file name: package.h +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2005aug25 +* created by: Markus W. Scherer +* +* Read, modify, and write ICU .dat data package files. +*/ + +#ifndef __PACKAGE_H__ +#define __PACKAGE_H__ + +#include "unicode/utypes.h" + +#include <stdio.h> + +// .dat package file representation ---------------------------------------- *** + +#define STRING_STORE_SIZE 100000 +#define MAX_PKG_NAME_LENGTH 64 + +typedef void CheckDependency(void *context, const char *itemName, const char *targetName); + +U_NAMESPACE_BEGIN + +struct Item { + char *name; + uint8_t *data; + int32_t length; + UBool isDataOwned; + char type; +}; + +class U_TOOLUTIL_API Package { +public: + /* + * Constructor. + * Prepare this object for a new, empty package. + */ + Package(); + + /* Destructor. */ + ~Package(); + + /** + * Uses the prefix of the first entry of the package in readPackage(), + * rather than the package basename. + */ + void setAutoPrefix() { doAutoPrefix=true; } + /** + * Same as setAutoPrefix(), plus the prefix must end with the platform type letter. + */ + void setAutoPrefixWithType() { + doAutoPrefix=true; + prefixEndsWithType=true; + } + void setPrefix(const char *p); + + /* + * Read an existing .dat package file. + * The header and item name strings are swapped into this object, + * but the items are left unswapped. + */ + void readPackage(const char *filename); + /* + * Write a .dat package file with the items in this object. + * Swap all pieces to the desired output platform properties. + * The package becomes unusable: + * The item names are swapped and sorted in the outCharset rather than the local one. + * Also, the items themselves are swapped in-place + */ + void writePackage(const char *filename, char outType, const char *comment); + + /* + * Return the input data type letter (l, b, or e). 
+ */ + char getInType(); + + // find the item in items[], return the non-negative index if found, else the binary-not of the insertion point + int32_t findItem(const char *name, int32_t length=-1) const; + + /* + * Set internal state for following calls to findNextItem() which will return + * indexes for items whose names match the pattern. + */ + void findItems(const char *pattern); + int32_t findNextItem(); + /* + * Set the match mode for findItems() & findNextItem(). + * @param mode 0=default + * MATCH_NOSLASH * does not match a '/' + */ + void setMatchMode(uint32_t mode); + + enum { + MATCH_NOSLASH=1 + }; + + void addItem(const char *name); + void addItem(const char *name, uint8_t *data, int32_t length, UBool isDataOwned, char type); + void addFile(const char *filesPath, const char *name); + void addItems(const Package &listPkg); + + void removeItem(int32_t itemIndex); + void removeItems(const char *pattern); + void removeItems(const Package &listPkg); + + /* The extractItem() functions accept outputType=0 to mean "don't swap the item". */ + void extractItem(const char *filesPath, int32_t itemIndex, char outType); + void extractItems(const char *filesPath, const char *pattern, char outType); + void extractItems(const char *filesPath, const Package &listPkg, char outType); + + /* This variant extracts an item to a specific filename. */ + void extractItem(const char *filesPath, const char *outName, int32_t itemIndex, char outType); + + int32_t getItemCount() const; + const Item *getItem(int32_t idx) const; + + /* + * Check dependencies and return true if all dependencies are fulfilled. 
+ */ + UBool checkDependencies(); + + /* + * Enumerate all the dependencies and give the results to context and call CheckDependency callback + * @param context user context (will be passed to check function) + * @param check will be called with context and any missing items + */ + void enumDependencies(void *context, CheckDependency check); + +private: + void enumDependencies(Item *pItem, void *context, CheckDependency check); + + /** + * Default CheckDependency function used by checkDependencies() + */ + static void checkDependency(void *context, const char *itemName, const char *targetName); + + /* + * Allocate a string in inStrings or outStrings. + * The length does not include the terminating NUL. + */ + char *allocString(UBool in, int32_t length); + + void sortItems(); + + // data fields + char inPkgName[MAX_PKG_NAME_LENGTH]; + char pkgPrefix[MAX_PKG_NAME_LENGTH]; + + uint8_t *inData; + uint8_t header[1024]; + int32_t inLength, headerLength; + uint8_t inCharset; + UBool inIsBigEndian; + UBool doAutoPrefix; + UBool prefixEndsWithType; + + int32_t itemCount; + int32_t itemMax; + Item *items; + + int32_t inStringTop, outStringTop; + char inStrings[STRING_STORE_SIZE], outStrings[STRING_STORE_SIZE]; + + // match mode for findItems(pattern) and findNextItem() + uint32_t matchMode; + + // state for findItems(pattern) and findNextItem() + const char *findPrefix, *findSuffix; + int32_t findPrefixLength, findSuffixLength; + int32_t findNextIndex; + + // state for checkDependencies() + UBool isMissingItems; + + /** + * Grow itemMax to new value + */ + void setItemCapacity(int32_t max); + + /** + * Grow itemMax to at least itemCount+1 + */ + void ensureItemCapacity(); +}; + +U_NAMESPACE_END + +#endif + + diff --git a/intl/icu/source/tools/toolutil/pkg_genc.cpp b/intl/icu/source/tools/toolutil/pkg_genc.cpp new file mode 100644 index 0000000000..741a8a5228 --- /dev/null +++ b/intl/icu/source/tools/toolutil/pkg_genc.cpp @@ -0,0 +1,1396 @@ +// © 2016 and later: Unicode, Inc. 
and others. +// License & terms of use: http://www.unicode.org/copyright.html +/****************************************************************************** + * Copyright (C) 2009-2016, International Business Machines + * Corporation and others. All Rights Reserved. + ******************************************************************************* + */ +#include "unicode/utypes.h" + +#if U_PLATFORM_HAS_WIN32_API +# define VC_EXTRALEAN +# define WIN32_LEAN_AND_MEAN +# define NOUSER +# define NOSERVICE +# define NOIME +# define NOMCX +#include <windows.h> +#include <time.h> +# ifdef __GNUC__ +# define WINDOWS_WITH_GNUC +# endif +#endif + +#if U_PLATFORM_IS_LINUX_BASED && U_HAVE_ELF_H +# define U_ELF +#endif + +#ifdef U_ELF +# include <elf.h> +# if defined(ELFCLASS64) +# define U_ELF64 +# endif + /* Old elf.h headers may not have EM_X86_64, or have EM_X8664 instead. */ +# ifndef EM_X86_64 +# define EM_X86_64 62 +# endif +# define ICU_ENTRY_OFFSET 0 +#endif + +#include <stdio.h> +#include <stdlib.h> +#include "unicode/putil.h" +#include "cmemory.h" +#include "cstring.h" +#include "filestrm.h" +#include "toolutil.h" +#include "unicode/uclean.h" +#include "uoptions.h" +#include "pkg_genc.h" +#include "filetools.h" +#include "charstr.h" +#include "unicode/errorcode.h" + +#define MAX_COLUMN ((uint32_t)(0xFFFFFFFFU)) + +#define HEX_0X 0 /* 0x1234 */ +#define HEX_0H 1 /* 01234h */ + +/* prototypes --------------------------------------------------------------- */ +static void +getOutFilename( + const char *inFilename, + const char *destdir, + char *outFilename, + int32_t outFilenameCapacity, + char *entryName, + int32_t entryNameCapacity, + const char *newSuffix, + const char *optFilename); + +static uint32_t +write8(FileStream *out, uint8_t byte, uint32_t column); + +static uint32_t +write32(FileStream *out, uint32_t byte, uint32_t column); + +#if U_PLATFORM == U_PF_OS400 +static uint32_t +write8str(FileStream *out, uint8_t byte, uint32_t column); +#endif +/* 
-------------------------------------------------------------------------- */ + +/* +Creating Template Files for New Platforms + +Let the cc compiler help you get started. +Compile this program + const unsigned int x[5] = {1, 2, 0xdeadbeef, 0xffffffff, 16}; +with the -S option to produce assembly output. + +For example, this will generate array.s: +gcc -S array.c + +This will produce a .s file that may look like this: + + .file "array.c" + .version "01.01" +gcc2_compiled.: + .globl x + .section .rodata + .align 4 + .type x,@object + .size x,20 +x: + .long 1 + .long 2 + .long -559038737 + .long -1 + .long 16 + .ident "GCC: (GNU) 2.96 20000731 (Red Hat Linux 7.1 2.96-85)" + +which gives a starting point that will compile, and can be transformed +to become the template, generally with some consulting of as docs and +some experimentation. + +If you want ICU to automatically use this assembly, you should +specify "GENCCODE_ASSEMBLY=-a name" in the specific config/mh-* file, +where the name is the compiler or platform that you used in this +assemblyHeader data structure. +*/ +static const struct AssemblyType { + const char *name; + const char *header; + const char *beginLine; + const char *footer; + int8_t hexType; /* HEX_0X or HEX_0h */ +} assemblyHeader[] = { + /* For gcc assemblers, the meaning of .align changes depending on the */ + /* hardware, so we use .balign 16 which always means 16 bytes. 
*/ + /* https://sourceware.org/binutils/docs/as/Pseudo-Ops.html */ + {"gcc", + ".globl %s\n" + "\t.section .note.GNU-stack,\"\",%%progbits\n" + "#ifdef __CET__\n" + "# include <cet.h>\n" + "#endif\n" + "\t.section .rodata\n" + "\t.balign 16\n" + "#ifdef U_HIDE_DATA_SYMBOL\n" + "\t.hidden %s\n" + "#endif\n" + "\t.type %s,%%object\n" + "%s:\n\n", + + ".long ",".size %s, .-%s\n",HEX_0X + }, + {"gcc-darwin", + /*"\t.section __TEXT,__text,regular,pure_instructions\n" + "\t.section __TEXT,__picsymbolstub1,symbol_stubs,pure_instructions,32\n"*/ + ".globl _%s\n" + "#ifdef U_HIDE_DATA_SYMBOL\n" + "\t.private_extern _%s\n" + "#endif\n" + "\t.data\n" + "\t.const\n" + "\t.balign 16\n" + "_%s:\n\n", + + ".long ","",HEX_0X + }, + /* macOS PPC should use `.p2align 4` instead `.balign 16` because is + * unknown pseudo ops for such legacy system*/ + {"gcc-darwin-ppc", + /*"\t.section __TEXT,__text,regular,pure_instructions\n" + "\t.section __TEXT,__picsymbolstub1,symbol_stubs,pure_instructions,32\n"*/ + ".globl _%s\n" + "#ifdef U_HIDE_DATA_SYMBOL\n" + "\t.private_extern _%s\n" + "#endif\n" + "\t.data\n" + "\t.const\n" + "\t.p2align 4\n" + "_%s:\n\n", + + ".long ","",HEX_0X + }, + {"gcc-cygwin", + ".globl _%s\n" + "\t.section .rodata\n" + "\t.balign 16\n" + "_%s:\n\n", + + ".long ","",HEX_0X + }, + {"gcc-mingw64", + ".globl %s\n" + "\t.section .rodata\n" + "\t.balign 16\n" + "%s:\n\n", + + ".long ","",HEX_0X + }, +/* 16 bytes alignment. */ +/* http://docs.oracle.com/cd/E19641-01/802-1947/802-1947.pdf */ + {"sun", + "\t.section \".rodata\"\n" + "\t.align 16\n" + ".globl %s\n" + "%s:\n", + + ".word ","",HEX_0X + }, +/* 16 bytes alignment for sun-x86. */ +/* http://docs.oracle.com/cd/E19963-01/html/821-1608/eoiyg.html */ + {"sun-x86", + "Drodata.rodata:\n" + "\t.type Drodata.rodata,@object\n" + "\t.size Drodata.rodata,0\n" + "\t.globl %s\n" + "\t.align 16\n" + "%s:\n", + + ".4byte ","",HEX_0X + }, +/* 1<<4 bit alignment for aix. 
*/ +/* http://pic.dhe.ibm.com/infocenter/aix/v6r1/index.jsp?topic=%2Fcom.ibm.aix.aixassem%2Fdoc%2Falangref%2Fidalangref_csect_pseudoop.htm */ + {"xlc", + ".globl %s{RO}\n" + "\t.toc\n" + "%s:\n" + "\t.csect %s{RO}, 4\n", + + ".long ","",HEX_0X + }, + {"aCC-ia64", + "\t.file \"%s.s\"\n" + "\t.type %s,@object\n" + "\t.global %s\n" + "\t.secalias .abe$0.rodata, \".rodata\"\n" + "\t.section .abe$0.rodata = \"a\", \"progbits\"\n" + "\t.align 16\n" + "%s::\t", + + "data4 ","",HEX_0X + }, + {"aCC-parisc", + "\t.SPACE $TEXT$\n" + "\t.SUBSPA $LIT$\n" + "%s\n" + "\t.EXPORT %s\n" + "\t.ALIGN 16\n", + + ".WORD ","",HEX_0X + }, +/* align 16 bytes */ +/* http://msdn.microsoft.com/en-us/library/dwa9fwef.aspx */ + {"nasm", + "global %s\n" +#if defined(_WIN32) + "section .rdata align=16\n" +#else + "section .rodata align=16\n" +#endif + "%s:\n", + " dd ","",HEX_0X + }, + { "masm", + "\tTITLE %s\n" + "; generated by genccode\n" + ".386\n" + ".model flat\n" + "\tPUBLIC _%s\n" + "ICUDATA_%s\tSEGMENT READONLY PARA PUBLIC FLAT 'DATA'\n" + "\tALIGN 16\n" + "_%s\tLABEL DWORD\n", + "\tDWORD ","\nICUDATA_%s\tENDS\n\tEND\n",HEX_0H + }, + { "masm64", + "\tTITLE %s\n" + "; generated by genccode\n" + "\tPUBLIC _%s\n" + "ICUDATA_%s\tSEGMENT READONLY 'DATA'\n" + "\tALIGN 16\n" + "_%s\tLABEL DWORD\n", + "\tDWORD ","\nICUDATA_%s\tENDS\n\tEND\n",HEX_0H + } +}; + +static int32_t assemblyHeaderIndex = -1; +static int32_t hexType = HEX_0X; + +U_CAPI UBool U_EXPORT2 +checkAssemblyHeaderName(const char* optAssembly) { + int32_t idx; + assemblyHeaderIndex = -1; + for (idx = 0; idx < UPRV_LENGTHOF(assemblyHeader); idx++) { + if (uprv_strcmp(optAssembly, assemblyHeader[idx].name) == 0) { + assemblyHeaderIndex = idx; + hexType = assemblyHeader[idx].hexType; /* set the hex type */ + return true; + } + } + + return false; +} + + +U_CAPI void U_EXPORT2 +printAssemblyHeadersToStdErr() { + int32_t idx; + fprintf(stderr, "%s", assemblyHeader[0].name); + for (idx = 1; idx < UPRV_LENGTHOF(assemblyHeader); idx++) { + 
fprintf(stderr, ", %s", assemblyHeader[idx].name); + } + fprintf(stderr, + ")\n"); +} + +U_CAPI void U_EXPORT2 +writeAssemblyCode( + const char *filename, + const char *destdir, + const char *optEntryPoint, + const char *optFilename, + char *outFilePath, + size_t outFilePathCapacity) { + uint32_t column = MAX_COLUMN; + char entry[96]; + union { + uint32_t uint32s[1024]; + char chars[4096]; + } buffer; + FileStream *in, *out; + size_t i, length, count; + + in=T_FileStream_open(filename, "rb"); + if(in==nullptr) { + fprintf(stderr, "genccode: unable to open input file %s\n", filename); + exit(U_FILE_ACCESS_ERROR); + } + + const char* newSuffix = nullptr; + + if (uprv_strcmp(assemblyHeader[assemblyHeaderIndex].name, "masm") == 0) { + newSuffix = ".masm"; + } + else if (uprv_strcmp(assemblyHeader[assemblyHeaderIndex].name, "nasm") == 0) { + newSuffix = ".asm"; + } else { + newSuffix = ".S"; + } + + getOutFilename( + filename, + destdir, + buffer.chars, + sizeof(buffer.chars), + entry, + sizeof(entry), + newSuffix, + optFilename); + out=T_FileStream_open(buffer.chars, "w"); + if(out==nullptr) { + fprintf(stderr, "genccode: unable to open output file %s\n", buffer.chars); + exit(U_FILE_ACCESS_ERROR); + } + + if (outFilePath != nullptr) { + if (uprv_strlen(buffer.chars) >= outFilePathCapacity) { + fprintf(stderr, "genccode: filename too long\n"); + exit(U_ILLEGAL_ARGUMENT_ERROR); + } + uprv_strcpy(outFilePath, buffer.chars); +#if defined (WINDOWS_WITH_GNUC) && U_PLATFORM != U_PF_CYGWIN + /* Need to fix the file separator character when using MinGW. 
*/ + swapFileSepChar(outFilePath, U_FILE_SEP_CHAR, '/'); +#endif + } + + if(optEntryPoint != nullptr) { + uprv_strcpy(entry, optEntryPoint); + uprv_strcat(entry, "_dat"); + } + + /* turn dashes or dots in the entry name into underscores */ + length=uprv_strlen(entry); + for(i=0; i<length; ++i) { + if(entry[i]=='-' || entry[i]=='.') { + entry[i]='_'; + } + } + + count = snprintf( + buffer.chars, sizeof(buffer.chars), + assemblyHeader[assemblyHeaderIndex].header, + entry, entry, entry, entry, + entry, entry, entry, entry); + if (count >= sizeof(buffer.chars)) { + fprintf(stderr, "genccode: entry name too long (long filename?)\n"); + exit(U_ILLEGAL_ARGUMENT_ERROR); + } + T_FileStream_writeLine(out, buffer.chars); + T_FileStream_writeLine(out, assemblyHeader[assemblyHeaderIndex].beginLine); + + for(;;) { + memset(buffer.uint32s, 0, sizeof(buffer.uint32s)); + length=T_FileStream_read(in, buffer.uint32s, sizeof(buffer.uint32s)); + if(length==0) { + break; + } + for(i=0; i<(length/sizeof(buffer.uint32s[0])); i++) { + // TODO: What if the last read sees length not as a multiple of 4? 
+ column = write32(out, buffer.uint32s[i], column); + } + } + + T_FileStream_writeLine(out, "\n"); + + count = snprintf( + buffer.chars, sizeof(buffer.chars), + assemblyHeader[assemblyHeaderIndex].footer, + entry, entry, entry, entry, + entry, entry, entry, entry); + if (count >= sizeof(buffer.chars)) { + fprintf(stderr, "genccode: entry name too long (long filename?)\n"); + exit(U_ILLEGAL_ARGUMENT_ERROR); + } + T_FileStream_writeLine(out, buffer.chars); + + if(T_FileStream_error(in)) { + fprintf(stderr, "genccode: file read error while generating from file %s\n", filename); + exit(U_FILE_ACCESS_ERROR); + } + + if(T_FileStream_error(out)) { + fprintf(stderr, "genccode: file write error while generating from file %s\n", filename); + exit(U_FILE_ACCESS_ERROR); + } + + T_FileStream_close(out); + T_FileStream_close(in); +} + +U_CAPI void U_EXPORT2 +writeCCode( + const char *filename, + const char *destdir, + const char *optEntryPoint, + const char *optName, + const char *optFilename, + char *outFilePath, + size_t outFilePathCapacity) { + uint32_t column = MAX_COLUMN; + char buffer[4096], entry[96]; + FileStream *in, *out; + size_t i, length, count; + + in=T_FileStream_open(filename, "rb"); + if(in==nullptr) { + fprintf(stderr, "genccode: unable to open input file %s\n", filename); + exit(U_FILE_ACCESS_ERROR); + } + + if(optName != nullptr) { /* prepend 'icudt28_' */ + // +2 includes the _ and the NUL + if (uprv_strlen(optName) + 2 > sizeof(entry)) { + fprintf(stderr, "genccode: entry name too long (long filename?)\n"); + exit(U_ILLEGAL_ARGUMENT_ERROR); + } + strcpy(entry, optName); + strcat(entry, "_"); + } else { + entry[0] = 0; + } + + getOutFilename( + filename, + destdir, + buffer, + static_cast<int32_t>(sizeof(buffer)), + entry + uprv_strlen(entry), + static_cast<int32_t>(sizeof(entry) - uprv_strlen(entry)), + ".c", + optFilename); + + if (outFilePath != nullptr) { + if (uprv_strlen(buffer) >= outFilePathCapacity) { + fprintf(stderr, "genccode: filename too 
long\n"); + exit(U_ILLEGAL_ARGUMENT_ERROR); + } + uprv_strcpy(outFilePath, buffer); +#if defined (WINDOWS_WITH_GNUC) && U_PLATFORM != U_PF_CYGWIN + /* Need to fix the file separator character when using MinGW. */ + swapFileSepChar(outFilePath, U_FILE_SEP_CHAR, '/'); +#endif + } + + out=T_FileStream_open(buffer, "w"); + if(out==nullptr) { + fprintf(stderr, "genccode: unable to open output file %s\n", buffer); + exit(U_FILE_ACCESS_ERROR); + } + + if(optEntryPoint != nullptr) { + uprv_strcpy(entry, optEntryPoint); + uprv_strcat(entry, "_dat"); + } + + /* turn dashes or dots in the entry name into underscores */ + length=uprv_strlen(entry); + for(i=0; i<length; ++i) { + if(entry[i]=='-' || entry[i]=='.') { + entry[i]='_'; + } + } + +#if U_PLATFORM == U_PF_OS400 + /* + TODO: Fix this once the compiler implements this feature. Keep in sync with udatamem.c + + This is here because this platform can't currently put + const data into the read-only pages of an object or + shared library (service program). Only strings are allowed in read-only + pages, so we use char * strings to store the data. + + In order to prevent the beginning of the data from ever matching the + magic numbers we must still use the initial double. 
+ [grhoten 4/24/2003] + */ + count = snprintf(buffer, sizeof(buffer), + "#ifndef IN_GENERATED_CCODE\n" + "#define IN_GENERATED_CCODE\n" + "#define U_DISABLE_RENAMING 1\n" + "#include \"unicode/umachine.h\"\n" + "#endif\n" + "U_CDECL_BEGIN\n" + "const struct {\n" + " double bogus;\n" + " const char *bytes; \n" + "} %s={ 0.0, \n", + entry); + if (count >= sizeof(buffer)) { + fprintf(stderr, "genccode: entry name too long (long filename?)\n"); + exit(U_ILLEGAL_ARGUMENT_ERROR); + } + T_FileStream_writeLine(out, buffer); + + for(;;) { + length=T_FileStream_read(in, buffer, sizeof(buffer)); + if(length==0) { + break; + } + for(i=0; i<length; ++i) { + column = write8str(out, (uint8_t)buffer[i], column); + } + } + + T_FileStream_writeLine(out, "\"\n};\nU_CDECL_END\n"); +#else + /* Function renaming shouldn't be done in data */ + count = snprintf(buffer, sizeof(buffer), + "#ifndef IN_GENERATED_CCODE\n" + "#define IN_GENERATED_CCODE\n" + "#define U_DISABLE_RENAMING 1\n" + "#include \"unicode/umachine.h\"\n" + "#endif\n" + "U_CDECL_BEGIN\n" + "const struct {\n" + " double bogus;\n" + " uint8_t bytes[%ld]; \n" + "} %s={ 0.0, {\n", + (long)T_FileStream_size(in), entry); + if (count >= sizeof(buffer)) { + fprintf(stderr, "genccode: entry name too long (long filename?)\n"); + exit(U_ILLEGAL_ARGUMENT_ERROR); + } + T_FileStream_writeLine(out, buffer); + + for(;;) { + length=T_FileStream_read(in, buffer, sizeof(buffer)); + if(length==0) { + break; + } + for(i=0; i<length; ++i) { + column = write8(out, (uint8_t)buffer[i], column); + } + } + + T_FileStream_writeLine(out, "\n}\n};\nU_CDECL_END\n"); +#endif + + if(T_FileStream_error(in)) { + fprintf(stderr, "genccode: file read error while generating from file %s\n", filename); + exit(U_FILE_ACCESS_ERROR); + } + + if(T_FileStream_error(out)) { + fprintf(stderr, "genccode: file write error while generating from file %s\n", filename); + exit(U_FILE_ACCESS_ERROR); + } + + T_FileStream_close(out); + T_FileStream_close(in); +} + +static 
uint32_t +write32(FileStream *out, uint32_t bitField, uint32_t column) { + int32_t i; + char bitFieldStr[64]; /* This is more bits than needed for a 32-bit number */ + char *s = bitFieldStr; + uint8_t *ptrIdx = (uint8_t *)&bitField; + static const char hexToStr[16] = { + '0','1','2','3', + '4','5','6','7', + '8','9','A','B', + 'C','D','E','F' + }; + + /* write the value, possibly with comma and newline */ + if(column==MAX_COLUMN) { + /* first byte */ + column=1; + } else if(column<32) { + *(s++)=','; + ++column; + } else { + *(s++)='\n'; + uprv_strcpy(s, assemblyHeader[assemblyHeaderIndex].beginLine); + s+=uprv_strlen(s); + column=1; + } + + if (bitField < 10) { + /* It's a small number. Don't waste the space for 0x */ + *(s++)=hexToStr[bitField]; + } + else { + int seenNonZero = 0; /* This is used to remove leading zeros */ + + if(hexType==HEX_0X) { + *(s++)='0'; + *(s++)='x'; + } else if(hexType==HEX_0H) { + *(s++)='0'; + } + + /* This creates a 32-bit field */ +#if U_IS_BIG_ENDIAN + for (i = 0; i < sizeof(uint32_t); i++) +#else + for (i = sizeof(uint32_t)-1; i >= 0 ; i--) +#endif + { + uint8_t value = ptrIdx[i]; + if (value || seenNonZero) { + *(s++)=hexToStr[value>>4]; + *(s++)=hexToStr[value&0xF]; + seenNonZero = 1; + } + } + if(hexType==HEX_0H) { + *(s++)='h'; + } + } + + *(s++)=0; + T_FileStream_writeLine(out, bitFieldStr); + return column; +} + +static uint32_t +write8(FileStream *out, uint8_t byte, uint32_t column) { + char s[4]; + int i=0; + + /* convert the byte value to a string */ + if(byte>=100) { + s[i++]=(char)('0'+byte/100); + byte%=100; + } + if(i>0 || byte>=10) { + s[i++]=(char)('0'+byte/10); + byte%=10; + } + s[i++]=(char)('0'+byte); + s[i]=0; + + /* write the value, possibly with comma and newline */ + if(column==MAX_COLUMN) { + /* first byte */ + column=1; + } else if(column<16) { + T_FileStream_writeLine(out, ","); + ++column; + } else { + T_FileStream_writeLine(out, ",\n"); + column=1; + } + T_FileStream_writeLine(out, s); + return column; 
+} + +#if U_PLATFORM == U_PF_OS400 +static uint32_t +write8str(FileStream *out, uint8_t byte, uint32_t column) { + char s[8]; + + if (byte > 7) + snprintf(s, sizeof(s), "\\x%X", byte); + else + snprintf(s, sizeof(s), "\\%X", byte); + + /* write the value, possibly with comma and newline */ + if(column==MAX_COLUMN) { + /* first byte */ + column=1; + T_FileStream_writeLine(out, "\""); + } else if(column<24) { + ++column; + } else { + T_FileStream_writeLine(out, "\"\n\""); + column=1; + } + T_FileStream_writeLine(out, s); + return column; +} +#endif + +static void +getOutFilename( + const char *inFilename, + const char *destdir, + char *outFilename, + int32_t outFilenameCapacity, + char *entryName, + int32_t entryNameCapacity, + const char *newSuffix, + const char *optFilename) { + const char *basename=findBasename(inFilename), *suffix=uprv_strrchr(basename, '.'); + + icu::CharString outFilenameBuilder; + icu::CharString entryNameBuilder; + icu::ErrorCode status; + + /* copy path */ + if(destdir!=nullptr && *destdir!=0) { + outFilenameBuilder.append(destdir, status); + outFilenameBuilder.ensureEndsWithFileSeparator(status); + } else { + outFilenameBuilder.append(inFilename, static_cast<int32_t>(basename - inFilename), status); + } + inFilename=basename; + + if(suffix==nullptr) { + /* the filename does not have a suffix */ + entryNameBuilder.append(inFilename, status); + if(optFilename != nullptr) { + outFilenameBuilder.append(optFilename, status); + } else { + outFilenameBuilder.append(inFilename, status); + } + outFilenameBuilder.append(newSuffix, status); + } else { + int32_t saveOutFilenameLength = outFilenameBuilder.length(); + /* copy basename */ + while(inFilename<suffix) { + // iSeries cannot have '-' in the .o objects. + char c = (*inFilename=='-') ? '_' : *inFilename; + outFilenameBuilder.append(c, status); + entryNameBuilder.append(c, status); + inFilename++; + } + + /* replace '.' 
#ifdef CAN_GENERATE_OBJECTS
/**
 * Determine the CPU type, word size and byte order of the object file to
 * generate.
 *
 * If optMatchArch is nullptr, platform defaults are stored and the function
 * returns. Otherwise optMatchArch names an existing object file (ELF, or
 * COFF on Windows) whose header is read and whose architecture is copied.
 * Any failure (file cannot be opened, is too short, is not a supported
 * format, or has a different byte order than this build) prints a message
 * to stderr and terminates the process.
 *
 * @param pCPU          receives the machine type (ELF e_machine or COFF Machine)
 * @param pBits         receives 32 or 64
 * @param pIsBigEndian  receives true for a big-endian target, false otherwise
 * @param optMatchArch  path of the object file to match, or nullptr
 */
static void
getArchitecture(uint16_t *pCPU, uint16_t *pBits, UBool *pIsBigEndian, const char *optMatchArch) {
    union {
        char        bytes[2048];
#ifdef U_ELF
        Elf32_Ehdr  header32;
        /* Elf32_Ehdr and ELF64_Ehdr are identical for the necessary fields. */
#elif U_PLATFORM_HAS_WIN32_API
        IMAGE_FILE_HEADER header;
#endif
    } buffer;

    const char *filename;
    FileStream *in;
    int32_t length;

#ifdef U_ELF

#elif U_PLATFORM_HAS_WIN32_API
    const IMAGE_FILE_HEADER *pHeader;
#else
#   error "Unknown platform for CAN_GENERATE_OBJECTS."
#endif

    if(optMatchArch != nullptr) {
        filename=optMatchArch;
    } else {
        /* set defaults */
#ifdef U_ELF
        /* set EM_386 because elf.h does not provide better defaults */
        *pCPU=EM_386;
        *pBits=32;
        /*
         * *pIsBigEndian is a boolean. Do not store an ELFDATA2xxx constant
         * here: both ELFDATA2LSB and ELFDATA2MSB are non-zero, which would
         * make every default build look big-endian to the caller.
         */
        *pIsBigEndian=(UBool)U_IS_BIG_ENDIAN;
#elif U_PLATFORM_HAS_WIN32_API
        // Windows always runs in little-endian mode.
        *pIsBigEndian = false;

        // Note: The various _M_<arch> macros are predefined by the MSVC compiler based
        // on the target compilation architecture.
        // https://docs.microsoft.com/cpp/preprocessor/predefined-macros

        // link.exe will link an IMAGE_FILE_MACHINE_UNKNOWN data-only .obj file
        // no matter what architecture it is targeting (though other values are
        // required to match). Unfortunately, the variable name decoration/mangling
        // is slightly different on x86, which means we can't use the UNKNOWN type
        // for all architectures though.
#   if defined(_M_IX86)
        *pCPU = IMAGE_FILE_MACHINE_I386;
#   else
        *pCPU = IMAGE_FILE_MACHINE_UNKNOWN;
#   endif
#   if defined(_M_IA64) || defined(_M_AMD64) || defined (_M_ARM64)
        *pBits = 64; // Doesn't seem to be used for anything interesting though?
#   elif defined(_M_IX86) || defined(_M_ARM)
        *pBits = 32;
#   else
#       error "Unknown platform for CAN_GENERATE_OBJECTS."
#   endif
#else
#   error "Unknown platform for CAN_GENERATE_OBJECTS."
#endif
        return;
    }

    in=T_FileStream_open(filename, "rb");
    if(in==nullptr) {
        fprintf(stderr, "genccode: unable to open match-arch file %s\n", filename);
        exit(U_FILE_ACCESS_ERROR);
    }
    length=T_FileStream_read(in, buffer.bytes, sizeof(buffer.bytes));

#ifdef U_ELF
    if(length<(int32_t)sizeof(Elf32_Ehdr)) {
        fprintf(stderr, "genccode: match-arch file %s is too short\n", filename);
        exit(U_UNSUPPORTED_ERROR);
    }
    if(
        buffer.header32.e_ident[0]!=ELFMAG0 ||
        buffer.header32.e_ident[1]!=ELFMAG1 ||
        buffer.header32.e_ident[2]!=ELFMAG2 ||
        buffer.header32.e_ident[3]!=ELFMAG3 ||
        buffer.header32.e_ident[EI_CLASS]<ELFCLASS32 || buffer.header32.e_ident[EI_CLASS]>ELFCLASS64
    ) {
        fprintf(stderr, "genccode: match-arch file %s is not an ELF object file, or not supported\n", filename);
        exit(U_UNSUPPORTED_ERROR);
    }

    *pBits= buffer.header32.e_ident[EI_CLASS]==ELFCLASS32 ? 32 : 64; /* only 32 or 64: see check above */
#ifdef U_ELF64
    if(*pBits!=32 && *pBits!=64) {
        fprintf(stderr, "genccode: currently only supports 32-bit and 64-bit ELF format\n");
        exit(U_UNSUPPORTED_ERROR);
    }
#else
    if(*pBits!=32) {
        fprintf(stderr, "genccode: built with elf.h missing 64-bit definitions\n");
        exit(U_UNSUPPORTED_ERROR);
    }
#endif

    *pIsBigEndian=(UBool)(buffer.header32.e_ident[EI_DATA]==ELFDATA2MSB);
    if(*pIsBigEndian!=U_IS_BIG_ENDIAN) {
        fprintf(stderr, "genccode: currently only same-endianness ELF formats are supported\n");
        exit(U_UNSUPPORTED_ERROR);
    }
    /* TODO: Support byte swapping */

    *pCPU=buffer.header32.e_machine;
#elif U_PLATFORM_HAS_WIN32_API
    if(length<(int32_t)sizeof(IMAGE_FILE_HEADER)) {
        fprintf(stderr, "genccode: match-arch file %s is too short\n", filename);
        exit(U_UNSUPPORTED_ERROR);
    }
    /* TODO: Use buffer.header. Keep aliasing legal. */
    pHeader=(const IMAGE_FILE_HEADER *)buffer.bytes;
    *pCPU=pHeader->Machine;
    /*
     * The number of bits is implicit with the Machine value.
     * *pBits is ignored in the calling code, so this need not be precise.
     */
    *pBits= *pCPU==IMAGE_FILE_MACHINE_I386 ? 32 : 64;
    /* Windows always runs on little-endian CPUs. */
    *pIsBigEndian=false;
#else
#   error "Unknown platform for CAN_GENERATE_OBJECTS."
#endif

    T_FileStream_close(in);
}

/**
 * Wrap the contents of a data file into a relocatable object file
 * (ELF ".o", or COFF ".obj" on Windows) that exports the data under a
 * single global symbol.
 *
 * The output path and the default symbol name are derived from filename by
 * getOutFilename(); dashes in the symbol name are replaced by underscores.
 * All errors print a message to stderr and terminate the process.
 *
 * @param filename            path of the input data file
 * @param destdir             output directory, or nullptr/"" to use the input file's directory
 * @param optEntryPoint       if not nullptr, the symbol name is this string plus "_dat"
 * @param optMatchArch        if not nullptr, an object file whose architecture is copied
 * @param optFilename         if not nullptr, replaces the base name of the output file
 * @param outFilePath         if not nullptr, receives the output file path
 * @param outFilePathCapacity capacity of outFilePath in bytes
 * @param optWinDllExport     Windows only: also emit an "-export:" linker directive
 */
U_CAPI void U_EXPORT2
writeObjectCode(
        const char *filename,
        const char *destdir,
        const char *optEntryPoint,
        const char *optMatchArch,
        const char *optFilename,
        char *outFilePath,
        size_t outFilePathCapacity,
        UBool optWinDllExport) {
    /* common variables */
    char buffer[4096], entry[96]={ 0 };
    FileStream *in, *out;
    const char *newSuffix;
    int32_t i, entryLength, length, size, entryOffset=0, entryLengthOffset=0;

    uint16_t cpu, bits;
    UBool makeBigEndian;

    (void)optWinDllExport; /* unused except Windows */

    /* platform-specific variables and initialization code */
#ifdef U_ELF
    /* 32-bit Elf file header */
    static Elf32_Ehdr header32={
        {
            /* e_ident[] */
            ELFMAG0, ELFMAG1, ELFMAG2, ELFMAG3,
            ELFCLASS32,
            U_IS_BIG_ENDIAN ? ELFDATA2MSB : ELFDATA2LSB,
            EV_CURRENT /* EI_VERSION */
        },
        ET_REL,
        EM_386,
        EV_CURRENT, /* e_version */
        0, /* e_entry */
        0, /* e_phoff */
        (Elf32_Off)sizeof(Elf32_Ehdr), /* e_shoff */
        0, /* e_flags */
        (Elf32_Half)sizeof(Elf32_Ehdr), /* eh_size */
        0, /* e_phentsize */
        0, /* e_phnum */
        (Elf32_Half)sizeof(Elf32_Shdr), /* e_shentsize */
        5, /* e_shnum */
        2 /* e_shstrndx */
    };

    /* 32-bit Elf section header table */
    static Elf32_Shdr sectionHeaders32[5]={
        { /* SHN_UNDEF */
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0
        },
        { /* .symtab */
            1, /* sh_name */
            SHT_SYMTAB,
            0, /* sh_flags */
            0, /* sh_addr */
            (Elf32_Off)(sizeof(header32)+sizeof(sectionHeaders32)), /* sh_offset */
            (Elf32_Word)(2*sizeof(Elf32_Sym)), /* sh_size */
            3, /* sh_link=sect hdr index of .strtab */
            1, /* sh_info=One greater than the symbol table index of the last
                * local symbol (with STB_LOCAL). */
            4, /* sh_addralign */
            (Elf32_Word)(sizeof(Elf32_Sym)) /* sh_entsize */
        },
        { /* .shstrtab */
            9, /* sh_name */
            SHT_STRTAB,
            0, /* sh_flags */
            0, /* sh_addr */
            (Elf32_Off)(sizeof(header32)+sizeof(sectionHeaders32)+2*sizeof(Elf32_Sym)), /* sh_offset */
            40, /* sh_size */
            0, /* sh_link */
            0, /* sh_info */
            1, /* sh_addralign */
            0 /* sh_entsize */
        },
        { /* .strtab */
            19, /* sh_name */
            SHT_STRTAB,
            0, /* sh_flags */
            0, /* sh_addr */
            (Elf32_Off)(sizeof(header32)+sizeof(sectionHeaders32)+2*sizeof(Elf32_Sym)+40), /* sh_offset */
            (Elf32_Word)sizeof(entry), /* sh_size */
            0, /* sh_link */
            0, /* sh_info */
            1, /* sh_addralign */
            0 /* sh_entsize */
        },
        { /* .rodata */
            27, /* sh_name */
            SHT_PROGBITS,
            SHF_ALLOC, /* sh_flags */
            0, /* sh_addr */
            (Elf32_Off)(sizeof(header32)+sizeof(sectionHeaders32)+2*sizeof(Elf32_Sym)+40+sizeof(entry)), /* sh_offset; recomputed for each file below */
            0, /* sh_size */
            0, /* sh_link */
            0, /* sh_info */
            16, /* sh_addralign */
            0 /* sh_entsize */
        }
    };

    /* symbol table */
    static Elf32_Sym symbols32[2]={
        { /* STN_UNDEF */
            0, 0, 0, 0, 0, 0
        },
        { /* data entry point */
            1, /* st_name */
            0, /* st_value */
            0, /* st_size */
            ELF64_ST_INFO(STB_GLOBAL, STT_OBJECT),
            0, /* st_other */
            4 /* st_shndx=index of related section table entry */
        }
    };

    /* section header string table, with decimal string offsets */
    static const char sectionStrings[40]=
        /*  0 */ "\0"
        /*  1 */ ".symtab\0"
        /*  9 */ ".shstrtab\0"
        /* 19 */ ".strtab\0"
        /* 27 */ ".rodata\0"
        /* 35 */ "\0\0\0\0"; /* contains terminating NUL */
        /* 40: padded to multiple of 8 bytes */

    /*
     * Use entry[] for the string table which will contain only the
     * entry point name.
     * entry[0] must be 0 (NUL)
     * The entry point name can be up to sizeof(entry)-2 characters long
     * (leading NUL and terminating NUL).
     */

    /* 16-align .rodata in the .o file, just in case */
    static const char padding[16]={ 0 };
    int32_t paddingSize=0;

#ifdef U_ELF64
    /* 64-bit Elf file header */
    static Elf64_Ehdr header64={
        {
            /* e_ident[] */
            ELFMAG0, ELFMAG1, ELFMAG2, ELFMAG3,
            ELFCLASS64,
            U_IS_BIG_ENDIAN ? ELFDATA2MSB : ELFDATA2LSB,
            EV_CURRENT /* EI_VERSION */
        },
        ET_REL,
        EM_X86_64,
        EV_CURRENT, /* e_version */
        0, /* e_entry */
        0, /* e_phoff */
        (Elf64_Off)sizeof(Elf64_Ehdr), /* e_shoff */
        0, /* e_flags */
        (Elf64_Half)sizeof(Elf64_Ehdr), /* eh_size */
        0, /* e_phentsize */
        0, /* e_phnum */
        (Elf64_Half)sizeof(Elf64_Shdr), /* e_shentsize */
        5, /* e_shnum */
        2 /* e_shstrndx */
    };

    /* 64-bit Elf section header table */
    static Elf64_Shdr sectionHeaders64[5]={
        { /* SHN_UNDEF */
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0
        },
        { /* .symtab */
            1, /* sh_name */
            SHT_SYMTAB,
            0, /* sh_flags */
            0, /* sh_addr */
            (Elf64_Off)(sizeof(header64)+sizeof(sectionHeaders64)), /* sh_offset */
            (Elf64_Xword)(2*sizeof(Elf64_Sym)), /* sh_size */
            3, /* sh_link=sect hdr index of .strtab */
            1, /* sh_info=One greater than the symbol table index of the last
                * local symbol (with STB_LOCAL). */
            4, /* sh_addralign */
            (Elf64_Xword)(sizeof(Elf64_Sym)) /* sh_entsize */
        },
        { /* .shstrtab */
            9, /* sh_name */
            SHT_STRTAB,
            0, /* sh_flags */
            0, /* sh_addr */
            (Elf64_Off)(sizeof(header64)+sizeof(sectionHeaders64)+2*sizeof(Elf64_Sym)), /* sh_offset */
            40, /* sh_size */
            0, /* sh_link */
            0, /* sh_info */
            1, /* sh_addralign */
            0 /* sh_entsize */
        },
        { /* .strtab */
            19, /* sh_name */
            SHT_STRTAB,
            0, /* sh_flags */
            0, /* sh_addr */
            (Elf64_Off)(sizeof(header64)+sizeof(sectionHeaders64)+2*sizeof(Elf64_Sym)+40), /* sh_offset */
            (Elf64_Xword)sizeof(entry), /* sh_size */
            0, /* sh_link */
            0, /* sh_info */
            1, /* sh_addralign */
            0 /* sh_entsize */
        },
        { /* .rodata */
            27, /* sh_name */
            SHT_PROGBITS,
            SHF_ALLOC, /* sh_flags */
            0, /* sh_addr */
            (Elf64_Off)(sizeof(header64)+sizeof(sectionHeaders64)+2*sizeof(Elf64_Sym)+40+sizeof(entry)), /* sh_offset; recomputed for each file below */
            0, /* sh_size */
            0, /* sh_link */
            0, /* sh_info */
            16, /* sh_addralign */
            0 /* sh_entsize */
        }
    };

    /*
     * 64-bit symbol table
     * careful: different order of items compared with Elf32_sym!
     */
    static Elf64_Sym symbols64[2]={
        { /* STN_UNDEF */
            0, 0, 0, 0, 0, 0
        },
        { /* data entry point */
            1, /* st_name */
            ELF64_ST_INFO(STB_GLOBAL, STT_OBJECT),
            0, /* st_other */
            4, /* st_shndx=index of related section table entry */
            0, /* st_value */
            0 /* st_size */
        }
    };

#endif /* U_ELF64 */

    /* entry[] have a leading NUL */
    entryOffset=1;

    /* in the common code, count entryLength from after the NUL */
    entryLengthOffset=1;

    newSuffix=".o";

#elif U_PLATFORM_HAS_WIN32_API
    struct {
        IMAGE_FILE_HEADER fileHeader;
        IMAGE_SECTION_HEADER sections[2];
        char linkerOptions[100];
    } objHeader;
    IMAGE_SYMBOL symbols[1];
    struct {
        DWORD sizeofLongNames;
        char longNames[100];
    } symbolNames;

    /*
     * entry sometimes have a leading '_'
     * overwritten if entryOffset==0 depending on the target platform
     * see check for cpu below
     */
    entry[0]='_';

    newSuffix=".obj";
#else
#   error "Unknown platform for CAN_GENERATE_OBJECTS."
#endif

    /* deal with options, files and the entry point name */
    getArchitecture(&cpu, &bits, &makeBigEndian, optMatchArch);
    if (optMatchArch)
    {
        printf("genccode: --match-arch cpu=%hu bits=%hu big-endian=%d\n", cpu, bits, makeBigEndian);
    }
    else
    {
        printf("genccode: using architecture cpu=%hu bits=%hu big-endian=%d\n", cpu, bits, makeBigEndian);
    }
#if U_PLATFORM_HAS_WIN32_API
    if(cpu==IMAGE_FILE_MACHINE_I386) {
        entryOffset=1;
    }
#endif

    in=T_FileStream_open(filename, "rb");
    if(in==nullptr) {
        fprintf(stderr, "genccode: unable to open input file %s\n", filename);
        exit(U_FILE_ACCESS_ERROR);
    }
    size=T_FileStream_size(in);

    getOutFilename(
        filename,
        destdir,
        buffer,
        sizeof(buffer),
        entry + entryOffset,
        sizeof(entry) - entryOffset,
        newSuffix,
        optFilename);

    if (outFilePath != nullptr) {
        if (uprv_strlen(buffer) >= outFilePathCapacity) {
            fprintf(stderr, "genccode: filename too long\n");
            exit(U_ILLEGAL_ARGUMENT_ERROR);
        }
        uprv_strcpy(outFilePath, buffer);
    }

    if(optEntryPoint != nullptr) {
        /* entry+entryOffset must hold optEntryPoint, "_dat" and the terminating NUL */
        if(uprv_strlen(optEntryPoint)+4 >= sizeof(entry)-entryOffset) {
            fprintf(stderr, "genccode: entry point name %s is too long\n", optEntryPoint);
            exit(U_ILLEGAL_ARGUMENT_ERROR);
        }
        uprv_strcpy(entry+entryOffset, optEntryPoint);
        uprv_strcat(entry+entryOffset, "_dat");
    }
    /* turn dashes in the entry name into underscores */
    entryLength=(int32_t)uprv_strlen(entry+entryLengthOffset);
    for(i=0; i<entryLength; ++i) {
        if(entry[entryLengthOffset+i]=='-') {
            entry[entryLengthOffset+i]='_';
        }
    }

    /* open the output file */
    out=T_FileStream_open(buffer, "wb");
    if(out==nullptr) {
        fprintf(stderr, "genccode: unable to open output file %s\n", buffer);
        exit(U_FILE_ACCESS_ERROR);
    }

#ifdef U_ELF
    if(bits==32) {
        header32.e_ident[EI_DATA]= makeBigEndian ? ELFDATA2MSB : ELFDATA2LSB;
        header32.e_machine=cpu;

        /*
         * 16-align .rodata in the .o file, just in case.
         * The section headers are static, so always start from the unpadded
         * offset (the number of bytes written before the padding) instead of
         * adjusting sh_offset in place; otherwise a second call would skip
         * the padding while still advertising the padded offset.
         */
        const Elf32_Off rodataStart32=(Elf32_Off)(
            sizeof(header32)+sizeof(sectionHeaders32)+sizeof(symbols32)+
            sizeof(sectionStrings)+sizeof(entry));
        paddingSize=(int32_t)(rodataStart32 & 0xf);
        if(paddingSize!=0) {
            paddingSize=0x10-paddingSize;
        }
        sectionHeaders32[4].sh_offset=(Elf32_Off)(rodataStart32+paddingSize);

        sectionHeaders32[4].sh_size=(Elf32_Word)size;

        symbols32[1].st_size=(Elf32_Word)size;

        /* write .o headers */
        T_FileStream_write(out, &header32, (int32_t)sizeof(header32));
        T_FileStream_write(out, sectionHeaders32, (int32_t)sizeof(sectionHeaders32));
        T_FileStream_write(out, symbols32, (int32_t)sizeof(symbols32));
    } else /* bits==64 */ {
#ifdef U_ELF64
        header64.e_ident[EI_DATA]= makeBigEndian ? ELFDATA2MSB : ELFDATA2LSB;
        header64.e_machine=cpu;

        /* 16-align .rodata in the .o file, just in case (see the 32-bit branch) */
        const Elf64_Off rodataStart64=(Elf64_Off)(
            sizeof(header64)+sizeof(sectionHeaders64)+sizeof(symbols64)+
            sizeof(sectionStrings)+sizeof(entry));
        paddingSize=(int32_t)(rodataStart64 & 0xf);
        if(paddingSize!=0) {
            paddingSize=0x10-paddingSize;
        }
        sectionHeaders64[4].sh_offset=(Elf64_Off)(rodataStart64+paddingSize);

        sectionHeaders64[4].sh_size=(Elf64_Xword)size;

        symbols64[1].st_size=(Elf64_Xword)size;

        /* write .o headers */
        T_FileStream_write(out, &header64, (int32_t)sizeof(header64));
        T_FileStream_write(out, sectionHeaders64, (int32_t)sizeof(sectionHeaders64));
        T_FileStream_write(out, symbols64, (int32_t)sizeof(symbols64));
#endif
    }

    T_FileStream_write(out, sectionStrings, (int32_t)sizeof(sectionStrings));
    T_FileStream_write(out, entry, (int32_t)sizeof(entry));
    if(paddingSize!=0) {
        T_FileStream_write(out, padding, paddingSize);
    }
#elif U_PLATFORM_HAS_WIN32_API
    /* populate the .obj headers */
    uprv_memset(&objHeader, 0, sizeof(objHeader));
    uprv_memset(&symbols, 0, sizeof(symbols));
    uprv_memset(&symbolNames, 0, sizeof(symbolNames));

    /* write the linker export directive */
    if (optWinDllExport) {
        /* "-export:" (8) + entry + ",data " (6) + NUL must fit into linkerOptions[] */
        if((size_t)entryLength+8+6 >= sizeof(objHeader.linkerOptions)) {
            fprintf(stderr, "genccode: entry point name %s is too long for the linker export directive\n", entry);
            exit(U_ILLEGAL_ARGUMENT_ERROR);
        }
        uprv_strcpy(objHeader.linkerOptions, "-export:");
        length=8;
        uprv_strcpy(objHeader.linkerOptions+length, entry);
        length+=entryLength;
        uprv_strcpy(objHeader.linkerOptions+length, ",data ");
        length+=6;
    }
    else {
        length=0;
    }

    /* set the file header */
    objHeader.fileHeader.Machine=cpu;
    objHeader.fileHeader.NumberOfSections=2;
    objHeader.fileHeader.TimeDateStamp=(DWORD)time(nullptr);
    objHeader.fileHeader.PointerToSymbolTable=IMAGE_SIZEOF_FILE_HEADER+2*IMAGE_SIZEOF_SECTION_HEADER+length+size; /* start of symbol table */
    objHeader.fileHeader.NumberOfSymbols=1;

    /* set the section for the linker options */
    uprv_strncpy((char *)objHeader.sections[0].Name, ".drectve", 8);
    objHeader.sections[0].SizeOfRawData=length;
    objHeader.sections[0].PointerToRawData=IMAGE_SIZEOF_FILE_HEADER+2*IMAGE_SIZEOF_SECTION_HEADER;
    objHeader.sections[0].Characteristics=IMAGE_SCN_LNK_INFO|IMAGE_SCN_LNK_REMOVE|IMAGE_SCN_ALIGN_1BYTES;

    /* set the data section */
    uprv_strncpy((char *)objHeader.sections[1].Name, ".rdata", 6);
    objHeader.sections[1].SizeOfRawData=size;
    objHeader.sections[1].PointerToRawData=IMAGE_SIZEOF_FILE_HEADER+2*IMAGE_SIZEOF_SECTION_HEADER+length;
    objHeader.sections[1].Characteristics=IMAGE_SCN_CNT_INITIALIZED_DATA|IMAGE_SCN_ALIGN_16BYTES|IMAGE_SCN_MEM_READ;

    /* set the symbol table */
    if(entryLength<=8) {
        uprv_strncpy((char *)symbols[0].N.ShortName, entry, entryLength);
        symbolNames.sizeofLongNames=4;
    } else {
        symbols[0].N.Name.Short=0;
        symbols[0].N.Name.Long=4;
        symbolNames.sizeofLongNames=4+entryLength+1;
        uprv_strcpy(symbolNames.longNames, entry);
    }
    symbols[0].SectionNumber=2;
    symbols[0].StorageClass=IMAGE_SYM_CLASS_EXTERNAL;

    /* write the file header and the linker options section */
    T_FileStream_write(out, &objHeader, objHeader.sections[1].PointerToRawData);
#else
#   error "Unknown platform for CAN_GENERATE_OBJECTS."
#endif

    /* copy the data file into section 2 */
    for(;;) {
        length=T_FileStream_read(in, buffer, sizeof(buffer));
        if(length==0) {
            break;
        }
        T_FileStream_write(out, buffer, (int32_t)length);
    }

#if U_PLATFORM_HAS_WIN32_API
    /* write the symbol table */
    T_FileStream_write(out, symbols, IMAGE_SIZEOF_SYMBOL);
    T_FileStream_write(out, &symbolNames, symbolNames.sizeofLongNames);
#endif

    if(T_FileStream_error(in)) {
        fprintf(stderr, "genccode: file read error while generating from file %s\n", filename);
        exit(U_FILE_ACCESS_ERROR);
    }

    if(T_FileStream_error(out)) {
        fprintf(stderr, "genccode: file write error while generating from file %s\n", filename);
        exit(U_FILE_ACCESS_ERROR);
    }

    T_FileStream_close(out);
    T_FileStream_close(in);
}
#endif
+ ******************************************************************************* + */ + +#ifndef __PKG_GENC_H__ +#define __PKG_GENC_H__ + +#include "unicode/utypes.h" +#include "toolutil.h" + +#include "unicode/putil.h" +#include "putilimp.h" + +/*** Platform #defines move here ***/ +#if U_PLATFORM_HAS_WIN32_API +#ifdef __GNUC__ +#define WINDOWS_WITH_GNUC +#else +#define WINDOWS_WITH_MSVC +#endif +#endif + + +#if !defined(WINDOWS_WITH_MSVC) +#define BUILD_DATA_WITHOUT_ASSEMBLY +#endif + +#ifndef U_DISABLE_OBJ_CODE /* testing */ +#if defined(WINDOWS_WITH_MSVC) || U_PLATFORM_IS_LINUX_BASED +#define CAN_WRITE_OBJ_CODE +#endif +#if U_PLATFORM_HAS_WIN32_API || defined(U_ELF) +#define CAN_GENERATE_OBJECTS +#endif +#endif + +#if U_PLATFORM == U_PF_CYGWIN || defined(CYGWINMSVC) +#define USING_CYGWIN +#endif + +/* + * When building the data library without assembly, + * some platforms use a single c code file for all of + * the data to generate the final data library. This can + * increase the performance of the pkgdata tool. + */ +#if U_PLATFORM == U_PF_OS400 +#define USE_SINGLE_CCODE_FILE +#endif + +/* Need to fix the file separator character when using MinGW.
*/ +#if defined(WINDOWS_WITH_GNUC) || defined(USING_CYGWIN) +#define PKGDATA_FILE_SEP_STRING "/" +#else +#define PKGDATA_FILE_SEP_STRING U_FILE_SEP_STRING +#endif + +#define LARGE_BUFFER_MAX_SIZE 2048 +#define SMALL_BUFFER_MAX_SIZE 512 +#define SMALL_BUFFER_FLAG_NAMES 32 +#define BUFFER_PADDING_SIZE 20 + +/** End platform defines **/ + + + +U_CAPI void U_EXPORT2 +printAssemblyHeadersToStdErr(void); + +U_CAPI UBool U_EXPORT2 +checkAssemblyHeaderName(const char* optAssembly); + +U_CAPI void U_EXPORT2 +writeCCode( + const char *filename, + const char *destdir, + const char *optEntryPoint, + const char *optName, + const char *optFilename, + char *outFilePath, + size_t outFilePathCapacity); + +U_CAPI void U_EXPORT2 +writeAssemblyCode( + const char *filename, + const char *destdir, + const char *optEntryPoint, + const char *optFilename, + char *outFilePath, + size_t outFilePathCapacity); + +U_CAPI void U_EXPORT2 +writeObjectCode( + const char *filename, + const char *destdir, + const char *optEntryPoint, + const char *optMatchArch, + const char *optFilename, + char *outFilePath, + size_t outFilePathCapacity, + UBool optWinDllExport); + +#endif diff --git a/intl/icu/source/tools/toolutil/pkg_gencmn.cpp b/intl/icu/source/tools/toolutil/pkg_gencmn.cpp new file mode 100644 index 0000000000..a301c322eb --- /dev/null +++ b/intl/icu/source/tools/toolutil/pkg_gencmn.cpp @@ -0,0 +1,578 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/****************************************************************************** + * Copyright (C) 2008-2012, International Business Machines + * Corporation and others. All Rights Reserved. 
+ ******************************************************************************* + */ +#include "unicode/utypes.h" + +#include <stdio.h> +#include <stdlib.h> +#include "unicode/utypes.h" +#include "unicode/putil.h" +#include "cmemory.h" +#include "cstring.h" +#include "filestrm.h" +#include "toolutil.h" +#include "unicode/uclean.h" +#include "unewdata.h" +#include "putilimp.h" +#include "pkg_gencmn.h" + +#define STRING_STORE_SIZE 200000 + +#define COMMON_DATA_NAME U_ICUDATA_NAME +#define DATA_TYPE "dat" + +/* ICU package data file format (.dat files) ------------------------------- *** + +Description of the data format after the usual ICU data file header +(UDataInfo etc.). + +Format version 1 + +A .dat package file contains a simple Table of Contents of item names, +followed by the items themselves: + +1. ToC table + +uint32_t count; - number of items +UDataOffsetTOCEntry entry[count]; - pair of uint32_t values per item: + uint32_t nameOffset; - offset of the item name + uint32_t dataOffset; - offset of the item data +both are byte offsets from the beginning of the data + +2. item name strings + +All item names are stored as char * strings in one block between the ToC table +and the data items. + +3. data items + +The data items are stored following the item names block. +Each data item is 16-aligned. +The data items are stored in the sorted order of their names. + +Therefore, the top of the name strings block is the offset of the first item, +the length of the last item is the difference between its offset and +the .dat file length, and the length of all previous items is the difference +between its offset and the next one. + +----------------------------------------------------------------------------- */ + +/* UDataInfo cf. 
udata.h */ +static const UDataInfo dataInfo={ + sizeof(UDataInfo), + 0, + + U_IS_BIG_ENDIAN, + U_CHARSET_FAMILY, + sizeof(char16_t), + 0, + + {0x43, 0x6d, 0x6e, 0x44}, /* dataFormat="CmnD" */ + {1, 0, 0, 0}, /* formatVersion */ + {3, 0, 0, 0} /* dataVersion */ +}; + +static uint32_t maxSize; + +static char stringStore[STRING_STORE_SIZE]; +static uint32_t stringTop=0, basenameTotal=0; + +typedef struct { + char *pathname, *basename; + uint32_t basenameLength, basenameOffset, fileSize, fileOffset; +} File; + +#define CHUNK_FILE_COUNT 256 +static File *files = nullptr; +static uint32_t fileCount=0; +static uint32_t fileMax = 0; + + +static char *symPrefix = nullptr; + +#define LINE_BUFFER_SIZE 512 +/* prototypes --------------------------------------------------------------- */ + +static void +addFile(const char *filename, const char *name, const char *source, UBool sourceTOC, UBool verbose); + +static char * +allocString(uint32_t length); + +U_CDECL_BEGIN +static int +compareFiles(const void *file1, const void *file2); +U_CDECL_END + +static char * +pathToFullPath(const char *path, const char *source); + +/* map non-tree separator (such as '\') to tree separator ('/') inplace. 
*/ +static void +fixDirToTreePath(char *s); +/* -------------------------------------------------------------------------- */ + +U_CAPI void U_EXPORT2 +createCommonDataFile(const char *destDir, const char *name, const char *entrypointName, const char *type, const char *source, const char *copyRight, + const char *dataFile, uint32_t max_size, UBool sourceTOC, UBool verbose, char *gencmnFileName) { + static char buffer[4096]; + char *line; + char *linePtr; + char *s = nullptr; + UErrorCode errorCode=U_ZERO_ERROR; + uint32_t i, fileOffset, basenameOffset, length, nread; + FileStream *in, *file; + + line = (char *)uprv_malloc(sizeof(char) * LINE_BUFFER_SIZE); + if (line == nullptr) { + fprintf(stderr, "gencmn: unable to allocate memory for line buffer of size %d\n", LINE_BUFFER_SIZE); + exit(U_MEMORY_ALLOCATION_ERROR); + } + + linePtr = line; + + maxSize = max_size; + + if (destDir == nullptr) { + destDir = u_getDataDirectory(); + } + if (name == nullptr) { + name = COMMON_DATA_NAME; + } + if (type == nullptr) { + type = DATA_TYPE; + } + if (source == nullptr) { + source = "."; + } + + if (dataFile == nullptr) { + in = T_FileStream_stdin(); + } else { + in = T_FileStream_open(dataFile, "r"); + if(in == nullptr) { + fprintf(stderr, "gencmn: unable to open input file %s\n", dataFile); + exit(U_FILE_ACCESS_ERROR); + } + } + + if (verbose) { + if(sourceTOC) { + printf("generating %s_%s.c (table of contents source file)\n", name, type); + } else { + printf("generating %s.%s (common data file with table of contents)\n", name, type); + } + } + + /* read the list of files and get their lengths */ + while((s != nullptr && *s != 0) || (s=T_FileStream_readLine(in, (line=linePtr), + LINE_BUFFER_SIZE))!=nullptr) { + /* remove trailing newline characters and parse space separated items */ + if (s != nullptr && *s != 0) { + line=s; + } else { + s=line; + } + while(*s!=0) { + if(*s==' ') { + *s=0; + ++s; + break; + } else if(*s=='\r' || *s=='\n') { + *s=0; + break; + } + ++s; + } + + 
/* check for comment */ + + if (*line == '#') { + continue; + } + + /* add the file */ +#if (U_FILE_SEP_CHAR != U_FILE_ALT_SEP_CHAR) + { + char *t; + while((t = uprv_strchr(line,U_FILE_ALT_SEP_CHAR))) { + *t = U_FILE_SEP_CHAR; + } + } +#endif + addFile(getLongPathname(line), name, source, sourceTOC, verbose); + } + + uprv_free(linePtr); + + if(in!=T_FileStream_stdin()) { + T_FileStream_close(in); + } + + if(fileCount==0) { + fprintf(stderr, "gencmn: no files listed in %s\n", dataFile == nullptr ? "<stdin>" : dataFile); + return; + } + + /* sort the files by basename */ + qsort(files, fileCount, sizeof(File), compareFiles); + + if(!sourceTOC) { + UNewDataMemory *out; + + /* determine the offsets of all basenames and files in this common one */ + basenameOffset=4+8*fileCount; + fileOffset=(basenameOffset+(basenameTotal+15))&~0xf; + for(i=0; i<fileCount; ++i) { + files[i].fileOffset=fileOffset; + fileOffset+=(files[i].fileSize+15)&~0xf; + files[i].basenameOffset=basenameOffset; + basenameOffset+=files[i].basenameLength; + } + + /* create the output file */ + out=udata_create(destDir, type, name, + &dataInfo, + copyRight == nullptr ? 
U_COPYRIGHT_STRING : copyRight, + &errorCode); + if(U_FAILURE(errorCode)) { + fprintf(stderr, "gencmn: udata_create(-d %s -n %s -t %s) failed - %s\n", + destDir, name, type, + u_errorName(errorCode)); + exit(errorCode); + } + + /* write the table of contents */ + udata_write32(out, fileCount); + for(i=0; i<fileCount; ++i) { + udata_write32(out, files[i].basenameOffset); + udata_write32(out, files[i].fileOffset); + } + + /* write the basenames */ + for(i=0; i<fileCount; ++i) { + udata_writeString(out, files[i].basename, files[i].basenameLength); + } + length=4+8*fileCount+basenameTotal; + + /* copy the files */ + for(i=0; i<fileCount; ++i) { + /* pad to 16-align the next file */ + length&=0xf; + if(length!=0) { + udata_writePadding(out, 16-length); + } + + if (verbose) { + printf("adding %s (%ld byte%s)\n", files[i].pathname, (long)files[i].fileSize, files[i].fileSize == 1 ? "" : "s"); + } + + /* copy the next file */ + file=T_FileStream_open(files[i].pathname, "rb"); + if(file==nullptr) { + fprintf(stderr, "gencmn: unable to open listed file %s\n", files[i].pathname); + exit(U_FILE_ACCESS_ERROR); + } + for(nread = 0;;) { + length=T_FileStream_read(file, buffer, sizeof(buffer)); + if(length <= 0) { + break; + } + nread += length; + udata_writeBlock(out, buffer, length); + } + T_FileStream_close(file); + length=files[i].fileSize; + + if (nread != files[i].fileSize) { + fprintf(stderr, "gencmn: unable to read %s properly (got %ld/%ld byte%s)\n", files[i].pathname, (long)nread, (long)files[i].fileSize, files[i].fileSize == 1 ? 
"" : "s"); + exit(U_FILE_ACCESS_ERROR); + } + } + + /* pad to 16-align the last file (cleaner, avoids growing .dat files in icuswap) */ + length&=0xf; + if(length!=0) { + udata_writePadding(out, 16-length); + } + + /* finish */ + udata_finish(out, &errorCode); + if(U_FAILURE(errorCode)) { + fprintf(stderr, "gencmn: udata_finish() failed - %s\n", u_errorName(errorCode)); + exit(errorCode); + } + } else { + /* write a .c source file with the table of contents */ + char *filename; + FileStream *out; + + /* create the output filename */ + filename=s=buffer; + uprv_strcpy(filename, destDir); + s=filename+uprv_strlen(filename); + if(s>filename && *(s-1)!=U_FILE_SEP_CHAR) { + *s++=U_FILE_SEP_CHAR; + } + uprv_strcpy(s, name); + if(*(type)!=0) { + s+=uprv_strlen(s); + *s++='_'; + uprv_strcpy(s, type); + } + s+=uprv_strlen(s); + uprv_strcpy(s, ".c"); + + /* open the output file */ + out=T_FileStream_open(filename, "w"); + if (gencmnFileName != nullptr) { + uprv_strcpy(gencmnFileName, filename); + } + if(out==nullptr) { + fprintf(stderr, "gencmn: unable to open .c output file %s\n", filename); + exit(U_FILE_ACCESS_ERROR); + } + + /* write the source file */ + snprintf(buffer, sizeof(buffer), + "/*\n" + " * ICU common data table of contents for %s.%s\n" + " * Automatically generated by icu/source/tools/gencmn/gencmn .\n" + " */\n\n" + "#include \"unicode/utypes.h\"\n" + "#include \"unicode/udata.h\"\n" + "\n" + "/* external symbol declarations for data (%d files) */\n", + name, type, fileCount); + T_FileStream_writeLine(out, buffer); + + snprintf(buffer, sizeof(buffer), "extern const char\n %s%s[]", symPrefix?symPrefix:"", files[0].pathname); + T_FileStream_writeLine(out, buffer); + for(i=1; i<fileCount; ++i) { + snprintf(buffer, sizeof(buffer), ",\n %s%s[]", symPrefix?symPrefix:"", files[i].pathname); + T_FileStream_writeLine(out, buffer); + } + T_FileStream_writeLine(out, ";\n\n"); + + snprintf( + buffer, sizeof(buffer), + "U_EXPORT struct {\n" + " uint16_t headerSize;\n" + 
" uint8_t magic1, magic2;\n" + " UDataInfo info;\n" + " char padding[%lu];\n" + " uint32_t count, reserved;\n" + " struct {\n" + " const char *name;\n" + " const void *data;\n" + " } toc[%lu];\n" + "} U_EXPORT2 %s_dat = {\n" + " 32, 0xda, 0x27, {\n" + " %lu, 0,\n" + " %u, %u, %u, 0,\n" + " {0x54, 0x6f, 0x43, 0x50},\n" + " {1, 0, 0, 0},\n" + " {0, 0, 0, 0}\n" + " },\n" + " \"\", %lu, 0, {\n", + static_cast<unsigned long>(32-4-sizeof(UDataInfo)), + static_cast<unsigned long>(fileCount), + entrypointName, + static_cast<unsigned long>(sizeof(UDataInfo)), + U_IS_BIG_ENDIAN, + U_CHARSET_FAMILY, + U_SIZEOF_UCHAR, + static_cast<unsigned long>(fileCount) + ); + T_FileStream_writeLine(out, buffer); + + snprintf(buffer, sizeof(buffer), " { \"%s\", %s%s }", files[0].basename, symPrefix?symPrefix:"", files[0].pathname); + T_FileStream_writeLine(out, buffer); + for(i=1; i<fileCount; ++i) { + snprintf(buffer, sizeof(buffer), ",\n { \"%s\", %s%s }", files[i].basename, symPrefix?symPrefix:"", files[i].pathname); + T_FileStream_writeLine(out, buffer); + } + + T_FileStream_writeLine(out, "\n }\n};\n"); + T_FileStream_close(out); + + uprv_free(symPrefix); + } +} + +static void +addFile(const char *filename, const char *name, const char *source, UBool sourceTOC, UBool verbose) { + char *s; + uint32_t length; + char *fullPath = nullptr; + + if(fileCount==fileMax) { + fileMax += CHUNK_FILE_COUNT; + files = (File *)uprv_realloc(files, fileMax*sizeof(files[0])); /* note: never freed. */ + if(files==nullptr) { + fprintf(stderr, "pkgdata/gencmn: Could not allocate %u bytes for %d files\n", (unsigned int)(fileMax*sizeof(files[0])), fileCount); + exit(U_MEMORY_ALLOCATION_ERROR); + } + } + + if(!sourceTOC) { + FileStream *file; + + if(uprv_pathIsAbsolute(filename)) { + fprintf(stderr, "gencmn: Error: absolute path encountered. Old style paths are not supported. 
Use relative paths such as 'fur.res' or 'translit%cfur.res'.\n\tBad path: '%s'\n", U_FILE_SEP_CHAR, filename); + exit(U_ILLEGAL_ARGUMENT_ERROR); + } + fullPath = pathToFullPath(filename, source); + /* store the pathname */ + length = (uint32_t)(uprv_strlen(filename) + 1 + uprv_strlen(name) + 1); + s=allocString(length); + uprv_strcpy(s, name); + uprv_strcat(s, U_TREE_ENTRY_SEP_STRING); + uprv_strcat(s, filename); + + /* get the basename */ + fixDirToTreePath(s); + files[fileCount].basename=s; + files[fileCount].basenameLength=length; + + files[fileCount].pathname=fullPath; + + basenameTotal+=length; + + /* try to open the file */ + file=T_FileStream_open(fullPath, "rb"); + if(file==nullptr) { + fprintf(stderr, "gencmn: unable to open listed file %s\n", fullPath); + exit(U_FILE_ACCESS_ERROR); + } + + /* get the file length */ + length=T_FileStream_size(file); + if(T_FileStream_error(file) || length<=20) { + fprintf(stderr, "gencmn: unable to get length of listed file %s\n", fullPath); + exit(U_FILE_ACCESS_ERROR); + } + + T_FileStream_close(file); + + /* do not add files that are longer than maxSize */ + if(maxSize && length>maxSize) { + if (verbose) { + printf("%s ignored (size %ld > %ld)\n", fullPath, (long)length, (long)maxSize); + } + return; + } + files[fileCount].fileSize=length; + } else { + char *t; + /* get and store the basename */ + /* need to include the package name */ + length = (uint32_t)(uprv_strlen(filename) + 1 + uprv_strlen(name) + 1); + s=allocString(length); + uprv_strcpy(s, name); + uprv_strcat(s, U_TREE_ENTRY_SEP_STRING); + uprv_strcat(s, filename); + fixDirToTreePath(s); + files[fileCount].basename=s; + /* turn the basename into an entry point name and store in the pathname field */ + t=files[fileCount].pathname=allocString(length); + while(--length>0) { + if(*s=='.' 
|| *s=='-' || *s=='/') { + *t='_'; + } else { + *t=*s; + } + ++s; + ++t; + } + *t=0; + } + ++fileCount; +} + +static char * +allocString(uint32_t length) { + uint32_t top=stringTop+length; + char *p; + + if(top>STRING_STORE_SIZE) { + fprintf(stderr, "gencmn: out of memory\n"); + exit(U_MEMORY_ALLOCATION_ERROR); + } + p=stringStore+stringTop; + stringTop=top; + return p; +} + +static char * +pathToFullPath(const char *path, const char *source) { + int32_t length; + int32_t newLength; + char *fullPath; + int32_t n; + + length = (uint32_t)(uprv_strlen(path) + 1); + newLength = (length + 1 + (int32_t)uprv_strlen(source)); + fullPath = (char *)uprv_malloc(newLength); + if(source != nullptr) { + uprv_strcpy(fullPath, source); + uprv_strcat(fullPath, U_FILE_SEP_STRING); + } else { + fullPath[0] = 0; + } + n = (int32_t)uprv_strlen(fullPath); + fullPath[n] = 0; /* Suppress compiler warning for unused variable n */ + /* when conditional code below is not compiled. */ + uprv_strcat(fullPath, path); + +#if (U_FILE_ALT_SEP_CHAR != U_TREE_ENTRY_SEP_CHAR) +#if (U_FILE_ALT_SEP_CHAR != U_FILE_SEP_CHAR) + /* replace tree separator (such as '/') with file sep char (such as ':' or '\\') */ + for(;fullPath[n];n++) { + if(fullPath[n] == U_FILE_ALT_SEP_CHAR) { + fullPath[n] = U_FILE_SEP_CHAR; + } + } +#endif +#endif +#if (U_FILE_SEP_CHAR != U_TREE_ENTRY_SEP_CHAR) + /* replace tree separator (such as '/') with file sep char (such as ':' or '\\') */ + for(;fullPath[n];n++) { + if(fullPath[n] == U_TREE_ENTRY_SEP_CHAR) { + fullPath[n] = U_FILE_SEP_CHAR; + } + } +#endif + return fullPath; +} + +U_CDECL_BEGIN +static int +compareFiles(const void *file1, const void *file2) { + /* sort by basename */ + return uprv_strcmp(((File *)file1)->basename, ((File *)file2)->basename); +} +U_CDECL_END + +static void +fixDirToTreePath(char *s) +{ + (void)s; +#if (U_FILE_SEP_CHAR != U_TREE_ENTRY_SEP_CHAR) || ((U_FILE_ALT_SEP_CHAR != U_FILE_SEP_CHAR) && (U_FILE_ALT_SEP_CHAR != U_TREE_ENTRY_SEP_CHAR)) + char 
*t; +#endif +#if (U_FILE_SEP_CHAR != U_TREE_ENTRY_SEP_CHAR) + for(t=s;t=uprv_strchr(t,U_FILE_SEP_CHAR);) { + *t = U_TREE_ENTRY_SEP_CHAR; + } +#endif +#if (U_FILE_ALT_SEP_CHAR != U_FILE_SEP_CHAR) && (U_FILE_ALT_SEP_CHAR != U_TREE_ENTRY_SEP_CHAR) + for(t=s;t=uprv_strchr(t,U_FILE_ALT_SEP_CHAR);) { + *t = U_TREE_ENTRY_SEP_CHAR; + } +#endif +} diff --git a/intl/icu/source/tools/toolutil/pkg_gencmn.h b/intl/icu/source/tools/toolutil/pkg_gencmn.h new file mode 100644 index 0000000000..238239960a --- /dev/null +++ b/intl/icu/source/tools/toolutil/pkg_gencmn.h @@ -0,0 +1,18 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/****************************************************************************** + * Copyright (C) 2008, International Business Machines + * Corporation and others. All Rights Reserved. + ******************************************************************************* + */ + +#ifndef __PKG_GENCMN_H__ +#define __PKG_GENCMN_H__ + +#include "unicode/utypes.h" + +U_CAPI void U_EXPORT2 +createCommonDataFile(const char *destDir, const char *name, const char *entrypointName, const char *type, const char *source, const char *copyRight, + const char *dataFile, uint32_t max_size, UBool sourceTOC, UBool verbose, char *gencmnFileName); + +#endif diff --git a/intl/icu/source/tools/toolutil/pkg_icu.cpp b/intl/icu/source/tools/toolutil/pkg_icu.cpp new file mode 100644 index 0000000000..d9c6717ecd --- /dev/null +++ b/intl/icu/source/tools/toolutil/pkg_icu.cpp @@ -0,0 +1,176 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/****************************************************************************** + * Copyright (C) 2008-2015, International Business Machines + * Corporation and others. All Rights Reserved. 
+ ******************************************************************************* + */ +#include "unicode/utypes.h" +#include "unicode/localpointer.h" +#include "unicode/putil.h" +#include "cstring.h" +#include "toolutil.h" +#include "uoptions.h" +#include "uparse.h" +#include "package.h" +#include "pkg_icu.h" + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +// read a file list -------------------------------------------------------- *** + +U_NAMESPACE_USE + +static const struct { + const char *suffix; + int32_t length; +} listFileSuffixes[]={ + { ".txt", 4 }, + { ".lst", 4 }, + { ".tmp", 4 } +}; + +/* check for multiple text file suffixes to see if this list name is a text file name */ +static UBool +isListTextFile(const char *listname) { + const char *listNameEnd=strchr(listname, 0); + const char *suffix; + int32_t i, length; + for(i=0; i<UPRV_LENGTHOF(listFileSuffixes); ++i) { + suffix=listFileSuffixes[i].suffix; + length=listFileSuffixes[i].length; + if((listNameEnd-listname)>length && 0==memcmp(listNameEnd-length, suffix, length)) { + return true; + } + } + return false; +} + +/* + * Read a file list. + * If the listname ends with ".txt", then read the list file + * (in the system/ invariant charset). + * If the listname ends with ".dat", then read the ICU .dat package file. + * Otherwise, read the file itself as a single-item list. 
+ */ +U_CAPI Package * U_EXPORT2 +readList(const char *filesPath, const char *listname, UBool readContents, Package *listPkgIn) { + Package *listPkg = listPkgIn; + FILE *file; + const char *listNameEnd; + + if(listname==nullptr || listname[0]==0) { + fprintf(stderr, "missing list file\n"); + return nullptr; + } + + if (listPkg == nullptr) { + listPkg=new Package(); + if(listPkg==nullptr) { + fprintf(stderr, "icupkg: not enough memory\n"); + exit(U_MEMORY_ALLOCATION_ERROR); + } + } + + listNameEnd=strchr(listname, 0); + if(isListTextFile(listname)) { + // read the list file + char line[1024]; + char *end; + const char *start; + + file=fopen(listname, "r"); + if(file==nullptr) { + fprintf(stderr, "icupkg: unable to open list file \"%s\"\n", listname); + delete listPkg; + exit(U_FILE_ACCESS_ERROR); + } + + while(fgets(line, sizeof(line), file)) { + // remove comments + end=strchr(line, '#'); + if(end!=nullptr) { + *end=0; + } else { + // remove trailing CR LF + end=strchr(line, 0); + while(line<end && (*(end-1)=='\r' || *(end-1)=='\n')) { + *--end=0; + } + } + + // check first non-whitespace character and + // skip empty lines and + // skip lines starting with reserved characters + start=u_skipWhitespace(line); + if(*start==0 || nullptr!=strchr(U_PKG_RESERVED_CHARS, *start)) { + continue; + } + + // take whitespace-separated items from the line + for(;;) { + // find whitespace after the item or the end of the line + for(end=(char *)start; *end!=0 && *end!=' ' && *end!='\t'; ++end) {} + if(*end==0) { + // this item is the last one on the line + end=nullptr; + } else { + // the item is terminated by whitespace, terminate it with NUL + *end=0; + } + if(readContents) { + listPkg->addFile(filesPath, start); + } else { + listPkg->addItem(start); + } + + // find the start of the next item or exit the loop + if(end==nullptr || *(start=u_skipWhitespace(end+1))==0) { + break; + } + } + } + fclose(file); + } else if((listNameEnd-listname)>4 && 0==memcmp(listNameEnd-4, ".dat", 
4)) { + // read the ICU .dat package + // Accept a .dat file whose name differs from the ToC prefixes. + listPkg->setAutoPrefix(); + listPkg->readPackage(listname); + } else { + // list the single file itself + if(readContents) { + listPkg->addFile(filesPath, listname); + } else { + listPkg->addItem(listname); + } + } + + return listPkg; +} + +U_CAPI int U_EXPORT2 +writePackageDatFile(const char *outFilename, const char *outComment, const char *sourcePath, const char *addList, Package *pkg, char outType) { + LocalPointer<Package> ownedPkg; + LocalPointer<Package> addListPkg; + + if (pkg == nullptr) { + ownedPkg.adoptInstead(new Package); + if(ownedPkg.isNull()) { + fprintf(stderr, "icupkg: not enough memory\n"); + return U_MEMORY_ALLOCATION_ERROR; + } + pkg = ownedPkg.getAlias(); + + addListPkg.adoptInstead(readList(sourcePath, addList, true, nullptr)); + if(addListPkg.isValid()) { + pkg->addItems(*addListPkg); + } else { + return U_ILLEGAL_ARGUMENT_ERROR; + } + } + + pkg->writePackage(outFilename, outType, outComment); + return 0; +} diff --git a/intl/icu/source/tools/toolutil/pkg_icu.h b/intl/icu/source/tools/toolutil/pkg_icu.h new file mode 100644 index 0000000000..638056e60b --- /dev/null +++ b/intl/icu/source/tools/toolutil/pkg_icu.h @@ -0,0 +1,25 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/****************************************************************************** + * Copyright (C) 2008-2016, International Business Machines + * Corporation and others. All Rights Reserved. 
+ ******************************************************************************* + */ + +#ifndef __PKG_ICU_H__ +#define __PKG_ICU_H__ + +#include "unicode/utypes.h" +#include "package.h" + +#define U_PKG_RESERVED_CHARS "\"%&'()*+,-./:;<=>?_" + +U_CAPI int U_EXPORT2 +writePackageDatFile(const char *outFilename, const char *outComment, + const char *sourcePath, const char *addList, icu::Package *pkg, + char outType); + +U_CAPI icu::Package * U_EXPORT2 +readList(const char *filesPath, const char *listname, UBool readContents, icu::Package *listPkgIn); + +#endif diff --git a/intl/icu/source/tools/toolutil/pkg_imp.h b/intl/icu/source/tools/toolutil/pkg_imp.h new file mode 100644 index 0000000000..29abd8d83c --- /dev/null +++ b/intl/icu/source/tools/toolutil/pkg_imp.h @@ -0,0 +1,38 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2005-2016, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: pkg_imp.h +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2005sep18 +* created by: Markus W. Scherer +* +* Implementation definitions for data package functions in toolutil. +*/ + +#ifndef __PKG_IMP_H__ +#define __PKG_IMP_H__ + +#include "unicode/utypes.h" +#include "unicode/udata.h" + +/* + * Read an ICU data item with any platform type, + * return the pointer to the UDataInfo in its header, + * and set the lengths of the UDataInfo and of the whole header. + * All data remains in its platform type. 
+ */ +U_CFUNC const UDataInfo * +getDataInfo(const uint8_t *data, int32_t length, + int32_t &infoLength, int32_t &headerLength, + UErrorCode *pErrorCode); + +#endif diff --git a/intl/icu/source/tools/toolutil/pkgitems.cpp b/intl/icu/source/tools/toolutil/pkgitems.cpp new file mode 100644 index 0000000000..e49775d56d --- /dev/null +++ b/intl/icu/source/tools/toolutil/pkgitems.cpp @@ -0,0 +1,645 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2003-2015, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: pkgitems.cpp +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2005sep18 +* created by: Markus W. Scherer +* +* Companion file to package.cpp. Deals with details of ICU data item formats. +* Used for item dependencies. +* Contains adapted code from ucnv_bld.c (swapper code from 2003). 
+*/ + +#include "unicode/utypes.h" +#include "unicode/ures.h" +#include "unicode/putil.h" +#include "unicode/udata.h" +#include "cstring.h" +#include "uinvchar.h" +#include "ucmndata.h" +#include "udataswp.h" +#include "swapimpl.h" +#include "toolutil.h" +#include "package.h" +#include "pkg_imp.h" + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +/* item formats in common */ + +#include "uresdata.h" +#include "ucnv_bld.h" +#include "ucnv_io.h" + +// general definitions ----------------------------------------------------- *** + +U_CDECL_BEGIN + +static void U_CALLCONV +printError(void *context, const char *fmt, va_list args) { + vfprintf((FILE *)context, fmt, args); +} + +U_CDECL_END + +// a data item in native-platform form ------------------------------------- *** + +U_NAMESPACE_BEGIN + +class NativeItem { +public: + NativeItem() : pItem(nullptr), pInfo(nullptr), bytes(nullptr), swapped(nullptr), length(0) {} + NativeItem(const Item *item, UDataSwapFn *swap) : swapped(nullptr) { + setItem(item, swap); + } + ~NativeItem() { + delete [] swapped; + } + const UDataInfo *getDataInfo() const { + return pInfo; + } + const uint8_t *getBytes() const { + return bytes; + } + int32_t getLength() const { + return length; + } + + void setItem(const Item *item, UDataSwapFn *swap) { + pItem=item; + int32_t infoLength, itemHeaderLength; + UErrorCode errorCode=U_ZERO_ERROR; + pInfo=::getDataInfo(pItem->data, pItem->length, infoLength, itemHeaderLength, &errorCode); + if(U_FAILURE(errorCode)) { + exit(errorCode); // should succeed because readFile() checks headers + } + length=pItem->length-itemHeaderLength; + + if(pInfo->isBigEndian==U_IS_BIG_ENDIAN && pInfo->charsetFamily==U_CHARSET_FAMILY) { + bytes=pItem->data+itemHeaderLength; + } else { + UDataSwapper *ds=udata_openSwapper((UBool)pInfo->isBigEndian, pInfo->charsetFamily, U_IS_BIG_ENDIAN, U_CHARSET_FAMILY, &errorCode); + if(U_FAILURE(errorCode)) { + fprintf(stderr, "icupkg: udata_openSwapper(\"%s\") failed - 
%s\n", + pItem->name, u_errorName(errorCode)); + exit(errorCode); + } + + ds->printError=printError; + ds->printErrorContext=stderr; + + swapped=new uint8_t[pItem->length]; + if(swapped==nullptr) { + fprintf(stderr, "icupkg: unable to allocate memory for swapping \"%s\"\n", pItem->name); + exit(U_MEMORY_ALLOCATION_ERROR); + } + swap(ds, pItem->data, pItem->length, swapped, &errorCode); + pInfo=::getDataInfo(swapped, pItem->length, infoLength, itemHeaderLength, &errorCode); + bytes=swapped+itemHeaderLength; + udata_closeSwapper(ds); + } + } + +private: + const Item *pItem; + const UDataInfo *pInfo; + const uint8_t *bytes; + uint8_t *swapped; + int32_t length; +}; + +// check a dependency ------------------------------------------------------ *** + +/* + * assemble the target item name from the source item name, an ID + * and a suffix + */ +static void +makeTargetName(const char *itemName, const char *id, int32_t idLength, const char *suffix, + char *target, int32_t capacity, + UErrorCode *pErrorCode) { + const char *itemID; + int32_t treeLength, suffixLength, targetLength; + + // get the item basename + itemID=strrchr(itemName, '/'); + if(itemID!=nullptr) { + ++itemID; + } else { + itemID=itemName; + } + + // build the target string + treeLength=(int32_t)(itemID-itemName); + if(idLength<0) { + idLength=(int32_t)strlen(id); + } + suffixLength=(int32_t)strlen(suffix); + targetLength=treeLength+idLength+suffixLength; + if(targetLength>=capacity) { + fprintf(stderr, "icupkg/makeTargetName(%s) target item name length %ld too long\n", + itemName, (long)targetLength); + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; + return; + } + + memcpy(target, itemName, treeLength); + memcpy(target+treeLength, id, idLength); + memcpy(target+treeLength+idLength, suffix, suffixLength+1); // +1 includes the terminating NUL +} + +static void +checkIDSuffix(const char *itemName, const char *id, int32_t idLength, const char *suffix, + CheckDependency check, void *context, + UErrorCode *pErrorCode) { 
+ char target[200]; + makeTargetName(itemName, id, idLength, suffix, target, (int32_t)sizeof(target), pErrorCode); + if(U_SUCCESS(*pErrorCode)) { + check(context, itemName, target); + } +} + +/* assemble the target item name from the item's parent item name */ +static void +checkParent(const char *itemName, CheckDependency check, void *context, + UErrorCode *pErrorCode) { + const char *itemID, *parent, *parentLimit, *suffix; + int32_t parentLength; + + // get the item basename + itemID=strrchr(itemName, '/'); + if(itemID!=nullptr) { + ++itemID; + } else { + itemID=itemName; + } + + // get the item suffix + suffix=strrchr(itemID, '.'); + if(suffix==nullptr) { + // empty suffix, point to the end of the string + suffix=strrchr(itemID, 0); + } + + // get the position of the last '_' + for(parentLimit=suffix; parentLimit>itemID && *--parentLimit!='_';) {} + + if(parentLimit!=itemID) { + // get the parent item name by truncating the last part of this item's name */ + parent=itemID; + parentLength=(int32_t)(parentLimit-itemID); + } else { + // no '_' in the item name: the parent is the root bundle + parent="root"; + parentLength=4; + if((suffix-itemID)==parentLength && 0==memcmp(itemID, parent, parentLength)) { + // the item itself is "root", which does not depend on a parent + return; + } + } + checkIDSuffix(itemName, parent, parentLength, suffix, check, context, pErrorCode); +} + +// get dependencies from resource bundles ---------------------------------- *** + +static const char16_t SLASH=0x2f; + +/* + * Check for the alias from the string or alias resource res. 
+ */ +static void +checkAlias(const char *itemName, + Resource res, const char16_t *alias, int32_t length, UBool useResSuffix, + CheckDependency check, void *context, UErrorCode *pErrorCode) { + int32_t i; + + if(!uprv_isInvariantUString(alias, length)) { + fprintf(stderr, "icupkg/ures_enumDependencies(%s res=%08x) alias string contains non-invariant characters\n", + itemName, res); + *pErrorCode=U_INVALID_CHAR_FOUND; + return; + } + + // extract the locale ID from alias strings like + // locale_ID/key1/key2/key3 + // locale_ID + + // search for the first slash + for(i=0; i<length && alias[i]!=SLASH; ++i) {} + + if(res_getPublicType(res)==URES_ALIAS) { + // ignore aliases with an initial slash: + // /ICUDATA/... and /pkgname/... go to a different package + // /LOCALE/... are for dynamic sideways fallbacks and don't go to a fixed bundle + if(i==0) { + return; // initial slash ('/') + } + + // ignore the intra-bundle path starting from the first slash ('/') + length=i; + } else /* URES_STRING */ { + // the whole string should only consist of a locale ID + if(i!=length) { + fprintf(stderr, "icupkg/ures_enumDependencies(%s res=%08x) %%ALIAS contains a '/'\n", + itemName, res); + *pErrorCode=U_UNSUPPORTED_ERROR; + return; + } + } + + // convert the Unicode string to char * + char localeID[48]; + if(length>=(int32_t)sizeof(localeID)) { + fprintf(stderr, "icupkg/ures_enumDependencies(%s res=%08x) alias locale ID length %ld too long\n", + itemName, res, (long)length); + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; + return; + } + u_UCharsToChars(alias, localeID, length); + localeID[length]=0; + + checkIDSuffix(itemName, localeID, -1, (useResSuffix ? ".res" : ""), check, context, pErrorCode); +} + +/* + * Enumerate one resource item and its children and extract dependencies from + * aliases. 
+ */ +static UBool +ures_enumDependencies(const char *itemName, + const ResourceData *pResData, + Resource res, const char *inKey, const char *parentKey, int32_t depth, + CheckDependency check, void *context, + Package *pkg, + UErrorCode *pErrorCode) { + UBool doCheckParent = true; // always remains true if depth>1 + switch(res_getPublicType(res)) { + case URES_STRING: + if(depth==1 && inKey!=nullptr && + (0==strcmp(inKey, "%%ALIAS") || 0==strcmp(inKey, "%%Parent"))) { + // Top-level %%ALIAS string: + // The alias resource bundle will be used instead of this one. + // Top-level %%Parent string: + // We use this bundle as well as the explicit parent bundle. + // Either way, the truncation parent is ignored. + doCheckParent = false; + // No tracing: build tool + int32_t length; + const char16_t *alias=res_getStringNoTrace(pResData, res, &length); + checkAlias(itemName, res, alias, length, /*useResSuffix=*/ true, + check, context, pErrorCode); + // If there is a %%ALIAS, then there should be nothing else in this resource bundle. + } else if(depth==2 && parentKey!=nullptr && 0==strcmp(parentKey, "%%DEPENDENCY")) { + // Second-level %%DEPENDENCY string: + // Explicit declaration of a dependency of this item on that one. 
+ // No tracing: build tool + int32_t length; + const char16_t *alias=res_getStringNoTrace(pResData, res, &length); + checkAlias(itemName, res, alias, length, /*useResSuffix=*/ false, + check, context, pErrorCode); + } + // we ignore all other strings + break; + case URES_ALIAS: + { + int32_t length; + const char16_t *alias=res_getAlias(pResData, res, &length); + checkAlias(itemName, res, alias, length, true, check, context, pErrorCode); + } + break; + case URES_TABLE: + { + /* recurse */ + int32_t count=res_countArrayItems(pResData, res); + for(int32_t i=0; i<count; ++i) { + const char *itemKey; + Resource item=res_getTableItemByIndex(pResData, res, i, &itemKey); + // This doCheckParent return value is needed to + // propagate the possible false value from depth=1 to depth=0. + doCheckParent &= ures_enumDependencies( + itemName, pResData, + item, itemKey, + inKey, depth+1, + check, context, + pkg, + pErrorCode); + if(U_FAILURE(*pErrorCode)) { + fprintf(stderr, "icupkg/ures_enumDependencies(%s table res=%08x)[%d].recurse(%s: %08x) failed\n", + itemName, res, i, itemKey, item); + break; + } + } + } + break; + case URES_ARRAY: + { + /* recurse */ + int32_t count=res_countArrayItems(pResData, res); + for(int32_t i=0; i<count; ++i) { + Resource item=res_getArrayItem(pResData, res, i); + ures_enumDependencies( + itemName, pResData, + item, nullptr, + inKey, depth+1, + check, context, + pkg, + pErrorCode); + if(U_FAILURE(*pErrorCode)) { + fprintf(stderr, "icupkg/ures_enumDependencies(%s array res=%08x)[%d].recurse(%08x) failed\n", + itemName, res, i, item); + break; + } + } + } + break; + default: + break; + } + return doCheckParent; +} + +static void +ures_enumDependencies(const char *itemName, const UDataInfo *pInfo, + const uint8_t *inBytes, int32_t length, + CheckDependency check, void *context, + Package *pkg, + UErrorCode *pErrorCode) { + ResourceData resData; + + res_read(&resData, pInfo, inBytes, length, pErrorCode); + if(U_FAILURE(*pErrorCode)) { + 
fprintf(stderr, "icupkg: .res format version %02x.%02x not supported, or bundle malformed\n", + pInfo->formatVersion[0], pInfo->formatVersion[1]); + exit(U_UNSUPPORTED_ERROR); + } + + icu::NativeItem nativePool; + + if(resData.usesPoolBundle) { + char poolName[200]; + makeTargetName(itemName, "pool", 4, ".res", poolName, (int32_t)sizeof(poolName), pErrorCode); + if(U_FAILURE(*pErrorCode)) { + return; + } + check(context, itemName, poolName); + int32_t index=pkg->findItem(poolName); + if(index<0) { + // We cannot work with a bundle if its pool resource is missing. + // check() already printed a complaint. + return; + } + // TODO: Cache the native version in the Item itself. + nativePool.setItem(pkg->getItem(index), ures_swap); + const UDataInfo *poolInfo=nativePool.getDataInfo(); + if(poolInfo->formatVersion[0]<=1) { + fprintf(stderr, "icupkg: %s is not a pool bundle\n", poolName); + return; + } + const int32_t *poolRoot=(const int32_t *)nativePool.getBytes(); + const int32_t *poolIndexes=poolRoot+1; + int32_t poolIndexLength=poolIndexes[URES_INDEX_LENGTH]&0xff; + if(!(poolIndexLength>URES_INDEX_POOL_CHECKSUM && + (poolIndexes[URES_INDEX_ATTRIBUTES]&URES_ATT_IS_POOL_BUNDLE)) + ) { + fprintf(stderr, "icupkg: %s is not a pool bundle\n", poolName); + return; + } + if(resData.pRoot[1+URES_INDEX_POOL_CHECKSUM]==poolIndexes[URES_INDEX_POOL_CHECKSUM]) { + resData.poolBundleKeys=(const char *)(poolIndexes+poolIndexLength); + resData.poolBundleStrings=(const uint16_t *)(poolRoot+poolIndexes[URES_INDEX_KEYS_TOP]); + } else { + fprintf(stderr, "icupkg: %s has mismatched checksum for %s\n", poolName, itemName); + return; + } + } + + UBool doCheckParent = ures_enumDependencies( + itemName, &resData, + resData.rootRes, nullptr, nullptr, 0, + check, context, + pkg, + pErrorCode); + if(!doCheckParent) { + return; + } + + /* + * if the bundle attributes are present and the nofallback flag is not set, + * then add the parent bundle as a dependency + */ + if(pInfo->formatVersion[0]>1 
|| (pInfo->formatVersion[0]==1 && pInfo->formatVersion[1]>=1)) { + if(!resData.noFallback) { + /* this bundle participates in locale fallback */ + checkParent(itemName, check, context, pErrorCode); + } + } +} + +// get dependencies from conversion tables --------------------------------- *** + +#if !UCONFIG_NO_CONVERSION +/* code adapted from ucnv_swap() */ +static void +ucnv_enumDependencies(const UDataSwapper *ds, + const char *itemName, const UDataInfo *pInfo, + const uint8_t *inBytes, int32_t length, + CheckDependency check, void *context, + UErrorCode *pErrorCode) { + uint32_t staticDataSize; + + const UConverterStaticData *inStaticData; + + const _MBCSHeader *inMBCSHeader; + uint8_t outputType; + + /* check format version */ + if(!( + pInfo->formatVersion[0]==6 && + pInfo->formatVersion[1]>=2 + )) { + fprintf(stderr, "icupkg/ucnv_enumDependencies(): .cnv format version %02x.%02x not supported\n", + pInfo->formatVersion[0], pInfo->formatVersion[1]); + exit(U_UNSUPPORTED_ERROR); + } + + /* read the initial UConverterStaticData structure after the UDataInfo header */ + inStaticData=(const UConverterStaticData *)inBytes; + + if( length<(int32_t)sizeof(UConverterStaticData) || + (uint32_t)length<(staticDataSize=ds->readUInt32(inStaticData->structSize)) + ) { + udata_printError(ds, "icupkg/ucnv_enumDependencies(): too few bytes (%d after header) for an ICU .cnv conversion table\n", + length); + *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; + return; + } + + inBytes+=staticDataSize; + length-=(int32_t)staticDataSize; + + /* check for supported conversionType values */ + if(inStaticData->conversionType==UCNV_MBCS) { + /* MBCS data */ + uint32_t mbcsHeaderLength, mbcsHeaderFlags, mbcsHeaderOptions; + int32_t extOffset; + + inMBCSHeader=(const _MBCSHeader *)inBytes; + + if(length<(int32_t)sizeof(_MBCSHeader)) { + udata_printError(ds, "icupkg/ucnv_enumDependencies(): too few bytes (%d after headers) for an ICU MBCS .cnv conversion table\n", + length); + 
*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; + return; + } + if(inMBCSHeader->version[0]==4 && inMBCSHeader->version[1]>=1) { + mbcsHeaderLength=MBCS_HEADER_V4_LENGTH; + } else if(inMBCSHeader->version[0]==5 && inMBCSHeader->version[1]>=3 && + ((mbcsHeaderOptions=ds->readUInt32(inMBCSHeader->options))& + MBCS_OPT_UNKNOWN_INCOMPATIBLE_MASK)==0 + ) { + mbcsHeaderLength=mbcsHeaderOptions&MBCS_OPT_LENGTH_MASK; + } else { + udata_printError(ds, "icupkg/ucnv_enumDependencies(): unsupported _MBCSHeader.version %d.%d\n", + inMBCSHeader->version[0], inMBCSHeader->version[1]); + *pErrorCode=U_UNSUPPORTED_ERROR; + return; + } + + mbcsHeaderFlags=ds->readUInt32(inMBCSHeader->flags); + extOffset=(int32_t)(mbcsHeaderFlags>>8); + outputType=(uint8_t)mbcsHeaderFlags; + + if(outputType==MBCS_OUTPUT_EXT_ONLY) { + /* + * extension-only file, + * contains a base name instead of normal base table data + */ + char baseName[32]; + int32_t baseNameLength; + + /* there is extension data after the base data, see ucnv_ext.h */ + if(length<(extOffset+UCNV_EXT_INDEXES_MIN_LENGTH*4)) { + udata_printError(ds, "icupkg/ucnv_enumDependencies(): too few bytes (%d after headers) for an ICU MBCS .cnv conversion table with extension data\n", + length); + *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; + return; + } + + /* swap the base name, between the header and the extension data */ + const char *inBaseName=(const char *)inBytes+mbcsHeaderLength*4; + baseNameLength=(int32_t)strlen(inBaseName); + if(baseNameLength>=(int32_t)sizeof(baseName)) { + udata_printError(ds, "icupkg/ucnv_enumDependencies(%s): base name length %ld too long\n", + itemName, baseNameLength); + *pErrorCode=U_UNSUPPORTED_ERROR; + return; + } + ds->swapInvChars(ds, inBaseName, baseNameLength+1, baseName, pErrorCode); + + checkIDSuffix(itemName, baseName, -1, ".cnv", check, context, pErrorCode); + } + } +} + +// ICU data formats -------------------------------------------------------- *** + +static const struct { + uint8_t dataFormat[4]; +} 
dataFormats[]={ + { { 0x52, 0x65, 0x73, 0x42 } }, /* dataFormat="ResB" */ + { { 0x63, 0x6e, 0x76, 0x74 } }, /* dataFormat="cnvt" */ + { { 0x43, 0x76, 0x41, 0x6c } } /* dataFormat="CvAl" */ +}; + +enum { + FMT_RES, + FMT_CNV, + FMT_ALIAS, + FMT_COUNT +}; + +static int32_t +getDataFormat(const uint8_t dataFormat[4]) { + int32_t i; + + for(i=0; i<FMT_COUNT; ++i) { + if(0==memcmp(dataFormats[i].dataFormat, dataFormat, 4)) { + return i; + } + } + return -1; +} + +// enumerate dependencies of a package item -------------------------------- *** + +void +Package::enumDependencies(Item *pItem, void *context, CheckDependency check) { + int32_t infoLength, itemHeaderLength; + UErrorCode errorCode=U_ZERO_ERROR; + const UDataInfo *pInfo=getDataInfo(pItem->data, pItem->length, infoLength, itemHeaderLength, &errorCode); + if(U_FAILURE(errorCode)) { + return; // should not occur because readFile() checks headers + } + + // find the data format and call the corresponding function, if any + int32_t format=getDataFormat(pInfo->dataFormat); + if(format>=0) { + switch(format) { + case FMT_RES: + { + /* + * Swap the resource bundle (if necessary) so that we can use + * the normal runtime uresdata.c code to read it. + * We do not want to duplicate that code, especially not together with on-the-fly swapping. 
+ */ + NativeItem nrb(pItem, ures_swap); + ures_enumDependencies(pItem->name, nrb.getDataInfo(), nrb.getBytes(), nrb.getLength(), check, context, this, &errorCode); + break; + } + case FMT_CNV: + { + // TODO: share/cache swappers + UDataSwapper *ds=udata_openSwapper( + (UBool)pInfo->isBigEndian, pInfo->charsetFamily, + U_IS_BIG_ENDIAN, U_CHARSET_FAMILY, + &errorCode); + if(U_FAILURE(errorCode)) { + fprintf(stderr, "icupkg: udata_openSwapper(\"%s\") failed - %s\n", + pItem->name, u_errorName(errorCode)); + exit(errorCode); + } + + ds->printError=printError; + ds->printErrorContext=stderr; + + const uint8_t *inBytes=pItem->data+itemHeaderLength; + int32_t length=pItem->length-itemHeaderLength; + + ucnv_enumDependencies(ds, pItem->name, pInfo, inBytes, length, check, context, &errorCode); + udata_closeSwapper(ds); + break; + } + default: + break; + } + + if(U_FAILURE(errorCode)) { + exit(errorCode); + } + } +} +#endif /* UCONFIG_NO_CONVERSION */ + +U_NAMESPACE_END diff --git a/intl/icu/source/tools/toolutil/ppucd.cpp b/intl/icu/source/tools/toolutil/ppucd.cpp new file mode 100644 index 0000000000..0d59b28ce4 --- /dev/null +++ b/intl/icu/source/tools/toolutil/ppucd.cpp @@ -0,0 +1,622 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* Copyright (C) 2011-2014, International Business Machines +* Corporation and others. All Rights Reserved. +******************************************************************************* +* file name: ppucd.cpp +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2011dec11 +* created by: Markus W. 
Scherer +*/ + +#include "unicode/utypes.h" +#include "unicode/uchar.h" +#include "charstr.h" +#include "cstring.h" +#include "ppucd.h" +#include "uassert.h" +#include "uparse.h" + +#include <stdio.h> +#include <string.h> + +U_NAMESPACE_BEGIN + +PropertyNames::~PropertyNames() {} + +// TODO: Create a concrete subclass for the default PropertyNames implementation +// using the ICU library built-in property names API & data. +// Currently only the genprops tool uses PreparsedUCD, and provides its own +// PropertyNames implementation using its just-build property names data and its own code. +// At some point, we should use PreparsedUCD in tests, and then we will need the +// default implementation somewhere. +#if 0 +int32_t +PropertyNames::getPropertyEnum(const char *name) const { + return u_getPropertyEnum(name); +} + +int32_t +PropertyNames::getPropertyValueEnum(int32_t property, const char *name) const { + return u_getPropertyValueEnum((UProperty)property, name); +} +#endif + +UniProps::UniProps() + : start(U_SENTINEL), end(U_SENTINEL), + bmg(U_SENTINEL), bpb(U_SENTINEL), + scf(U_SENTINEL), slc(U_SENTINEL), stc(U_SENTINEL), suc(U_SENTINEL), + digitValue(-1), numericValue(nullptr), + name(nullptr), nameAlias(nullptr) { + memset(binProps, 0, sizeof(binProps)); + memset(intProps, 0, sizeof(intProps)); + memset(age, 0, 4); +} + +UniProps::~UniProps() {} + +const int32_t PreparsedUCD::kNumLineBuffers; + +PreparsedUCD::PreparsedUCD(const char *filename, UErrorCode &errorCode) + : pnames(nullptr), + file(nullptr), + defaultLineIndex(-1), blockLineIndex(-1), lineIndex(0), + lineNumber(0), + lineType(NO_LINE), + fieldLimit(nullptr), lineLimit(nullptr) { + if(U_FAILURE(errorCode)) { return; } + + if(filename==nullptr || *filename==0 || (*filename=='-' && filename[1]==0)) { + filename=nullptr; + file=stdin; + } else { + file=fopen(filename, "r"); + } + if(file==nullptr) { + perror("error opening preparsed UCD"); + fprintf(stderr, "error opening preparsed UCD file %s\n", 
filename ? filename : "\"no file name given\""); + errorCode=U_FILE_ACCESS_ERROR; + return; + } + + memset(ucdVersion, 0, 4); + lines[0][0]=0; +} + +PreparsedUCD::~PreparsedUCD() { + if(file!=stdin) { + fclose(file); + } +} + +// Same order as the LineType values. +static const char *lineTypeStrings[]={ + nullptr, + nullptr, + "ucd", + "property", + "binary", + "value", + "defaults", + "block", + "cp", + "unassigned", + "algnamesrange" +}; + +PreparsedUCD::LineType +PreparsedUCD::readLine(UErrorCode &errorCode) { + if(U_FAILURE(errorCode)) { return NO_LINE; } + // Select the next available line buffer. + while(!isLineBufferAvailable(lineIndex)) { + ++lineIndex; + if (lineIndex == kNumLineBuffers) { + lineIndex = 0; + } + } + char *line=lines[lineIndex]; + *line=0; + lineLimit=fieldLimit=line; + lineType=NO_LINE; + char *result=fgets(line, sizeof(lines[0]), file); + if(result==nullptr) { + if(ferror(file)) { + perror("error reading preparsed UCD"); + fprintf(stderr, "error reading preparsed UCD before line %ld\n", (long)lineNumber); + errorCode=U_FILE_ACCESS_ERROR; + } + return NO_LINE; + } + ++lineNumber; + if(*line=='#') { + fieldLimit=strchr(line, 0); + return lineType=EMPTY_LINE; + } + // Remove trailing /r/n. + char c; + char *limit=strchr(line, 0); + while(line<limit && ((c=*(limit-1))=='\n' || c=='\r')) { --limit; } + // Remove trailing white space. + while(line<limit && ((c=*(limit-1))==' ' || c=='\t')) { --limit; } + *limit=0; + lineLimit=limit; + if(line==limit) { + fieldLimit=limit; + return lineType=EMPTY_LINE; + } + // Split by ';'. + char *semi=line; + while((semi=strchr(semi, ';'))!=nullptr) { *semi++=0; } + fieldLimit=strchr(line, 0); + // Determine the line type. 
+ int32_t type; + for(type=EMPTY_LINE+1;; ++type) { + if(type==LINE_TYPE_COUNT) { + fprintf(stderr, + "error in preparsed UCD: unknown line type (first field) '%s' on line %ld\n", + line, (long)lineNumber); + errorCode=U_PARSE_ERROR; + return NO_LINE; + } + if(0==strcmp(line, lineTypeStrings[type])) { + break; + } + } + lineType=(LineType)type; + if(lineType==UNICODE_VERSION_LINE && fieldLimit<lineLimit) { + u_versionFromString(ucdVersion, fieldLimit+1); + } + return lineType; +} + +const char * +PreparsedUCD::firstField() { + char *field=lines[lineIndex]; + fieldLimit=strchr(field, 0); + return field; +} + +const char * +PreparsedUCD::nextField() { + if(fieldLimit==lineLimit) { return nullptr; } + char *field=fieldLimit+1; + fieldLimit=strchr(field, 0); + return field; +} + +const UniProps * +PreparsedUCD::getProps(UnicodeSet &newValues, UErrorCode &errorCode) { + if(U_FAILURE(errorCode)) { return nullptr; } + newValues.clear(); + if(!lineHasPropertyValues()) { + errorCode=U_ILLEGAL_ARGUMENT_ERROR; + return nullptr; + } + firstField(); + const char *field=nextField(); + if(field==nullptr) { + // No range field after the type. + fprintf(stderr, + "error in preparsed UCD: missing default/block/cp range field " + "(no second field) on line %ld\n", + (long)lineNumber); + errorCode=U_PARSE_ERROR; + return nullptr; + } + UChar32 start, end; + if(!parseCodePointRange(field, start, end, errorCode)) { return nullptr; } + UniProps *props; + UBool insideBlock=false; // true if cp or unassigned range inside the block range. + switch(lineType) { + case DEFAULTS_LINE: + // Should occur before any block/cp/unassigned line. 
+ if(blockLineIndex>=0) { + fprintf(stderr, + "error in preparsed UCD: default line %ld after one or more block lines\n", + (long)lineNumber); + errorCode=U_PARSE_ERROR; + return nullptr; + } + if(defaultLineIndex>=0) { + fprintf(stderr, + "error in preparsed UCD: second line with default properties on line %ld\n", + (long)lineNumber); + errorCode=U_PARSE_ERROR; + return nullptr; + } + if(start!=0 || end!=0x10ffff) { + fprintf(stderr, + "error in preparsed UCD: default range must be 0..10FFFF, not '%s' on line %ld\n", + field, (long)lineNumber); + errorCode=U_PARSE_ERROR; + return nullptr; + } + props=&defaultProps; + defaultLineIndex=lineIndex; + break; + case BLOCK_LINE: + blockProps=defaultProps; // Block inherits default properties. + props=&blockProps; + blockLineIndex=lineIndex; + break; + case CP_LINE: + case UNASSIGNED_LINE: + if(blockProps.start<=start && end<=blockProps.end) { + insideBlock=true; + if(lineType==CP_LINE) { + // Code point range fully inside the last block inherits the block properties. + cpProps=blockProps; + } else { + // Unassigned line inside the block is based on default properties + // which override block properties. + cpProps=defaultProps; + newValues=blockValues; + // Except, it inherits the one blk=Block property. + int32_t blkIndex=UCHAR_BLOCK-UCHAR_INT_START; + cpProps.intProps[blkIndex]=blockProps.intProps[blkIndex]; + newValues.remove((UChar32)UCHAR_BLOCK); + } + } else if(start>blockProps.end || end<blockProps.start) { + // Code point range fully outside the last block inherits the default properties. + cpProps=defaultProps; + } else { + // Code point range partially overlapping with the last block is illegal. 
+ fprintf(stderr, + "error in preparsed UCD: cp range %s on line %ld only " + "partially overlaps with block range %04lX..%04lX\n", + field, (long)lineNumber, (long)blockProps.start, (long)blockProps.end); + errorCode=U_PARSE_ERROR; + return nullptr; + } + props=&cpProps; + break; + default: + // Will not occur because of the range check above. + errorCode=U_ILLEGAL_ARGUMENT_ERROR; + return nullptr; + } + props->start=start; + props->end=end; + while((field=nextField())!=nullptr) { + if(!parseProperty(*props, field, newValues, errorCode)) { return nullptr; } + } + if(lineType==BLOCK_LINE) { + blockValues=newValues; + } else if(lineType==UNASSIGNED_LINE && insideBlock) { + // Unset newValues for values that are the same as the block values. + for(int32_t prop=0; prop<UCHAR_BINARY_LIMIT; ++prop) { + if(newValues.contains(prop) && cpProps.binProps[prop]==blockProps.binProps[prop]) { + newValues.remove(prop); + } + } + for(int32_t prop=UCHAR_INT_START; prop<UCHAR_INT_LIMIT; ++prop) { + int32_t index=prop-UCHAR_INT_START; + if(newValues.contains(prop) && cpProps.intProps[index]==blockProps.intProps[index]) { + newValues.remove(prop); + } + } + } + return props; +} + +static const struct { + const char *name; + int32_t prop; +} ppucdProperties[]={ + { "Name_Alias", PPUCD_NAME_ALIAS }, + { "Conditional_Case_Mappings", PPUCD_CONDITIONAL_CASE_MAPPINGS }, + { "Turkic_Case_Folding", PPUCD_TURKIC_CASE_FOLDING } +}; + +// Returns true for "ok to continue parsing fields". 
+UBool +PreparsedUCD::parseProperty(UniProps &props, const char *field, UnicodeSet &newValues, + UErrorCode &errorCode) { + CharString pBuffer; + const char *p=field; + const char *v=strchr(p, '='); + int binaryValue; + if(*p=='-') { + if(v!=nullptr) { + fprintf(stderr, + "error in preparsed UCD: mix of binary-property-no and " + "enum-property syntax '%s' on line %ld\n", + field, (long)lineNumber); + errorCode=U_PARSE_ERROR; + return false; + } + binaryValue=0; + ++p; + } else if(v==nullptr) { + binaryValue=1; + } else { + binaryValue=-1; + // Copy out the property name rather than modifying the field (writing a NUL). + pBuffer.append(p, (int32_t)(v-p), errorCode); + p=pBuffer.data(); + ++v; + } + int32_t prop=pnames->getPropertyEnum(p); + if(prop<0) { + for(int32_t i=0;; ++i) { + if(i==UPRV_LENGTHOF(ppucdProperties)) { + // Ignore unknown property names. + return true; + } + if(0==uprv_stricmp(p, ppucdProperties[i].name)) { + prop=ppucdProperties[i].prop; + U_ASSERT(prop>=0); + break; + } + } + } + if(prop<UCHAR_BINARY_LIMIT) { + if(binaryValue>=0) { + props.binProps[prop]=(UBool)binaryValue; + } else { + // No binary value for a binary property. + fprintf(stderr, + "error in preparsed UCD: enum-property syntax '%s' " + "for binary property on line %ld\n", + field, (long)lineNumber); + errorCode=U_PARSE_ERROR; + } + } else if(binaryValue>=0) { + // Binary value for a non-binary property. 
+ fprintf(stderr, + "error in preparsed UCD: binary-property syntax '%s' " + "for non-binary property on line %ld\n", + field, (long)lineNumber); + errorCode=U_PARSE_ERROR; + } else if (prop < UCHAR_INT_START) { + fprintf(stderr, + "error in preparsed UCD: prop value is invalid: '%d' for line %ld\n", + prop, (long)lineNumber); + errorCode=U_PARSE_ERROR; + } else if(prop<UCHAR_INT_LIMIT) { + int32_t value=pnames->getPropertyValueEnum(prop, v); + if(value==UCHAR_INVALID_CODE && prop==UCHAR_CANONICAL_COMBINING_CLASS) { + // TODO: Make getPropertyValueEnum(UCHAR_CANONICAL_COMBINING_CLASS, v) work. + char *end; + unsigned long ccc=uprv_strtoul(v, &end, 10); + if(v<end && *end==0 && ccc<=254) { + value=(int32_t)ccc; + } + } + if(value==UCHAR_INVALID_CODE) { + fprintf(stderr, + "error in preparsed UCD: '%s' is not a valid value on line %ld\n", + field, (long)lineNumber); + errorCode=U_PARSE_ERROR; + } else { + props.intProps[prop-UCHAR_INT_START]=value; + } + } else if(*v=='<') { + // Do not parse default values like <code point>, just set null values. 
+ switch(prop) { + case UCHAR_BIDI_MIRRORING_GLYPH: + props.bmg=U_SENTINEL; + break; + case UCHAR_BIDI_PAIRED_BRACKET: + props.bpb=U_SENTINEL; + break; + case UCHAR_SIMPLE_CASE_FOLDING: + props.scf=U_SENTINEL; + break; + case UCHAR_SIMPLE_LOWERCASE_MAPPING: + props.slc=U_SENTINEL; + break; + case UCHAR_SIMPLE_TITLECASE_MAPPING: + props.stc=U_SENTINEL; + break; + case UCHAR_SIMPLE_UPPERCASE_MAPPING: + props.suc=U_SENTINEL; + break; + case UCHAR_CASE_FOLDING: + props.cf.remove(); + break; + case UCHAR_LOWERCASE_MAPPING: + props.lc.remove(); + break; + case UCHAR_TITLECASE_MAPPING: + props.tc.remove(); + break; + case UCHAR_UPPERCASE_MAPPING: + props.uc.remove(); + break; + case UCHAR_SCRIPT_EXTENSIONS: + props.scx.clear(); + break; + default: + fprintf(stderr, + "error in preparsed UCD: '%s' is not a valid default value on line %ld\n", + field, (long)lineNumber); + errorCode=U_PARSE_ERROR; + } + } else { + char c; + switch(prop) { + case UCHAR_NUMERIC_VALUE: + props.numericValue=v; + c=*v; + if('0'<=c && c<='9' && v[1]==0) { + props.digitValue=c-'0'; + } else { + props.digitValue=-1; + } + break; + case UCHAR_NAME: + props.name=v; + break; + case UCHAR_AGE: + u_versionFromString(props.age, v); // Writes 0.0.0.0 if v is not numeric. 
+ break; + case UCHAR_BIDI_MIRRORING_GLYPH: + props.bmg=parseCodePoint(v, errorCode); + break; + case UCHAR_BIDI_PAIRED_BRACKET: + props.bpb=parseCodePoint(v, errorCode); + break; + case UCHAR_SIMPLE_CASE_FOLDING: + props.scf=parseCodePoint(v, errorCode); + break; + case UCHAR_SIMPLE_LOWERCASE_MAPPING: + props.slc=parseCodePoint(v, errorCode); + break; + case UCHAR_SIMPLE_TITLECASE_MAPPING: + props.stc=parseCodePoint(v, errorCode); + break; + case UCHAR_SIMPLE_UPPERCASE_MAPPING: + props.suc=parseCodePoint(v, errorCode); + break; + case UCHAR_CASE_FOLDING: + parseString(v, props.cf, errorCode); + break; + case UCHAR_LOWERCASE_MAPPING: + parseString(v, props.lc, errorCode); + break; + case UCHAR_TITLECASE_MAPPING: + parseString(v, props.tc, errorCode); + break; + case UCHAR_UPPERCASE_MAPPING: + parseString(v, props.uc, errorCode); + break; + case PPUCD_NAME_ALIAS: + props.nameAlias=v; + break; + case PPUCD_CONDITIONAL_CASE_MAPPINGS: + case PPUCD_TURKIC_CASE_FOLDING: + // No need to parse their values: They are hardcoded in the runtime library. + break; + case UCHAR_SCRIPT_EXTENSIONS: + parseScriptExtensions(v, props.scx, errorCode); + break; + default: + // Ignore unhandled properties. + return true; + } + } + if(U_SUCCESS(errorCode)) { + newValues.add((UChar32)prop); + return true; + } else { + return false; + } +} + +UBool +PreparsedUCD::getRangeForAlgNames(UChar32 &start, UChar32 &end, UErrorCode &errorCode) { + if(U_FAILURE(errorCode)) { return false; } + if(lineType!=ALG_NAMES_RANGE_LINE) { + errorCode=U_ILLEGAL_ARGUMENT_ERROR; + return false; + } + firstField(); + const char *field=nextField(); + if(field==nullptr) { + // No range field after the type. 
+ fprintf(stderr, + "error in preparsed UCD: missing algnamesrange range field " + "(no second field) on line %ld\n", + (long)lineNumber); + errorCode=U_PARSE_ERROR; + return false; + } + return parseCodePointRange(field, start, end, errorCode); +} + +UChar32 +PreparsedUCD::parseCodePoint(const char *s, UErrorCode &errorCode) { + char *end; + uint32_t value=(uint32_t)uprv_strtoul(s, &end, 16); + if(end<=s || *end!=0 || value>=0x110000) { + fprintf(stderr, + "error in preparsed UCD: '%s' is not a valid code point on line %ld\n", + s, (long)lineNumber); + errorCode=U_PARSE_ERROR; + return U_SENTINEL; + } + return (UChar32)value; +} + +UBool +PreparsedUCD::parseCodePointRange(const char *s, UChar32 &start, UChar32 &end, UErrorCode &errorCode) { + uint32_t st, e; + u_parseCodePointRange(s, &st, &e, &errorCode); + if(U_FAILURE(errorCode)) { + fprintf(stderr, + "error in preparsed UCD: '%s' is not a valid code point range on line %ld\n", + s, (long)lineNumber); + return false; + } + start=(UChar32)st; + end=(UChar32)e; + return true; +} + +void +PreparsedUCD::parseString(const char *s, UnicodeString &uni, UErrorCode &errorCode) { + char16_t *buffer=toUCharPtr(uni.getBuffer(-1)); + int32_t length=u_parseString(s, buffer, uni.getCapacity(), nullptr, &errorCode); + if(errorCode==U_BUFFER_OVERFLOW_ERROR) { + errorCode=U_ZERO_ERROR; + uni.releaseBuffer(0); + buffer=toUCharPtr(uni.getBuffer(length)); + length=u_parseString(s, buffer, uni.getCapacity(), nullptr, &errorCode); + } + uni.releaseBuffer(length); + if(U_FAILURE(errorCode)) { + fprintf(stderr, + "error in preparsed UCD: '%s' is not a valid Unicode string on line %ld\n", + s, (long)lineNumber); + } +} + +void +PreparsedUCD::parseScriptExtensions(const char *s, UnicodeSet &scx, UErrorCode &errorCode) { + if(U_FAILURE(errorCode)) { return; } + scx.clear(); + CharString scString; + for(;;) { + const char *scs; + const char *scLimit=strchr(s, ' '); + if(scLimit!=nullptr) { + scs=scString.clear().append(s, 
(int32_t)(scLimit-s), errorCode).data(); + if(U_FAILURE(errorCode)) { return; } + } else { + scs=s; + } + int32_t script=pnames->getPropertyValueEnum(UCHAR_SCRIPT, scs); + if(script==UCHAR_INVALID_CODE) { + fprintf(stderr, + "error in preparsed UCD: '%s' is not a valid script code on line %ld\n", + scs, (long)lineNumber); + errorCode=U_PARSE_ERROR; + return; + } else if(scx.contains(script)) { + fprintf(stderr, + "error in preparsed UCD: scx has duplicate '%s' codes on line %ld\n", + scs, (long)lineNumber); + errorCode=U_PARSE_ERROR; + return; + } else { + scx.add(script); + } + if(scLimit!=nullptr) { + s=scLimit+1; + } else { + break; + } + } + if(scx.isEmpty()) { + fprintf(stderr, "error in preparsed UCD: empty scx= on line %ld\n", (long)lineNumber); + errorCode=U_PARSE_ERROR; + } +} + +U_NAMESPACE_END diff --git a/intl/icu/source/tools/toolutil/ppucd.h b/intl/icu/source/tools/toolutil/ppucd.h new file mode 100644 index 0000000000..d5c63fab49 --- /dev/null +++ b/intl/icu/source/tools/toolutil/ppucd.h @@ -0,0 +1,180 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* Copyright (C) 2011-2013, International Business Machines +* Corporation and others. All Rights Reserved. +******************************************************************************* +* file name: ppucd.h +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2011dec11 +* created by: Markus W. Scherer +*/ + +#ifndef __PPUCD_H__ +#define __PPUCD_H__ + +#include "unicode/utypes.h" +#include "unicode/uniset.h" +#include "unicode/unistr.h" + +#include <stdio.h> + +/** Additions to the uchar.h enum UProperty. 
*/ +enum { + /** Name_Alias */ + PPUCD_NAME_ALIAS=UCHAR_STRING_LIMIT, + PPUCD_CONDITIONAL_CASE_MAPPINGS, + PPUCD_TURKIC_CASE_FOLDING +}; + +U_NAMESPACE_BEGIN + +class U_TOOLUTIL_API PropertyNames { +public: + virtual ~PropertyNames(); + virtual int32_t getPropertyEnum(const char *name) const = 0; + virtual int32_t getPropertyValueEnum(int32_t property, const char *name) const = 0; +}; + +struct U_TOOLUTIL_API UniProps { + UniProps(); + ~UniProps(); + + int32_t getIntProp(int32_t prop) const { return intProps[prop-UCHAR_INT_START]; } + + UChar32 start, end; + UBool binProps[UCHAR_BINARY_LIMIT]; + int32_t intProps[UCHAR_INT_LIMIT-UCHAR_INT_START]; + UVersionInfo age; + UChar32 bmg, bpb; + UChar32 scf, slc, stc, suc; + int32_t digitValue; + const char *numericValue; + const char *name; + const char *nameAlias; + UnicodeString cf, lc, tc, uc; + UnicodeSet scx; +}; + +class U_TOOLUTIL_API PreparsedUCD { +public: + enum LineType { + /** No line, end of file. */ + NO_LINE, + /** Empty line. (Might contain a comment.) */ + EMPTY_LINE, + + /** ucd;6.1.0 */ + UNICODE_VERSION_LINE, + + /** property;Binary;Alpha;Alphabetic */ + PROPERTY_LINE, + /** binary;N;No;F;False */ + BINARY_LINE, + /** value;gc;Zs;Space_Separator */ + VALUE_LINE, + + /** defaults;0000..10FFFF;age=NA;bc=L;... */ + DEFAULTS_LINE, + /** block;0000..007F;age=1.1;blk=ASCII;ea=Na;... */ + BLOCK_LINE, + /** cp;0030;AHex;bc=EN;gc=Nd;na=DIGIT ZERO;... */ + CP_LINE, + /** unassigned;E01F0..E0FFF;bc=BN;CWKCF;DI;GCB=CN;NFKC_CF= */ + UNASSIGNED_LINE, + + /** algnamesrange;4E00..9FCC;han;CJK UNIFIED IDEOGRAPH- */ + ALG_NAMES_RANGE_LINE, + + LINE_TYPE_COUNT + }; + + /** + * Constructor. + * Opens the preparsed UCD file for reading; a nullptr, empty, or "-" filename selects stdin. + * Sets errorCode to U_FILE_ACCESS_ERROR if the file cannot be opened. + */ + PreparsedUCD(const char *filename, UErrorCode &errorCode); + + /** Destructor. */ + ~PreparsedUCD(); + + /** Sets (aliases) a PropertyNames implementation. Caller retains ownership. 
*/ + void setPropertyNames(const PropertyNames *pn) { pnames=pn; } + + /** + * Reads a line from the preparsed UCD file. + * Splits the line by replacing each ';' with a NUL. + */ + LineType readLine(UErrorCode &errorCode); + + /** Returns the number of the line read by readLine(). */ + int32_t getLineNumber() const { return lineNumber; } + + /** Returns the line's next field, or nullptr. */ + const char *nextField(); + + /** Returns the Unicode version when or after the UNICODE_VERSION_LINE has been read. */ + const UVersionInfo &getUnicodeVersion() const { return ucdVersion; } + + /** Returns true if the current line has property values. */ + UBool lineHasPropertyValues() const { + return DEFAULTS_LINE<=lineType && lineType<=UNASSIGNED_LINE; + } + + /** + * Parses properties from the current line. + * Clears newValues and sets UProperty codes for property values mentioned + * on the current line (as opposed to being inherited). + * Returns a pointer to the filled-in UniProps, or nullptr if something went wrong. + * The returned UniProps are usable until the next line of the same type is read. + */ + const UniProps *getProps(UnicodeSet &newValues, UErrorCode &errorCode); + + /** + * Returns the code point range for the current algnamesrange line. + * Calls & parses nextField(). + * Further nextField() calls will yield the range's type & prefix string. + * Returns U_SUCCESS(errorCode). + */ + UBool getRangeForAlgNames(UChar32 &start, UChar32 &end, UErrorCode &errorCode); + +private: + UBool isLineBufferAvailable(int32_t i) { + return defaultLineIndex!=i && blockLineIndex!=i; + } + + /** Resets the field iterator and returns the line's first field (the line type field). 
*/ + const char *firstField(); + + UBool parseProperty(UniProps &props, const char *field, UnicodeSet &newValues, + UErrorCode &errorCode); + UChar32 parseCodePoint(const char *s, UErrorCode &errorCode); + UBool parseCodePointRange(const char *s, UChar32 &start, UChar32 &end, UErrorCode &errorCode); + void parseString(const char *s, UnicodeString &uni, UErrorCode &errorCode); + void parseScriptExtensions(const char *s, UnicodeSet &scx, UErrorCode &errorCode); + + static const int32_t kNumLineBuffers=3; + + const PropertyNames *pnames; // aliased + FILE *file; + int32_t defaultLineIndex, blockLineIndex, lineIndex; + int32_t lineNumber; + LineType lineType; + char *fieldLimit; + char *lineLimit; + + UVersionInfo ucdVersion; + UniProps defaultProps, blockProps, cpProps; + UnicodeSet blockValues; + // Multiple lines so that default and block properties can maintain pointers + // into their line buffers. + char lines[kNumLineBuffers][4096]; +}; + +U_NAMESPACE_END + +#endif // __PPUCD_H__ diff --git a/intl/icu/source/tools/toolutil/sources.txt b/intl/icu/source/tools/toolutil/sources.txt new file mode 100644 index 0000000000..d3288997e2 --- /dev/null +++ b/intl/icu/source/tools/toolutil/sources.txt @@ -0,0 +1,24 @@ +collationinfo.cpp +dbgutil.cpp +denseranges.cpp +filestrm.cpp +filetools.cpp +flagparser.cpp +package.cpp +pkg_genc.cpp +pkg_gencmn.cpp +pkg_icu.cpp +pkgitems.cpp +ppucd.cpp +swapimpl.cpp +toolutil.cpp +ucbuf.cpp +ucln_tu.cpp +ucm.cpp +ucmstate.cpp +udbgutil.cpp +unewdata.cpp +uoptions.cpp +uparse.cpp +writesrc.cpp +xmlparser.cpp diff --git a/intl/icu/source/tools/toolutil/swapimpl.cpp b/intl/icu/source/tools/toolutil/swapimpl.cpp new file mode 100644 index 0000000000..9c58563965 --- /dev/null +++ b/intl/icu/source/tools/toolutil/swapimpl.cpp @@ -0,0 +1,1048 @@ +// © 2016 and later: Unicode, Inc. and others. 
+// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2005-2014, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: swapimpl.cpp +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2005may05 +* created by: Markus W. Scherer +* +* Data file swapping functions moved here from the common library +* because some data is hardcoded in ICU4C and needs not be swapped any more. +* Moving the functions here simplifies testing (for code coverage) because +* we need not jump through hoops (like adding snapshots of these files +* to testdata). +* +* The declarations for these functions remain in the internal header files +* in icu/source/common/ +*/ + +#include "unicode/utypes.h" +#include "unicode/putil.h" +#include "unicode/udata.h" + +/* Explicit include statement for std_string.h is needed + * for compilation on certain platforms. (e.g. 
AIX/VACPP) + */ +#include "unicode/std_string.h" + +#include "cmemory.h" +#include "cstring.h" +#include "uinvchar.h" +#include "uassert.h" +#include "uarrsort.h" +#include "ucmndata.h" +#include "udataswp.h" +#include "ulayout_props.h" + +/* swapping implementations in common */ + +#include "emojiprops.h" +#include "uresdata.h" +#include "ucnv_io.h" +#include "uprops.h" +#include "ucase.h" +#include "ubidi_props.h" +#include "ucol_swp.h" +#include "ucnv_bld.h" +#include "unormimp.h" +#include "normalizer2impl.h" +#include "sprpimpl.h" +#include "propname.h" +#include "rbbidata.h" +#include "utrie.h" +#include "utrie2.h" +#include "dictionarydata.h" + +/* swapping implementations in i18n */ + +#if !UCONFIG_NO_NORMALIZATION +#include "uspoof_impl.h" +#endif + +U_NAMESPACE_USE + +/* definitions */ + +/* Unicode property (value) aliases data swapping --------------------------- */ + +static int32_t U_CALLCONV +upname_swap(const UDataSwapper *ds, + const void *inData, int32_t length, void *outData, + UErrorCode *pErrorCode) { + /* udata_swapDataHeader checks the arguments */ + int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode); + if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) { + return 0; + } + + /* check data format and format version */ + const UDataInfo *pInfo= + reinterpret_cast<const UDataInfo *>( + static_cast<const char *>(inData)+4); + if(!( + pInfo->dataFormat[0]==0x70 && /* dataFormat="pnam" */ + pInfo->dataFormat[1]==0x6e && + pInfo->dataFormat[2]==0x61 && + pInfo->dataFormat[3]==0x6d && + pInfo->formatVersion[0]==2 + )) { + udata_printError(ds, "upname_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as pnames.icu\n", + pInfo->dataFormat[0], pInfo->dataFormat[1], + pInfo->dataFormat[2], pInfo->dataFormat[3], + pInfo->formatVersion[0]); + *pErrorCode=U_UNSUPPORTED_ERROR; + return 0; + } + + const uint8_t *inBytes=static_cast<const uint8_t *>(inData)+headerSize; + uint8_t 
*outBytes=static_cast<uint8_t *>(outData)+headerSize; + + if(length>=0) { + length-=headerSize; + // formatVersion 2 initially has indexes[8], 32 bytes. + if(length<32) { + udata_printError(ds, "upname_swap(): too few bytes (%d after header) for pnames.icu\n", + (int)length); + *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; + return 0; + } + } + + const int32_t *inIndexes=reinterpret_cast<const int32_t *>(inBytes); + int32_t totalSize=udata_readInt32(ds, inIndexes[PropNameData::IX_TOTAL_SIZE]); + if(length>=0) { + if(length<totalSize) { + udata_printError(ds, "upname_swap(): too few bytes (%d after header, should be %d) " + "for pnames.icu\n", + (int)length, (int)totalSize); + *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; + return 0; + } + + int32_t numBytesIndexesAndValueMaps= + udata_readInt32(ds, inIndexes[PropNameData::IX_BYTE_TRIES_OFFSET]); + + // Swap the indexes[] and the valueMaps[]. + ds->swapArray32(ds, inBytes, numBytesIndexesAndValueMaps, outBytes, pErrorCode); + + // Copy the rest of the data. + if(inBytes!=outBytes) { + uprv_memcpy(outBytes+numBytesIndexesAndValueMaps, + inBytes+numBytesIndexesAndValueMaps, + totalSize-numBytesIndexesAndValueMaps); + } + + // We need not swap anything else: + // + // The ByteTries are already byte-serialized, and are fixed on ASCII. + // (On an EBCDIC machine, the input string is converted to lowercase ASCII + // while matching.) + // + // The name groups are mostly invariant characters, but since we only + // generate, and keep in subversion, ASCII versions of pnames.icu, + // and since only ICU4J uses the pnames.icu data file + // (the data is hardcoded in ICU4C) and ICU4J uses ASCII data files, + // we just copy those bytes too. 
+ } + + return headerSize+totalSize; +} + +/* Unicode properties data swapping ----------------------------------------- */ + +static int32_t U_CALLCONV +uprops_swap(const UDataSwapper *ds, + const void *inData, int32_t length, void *outData, + UErrorCode *pErrorCode) { + const UDataInfo *pInfo; + int32_t headerSize, i; + + int32_t dataIndexes[UPROPS_INDEX_COUNT]; + const int32_t *inData32; + + /* udata_swapDataHeader checks the arguments */ + headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode); + if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) { + return 0; + } + + /* check data format and format version */ + pInfo=(const UDataInfo *)((const char *)inData+4); + if(!( + pInfo->dataFormat[0]==0x55 && /* dataFormat="UPro" */ + pInfo->dataFormat[1]==0x50 && + pInfo->dataFormat[2]==0x72 && + pInfo->dataFormat[3]==0x6f && + (3<=pInfo->formatVersion[0] && pInfo->formatVersion[0]<=7) && + (pInfo->formatVersion[0]>=7 || + (pInfo->formatVersion[2]==UTRIE_SHIFT && + pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT)) + )) { + udata_printError(ds, "uprops_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not a Unicode properties file\n", + pInfo->dataFormat[0], pInfo->dataFormat[1], + pInfo->dataFormat[2], pInfo->dataFormat[3], + pInfo->formatVersion[0]); + *pErrorCode=U_UNSUPPORTED_ERROR; + return 0; + } + + /* the properties file must contain at least the indexes array */ + if(length>=0 && (length-headerSize)<(int32_t)sizeof(dataIndexes)) { + udata_printError(ds, "uprops_swap(): too few bytes (%d after header) for a Unicode properties file\n", + length-headerSize); + *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; + return 0; + } + + /* read the indexes */ + inData32=(const int32_t *)((const char *)inData+headerSize); + for(i=0; i<UPROPS_INDEX_COUNT; ++i) { + dataIndexes[i]=udata_readInt32(ds, inData32[i]); + } + + /* + * comments are copied from the data format description in genprops/store.c + * indexes[] constants are in uprops.h + */ + int32_t 
dataTop; + if(length>=0) { + int32_t *outData32; + + /* + * In formatVersion 7, UPROPS_DATA_TOP_INDEX has the post-header data size. + * In earlier formatVersions, it is 0 and a lower dataIndexes entry + * has the top of the last item. + */ + for(i=UPROPS_DATA_TOP_INDEX; i>0 && (dataTop=dataIndexes[i])==0; --i) {} + + if((length-headerSize)<(4*dataTop)) { + udata_printError(ds, "uprops_swap(): too few bytes (%d after header) for a Unicode properties file\n", + length-headerSize); + *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; + return 0; + } + + outData32=(int32_t *)((char *)outData+headerSize); + + /* copy everything for inaccessible data (padding) */ + if(inData32!=outData32) { + uprv_memcpy(outData32, inData32, 4*(size_t)dataTop); + } + + /* swap the indexes[16] */ + ds->swapArray32(ds, inData32, 4*UPROPS_INDEX_COUNT, outData32, pErrorCode); + + /* + * swap the main properties UTrie + * PT serialized properties trie, see utrie.h (byte size: 4*(i0-16)) + */ + utrie_swapAnyVersion(ds, + inData32+UPROPS_INDEX_COUNT, + 4*(dataIndexes[UPROPS_PROPS32_INDEX]-UPROPS_INDEX_COUNT), + outData32+UPROPS_INDEX_COUNT, + pErrorCode); + + /* + * swap the properties and exceptions words + * P const uint32_t props32[i1-i0]; + * E const uint32_t exceptions[i2-i1]; + */ + ds->swapArray32(ds, + inData32+dataIndexes[UPROPS_PROPS32_INDEX], + 4*(dataIndexes[UPROPS_EXCEPTIONS_TOP_INDEX]-dataIndexes[UPROPS_PROPS32_INDEX]), + outData32+dataIndexes[UPROPS_PROPS32_INDEX], + pErrorCode); + + /* + * swap the UChars + * U const char16_t uchars[2*(i3-i2)]; + */ + ds->swapArray16(ds, + inData32+dataIndexes[UPROPS_EXCEPTIONS_TOP_INDEX], + 4*(dataIndexes[UPROPS_ADDITIONAL_TRIE_INDEX]-dataIndexes[UPROPS_EXCEPTIONS_TOP_INDEX]), + outData32+dataIndexes[UPROPS_EXCEPTIONS_TOP_INDEX], + pErrorCode); + + /* + * swap the additional UTrie + * i3 additionalTrieIndex; -- 32-bit unit index to the additional trie for more properties + */ + utrie_swapAnyVersion(ds, + inData32+dataIndexes[UPROPS_ADDITIONAL_TRIE_INDEX], 
+ 4*(dataIndexes[UPROPS_ADDITIONAL_VECTORS_INDEX]-dataIndexes[UPROPS_ADDITIONAL_TRIE_INDEX]), + outData32+dataIndexes[UPROPS_ADDITIONAL_TRIE_INDEX], + pErrorCode); + + /* + * swap the properties vectors + * PV const uint32_t propsVectors[(i6-i4)/i5][i5]==uint32_t propsVectors[i6-i4]; + */ + ds->swapArray32(ds, + inData32+dataIndexes[UPROPS_ADDITIONAL_VECTORS_INDEX], + 4*(dataIndexes[UPROPS_SCRIPT_EXTENSIONS_INDEX]-dataIndexes[UPROPS_ADDITIONAL_VECTORS_INDEX]), + outData32+dataIndexes[UPROPS_ADDITIONAL_VECTORS_INDEX], + pErrorCode); + + // swap the Script_Extensions data + // SCX const uint16_t scriptExtensions[2*(i7-i6)]; + ds->swapArray16(ds, + inData32+dataIndexes[UPROPS_SCRIPT_EXTENSIONS_INDEX], + 4*(dataIndexes[UPROPS_RESERVED_INDEX_7]-dataIndexes[UPROPS_SCRIPT_EXTENSIONS_INDEX]), + outData32+dataIndexes[UPROPS_SCRIPT_EXTENSIONS_INDEX], + pErrorCode); + } + + /* i7 reservedIndex7; -- 32-bit unit index to the top of the Script_Extensions data */ + return headerSize+4*dataIndexes[UPROPS_RESERVED_INDEX_7]; +} + +/* Unicode case mapping data swapping --------------------------------------- */ + +static int32_t U_CALLCONV +ucase_swap(const UDataSwapper *ds, + const void *inData, int32_t length, void *outData, + UErrorCode *pErrorCode) { + const UDataInfo *pInfo; + int32_t headerSize; + + const uint8_t *inBytes; + uint8_t *outBytes; + + const int32_t *inIndexes; + int32_t indexes[16]; + + int32_t i, offset, count, size; + + /* udata_swapDataHeader checks the arguments */ + headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode); + if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) { + return 0; + } + + /* check data format and format version */ + pInfo=(const UDataInfo *)((const char *)inData+4); + if(!( + pInfo->dataFormat[0]==UCASE_FMT_0 && /* dataFormat="cAsE" */ + pInfo->dataFormat[1]==UCASE_FMT_1 && + pInfo->dataFormat[2]==UCASE_FMT_2 && + pInfo->dataFormat[3]==UCASE_FMT_3 && + ((pInfo->formatVersion[0]==1 && + 
pInfo->formatVersion[2]==UTRIE_SHIFT && + pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT) || + (2<=pInfo->formatVersion[0] && pInfo->formatVersion[0]<=4)) + )) { + udata_printError(ds, "ucase_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as case mapping data\n", + pInfo->dataFormat[0], pInfo->dataFormat[1], + pInfo->dataFormat[2], pInfo->dataFormat[3], + pInfo->formatVersion[0]); + *pErrorCode=U_UNSUPPORTED_ERROR; + return 0; + } + + inBytes=(const uint8_t *)inData+headerSize; + outBytes=(uint8_t *)outData+headerSize; + + inIndexes=(const int32_t *)inBytes; + + if(length>=0) { + length-=headerSize; + if(length<16*4) { + udata_printError(ds, "ucase_swap(): too few bytes (%d after header) for case mapping data\n", + length); + *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; + return 0; + } + } + + /* read the first 16 indexes (ICU 3.2/format version 1: UCASE_IX_TOP==16, might grow) */ + for(i=0; i<16; ++i) { + indexes[i]=udata_readInt32(ds, inIndexes[i]); + } + + /* get the total length of the data */ + size=indexes[UCASE_IX_LENGTH]; + + if(length>=0) { + if(length<size) { + udata_printError(ds, "ucase_swap(): too few bytes (%d after header) for all of case mapping data\n", + length); + *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; + return 0; + } + + /* copy the data for inaccessible bytes */ + if(inBytes!=outBytes) { + uprv_memcpy(outBytes, inBytes, size); + } + + offset=0; + + /* swap the int32_t indexes[] */ + count=indexes[UCASE_IX_INDEX_TOP]*4; + ds->swapArray32(ds, inBytes, count, outBytes, pErrorCode); + offset+=count; + + /* swap the UTrie */ + count=indexes[UCASE_IX_TRIE_SIZE]; + utrie_swapAnyVersion(ds, inBytes+offset, count, outBytes+offset, pErrorCode); + offset+=count; + + /* swap the uint16_t exceptions[] and unfold[] */ + count=(indexes[UCASE_IX_EXC_LENGTH]+indexes[UCASE_IX_UNFOLD_LENGTH])*2; + ds->swapArray16(ds, inBytes+offset, count, outBytes+offset, pErrorCode); + offset+=count; + + U_ASSERT(offset==size); + } + + return 
headerSize+size; +} + +/* Unicode bidi/shaping data swapping --------------------------------------- */ + +static int32_t U_CALLCONV +ubidi_swap(const UDataSwapper *ds, + const void *inData, int32_t length, void *outData, + UErrorCode *pErrorCode) { + const UDataInfo *pInfo; + int32_t headerSize; + + const uint8_t *inBytes; + uint8_t *outBytes; + + const int32_t *inIndexes; + int32_t indexes[16]; + + int32_t i, offset, count, size; + + /* udata_swapDataHeader checks the arguments */ + headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode); + if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) { + return 0; + } + + /* check data format and format version */ + pInfo=(const UDataInfo *)((const char *)inData+4); + if(!( + pInfo->dataFormat[0]==UBIDI_FMT_0 && /* dataFormat="BiDi" */ + pInfo->dataFormat[1]==UBIDI_FMT_1 && + pInfo->dataFormat[2]==UBIDI_FMT_2 && + pInfo->dataFormat[3]==UBIDI_FMT_3 && + ((pInfo->formatVersion[0]==1 && + pInfo->formatVersion[2]==UTRIE_SHIFT && + pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT) || + pInfo->formatVersion[0]==2) + )) { + udata_printError(ds, "ubidi_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as bidi/shaping data\n", + pInfo->dataFormat[0], pInfo->dataFormat[1], + pInfo->dataFormat[2], pInfo->dataFormat[3], + pInfo->formatVersion[0]); + *pErrorCode=U_UNSUPPORTED_ERROR; + return 0; + } + + inBytes=(const uint8_t *)inData+headerSize; + outBytes=(uint8_t *)outData+headerSize; + + inIndexes=(const int32_t *)inBytes; + + if(length>=0) { + length-=headerSize; + if(length<16*4) { + udata_printError(ds, "ubidi_swap(): too few bytes (%d after header) for bidi/shaping data\n", + length); + *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; + return 0; + } + } + + /* read the first 16 indexes (ICU 3.4/format version 1: UBIDI_IX_TOP==16, might grow) */ + for(i=0; i<16; ++i) { + indexes[i]=udata_readInt32(ds, inIndexes[i]); + } + + /* get the total length of the data */ + size=indexes[UBIDI_IX_LENGTH]; + + 
if(length>=0) { + if(length<size) { + udata_printError(ds, "ubidi_swap(): too few bytes (%d after header) for all of bidi/shaping data\n", + length); + *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; + return 0; + } + + /* copy the data for inaccessible bytes */ + if(inBytes!=outBytes) { + uprv_memcpy(outBytes, inBytes, size); + } + + offset=0; + + /* swap the int32_t indexes[] */ + count=indexes[UBIDI_IX_INDEX_TOP]*4; + ds->swapArray32(ds, inBytes, count, outBytes, pErrorCode); + offset+=count; + + /* swap the UTrie */ + count=indexes[UBIDI_IX_TRIE_SIZE]; + utrie_swapAnyVersion(ds, inBytes+offset, count, outBytes+offset, pErrorCode); + offset+=count; + + /* swap the uint32_t mirrors[] */ + count=indexes[UBIDI_IX_MIRROR_LENGTH]*4; + ds->swapArray32(ds, inBytes+offset, count, outBytes+offset, pErrorCode); + offset+=count; + + /* just skip the uint8_t jgArray[] and jgArray2[] */ + count=indexes[UBIDI_IX_JG_LIMIT]-indexes[UBIDI_IX_JG_START]; + offset+=count; + count=indexes[UBIDI_IX_JG_LIMIT2]-indexes[UBIDI_IX_JG_START2]; + offset+=count; + + U_ASSERT(offset==size); + } + + return headerSize+size; +} + +/* Unicode normalization data swapping -------------------------------------- */ + +#if !UCONFIG_NO_NORMALIZATION + +static int32_t U_CALLCONV +unorm_swap(const UDataSwapper *ds, + const void *inData, int32_t length, void *outData, + UErrorCode *pErrorCode) { + const UDataInfo *pInfo; + int32_t headerSize; + + const uint8_t *inBytes; + uint8_t *outBytes; + + const int32_t *inIndexes; + int32_t indexes[32]; + + int32_t i, offset, count, size; + + /* udata_swapDataHeader checks the arguments */ + headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode); + if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) { + return 0; + } + + /* check data format and format version */ + pInfo=(const UDataInfo *)((const char *)inData+4); + if(!( + pInfo->dataFormat[0]==0x4e && /* dataFormat="Norm" */ + pInfo->dataFormat[1]==0x6f && + pInfo->dataFormat[2]==0x72 && + 
pInfo->dataFormat[3]==0x6d && + pInfo->formatVersion[0]==2 + )) { + udata_printError(ds, "unorm_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as unorm.icu\n", + pInfo->dataFormat[0], pInfo->dataFormat[1], + pInfo->dataFormat[2], pInfo->dataFormat[3], + pInfo->formatVersion[0]); + *pErrorCode=U_UNSUPPORTED_ERROR; + return 0; + } + + inBytes=(const uint8_t *)inData+headerSize; + outBytes=(uint8_t *)outData+headerSize; + + inIndexes=(const int32_t *)inBytes; + + if(length>=0) { + length-=headerSize; + if(length<32*4) { + udata_printError(ds, "unorm_swap(): too few bytes (%d after header) for unorm.icu\n", + length); + *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; + return 0; + } + } + + /* read the first 32 indexes (ICU 2.8/format version 2.2: _NORM_INDEX_TOP==32, might grow) */ + for(i=0; i<32; ++i) { + indexes[i]=udata_readInt32(ds, inIndexes[i]); + } + + /* calculate the total length of the data */ + size= + 32*4+ /* size of indexes[] */ + indexes[_NORM_INDEX_TRIE_SIZE]+ + indexes[_NORM_INDEX_UCHAR_COUNT]*2+ + indexes[_NORM_INDEX_COMBINE_DATA_COUNT]*2+ + indexes[_NORM_INDEX_FCD_TRIE_SIZE]+ + indexes[_NORM_INDEX_AUX_TRIE_SIZE]+ + indexes[_NORM_INDEX_CANON_SET_COUNT]*2; + + if(length>=0) { + if(length<size) { + udata_printError(ds, "unorm_swap(): too few bytes (%d after header) for all of unorm.icu\n", + length); + *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; + return 0; + } + + /* copy the data for inaccessible bytes */ + if(inBytes!=outBytes) { + uprv_memcpy(outBytes, inBytes, size); + } + + offset=0; + + /* swap the indexes[] */ + count=32*4; + ds->swapArray32(ds, inBytes, count, outBytes, pErrorCode); + offset+=count; + + /* swap the main UTrie */ + count=indexes[_NORM_INDEX_TRIE_SIZE]; + utrie_swap(ds, inBytes+offset, count, outBytes+offset, pErrorCode); + offset+=count; + + /* swap the uint16_t extraData[] and the uint16_t combiningTable[] */ + count=(indexes[_NORM_INDEX_UCHAR_COUNT]+indexes[_NORM_INDEX_COMBINE_DATA_COUNT])*2; + 
ds->swapArray16(ds, inBytes+offset, count, outBytes+offset, pErrorCode); + offset+=count; + + /* swap the FCD UTrie */ + count=indexes[_NORM_INDEX_FCD_TRIE_SIZE]; + if(count!=0) { + utrie_swap(ds, inBytes+offset, count, outBytes+offset, pErrorCode); + offset+=count; + } + + /* swap the aux UTrie */ + count=indexes[_NORM_INDEX_AUX_TRIE_SIZE]; + if(count!=0) { + utrie_swap(ds, inBytes+offset, count, outBytes+offset, pErrorCode); + offset+=count; + } + + /* swap the uint16_t combiningTable[] */ + count=indexes[_NORM_INDEX_CANON_SET_COUNT]*2; + ds->swapArray16(ds, inBytes+offset, count, outBytes+offset, pErrorCode); + offset+=count; + } + + return headerSize+size; +} + +#endif + +// Unicode text layout properties data swapping -------------------------------- + +static int32_t U_CALLCONV +ulayout_swap(const UDataSwapper *ds, + const void *inData, int32_t length, void *outData, + UErrorCode *pErrorCode) { + // udata_swapDataHeader checks the arguments. + int32_t headerSize = udata_swapDataHeader(ds, inData, length, outData, pErrorCode); + if (pErrorCode == nullptr || U_FAILURE(*pErrorCode)) { + return 0; + } + + // Check data format and format version. 
+ const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData + 4); + if (!( + pInfo->dataFormat[0] == ULAYOUT_FMT_0 && // dataFormat="Layo" + pInfo->dataFormat[1] == ULAYOUT_FMT_1 && + pInfo->dataFormat[2] == ULAYOUT_FMT_2 && + pInfo->dataFormat[3] == ULAYOUT_FMT_3 && + pInfo->formatVersion[0] == 1)) { + udata_printError(ds, + "ulayout_swap(): data format %02x.%02x.%02x.%02x (format version %02x) " + "is not recognized as text layout properties data\n", + pInfo->dataFormat[0], pInfo->dataFormat[1], + pInfo->dataFormat[2], pInfo->dataFormat[3], + pInfo->formatVersion[0]); + *pErrorCode = U_UNSUPPORTED_ERROR; + return 0; + } + + const uint8_t *inBytes = (const uint8_t *)inData + headerSize; + uint8_t *outBytes = (uint8_t *)outData + headerSize; + + const int32_t *inIndexes = (const int32_t *)inBytes; + + if (length >= 0) { + length -= headerSize; + if (length < 12 * 4) { + udata_printError(ds, + "ulayout_swap(): too few bytes (%d after header) for text layout properties data\n", + length); + *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR; + return 0; + } + } + + int32_t indexesLength = udata_readInt32(ds, inIndexes[ULAYOUT_IX_INDEXES_LENGTH]); + if (indexesLength < 12) { + udata_printError(ds, + "ulayout_swap(): too few indexes (%d) for text layout properties data\n", + indexesLength); + *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR; + return 0; + } + + // Read the data offsets before swapping anything. + int32_t indexes[ULAYOUT_IX_TRIES_TOP + 1]; + for (int32_t i = ULAYOUT_IX_INPC_TRIE_TOP; i <= ULAYOUT_IX_TRIES_TOP; ++i) { + indexes[i] = udata_readInt32(ds, inIndexes[i]); + } + int32_t size = indexes[ULAYOUT_IX_TRIES_TOP]; + + if (length >= 0) { + if (length < size) { + udata_printError(ds, + "ulayout_swap(): too few bytes (%d after header) " + "for all of text layout properties data\n", + length); + *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR; + return 0; + } + + // Copy the data for inaccessible bytes. 
+ if (inBytes != outBytes) { + uprv_memcpy(outBytes, inBytes, size); + } + + // Swap the int32_t indexes[]. + int32_t offset = 0; + int32_t count = indexesLength * 4; + ds->swapArray32(ds, inBytes, count, outBytes, pErrorCode); + offset += count; + + // Swap each trie. + for (int32_t i = ULAYOUT_IX_INPC_TRIE_TOP; i <= ULAYOUT_IX_TRIES_TOP; ++i) { + int32_t top = indexes[i]; + count = top - offset; + U_ASSERT(count >= 0); + if (count >= 16) { + utrie_swapAnyVersion(ds, inBytes + offset, count, outBytes + offset, pErrorCode); + } + offset = top; + } + + U_ASSERT(offset == size); + } + + return headerSize + size; +} + +// Unicode emoji properties data swapping -------------------------------------- + +static int32_t U_CALLCONV +uemoji_swap(const UDataSwapper *ds, + const void *inData, int32_t length, void *outData, + UErrorCode *pErrorCode) { + // udata_swapDataHeader checks the arguments. + int32_t headerSize = udata_swapDataHeader(ds, inData, length, outData, pErrorCode); + if (pErrorCode == nullptr || U_FAILURE(*pErrorCode)) { + return 0; + } + + // Check data format and format version. + const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData + 4); + if (!( + pInfo->dataFormat[0] == u'E' && + pInfo->dataFormat[1] == u'm' && + pInfo->dataFormat[2] == u'o' && + pInfo->dataFormat[3] == u'j' && + pInfo->formatVersion[0] == 1)) { + udata_printError(ds, + "uemoji_swap(): data format %02x.%02x.%02x.%02x (format version %02x) " + "is not recognized as emoji properties data\n", + pInfo->dataFormat[0], pInfo->dataFormat[1], + pInfo->dataFormat[2], pInfo->dataFormat[3], + pInfo->formatVersion[0]); + *pErrorCode = U_UNSUPPORTED_ERROR; + return 0; + } + + const uint8_t *inBytes = (const uint8_t *)inData + headerSize; + uint8_t *outBytes = (uint8_t *)outData + headerSize; + + const int32_t *inIndexes = (const int32_t *)inBytes; + + if (length >= 0) { + length -= headerSize; + // We expect to read at least EmojiProps::IX_TOTAL_SIZE. 
+ if (length < 14 * 4) { + udata_printError(ds, + "uemoji_swap(): too few bytes (%d after header) for emoji properties data\n", + length); + *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR; + return 0; + } + } + + // First offset after indexes[]. + int32_t cpTrieOffset = udata_readInt32(ds, inIndexes[EmojiProps::IX_CPTRIE_OFFSET]); + int32_t indexesLength = cpTrieOffset / 4; + if (indexesLength < 14) { + udata_printError(ds, + "uemoji_swap(): too few indexes (%d) for emoji properties data\n", + indexesLength); + *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR; + return 0; + } + + // Read the data offsets before swapping anything. + int32_t indexes[EmojiProps::IX_TOTAL_SIZE + 1]; + indexes[0] = cpTrieOffset; + for (int32_t i = 1; i <= EmojiProps::IX_TOTAL_SIZE; ++i) { + indexes[i] = udata_readInt32(ds, inIndexes[i]); + } + int32_t size = indexes[EmojiProps::IX_TOTAL_SIZE]; + + if (length >= 0) { + if (length < size) { + udata_printError(ds, + "uemoji_swap(): too few bytes (%d after header) " + "for all of emoji properties data\n", + length); + *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR; + return 0; + } + + // Copy the data for inaccessible bytes. + if (inBytes != outBytes) { + uprv_memcpy(outBytes, inBytes, size); + } + + // Swap the int32_t indexes[]. + int32_t offset = 0; + int32_t top = cpTrieOffset; + ds->swapArray32(ds, inBytes, top - offset, outBytes, pErrorCode); + offset = top; + + // Swap the code point trie. + top = indexes[EmojiProps::IX_CPTRIE_OFFSET + 1]; + int32_t count = top - offset; + U_ASSERT(count >= 0); + if (count >= 16) { + utrie_swapAnyVersion(ds, inBytes + offset, count, outBytes + offset, pErrorCode); + } + offset = top; + + // Swap all of the string tries. + // They are all serialized as arrays of 16-bit units. 
+ offset = indexes[EmojiProps::IX_BASIC_EMOJI_TRIE_OFFSET]; + top = indexes[EmojiProps::IX_RGI_EMOJI_ZWJ_SEQUENCE_TRIE_OFFSET + 1]; + ds->swapArray16(ds, inBytes + offset, top - offset, outBytes + offset, pErrorCode); + offset = top; + + U_ASSERT(offset == size); + } + + return headerSize + size; +} + +/* Swap 'Test' data from gentest */ +static int32_t U_CALLCONV +test_swap(const UDataSwapper *ds, + const void *inData, int32_t length, void *outData, + UErrorCode *pErrorCode) { + const UDataInfo *pInfo; + int32_t headerSize; + + const uint8_t *inBytes; + uint8_t *outBytes; + + int32_t offset; + + /* udata_swapDataHeader checks the arguments */ + headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode); + if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) { + udata_printError(ds, "test_swap(): data header swap failed %s\n", pErrorCode != nullptr ? u_errorName(*pErrorCode) : "pErrorCode is nullptr"); + return 0; + } + + /* check data format and format version */ + pInfo=(const UDataInfo *)((const char *)inData+4); + if(!( + pInfo->dataFormat[0]==0x54 && /* dataFormat="Norm" */ + pInfo->dataFormat[1]==0x65 && + pInfo->dataFormat[2]==0x73 && + pInfo->dataFormat[3]==0x74 && + pInfo->formatVersion[0]==1 + )) { + udata_printError(ds, "test_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as testdata\n", + pInfo->dataFormat[0], pInfo->dataFormat[1], + pInfo->dataFormat[2], pInfo->dataFormat[3], + pInfo->formatVersion[0]); + *pErrorCode=U_UNSUPPORTED_ERROR; + return 0; + } + + inBytes=(const uint8_t *)inData+headerSize; + outBytes=(uint8_t *)outData+headerSize; + + int32_t size16 = 2; // 16bit plus padding + int32_t sizeStr = 5; // 4 char inv-str plus null + int32_t size = size16 + sizeStr; + + if(length>=0) { + if(length<size) { + udata_printError(ds, "test_swap(): too few bytes (%d after header, wanted %d) for all of testdata\n", + length, size); + *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; + return 0; + } + + offset =0; + /* swap 
a 1 entry array */ + ds->swapArray16(ds, inBytes+offset, size16, outBytes+offset, pErrorCode); + offset+=size16; + ds->swapInvChars(ds, inBytes+offset, sizeStr, outBytes+offset, pErrorCode); + } + + return headerSize+size; +} + +/* swap any data (except a .dat package) ------------------------------------ */ + +static const struct { + uint8_t dataFormat[4]; + UDataSwapFn *swapFn; +} swapFns[]={ + { { 0x52, 0x65, 0x73, 0x42 }, ures_swap }, /* dataFormat="ResB" */ +#if !UCONFIG_NO_LEGACY_CONVERSION + { { 0x63, 0x6e, 0x76, 0x74 }, ucnv_swap }, /* dataFormat="cnvt" */ +#endif +#if !UCONFIG_NO_CONVERSION + { { 0x43, 0x76, 0x41, 0x6c }, ucnv_swapAliases }, /* dataFormat="CvAl" */ +#endif +#if !UCONFIG_NO_IDNA + { { 0x53, 0x50, 0x52, 0x50 }, usprep_swap }, /* dataFormat="SPRP" */ +#endif + /* insert data formats here, descending by expected frequency of occurrence */ + { { 0x55, 0x50, 0x72, 0x6f }, uprops_swap }, /* dataFormat="UPro" */ + + { { UCASE_FMT_0, UCASE_FMT_1, UCASE_FMT_2, UCASE_FMT_3 }, + ucase_swap }, /* dataFormat="cAsE" */ + + { { UBIDI_FMT_0, UBIDI_FMT_1, UBIDI_FMT_2, UBIDI_FMT_3 }, + ubidi_swap }, /* dataFormat="BiDi" */ + +#if !UCONFIG_NO_NORMALIZATION + { { 0x4e, 0x6f, 0x72, 0x6d }, unorm_swap }, /* dataFormat="Norm" */ + { { 0x4e, 0x72, 0x6d, 0x32 }, unorm2_swap }, /* dataFormat="Nrm2" */ +#endif + + { { ULAYOUT_FMT_0, ULAYOUT_FMT_1, ULAYOUT_FMT_2, ULAYOUT_FMT_3 }, + ulayout_swap }, // dataFormat="Layo" + + { { u'E', u'm', u'o', u'j' }, uemoji_swap }, + +#if !UCONFIG_NO_COLLATION + { { 0x55, 0x43, 0x6f, 0x6c }, ucol_swap }, /* dataFormat="UCol" */ + { { 0x49, 0x6e, 0x76, 0x43 }, ucol_swapInverseUCA },/* dataFormat="InvC" */ +#endif +#if !UCONFIG_NO_BREAK_ITERATION + { { 0x42, 0x72, 0x6b, 0x20 }, ubrk_swap }, /* dataFormat="Brk " */ + { { 0x44, 0x69, 0x63, 0x74 }, udict_swap }, /* dataFormat="Dict" */ +#endif + { { 0x70, 0x6e, 0x61, 0x6d }, upname_swap }, /* dataFormat="pnam" */ + { { 0x75, 0x6e, 0x61, 0x6d }, uchar_swapNames }, /* dataFormat="unam" */ 
+#if !UCONFIG_NO_NORMALIZATION + { { 0x43, 0x66, 0x75, 0x20 }, uspoof_swap }, /* dataFormat="Cfu " */ +#endif + { { 0x54, 0x65, 0x73, 0x74 }, test_swap } /* dataFormat="Test" */ +}; + +U_CAPI int32_t U_EXPORT2 +udata_swap(const UDataSwapper *ds, + const void *inData, int32_t length, void *outData, + UErrorCode *pErrorCode) { + char dataFormatChars[4]; + const UDataInfo *pInfo; + int32_t i, swappedLength; + + if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) { + return 0; + } + + /* + * Preflight the header first; checks for illegal arguments, too. + * Do not swap the header right away because the format-specific swapper + * will swap it, get the headerSize again, and also use the header + * information. Otherwise we would have to pass some of the information + * and not be able to use the UDataSwapFn signature. + */ + udata_swapDataHeader(ds, inData, -1, nullptr, pErrorCode); + + /* + * If we wanted udata_swap() to also handle non-loadable data like a UTrie, + * then we could check here for further known magic values and structures. 
+ */ + if(U_FAILURE(*pErrorCode)) { + return 0; /* the data format was not recognized */ + } + + pInfo=(const UDataInfo *)((const char *)inData+4); + + { + /* convert the data format from ASCII to Unicode to the system charset */ + char16_t u[4]={ + pInfo->dataFormat[0], pInfo->dataFormat[1], + pInfo->dataFormat[2], pInfo->dataFormat[3] + }; + + if(uprv_isInvariantUString(u, 4)) { + u_UCharsToChars(u, dataFormatChars, 4); + } else { + dataFormatChars[0]=dataFormatChars[1]=dataFormatChars[2]=dataFormatChars[3]='?'; + } + } + + /* dispatch to the swap function for the dataFormat */ + for(i=0; i<UPRV_LENGTHOF(swapFns); ++i) { + if(0==memcmp(swapFns[i].dataFormat, pInfo->dataFormat, 4)) { + swappedLength=swapFns[i].swapFn(ds, inData, length, outData, pErrorCode); + + if(U_FAILURE(*pErrorCode)) { + udata_printError(ds, "udata_swap(): failure swapping data format %02x.%02x.%02x.%02x (\"%c%c%c%c\") - %s\n", + pInfo->dataFormat[0], pInfo->dataFormat[1], + pInfo->dataFormat[2], pInfo->dataFormat[3], + dataFormatChars[0], dataFormatChars[1], + dataFormatChars[2], dataFormatChars[3], + u_errorName(*pErrorCode)); + } else if(swappedLength<(length-15)) { + /* swapped less than expected */ + udata_printError(ds, "udata_swap() warning: swapped only %d out of %d bytes - data format %02x.%02x.%02x.%02x (\"%c%c%c%c\")\n", + swappedLength, length, + pInfo->dataFormat[0], pInfo->dataFormat[1], + pInfo->dataFormat[2], pInfo->dataFormat[3], + dataFormatChars[0], dataFormatChars[1], + dataFormatChars[2], dataFormatChars[3], + u_errorName(*pErrorCode)); + } + + return swappedLength; + } + } + + /* the dataFormat was not recognized */ + udata_printError(ds, "udata_swap(): unknown data format %02x.%02x.%02x.%02x (\"%c%c%c%c\")\n", + pInfo->dataFormat[0], pInfo->dataFormat[1], + pInfo->dataFormat[2], pInfo->dataFormat[3], + dataFormatChars[0], dataFormatChars[1], + dataFormatChars[2], dataFormatChars[3]); + + *pErrorCode=U_UNSUPPORTED_ERROR; + return 0; +} diff --git 
a/intl/icu/source/tools/toolutil/swapimpl.h b/intl/icu/source/tools/toolutil/swapimpl.h new file mode 100644 index 0000000000..8c6474f662 --- /dev/null +++ b/intl/icu/source/tools/toolutil/swapimpl.h @@ -0,0 +1,45 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2005, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: swapimpl.h +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2005jul29 +* created by: Markus W. Scherer +* +* Declarations for data file swapping functions not declared in internal +* library headers. +*/ + +#ifndef __SWAPIMPL_H__ +#define __SWAPIMPL_H__ + +#include "unicode/utypes.h" +#include "udataswp.h" + +/** + * Identifies and then transforms the ICU data piece in-place, or determines + * its length. See UDataSwapFn. + * This function handles single data pieces (but not .dat data packages) + * and internally dispatches to per-type swap functions. + * Sets a U_UNSUPPORTED_ERROR if the data format is not recognized. + * + * @see UDataSwapFn + * @see udata_openSwapper + * @see udata_openSwapperForInputData + * @internal ICU 2.8 + */ +U_CAPI int32_t U_EXPORT2 +udata_swap(const UDataSwapper *ds, + const void *inData, int32_t length, void *outData, + UErrorCode *pErrorCode); + +#endif diff --git a/intl/icu/source/tools/toolutil/toolutil.cpp b/intl/icu/source/tools/toolutil/toolutil.cpp new file mode 100644 index 0000000000..7e7bdc78a1 --- /dev/null +++ b/intl/icu/source/tools/toolutil/toolutil.cpp @@ -0,0 +1,381 @@ +// © 2016 and later: Unicode, Inc. and others. 
+// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 1999-2014, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: toolutil.c +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 1999nov19 +* created by: Markus W. Scherer +* +* 6/25/08 - Added Cygwin specific code in uprv_mkdir - Brian Rower +* +* This file contains utility functions for ICU tools like genccode. +*/ + +#include "unicode/platform.h" +#if U_PLATFORM == U_PF_MINGW +// *cough* - for struct stat +#ifdef __STRICT_ANSI__ +#undef __STRICT_ANSI__ +#endif +#endif + +#include <stdio.h> +#include <sys/stat.h> +#include <fstream> +#include <time.h> +#include "unicode/utypes.h" + +#ifndef U_TOOLUTIL_IMPLEMENTATION +#error U_TOOLUTIL_IMPLEMENTATION not set - must be set for all ICU source files in common/ - see https://unicode-org.github.io/icu/userguide/howtouseicu +#endif + +#if U_PLATFORM_USES_ONLY_WIN32_API +# define VC_EXTRALEAN +# define WIN32_LEAN_AND_MEAN +# define NOUSER +# define NOSERVICE +# define NOIME +# define NOMCX +# if U_PLATFORM == U_PF_MINGW +# define __NO_MINGW_LFS /* gets around missing 'off64_t' */ +# endif +# include <windows.h> +# include <direct.h> +#else +# include <sys/stat.h> +# include <sys/types.h> +#endif + +/* In MinGW environment, io.h needs to be included for _mkdir() */ +#if U_PLATFORM == U_PF_MINGW +#include <io.h> +#endif + +#include <errno.h> + +#include <cstddef> + +#include "unicode/errorcode.h" +#include "unicode/putil.h" +#include "cmemory.h" +#include "cstring.h" +#include "toolutil.h" + +U_NAMESPACE_BEGIN + +IcuToolErrorCode::~IcuToolErrorCode() { + // Safe because our handleFailure() does not throw exceptions. 
+ if(isFailure()) { handleFailure(); } +} + +void IcuToolErrorCode::handleFailure() const { + fprintf(stderr, "error at %s: %s\n", location, errorName()); + exit(errorCode); +} + +U_NAMESPACE_END + +static int32_t currentYear = -1; + +U_CAPI int32_t U_EXPORT2 getCurrentYear() { + if(currentYear == -1) { + time_t now = time(nullptr); + tm *fields = gmtime(&now); + currentYear = 1900 + fields->tm_year; + } + return currentYear; +} + + +U_CAPI const char * U_EXPORT2 +getLongPathname(const char *pathname) { +#if U_PLATFORM_USES_ONLY_WIN32_API + /* anticipate problems with "short" pathnames */ + static WIN32_FIND_DATAA info; + HANDLE file=FindFirstFileA(pathname, &info); + if(file!=INVALID_HANDLE_VALUE) { + if(info.cAlternateFileName[0]!=0) { + /* this file has a short name, get and use the long one */ + const char *basename=findBasename(pathname); + if(basename!=pathname) { + /* prepend the long filename with the original path */ + uprv_memmove(info.cFileName+(basename-pathname), info.cFileName, uprv_strlen(info.cFileName)+1); + uprv_memcpy(info.cFileName, pathname, basename-pathname); + } + pathname=info.cFileName; + } + FindClose(file); + } +#endif + return pathname; +} + +U_CAPI const char * U_EXPORT2 +findDirname(const char *path, char *buffer, int32_t bufLen, UErrorCode* status) { + if(U_FAILURE(*status)) return nullptr; + const char *resultPtr = nullptr; + int32_t resultLen = 0; + + const char *basename=uprv_strrchr(path, U_FILE_SEP_CHAR); +#if U_FILE_ALT_SEP_CHAR!=U_FILE_SEP_CHAR + const char *basenameAlt=uprv_strrchr(path, U_FILE_ALT_SEP_CHAR); + if(basenameAlt && (!basename || basename<basenameAlt)) { + basename = basenameAlt; + } +#endif + if(!basename) { + /* no basename - return ''. 
*/ + resultPtr = ""; + resultLen = 0; + } else { + resultPtr = path; + resultLen = static_cast<int32_t>(basename - path); + if(resultLen<1) { + resultLen = 1; /* '/' or '/a' -> '/' */ + } + } + + if((resultLen+1) <= bufLen) { + uprv_strncpy(buffer, resultPtr, resultLen); + buffer[resultLen]=0; + return buffer; + } else { + *status = U_BUFFER_OVERFLOW_ERROR; + return nullptr; + } +} + +U_CAPI const char * U_EXPORT2 +findBasename(const char *filename) { + const char *basename=uprv_strrchr(filename, U_FILE_SEP_CHAR); + +#if U_FILE_ALT_SEP_CHAR!=U_FILE_SEP_CHAR + //be lenient about pathname separators on Windows, like official implementation of C++17 std::filesystem in MSVC + //would be convenient to merge this loop with the one above, but alas, there is no such solution in the standard library + const char *alt_basename=uprv_strrchr(filename, U_FILE_ALT_SEP_CHAR); + if(alt_basename>basename) { + basename=alt_basename; + } +#endif + + if(basename!=nullptr) { + return basename+1; + } else { + return filename; + } +} + +U_CAPI void U_EXPORT2 +uprv_mkdir(const char *pathname, UErrorCode *status) { + + int retVal = 0; +#if U_PLATFORM_USES_ONLY_WIN32_API + retVal = _mkdir(pathname); +#else + retVal = mkdir(pathname, S_IRWXU | (S_IROTH | S_IXOTH) | (S_IROTH | S_IXOTH)); +#endif + if (retVal && errno != EEXIST) { +#if U_PF_MINGW <= U_PLATFORM && U_PLATFORM <= U_PF_CYGWIN + /*if using Cygwin and the mkdir says it failed...check if the directory already exists..*/ + /* if it does...don't give the error, if it does not...give the error - Brian Rower - 6/25/08 */ + struct stat st; + + if(stat(pathname,&st) != 0) + { + *status = U_FILE_ACCESS_ERROR; + } +#else + *status = U_FILE_ACCESS_ERROR; +#endif + } +} + +#if !UCONFIG_NO_FILE_IO +U_CAPI UBool U_EXPORT2 +uprv_fileExists(const char *file) { + struct stat stat_buf; + if (stat(file, &stat_buf) == 0) { + return true; + } else { + return false; + } +} +#endif + +U_CAPI int32_t U_EXPORT2 +uprv_compareGoldenFiles( + const char* 
buffer, int32_t bufferLen, + const char* goldenFilePath, + bool overwrite) { + + if (overwrite) { + std::ofstream ofs; + ofs.open(goldenFilePath); + ofs.write(buffer, bufferLen); + ofs.close(); + return -1; + } + + std::ifstream ifs(goldenFilePath, std::ifstream::in); + int32_t pos = 0; + char c; + while (ifs.get(c) && pos < bufferLen) { + if (c != buffer[pos]) { + // Files differ at this position + break; + } + pos++; + } + if (pos == bufferLen && ifs.eof()) { + // Files are same lengths + pos = -1; + } + ifs.close(); + return pos; +} + +/*U_CAPI UDate U_EXPORT2 +uprv_getModificationDate(const char *pathname, UErrorCode *status) +{ + if(U_FAILURE(*status)) { + return; + } + // TODO: handle case where stat is not available + struct stat st; + + if(stat(pathname,&st) != 0) + { + *status = U_FILE_ACCESS_ERROR; + } else { + return st.st_mtime; + } +} +*/ + +/* tool memory helper ------------------------------------------------------- */ + +struct UToolMemory { + char name[64]; + int32_t capacity, maxCapacity, size, idx; + void *array; + alignas(std::max_align_t) char staticArray[1]; +}; + +U_CAPI UToolMemory * U_EXPORT2 +utm_open(const char *name, int32_t initialCapacity, int32_t maxCapacity, int32_t size) { + UToolMemory *mem; + + if(maxCapacity<initialCapacity) { + maxCapacity=initialCapacity; + } + + mem=(UToolMemory *)uprv_malloc(sizeof(UToolMemory)+initialCapacity*size); + if(mem==nullptr) { + fprintf(stderr, "error: %s - out of memory\n", name); + exit(U_MEMORY_ALLOCATION_ERROR); + } + mem->array=mem->staticArray; + + uprv_strcpy(mem->name, name); + mem->capacity=initialCapacity; + mem->maxCapacity=maxCapacity; + mem->size=size; + mem->idx=0; + return mem; +} + +U_CAPI void U_EXPORT2 +utm_close(UToolMemory *mem) { + if(mem!=nullptr) { + if(mem->array!=mem->staticArray) { + uprv_free(mem->array); + } + uprv_free(mem); + } +} + + +U_CAPI void * U_EXPORT2 +utm_getStart(UToolMemory *mem) { + return (char *)mem->array; +} + +U_CAPI int32_t U_EXPORT2 
+utm_countItems(UToolMemory *mem) { + return mem->idx; +} + + +static UBool +utm_hasCapacity(UToolMemory *mem, int32_t capacity) { + if(mem->capacity<capacity) { + int32_t newCapacity; + + if(mem->maxCapacity<capacity) { + fprintf(stderr, "error: %s - trying to use more than maxCapacity=%ld units\n", + mem->name, (long)mem->maxCapacity); + exit(U_MEMORY_ALLOCATION_ERROR); + } + + /* try to allocate a larger array */ + if(capacity>=2*mem->capacity) { + newCapacity=capacity; + } else if(mem->capacity<=mem->maxCapacity/3) { + newCapacity=2*mem->capacity; + } else { + newCapacity=mem->maxCapacity; + } + + if(mem->array==mem->staticArray) { + mem->array=uprv_malloc(newCapacity*mem->size); + if(mem->array!=nullptr) { + uprv_memcpy(mem->array, mem->staticArray, (size_t)mem->idx*mem->size); + } + } else { + mem->array=uprv_realloc(mem->array, newCapacity*mem->size); + } + + if(mem->array==nullptr) { + fprintf(stderr, "error: %s - out of memory\n", mem->name); + exit(U_MEMORY_ALLOCATION_ERROR); + } + mem->capacity=newCapacity; + } + + return true; +} + +U_CAPI void * U_EXPORT2 +utm_alloc(UToolMemory *mem) { + char *p=nullptr; + int32_t oldIndex=mem->idx; + int32_t newIndex=oldIndex+1; + if(utm_hasCapacity(mem, newIndex)) { + p=(char *)mem->array+oldIndex*mem->size; + mem->idx=newIndex; + uprv_memset(p, 0, mem->size); + } + return p; +} + +U_CAPI void * U_EXPORT2 +utm_allocN(UToolMemory *mem, int32_t n) { + char *p=nullptr; + int32_t oldIndex=mem->idx; + int32_t newIndex=oldIndex+n; + if(utm_hasCapacity(mem, newIndex)) { + p=(char *)mem->array+oldIndex*mem->size; + mem->idx=newIndex; + uprv_memset(p, 0, n*mem->size); + } + return p; +} diff --git a/intl/icu/source/tools/toolutil/toolutil.h b/intl/icu/source/tools/toolutil/toolutil.h new file mode 100644 index 0000000000..b32a0b8762 --- /dev/null +++ b/intl/icu/source/tools/toolutil/toolutil.h @@ -0,0 +1,201 @@ +// © 2016 and later: Unicode, Inc. and others. 
+// License & terms of use: http://www.unicode.org/copyright.html
+/*
+*******************************************************************************
+*
+* Copyright (C) 1999-2013, International Business Machines
+* Corporation and others. All Rights Reserved.
+*
+*******************************************************************************
+* file name: toolutil.h
+* encoding: UTF-8
+* tab size: 8 (not used)
+* indentation:4
+*
+* created on: 1999nov19
+* created by: Markus W. Scherer
+*
+* This file defines utility functions for ICU tools like genccode.
+*/
+
+#ifndef __TOOLUTIL_H__
+#define __TOOLUTIL_H__
+
+#include "unicode/utypes.h"
+
+#ifdef __cplusplus
+
+#include "unicode/errorcode.h"
+
+U_NAMESPACE_BEGIN
+
+/**
+ * ErrorCode subclass for use in ICU command-line tools.
+ * The destructor calls handleFailure() which calls exit(errorCode) when isFailure().
+ */
+class U_TOOLUTIL_API IcuToolErrorCode : public ErrorCode {
+public:
+    /**
+     * @param loc A short string describing where the IcuToolErrorCode is used.
+     *            Only the pointer is stored, so the string must outlive this object.
+     */
+    IcuToolErrorCode(const char *loc) : location(loc) {}
+    virtual ~IcuToolErrorCode();
+protected:
+    virtual void handleFailure() const override;
+private:
+    /** The loc string passed to the constructor (not owned). */
+    const char *location;
+};
+
+U_NAMESPACE_END
+
+#endif
+
+/*
+ * For Windows, a path/filename may be the short (8.3) version
+ * of the "real", long one. In this case, the short one
+ * is abbreviated and contains a tilde etc.
+ * This function returns a pointer to the original pathname
+ * if it is the "real" one itself, and a pointer to a static
+ * buffer (not thread-safe) containing the long version
+ * if the pathname is indeed abbreviated.
+ *
+ * On platforms other than Windows, this function always returns
+ * the input pathname pointer.
+ *
+ * This function is especially useful in tools that are called
+ * by a batch file for loop, which yields short pathnames on Win9x.
+ */
+U_CAPI const char * U_EXPORT2
+getLongPathname(const char *pathname);
+
+/**
+ * Find the basename at the end of a pathname, i.e., the part
+ * after the last file separator, and return a pointer
+ * to this part of the pathname.
+ * If the pathname only contains a basename and no file separator,
+ * then the pathname pointer itself is returned.
+ **/
+U_CAPI const char * U_EXPORT2
+findBasename(const char *filename);
+
+/**
+ * Find the directory name of a pathname, that is, everything
+ * up to but not including the last file separator.
+ *
+ * If successful, copies the directory name into the output buffer along with
+ * a terminating NUL.
+ *
+ * If there isn't a directory name in the path, it returns an empty string.
+ * @param path the full pathname to inspect.
+ * @param buffer the output buffer
+ * @param bufLen the output buffer length
+ * @param status error code; may be set to U_BUFFER_OVERFLOW_ERROR if bufLen is too small.
+ * @return If successful, a pointer to the output buffer. If failure or bufLen is too small, NULL.
+ **/
+U_CAPI const char * U_EXPORT2
+findDirname(const char *path, char *buffer, int32_t bufLen, UErrorCode* status);
+
+/*
+ * Return the current year in the Gregorian calendar. Used for copyright generation.
+ */
+U_CAPI int32_t U_EXPORT2
+getCurrentYear();
+
+/*
+ * Creates a directory with pathname.
+ *
+ * @param status Set to an error code when mkdir failed.
+ */
+U_CAPI void U_EXPORT2
+uprv_mkdir(const char *pathname, UErrorCode *status);
+
+#if !UCONFIG_NO_FILE_IO
+/**
+ * Return true if the named item exists
+ * @param file filename
+ * @return true if named item (file, dir, etc) exists, false otherwise
+ */
+U_CAPI UBool U_EXPORT2
+uprv_fileExists(const char *file);
+#endif
+
+/**
+ * Performs a golden data test: compares the contents of the buffer with
+ * the data in goldenFilePath.
+ *
+ * Pass the value of the -G flag to "overwrite"; if true, no comparison is
+ * done and the buffer is written to goldenFilePath as the new golden data.
+ *
+ * @return The first index at which the files differ, or -1 if they are the same
+ *         or if the golden file was overwritten.
+ */
+U_CAPI int32_t U_EXPORT2
+uprv_compareGoldenFiles(
+    const char* buffer, int32_t bufferLen,
+    const char* goldenFilePath,
+    bool overwrite);
+
+/**
+ * Return the modification date for the specified file or directory.
+ * Return value is undefined if there was an error.
+ * (This function is currently disabled; its declaration is commented out.)
+ */
+/*U_CAPI UDate U_EXPORT2
+uprv_getModificationDate(const char *pathname, UErrorCode *status);
+*/
+/*
+ * Leftover parameter note for the disabled uprv_getModificationDate() above:
+ *
+ * @param status Set to an error code when the modification date cannot be read.
+ */
+
+/*
+ * UToolMemory is used for generic, custom memory management.
+ * It is allocated with enough space for initialCapacity*size bytes starting
+ * at array.
+ * The array is declared with the maximum fundamental alignment so
+ * that its base address is aligned for any types.
+ * If size is a multiple of a data type size, then such items
+ * can be safely allocated inside the array, at offsets that
+ * are themselves multiples of size.
+ */
+struct UToolMemory;
+typedef struct UToolMemory UToolMemory;
+
+/**
+ * Open a UToolMemory object for allocation of initialCapacity to maxCapacity
+ * items with size bytes each.
+ * The name is used in the error messages printed by the utm_ functions.
+ */
+U_CAPI UToolMemory * U_EXPORT2
+utm_open(const char *name, int32_t initialCapacity, int32_t maxCapacity, int32_t size);
+
+/**
+ * Close a UToolMemory object.
+ */
+U_CAPI void U_EXPORT2
+utm_close(UToolMemory *mem);
+
+/**
+ * Get the pointer to the beginning of the array of items.
+ * The pointer becomes invalid after allocation of new items.
+ */
+U_CAPI void * U_EXPORT2
+utm_getStart(UToolMemory *mem);
+
+/**
+ * Get the current number of items.
+ */
+U_CAPI int32_t U_EXPORT2
+utm_countItems(UToolMemory *mem);
+
+/**
+ * Allocate one more item and return the pointer to its start in the array.
+ */
+U_CAPI void * U_EXPORT2
+utm_alloc(UToolMemory *mem);
+
+/**
+ * Allocate n items and return the pointer to the start of the first one in the array.
+ */ +U_CAPI void * U_EXPORT2 +utm_allocN(UToolMemory *mem, int32_t n); + +#endif diff --git a/intl/icu/source/tools/toolutil/toolutil.vcxproj b/intl/icu/source/tools/toolutil/toolutil.vcxproj new file mode 100644 index 0000000000..0995ef06f7 --- /dev/null +++ b/intl/icu/source/tools/toolutil/toolutil.vcxproj @@ -0,0 +1,272 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <PropertyGroup Label="Globals"> + <ProjectGuid>{6B231032-3CB5-4EED-9210-810D666A23A0}</ProjectGuid> + </PropertyGroup> + <PropertyGroup Label="Configuration"> + <ConfigurationType>DynamicLibrary</ConfigurationType> + <UseOfMfc>false</UseOfMfc> + <CharacterSet>MultiByte</CharacterSet> + </PropertyGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> + <!-- The following import will include the 'default' configuration options for VS projects. --> + <Import Project="..\..\allinone\Build.Windows.ProjectConfiguration.props" /> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> + <ImportGroup Label="ExtensionSettings"> + </ImportGroup> + <PropertyGroup Label="UserMacros" /> + <PropertyGroup> + <_ProjectFileVersion>10.0.30319.1</_ProjectFileVersion> + <OutDir>.\$(Platform)\$(Configuration)\</OutDir> + <IntDir>.\$(Platform)\$(Configuration)\</IntDir> + <!-- The ICU projects use "Win32" to mean "x86", so we need to special case it. 
--> + <OutDir Condition="'$(Platform)'=='Win32'">.\x86\$(Configuration)\</OutDir> + <IntDir Condition="'$(Platform)'=='Win32'">.\x86\$(Configuration)\</IntDir> + <!-- Disable Incremental Linking for Release builds as it prevents Link-time Code Generation --> + <LinkIncremental Condition="'$(Configuration)'=='Debug'">true</LinkIncremental> + <LinkIncremental Condition="'$(Configuration)'=='Release'">false</LinkIncremental> + </PropertyGroup> + <!-- Options that are common to *all* project configurations --> + <ItemDefinitionGroup> + <ClCompile> + <AdditionalIncludeDirectories>..\..\..\include;..\..\common;..\..\i18n;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> + <PreprocessorDefinitions>U_TOOLUTIL_IMPLEMENTATION;%(PreprocessorDefinitions)</PreprocessorDefinitions> + <DisableLanguageExtensions>false</DisableLanguageExtensions> + <WarningLevel>Level3</WarningLevel> + <CompileAs>Default</CompileAs> + </ClCompile> + </ItemDefinitionGroup> + <!-- Options that are common to all 'Debug' project configurations --> + <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'"> + <ClCompile> + <BrowseInformation>true</BrowseInformation> + <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary> + <DebugInformationFormat>EditAndContinue</DebugInformationFormat> + </ClCompile> + <Link> + <AdditionalDependencies>icuucd.lib;icuind.lib;%(AdditionalDependencies)</AdditionalDependencies> + </Link> + </ItemDefinitionGroup> + <!-- Options that are common to all 'Release' project configurations --> + <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'"> + <ClCompile> + <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary> + <FunctionLevelLinking>true</FunctionLevelLinking> + </ClCompile> + <Link> + <AdditionalDependencies>icuuc.lib;icuin.lib;%(AdditionalDependencies)</AdditionalDependencies> + </Link> + </ItemDefinitionGroup> + <!-- Options that are common to all 'Win32' project configurations --> + <ItemDefinitionGroup Condition="'$(Platform)'=='Win32'"> + 
<ClCompile> + <PrecompiledHeaderOutputFile>.\x86\$(Configuration)/toolutil.pch</PrecompiledHeaderOutputFile> + <AssemblerListingLocation>.\x86\$(Configuration)/</AssemblerListingLocation> + <ObjectFileName>.\x86\$(Configuration)/</ObjectFileName> + <ProgramDataBaseFileName>.\x86\$(Configuration)/</ProgramDataBaseFileName> + </ClCompile> + <Link> + <AdditionalLibraryDirectories>..\..\..\lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories> + </Link> + </ItemDefinitionGroup> + <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'"> + <Midl> + <TypeLibraryName>.\..\..\..\lib\icutu.tlb</TypeLibraryName> + </Midl> + <Link> + <OutputFile>..\..\..\bin\icutu$(IcuMajorVersion).dll</OutputFile> + <ProgramDatabaseFile>.\..\..\..\lib\icutu.pdb</ProgramDatabaseFile> + <DataExecutionPrevention> + </DataExecutionPrevention> + <ImportLibrary>..\..\..\lib\icutu.lib</ImportLibrary> + </Link> + </ItemDefinitionGroup> + <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> + <Midl> + <TypeLibraryName>.\..\..\..\lib\icutud.tlb</TypeLibraryName> + </Midl> + <Link> + <OutputFile>..\..\..\bin\icutu$(IcuMajorVersion)d.dll</OutputFile> + <ProgramDatabaseFile>.\..\..\..\lib\icutud.pdb</ProgramDatabaseFile> + <DataExecutionPrevention> + </DataExecutionPrevention> + <ImportLibrary>..\..\..\lib\icutud.lib</ImportLibrary> + </Link> + </ItemDefinitionGroup> + <!-- Options that are common to all 'x64' project configurations --> + <ItemDefinitionGroup Condition="'$(Platform)'=='x64'"> + <ClCompile> + <PrecompiledHeaderOutputFile>.\x64\$(Configuration)/toolutil.pch</PrecompiledHeaderOutputFile> + <AssemblerListingLocation>.\x64\$(Configuration)/</AssemblerListingLocation> + <ObjectFileName>.\x64\$(Configuration)/</ObjectFileName> + <ProgramDataBaseFileName>.\x64\$(Configuration)/</ProgramDataBaseFileName> + </ClCompile> + <Link> + 
<AdditionalLibraryDirectories>..\..\..\lib64;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories> + </Link> + </ItemDefinitionGroup> + <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> + <Midl> + <TypeLibraryName>.\..\..\..\lib64\icutu.tlb</TypeLibraryName> + </Midl> + <ClCompile> + <WholeProgramOptimization>true</WholeProgramOptimization> + </ClCompile> + <Link> + <OutputFile>..\..\..\bin64\icutu$(IcuMajorVersion).dll</OutputFile> + <ProgramDatabaseFile>.\..\..\..\lib64\icutu.pdb</ProgramDatabaseFile> + <ImportLibrary>..\..\..\lib64\icutu.lib</ImportLibrary> + </Link> + </ItemDefinitionGroup> + <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> + <Midl> + <TypeLibraryName>.\..\..\..\lib64\icutud.tlb</TypeLibraryName> + </Midl> + <Link> + <OutputFile>..\..\..\bin64\icutu$(IcuMajorVersion)d.dll</OutputFile> + <ProgramDatabaseFile>.\..\..\..\lib64\icutud.pdb</ProgramDatabaseFile> + <ImportLibrary>..\..\..\lib64\icutud.lib</ImportLibrary> + </Link> + </ItemDefinitionGroup> + <ItemDefinitionGroup Condition="'$(Platform)'=='ARM'"> + <ClCompile> + <PrecompiledHeaderOutputFile>.\ARM\$(Configuration)/toolutil.pch</PrecompiledHeaderOutputFile> + <AssemblerListingLocation>.\ARM\$(Configuration)/</AssemblerListingLocation> + <ObjectFileName>.\ARM\$(Configuration)/</ObjectFileName> + <ProgramDataBaseFileName>.\ARM\$(Configuration)/</ProgramDataBaseFileName> + </ClCompile> + <Link> + <AdditionalLibraryDirectories>.\..\..\..\libARM;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories> + </Link> + </ItemDefinitionGroup> + <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|ARM'"> + <Midl> + <TypeLibraryName>..\..\..\libARM\icutu.tlb</TypeLibraryName> + </Midl> + <Link> + <OutputFile>..\..\..\binARM\icutu$(IcuMajorVersion).dll</OutputFile> + <ProgramDatabaseFile>.\..\..\..\libARM\icutu.pdb</ProgramDatabaseFile> + <ImportLibrary>..\..\..\libARM\icutu.lib</ImportLibrary> + </Link> + 
</ItemDefinitionGroup> + <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|ARM'"> + <Midl> + <TypeLibraryName>.\..\..\..\libARM\icutud.tlb</TypeLibraryName> + </Midl> + <ClCompile> + <DebugInformationFormat>ProgramDatabase</DebugInformationFormat> + </ClCompile> + <Link> + <OutputFile>..\..\..\binARM\icutu$(IcuMajorVersion)d.dll</OutputFile> + <ProgramDatabaseFile>.\..\..\..\libARM\icutud.pdb</ProgramDatabaseFile> + <ImportLibrary>..\..\..\libARM\icutud.lib</ImportLibrary> + </Link> + </ItemDefinitionGroup> + <ItemDefinitionGroup Condition="'$(Platform)'=='ARM64'"> + <ClCompile> + <PrecompiledHeaderOutputFile>.\ARM64\$(Configuration)/toolutil.pch</PrecompiledHeaderOutputFile> + <AssemblerListingLocation>.\ARM64\$(Configuration)/</AssemblerListingLocation> + <ObjectFileName>.\ARM64\$(Configuration)/</ObjectFileName> + <ProgramDataBaseFileName>.\ARM64\$(Configuration)/</ProgramDataBaseFileName> + </ClCompile> + <Link> + <AdditionalLibraryDirectories>.\..\..\..\libARM64;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories> + </Link> + </ItemDefinitionGroup> + <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|ARM64'"> + <Midl> + <TypeLibraryName>.\..\..\..\libARM64\icutu.tlb</TypeLibraryName> + </Midl> + <Link> + <OutputFile>..\..\..\binARM64\icutu$(IcuMajorVersion).dll</OutputFile> + <ProgramDatabaseFile>.\..\..\..\libARM64\icutu.pdb</ProgramDatabaseFile> + <ImportLibrary>..\..\..\libARM64\icutu.lib</ImportLibrary> + </Link> + </ItemDefinitionGroup> + <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|ARM64'"> + <Midl> + <TypeLibraryName>.\..\..\..\libARM64\icutud.tlb</TypeLibraryName> + </Midl> + <ClCompile> + <DebugInformationFormat>ProgramDatabase</DebugInformationFormat> + </ClCompile> + <Link> + <OutputFile>..\..\..\binARM64\icutu$(IcuMajorVersion)d.dll</OutputFile> + <ProgramDatabaseFile>.\..\..\..\libARM64\icutud.pdb</ProgramDatabaseFile> + 
<ImportLibrary>..\..\..\libARM64\icutud.lib</ImportLibrary> + </Link> + </ItemDefinitionGroup> + <ItemGroup> + <ClCompile Include="collationinfo.cpp"> + <DisableLanguageExtensions>false</DisableLanguageExtensions> + </ClCompile> + <ClCompile Include="denseranges.cpp" /> + <ClCompile Include="filestrm.cpp" /> + <ClCompile Include="filetools.cpp" /> + <ClCompile Include="flagparser.cpp" /> + <ClCompile Include="package.cpp" /> + <ClCompile Include="pkg_genc.cpp"> + <DisableLanguageExtensions>false</DisableLanguageExtensions> + </ClCompile> + <ClCompile Include="pkg_gencmn.cpp"> + <DisableLanguageExtensions>false</DisableLanguageExtensions> + </ClCompile> + <ClCompile Include="pkg_icu.cpp" /> + <ClCompile Include="pkgitems.cpp" /> + <ClCompile Include="ppucd.cpp"> + <DisableLanguageExtensions>false</DisableLanguageExtensions> + </ClCompile> + <ClCompile Include="swapimpl.cpp"> + <DisableLanguageExtensions>false</DisableLanguageExtensions> + </ClCompile> + <ClCompile Include="toolutil.cpp"> + <DisableLanguageExtensions>false</DisableLanguageExtensions> + </ClCompile> + <ClCompile Include="ucbuf.cpp" /> + <ClCompile Include="ucm.cpp" /> + <ClCompile Include="ucmstate.cpp" /> + <ClCompile Include="unewdata.cpp" /> + <ClCompile Include="uoptions.cpp" /> + <ClCompile Include="uparse.cpp" /> + <ClCompile Include="writesrc.cpp" /> + <ClCompile Include="xmlparser.cpp"> + <DisableLanguageExtensions>false</DisableLanguageExtensions> + </ClCompile> + <ClCompile Include="dbgutil.cpp"> + <DisableLanguageExtensions>false</DisableLanguageExtensions> + </ClCompile> + <ClCompile Include="udbgutil.cpp"> + <DisableLanguageExtensions>false</DisableLanguageExtensions> + </ClCompile> + <ClCompile Include="ucln_tu.cpp"> + <DisableLanguageExtensions>false</DisableLanguageExtensions> + </ClCompile> + </ItemGroup> + <ItemGroup> + <ClInclude Include="collationinfo.h" /> + <ClInclude Include="denseranges.h" /> + <ClInclude Include="filestrm.h" /> + <ClInclude Include="filetools.h" /> + 
<ClInclude Include="flagparser.h" /> + <ClInclude Include="package.h" /> + <ClInclude Include="pkg_genc.h" /> + <ClInclude Include="pkg_gencmn.h" /> + <ClInclude Include="pkg_icu.h" /> + <ClInclude Include="pkg_imp.h" /> + <ClInclude Include="ppucd.h" /> + <ClInclude Include="swapimpl.h" /> + <ClInclude Include="toolutil.h" /> + <ClInclude Include="ucbuf.h" /> + <ClInclude Include="ucm.h" /> + <ClInclude Include="unewdata.h" /> + <ClInclude Include="uoptions.h" /> + <ClInclude Include="uparse.h" /> + <ClInclude Include="writesrc.h" /> + <ClInclude Include="xmlparser.h" /> + <ClInclude Include="dbgutil.h" /> + <ClInclude Include="udbgutil.h" /> + </ItemGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> + <ImportGroup Label="ExtensionTargets"> + </ImportGroup> +</Project> diff --git a/intl/icu/source/tools/toolutil/ucbuf.cpp b/intl/icu/source/tools/toolutil/ucbuf.cpp new file mode 100644 index 0000000000..1eb54e260e --- /dev/null +++ b/intl/icu/source/tools/toolutil/ucbuf.cpp @@ -0,0 +1,788 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 1998-2016, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* +* File ucbuf.cpp +* +* Modification History: +* +* Date Name Description +* 05/10/01 Ram Creation. 
+******************************************************************************* +*/ + +#include "unicode/utypes.h" +#include "unicode/putil.h" +#include "unicode/uchar.h" +#include "unicode/ucnv.h" +#include "unicode/ucnv_err.h" +#include "unicode/ustring.h" +#include "unicode/utf16.h" +#include "filestrm.h" +#include "cstring.h" +#include "cmemory.h" +#include "ustrfmt.h" +#include "ucbuf.h" +#include <stdio.h> + +#if !UCONFIG_NO_CONVERSION + + +#define MAX_IN_BUF 1000 +#define MAX_U_BUF 1500 +#define CONTEXT_LEN 20 + +struct UCHARBUF { + char16_t* buffer; + char16_t* currentPos; + char16_t* bufLimit; + int32_t bufCapacity; + int32_t remaining; + int32_t signatureLength; + FileStream* in; + UConverter* conv; + UBool showWarning; /* makes this API not produce any errors */ + UBool isBuffered; +}; + +U_CAPI UBool U_EXPORT2 +ucbuf_autodetect_fs(FileStream* in, const char** cp, UConverter** conv, int32_t* signatureLength, UErrorCode* error){ + char start[8]; + int32_t numRead; + + char16_t target[1]={ 0 }; + char16_t* pTarget; + const char* pStart; + + /* read a few bytes */ + numRead=T_FileStream_read(in, start, sizeof(start)); + + *cp = ucnv_detectUnicodeSignature(start, numRead, signatureLength, error); + + /* unread the bytes beyond what was consumed for U+FEFF */ + T_FileStream_rewind(in); + if (*signatureLength > 0) { + T_FileStream_read(in, start, *signatureLength); + } + + if(*cp==nullptr){ + *conv =nullptr; + return false; + } + + /* open the converter for the detected Unicode charset */ + *conv = ucnv_open(*cp,error); + + /* convert and ignore initial U+FEFF, and the buffer overflow */ + pTarget = target; + pStart = start; + ucnv_toUnicode(*conv, &pTarget, target+1, &pStart, start+*signatureLength, nullptr, false, error); + *signatureLength = (int32_t)(pStart - start); + if(*error==U_BUFFER_OVERFLOW_ERROR) { + *error=U_ZERO_ERROR; + } + + /* verify that we successfully read exactly U+FEFF */ + if(U_SUCCESS(*error) && (pTarget!=(target+1) || 
target[0]!=0xfeff)) { + *error=U_INTERNAL_PROGRAM_ERROR; + } + + + return true; +} +static UBool ucbuf_isCPKnown(const char* cp){ + if(ucnv_compareNames("UTF-8",cp)==0){ + return true; + } + if(ucnv_compareNames("UTF-16BE",cp)==0){ + return true; + } + if(ucnv_compareNames("UTF-16LE",cp)==0){ + return true; + } + if(ucnv_compareNames("UTF-16",cp)==0){ + return true; + } + if(ucnv_compareNames("UTF-32",cp)==0){ + return true; + } + if(ucnv_compareNames("UTF-32BE",cp)==0){ + return true; + } + if(ucnv_compareNames("UTF-32LE",cp)==0){ + return true; + } + if(ucnv_compareNames("SCSU",cp)==0){ + return true; + } + if(ucnv_compareNames("BOCU-1",cp)==0){ + return true; + } + if(ucnv_compareNames("UTF-7",cp)==0){ + return true; + } + return false; +} + +U_CAPI FileStream * U_EXPORT2 +ucbuf_autodetect(const char* fileName, const char** cp,UConverter** conv, int32_t* signatureLength,UErrorCode* error){ + FileStream* in=nullptr; + if(error==nullptr || U_FAILURE(*error)){ + return nullptr; + } + if(conv==nullptr || cp==nullptr || fileName==nullptr){ + *error = U_ILLEGAL_ARGUMENT_ERROR; + return nullptr; + } + /* open the file */ + in= T_FileStream_open(fileName,"rb"); + + if(in == nullptr){ + *error=U_FILE_ACCESS_ERROR; + return nullptr; + } + + if(ucbuf_autodetect_fs(in,cp,conv,signatureLength,error)) { + return in; + } else { + ucnv_close(*conv); + *conv=nullptr; + T_FileStream_close(in); + return nullptr; + } +} + +/* fill the uchar buffer */ +static UCHARBUF* +ucbuf_fillucbuf( UCHARBUF* buf,UErrorCode* error){ + char16_t* pTarget=nullptr; + char16_t* target=nullptr; + const char* source=nullptr; + char carr[MAX_IN_BUF] = {'\0'}; + char* cbuf = carr; + int32_t inputRead=0; + int32_t outputWritten=0; + int32_t offset=0; + const char* sourceLimit =nullptr; + int32_t cbufSize=0; + pTarget = buf->buffer; + /* check if we arrived here without exhausting the buffer*/ + if(buf->currentPos<buf->bufLimit){ + offset = (int32_t)(buf->bufLimit-buf->currentPos); + 
memmove(buf->buffer,buf->currentPos,offset* sizeof(char16_t)); + } + +#ifdef UCBUF_DEBUG + memset(pTarget+offset,0xff,sizeof(char16_t)*(MAX_IN_BUF-offset)); +#endif + if(buf->isBuffered){ + cbufSize = MAX_IN_BUF; + /* read the file */ + inputRead=T_FileStream_read(buf->in,cbuf,cbufSize-offset); + buf->remaining-=inputRead; + + }else{ + cbufSize = T_FileStream_size(buf->in); + cbuf = (char*)uprv_malloc(cbufSize); + if (cbuf == nullptr) { + *error = U_MEMORY_ALLOCATION_ERROR; + return nullptr; + } + inputRead= T_FileStream_read(buf->in,cbuf,cbufSize); + buf->remaining-=inputRead; + } + + /* just to be sure...*/ + if ( 0 == inputRead ) + buf->remaining = 0; + + target=pTarget; + /* convert the bytes */ + if(buf->conv){ + /* set the callback to stop */ + UConverterToUCallback toUOldAction ; + void* toUOldContext; + void* toUNewContext=nullptr; + ucnv_setToUCallBack(buf->conv, + UCNV_TO_U_CALLBACK_STOP, + toUNewContext, + &toUOldAction, + (const void**)&toUOldContext, + error); + /* since state is saved in the converter we add offset to source*/ + target = pTarget+offset; + source = cbuf; + sourceLimit = source + inputRead; + ucnv_toUnicode(buf->conv,&target,target+(buf->bufCapacity-offset), + &source,sourceLimit,nullptr, + (UBool)(buf->remaining==0),error); + + if(U_FAILURE(*error)){ + char context[CONTEXT_LEN+1]; + char preContext[CONTEXT_LEN+1]; + char postContext[CONTEXT_LEN+1]; + int8_t len = CONTEXT_LEN; + int32_t start=0; + int32_t stop =0; + int32_t pos =0; + /* use erro1 to preserve the error code */ + UErrorCode error1 =U_ZERO_ERROR; + + if( buf->showWarning==true){ + fprintf(stderr,"\n###WARNING: Encountered abnormal bytes while" + " converting input stream to target encoding: %s\n", + u_errorName(*error)); + } + + + /* now get the context chars */ + ucnv_getInvalidChars(buf->conv,context,&len,&error1); + context[len]= 0 ; /* null terminate the buffer */ + + pos = (int32_t)(source - cbuf - len); + + /* for pre-context */ + start = (pos <=CONTEXT_LEN)? 
0 : (pos - (CONTEXT_LEN-1)); + stop = pos-len; + + memcpy(preContext,cbuf+start,stop-start); + /* null terminate the buffer */ + preContext[stop-start] = 0; + + /* for post-context */ + start = pos+len; + stop = (int32_t)(((pos+CONTEXT_LEN)<= (sourceLimit-cbuf) )? (pos+(CONTEXT_LEN-1)) : (sourceLimit-cbuf)); + + memcpy(postContext,source,stop-start); + /* null terminate the buffer */ + postContext[stop-start] = 0; + + if(buf->showWarning ==true){ + /* print out the context */ + fprintf(stderr,"\tPre-context: %s\n",preContext); + fprintf(stderr,"\tContext: %s\n",context); + fprintf(stderr,"\tPost-context: %s\n", postContext); + } + + /* reset the converter */ + ucnv_reset(buf->conv); + + /* set the call back to substitute + * and restart conversion + */ + ucnv_setToUCallBack(buf->conv, + UCNV_TO_U_CALLBACK_SUBSTITUTE, + toUNewContext, + &toUOldAction, + (const void**)&toUOldContext, + &error1); + + /* reset source and target start positions */ + target = pTarget+offset; + source = cbuf; + + /* re convert */ + ucnv_toUnicode(buf->conv,&target,target+(buf->bufCapacity-offset), + &source,sourceLimit,nullptr, + (UBool)(buf->remaining==0),&error1); + + } + outputWritten = (int32_t)(target - pTarget); + +#ifdef UCBUF_DEBUG + { + int i; + target = pTarget; + for(i=0;i<numRead;i++){ + /* printf("%c", (char)(*target++));*/ + } + } +#endif + + }else{ + u_charsToUChars(cbuf,target+offset,inputRead); + outputWritten=((buf->remaining>cbufSize)? 
cbufSize:inputRead+offset); + } + buf->currentPos = pTarget; + buf->bufLimit=pTarget+outputWritten; + *buf->bufLimit=0; /*NUL terminate*/ + if(cbuf!=carr){ + uprv_free(cbuf); + } + return buf; +} + + + +/* get a char16_t from the stream*/ +U_CAPI int32_t U_EXPORT2 +ucbuf_getc(UCHARBUF* buf,UErrorCode* error){ + if(error==nullptr || U_FAILURE(*error)){ + return false; + } + if(buf->currentPos>=buf->bufLimit){ + if(buf->remaining==0){ + return U_EOF; + } + buf=ucbuf_fillucbuf(buf,error); + if(U_FAILURE(*error)){ + return U_EOF; + } + } + + return *(buf->currentPos++); +} + +/* get a UChar32 from the stream*/ +U_CAPI int32_t U_EXPORT2 +ucbuf_getc32(UCHARBUF* buf,UErrorCode* error){ + int32_t retVal = (int32_t)U_EOF; + if(error==nullptr || U_FAILURE(*error)){ + return false; + } + if(buf->currentPos+1>=buf->bufLimit){ + if(buf->remaining==0){ + return U_EOF; + } + buf=ucbuf_fillucbuf(buf,error); + if(U_FAILURE(*error)){ + return U_EOF; + } + } + if(U16_IS_LEAD(*(buf->currentPos))){ + retVal=U16_GET_SUPPLEMENTARY(buf->currentPos[0],buf->currentPos[1]); + buf->currentPos+=2; + }else{ + retVal = *(buf->currentPos++); + } + return retVal; +} + +/* u_unescapeAt() callback to return a char16_t*/ +static char16_t U_CALLCONV +_charAt(int32_t offset, void *context) { + return ((UCHARBUF*) context)->currentPos[offset]; +} + +/* getc and escape it */ +U_CAPI int32_t U_EXPORT2 +ucbuf_getcx32(UCHARBUF* buf,UErrorCode* error) { + int32_t length; + int32_t offset; + UChar32 c32,c1,c2; + if(error==nullptr || U_FAILURE(*error)){ + return false; + } + /* Fill the buffer if it is empty */ + if (buf->currentPos >=buf->bufLimit-2) { + ucbuf_fillucbuf(buf,error); + } + + /* Get the next character in the buffer */ + if (buf->currentPos < buf->bufLimit) { + c1 = *(buf->currentPos)++; + } else { + c1 = U_EOF; + } + + c2 = *(buf->currentPos); + + /* If it isn't a backslash, return it */ + if (c1 != 0x005C) { + return c1; + } + + /* Determine the amount of data in the buffer */ + length = 
(int32_t)(buf->bufLimit - buf->currentPos);
+
+    /* The longest escape sequence is \Uhhhhhhhh; make sure
+       we have at least that many characters */
+    if (length < 10) {
+
+        /* fill the buffer */
+        ucbuf_fillucbuf(buf,error);
+        length = (int32_t)(buf->bufLimit - buf->buffer);
+    }
+
+    /* Process the escape */
+    offset = 0;
+    c32 = u_unescapeAt(_charAt, &offset, length, (void*)buf);
+
+    /* check if u_unescapeAt unescaped and converted
+     * to c32 or not
+     */
+    if(c32==(UChar32)0xFFFFFFFF){
+        if(buf->showWarning) {
+            char context[CONTEXT_LEN+1];
+            int32_t len = CONTEXT_LEN;
+            if(length < len) {
+                len = length;
+            }
+            context[len]= 0 ; /* null terminate the buffer */
+            u_UCharsToChars( buf->currentPos, context, len);
+            fprintf(stderr,"Bad escape: [%c%s]...\n", (int)c1, context);
+        }
+        *error= U_ILLEGAL_ESCAPE_SEQUENCE;
+        return c1;
+    }else if(c32!=c2 || (c32==0x0075 && c2==0x0075 && c1==0x005C) /* for \u0075 c2=0x0075 and c32==0x0075*/){
+        /* Update the current buffer position */
+        buf->currentPos += offset;
+    }else{
+        /* unescaping failed so we just return
+         * c1 and not consume the buffer
+         * this is useful for rules with escapes
+         * in resource bundles
+         * eg: \' \\ \"
+         */
+        return c1;
+    }
+
+    return c32;
+}
+
+/*
+ * Open a file for reading as UTF-16 text.
+ * fileName "-" reads from the standard input stream.
+ * cp is in/out: if *cp is nullptr or empty, the charset is autodetected from a
+ * Unicode signature and *cp may be updated by ucbuf_autodetect_fs().
+ * Returns nullptr and sets *error on failure; U_FILE_ACCESS_ERROR if the
+ * file cannot be opened. The result must be released with ucbuf_close().
+ */
+U_CAPI UCHARBUF* U_EXPORT2
+ucbuf_open(const char* fileName,const char** cp,UBool showWarning, UBool buffered, UErrorCode* error){
+
+    FileStream* in = nullptr;
+    int32_t fileSize=0;
+    const char* knownCp;
+    if(error==nullptr || U_FAILURE(*error)){
+        return nullptr;
+    }
+    if(cp==nullptr || fileName==nullptr){
+        *error = U_ILLEGAL_ARGUMENT_ERROR;
+        return nullptr;
+    }
+    if (!uprv_strcmp(fileName, "-")) {
+        in = T_FileStream_stdin();
+    }else{
+        in = T_FileStream_open(fileName, "rb");
+    }
+
+    if(in!=nullptr){
+        UCHARBUF* buf =(UCHARBUF*) uprv_malloc(sizeof(UCHARBUF));
+        fileSize = T_FileStream_size(in);
+        if(buf == nullptr){
+            *error = U_MEMORY_ALLOCATION_ERROR;
+            T_FileStream_close(in);
+            return nullptr;
+        }
+        buf->in=in;
+        buf->conv=nullptr;
+        buf->showWarning = showWarning;
+        buf->isBuffered = buffered;
+        buf->signatureLength=0;
+        if(*cp==nullptr || **cp=='\0'){
+            /* don't have code page name... try to autodetect */
+            ucbuf_autodetect_fs(in,cp,&buf->conv,&buf->signatureLength,error);
+        }else if(ucbuf_isCPKnown(*cp)){
+            /* discard BOM */
+            ucbuf_autodetect_fs(in,&knownCp,&buf->conv,&buf->signatureLength,error);
+        }
+        /* no signature found (or detection was skipped): open the requested converter */
+        if(U_SUCCESS(*error) && buf->conv==nullptr) {
+            buf->conv=ucnv_open(*cp,error);
+        }
+        /* buf->buffer has not been allocated yet, so buf is freed directly
+         * instead of going through ucbuf_close() */
+        if(U_FAILURE(*error)){
+            ucnv_close(buf->conv);
+            uprv_free(buf);
+            T_FileStream_close(in);
+            return nullptr;
+        }
+
+        if((buf->conv==nullptr) && (buf->showWarning==true)){
+            fprintf(stderr,"###WARNING: No converter defined. Using codepage of system.\n");
+        }
+        buf->remaining=fileSize-buf->signatureLength;
+        if(buf->isBuffered){
+            buf->bufCapacity=MAX_U_BUF;
+        }else{
+            buf->bufCapacity=buf->remaining+buf->signatureLength+1/*for terminating nul*/;
+        }
+        buf->buffer=(char16_t*) uprv_malloc(U_SIZEOF_UCHAR * buf->bufCapacity );
+        if (buf->buffer == nullptr) {
+            *error = U_MEMORY_ALLOCATION_ERROR;
+            ucbuf_close(buf);
+            return nullptr;
+        }
+        buf->currentPos=buf->buffer;
+        buf->bufLimit=buf->buffer;
+        /* NOTE(review): *error is not modified between the failure check above and
+         * this one (the allocation failure path returns), so this branch looks
+         * unreachable - confirm before relying on the message below */
+        if(U_FAILURE(*error)){
+            fprintf(stderr, "Could not open codepage [%s]: %s\n", *cp, u_errorName(*error));
+            ucbuf_close(buf);
+            return nullptr;
+        }
+        /* read and convert the first chunk (or the whole file when not buffered) */
+        ucbuf_fillucbuf(buf,error);
+        if(U_FAILURE(*error)){
+            ucbuf_close(buf);
+            return nullptr;
+        }
+        return buf;
+    }
+    *error =U_FILE_ACCESS_ERROR;
+    return nullptr;
+}
+
+
+
+/* TODO: this method will fail if at the
+ * beginning of buffer and the uchar to unget
+ * is from the previous buffer. Need to implement
+ * system to take care of that situation.
+ */
+/* Steps back by one code unit, but only if c equals the unit before currentPos;
+ * otherwise nothing happens and no failure is reported. */
+U_CAPI void U_EXPORT2
+ucbuf_ungetc(int32_t c,UCHARBUF* buf){
+    /* decrement currentPos pointer
+     * if not at the beginning of buffer
+     */
+    if(buf->currentPos!=buf->buffer){
+        if(*(buf->currentPos-1)==c){
+            buf->currentPos--;
+        } else {
+            /* ungetc failed - did not match. */
+        }
+    } else {
+        /* ungetc failed - beginning of buffer. */
+    }
+}
+
+/* frees the resources of char16_t* buffer */
+static void
+ucbuf_closebuf(UCHARBUF* buf){
+    uprv_free(buf->buffer);
+    buf->buffer = nullptr;
+}
+
+/* close the buf and release resources*/
+/* Closes the converter and the file stream, then frees the text buffer and buf itself.
+ * nullptr is ignored. */
+U_CAPI void U_EXPORT2
+ucbuf_close(UCHARBUF* buf){
+    if(buf!=nullptr){
+        if(buf->conv){
+            ucnv_close(buf->conv);
+        }
+        T_FileStream_close(buf->in);
+        ucbuf_closebuf(buf);
+        uprv_free(buf);
+    }
+}
+
+/* rewind the buf and file stream */
+/* The buffer is emptied, the converter is reset and, if the file starts with a
+ * Unicode signature, the signature is read and converted again so that the
+ * stream is positioned behind it. */
+U_CAPI void U_EXPORT2
+ucbuf_rewind(UCHARBUF* buf,UErrorCode* error){
+    if(error==nullptr || U_FAILURE(*error)){
+        return;
+    }
+    if(buf){
+        buf->currentPos=buf->buffer;
+        buf->bufLimit=buf->buffer;
+        T_FileStream_rewind(buf->in);
+        buf->remaining=T_FileStream_size(buf->in)-buf->signatureLength;
+
+        ucnv_resetToUnicode(buf->conv);
+        if(buf->signatureLength>0) {
+            char16_t target[1]={ 0 };
+            char16_t* pTarget;
+            char start[8];
+            const char* pStart;
+            int32_t numRead;
+
+            /* read the signature bytes */
+            numRead=T_FileStream_read(buf->in, start, buf->signatureLength);
+
+            /* convert and ignore initial U+FEFF, and the buffer overflow */
+            pTarget = target;
+            pStart = start;
+            ucnv_toUnicode(buf->conv, &pTarget, target+1, &pStart, start+numRead, nullptr, false, error);
+            if(*error==U_BUFFER_OVERFLOW_ERROR) {
+                *error=U_ZERO_ERROR;
+            }
+
+            /* verify that we successfully read exactly U+FEFF */
+            if(U_SUCCESS(*error) && (numRead!=buf->signatureLength || pTarget!=(target+1) || target[0]!=0xfeff)) {
+                *error=U_INTERNAL_PROGRAM_ERROR;
+            }
+        }
+    }
+}
+
+
+/* In buffered mode the result is computed from the file size: the byte count
+ * without the signature divided by the converter's minimum character size.
+ * Otherwise it is the number of code units currently in the buffer.
+ * Returns 0 for a nullptr buf. */
+U_CAPI int32_t U_EXPORT2
+ucbuf_size(UCHARBUF* buf){
+    if(buf){
+        if(buf->isBuffered){
+            return (T_FileStream_size(buf->in)-buf->signatureLength)/ucnv_getMinCharSize(buf->conv);
+        }else{
+            return (int32_t)(buf->bufLimit - buf->buffer);
+        }
+    }
+    return 0;
+}
+
+U_CAPI const char16_t* U_EXPORT2
+ucbuf_getBuffer(UCHARBUF* buf,int32_t* len,UErrorCode* error){
+    if(error==nullptr || U_FAILURE(*error)){
+        return nullptr;
+    }
+    if(buf==nullptr || len==nullptr){
+        *error = U_ILLEGAL_ARGUMENT_ERROR;
+
return nullptr; + } + *len = (int32_t)(buf->bufLimit - buf->buffer); + return buf->buffer; +} + +U_CAPI const char* U_EXPORT2 +ucbuf_resolveFileName(const char* inputDir, const char* fileName, char* target, int32_t* len, UErrorCode* status){ + int32_t requiredLen = 0; + int32_t dirlen = 0; + int32_t filelen = 0; + if(status==nullptr || U_FAILURE(*status)){ + return nullptr; + } + + if(inputDir == nullptr || fileName == nullptr || len==nullptr || (target==nullptr && *len>0)){ + *status = U_ILLEGAL_ARGUMENT_ERROR; + return nullptr; + } + + + dirlen = (int32_t)uprv_strlen(inputDir); + filelen = (int32_t)uprv_strlen(fileName); + if(inputDir[dirlen-1] != U_FILE_SEP_CHAR) { + requiredLen = dirlen + filelen + 2; + if((*len < requiredLen) || target==nullptr){ + *len = requiredLen; + *status = U_BUFFER_OVERFLOW_ERROR; + return nullptr; + } + + target[0] = '\0'; + /* + * append the input dir to openFileName if the first char in + * filename is not file separation char and the last char input directory is not '.'. + * This is to support : + * genrb -s. /home/icu/data + * genrb -s. icu/data + * The user cannot mix notations like + * genrb -s. /icu/data --- the absolute path specified. -s redundant + * user should use + * genrb -s. 
icu/data --- start from CWD and look in icu/data dir + */ + if( (fileName[0] != U_FILE_SEP_CHAR) && (inputDir[dirlen-1] !='.')){ + uprv_strcpy(target, inputDir); + target[dirlen] = U_FILE_SEP_CHAR; + } + target[dirlen + 1] = '\0'; + } else { + requiredLen = dirlen + filelen + 1; + if((*len < requiredLen) || target==nullptr){ + *len = requiredLen; + *status = U_BUFFER_OVERFLOW_ERROR; + return nullptr; + } + + uprv_strcpy(target, inputDir); + } + + uprv_strcat(target, fileName); + return target; +} +/* + * Unicode TR 13 says any of the below chars is + * a new line char in a readline function in addition + * to CR+LF combination which needs to be + * handled separately + */ +static UBool ucbuf_isCharNewLine(char16_t c){ + switch(c){ + case 0x000A: /* LF */ + case 0x000D: /* CR */ + case 0x000C: /* FF */ + case 0x0085: /* NEL */ + case 0x2028: /* LS */ + case 0x2029: /* PS */ + return true; + default: + return false; + } +} + +U_CAPI const char16_t* U_EXPORT2 +ucbuf_readline(UCHARBUF* buf,int32_t* len,UErrorCode* err){ + char16_t* temp = buf->currentPos; + char16_t* savePos =nullptr; + char16_t c=0x0000; + if(buf->isBuffered){ + /* The input is buffered we have to do more + * for returning a pointer U_TRUNCATED_CHAR_FOUND + */ + for(;;){ + c = *temp++; + if(buf->remaining==0){ + return nullptr; /* end of file is reached return nullptr */ + } + if(temp>=buf->bufLimit && buf->currentPos == buf->buffer){ + *err= U_TRUNCATED_CHAR_FOUND; + return nullptr; + }else{ + ucbuf_fillucbuf(buf,err); + if(U_FAILURE(*err)){ + return nullptr; + } + } + /* + * According to TR 13 readLine functions must interpret + * CR, CR+LF, LF, NEL, PS, LS or FF as line seperators + */ + /* Windows CR LF */ + if(c ==0x0d && temp <= buf->bufLimit && *temp == 0x0a ){ + *len = (int32_t)(temp++ - buf->currentPos); + savePos = buf->currentPos; + buf->currentPos = temp; + return savePos; + } + /* else */ + + if (temp>=buf->bufLimit|| ucbuf_isCharNewLine(c)){ /* Unipad inserts 2028 line separators! 
*/ + *len = (int32_t)(temp - buf->currentPos); + savePos = buf->currentPos; + buf->currentPos = temp; + return savePos; + } + } + }else{ + /* we know that all input is read into the internal + * buffer so we can safely return pointers + */ + for(;;){ + c = *temp++; + + if(buf->currentPos==buf->bufLimit){ + return nullptr; /* end of file is reached return nullptr */ + } + /* Windows CR LF */ + if(c ==0x0d && temp <= buf->bufLimit && *temp == 0x0a ){ + *len = (int32_t)(temp++ - buf->currentPos); + savePos = buf->currentPos; + buf->currentPos = temp; + return savePos; + } + /* else */ + if (temp>=buf->bufLimit|| ucbuf_isCharNewLine(c)) { /* Unipad inserts 2028 line separators! */ + *len = (int32_t)(temp - buf->currentPos); + savePos = buf->currentPos; + buf->currentPos = temp; + return savePos; + } + } + } + /* not reached */ + /* A compiler warning will appear if all paths don't contain a return statement. */ +/* return nullptr;*/ +} +#endif diff --git a/intl/icu/source/tools/toolutil/ucbuf.h b/intl/icu/source/tools/toolutil/ucbuf.h new file mode 100644 index 0000000000..117920b794 --- /dev/null +++ b/intl/icu/source/tools/toolutil/ucbuf.h @@ -0,0 +1,218 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 1998-2016, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* +* File ucbuf.h +* +* Modification History: +* +* Date Name Description +* 05/10/01 Ram Creation. 
+* +* This API reads in files and returns UChars +******************************************************************************* +*/ + +#include "unicode/localpointer.h" +#include "unicode/ucnv.h" +#include "filestrm.h" + +#if !UCONFIG_NO_CONVERSION + +#ifndef UCBUF_H +#define UCBUF_H 1 + +typedef struct UCHARBUF UCHARBUF; +/** + * End of file value + */ +#define U_EOF ((int32_t)0xFFFFFFFF) +/** + * Error value if a sequence cannot be unescaped + */ +#define U_ERR ((int32_t)0xFFFFFFFE) + +typedef struct ULine ULine; + +struct ULine { + UChar *name; + int32_t len; +}; + +/** + * Opens the UCHARBUF with the given file stream and code page for conversion + * @param fileName Name of the file to open. + * @param codepage The encoding of the file stream to convert to Unicode. + * If *codepage is NULL on input the API will try to autodetect + * popular Unicode encodings + * @param showWarning Flag to print out warnings to STDERR + * @param buffered If true performs a buffered read of the input file. If false reads + * the whole file into memory and converts it. + * @param err is a pointer to a valid <code>UErrorCode</code> value. If this value + * indicates a failure on entry, the function will immediately return. + * On exit the value will indicate the success of the operation. + * @return pointer to the newly opened UCHARBUF + */ +U_CAPI UCHARBUF* U_EXPORT2 +ucbuf_open(const char* fileName,const char** codepage,UBool showWarning, UBool buffered, UErrorCode* err); + +/** + * Gets a UTF-16 code unit at the current position from the converted buffer + * and increments the current position + * @param buf Pointer to UCHARBUF structure + * @param err is a pointer to a valid <code>UErrorCode</code> value. If this value + * indicates a failure on entry, the function will immediately return. + * On exit the value will indicate the success of the operation.
+ */ +U_CAPI int32_t U_EXPORT2 +ucbuf_getc(UCHARBUF* buf,UErrorCode* err); + +/** + * Gets a UTF-32 code point at the current position from the converted buffer + * and increments the current position + * @param buf Pointer to UCHARBUF structure + * @param err is a pointer to a valid <code>UErrorCode</code> value. If this value + * indicates a failure on entry, the function will immediately return. + * On exit the value will indicate the success of the operation. + */ +U_CAPI int32_t U_EXPORT2 +ucbuf_getc32(UCHARBUF* buf,UErrorCode* err); + +/** + * Gets a UTF-16 code unit at the current position from the converted buffer after + * unescaping and increments the current position. If the escape sequence is for UTF-32 + * code point (\\Uxxxxxxxx) then a UTF-32 codepoint is returned + * @param buf Pointer to UCHARBUF structure + * @param err is a pointer to a valid <code>UErrorCode</code> value. If this value + * indicates a failure on entry, the function will immediately return. + * On exit the value will indicate the success of the operation. + */ +U_CAPI int32_t U_EXPORT2 +ucbuf_getcx32(UCHARBUF* buf,UErrorCode* err); + +/** + * Gets a pointer to the current position in the internal buffer and length of the line. + * It is imperative to make a copy of the returned buffer before performing operations on it. + * @param buf Pointer to UCHARBUF structure + * @param len Output param to receive the len of the buffer returned till end of the line + * @param err is a pointer to a valid <code>UErrorCode</code> value. If this value + * indicates a failure on entry, the function will immediately return. + * On exit the value will indicate the success of the operation. + * Error: U_TRUNCATED_CHAR_FOUND + * @return Pointer to the internal buffer, NULL if EOF + */ +U_CAPI const UChar* U_EXPORT2 +ucbuf_readline(UCHARBUF* buf,int32_t* len, UErrorCode* err); + + +/** + * Resets the buffers and the underlying file stream.
+ * @param buf Pointer to UCHARBUF structure + * @param err is a pointer to a valid <code>UErrorCode</code> value. If this value + * indicates a failure on entry, the function will immediately return. + * On exit the value will indicate the success of the operation. + */ +U_CAPI void U_EXPORT2 +ucbuf_rewind(UCHARBUF* buf,UErrorCode* err); + +/** + * Returns a pointer to the internal converted buffer + * @param buf Pointer to UCHARBUF structure + * @param len Pointer to int32_t to receive the length of buffer + * @param err is a pointer to a valid <code>UErrorCode</code> value. If this value + * indicates a failure on entry, the function will immediately return. + * On exit the value will indicate the success of the operation. + * @return Pointer to internal UChar buffer + */ +U_CAPI const UChar* U_EXPORT2 +ucbuf_getBuffer(UCHARBUF* buf,int32_t* len,UErrorCode* err); + +/** + * Closes the UCHARBUF structure members and cleans up the malloc'ed memory + * @param buf Pointer to UCHARBUF structure + */ +U_CAPI void U_EXPORT2 +ucbuf_close(UCHARBUF* buf); + +#if U_SHOW_CPLUSPLUS_API + +U_NAMESPACE_BEGIN + +/** + * \class LocalUCHARBUFPointer + * "Smart pointer" class, closes a UCHARBUF via ucbuf_close(). + * For most methods see the LocalPointerBase base class. + * + * @see LocalPointerBase + * @see LocalPointer + */ +U_DEFINE_LOCAL_OPEN_POINTER(LocalUCHARBUFPointer, UCHARBUF, ucbuf_close); + +U_NAMESPACE_END + +#endif + +/** + * Rewinds the buffer by one codepoint. Does not rewind over escaped characters. + */ +U_CAPI void U_EXPORT2 +ucbuf_ungetc(int32_t ungetChar,UCHARBUF* buf); + + +/** + * Autodetects the encoding of the file stream. Only Unicode charsets are autodetected. + * Some Unicode charsets are stateful and need byte identifiers to be converted also to bring + * the converter to correct state for converting the rest of the stream. So the UConverter parameter + * is necessary.
+ * If the charset was autodetected, the caller must close both the input FileStream + * and the converter. + * + * @param fileName The file name to be opened and encoding autodetected + * @param conv Output param to receive the opened converter if autodetected; NULL otherwise. + * @param cp Output param to receive the detected encoding + * @param err is a pointer to a valid <code>UErrorCode</code> value. If this value + * indicates a failure on entry, the function will immediately return. + * On exit the value will indicate the success of the operation. + * @return The input FileStream if its charset was autodetected; NULL otherwise. + */ +U_CAPI FileStream * U_EXPORT2 +ucbuf_autodetect(const char* fileName, const char** cp,UConverter** conv, +int32_t* signatureLength, UErrorCode* status); + +/** + * Autodetects the encoding of the file stream. Only Unicode charsets are autodetected. + * Some Unicode charsets are stateful and need byte identifiers to be converted also to bring + * the converter to correct state for converting the rest of the stream. So the UConverter parameter + * is necessary. + * If the charset was autodetected, the caller must close the converter. + * + * @param fileStream The file stream whose encoding is to be detected + * @param conv Output param to receive the opened converter if autodetected; NULL otherwise. + * @param cp Output param to receive the detected encoding + * @param err is a pointer to a valid <code>UErrorCode</code> value. If this value + * indicates a failure on entry, the function will immediately return. + * On exit the value will indicate the success of the operation. + * @return Boolean whether the Unicode charset was autodetected.
+ */ + +U_CAPI UBool U_EXPORT2 +ucbuf_autodetect_fs(FileStream* in, const char** cp, UConverter** conv, int32_t* signatureLength, UErrorCode* status); + +/** + * Returns the approximate size in UChars required for converting the file to UChars + */ +U_CAPI int32_t U_EXPORT2 +ucbuf_size(UCHARBUF* buf); + +U_CAPI const char* U_EXPORT2 +ucbuf_resolveFileName(const char* inputDir, const char* fileName, char* target, int32_t* len, UErrorCode* status); + +#endif +#endif + diff --git a/intl/icu/source/tools/toolutil/ucln_tu.cpp b/intl/icu/source/tools/toolutil/ucln_tu.cpp new file mode 100644 index 0000000000..4727227ebf --- /dev/null +++ b/intl/icu/source/tools/toolutil/ucln_tu.cpp @@ -0,0 +1,19 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/******************************************************************** + * COPYRIGHT: + * Copyright (c) 2007-2014, International Business Machines Corporation and + * others. All Rights Reserved. + ********************************************************************/ + + +/** Auto-client **/ +#define UCLN_TYPE UCLN_TOOLUTIL +#include "ucln_imp.h" + +int uprv_dummyFunction_TU(); +int uprv_dummyFunction_TU() +{ + /* this is here to prevent the compiler from complaining about an empty file */ + return 0; +} diff --git a/intl/icu/source/tools/toolutil/ucm.cpp b/intl/icu/source/tools/toolutil/ucm.cpp new file mode 100644 index 0000000000..272570e72f --- /dev/null +++ b/intl/icu/source/tools/toolutil/ucm.cpp @@ -0,0 +1,1195 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2003-2013, International Business Machines +* Corporation and others. All Rights Reserved. 
+* +******************************************************************************* +* file name: ucm.c +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2003jun20 +* created by: Markus W. Scherer +* +* This file reads a .ucm file, stores its mappings and sorts them. +* It implements handling of Unicode conversion mappings from .ucm files +* for makeconv, canonucm, rptp2ucm, etc. +* +* Unicode code point sequences with a length of more than 1, +* as well as byte sequences with more than 4 bytes or more than one complete +* character sequence are handled to support m:n mappings. +*/ + +#include "unicode/utypes.h" +#include "unicode/ustring.h" +#include "cstring.h" +#include "cmemory.h" +#include "filestrm.h" +#include "uarrsort.h" +#include "ucnvmbcs.h" +#include "ucnv_bld.h" +#include "ucnv_ext.h" +#include "uparse.h" +#include "ucm.h" +#include <stdio.h> + +#if !UCONFIG_NO_CONVERSION + +/* -------------------------------------------------------------------------- */ + +static void +printMapping(UCMapping *m, UChar32 *codePoints, uint8_t *bytes, FILE *f) { + int32_t j; + + for(j=0; j<m->uLen; ++j) { + fprintf(f, "<U%04lX>", (long)codePoints[j]); + } + + fputc(' ', f); + + for(j=0; j<m->bLen; ++j) { + fprintf(f, "\\x%02X", bytes[j]); + } + + if(m->f>=0) { + fprintf(f, " |%u\n", m->f); + } else { + fputs("\n", f); + } +} + +U_CAPI void U_EXPORT2 +ucm_printMapping(UCMTable *table, UCMapping *m, FILE *f) { + printMapping(m, UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m), f); +} + +U_CAPI void U_EXPORT2 +ucm_printTable(UCMTable *table, FILE *f, UBool byUnicode) { + UCMapping *m; + int32_t i, length; + + m=table->mappings; + length=table->mappingsLength; + if(byUnicode) { + for(i=0; i<length; ++m, ++i) { + ucm_printMapping(table, m, f); + } + } else { + const int32_t *map=table->reverseMap; + for(i=0; i<length; ++i) { + ucm_printMapping(table, m+map[i], f); + } + } +} + +/* mapping comparisons 
------------------------------------------------------ */ + +static int32_t +compareUnicode(UCMTable *lTable, const UCMapping *l, + UCMTable *rTable, const UCMapping *r) { + const UChar32 *lu, *ru; + int32_t result, i, length; + + if(l->uLen==1 && r->uLen==1) { + /* compare two single code points */ + return l->u-r->u; + } + + /* get pointers to the code point sequences */ + lu=UCM_GET_CODE_POINTS(lTable, l); + ru=UCM_GET_CODE_POINTS(rTable, r); + + /* get the minimum length */ + if(l->uLen<=r->uLen) { + length=l->uLen; + } else { + length=r->uLen; + } + + /* compare the code points */ + for(i=0; i<length; ++i) { + result=lu[i]-ru[i]; + if(result!=0) { + return result; + } + } + + /* compare the lengths */ + return l->uLen-r->uLen; +} + +static int32_t +compareBytes(UCMTable *lTable, const UCMapping *l, + UCMTable *rTable, const UCMapping *r, + UBool lexical) { + const uint8_t *lb, *rb; + int32_t result, i, length; + + /* + * A lexical comparison is used for sorting in the builder, to allow + * an efficient search for a byte sequence that could be a prefix + * of a previously entered byte sequence. + * + * Comparing by lengths first is for compatibility with old .ucm tools + * like canonucm and rptp2ucm. 
+ */ + if(lexical) { + /* get the minimum length and continue */ + if(l->bLen<=r->bLen) { + length=l->bLen; + } else { + length=r->bLen; + } + } else { + /* compare lengths first */ + result=l->bLen-r->bLen; + if(result!=0) { + return result; + } else { + length=l->bLen; + } + } + + /* get pointers to the byte sequences */ + lb=UCM_GET_BYTES(lTable, l); + rb=UCM_GET_BYTES(rTable, r); + + /* compare the bytes */ + for(i=0; i<length; ++i) { + result=lb[i]-rb[i]; + if(result!=0) { + return result; + } + } + + /* compare the lengths */ + return l->bLen-r->bLen; +} + +/* compare UCMappings for sorting */ +static int32_t +compareMappings(UCMTable *lTable, const UCMapping *l, + UCMTable *rTable, const UCMapping *r, + UBool uFirst) { + int32_t result; + + /* choose which side to compare first */ + if(uFirst) { + /* Unicode then bytes */ + result=compareUnicode(lTable, l, rTable, r); + if(result==0) { + result=compareBytes(lTable, l, rTable, r, false); /* not lexically, like canonucm */ + } + } else { + /* bytes then Unicode */ + result=compareBytes(lTable, l, rTable, r, true); /* lexically, for builder */ + if(result==0) { + result=compareUnicode(lTable, l, rTable, r); + } + } + + if(result!=0) { + return result; + } + + /* compare the flags */ + return l->f-r->f; +} +U_CDECL_BEGIN +/* sorting by Unicode first sorts mappings directly */ +static int32_t U_CALLCONV +compareMappingsUnicodeFirst(const void *context, const void *left, const void *right) { + return compareMappings( + (UCMTable *)context, (const UCMapping *)left, + (UCMTable *)context, (const UCMapping *)right, true); +} + +/* sorting by bytes first sorts the reverseMap; use indirection to mappings */ +static int32_t U_CALLCONV +compareMappingsBytesFirst(const void *context, const void *left, const void *right) { + UCMTable *table=(UCMTable *)context; + int32_t l=*(const int32_t *)left, r=*(const int32_t *)right; + return compareMappings( + table, table->mappings+l, + table, table->mappings+r, false); +} 
+U_CDECL_END + +U_CAPI void U_EXPORT2 +ucm_sortTable(UCMTable *t) { + UErrorCode errorCode; + int32_t i; + + if(t->isSorted) { + return; + } + + errorCode=U_ZERO_ERROR; + + /* 1. sort by Unicode first */ + uprv_sortArray(t->mappings, t->mappingsLength, sizeof(UCMapping), + compareMappingsUnicodeFirst, t, + false, &errorCode); + + /* build the reverseMap */ + if(t->reverseMap==nullptr) { + /* + * allocate mappingsCapacity instead of mappingsLength so that + * if mappings are added, the reverseMap need not be + * reallocated each time + * (see ucm_moveMappings() and ucm_addMapping()) + */ + t->reverseMap=(int32_t *)uprv_malloc(t->mappingsCapacity*sizeof(int32_t)); + if(t->reverseMap==nullptr) { + fprintf(stderr, "ucm error: unable to allocate reverseMap\n"); + exit(U_MEMORY_ALLOCATION_ERROR); + } + } + for(i=0; i<t->mappingsLength; ++i) { + t->reverseMap[i]=i; + } + + /* 2. sort reverseMap by mappings bytes first */ + uprv_sortArray(t->reverseMap, t->mappingsLength, sizeof(int32_t), + compareMappingsBytesFirst, t, + false, &errorCode); + + if(U_FAILURE(errorCode)) { + fprintf(stderr, "ucm error: sortTable()/uprv_sortArray() fails - %s\n", + u_errorName(errorCode)); + exit(errorCode); + } + + t->isSorted=true; +} + +/* + * remove mappings with their move flag set from the base table + * and move some of them (with UCM_MOVE_TO_EXT) to the extension table + */ +U_CAPI void U_EXPORT2 +ucm_moveMappings(UCMTable *base, UCMTable *ext) { + UCMapping *mb, *mbLimit; + int8_t flag; + + mb=base->mappings; + mbLimit=mb+base->mappingsLength; + + while(mb<mbLimit) { + flag=mb->moveFlag; + if(flag!=0) { + /* reset the move flag */ + mb->moveFlag=0; + + if(ext!=nullptr && (flag&UCM_MOVE_TO_EXT)) { + /* add the mapping to the extension table */ + ucm_addMapping(ext, mb, UCM_GET_CODE_POINTS(base, mb), UCM_GET_BYTES(base, mb)); + } + + /* remove this mapping: move the last base mapping down and overwrite the current one */ + if(mb<(mbLimit-1)) { + uprv_memcpy(mb, mbLimit-1, 
sizeof(UCMapping)); + } + --mbLimit; + --base->mappingsLength; + base->isSorted=false; + } else { + ++mb; + } + } +} + +enum { + NEEDS_MOVE=1, + HAS_ERRORS=2 +}; + +static uint8_t +checkBaseExtUnicode(UCMStates *baseStates, UCMTable *base, UCMTable *ext, + UBool moveToExt, UBool intersectBase) { + (void)baseStates; + + UCMapping *mb, *me, *mbLimit, *meLimit; + int32_t cmp; + uint8_t result; + + mb=base->mappings; + mbLimit=mb+base->mappingsLength; + + me=ext->mappings; + meLimit=me+ext->mappingsLength; + + result=0; + + for(;;) { + /* skip irrelevant mappings on both sides */ + for(;;) { + if(mb==mbLimit) { + return result; + } + + if((0<=mb->f && mb->f<=2) || mb->f==4) { + break; + } + + ++mb; + } + + for(;;) { + if(me==meLimit) { + return result; + } + + if((0<=me->f && me->f<=2) || me->f==4) { + break; + } + + ++me; + } + + /* compare the base and extension mappings */ + cmp=compareUnicode(base, mb, ext, me); + if(cmp<0) { + if(intersectBase && (intersectBase!=2 || mb->bLen>1)) { + /* + * mapping in base but not in ext, move it + * + * if ext is DBCS, move DBCS mappings here + * and check SBCS ones for Unicode prefix below + */ + mb->moveFlag|=UCM_MOVE_TO_EXT; + result|=NEEDS_MOVE; + + /* does mb map from an input sequence that is a prefix of me's? 
*/ + } else if( mb->uLen<me->uLen && + 0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen) + ) { + if(moveToExt) { + /* mark this mapping to be moved to the extension table */ + mb->moveFlag|=UCM_MOVE_TO_EXT; + result|=NEEDS_MOVE; + } else { + fprintf(stderr, + "ucm error: the base table contains a mapping whose input sequence\n" + " is a prefix of the input sequence of an extension mapping\n"); + ucm_printMapping(base, mb, stderr); + ucm_printMapping(ext, me, stderr); + result|=HAS_ERRORS; + } + } + + ++mb; + } else if(cmp==0) { + /* + * same output: remove the extension mapping, + * otherwise treat as an error + */ + if( mb->f==me->f && mb->bLen==me->bLen && + 0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen) + ) { + me->moveFlag|=UCM_REMOVE_MAPPING; + result|=NEEDS_MOVE; + } else if(intersectBase) { + /* mapping in base but not in ext, move it */ + mb->moveFlag|=UCM_MOVE_TO_EXT; + result|=NEEDS_MOVE; + } else { + fprintf(stderr, + "ucm error: the base table contains a mapping whose input sequence\n" + " is the same as the input sequence of an extension mapping\n" + " but it maps differently\n"); + ucm_printMapping(base, mb, stderr); + ucm_printMapping(ext, me, stderr); + result|=HAS_ERRORS; + } + + ++mb; + } else /* cmp>0 */ { + ++me; + } + } +} + +static uint8_t +checkBaseExtBytes(UCMStates *baseStates, UCMTable *base, UCMTable *ext, + UBool moveToExt, UBool intersectBase) { + UCMapping *mb, *me; + int32_t *baseMap, *extMap; + int32_t b, e, bLimit, eLimit, cmp; + uint8_t result; + UBool isSISO; + + baseMap=base->reverseMap; + extMap=ext->reverseMap; + + b=e=0; + bLimit=base->mappingsLength; + eLimit=ext->mappingsLength; + + result=0; + + isSISO=(UBool)(baseStates->outputType==MBCS_OUTPUT_2_SISO); + + for(;;) { + /* skip irrelevant mappings on both sides */ + for(;; ++b) { + if(b==bLimit) { + return result; + } + mb=base->mappings+baseMap[b]; + + if(intersectBase==2 && mb->bLen==1) { + /* + * comparing 
a base against a DBCS extension: + * leave SBCS base mappings alone + */ + continue; + } + + if(mb->f==0 || mb->f==3) { + break; + } + } + + for(;;) { + if(e==eLimit) { + return result; + } + me=ext->mappings+extMap[e]; + + if(me->f==0 || me->f==3) { + break; + } + + ++e; + } + + /* compare the base and extension mappings */ + cmp=compareBytes(base, mb, ext, me, true); + if(cmp<0) { + if(intersectBase) { + /* mapping in base but not in ext, move it */ + mb->moveFlag|=UCM_MOVE_TO_EXT; + result|=NEEDS_MOVE; + + /* + * does mb map from an input sequence that is a prefix of me's? + * for SI/SO tables, a single byte is never a prefix because it + * occurs in a separate single-byte state + */ + } else if( mb->bLen<me->bLen && + (!isSISO || mb->bLen>1) && + 0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen) + ) { + if(moveToExt) { + /* mark this mapping to be moved to the extension table */ + mb->moveFlag|=UCM_MOVE_TO_EXT; + result|=NEEDS_MOVE; + } else { + fprintf(stderr, + "ucm error: the base table contains a mapping whose input sequence\n" + " is a prefix of the input sequence of an extension mapping\n"); + ucm_printMapping(base, mb, stderr); + ucm_printMapping(ext, me, stderr); + result|=HAS_ERRORS; + } + } + + ++b; + } else if(cmp==0) { + /* + * same output: remove the extension mapping, + * otherwise treat as an error + */ + if( mb->f==me->f && mb->uLen==me->uLen && + 0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen) + ) { + me->moveFlag|=UCM_REMOVE_MAPPING; + result|=NEEDS_MOVE; + } else if(intersectBase) { + /* mapping in base but not in ext, move it */ + mb->moveFlag|=UCM_MOVE_TO_EXT; + result|=NEEDS_MOVE; + } else { + fprintf(stderr, + "ucm error: the base table contains a mapping whose input sequence\n" + " is the same as the input sequence of an extension mapping\n" + " but it maps differently\n"); + ucm_printMapping(base, mb, stderr); + ucm_printMapping(ext, me, stderr); + result|=HAS_ERRORS; + } + 
+ ++b; + } else /* cmp>0 */ { + ++e; + } + } +} + +U_CAPI UBool U_EXPORT2 +ucm_checkValidity(UCMTable *table, UCMStates *baseStates) { + UCMapping *m, *mLimit; + int32_t count; + UBool isOK; + + m=table->mappings; + mLimit=m+table->mappingsLength; + isOK=true; + + while(m<mLimit) { + count=ucm_countChars(baseStates, UCM_GET_BYTES(table, m), m->bLen); + if(count<1) { + ucm_printMapping(table, m, stderr); + isOK=false; + } + ++m; + } + + return isOK; +} + +U_CAPI UBool U_EXPORT2 +ucm_checkBaseExt(UCMStates *baseStates, + UCMTable *base, UCMTable *ext, UCMTable *moveTarget, + UBool intersectBase) { + uint8_t result; + + /* if we have an extension table, we must always use precision flags */ + if(base->flagsType&UCM_FLAGS_IMPLICIT) { + fprintf(stderr, "ucm error: the base table contains mappings without precision flags\n"); + return false; + } + if(ext->flagsType&UCM_FLAGS_IMPLICIT) { + fprintf(stderr, "ucm error: extension table contains mappings without precision flags\n"); + return false; + } + + /* checking requires both tables to be sorted */ + ucm_sortTable(base); + ucm_sortTable(ext); + + /* check */ + result= + checkBaseExtUnicode(baseStates, base, ext, (UBool)(moveTarget!=nullptr), intersectBase)| + checkBaseExtBytes(baseStates, base, ext, (UBool)(moveTarget!=nullptr), intersectBase); + + if(result&HAS_ERRORS) { + return false; + } + + if(result&NEEDS_MOVE) { + ucm_moveMappings(ext, nullptr); + ucm_moveMappings(base, moveTarget); + ucm_sortTable(base); + ucm_sortTable(ext); + if(moveTarget!=nullptr) { + ucm_sortTable(moveTarget); + } + } + + return true; +} + +/* merge tables for rptp2ucm ------------------------------------------------ */ + +U_CAPI void U_EXPORT2 +ucm_mergeTables(UCMTable *fromUTable, UCMTable *toUTable, + const uint8_t *subchar, int32_t subcharLength, + uint8_t subchar1) { + UCMapping *fromUMapping, *toUMapping; + int32_t fromUIndex, toUIndex, fromUTop, toUTop, cmp; + + ucm_sortTable(fromUTable); + ucm_sortTable(toUTable); + + 
fromUMapping=fromUTable->mappings; + toUMapping=toUTable->mappings; + + fromUTop=fromUTable->mappingsLength; + toUTop=toUTable->mappingsLength; + + fromUIndex=toUIndex=0; + + while(fromUIndex<fromUTop && toUIndex<toUTop) { + cmp=compareMappings(fromUTable, fromUMapping, toUTable, toUMapping, true); + if(cmp==0) { + /* equal: roundtrip, nothing to do (flags are initially 0) */ + ++fromUMapping; + ++toUMapping; + + ++fromUIndex; + ++toUIndex; + } else if(cmp<0) { + /* + * the fromU mapping does not have a toU counterpart: + * fallback Unicode->codepage + */ + if( (fromUMapping->bLen==subcharLength && + 0==uprv_memcmp(UCM_GET_BYTES(fromUTable, fromUMapping), subchar, subcharLength)) || + (subchar1!=0 && fromUMapping->bLen==1 && fromUMapping->b.bytes[0]==subchar1) + ) { + fromUMapping->f=2; /* SUB mapping */ + } else { + fromUMapping->f=1; /* normal fallback */ + } + + ++fromUMapping; + ++fromUIndex; + } else { + /* + * the toU mapping does not have a fromU counterpart: + * (reverse) fallback codepage->Unicode, copy it to the fromU table + */ + + /* ignore reverse fallbacks to Unicode SUB */ + if(!(toUMapping->uLen==1 && (toUMapping->u==0xfffd || toUMapping->u==0x1a))) { + toUMapping->f=3; /* reverse fallback */ + ucm_addMapping(fromUTable, toUMapping, UCM_GET_CODE_POINTS(toUTable, toUMapping), UCM_GET_BYTES(toUTable, toUMapping)); + + /* the table may have been reallocated */ + fromUMapping=fromUTable->mappings+fromUIndex; + } + + ++toUMapping; + ++toUIndex; + } + } + + /* either one or both tables are exhausted */ + while(fromUIndex<fromUTop) { + /* leftover fromU mappings are fallbacks */ + if( (fromUMapping->bLen==subcharLength && + 0==uprv_memcmp(UCM_GET_BYTES(fromUTable, fromUMapping), subchar, subcharLength)) || + (subchar1!=0 && fromUMapping->bLen==1 && fromUMapping->b.bytes[0]==subchar1) + ) { + fromUMapping->f=2; /* SUB mapping */ + } else { + fromUMapping->f=1; /* normal fallback */ + } + + ++fromUMapping; + ++fromUIndex; + } + + while(toUIndex<toUTop) { + 
/* leftover toU mappings are reverse fallbacks */ + + /* ignore reverse fallbacks to Unicode SUB */ + if(!(toUMapping->uLen==1 && (toUMapping->u==0xfffd || toUMapping->u==0x1a))) { + toUMapping->f=3; /* reverse fallback */ + ucm_addMapping(fromUTable, toUMapping, UCM_GET_CODE_POINTS(toUTable, toUMapping), UCM_GET_BYTES(toUTable, toUMapping)); + } + + ++toUMapping; + ++toUIndex; + } + + fromUTable->isSorted=false; +} + +/* separate extension mappings out of base table for rptp2ucm --------------- */ + +U_CAPI UBool U_EXPORT2 +ucm_separateMappings(UCMFile *ucm, UBool isSISO) { + UCMTable *table; + UCMapping *m, *mLimit; + int32_t type; + UBool needsMove, isOK; + + table=ucm->base; + m=table->mappings; + mLimit=m+table->mappingsLength; + + needsMove=false; + isOK=true; + + for(; m<mLimit; ++m) { + if(isSISO && m->bLen==1 && (m->b.bytes[0]==0xe || m->b.bytes[0]==0xf)) { + fprintf(stderr, "warning: removing illegal mapping from an SI/SO-stateful table\n"); + ucm_printMapping(table, m, stderr); + m->moveFlag|=UCM_REMOVE_MAPPING; + needsMove=true; + continue; + } + + type=ucm_mappingType( + &ucm->states, m, + UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m)); + if(type<0) { + /* illegal byte sequence */ + printMapping(m, UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m), stderr); + isOK=false; + } else if(type>0) { + m->moveFlag|=UCM_MOVE_TO_EXT; + needsMove=true; + } + } + + if(!isOK) { + return false; + } + if(needsMove) { + ucm_moveMappings(ucm->base, ucm->ext); + return ucm_checkBaseExt(&ucm->states, ucm->base, ucm->ext, ucm->ext, false); + } else { + ucm_sortTable(ucm->base); + return true; + } +} + +/* ucm parser --------------------------------------------------------------- */ + +U_CAPI int8_t U_EXPORT2 +ucm_parseBytes(uint8_t bytes[UCNV_EXT_MAX_BYTES], const char *line, const char **ps) { + const char *s=*ps; + char *end; + uint8_t byte; + int8_t bLen; + + bLen=0; + for(;;) { + /* skip an optional plus sign */ + if(bLen>0 && *s=='+') { + ++s; + } + 
if(*s!='\\') { + break; + } + + if( s[1]!='x' || + (byte=(uint8_t)uprv_strtoul(s+2, &end, 16), end)!=s+4 + ) { + fprintf(stderr, "ucm error: byte must be formatted as \\xXX (2 hex digits) - \"%s\"\n", line); + return -1; + } + + if(bLen==UCNV_EXT_MAX_BYTES) { + fprintf(stderr, "ucm error: too many bytes on \"%s\"\n", line); + return -1; + } + bytes[bLen++]=byte; + s=end; + } + + *ps=s; + return bLen; +} + +/* parse a mapping line; must not be empty */ +U_CAPI UBool U_EXPORT2 +ucm_parseMappingLine(UCMapping *m, + UChar32 codePoints[UCNV_EXT_MAX_UCHARS], + uint8_t bytes[UCNV_EXT_MAX_BYTES], + const char *line) { + const char *s; + char *end; + UChar32 cp; + int32_t u16Length; + int8_t uLen, bLen, f; + + s=line; + uLen=bLen=0; + + /* parse code points */ + for(;;) { + /* skip an optional plus sign */ + if(uLen>0 && *s=='+') { + ++s; + } + if(*s!='<') { + break; + } + + if( s[1]!='U' || + (cp=(UChar32)uprv_strtoul(s+2, &end, 16), end)==s+2 || + *end!='>' + ) { + fprintf(stderr, "ucm error: Unicode code point must be formatted as <UXXXX> (1..6 hex digits) - \"%s\"\n", line); + return false; + } + if((uint32_t)cp>0x10ffff || U_IS_SURROGATE(cp)) { + fprintf(stderr, "ucm error: Unicode code point must be 0..d7ff or e000..10ffff - \"%s\"\n", line); + return false; + } + + if(uLen==UCNV_EXT_MAX_UCHARS) { + fprintf(stderr, "ucm error: too many code points on \"%s\"\n", line); + return false; + } + codePoints[uLen++]=cp; + s=end+1; + } + + if(uLen==0) { + fprintf(stderr, "ucm error: no Unicode code points on \"%s\"\n", line); + return false; + } else if(uLen==1) { + m->u=codePoints[0]; + } else { + UErrorCode errorCode=U_ZERO_ERROR; + u_strFromUTF32(nullptr, 0, &u16Length, codePoints, uLen, &errorCode); + if( (U_FAILURE(errorCode) && errorCode!=U_BUFFER_OVERFLOW_ERROR) || + u16Length>UCNV_EXT_MAX_UCHARS + ) { + fprintf(stderr, "ucm error: too many UChars on \"%s\"\n", line); + return false; + } + } + + s=u_skipWhitespace(s); + + /* parse bytes */ + bLen=ucm_parseBytes(bytes, 
line, &s); + + if(bLen<0) { + return false; + } else if(bLen==0) { + fprintf(stderr, "ucm error: no bytes on \"%s\"\n", line); + return false; + } else if(bLen<=4) { + uprv_memcpy(m->b.bytes, bytes, bLen); + } + + /* skip everything until the fallback indicator, even the start of a comment */ + for(;;) { + if(*s==0) { + f=-1; /* no fallback indicator */ + break; + } else if(*s=='|') { + f=(int8_t)(s[1]-'0'); + if((uint8_t)f>4) { + fprintf(stderr, "ucm error: fallback indicator must be |0..|4 - \"%s\"\n", line); + return false; + } + break; + } + ++s; + } + + m->uLen=uLen; + m->bLen=bLen; + m->f=f; + return true; +} + +/* general APIs ------------------------------------------------------------- */ + +U_CAPI UCMTable * U_EXPORT2 +ucm_openTable() { + UCMTable *table=(UCMTable *)uprv_malloc(sizeof(UCMTable)); + if(table==nullptr) { + fprintf(stderr, "ucm error: unable to allocate a UCMTable\n"); + exit(U_MEMORY_ALLOCATION_ERROR); + } + + memset(table, 0, sizeof(UCMTable)); + return table; +} + +U_CAPI void U_EXPORT2 +ucm_closeTable(UCMTable *table) { + if(table!=nullptr) { + uprv_free(table->mappings); + uprv_free(table->codePoints); + uprv_free(table->bytes); + uprv_free(table->reverseMap); + uprv_free(table); + } +} + +U_CAPI void U_EXPORT2 +ucm_resetTable(UCMTable *table) { + if(table!=nullptr) { + table->mappingsLength=0; + table->flagsType=0; + table->unicodeMask=0; + table->bytesLength=table->codePointsLength=0; + table->isSorted=false; + } +} + +U_CAPI void U_EXPORT2 +ucm_addMapping(UCMTable *table, + UCMapping *m, + UChar32 codePoints[UCNV_EXT_MAX_UCHARS], + uint8_t bytes[UCNV_EXT_MAX_BYTES]) { + UCMapping *tm; + UChar32 c; + int32_t idx; + + if(table->mappingsLength>=table->mappingsCapacity) { + /* make the mappings array larger */ + if(table->mappingsCapacity==0) { + table->mappingsCapacity=1000; + } else { + table->mappingsCapacity*=10; + } + table->mappings=(UCMapping *)uprv_realloc(table->mappings, + table->mappingsCapacity*sizeof(UCMapping)); + 
if(table->mappings==nullptr) { + fprintf(stderr, "ucm error: unable to allocate %d UCMappings\n", + (int)table->mappingsCapacity); + exit(U_MEMORY_ALLOCATION_ERROR); + } + + if(table->reverseMap!=nullptr) { + /* the reverseMap must be reallocated in a new sort */ + uprv_free(table->reverseMap); + table->reverseMap=nullptr; + } + } + + if(m->uLen>1 && table->codePointsCapacity==0) { + table->codePointsCapacity=10000; + table->codePoints=(UChar32 *)uprv_malloc(table->codePointsCapacity*4); + if(table->codePoints==nullptr) { + fprintf(stderr, "ucm error: unable to allocate %d UChar32s\n", + (int)table->codePointsCapacity); + exit(U_MEMORY_ALLOCATION_ERROR); + } + } + + if(m->bLen>4 && table->bytesCapacity==0) { + table->bytesCapacity=10000; + table->bytes=(uint8_t *)uprv_malloc(table->bytesCapacity); + if(table->bytes==nullptr) { + fprintf(stderr, "ucm error: unable to allocate %d bytes\n", + (int)table->bytesCapacity); + exit(U_MEMORY_ALLOCATION_ERROR); + } + } + + if(m->uLen>1) { + idx=table->codePointsLength; + table->codePointsLength+=m->uLen; + if(table->codePointsLength>table->codePointsCapacity) { + fprintf(stderr, "ucm error: too many code points in multiple-code point mappings\n"); + exit(U_MEMORY_ALLOCATION_ERROR); + } + + uprv_memcpy(table->codePoints+idx, codePoints, (size_t)m->uLen*4); + m->u=idx; + } + + if(m->bLen>4) { + idx=table->bytesLength; + table->bytesLength+=m->bLen; + if(table->bytesLength>table->bytesCapacity) { + fprintf(stderr, "ucm error: too many bytes in mappings with >4 charset bytes\n"); + exit(U_MEMORY_ALLOCATION_ERROR); + } + + uprv_memcpy(table->bytes+idx, bytes, m->bLen); + m->b.idx=idx; + } + + /* set unicodeMask */ + for(idx=0; idx<m->uLen; ++idx) { + c=codePoints[idx]; + if(c>=0x10000) { + table->unicodeMask|=UCNV_HAS_SUPPLEMENTARY; /* there are supplementary code points */ + } else if(U_IS_SURROGATE(c)) { + table->unicodeMask|=UCNV_HAS_SURROGATES; /* there are surrogate code points */ + } + } + + /* set flagsType */ + if(m->f<0) 
{ + table->flagsType|=UCM_FLAGS_IMPLICIT; + } else { + table->flagsType|=UCM_FLAGS_EXPLICIT; + } + + tm=table->mappings+table->mappingsLength++; + uprv_memcpy(tm, m, sizeof(UCMapping)); + + table->isSorted=false; +} + +U_CAPI UCMFile * U_EXPORT2 +ucm_open() { + UCMFile *ucm=(UCMFile *)uprv_malloc(sizeof(UCMFile)); + if(ucm==nullptr) { + fprintf(stderr, "ucm error: unable to allocate a UCMFile\n"); + exit(U_MEMORY_ALLOCATION_ERROR); + } + + memset(ucm, 0, sizeof(UCMFile)); + + ucm->base=ucm_openTable(); + ucm->ext=ucm_openTable(); + + ucm->states.stateFlags[0]=MBCS_STATE_FLAG_DIRECT; + ucm->states.conversionType=UCNV_UNSUPPORTED_CONVERTER; + ucm->states.outputType=-1; + ucm->states.minCharLength=ucm->states.maxCharLength=1; + + return ucm; +} + +U_CAPI void U_EXPORT2 +ucm_close(UCMFile *ucm) { + if(ucm!=nullptr) { + ucm_closeTable(ucm->base); + ucm_closeTable(ucm->ext); + uprv_free(ucm); + } +} + +U_CAPI int32_t U_EXPORT2 +ucm_mappingType(UCMStates *baseStates, + UCMapping *m, + UChar32 codePoints[UCNV_EXT_MAX_UCHARS], + uint8_t bytes[UCNV_EXT_MAX_BYTES]) { + (void)codePoints; + /* check validity of the bytes and count the characters in them */ + int32_t count=ucm_countChars(baseStates, bytes, m->bLen); + if(count<1) { + /* illegal byte sequence */ + return -1; + } + + /* + * Suitable for an ICU conversion base table means: + * - a 1:1 mapping (1 Unicode code point : 1 byte sequence) + * - precision flag 0..3 + * - SBCS: any 1:1 mapping + * (the table stores additional bits to distinguish mapping types) + * - MBCS: not a |2 SUB mapping for <subchar1> + * - MBCS: not a |1 fallback to 0x00 + * - MBCS: not a multi-byte mapping with leading 0x00 bytes + * + * Further restrictions for fromUnicode tables + * are enforced in makeconv (MBCSOkForBaseFromUnicode()). 
+ * + * All of the MBCS fromUnicode specific tests could be removed from here, + * but the ones above are for unusual mappings, and removing the tests + * from here would change canonucm output which seems gratuitous. + * (Markus Scherer 2006-nov-28) + * + * Exception: All implicit mappings (f<0) that need to be moved + * because of fromUnicode restrictions _must_ be moved here because + * makeconv uses a hack for moving mappings only for the fromUnicode table + * that only works with non-negative values of f. + */ + if( m->uLen==1 && count==1 && m->f<=3 && + (baseStates->maxCharLength==1 || + !((m->f==2 && m->bLen==1) || + (m->f==1 && bytes[0]==0) || + (m->f<=1 && m->bLen>1 && bytes[0]==0))) + ) { + return 0; /* suitable for a base table */ + } else { + return 1; /* needs to go into an extension table */ + } +} + +U_CAPI UBool U_EXPORT2 +ucm_addMappingAuto(UCMFile *ucm, UBool forBase, UCMStates *baseStates, + UCMapping *m, + UChar32 codePoints[UCNV_EXT_MAX_UCHARS], + uint8_t bytes[UCNV_EXT_MAX_BYTES]) { + int32_t type; + + if(m->f==2 && m->uLen>1) { + fprintf(stderr, "ucm error: illegal <subchar1> |2 mapping from multiple code points\n"); + printMapping(m, codePoints, bytes, stderr); + return false; + } + + if(baseStates!=nullptr) { + /* check validity of the bytes and count the characters in them */ + type=ucm_mappingType(baseStates, m, codePoints, bytes); + if(type<0) { + /* illegal byte sequence */ + printMapping(m, codePoints, bytes, stderr); + return false; + } + } else { + /* not used - adding a mapping for an extension-only table before its base table is read */ + type=1; + } + + /* + * Add the mapping to the base table if this is requested and suitable. + * Otherwise, add it to the extension table. 
+ */ + if(forBase && type==0) { + ucm_addMapping(ucm->base, m, codePoints, bytes); + } else { + ucm_addMapping(ucm->ext, m, codePoints, bytes); + } + + return true; +} + +U_CAPI UBool U_EXPORT2 +ucm_addMappingFromLine(UCMFile *ucm, const char *line, UBool forBase, UCMStates *baseStates) { + UCMapping m={ 0, {0}, 0, 0, 0, 0 }; + UChar32 codePoints[UCNV_EXT_MAX_UCHARS]; + uint8_t bytes[UCNV_EXT_MAX_BYTES]; + + const char *s; + + /* ignore empty and comment lines */ + if(line[0]=='#' || *(s=u_skipWhitespace(line))==0 || *s=='\n' || *s=='\r') { + return true; + } + + return + ucm_parseMappingLine(&m, codePoints, bytes, line) && + ucm_addMappingAuto(ucm, forBase, baseStates, &m, codePoints, bytes); +} + +U_CAPI void U_EXPORT2 +ucm_readTable(UCMFile *ucm, FileStream* convFile, + UBool forBase, UCMStates *baseStates, + UErrorCode *pErrorCode) { + char line[500]; + char *end; + UBool isOK; + + if(U_FAILURE(*pErrorCode)) { + return; + } + + isOK=true; + + for(;;) { + /* read the next line */ + if(!T_FileStream_readLine(convFile, line, sizeof(line))) { + fprintf(stderr, "incomplete charmap section\n"); + isOK=false; + break; + } + + /* remove CR LF */ + end=uprv_strchr(line, 0); + while(line<end && (*(end-1)=='\r' || *(end-1)=='\n')) { + --end; + } + *end=0; + + /* ignore empty and comment lines */ + if(line[0]==0 || line[0]=='#') { + continue; + } + + /* stop at the end of the mapping table */ + if(0==uprv_strcmp(line, "END CHARMAP")) { + break; + } + + isOK&=ucm_addMappingFromLine(ucm, line, forBase, baseStates); + } + + if(!isOK) { + *pErrorCode=U_INVALID_TABLE_FORMAT; + } +} +#endif diff --git a/intl/icu/source/tools/toolutil/ucm.h b/intl/icu/source/tools/toolutil/ucm.h new file mode 100644 index 0000000000..8ea90604d4 --- /dev/null +++ b/intl/icu/source/tools/toolutil/ucm.h @@ -0,0 +1,302 @@ +// © 2016 and later: Unicode, Inc. and others. 
+// License & terms of use: http://www.unicode.org/copyright.html +/* + ******************************************************************************* + * Copyright (C) 2003-2013, International Business Machines + * Corporation and others. All Rights Reserved. + ******************************************************************************* + * file name: ucm.h + * encoding: UTF-8 + * tab size: 8 (not used) + * indentation:4 + * + * created on: 2003jun20 + * created by: Markus W. Scherer + * + * Definitions for the .ucm file parser and handler module ucm.c. + */ + +#ifndef __UCM_H__ +#define __UCM_H__ + +#include "unicode/utypes.h" +#include "ucnvmbcs.h" +#include "ucnv_ext.h" +#include "filestrm.h" +#include <stdio.h> + +#if !UCONFIG_NO_CONVERSION + +U_CDECL_BEGIN + +/* constants for UCMapping.moveFlag */ +enum { + UCM_MOVE_TO_EXT=1, + UCM_REMOVE_MAPPING=2 +}; + +/* + * Per-mapping data structure + * + * u if uLen==1: Unicode code point + * else index to uLen code points + * b if bLen<=4: up to 4 bytes + * else index to bLen bytes + * uLen number of code points + * bLen number of words containing left-justified bytes + * bIsMultipleChars indicates that the bytes contain more than one sequence + * according to the state table + * f flag for roundtrip (0), fallback (1), sub mapping (2), reverse fallback (3) + * or "good one-way" mapping (4). 
+ * Same values as in the source file after | + */ +typedef struct UCMapping { + UChar32 u; + union { + uint32_t idx; + uint8_t bytes[4]; + } b; + int8_t uLen, bLen, f, moveFlag; +} UCMapping; + +/* constants for UCMTable.flagsType */ +enum { + UCM_FLAGS_INITIAL, /* no mappings parsed yet */ + UCM_FLAGS_EXPLICIT, /* .ucm file has mappings with | fallback indicators */ + UCM_FLAGS_IMPLICIT, /* .ucm file has mappings without | fallback indicators, later wins */ + UCM_FLAGS_MIXED /* both implicit and explicit */ +}; + +typedef struct UCMTable { + UCMapping *mappings; + int32_t mappingsCapacity, mappingsLength; + + UChar32 *codePoints; + int32_t codePointsCapacity, codePointsLength; + + uint8_t *bytes; + int32_t bytesCapacity, bytesLength; + + /* index map for mapping by bytes first */ + int32_t *reverseMap; + + uint8_t unicodeMask; + int8_t flagsType; /* UCM_FLAGS_INITIAL etc. */ + UBool isSorted; +} UCMTable; + +enum { + MBCS_STATE_FLAG_DIRECT=1, + MBCS_STATE_FLAG_SURROGATES, + + MBCS_STATE_FLAG_READY=16 +}; + +typedef struct UCMStates { + int32_t stateTable[MBCS_MAX_STATE_COUNT][256]; + uint32_t stateFlags[MBCS_MAX_STATE_COUNT], + stateOffsetSum[MBCS_MAX_STATE_COUNT]; + + int32_t countStates, minCharLength, maxCharLength, countToUCodeUnits; + int8_t conversionType, outputType; +} UCMStates; + +typedef struct UCMFile { + UCMTable *base, *ext; + UCMStates states; + + char baseName[UCNV_MAX_CONVERTER_NAME_LENGTH]; +} UCMFile; + +/* simple accesses ---------------------------------------------------------- */ + +#define UCM_GET_CODE_POINTS(t, m) \ + (((m)->uLen==1) ? &(m)->u : (t)->codePoints+(m)->u) + +#define UCM_GET_BYTES(t, m) \ + (((m)->bLen<=4) ? 
(m)->b.bytes : (t)->bytes+(m)->b.idx) + +/* APIs --------------------------------------------------------------------- */ + +U_CAPI UCMFile * U_EXPORT2 +ucm_open(void); + +U_CAPI void U_EXPORT2 +ucm_close(UCMFile *ucm); + +U_CAPI UBool U_EXPORT2 +ucm_parseHeaderLine(UCMFile *ucm, + char *line, char **pKey, char **pValue); + +/* @return -1 illegal bytes 0 suitable for base table 1 needs to go into extension table */ +U_CAPI int32_t U_EXPORT2 +ucm_mappingType(UCMStates *baseStates, + UCMapping *m, + UChar32 codePoints[UCNV_EXT_MAX_UCHARS], + uint8_t bytes[UCNV_EXT_MAX_BYTES]); + +/* add a mapping to the base or extension table as appropriate */ +U_CAPI UBool U_EXPORT2 +ucm_addMappingAuto(UCMFile *ucm, UBool forBase, UCMStates *baseStates, + UCMapping *m, + UChar32 codePoints[UCNV_EXT_MAX_UCHARS], + uint8_t bytes[UCNV_EXT_MAX_BYTES]); + +U_CAPI UBool U_EXPORT2 +ucm_addMappingFromLine(UCMFile *ucm, const char *line, UBool forBase, UCMStates *baseStates); + + +U_CAPI UCMTable * U_EXPORT2 +ucm_openTable(void); + +U_CAPI void U_EXPORT2 +ucm_closeTable(UCMTable *table); + +U_CAPI void U_EXPORT2 +ucm_resetTable(UCMTable *table); + +U_CAPI void U_EXPORT2 +ucm_sortTable(UCMTable *t); + +/* + * Remove mappings with their move flag set from the base table + * and move some of them (with UCM_MOVE_TO_EXT) to the extension table. + */ +U_CAPI void U_EXPORT2 +ucm_moveMappings(UCMTable *base, UCMTable *ext); + +/** + * Read a table from a .ucm file, from after the CHARMAP line to + * including the END CHARMAP line. + */ +U_CAPI void U_EXPORT2 +ucm_readTable(UCMFile *ucm, FileStream* convFile, + UBool forBase, UCMStates *baseStates, + UErrorCode *pErrorCode); + +/** + * Check the validity of mappings against a base table's states; + * necessary for extension-only tables that were read before their base tables. + */ +U_CAPI UBool U_EXPORT2 +ucm_checkValidity(UCMTable *ext, UCMStates *baseStates); + +/** + * Check a base table against an extension table. 
+ * Set the moveTarget!=NULL if it is possible to move mappings from the base. + * This is the case where base and extension tables are parsed from a single file + * (moveTarget==ext) + * or when delta file mappings are subtracted from a base table. + * + * When a base table cannot be modified because a delta file is parsed in makeconv, + * then set moveTarget=NULL. + * + * if(intersectBase) then mappings that exist in the base table but not in + * the extension table are moved to moveTarget instead of showing an error. + * + * Special mode: + * If intersectBase==2 for a DBCS extension table, then SBCS mappings are + * not moved out of the base unless their Unicode input requires it. + * This helps ucmkbase generate base tables for DBCS-only extension .cnv files. + * + * For both tables in the same file, the extension table is automatically + * built. + * For separate files, the extension file can use a complete mapping table (.ucm file), + * so that common mappings need not be stripped out manually. + * + * + * Sort both tables, and then for each mapping direction: + * + * If intersectBase is true and the base table contains a mapping + * that does not exist in the extension table, then this mapping is moved + * to moveTarget. 
+ * + * - otherwise - + * + * If the base table contains a mapping for which the input sequence is + * the same as the extension input, then + * - if the output is the same: remove the extension mapping + * - else: error + * + * If the base table contains a mapping for which the input sequence is + * a prefix of the extension input, then + * - if moveTarget!=NULL: move the base mapping to the moveTarget table + * - else: error + * + * @return false in case of an irreparable error + */ +U_CAPI UBool U_EXPORT2 +ucm_checkBaseExt(UCMStates *baseStates, UCMTable *base, UCMTable *ext, + UCMTable *moveTarget, UBool intersectBase); + +U_CAPI void U_EXPORT2 +ucm_printTable(UCMTable *table, FILE *f, UBool byUnicode); + +U_CAPI void U_EXPORT2 +ucm_printMapping(UCMTable *table, UCMapping *m, FILE *f); + + +U_CAPI void U_EXPORT2 +ucm_addState(UCMStates *states, const char *s); + +U_CAPI void U_EXPORT2 +ucm_processStates(UCMStates *states, UBool ignoreSISOCheck); + +U_CAPI int32_t U_EXPORT2 +ucm_countChars(UCMStates *states, + const uint8_t *bytes, int32_t length); + + +U_CAPI int8_t U_EXPORT2 +ucm_parseBytes(uint8_t bytes[UCNV_EXT_MAX_BYTES], const char *line, const char **ps); + +U_CAPI UBool U_EXPORT2 +ucm_parseMappingLine(UCMapping *m, + UChar32 codePoints[UCNV_EXT_MAX_UCHARS], + uint8_t bytes[UCNV_EXT_MAX_BYTES], + const char *line); + +U_CAPI void U_EXPORT2 +ucm_addMapping(UCMTable *table, + UCMapping *m, + UChar32 codePoints[UCNV_EXT_MAX_UCHARS], + uint8_t bytes[UCNV_EXT_MAX_BYTES]); + +/* very makeconv-specific functions ----------------------------------------- */ + +/* finalize and optimize states after the toUnicode mappings are processed */ +U_CAPI void U_EXPORT2 +ucm_optimizeStates(UCMStates *states, + uint16_t **pUnicodeCodeUnits, + _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks, + UBool verbose); + +/* moved here because it is used inside ucmstate.c */ +U_CAPI int32_t U_EXPORT2 +ucm_findFallback(_MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks, 
+ uint32_t offset); + +/* very rptp2ucm-specific functions ----------------------------------------- */ + +/* + * Input: Separate tables with mappings from/to Unicode, + * subchar and subchar1 (0 if none). + * All mappings must have flag 0. + * + * Output: fromUTable will contain the union of mappings with the correct + * precision flags, and be sorted. + */ +U_CAPI void U_EXPORT2 +ucm_mergeTables(UCMTable *fromUTable, UCMTable *toUTable, + const uint8_t *subchar, int32_t subcharLength, + uint8_t subchar1); + +U_CAPI UBool U_EXPORT2 +ucm_separateMappings(UCMFile *ucm, UBool isSISO); + +U_CDECL_END + +#endif + +#endif + diff --git a/intl/icu/source/tools/toolutil/ucmstate.cpp b/intl/icu/source/tools/toolutil/ucmstate.cpp new file mode 100644 index 0000000000..08782f68d1 --- /dev/null +++ b/intl/icu/source/tools/toolutil/ucmstate.cpp @@ -0,0 +1,1053 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2003-2012, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: ucmstate.c +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2003oct09 +* created by: Markus W. Scherer +* +* This file handles ICU .ucm file state information as part of the ucm module. +* Most of this code used to be in makeconv.c. 
+*/ + +#include "unicode/utypes.h" +#include "cstring.h" +#include "cmemory.h" +#include "uarrsort.h" +#include "ucnvmbcs.h" +#include "ucnv_ext.h" +#include "uparse.h" +#include "ucm.h" +#include <stdio.h> + +#if !UCONFIG_NO_CONVERSION + +/* MBCS state handling ------------------------------------------------------ */ + +/* + * state table row grammar (ebnf-style): + * (whitespace is allowed between all tokens) + * + * row=[[firstentry ','] entry (',' entry)*] + * firstentry="initial" | "surrogates" + * (initial state (default for state 0), output is all surrogate pairs) + * entry=range [':' nextstate] ['.' action] + * range=number ['-' number] + * nextstate=number + * (0..7f) + * action='u' | 's' | 'p' | 'i' + * (unassigned, state change only, surrogate pair, illegal) + * number=(1- or 2-digit hexadecimal number) + */ +static const char * +parseState(const char *s, int32_t state[256], uint32_t *pFlags) { + const char *t; + uint32_t start, end, i; + int32_t entry; + + /* initialize the state: all illegal with U+ffff */ + for(i=0; i<256; ++i) { + state[i]=MBCS_ENTRY_FINAL(0, MBCS_STATE_ILLEGAL, 0xffff); + } + + /* skip leading white space */ + s=u_skipWhitespace(s); + + /* is there an "initial" or "surrogates" directive? 
*/ + if(uprv_strncmp("initial", s, 7)==0) { + *pFlags=MBCS_STATE_FLAG_DIRECT; + s=u_skipWhitespace(s+7); + if(*s++!=',') { + return s-1; + } + } else if(*pFlags==0 && uprv_strncmp("surrogates", s, 10)==0) { + *pFlags=MBCS_STATE_FLAG_SURROGATES; + s=u_skipWhitespace(s+10); + if(*s++!=',') { + return s-1; + } + } else if(*s==0) { + /* empty state row: all-illegal */ + return nullptr; + } + + for(;;) { + /* read an entry, the start of the range first */ + s=u_skipWhitespace(s); + start=uprv_strtoul(s, (char **)&t, 16); + if(s==t || 0xff<start) { + return s; + } + s=u_skipWhitespace(t); + + /* read the end of the range if there is one */ + if(*s=='-') { + s=u_skipWhitespace(s+1); + end=uprv_strtoul(s, (char **)&t, 16); + if(s==t || end<start || 0xff<end) { + return s; + } + s=u_skipWhitespace(t); + } else { + end=start; + } + + /* determine the state entry for this range */ + if(*s!=':' && *s!='.') { + /* the default is: final state with valid entries */ + entry=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_16, 0); + } else { + entry=MBCS_ENTRY_TRANSITION(0, 0); + if(*s==':') { + /* get the next state, default to 0 */ + s=u_skipWhitespace(s+1); + i=uprv_strtoul(s, (char **)&t, 16); + if(s!=t) { + if(0x7f<i) { + return s; + } + s=u_skipWhitespace(t); + entry=MBCS_ENTRY_SET_STATE(entry, i); + } + } + + /* get the state action, default to valid */ + if(*s=='.') { + /* this is a final state */ + entry=MBCS_ENTRY_SET_FINAL(entry); + + s=u_skipWhitespace(s+1); + if(*s=='u') { + /* unassigned set U+fffe */ + entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_UNASSIGNED, 0xfffe); + s=u_skipWhitespace(s+1); + } else if(*s=='p') { + if(*pFlags!=MBCS_STATE_FLAG_DIRECT) { + entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_VALID_16_PAIR); + } else { + entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_VALID_16); + } + s=u_skipWhitespace(s+1); + } else if(*s=='s') { + entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_CHANGE_ONLY); + s=u_skipWhitespace(s+1); + } else if(*s=='i') 
{ + /* illegal set U+ffff */ + entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_ILLEGAL, 0xffff); + s=u_skipWhitespace(s+1); + } else { + /* default to valid */ + entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_VALID_16); + } + } else { + /* this is an intermediate state, nothing to do */ + } + } + + /* adjust "final valid" states according to the state flags */ + if(MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16) { + switch(*pFlags) { + case 0: + /* no adjustment */ + break; + case MBCS_STATE_FLAG_DIRECT: + /* set the valid-direct code point to "unassigned"==0xfffe */ + entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_VALID_DIRECT_16, 0xfffe); + break; + case MBCS_STATE_FLAG_SURROGATES: + entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_VALID_16_PAIR, 0); + break; + default: + break; + } + } + + /* set this entry for the range */ + for(i=start; i<=end; ++i) { + state[i]=entry; + } + + if(*s==',') { + ++s; + } else { + return *s==0 ? nullptr : s; + } + } +} + +U_CAPI void U_EXPORT2 +ucm_addState(UCMStates *states, const char *s) { + const char *error; + + if(states->countStates==MBCS_MAX_STATE_COUNT) { + fprintf(stderr, "ucm error: too many states (maximum %u)\n", MBCS_MAX_STATE_COUNT); + exit(U_INVALID_TABLE_FORMAT); + } + + error=parseState(s, states->stateTable[states->countStates], + &states->stateFlags[states->countStates]); + if(error!=nullptr) { + fprintf(stderr, "ucm error: parse error in state definition at '%s'\n", error); + exit(U_INVALID_TABLE_FORMAT); + } + + ++states->countStates; +} + +U_CAPI UBool U_EXPORT2 +ucm_parseHeaderLine(UCMFile *ucm, + char *line, char **pKey, char **pValue) { + UCMStates *states; + char *s, *end; + char c; + + states=&ucm->states; + + /* remove comments and trailing CR and LF and remove whitespace from the end */ + for(end=line; (c=*end)!=0; ++end) { + if(c=='#' || c=='\r' || c=='\n') { + break; + } + } + while(end>line && (*(end-1)==' ' || *(end-1)=='\t')) { + --end; + } + *end=0; + + /* 
skip leading white space and ignore empty lines */ + s=(char *)u_skipWhitespace(line); + if(*s==0) { + return true; + } + + /* stop at the beginning of the mapping section */ + if(uprv_memcmp(s, "CHARMAP", 7)==0) { + return false; + } + + /* get the key name, bracketed in <> */ + if(*s!='<') { + fprintf(stderr, "ucm error: no header field <key> in line \"%s\"\n", line); + exit(U_INVALID_TABLE_FORMAT); + } + *pKey=++s; + while(*s!='>') { + if(*s==0) { + fprintf(stderr, "ucm error: incomplete header field <key> in line \"%s\"\n", line); + exit(U_INVALID_TABLE_FORMAT); + } + ++s; + } + *s=0; + + /* get the value string, possibly quoted */ + s=(char *)u_skipWhitespace(s+1); + if(*s!='"') { + *pValue=s; + } else { + /* remove the quotes */ + *pValue=s+1; + if(end>*pValue && *(end-1)=='"') { + *--end=0; + } + } + + /* collect the information from the header field, ignore unknown keys */ + if(uprv_strcmp(*pKey, "uconv_class")==0) { + if(uprv_strcmp(*pValue, "DBCS")==0) { + states->conversionType=UCNV_DBCS; + } else if(uprv_strcmp(*pValue, "SBCS")==0) { + states->conversionType = UCNV_SBCS; + } else if(uprv_strcmp(*pValue, "MBCS")==0) { + states->conversionType = UCNV_MBCS; + } else if(uprv_strcmp(*pValue, "EBCDIC_STATEFUL")==0) { + states->conversionType = UCNV_EBCDIC_STATEFUL; + } else { + fprintf(stderr, "ucm error: unknown <uconv_class> %s\n", *pValue); + exit(U_INVALID_TABLE_FORMAT); + } + return true; + } else if(uprv_strcmp(*pKey, "mb_cur_max")==0) { + c=**pValue; + if('1'<=c && c<='4' && (*pValue)[1]==0) { + states->maxCharLength=(int8_t)(c-'0'); + states->outputType=(int8_t)(states->maxCharLength-1); + } else { + fprintf(stderr, "ucm error: illegal <mb_cur_max> %s\n", *pValue); + exit(U_INVALID_TABLE_FORMAT); + } + return true; + } else if(uprv_strcmp(*pKey, "mb_cur_min")==0) { + c=**pValue; + if('1'<=c && c<='4' && (*pValue)[1]==0) { + states->minCharLength=(int8_t)(c-'0'); + } else { + fprintf(stderr, "ucm error: illegal <mb_cur_min> %s\n", *pValue); + 
exit(U_INVALID_TABLE_FORMAT); + } + return true; + } else if(uprv_strcmp(*pKey, "icu:state")==0) { + /* if an SBCS/DBCS/EBCDIC_STATEFUL converter has icu:state, then turn it into MBCS */ + switch(states->conversionType) { + case UCNV_SBCS: + case UCNV_DBCS: + case UCNV_EBCDIC_STATEFUL: + states->conversionType=UCNV_MBCS; + break; + case UCNV_MBCS: + break; + default: + fprintf(stderr, "ucm error: <icu:state> entry for non-MBCS table or before the <uconv_class> line\n"); + exit(U_INVALID_TABLE_FORMAT); + } + + if(states->maxCharLength==0) { + fprintf(stderr, "ucm error: <icu:state> before the <mb_cur_max> line\n"); + exit(U_INVALID_TABLE_FORMAT); + } + ucm_addState(states, *pValue); + return true; + } else if(uprv_strcmp(*pKey, "icu:base")==0) { + if(**pValue==0) { + fprintf(stderr, "ucm error: <icu:base> without a base table name\n"); + exit(U_INVALID_TABLE_FORMAT); + } + uprv_strcpy(ucm->baseName, *pValue); + return true; + } + + return false; +} + +/* post-processing ---------------------------------------------------------- */ + +static int32_t +sumUpStates(UCMStates *states) { + int32_t entry, sum, state, cell, count; + UBool allStatesReady; + + /* + * Sum up the offsets for all states. + * In each final state (where there are only final entries), + * the offsets add up directly. + * In all other state table rows, for each transition entry to another state, + * the offsets sum of that state needs to be added. + * This is achieved in at most countStates iterations. 
+ */ + allStatesReady=false; + for(count=states->countStates; !allStatesReady && count>=0; --count) { + allStatesReady=true; + for(state=states->countStates-1; state>=0; --state) { + if(!(states->stateFlags[state]&MBCS_STATE_FLAG_READY)) { + allStatesReady=false; + sum=0; + + /* at first, add up only the final delta offsets to keep them <512 */ + for(cell=0; cell<256; ++cell) { + entry=states->stateTable[state][cell]; + if(MBCS_ENTRY_IS_FINAL(entry)) { + switch(MBCS_ENTRY_FINAL_ACTION(entry)) { + case MBCS_STATE_VALID_16: + states->stateTable[state][cell]=MBCS_ENTRY_FINAL_SET_VALUE(entry, sum); + sum+=1; + break; + case MBCS_STATE_VALID_16_PAIR: + states->stateTable[state][cell]=MBCS_ENTRY_FINAL_SET_VALUE(entry, sum); + sum+=2; + break; + default: + /* no addition */ + break; + } + } + } + + /* now, add up the delta offsets for the transitional entries */ + for(cell=0; cell<256; ++cell) { + entry=states->stateTable[state][cell]; + if(MBCS_ENTRY_IS_TRANSITION(entry)) { + if(states->stateFlags[MBCS_ENTRY_TRANSITION_STATE(entry)]&MBCS_STATE_FLAG_READY) { + states->stateTable[state][cell]=MBCS_ENTRY_TRANSITION_SET_OFFSET(entry, sum); + sum+=states->stateOffsetSum[MBCS_ENTRY_TRANSITION_STATE(entry)]; + } else { + /* that next state does not have a sum yet, we cannot finish the one for this state */ + sum=-1; + break; + } + } + } + + if(sum!=-1) { + states->stateOffsetSum[state]=sum; + states->stateFlags[state]|=MBCS_STATE_FLAG_READY; + } + } + } + } + + if(!allStatesReady) { + fprintf(stderr, "ucm error: the state table contains loops\n"); + exit(U_INVALID_TABLE_FORMAT); + } + + /* + * For all "direct" (i.e., initial) states>0, + * the offsets need to be increased by the sum of + * the previous initial states. 
+ */ + sum=states->stateOffsetSum[0]; + for(state=1; state<states->countStates; ++state) { + if((states->stateFlags[state]&0xf)==MBCS_STATE_FLAG_DIRECT) { + int32_t sum2=sum; + sum+=states->stateOffsetSum[state]; + for(cell=0; cell<256; ++cell) { + entry=states->stateTable[state][cell]; + if(MBCS_ENTRY_IS_TRANSITION(entry)) { + states->stateTable[state][cell]=MBCS_ENTRY_TRANSITION_ADD_OFFSET(entry, sum2); + } + } + } + } + + /* round up to the next even number to have the following data 32-bit-aligned */ + return states->countToUCodeUnits=(sum+1)&~1; +} + +U_CAPI void U_EXPORT2 +ucm_processStates(UCMStates *states, UBool ignoreSISOCheck) { + int32_t entry, state, cell, count; + + if(states->conversionType==UCNV_UNSUPPORTED_CONVERTER) { + fprintf(stderr, "ucm error: missing conversion type (<uconv_class>)\n"); + exit(U_INVALID_TABLE_FORMAT); + } + + if(states->countStates==0) { + switch(states->conversionType) { + case UCNV_SBCS: + /* SBCS: use MBCS data structure with a default state table */ + if(states->maxCharLength!=1) { + fprintf(stderr, "error: SBCS codepage with max B/char!=1\n"); + exit(U_INVALID_TABLE_FORMAT); + } + states->conversionType=UCNV_MBCS; + ucm_addState(states, "0-ff"); + break; + case UCNV_MBCS: + fprintf(stderr, "ucm error: missing state table information (<icu:state>) for MBCS\n"); + exit(U_INVALID_TABLE_FORMAT); + break; + case UCNV_EBCDIC_STATEFUL: + /* EBCDIC_STATEFUL: use MBCS data structure with a default state table */ + if(states->minCharLength!=1 || states->maxCharLength!=2) { + fprintf(stderr, "error: DBCS codepage with min B/char!=1 or max B/char!=2\n"); + exit(U_INVALID_TABLE_FORMAT); + } + states->conversionType=UCNV_MBCS; + ucm_addState(states, "0-ff, e:1.s, f:0.s"); + ucm_addState(states, "initial, 0-3f:4, e:1.s, f:0.s, 40:3, 41-fe:2, ff:4"); + ucm_addState(states, "0-40:1.i, 41-fe:1., ff:1.i"); + ucm_addState(states, "0-ff:1.i, 40:1."); + ucm_addState(states, "0-ff:1.i"); + break; + case UCNV_DBCS: + /* DBCS: use MBCS data 
structure with a default state table */ + if(states->minCharLength!=2 || states->maxCharLength!=2) { + fprintf(stderr, "error: DBCS codepage with min or max B/char!=2\n"); + exit(U_INVALID_TABLE_FORMAT); + } + states->conversionType = UCNV_MBCS; + ucm_addState(states, "0-3f:3, 40:2, 41-fe:1, ff:3"); + ucm_addState(states, "41-fe"); + ucm_addState(states, "40"); + ucm_addState(states, ""); + break; + default: + fprintf(stderr, "ucm error: unknown charset structure\n"); + exit(U_INVALID_TABLE_FORMAT); + break; + } + } + + /* + * check that the min/max character lengths are reasonable; + * to do this right, all paths through the state table would have to be + * recursively walked while keeping track of the sequence lengths, + * but these simple checks cover most state tables in practice + */ + if(states->maxCharLength<states->minCharLength) { + fprintf(stderr, "ucm error: max B/char < min B/char\n"); + exit(U_INVALID_TABLE_FORMAT); + } + + /* count non-direct states and compare with max B/char */ + count=0; + for(state=0; state<states->countStates; ++state) { + if((states->stateFlags[state]&0xf)!=MBCS_STATE_FLAG_DIRECT) { + ++count; + } + } + if(states->maxCharLength>count+1) { + fprintf(stderr, "ucm error: max B/char too large\n"); + exit(U_INVALID_TABLE_FORMAT); + } + + if(states->minCharLength==1) { + int32_t action; + + /* + * if there are single-byte characters, + * then the initial state must have direct result states + */ + for(cell=0; cell<256; ++cell) { + entry=states->stateTable[0][cell]; + if( MBCS_ENTRY_IS_FINAL(entry) && + ((action=MBCS_ENTRY_FINAL_ACTION(entry))==MBCS_STATE_VALID_DIRECT_16 || + action==MBCS_STATE_UNASSIGNED) + ) { + break; + } + } + + if(cell==256) { + fprintf(stderr, "ucm warning: min B/char too small\n"); + } + } + + /* + * make sure that all "next state" values are within limits + * and that all next states after final ones have the "direct" + * flag of initial states + */ + for(state=states->countStates-1; state>=0; --state) { + 
for(cell=0; cell<256; ++cell) { + entry=states->stateTable[state][cell]; + if((uint8_t)MBCS_ENTRY_STATE(entry)>=states->countStates) { + fprintf(stderr, "ucm error: state table entry [%x][%x] has a next state of %x that is too high\n", + (int)state, (int)cell, (int)MBCS_ENTRY_STATE(entry)); + exit(U_INVALID_TABLE_FORMAT); + } + if(MBCS_ENTRY_IS_FINAL(entry) && (states->stateFlags[MBCS_ENTRY_STATE(entry)]&0xf)!=MBCS_STATE_FLAG_DIRECT) { + fprintf(stderr, "ucm error: state table entry [%x][%x] is final but has a non-initial next state of %x\n", + (int)state, (int)cell, (int)MBCS_ENTRY_STATE(entry)); + exit(U_INVALID_TABLE_FORMAT); + } else if(MBCS_ENTRY_IS_TRANSITION(entry) && (states->stateFlags[MBCS_ENTRY_STATE(entry)]&0xf)==MBCS_STATE_FLAG_DIRECT) { + fprintf(stderr, "ucm error: state table entry [%x][%x] is not final but has an initial next state of %x\n", + (int)state, (int)cell, (int)MBCS_ENTRY_STATE(entry)); + exit(U_INVALID_TABLE_FORMAT); + } + } + } + + /* is this an SI/SO (like EBCDIC-stateful) state table? */ + if(states->countStates>=2 && (states->stateFlags[1]&0xf)==MBCS_STATE_FLAG_DIRECT) { + if(states->maxCharLength!=2) { + fprintf(stderr, "ucm error: SI/SO codepages must have max 2 bytes/char (not %x)\n", (int)states->maxCharLength); + exit(U_INVALID_TABLE_FORMAT); + } + if(states->countStates<3) { + fprintf(stderr, "ucm error: SI/SO codepages must have at least 3 states (not %x)\n", (int)states->countStates); + exit(U_INVALID_TABLE_FORMAT); + } + /* are the SI/SO all in the right places? 
*/ + if( ignoreSISOCheck || + (states->stateTable[0][0xe]==MBCS_ENTRY_FINAL(1, MBCS_STATE_CHANGE_ONLY, 0) && + states->stateTable[0][0xf]==MBCS_ENTRY_FINAL(0, MBCS_STATE_CHANGE_ONLY, 0) && + states->stateTable[1][0xe]==MBCS_ENTRY_FINAL(1, MBCS_STATE_CHANGE_ONLY, 0) && + states->stateTable[1][0xf]==MBCS_ENTRY_FINAL(0, MBCS_STATE_CHANGE_ONLY, 0)) + ) { + states->outputType=MBCS_OUTPUT_2_SISO; + } else { + fprintf(stderr, "ucm error: SI/SO codepages must have in states 0 and 1 transitions e:1.s, f:0.s\n"); + exit(U_INVALID_TABLE_FORMAT); + } + state=2; + } else { + state=1; + } + + /* check that no unexpected state is a "direct" one */ + while(state<states->countStates) { + if((states->stateFlags[state]&0xf)==MBCS_STATE_FLAG_DIRECT) { + fprintf(stderr, "ucm error: state %d is 'initial' - not supported except for SI/SO codepages\n", (int)state); + exit(U_INVALID_TABLE_FORMAT); + } + ++state; + } + + sumUpStates(states); +} + +/* find a fallback for this offset; return the index or -1 if not found */ +U_CAPI int32_t U_EXPORT2 +ucm_findFallback(_MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks, + uint32_t offset) { + int32_t i; + + if(countToUFallbacks==0) { + /* shortcut: most codepages do not have fallbacks from codepage to Unicode */ + return -1; + } + + /* do a linear search for the fallback mapping (the table is not yet sorted) */ + for(i=0; i<countToUFallbacks; ++i) { + if(offset==toUFallbacks[i].offset) { + return i; + } + } + return -1; +} + +/* + * This function tries to compact toUnicode tables for 2-byte codepages + * by finding lead bytes with all-unassigned trail bytes and adding another state + * for them. 
+ */ +static void +compactToUnicode2(UCMStates *states, + uint16_t **pUnicodeCodeUnits, + _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks, + UBool verbose) { + int32_t (*oldStateTable)[256]; + uint16_t count[256]; + uint16_t *oldUnicodeCodeUnits; + int32_t entry, offset, oldOffset, trailOffset, oldTrailOffset, savings, sum; + int32_t i, j, leadState, trailState, newState, fallback; + uint16_t unit; + + /* find the lead state */ + if(states->outputType==MBCS_OUTPUT_2_SISO) { + /* use the DBCS lead state for SI/SO codepages */ + leadState=1; + } else { + leadState=0; + } + + /* find the main trail state: the most used target state */ + uprv_memset(count, 0, sizeof(count)); + for(i=0; i<256; ++i) { + entry=states->stateTable[leadState][i]; + if(MBCS_ENTRY_IS_TRANSITION(entry)) { + ++count[MBCS_ENTRY_TRANSITION_STATE(entry)]; + } + } + trailState=0; + for(i=1; i<states->countStates; ++i) { + if(count[i]>count[trailState]) { + trailState=i; + } + } + + /* count possible savings from lead bytes with all-unassigned results in all trail bytes */ + uprv_memset(count, 0, sizeof(count)); + savings=0; + /* for each lead byte */ + for(i=0; i<256; ++i) { + entry=states->stateTable[leadState][i]; + if(MBCS_ENTRY_IS_TRANSITION(entry) && + (MBCS_ENTRY_TRANSITION_STATE(entry))==static_cast<uint32_t>(trailState)) { + /* the offset is different for each lead byte */ + offset=MBCS_ENTRY_TRANSITION_OFFSET(entry); + /* for each trail byte for this lead byte */ + for(j=0; j<256; ++j) { + entry=states->stateTable[trailState][j]; + switch(MBCS_ENTRY_FINAL_ACTION(entry)) { + case MBCS_STATE_VALID_16: + entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry); + if((*pUnicodeCodeUnits)[entry]==0xfffe && ucm_findFallback(toUFallbacks, countToUFallbacks, entry)<0) { + ++count[i]; + } else { + j=999; /* do not count for this lead byte because there are assignments */ + } + break; + case MBCS_STATE_VALID_16_PAIR: + entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry); + 
if((*pUnicodeCodeUnits)[entry]==0xfffe) { + count[i]+=2; + } else { + j=999; /* do not count for this lead byte because there are assignments */ + } + break; + default: + break; + } + } + if(j==256) { + /* all trail bytes for this lead byte are unassigned */ + savings+=count[i]; + } else { + count[i]=0; + } + } + } + /* subtract from the possible savings the cost of an additional state */ + savings=savings*2-1024; /* count bytes, not 16-bit words */ + if(savings<=0) { + return; + } + if(verbose) { + printf("compacting toUnicode data saves %ld bytes\n", (long)savings); + } + if(states->countStates>=MBCS_MAX_STATE_COUNT) { + fprintf(stderr, "cannot compact toUnicode because the maximum number of states is reached\n"); + return; + } + + /* make a copy of the state table */ + oldStateTable=(int32_t (*)[256])uprv_malloc(states->countStates*1024); + if(oldStateTable==nullptr) { + fprintf(stderr, "cannot compact toUnicode: out of memory\n"); + return; + } + uprv_memcpy(oldStateTable, states->stateTable, states->countStates*1024); + + /* add the new state */ + /* + * this function does not catch the degenerate case where all lead bytes + * have all-unassigned trail bytes and the lead state could be removed + */ + newState=states->countStates++; + states->stateFlags[newState]=0; + /* copy the old trail state, turning all assigned states into unassigned ones */ + for(i=0; i<256; ++i) { + entry=states->stateTable[trailState][i]; + switch(MBCS_ENTRY_FINAL_ACTION(entry)) { + case MBCS_STATE_VALID_16: + case MBCS_STATE_VALID_16_PAIR: + states->stateTable[newState][i]=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_UNASSIGNED, 0xfffe); + break; + default: + states->stateTable[newState][i]=entry; + break; + } + } + + /* in the lead state, redirect all lead bytes with all-unassigned trail bytes to the new state */ + for(i=0; i<256; ++i) { + if(count[i]>0) { + states->stateTable[leadState][i]=MBCS_ENTRY_SET_STATE(states->stateTable[leadState][i], newState); + } + } + + /* sum up 
the new state table */ + for(i=0; i<states->countStates; ++i) { + states->stateFlags[i]&=~MBCS_STATE_FLAG_READY; + } + sum=sumUpStates(states); + + /* allocate a new, smaller code units array */ + oldUnicodeCodeUnits=*pUnicodeCodeUnits; + if(sum==0) { + *pUnicodeCodeUnits=nullptr; + if(oldUnicodeCodeUnits!=nullptr) { + uprv_free(oldUnicodeCodeUnits); + } + uprv_free(oldStateTable); + return; + } + *pUnicodeCodeUnits=(uint16_t *)uprv_malloc(sum*sizeof(uint16_t)); + if(*pUnicodeCodeUnits==nullptr) { + fprintf(stderr, "cannot compact toUnicode: out of memory allocating %ld 16-bit code units\n", + (long)sum); + /* revert to the old state table */ + *pUnicodeCodeUnits=oldUnicodeCodeUnits; + --states->countStates; + uprv_memcpy(states->stateTable, oldStateTable, states->countStates*1024); + uprv_free(oldStateTable); + return; + } + for(i=0; i<sum; ++i) { + (*pUnicodeCodeUnits)[i]=0xfffe; + } + + /* copy the code units for all assigned characters */ + /* + * The old state table has the same lead _and_ trail states for assigned characters! + * The differences are in the offsets, and in the trail states for some unassigned characters. + * For each character with an assigned state in the new table, it was assigned in the old one. + * Only still-assigned characters are copied. + * Note that fallback mappings need to get their offset values adjusted. 
+ */ + + /* for each initial state */ + for(leadState=0; leadState<states->countStates; ++leadState) { + if((states->stateFlags[leadState]&0xf)==MBCS_STATE_FLAG_DIRECT) { + /* for each lead byte from there */ + for(i=0; i<256; ++i) { + entry=states->stateTable[leadState][i]; + if(MBCS_ENTRY_IS_TRANSITION(entry)) { + trailState=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry); + /* the new state does not have assigned states */ + if(trailState!=newState) { + trailOffset=MBCS_ENTRY_TRANSITION_OFFSET(entry); + oldTrailOffset=MBCS_ENTRY_TRANSITION_OFFSET(oldStateTable[leadState][i]); + /* for each trail byte */ + for(j=0; j<256; ++j) { + entry=states->stateTable[trailState][j]; + /* copy assigned-character code units and adjust fallback offsets */ + switch(MBCS_ENTRY_FINAL_ACTION(entry)) { + case MBCS_STATE_VALID_16: + offset=trailOffset+MBCS_ENTRY_FINAL_VALUE_16(entry); + /* find the old offset according to the old state table */ + oldOffset=oldTrailOffset+MBCS_ENTRY_FINAL_VALUE_16(oldStateTable[trailState][j]); + unit=(*pUnicodeCodeUnits)[offset]=oldUnicodeCodeUnits[oldOffset]; + if(unit==0xfffe && (fallback=ucm_findFallback(toUFallbacks, countToUFallbacks, oldOffset))>=0) { + toUFallbacks[fallback].offset=0x80000000|offset; + } + break; + case MBCS_STATE_VALID_16_PAIR: + offset=trailOffset+MBCS_ENTRY_FINAL_VALUE_16(entry); + /* find the old offset according to the old state table */ + oldOffset=oldTrailOffset+MBCS_ENTRY_FINAL_VALUE_16(oldStateTable[trailState][j]); + (*pUnicodeCodeUnits)[offset++]=oldUnicodeCodeUnits[oldOffset++]; + (*pUnicodeCodeUnits)[offset]=oldUnicodeCodeUnits[oldOffset]; + break; + default: + break; + } + } + } + } + } + } + } + + /* remove temporary flags from fallback offsets that protected them from being modified twice */ + for(i=0; i<countToUFallbacks; ++i) { + toUFallbacks[i].offset&=0x7fffffff; + } + + /* free temporary memory */ + uprv_free(oldUnicodeCodeUnits); + uprv_free(oldStateTable); +} + +/* + * recursive sub-function of 
compactToUnicodeHelper() + * returns: + * >0 number of bytes that are used in unicodeCodeUnits[] and could be saved + * because all sequences from this state are unassigned + * <0 there are assignments in unicodeCodeUnits[] + * 0 no use of unicodeCodeUnits[] + */ +static int32_t +findUnassigned(UCMStates *states, + uint16_t *unicodeCodeUnits, + _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks, + int32_t state, int32_t offset, uint32_t b) { + int32_t i, entry, savings, localSavings, belowSavings; + UBool haveAssigned; + + localSavings=belowSavings=0; + haveAssigned=false; + for(i=0; i<256; ++i) { + entry=states->stateTable[state][i]; + if(MBCS_ENTRY_IS_TRANSITION(entry)) { + savings=findUnassigned(states, + unicodeCodeUnits, + toUFallbacks, countToUFallbacks, + MBCS_ENTRY_TRANSITION_STATE(entry), + offset+MBCS_ENTRY_TRANSITION_OFFSET(entry), + (b<<8)|(uint32_t)i); + if(savings<0) { + haveAssigned=true; + } else if(savings>0) { + printf(" all-unassigned sequences from prefix 0x%02lx state %ld use %ld bytes\n", + (unsigned long)((b<<8)|i), (long)state, (long)savings); + belowSavings+=savings; + } + } else if(!haveAssigned) { + switch(MBCS_ENTRY_FINAL_ACTION(entry)) { + case MBCS_STATE_VALID_16: + entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry); + if(unicodeCodeUnits[entry]==0xfffe && ucm_findFallback(toUFallbacks, countToUFallbacks, entry)<0) { + localSavings+=2; + } else { + haveAssigned=true; + } + break; + case MBCS_STATE_VALID_16_PAIR: + entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry); + if(unicodeCodeUnits[entry]==0xfffe) { + localSavings+=4; + } else { + haveAssigned=true; + } + break; + default: + break; + } + } + } + if(haveAssigned) { + return -1; + } else { + return localSavings+belowSavings; + } +} + +/* helper function for finding compaction opportunities */ +static void +compactToUnicodeHelper(UCMStates *states, + uint16_t *unicodeCodeUnits, + _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks) { + int32_t state, savings; + + /* for 
each initial state */ + for(state=0; state<states->countStates; ++state) { + if((states->stateFlags[state]&0xf)==MBCS_STATE_FLAG_DIRECT) { + savings=findUnassigned(states, + unicodeCodeUnits, + toUFallbacks, countToUFallbacks, + state, 0, 0); + if(savings>0) { + printf(" all-unassigned sequences from initial state %ld use %ld bytes\n", + (long)state, (long)savings); + } + } + } +} + +U_CDECL_BEGIN +/* + * Comparison callback for uprv_sortArray(): orders two _MBCSToUFallback items by ascending offset. + * The unsigned offsets are compared explicitly instead of being subtracted, + * so that the result has the correct sign even when the difference does not fit into an int32_t. + * context is unused. + * Returns -1, 0 or 1 if fb1 sorts before, equal to, or after fb2. + */ +static int32_t U_CALLCONV +compareFallbacks(const void *context, const void *fb1, const void *fb2) { + uint32_t offset1=((const _MBCSToUFallback *)fb1)->offset; + uint32_t offset2=((const _MBCSToUFallback *)fb2)->offset; + (void)context; + return (int32_t)(offset1>offset2)-(int32_t)(offset1<offset2); +} +U_CDECL_END + +U_CAPI void U_EXPORT2 +ucm_optimizeStates(UCMStates *states, + uint16_t **pUnicodeCodeUnits, + _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks, + UBool verbose) { + UErrorCode errorCode; + int32_t state, cell, entry; + + /* test each state table entry */ + for(state=0; state<states->countStates; ++state) { + for(cell=0; cell<256; ++cell) { + entry=states->stateTable[state][cell]; + /* + * if the entry is a final one with an MBCS_STATE_VALID_DIRECT_16 action code + * and the code point is "unassigned" (0xfffe), then change it to + * the "unassigned" action code with bits 26..23 set to zero and U+fffe. 
+ */ + if(MBCS_ENTRY_SET_STATE(entry, 0)==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, 0xfffe)) { + states->stateTable[state][cell]=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_UNASSIGNED); + } + } + } + + /* try to compact the toUnicode tables */ + if(states->maxCharLength==2) { + compactToUnicode2(states, pUnicodeCodeUnits, toUFallbacks, countToUFallbacks, verbose); + } else if(states->maxCharLength>2) { + if(verbose) { + compactToUnicodeHelper(states, *pUnicodeCodeUnits, toUFallbacks, countToUFallbacks); + } + } + + /* sort toUFallbacks */ + /* + * It should be safe to sort them before compactToUnicode2() is called, + * because it should not change the relative order of the offset values + * that it adjusts, but they need to be sorted at some point, and + * it is safest here. + */ + if(countToUFallbacks>0) { + errorCode=U_ZERO_ERROR; /* nothing bad will happen... */ + uprv_sortArray(toUFallbacks, countToUFallbacks, + sizeof(_MBCSToUFallback), + compareFallbacks, nullptr, false, &errorCode); + } +} + +/* use a complete state table ----------------------------------------------- */ + +U_CAPI int32_t U_EXPORT2 +ucm_countChars(UCMStates *states, + const uint8_t *bytes, int32_t length) { + uint32_t offset; + int32_t i, entry, count; + uint8_t state; + + offset=0; + count=0; + state=0; + + if(states->countStates==0) { + fprintf(stderr, "ucm error: there is no state information!\n"); + return -1; + } + + /* for SI/SO (like EBCDIC-stateful), double-byte sequences start in state 1 */ + if(length==2 && states->outputType==MBCS_OUTPUT_2_SISO) { + state=1; + } + + /* + * Walk down the state table like in conversion, + * much like getNextUChar(). + * We assume that c<=0x10ffff. 
+ */ + for(i=0; i<length; ++i) { + entry=states->stateTable[state][bytes[i]]; + if(MBCS_ENTRY_IS_TRANSITION(entry)) { + state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry); + offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry); + } else { + switch(MBCS_ENTRY_FINAL_ACTION(entry)) { + case MBCS_STATE_ILLEGAL: + fprintf(stderr, "ucm error: byte sequence ends in illegal state\n"); + return -1; + case MBCS_STATE_CHANGE_ONLY: + fprintf(stderr, "ucm error: byte sequence ends in state-change-only\n"); + return -1; + case MBCS_STATE_UNASSIGNED: + case MBCS_STATE_FALLBACK_DIRECT_16: + case MBCS_STATE_VALID_DIRECT_16: + case MBCS_STATE_FALLBACK_DIRECT_20: + case MBCS_STATE_VALID_DIRECT_20: + case MBCS_STATE_VALID_16: + case MBCS_STATE_VALID_16_PAIR: + /* count a complete character and prepare for a new one */ + ++count; + state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); + offset=0; + break; + default: + /* reserved, must never occur */ + fprintf(stderr, "ucm error: byte sequence reached reserved action code, entry: 0x%02lx\n", (unsigned long)entry); + return -1; + } + } + } + + if(offset!=0) { + fprintf(stderr, "ucm error: byte sequence too short, ends in non-final state %u\n", state); + return -1; + } + + /* + * for SI/SO (like EBCDIC-stateful), multiple-character results + * must consist of only double-byte sequences + */ + if(count>1 && states->outputType==MBCS_OUTPUT_2_SISO && length!=2*count) { + fprintf(stderr, "ucm error: SI/SO (like EBCDIC-stateful) result with %d characters does not contain all DBCS\n", (int)count); + return -1; + } + + return count; +} +#endif + diff --git a/intl/icu/source/tools/toolutil/udbgutil.cpp b/intl/icu/source/tools/toolutil/udbgutil.cpp new file mode 100644 index 0000000000..3f4bf3718e --- /dev/null +++ b/intl/icu/source/tools/toolutil/udbgutil.cpp @@ -0,0 +1,769 @@ +// © 2016 and later: Unicode, Inc. and others. 
+// License & terms of use: http://www.unicode.org/copyright.html +/******************************************************************** + * COPYRIGHT: + * Copyright (c) 2007-2016, International Business Machines Corporation and + * others. All Rights Reserved. + ********************************************************************/ + +#include "udbgutil.h" +#include <string.h> +#include "ustr_imp.h" +#include "cmemory.h" +#include "cstring.h" +#include "putilimp.h" +#include "unicode/ulocdata.h" +#include "unicode/ucnv.h" +#include "unicode/unistr.h" +#include "cstr.h" + +/* +To add a new enum type + (For example: UShoeSize with values USHOE_WIDE=0, USHOE_REGULAR, USHOE_NARROW, USHOE_COUNT) + + 0. Make sure that all lines you add are protected with appropriate uconfig guards, + such as '#if !UCONFIG_NO_SHOES'. + 1. udbgutil.h: add UDBG_UShoeSize to the UDebugEnumType enum before UDBG_ENUM_COUNT + ( The subsequent steps involve this file, udbgutil.cpp ) + 2. Find the marker "Add new enum types above this line" + 3. Before that marker, add a #include of any header file you need. + 4. Each enum type has three things in this section: a #define, a count_, and an array of Fields. + It may help to copy and paste a previous definition. + 5. In the case of the USHOE_... strings above, "USHOE_" is common to all values- six characters + " #define LEN_USHOE 6 " + 6 characters will strip off "USHOE_" leaving enum values of WIDE, REGULAR, and NARROW. + 6. Define the 'count_' variable, with the number of enum values. If the enum has a _MAX or _COUNT value, + that can be helpful for automatically defining the count. Otherwise define it manually. + " static const int32_t count_UShoeSize = USHOE_COUNT; " + 7. Define the field names, in order. 
+ " static const Field names_UShoeSize[] = { + " FIELD_NAME_STR( LEN_USHOE, USHOE_WIDE ), + " FIELD_NAME_STR( LEN_USHOE, USHOE_REGULAR ), + " FIELD_NAME_STR( LEN_USHOE, USHOE_NARROW ), + " }; + ( The following command was used for converting ucol.h into partially correct entities ) + grep "^[ ]*UCOL" < unicode/ucol.h | + sed -e 's%^[ ]*\([A-Z]*\)_\([A-Z_]*\).*% FIELD_NAME_STR( LEN_\1, \1_\2 ),%g' + 8. Now, a bit farther down, add the name of the enum itself to the end of names_UDebugEnumType + ( UDebugEnumType is an enum, too!) + names_UDebugEnumType[] { ... + " FIELD_NAME_STR( LEN_UDBG, UDBG_UShoeSize ), " + 9. Find the function _udbg_enumCount and add the count macro: + " COUNT_CASE(UShoeSize) + 10. Find the function _udbg_enumFields and add the field macro: + " FIELD_CASE(UShoeSize) + 11. verify that your test code, and Java data generation, works properly. +*/ + +/** + * Structure representing an enum value + */ +struct Field { + int32_t prefix; /**< how many characters to remove in the prefix - e.g. 5 for "UCAL_" */ + const char *str; /**< The actual string value */ + int32_t num; /**< The numeric value */ +}; + +/** + * Define another field name. Used in an array of Field structs + * @param y the common prefix length (i.e. 6 for "USHOE_" ) + * @param x the actual enum value - it will be copied in both string and symbolic form. + * @see Field + */ +#define FIELD_NAME_STR(y,x) { y, #x, x } + + +// TODO: Currently, this whole functionality goes away with UCONFIG_NO_FORMATTING. Should be split up. 
+#if !UCONFIG_NO_FORMATTING + +// Calendar +#include "unicode/ucal.h" + +// 'UCAL_' = 5 +#define LEN_UCAL 5 /* UCAL_ */ +static const int32_t count_UCalendarDateFields = UCAL_FIELD_COUNT; +static const Field names_UCalendarDateFields[] = +{ + FIELD_NAME_STR( LEN_UCAL, UCAL_ERA ), + FIELD_NAME_STR( LEN_UCAL, UCAL_YEAR ), + FIELD_NAME_STR( LEN_UCAL, UCAL_MONTH ), + FIELD_NAME_STR( LEN_UCAL, UCAL_WEEK_OF_YEAR ), + FIELD_NAME_STR( LEN_UCAL, UCAL_WEEK_OF_MONTH ), + FIELD_NAME_STR( LEN_UCAL, UCAL_DATE ), + FIELD_NAME_STR( LEN_UCAL, UCAL_DAY_OF_YEAR ), + FIELD_NAME_STR( LEN_UCAL, UCAL_DAY_OF_WEEK ), + FIELD_NAME_STR( LEN_UCAL, UCAL_DAY_OF_WEEK_IN_MONTH ), + FIELD_NAME_STR( LEN_UCAL, UCAL_AM_PM ), + FIELD_NAME_STR( LEN_UCAL, UCAL_HOUR ), + FIELD_NAME_STR( LEN_UCAL, UCAL_HOUR_OF_DAY ), + FIELD_NAME_STR( LEN_UCAL, UCAL_MINUTE ), + FIELD_NAME_STR( LEN_UCAL, UCAL_SECOND ), + FIELD_NAME_STR( LEN_UCAL, UCAL_MILLISECOND ), + FIELD_NAME_STR( LEN_UCAL, UCAL_ZONE_OFFSET ), + FIELD_NAME_STR( LEN_UCAL, UCAL_DST_OFFSET ), + FIELD_NAME_STR( LEN_UCAL, UCAL_YEAR_WOY ), + FIELD_NAME_STR( LEN_UCAL, UCAL_DOW_LOCAL ), + FIELD_NAME_STR( LEN_UCAL, UCAL_EXTENDED_YEAR ), + FIELD_NAME_STR( LEN_UCAL, UCAL_JULIAN_DAY ), + FIELD_NAME_STR( LEN_UCAL, UCAL_MILLISECONDS_IN_DAY ), + FIELD_NAME_STR( LEN_UCAL, UCAL_IS_LEAP_MONTH ), +#ifndef U_HIDE_DRAFT_API + FIELD_NAME_STR( LEN_UCAL, UCAL_ORDINAL_MONTH ), +#endif // U_HIDE_DRAFT_API +}; + + +static const int32_t count_UCalendarMonths = UCAL_UNDECIMBER+1; +static const Field names_UCalendarMonths[] = +{ + FIELD_NAME_STR( LEN_UCAL, UCAL_JANUARY ), + FIELD_NAME_STR( LEN_UCAL, UCAL_FEBRUARY ), + FIELD_NAME_STR( LEN_UCAL, UCAL_MARCH ), + FIELD_NAME_STR( LEN_UCAL, UCAL_APRIL ), + FIELD_NAME_STR( LEN_UCAL, UCAL_MAY ), + FIELD_NAME_STR( LEN_UCAL, UCAL_JUNE ), + FIELD_NAME_STR( LEN_UCAL, UCAL_JULY ), + FIELD_NAME_STR( LEN_UCAL, UCAL_AUGUST ), + FIELD_NAME_STR( LEN_UCAL, UCAL_SEPTEMBER ), + FIELD_NAME_STR( LEN_UCAL, UCAL_OCTOBER ), + FIELD_NAME_STR( LEN_UCAL, 
UCAL_NOVEMBER ), + FIELD_NAME_STR( LEN_UCAL, UCAL_DECEMBER ), + FIELD_NAME_STR( LEN_UCAL, UCAL_UNDECIMBER) +}; + +#include "unicode/udat.h" + +#define LEN_UDAT 5 /* "UDAT_" */ +static const int32_t count_UDateFormatStyle = UDAT_SHORT+1; +static const Field names_UDateFormatStyle[] = +{ + FIELD_NAME_STR( LEN_UDAT, UDAT_FULL ), + FIELD_NAME_STR( LEN_UDAT, UDAT_LONG ), + FIELD_NAME_STR( LEN_UDAT, UDAT_MEDIUM ), + FIELD_NAME_STR( LEN_UDAT, UDAT_SHORT ), + /* end regular */ + /* + * negative enums.. leave out for now. + FIELD_NAME_STR( LEN_UDAT, UDAT_NONE ), + FIELD_NAME_STR( LEN_UDAT, UDAT_PATTERN ), + */ +}; + +#endif + +#include "unicode/uloc.h" + +#define LEN_UAR 12 /* "ULOC_ACCEPT_" */ +static const int32_t count_UAcceptResult = 3; +static const Field names_UAcceptResult[] = +{ + FIELD_NAME_STR( LEN_UAR, ULOC_ACCEPT_FAILED ), + FIELD_NAME_STR( LEN_UAR, ULOC_ACCEPT_VALID ), + FIELD_NAME_STR( LEN_UAR, ULOC_ACCEPT_FALLBACK ), +}; + +#if !UCONFIG_NO_COLLATION +#include "unicode/ucol.h" +#define LEN_UCOL 5 /* UCOL_ */ +static const int32_t count_UColAttributeValue = UCOL_ATTRIBUTE_VALUE_COUNT; +static const Field names_UColAttributeValue[] = { + FIELD_NAME_STR( LEN_UCOL, UCOL_PRIMARY ), + FIELD_NAME_STR( LEN_UCOL, UCOL_SECONDARY ), + FIELD_NAME_STR( LEN_UCOL, UCOL_TERTIARY ), +// FIELD_NAME_STR( LEN_UCOL, UCOL_CE_STRENGTH_LIMIT ), + FIELD_NAME_STR( LEN_UCOL, UCOL_QUATERNARY ), + // gap + FIELD_NAME_STR( LEN_UCOL, UCOL_IDENTICAL ), +// FIELD_NAME_STR( LEN_UCOL, UCOL_STRENGTH_LIMIT ), + FIELD_NAME_STR( LEN_UCOL, UCOL_OFF ), + FIELD_NAME_STR( LEN_UCOL, UCOL_ON ), + // gap + FIELD_NAME_STR( LEN_UCOL, UCOL_SHIFTED ), + FIELD_NAME_STR( LEN_UCOL, UCOL_NON_IGNORABLE ), + // gap + FIELD_NAME_STR( LEN_UCOL, UCOL_LOWER_FIRST ), + FIELD_NAME_STR( LEN_UCOL, UCOL_UPPER_FIRST ), +}; + +#endif + + +#if UCONFIG_ENABLE_PLUGINS +#include "unicode/icuplug.h" + +#define LEN_UPLUG_REASON 13 /* UPLUG_REASON_ */ +static const int32_t count_UPlugReason = UPLUG_REASON_COUNT; +static const Field 
names_UPlugReason[] = { + FIELD_NAME_STR( LEN_UPLUG_REASON, UPLUG_REASON_QUERY ), + FIELD_NAME_STR( LEN_UPLUG_REASON, UPLUG_REASON_LOAD ), + FIELD_NAME_STR( LEN_UPLUG_REASON, UPLUG_REASON_UNLOAD ), +}; + +#define LEN_UPLUG_LEVEL 12 /* UPLUG_LEVEL_ */ +static const int32_t count_UPlugLevel = UPLUG_LEVEL_COUNT; +static const Field names_UPlugLevel[] = { + FIELD_NAME_STR( LEN_UPLUG_LEVEL, UPLUG_LEVEL_INVALID ), + FIELD_NAME_STR( LEN_UPLUG_LEVEL, UPLUG_LEVEL_UNKNOWN ), + FIELD_NAME_STR( LEN_UPLUG_LEVEL, UPLUG_LEVEL_LOW ), + FIELD_NAME_STR( LEN_UPLUG_LEVEL, UPLUG_LEVEL_HIGH ), +}; +#endif + +#define LEN_UDBG 5 /* "UDBG_" */ +static const int32_t count_UDebugEnumType = UDBG_ENUM_COUNT; +static const Field names_UDebugEnumType[] = +{ + FIELD_NAME_STR( LEN_UDBG, UDBG_UDebugEnumType ), +#if !UCONFIG_NO_FORMATTING + FIELD_NAME_STR( LEN_UDBG, UDBG_UCalendarDateFields ), + FIELD_NAME_STR( LEN_UDBG, UDBG_UCalendarMonths ), + FIELD_NAME_STR( LEN_UDBG, UDBG_UDateFormatStyle ), +#endif +#if UCONFIG_ENABLE_PLUGINS + FIELD_NAME_STR( LEN_UDBG, UDBG_UPlugReason ), + FIELD_NAME_STR( LEN_UDBG, UDBG_UPlugLevel ), +#endif + FIELD_NAME_STR( LEN_UDBG, UDBG_UAcceptResult ), +#if !UCONFIG_NO_COLLATION + FIELD_NAME_STR( LEN_UDBG, UDBG_UColAttributeValue ), +#endif +}; + + +// --- Add new enum types above this line --- + +#define COUNT_CASE(x) case UDBG_##x: return (actual?count_##x:UPRV_LENGTHOF(names_##x)); +#define COUNT_FAIL_CASE(x) case UDBG_##x: return -1; + +#define FIELD_CASE(x) case UDBG_##x: return names_##x; +#define FIELD_FAIL_CASE(x) case UDBG_##x: return nullptr; + +// low level + +/** + * @param type type of item + * @param actual true: for the actual enum's type (UCAL_FIELD_COUNT, etc), or false for the string count + */ +static int32_t _udbg_enumCount(UDebugEnumType type, UBool actual) { + switch(type) { + COUNT_CASE(UDebugEnumType) +#if !UCONFIG_NO_FORMATTING + COUNT_CASE(UCalendarDateFields) + COUNT_CASE(UCalendarMonths) + COUNT_CASE(UDateFormatStyle) +#endif +#if 
UCONFIG_ENABLE_PLUGINS + COUNT_CASE(UPlugReason) + COUNT_CASE(UPlugLevel) +#endif + COUNT_CASE(UAcceptResult) +#if !UCONFIG_NO_COLLATION + COUNT_CASE(UColAttributeValue) +#endif + // COUNT_FAIL_CASE(UNonExistentEnum) + default: + return -1; + } +} + +static const Field* _udbg_enumFields(UDebugEnumType type) { + switch(type) { + FIELD_CASE(UDebugEnumType) +#if !UCONFIG_NO_FORMATTING + FIELD_CASE(UCalendarDateFields) + FIELD_CASE(UCalendarMonths) + FIELD_CASE(UDateFormatStyle) +#endif +#if UCONFIG_ENABLE_PLUGINS + FIELD_CASE(UPlugReason) + FIELD_CASE(UPlugLevel) +#endif + FIELD_CASE(UAcceptResult) + // FIELD_FAIL_CASE(UNonExistentEnum) +#if !UCONFIG_NO_COLLATION + FIELD_CASE(UColAttributeValue) +#endif + default: + return nullptr; + } +} + +// implementation + +int32_t udbg_enumCount(UDebugEnumType type) { + return _udbg_enumCount(type, false); +} + +int32_t udbg_enumExpectedCount(UDebugEnumType type) { + return _udbg_enumCount(type, true); +} + +const char * udbg_enumName(UDebugEnumType type, int32_t field) { + if(field<0 || + field>=_udbg_enumCount(type,false)) { // also will catch unsupported items + return nullptr; + } else { + const Field *fields = _udbg_enumFields(type); + if(fields == nullptr) { + return nullptr; + } else { + return fields[field].str + fields[field].prefix; + } + } +} + +int32_t udbg_enumArrayValue(UDebugEnumType type, int32_t field) { + if(field<0 || + field>=_udbg_enumCount(type,false)) { // also will catch unsupported items + return -1; + } else { + const Field *fields = _udbg_enumFields(type); + if(fields == nullptr) { + return -1; + } else { + return fields[field].num; + } + } +} + +int32_t udbg_enumByName(UDebugEnumType type, const char *value) { + if(type<0||type>=_udbg_enumCount(UDBG_UDebugEnumType, true)) { + return -1; // type out of range + } + const Field *fields = _udbg_enumFields(type); + if (fields != nullptr) { + for(int32_t field = 0;field<_udbg_enumCount(type, false);field++) { + if(!strcmp(value, fields[field].str + 
fields[field].prefix)) { + return fields[field].num; + } + } + // try with the prefix + for(int32_t field = 0;field<_udbg_enumCount(type, false);field++) { + if(!strcmp(value, fields[field].str)) { + return fields[field].num; + } + } + } + // fail + return -1; +} + +/* platform info */ +/** + * Print the current platform + */ +U_CAPI const char *udbg_getPlatform() +{ +#if U_PLATFORM_USES_ONLY_WIN32_API + return "Windows"; +#elif U_PLATFORM == U_PF_CYGWIN + return "Cygwin"; +#elif U_PLATFORM == U_PF_UNKNOWN + return "unknown"; +#elif U_PLATFORM == U_PF_DARWIN + return "Darwin"; +#elif U_PLATFORM == U_PF_BSD + return "BSD"; +#elif U_PLATFORM == U_PF_QNX + return "QNX"; +#elif U_PLATFORM == U_PF_LINUX + return "Linux"; +#elif U_PLATFORM == U_PF_ANDROID + return "Android"; +#elif U_PLATFORM == U_PF_CLASSIC_MACOS + return "MacOS (Classic)"; +#elif U_PLATFORM == U_PF_OS390 + return "IBM z"; +#elif U_PLATFORM == U_PF_OS400 + return "IBM i"; +#else + return "Other (POSIX-like)"; +#endif +} + +struct USystemParams; + +typedef int32_t U_CALLCONV USystemParameterCallback(const USystemParams *param, char *target, int32_t targetCapacity, UErrorCode *status); + +struct USystemParams { + const char *paramName; + USystemParameterCallback *paramFunction; + const char *paramStr; + int32_t paramInt; +}; + +/* parameter types */ +U_CAPI int32_t +paramEmpty(const USystemParams * /* param */, char *target, int32_t targetCapacity, UErrorCode *status) { + if(U_FAILURE(*status))return 0; + return u_terminateChars(target, targetCapacity, 0, status); +} + +U_CAPI int32_t +paramStatic(const USystemParams *param, char *target, int32_t targetCapacity, UErrorCode *status) { + if(param->paramStr==nullptr) return paramEmpty(param,target,targetCapacity,status); + if(U_FAILURE(*status))return 0; + int32_t len = static_cast<int32_t>(uprv_strlen(param->paramStr)); + if(target!=nullptr) { + uprv_strncpy(target,param->paramStr,uprv_min(len,targetCapacity)); + } + return u_terminateChars(target, 
targetCapacity, len, status); +} + +static const char *nullString = "(null)"; + +static int32_t stringToStringBuffer(char *target, int32_t targetCapacity, const char *str, UErrorCode *status) { + if(str==nullptr) str=nullString; + + int32_t len = static_cast<int32_t>(uprv_strlen(str)); + if (U_SUCCESS(*status)) { + if(target!=nullptr) { + uprv_strncpy(target,str,uprv_min(len,targetCapacity)); + } + } else { + const char *s = u_errorName(*status); + len = static_cast<int32_t>(uprv_strlen(s)); + if(target!=nullptr) { + uprv_strncpy(target,s,uprv_min(len,targetCapacity)); + } + } + return u_terminateChars(target, targetCapacity, len, status); +} + +static int32_t integerToStringBuffer(char *target, int32_t targetCapacity, int32_t n, int32_t radix, UErrorCode *status) { + if(U_FAILURE(*status)) return 0; + char str[300]; + T_CString_integerToString(str,n,radix); + return stringToStringBuffer(target,targetCapacity,str,status); +} + +U_CAPI int32_t +paramInteger(const USystemParams *param, char *target, int32_t targetCapacity, UErrorCode *status) { + if(U_FAILURE(*status))return 0; + if(param->paramStr==nullptr || param->paramStr[0]=='d') { + return integerToStringBuffer(target,targetCapacity,param->paramInt, 10,status); + } else if(param->paramStr[0]=='x') { + return integerToStringBuffer(target,targetCapacity,param->paramInt, 16,status); + } else if(param->paramStr[0]=='o') { + return integerToStringBuffer(target,targetCapacity,param->paramInt, 8,status); + } else if(param->paramStr[0]=='b') { + return integerToStringBuffer(target,targetCapacity,param->paramInt, 2,status); + } else { + *status = U_INTERNAL_PROGRAM_ERROR; + return 0; + } +} + + +U_CAPI int32_t +paramCldrVersion(const USystemParams * /* param */, char *target, int32_t targetCapacity, UErrorCode *status) { + if(U_FAILURE(*status))return 0; + char str[200]=""; + UVersionInfo icu; + + ulocdata_getCLDRVersion(icu, status); + if(U_SUCCESS(*status)) { + u_versionToString(icu, str); + return 
stringToStringBuffer(target,targetCapacity,str,status); + } else { + return 0; + } +} + + +#if !UCONFIG_NO_FORMATTING +U_CAPI int32_t +paramTimezoneDefault(const USystemParams * /* param */, char *target, int32_t targetCapacity, UErrorCode *status) { + if(U_FAILURE(*status))return 0; + char16_t buf[100]; + char buf2[100]; + int32_t len; + + len = ucal_getDefaultTimeZone(buf, 100, status); + if(U_SUCCESS(*status)&&len>0) { + u_UCharsToChars(buf, buf2, len+1); + return stringToStringBuffer(target,targetCapacity, buf2,status); + } else { + return 0; + } +} +#endif + +U_CAPI int32_t +paramLocaleDefaultBcp47(const USystemParams * /* param */, char *target, int32_t targetCapacity, UErrorCode *status) { + if(U_FAILURE(*status))return 0; + const char *def = uloc_getDefault(); + return uloc_toLanguageTag(def,target,targetCapacity,false,status); +} + + +/* simple 1-liner param functions */ +#define STRING_PARAM(func, str) U_CAPI int32_t \ + func(const USystemParams *, char *target, int32_t targetCapacity, UErrorCode *status) \ + { return stringToStringBuffer(target,targetCapacity,(str),status); } + +STRING_PARAM(paramIcudataPath, u_getDataDirectory()) +STRING_PARAM(paramPlatform, udbg_getPlatform()) +STRING_PARAM(paramLocaleDefault, uloc_getDefault()) +#if !UCONFIG_NO_CONVERSION +STRING_PARAM(paramConverterDefault, ucnv_getDefaultName()) +#endif + +#if !UCONFIG_NO_FORMATTING +STRING_PARAM(paramTimezoneVersion, ucal_getTZDataVersion(status)) +#endif + +static const USystemParams systemParams[] = { + { "copyright", paramStatic, U_COPYRIGHT_STRING,0 }, + { "product", paramStatic, "icu4c",0 }, + { "product.full", paramStatic, "International Components for Unicode for C/C++",0 }, + { "version", paramStatic, U_ICU_VERSION,0 }, + { "version.unicode", paramStatic, U_UNICODE_VERSION,0 }, + { "platform.number", paramInteger, "d",U_PLATFORM}, + { "platform.type", paramPlatform, nullptr ,0}, + { "locale.default", paramLocaleDefault, nullptr, 0}, + { "locale.default.bcp47", 
paramLocaleDefaultBcp47, nullptr, 0}, +#if !UCONFIG_NO_CONVERSION + { "converter.default", paramConverterDefault, nullptr, 0}, +#endif + { "icudata.name", paramStatic, U_ICUDATA_NAME, 0}, + { "icudata.path", paramIcudataPath, nullptr, 0}, + + { "cldr.version", paramCldrVersion, nullptr, 0}, + +#if !UCONFIG_NO_FORMATTING + { "tz.version", paramTimezoneVersion, nullptr, 0}, + { "tz.default", paramTimezoneDefault, nullptr, 0}, +#endif + + { "cpu.bits", paramInteger, "d", (sizeof(void*))*8}, + { "cpu.big_endian", paramInteger, "b", U_IS_BIG_ENDIAN}, + { "os.wchar_width", paramInteger, "d", U_SIZEOF_WCHAR_T}, + { "os.charset_family", paramInteger, "d", U_CHARSET_FAMILY}, +#if defined (U_HOST) + { "os.host", paramStatic, U_HOST, 0}, +#endif +#if defined (U_BUILD) + { "build.build", paramStatic, U_BUILD, 0}, +#endif +#if defined (U_CC) + { "build.cc", paramStatic, U_CC, 0}, +#endif +#if defined (U_CXX) + { "build.cxx", paramStatic, U_CXX, 0}, +#endif +#if defined (CYGWINMSVC) + { "build.cygwinmsvc", paramInteger, "b", 1}, +#endif + { "uconfig.internal_digitlist", paramInteger, "b", 1}, /* always 1 */ + { "uconfig.have_parseallinput", paramInteger, "b", UCONFIG_HAVE_PARSEALLINPUT}, + + +}; + +#define U_SYSPARAM_COUNT UPRV_LENGTHOF(systemParams) + +U_CAPI const char *udbg_getSystemParameterNameByIndex(int32_t i) { + if(i>=0 && i < (int32_t)U_SYSPARAM_COUNT) { + return systemParams[i].paramName; + } else { + return nullptr; + } +} + + +U_CAPI int32_t udbg_getSystemParameterValueByIndex(int32_t i, char *buffer, int32_t bufferCapacity, UErrorCode *status) { + if(i>=0 && i< (int32_t)U_SYSPARAM_COUNT) { + return systemParams[i].paramFunction(&(systemParams[i]),buffer,bufferCapacity,status); + } else { + return 0; + } +} + +U_CAPI void udbg_writeIcuInfo(FILE *out) { + char str[2000]; + /* todo: API for writing DTD? 
*/ + fprintf(out, " <icuSystemParams type=\"icu4c\">\n"); + const char *paramName; + for(int32_t i=0;(paramName=udbg_getSystemParameterNameByIndex(i))!=nullptr;i++) { + UErrorCode status2 = U_ZERO_ERROR; + udbg_getSystemParameterValueByIndex(i, str,2000,&status2); + if(U_SUCCESS(status2)) { + fprintf(out," <param name=\"%s\">%s</param>\n", paramName,str); + } else { + fprintf(out," <!-- n=\"%s\" ERROR: %s -->\n", paramName, u_errorName(status2)); + } + } + fprintf(out, " </icuSystemParams>\n"); +} + +#define UNICODE_BUG_URL "https://unicode-org.atlassian.net/browse/" +#define OLD_CLDR_PREFIX "cldrbug:" +#define CLDR_BUG_PREFIX "CLDR-" +#define ICU_BUG_PREFIX "ICU-" + + + +#include <set> +#include <map> +#include <string> +#include <ostream> +#include <iostream> + +class KnownIssues { +public: + KnownIssues(); + ~KnownIssues(); + void add(const char *ticket, const char *where, const char16_t *msg, UBool *firstForTicket, UBool *firstForWhere); + void add(const char *ticket, const char *where, const char *msg, UBool *firstForTicket, UBool *firstForWhere); + UBool print(); +private: + std::map< std::string, + std::map < std::string, std::set < std::string > > > fTable; +}; + +KnownIssues::KnownIssues() + : fTable() +{ +} + +KnownIssues::~KnownIssues() +{ +} + +/** + * Map cldr:1234 to CLDR-1234 + * Map 1234 to ICU-1234 + */ +static std::string mapTicketId(const char *ticketStr) { + std::string ticket(ticketStr); + // TODO: Can remove this function once all logKnownIssue calls are switched over + // to the ICU-1234 and CLDR-1234 format. 
+ if(ticket.rfind(OLD_CLDR_PREFIX) == 0) { + // map cldrbug:1234 to CLDR-1234 + ticket.replace(0, uprv_strlen(OLD_CLDR_PREFIX), CLDR_BUG_PREFIX); + } else if(::isdigit(ticket[0])) { + // map 1234 to ICU-1234 + ticket.insert(0, ICU_BUG_PREFIX); + } + return ticket; +} + +void KnownIssues::add(const char *ticketStr, const char *where, const char16_t *msg, UBool *firstForTicket, UBool *firstForWhere) +{ + const std::string ticket = mapTicketId(ticketStr); + if(fTable.find(ticket) == fTable.end()) { + if(firstForTicket!=nullptr) *firstForTicket = true; + fTable[ticket] = std::map < std::string, std::set < std::string > >(); + } else { + if(firstForTicket!=nullptr) *firstForTicket = false; + } + if(where==nullptr) return; + + if(fTable[ticket].find(where) == fTable[ticket].end()) { + if(firstForWhere!=nullptr) *firstForWhere = true; + fTable[ticket][where] = std::set < std::string >(); + } else { + if(firstForWhere!=nullptr) *firstForWhere = false; + } + if(msg==nullptr || !*msg) return; + + const icu::UnicodeString ustr(msg); + + fTable[ticket][where].insert(std::string(icu::CStr(ustr)())); +} + +void KnownIssues::add(const char *ticketStr, const char *where, const char *msg, UBool *firstForTicket, UBool *firstForWhere) +{ + const std::string ticket = mapTicketId(ticketStr); + if(fTable.find(ticket) == fTable.end()) { + if(firstForTicket!=nullptr) *firstForTicket = true; + fTable[ticket] = std::map < std::string, std::set < std::string > >(); + } else { + if(firstForTicket!=nullptr) *firstForTicket = false; + } + if(where==nullptr) return; + + if(fTable[ticket].find(where) == fTable[ticket].end()) { + if(firstForWhere!=nullptr) *firstForWhere = true; + fTable[ticket][where] = std::set < std::string >(); + } else { + if(firstForWhere!=nullptr) *firstForWhere = false; + } + if(msg==nullptr || !*msg) return; + + std::string str(msg); + fTable[ticket][where].insert(str); +} + +UBool KnownIssues::print() +{ + if(fTable.empty()) { + return false; + } + + std::cout << "KNOWN 
ISSUES" << std::endl; + for( std::map< std::string, + std::map < std::string, std::set < std::string > > >::iterator i = fTable.begin(); + i != fTable.end(); + i++ ) { + const std::string ticketid = (*i).first; + std::cout << "[" << ticketid << "] "; + if(ticketid.rfind(ICU_BUG_PREFIX) == 0 || ticketid.rfind(CLDR_BUG_PREFIX) == 0) { + // If it's a unicode.org bug. + std::cout << UNICODE_BUG_URL << ticketid; + } // Else: some other kind of bug. Allow this, but without a URL. + std::cout << std::endl; + + for( std::map< std::string, std::set < std::string > >::iterator ii = (*i).second.begin(); + ii != (*i).second.end(); + ii++ ) { + std::cout << " " << (*ii).first << std::endl; + for ( std::set < std::string >::iterator iii = (*ii).second.begin(); + iii != (*ii).second.end(); + iii++ ) { + std::cout << " " << '"' << (*iii) << '"' << std::endl; + } + } + } + return true; +} + +U_CAPI void *udbg_knownIssue_openU(void *ptr, const char *ticket, char *where, const char16_t *msg, UBool *firstForTicket, + UBool *firstForWhere) { + KnownIssues *t = static_cast<KnownIssues*>(ptr); + if(t==nullptr) { + t = new KnownIssues(); + } + + t->add(ticket, where, msg, firstForTicket, firstForWhere); + + return static_cast<void*>(t); +} + +U_CAPI void *udbg_knownIssue_open(void *ptr, const char *ticket, char *where, const char *msg, UBool *firstForTicket, + UBool *firstForWhere) { + KnownIssues *t = static_cast<KnownIssues*>(ptr); + if(t==nullptr) { + t = new KnownIssues(); + } + + t->add(ticket, where, msg, firstForTicket, firstForWhere); + + return static_cast<void*>(t); +} + +U_CAPI UBool udbg_knownIssue_print(void *ptr) { + KnownIssues *t = static_cast<KnownIssues*>(ptr); + if(t==nullptr) { + return false; + } else { + t->print(); + return true; + } +} + +U_CAPI void udbg_knownIssue_close(void *ptr) { + KnownIssues *t = static_cast<KnownIssues*>(ptr); + delete t; +} diff --git a/intl/icu/source/tools/toolutil/udbgutil.h b/intl/icu/source/tools/toolutil/udbgutil.h new file mode 
100644 index 0000000000..e3ed513839 --- /dev/null +++ b/intl/icu/source/tools/toolutil/udbgutil.h @@ -0,0 +1,147 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +************************************************************************ +* Copyright (c) 2008-2015, International Business Machines +* Corporation and others. All Rights Reserved. +************************************************************************ +*/ + +/** C Utilities to aid in debugging **/ + +#ifndef _UDBGUTIL_H +#define _UDBGUTIL_H + +#include "unicode/utypes.h" +#include <stdio.h> + +enum UDebugEnumType { + UDBG_UDebugEnumType = 0, /* Self-referential, strings for UDebugEnumType. Count=ENUM_COUNT. */ +#if !UCONFIG_NO_FORMATTING + UDBG_UCalendarDateFields, /* UCalendarDateFields. Count=UCAL_FIELD_COUNT. Unsupported if UCONFIG_NO_FORMATTING. */ + UDBG_UCalendarMonths, /* UCalendarMonths. Count= (UCAL_UNDECIMBER+1) */ + UDBG_UDateFormatStyle, /* Count = UDAT_SHORT=1 */ +#endif +#if UCONFIG_ENABLE_PLUGINS + UDBG_UPlugReason, /* Count = UPLUG_REASON_COUNT */ + UDBG_UPlugLevel, /* COUNT = UPLUG_LEVEL_COUNT */ +#endif + UDBG_UAcceptResult, /* Count = ULOC_ACCEPT_FALLBACK+1=3 */ + + /* All following enums may be discontiguous. */ + +#if !UCONFIG_NO_COLLATION + UDBG_UColAttributeValue, /* UCOL_ATTRIBUTE_VALUE_COUNT */ +#endif + UDBG_ENUM_COUNT, + UDBG_HIGHEST_CONTIGUOUS_ENUM = UDBG_UAcceptResult, /**< last enum in this list with contiguous (testable) values. */ + UDBG_INVALID_ENUM = -1 /** Invalid enum value **/ +}; + +typedef enum UDebugEnumType UDebugEnumType; + +/** + * @param type the type of enum + * Print how many enums are contained for this type. + * Should be equal to the appropriate _COUNT constant or there is an error. Return -1 if unsupported. 
+ */ +U_CAPI int32_t U_EXPORT2 udbg_enumCount(UDebugEnumType type); + +/** + * Convert an enum to a string + * @param type type of enum + * @param field field number + * @return string of the format "ERA", "YEAR", etc, or NULL if out of range or unsupported + */ +U_CAPI const char * U_EXPORT2 udbg_enumName(UDebugEnumType type, int32_t field); + +/** + * for consistency checking + * @param type the type of enum + * Print how many enums should be contained for this type. + * This is equal to the appropriate _COUNT constant or there is an error. Returns -1 if unsupported. + */ +U_CAPI int32_t U_EXPORT2 udbg_enumExpectedCount(UDebugEnumType type); + +/** + * For consistency checking, returns the expected enum ordinal value for the given index value. + * @param type which type + * @param field field number + * @return should be equal to 'field' or -1 if out of range. + */ +U_CAPI int32_t U_EXPORT2 udbg_enumArrayValue(UDebugEnumType type, int32_t field); + +/** + * Locate the specified field value by name. + * @param type which type + * @param name name of string (case sensitive) + * @return should be a field value or -1 if not found. 
+ */ +U_CAPI int32_t U_EXPORT2 udbg_enumByName(UDebugEnumType type, const char *name); + + +/** + * Return the Platform (U_PLATFORM) as a string + */ +U_CAPI const char *udbg_getPlatform(void); + +/** + * Get the nth system parameter's name + * @param i index of name, starting from zero + * @return name, or NULL if off the end + * @see udbg_getSystemParameterValueByIndex + */ +U_CAPI const char *udbg_getSystemParameterNameByIndex(int32_t i); + +/** + * Get the nth system parameter's value, in a user supplied buffer + * @param i index of value, starting from zero + * @param status error status + * @return length written (standard termination rules), or 0 if i is out of range + * @see udbg_getSystemParameterNameByIndex + */ +U_CAPI int32_t udbg_getSystemParameterValueByIndex(int32_t i, char *buffer, int32_t bufferCapacity, UErrorCode *status); + +/** + * Write ICU info as XML + */ +U_CAPI void udbg_writeIcuInfo(FILE *f); + +/** + * \def UDBG_KNOWNISSUE_LEN + * Length of output buffer for udbg_knownIssueURLFrom + */ +#define UDBG_KNOWNISSUE_LEN 255 + +/** + * Open (or reopen) a 'known issue' table and record one issue in it. + * @param ptr pointer to 'table', or NULL to create a new one. Opaque. + * @return new or existing ptr + */ +U_CAPI void *udbg_knownIssue_openU(void *ptr, const char *ticket, char *where, const UChar *msg, UBool *firstForTicket, + UBool *firstForWhere); + + +/** + * Open (or reopen) a 'known issue' table and record one issue in it. + * @param ptr pointer to 'table', or NULL to create a new one. Opaque. + * @return new or existing ptr + */ +U_CAPI void *udbg_knownIssue_open(void *ptr, const char *ticket, char *where, const char *msg, UBool *firstForTicket, + UBool *firstForWhere); + +/** + * Print 'known issue' table, to std::cout. + * @param ptr pointer from udbg_knownIssue_open or udbg_knownIssue_openU + * @return true if there were any issues. + */ +U_CAPI UBool udbg_knownIssue_print(void *ptr); + +/** + * Close 'known issue' table. 
+ * @param ptr + */ +U_CAPI void udbg_knownIssue_close(void *ptr); + + +#endif diff --git a/intl/icu/source/tools/toolutil/unewdata.cpp b/intl/icu/source/tools/toolutil/unewdata.cpp new file mode 100644 index 0000000000..27414d2eba --- /dev/null +++ b/intl/icu/source/tools/toolutil/unewdata.cpp @@ -0,0 +1,286 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 1999-2010, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: unewdata.c +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 1999oct25 +* created by: Markus W. Scherer +*/ + +#include <stdio.h> +#include "unicode/utypes.h" +#include "unicode/putil.h" +#include "unicode/ustring.h" +#include "cmemory.h" +#include "cstring.h" +#include "filestrm.h" +#include "unicode/udata.h" +#include "unewdata.h" + +struct UNewDataMemory { + FileStream *file; + uint16_t headerSize; + uint8_t magic1, magic2; +}; + +U_CAPI UNewDataMemory * U_EXPORT2 +udata_create(const char *dir, const char *type, const char *name, + const UDataInfo *pInfo, + const char *comment, + UErrorCode *pErrorCode) { + UNewDataMemory *pData; + uint16_t headerSize, commentLength; + char filename[512]; + uint8_t bytes[16]; + int32_t length; + + if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) { + return nullptr; + } else if(name==nullptr || *name==0 || pInfo==nullptr) { + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; + return nullptr; + } + + /* allocate the data structure */ + pData=(UNewDataMemory *)uprv_malloc(sizeof(UNewDataMemory)); + if(pData==nullptr) { + *pErrorCode=U_MEMORY_ALLOCATION_ERROR; + return nullptr; + } + + char dirSepChar = U_FILE_SEP_CHAR; +#if (U_FILE_SEP_CHAR != U_FILE_ALT_SEP_CHAR) + // We may need to append a different 
directory separator when building for Cygwin or MSYS2. + if(dir && *dir) { + if(!uprv_strchr(dir, U_FILE_SEP_CHAR) && uprv_strchr(dir, U_FILE_ALT_SEP_CHAR)) { + dirSepChar = U_FILE_ALT_SEP_CHAR; + } + } +#endif + + /* Check that the full path won't be too long */ + length = 0; /* Start with nothing */ + if(dir != nullptr && *dir !=0) /* Add directory length if one was given */ + { + length += static_cast<int32_t>(strlen(dir)); + + /* Add 1 if dir doesn't end with path sep */ + if (dir[strlen(dir) - 1]!= dirSepChar) { + length++; + } + } + length += static_cast<int32_t>(strlen(name)); /* Add the filename length */ + + if(type != nullptr && *type !=0) { /* Add directory length if given */ + length += static_cast<int32_t>(strlen(type)); + } + + + /* LDH buffer Length error check */ + if(length > ((int32_t)sizeof(filename) - 1)) + { + *pErrorCode = U_BUFFER_OVERFLOW_ERROR; + uprv_free(pData); + return nullptr; + } + + /* open the output file */ + if(dir!=nullptr && *dir!=0) { /* if dir has a value, we prepend it to the filename */ + char *p=filename+strlen(dir); + uprv_strcpy(filename, dir); + if (*(p-1)!=dirSepChar) { + *p++=dirSepChar; + *p=0; + } + } else { /* otherwise, we'll output to the current dir */ + filename[0]=0; + } + uprv_strcat(filename, name); + if(type!=nullptr && *type!=0) { + uprv_strcat(filename, "."); + uprv_strcat(filename, type); + } + pData->file=T_FileStream_open(filename, "wb"); + if(pData->file==nullptr) { + uprv_free(pData); + *pErrorCode=U_FILE_ACCESS_ERROR; + return nullptr; + } + + /* write the header information */ + headerSize=(uint16_t)(pInfo->size+4); + if(comment!=nullptr && *comment!=0) { + commentLength=(uint16_t)(uprv_strlen(comment)+1); + headerSize+=commentLength; + } else { + commentLength=0; + } + + /* write the size of the header, take padding into account */ + pData->headerSize=(uint16_t)((headerSize+15)&~0xf); + pData->magic1=0xda; + pData->magic2=0x27; + T_FileStream_write(pData->file, &pData->headerSize, 4); + + /* write 
the information data */ + T_FileStream_write(pData->file, pInfo, pInfo->size); + + /* write the comment */ + if(commentLength>0) { + T_FileStream_write(pData->file, comment, commentLength); + } + + /* write padding bytes to align the data section to 16 bytes */ + headerSize&=0xf; + if(headerSize!=0) { + headerSize=(uint16_t)(16-headerSize); + uprv_memset(bytes, 0, headerSize); + T_FileStream_write(pData->file, bytes, headerSize); + } + + return pData; +} + +U_CAPI uint32_t U_EXPORT2 +udata_finish(UNewDataMemory *pData, UErrorCode *pErrorCode) { + uint32_t fileLength=0; + + if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) { + return 0; + } + + if(pData!=nullptr) { + if(pData->file!=nullptr) { + /* fflush(pData->file);*/ + fileLength=T_FileStream_size(pData->file); + if(T_FileStream_error(pData->file)) { + *pErrorCode=U_FILE_ACCESS_ERROR; + } else { + fileLength-=pData->headerSize; + } + T_FileStream_close(pData->file); + } + uprv_free(pData); + } + + return fileLength; +} + +/* dummy UDataInfo cf. 
udata.h */ +static const UDataInfo dummyDataInfo = { + sizeof(UDataInfo), + 0, + + U_IS_BIG_ENDIAN, + U_CHARSET_FAMILY, + U_SIZEOF_UCHAR, + 0, + + { 0, 0, 0, 0 }, /* dummy dataFormat */ + { 0, 0, 0, 0 }, /* dummy formatVersion */ + { 0, 0, 0, 0 } /* dummy dataVersion */ +}; + +U_CAPI void U_EXPORT2 +udata_createDummy(const char *dir, const char *type, const char *name, UErrorCode *pErrorCode) { + if(U_SUCCESS(*pErrorCode)) { + udata_finish(udata_create(dir, type, name, &dummyDataInfo, nullptr, pErrorCode), pErrorCode); + if(U_FAILURE(*pErrorCode)) { + fprintf(stderr, "error %s writing dummy data file %s" U_FILE_SEP_STRING "%s.%s\n", + u_errorName(*pErrorCode), dir, name, type); + exit(*pErrorCode); + } + } +} + +U_CAPI void U_EXPORT2 +udata_write8(UNewDataMemory *pData, uint8_t byte) { + if(pData!=nullptr && pData->file!=nullptr) { + T_FileStream_write(pData->file, &byte, 1); + } +} + +U_CAPI void U_EXPORT2 +udata_write16(UNewDataMemory *pData, uint16_t word) { + if(pData!=nullptr && pData->file!=nullptr) { + T_FileStream_write(pData->file, &word, 2); + } +} + +U_CAPI void U_EXPORT2 +udata_write32(UNewDataMemory *pData, uint32_t wyde) { + if(pData!=nullptr && pData->file!=nullptr) { + T_FileStream_write(pData->file, &wyde, 4); + } +} + +U_CAPI void U_EXPORT2 +udata_writeBlock(UNewDataMemory *pData, const void *s, int32_t length) { + if(pData!=nullptr && pData->file!=nullptr) { + if(length>0) { + T_FileStream_write(pData->file, s, length); + } + } +} + +U_CAPI void U_EXPORT2 +udata_writePadding(UNewDataMemory *pData, int32_t length) { + static const uint8_t padding[16]={ + 0xaa, 0xaa, 0xaa, 0xaa, + 0xaa, 0xaa, 0xaa, 0xaa, + 0xaa, 0xaa, 0xaa, 0xaa, + 0xaa, 0xaa, 0xaa, 0xaa + }; + if(pData!=nullptr && pData->file!=nullptr) { + while(length>=16) { + T_FileStream_write(pData->file, padding, 16); + length-=16; + } + if(length>0) { + T_FileStream_write(pData->file, padding, length); + } + } +} + +U_CAPI void U_EXPORT2 +udata_writeString(UNewDataMemory *pData, const char 
*s, int32_t length) { + if(pData!=nullptr && pData->file!=nullptr) { + if(length==-1) { + length=(int32_t)uprv_strlen(s); + } + if(length>0) { + T_FileStream_write(pData->file, s, length); + } + } +} + +U_CAPI void U_EXPORT2 +udata_writeUString(UNewDataMemory *pData, const char16_t *s, int32_t length) { + if(pData!=nullptr && pData->file!=nullptr) { + if(length==-1) { + length=u_strlen(s); + } + if(length>0) { + T_FileStream_write(pData->file, s, length*sizeof(char16_t)); + } + } +} + +/* + * Hey, Emacs, please set the following: + * + * Local Variables: + * indent-tabs-mode: nil + * End: + * + */ + diff --git a/intl/icu/source/tools/toolutil/unewdata.h b/intl/icu/source/tools/toolutil/unewdata.h new file mode 100644 index 0000000000..137fb49584 --- /dev/null +++ b/intl/icu/source/tools/toolutil/unewdata.h @@ -0,0 +1,113 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 1999-2010, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: unewdata.h +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 1999oct25 +* created by: Markus W. Scherer +*/ + +#ifndef __UNEWDATA_H__ +#define __UNEWDATA_H__ + +#include "unicode/utypes.h" +#include "unicode/udata.h" + +/* API for writing data -----------------------------------------------------*/ + +/** @memo Forward declaration of the data memory creation type. */ +typedef struct UNewDataMemory UNewDataMemory; + +/** + * Create a new binary data file. + * The file-writing <code>udata_</code> functions facilitate writing + * binary data files that can be read by ICU's <code>udata</code> API. + * This function opens a new file with a filename determined from its + * parameters - of the form "name.type". 
+ * It then writes a short header, followed by the <code>UDataInfo</code> + * structure and, optionally, by the comment string. + * It then writes padding bytes to round up to a multiple of 16 bytes. + * Subsequent write operations will thus start at an offset in the file + * that is a multiple of 16. <code>udata_getMemory()</code> will return + * a pointer to this same starting offset. + * + * See udata.h . + * + * @param dir A string that specifies the directory where the data will be + * written. If <code>NULL</code>, then + * <code>u_getDataDirectory</code> is used. + * @param type A string that specifies the type of data to be written. + * For example, resource bundles are written with type "res", + * conversion tables with type "cnv". + * This may be <code>NULL</code> or empty. + * @param name A string that specifies the name of the data. + * @param pInfo A pointer to a correctly filled <code>UDataInfo</code> + * structure that will be copied into the file. + * @param comment A string (e.g., a copyright statement) that will be + * copied into the file if it is not <code>NULL</code> + * or empty. This string serves only as a comment in the binary + * file. It will not be accessible by any API. + * @param pErrorCode An ICU UErrorCode parameter. It must not be <code>NULL</code>. + */ +U_CAPI UNewDataMemory * U_EXPORT2 +udata_create(const char *dir, const char *type, const char *name, + const UDataInfo *pInfo, + const char *comment, + UErrorCode *pErrorCode); + +/** @memo Close a newly written binary file. */ +U_CAPI uint32_t U_EXPORT2 +udata_finish(UNewDataMemory *pData, UErrorCode *pErrorCode); + +/** @memo Write a dummy data file. */ +U_CAPI void U_EXPORT2 +udata_createDummy(const char *dir, const char *type, const char *name, UErrorCode *pErrorCode); + +/** @memo Write an 8-bit byte to the file. */ +U_CAPI void U_EXPORT2 +udata_write8(UNewDataMemory *pData, uint8_t byte); + +/** @memo Write a 16-bit word to the file. 
*/ +U_CAPI void U_EXPORT2 +udata_write16(UNewDataMemory *pData, uint16_t word); + +/** @memo Write a 32-bit word to the file. */ +U_CAPI void U_EXPORT2 +udata_write32(UNewDataMemory *pData, uint32_t wyde); + +/** @memo Write a block of bytes to the file. */ +U_CAPI void U_EXPORT2 +udata_writeBlock(UNewDataMemory *pData, const void *s, int32_t length); + +/** @memo Write a block of arbitrary padding bytes to the file. */ +U_CAPI void U_EXPORT2 +udata_writePadding(UNewDataMemory *pData, int32_t length); + +/** @memo Write a <code>char*</code> string of platform "invariant characters" to the file. */ +U_CAPI void U_EXPORT2 +udata_writeString(UNewDataMemory *pData, const char *s, int32_t length); + +/** @memo Write a <code>UChar*</code> string of Unicode character code units to the file. */ +U_CAPI void U_EXPORT2 +udata_writeUString(UNewDataMemory *pData, const UChar *s, int32_t length); + + +/* + * Hey, Emacs, please set the following: + * + * Local Variables: + * indent-tabs-mode: nil + * End: + * + */ + +#endif diff --git a/intl/icu/source/tools/toolutil/uoptions.cpp b/intl/icu/source/tools/toolutil/uoptions.cpp new file mode 100644 index 0000000000..808164ae4d --- /dev/null +++ b/intl/icu/source/tools/toolutil/uoptions.cpp @@ -0,0 +1,133 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2000-2015, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: uoptions.c +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2000apr17 +* created by: Markus W. Scherer +* +* This file provides a command line argument parser. 
+*/ + +#include "unicode/utypes.h" +#include "cstring.h" +#include "uoptions.h" + +U_CAPI int U_EXPORT2 +u_parseArgs(int argc, char* argv[], + int optionCount, UOption options[]) { + char *arg; + int i=1, remaining=1; + char c, stopOptions=0; + + while(i<argc) { + arg=argv[i]; + if(!stopOptions && *arg=='-' && (c=arg[1])!=0) { + /* process an option */ + UOption *option=nullptr; + arg+=2; + if(c=='-') { + /* process a long option */ + if(*arg==0) { + /* stop processing options after "--" */ + stopOptions=1; + } else { + /* search for the option string */ + int j; + for(j=0; j<optionCount; ++j) { + if(options[j].longName && uprv_strcmp(arg, options[j].longName)==0) { + option=options+j; + break; + } + } + if(option==nullptr) { + /* no option matches */ + return -i; + } + option->doesOccur=1; + + if(option->hasArg!=UOPT_NO_ARG) { + /* parse the argument for the option, if any */ + if(i+1<argc && !(argv[i+1][0]=='-' && argv[i+1][1]!=0)) { + /* argument in the next argv[], and there is not an option in there */ + option->value=argv[++i]; + } else if(option->hasArg==UOPT_REQUIRES_ARG) { + /* there is no argument, but one is required: return with error */ + option->doesOccur=0; + return -i; + } + } + + if(option->optionFn!=nullptr && option->optionFn(option->context, option)<0) { + /* the option function was called and returned an error */ + option->doesOccur=0; + return -i; + } + } + } else { + /* process one or more short options */ + do { + /* search for the option letter */ + int j; + for(j=0; j<optionCount; ++j) { + if(c==options[j].shortName) { + option=options+j; + break; + } + } + if(option==nullptr) { + /* no option matches */ + return -i; + } + option->doesOccur=1; + + if(option->hasArg!=UOPT_NO_ARG) { + /* parse the argument for the option, if any */ + if(*arg!=0) { + /* argument following in the same argv[] */ + option->value=arg; + /* do not process the rest of this arg as option letters */ + break; + } else if(i+1<argc && !(argv[i+1][0]=='-' && 
argv[i+1][1]!=0)) { + /* argument in the next argv[], and there is not an option in there */ + option->value=argv[++i]; + /* this break is redundant because we know that *arg==0 */ + break; + } else if(option->hasArg==UOPT_REQUIRES_ARG) { + /* there is no argument, but one is required: return with error */ + option->doesOccur=0; + return -i; + } + } + + if(option->optionFn!=nullptr && option->optionFn(option->context, option)<0) { + /* the option function was called and returned an error */ + option->doesOccur=0; + return -i; + } + + /* get the next option letter */ + option=nullptr; + c=*arg++; + } while(c!=0); + } + + /* go to next argv[] */ + ++i; + } else { + /* move a non-option up in argv[] */ + argv[remaining++]=arg; + ++i; + } + } + return remaining; +} diff --git a/intl/icu/source/tools/toolutil/uoptions.h b/intl/icu/source/tools/toolutil/uoptions.h new file mode 100644 index 0000000000..d00e3da924 --- /dev/null +++ b/intl/icu/source/tools/toolutil/uoptions.h @@ -0,0 +1,143 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2000-2011, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: uoptions.h +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2000apr17 +* created by: Markus W. Scherer +* +* This file provides a command line argument parser. 
+*/ + +#ifndef __UOPTIONS_H__ +#define __UOPTIONS_H__ + +#include "unicode/utypes.h" + +/* This should usually be called before calling u_parseArgs */ +/*#if U_PLATFORM == U_PF_OS390 && (U_CHARSET_FAMILY == U_ASCII_FAMILY)*/ + /* translate args from EBCDIC to ASCII */ +/*# define U_MAIN_INIT_ARGS(argc, argv) __argvtoascii_a(argc, argv)*/ +/*#elif defined(XP_MAC_CONSOLE)*/ +#if defined(XP_MAC_CONSOLE) +# include <console.h> + /* Get the arguments from the GUI, since old Macs don't have a console Window. */ +# define U_MAIN_INIT_ARGS(argc, argv) argc = ccommand((char***)&argv) +#else + /* Normally we do nothing. */ +# define U_MAIN_INIT_ARGS(argc, argv) +#endif + + + +/* forward declarations for the function declaration */ +struct UOption; +typedef struct UOption UOption; + +/* function to be called for a command line option */ +typedef int UOptionFn(void *context, UOption *option); + +/* values of UOption.hasArg */ +enum { UOPT_NO_ARG, UOPT_REQUIRES_ARG, UOPT_OPTIONAL_ARG }; + +/* structure describing a command line option */ +struct UOption { + const char *longName; /* "foo" for --foo */ + const char *value; /* output placeholder, will point to the argument string, if any */ + UOptionFn *optionFn; /* function to be called when this option occurs */ + void *context; /* parameter for the function */ + char shortName; /* 'f' for -f */ + char hasArg; /* enum value: option takes no/requires/may have argument */ + char doesOccur; /* boolean for "this one occurred" */ +}; + +/* macro for an entry in a declaration of UOption[] */ +#define UOPTION_DEF(longName, shortName, hasArg) \ + { longName, NULL, NULL, NULL, shortName, hasArg, 0 } + +/* ICU Tools option definitions */ +#define UOPTION_HELP_H UOPTION_DEF("help", 'h', UOPT_NO_ARG) +#define UOPTION_HELP_QUESTION_MARK UOPTION_DEF("help", '?', UOPT_NO_ARG) +#define UOPTION_VERBOSE UOPTION_DEF("verbose", 'v', UOPT_NO_ARG) +#define UOPTION_QUIET UOPTION_DEF("quiet", 'q', UOPT_NO_ARG) +#define UOPTION_VERSION 
UOPTION_DEF("version", 'V', UOPT_NO_ARG) +#define UOPTION_COPYRIGHT UOPTION_DEF("copyright", 'c', UOPT_NO_ARG) + +#define UOPTION_DESTDIR UOPTION_DEF("destdir", 'd', UOPT_REQUIRES_ARG) +#define UOPTION_SOURCEDIR UOPTION_DEF("sourcedir", 's', UOPT_REQUIRES_ARG) +#define UOPTION_ENCODING UOPTION_DEF("encoding", 'e', UOPT_REQUIRES_ARG) +#define UOPTION_ICUDATADIR UOPTION_DEF("icudatadir", 'i', UOPT_REQUIRES_ARG) +#define UOPTION_WRITE_JAVA UOPTION_DEF("write-java", 'j', UOPT_OPTIONAL_ARG) +#define UOPTION_PACKAGE_NAME UOPTION_DEF("package-name", 'p', UOPT_REQUIRES_ARG) +#define UOPTION_BUNDLE_NAME UOPTION_DEF("bundle-name", 'b', UOPT_REQUIRES_ARG) + +/** + * C Command line argument parser. + * + * This function takes the argv[argc] command line and a description of + * the program's options in form of an array of UOption structures. + * Each UOption defines a long and a short name (a string and a character) + * for options like "--foo" and "-f". + * + * Each option is marked with whether it does not take an argument, + * requires one, or optionally takes one. The argument may follow in + * the same argv[] entry for short options, or it may always follow + * in the next argv[] entry. + * + * An argument is in the next argv[] entry for both long and short name + * options, except it is taken from directly behind the short name in + * its own argv[] entry if there are characters following the option letter. + * An argument in its own argv[] entry must not begin with a '-' + * unless it is only the '-' itself. There is no restriction of the + * argument format if it is part of the short name options's argv[] entry. + * + * The argument is stored in the value field of the corresponding + * UOption entry, and the doesOccur field is set to 1 if the option + * is found at all. + * + * Short name options without arguments can be collapsed into a single + * argv[] entry. After an option letter takes an argument, following + * letters will be taken as its argument. 
+ * + * If the same option is found several times, then the last + * argument value will be stored in the value field. + * + * For each option, a function can be called. This could be used + * for options that occur multiple times and all arguments are to + * be collected. + * + * All options are removed from the argv[] array itself. If the parser + * is successful, then it returns the number of remaining non-option + * strings (including argv[0]). + * argv[0], the program name, is never read or modified. + * + * An option "--" ends option processing; everything after this + * remains in the argv[] array. + * + * An option string "-" alone is treated as a non-option. + * + * If an option is not recognized or an argument missing, then + * the parser returns with the negative index of the argv[] entry + * where the error was detected. + * + * The OS/400 compiler requires that argv either be "char* argv[]", + * or "const char* const argv[]", and it will not accept, + * "const char* argv[]" as a definition for main(). + * + * @param argv This parameter is modified + * @param options This parameter is modified + */ +U_CAPI int U_EXPORT2 +u_parseArgs(int argc, char* argv[], + int optionCount, UOption options[]); + +#endif diff --git a/intl/icu/source/tools/toolutil/uparse.cpp b/intl/icu/source/tools/toolutil/uparse.cpp new file mode 100644 index 0000000000..5aee48b5a4 --- /dev/null +++ b/intl/icu/source/tools/toolutil/uparse.cpp @@ -0,0 +1,383 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2000-2012, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: uparse.c +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2000apr18 +* created by: Markus W. 
Scherer +* +* This file provides a parser for files that are delimited by one single +* character like ';' or TAB. Example: the Unicode Character Properties files +* like UnicodeData.txt are semicolon-delimited. +*/ + +#include "unicode/utypes.h" +#include "unicode/uchar.h" +#include "unicode/ustring.h" +#include "unicode/utf16.h" +#include "cstring.h" +#include "filestrm.h" +#include "uparse.h" +#include "ustr_imp.h" + +#include <stdio.h> + +U_CAPI const char * U_EXPORT2 +u_skipWhitespace(const char *s) { + while(U_IS_INV_WHITESPACE(*s)) { + ++s; + } + return s; +} + +U_CAPI char * U_EXPORT2 +u_rtrim(char *s) { + char *end=uprv_strchr(s, 0); + while(s<end && U_IS_INV_WHITESPACE(*(end-1))) { + *--end = 0; + } + return end; +} + +/* + * If the string starts with # @missing: then return the pointer to the + * following non-whitespace character. + * Otherwise return the original pointer. + * Unicode 5.0 adds such lines in some data files to document + * default property values. + * Poor man's regex for variable amounts of white space. 
+ */ +static const char * +getMissingLimit(const char *s) { + const char *s0=s; + if( + *(s=u_skipWhitespace(s))=='#' && + *(s=u_skipWhitespace(s+1))=='@' && + 0==strncmp((s=u_skipWhitespace(s+1)), "missing", 7) && + *(s=u_skipWhitespace(s+7))==':' + ) { + return u_skipWhitespace(s+1); + } else { + return s0; + } +} + +U_CAPI void U_EXPORT2 +u_parseDelimitedFile(const char *filename, char delimiter, + char *fields[][2], int32_t fieldCount, + UParseLineFn *lineFn, void *context, + UErrorCode *pErrorCode) { + FileStream *file; + char line[10000]; + char *start, *limit; + int32_t i, length; + + if(U_FAILURE(*pErrorCode)) { + return; + } + + if(fields==nullptr || lineFn==nullptr || fieldCount<=0) { + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; + return; + } + + if(filename==nullptr || *filename==0 || (*filename=='-' && filename[1]==0)) { + filename=nullptr; + file=T_FileStream_stdin(); + } else { + file=T_FileStream_open(filename, "r"); + } + if(file==nullptr) { + *pErrorCode=U_FILE_ACCESS_ERROR; + return; + } + + while(T_FileStream_readLine(file, line, sizeof(line))!=nullptr) { + /* remove trailing newline characters */ + length=(int32_t)(u_rtrim(line)-line); + + /* + * detect a line with # @missing: + * start parsing after that, or else from the beginning of the line + * set the default warning for @missing lines + */ + start=(char *)getMissingLimit(line); + if(start==line) { + *pErrorCode=U_ZERO_ERROR; + } else { + *pErrorCode=U_USING_DEFAULT_WARNING; + } + + /* skip this line if it is empty or a comment */ + if(*start==0 || *start=='#') { + continue; + } + + /* remove in-line comments */ + limit=uprv_strchr(start, '#'); + if(limit!=nullptr) { + /* get white space before the pound sign */ + while(limit>start && U_IS_INV_WHITESPACE(*(limit-1))) { + --limit; + } + + /* truncate the line */ + *limit=0; + } + + /* skip lines with only whitespace */ + if(u_skipWhitespace(start)[0]==0) { + continue; + } + + /* for each field, call the corresponding field function */ + for(i=0; 
i<fieldCount; ++i) { + /* set the limit pointer of this field */ + limit=start; + while(*limit!=delimiter && *limit!=0) { + ++limit; + } + + /* set the field start and limit in the fields array */ + fields[i][0]=start; + fields[i][1]=limit; + + /* set start to the beginning of the next field, if any */ + start=limit; + if(*start!=0) { + ++start; + } else if(i+1<fieldCount) { + *pErrorCode=U_PARSE_ERROR; + limit=line+length; + i=fieldCount; + break; + } + } + + /* too few fields? */ + if(U_FAILURE(*pErrorCode)) { + break; + } + + /* call the field function */ + lineFn(context, fields, fieldCount, pErrorCode); + if(U_FAILURE(*pErrorCode)) { + break; + } + } + + if(filename!=nullptr) { + T_FileStream_close(file); + } +} + +/* + * parse a list of code points + * store them as a UTF-32 string in dest[destCapacity] + * return the number of code points + */ +U_CAPI int32_t U_EXPORT2 +u_parseCodePoints(const char *s, + uint32_t *dest, int32_t destCapacity, + UErrorCode *pErrorCode) { + char *end; + uint32_t value; + int32_t count; + + if(U_FAILURE(*pErrorCode)) { + return 0; + } + if(s==nullptr || destCapacity<0 || (destCapacity>0 && dest==nullptr)) { + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + + count=0; + for(;;) { + s=u_skipWhitespace(s); + if(*s==';' || *s==0) { + return count; + } + + /* read one code point */ + value=(uint32_t)uprv_strtoul(s, &end, 16); + if(end<=s || (!U_IS_INV_WHITESPACE(*end) && *end!=';' && *end!=0) || value>=0x110000) { + *pErrorCode=U_PARSE_ERROR; + return 0; + } + + /* append it to the destination array */ + if(count<destCapacity) { + dest[count++]=value; + } else { + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; + } + + /* go to the following characters */ + s=end; + } +} + +/* + * parse a list of code points + * store them as a string in dest[destCapacity] + * set the first code point in *pFirst + * @return The length of the string in numbers of UChars. 
+ */ +U_CAPI int32_t U_EXPORT2 +u_parseString(const char *s, + char16_t *dest, int32_t destCapacity, + uint32_t *pFirst, + UErrorCode *pErrorCode) { + char *end; + uint32_t value; + int32_t destLength; + + if(U_FAILURE(*pErrorCode)) { + return 0; + } + if(s==nullptr || destCapacity<0 || (destCapacity>0 && dest==nullptr)) { + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + + if(pFirst!=nullptr) { + *pFirst=0xffffffff; + } + + destLength=0; + for(;;) { + s=u_skipWhitespace(s); + if(*s==';' || *s==0) { + if(destLength<destCapacity) { + dest[destLength]=0; + } else if(destLength==destCapacity) { + *pErrorCode=U_STRING_NOT_TERMINATED_WARNING; + } else { + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; + } + return destLength; + } + + /* read one code point */ + value=(uint32_t)uprv_strtoul(s, &end, 16); + if(end<=s || (!U_IS_INV_WHITESPACE(*end) && *end!=';' && *end!=0) || value>=0x110000) { + *pErrorCode=U_PARSE_ERROR; + return 0; + } + + /* store the first code point */ + if(pFirst!=nullptr) { + *pFirst=value; + pFirst=nullptr; + } + + /* append it to the destination array */ + if((destLength+U16_LENGTH(value))<=destCapacity) { + U16_APPEND_UNSAFE(dest, destLength, value); + } else { + destLength+=U16_LENGTH(value); + } + + /* go to the following characters */ + s=end; + } +} + +/* read a range like start or start..end */ +U_CAPI int32_t U_EXPORT2 +u_parseCodePointRangeAnyTerminator(const char *s, + uint32_t *pStart, uint32_t *pEnd, + const char **terminator, + UErrorCode *pErrorCode) { + char *end; + uint32_t value; + + if(U_FAILURE(*pErrorCode)) { + return 0; + } + if(s==nullptr || pStart==nullptr || pEnd==nullptr) { + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + + /* read the start code point */ + s=u_skipWhitespace(s); + value=(uint32_t)uprv_strtoul(s, &end, 16); + if(end<=s || value>=0x110000) { + *pErrorCode=U_PARSE_ERROR; + return 0; + } + *pStart=*pEnd=value; + + /* is there a "..end"? */ + s=u_skipWhitespace(end); + if(*s!='.' 
|| s[1]!='.') { + *terminator=end; + return 1; + } + s=u_skipWhitespace(s+2); + + /* read the end code point */ + value=(uint32_t)uprv_strtoul(s, &end, 16); + if(end<=s || value>=0x110000) { + *pErrorCode=U_PARSE_ERROR; + return 0; + } + *pEnd=value; + + /* is this a valid range? */ + if(value<*pStart) { + *pErrorCode=U_PARSE_ERROR; + return 0; + } + + *terminator=end; + return value-*pStart+1; +} + +U_CAPI int32_t U_EXPORT2 +u_parseCodePointRange(const char *s, + uint32_t *pStart, uint32_t *pEnd, + UErrorCode *pErrorCode) { + const char *terminator; + int32_t rangeLength= + u_parseCodePointRangeAnyTerminator(s, pStart, pEnd, &terminator, pErrorCode); + if(U_SUCCESS(*pErrorCode)) { + terminator=u_skipWhitespace(terminator); + if(*terminator!=';' && *terminator!=0) { + *pErrorCode=U_PARSE_ERROR; + return 0; + } + } + return rangeLength; +} + +U_CAPI int32_t U_EXPORT2 +u_parseUTF8(const char *source, int32_t sLen, char *dest, int32_t destCapacity, UErrorCode *status) { + const char *read = source; + int32_t i = 0; + unsigned int value = 0; + if(sLen == -1) { + sLen = (int32_t)strlen(source); + } + + while(read < source+sLen) { + sscanf(read, "%2x", &value); + if(i < destCapacity) { + dest[i] = (char)value; + } + i++; + read += 2; + } + return u_terminateChars(dest, destCapacity, i, status); +} diff --git a/intl/icu/source/tools/toolutil/uparse.h b/intl/icu/source/tools/toolutil/uparse.h new file mode 100644 index 0000000000..df0e79a21f --- /dev/null +++ b/intl/icu/source/tools/toolutil/uparse.h @@ -0,0 +1,153 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2000-2010, International Business Machines +* Corporation and others. All Rights Reserved. 
+* +******************************************************************************* +* file name: uparse.h +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2000apr18 +* created by: Markus W. Scherer +* +* This file provides a parser for files that are delimited by one single +* character like ';' or TAB. Example: the Unicode Character Properties files +* like UnicodeData.txt are semicolon-delimited. +*/ + +#ifndef __UPARSE_H__ +#define __UPARSE_H__ + +#include "unicode/utypes.h" + +/** + * Is c an invariant-character whitespace? + * @param c invariant character + */ +#define U_IS_INV_WHITESPACE(c) ((c)==' ' || (c)=='\t' || (c)=='\r' || (c)=='\n') + +U_CDECL_BEGIN + +/** + * Skip invariant-character whitespace: space ' ', TAB '\t', CR '\r' and LF '\n' + * (the characters accepted by U_IS_INV_WHITESPACE). + * + * @param s Pointer to characters. + * @return Pointer to first character at or after s that is not such whitespace. + */ +U_CAPI const char * U_EXPORT2 +u_skipWhitespace(const char *s); + +/** + * Trim whitespace (including line endings) from the end of the string. + * + * @param s Pointer to the string. + * @return Pointer to the new end of the string. + */ +U_CAPI char * U_EXPORT2 +u_rtrim(char *s); + +/** Function type for u_parseDelimitedFile(). */ +typedef void U_CALLCONV +UParseLineFn(void *context, + char *fields[][2], + int32_t fieldCount, + UErrorCode *pErrorCode); + +/** + * Parser for files that are similar to UnicodeData.txt: + * This function opens the file and reads it line by line. It skips empty lines + * and comment lines that start with a '#'. + * All other lines are separated into fields with one delimiter character + * (semicolon for Unicode Properties files) between two fields. The last field in + * a line does not need to be terminated with a delimiter. + * + * For each line, after segmenting it, a line function is called. + * It gets passed the array of field start and limit pointers that is + * passed into this parser and filled by it for each line. 
+ * For each field i of the line, the start pointer in fields[i][0] + * points to the beginning of the field, while the limit pointer in fields[i][1] + * points behind the field, i.e., to the delimiter or the line end. + * + * The context parameter of the line function is + * the same as the one for the parse function. + * + * The line function may modify the contents of the fields including the + * limit characters. + * + * If the file cannot be opened, or there is a parsing error or a field function + * sets *pErrorCode, then the parser returns with *pErrorCode set to an error code. + */ +U_CAPI void U_EXPORT2 +u_parseDelimitedFile(const char *filename, char delimiter, + char *fields[][2], int32_t fieldCount, + UParseLineFn *lineFn, void *context, + UErrorCode *pErrorCode); + +/** + * Parse a string of code points like 0061 0308 0300. + * s must end with either ';' or NUL. + * + * @return Number of code points. + */ +U_CAPI int32_t U_EXPORT2 +u_parseCodePoints(const char *s, + uint32_t *dest, int32_t destCapacity, + UErrorCode *pErrorCode); + +/** + * Parse a list of code points like 0061 0308 0300 + * into a UChar * string. + * s must end with either ';' or NUL. + * + * Set the first code point in *pFirst. + * + * @param s Input char * string. + * @param dest Output string buffer. + * @param destCapacity Capacity of dest in numbers of UChars. + * @param pFirst If pFirst!=NULL the *pFirst will be set to the first + * code point in the string. + * @param pErrorCode ICU error code. + * @return The length of the string in numbers of UChars. + */ +U_CAPI int32_t U_EXPORT2 +u_parseString(const char *s, + UChar *dest, int32_t destCapacity, + uint32_t *pFirst, + UErrorCode *pErrorCode); + +/** + * Parse a code point range like + * 0085 or + * 4E00..9FA5. + * + * s must contain such a range and end with either ';' or NUL. 
+ * + * @return Length of code point range, end-start+1 + */ +U_CAPI int32_t U_EXPORT2 +u_parseCodePointRange(const char *s, + uint32_t *pStart, uint32_t *pEnd, + UErrorCode *pErrorCode); + +/** + * Same as u_parseCodePointRange() but the range may be terminated by + * any character. The position of the terminating character is returned via + * the *terminator output parameter. + */ +U_CAPI int32_t U_EXPORT2 +u_parseCodePointRangeAnyTerminator(const char *s, + uint32_t *pStart, uint32_t *pEnd, + const char **terminator, + UErrorCode *pErrorCode); + +U_CAPI int32_t U_EXPORT2 +u_parseUTF8(const char *source, int32_t sLen, char *dest, int32_t destCapacity, UErrorCode *status); + +U_CDECL_END + +#endif diff --git a/intl/icu/source/tools/toolutil/writesrc.cpp b/intl/icu/source/tools/toolutil/writesrc.cpp new file mode 100644 index 0000000000..55c2f277b3 --- /dev/null +++ b/intl/icu/source/tools/toolutil/writesrc.cpp @@ -0,0 +1,515 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2005-2012, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: writesrc.c +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2005apr23 +* created by: Markus W. Scherer +* +* Helper functions for writing source code for data. +*/ + +#include <stdio.h> +#include <time.h> + +// The C99 standard suggested that C++ implementations not define PRId64 etc. constants +// unless this macro is defined. +// See the Notes at https://en.cppreference.com/w/cpp/types/integer . +// Similar to defining __STDC_LIMIT_MACROS in unicode/ptypes.h . 
+#ifndef __STDC_FORMAT_MACROS +# define __STDC_FORMAT_MACROS +#endif +#include <cinttypes> + +#include "unicode/utypes.h" +#include "unicode/putil.h" +#include "unicode/ucptrie.h" +#include "unicode/errorcode.h" +#include "unicode/uniset.h" +#include "unicode/usetiter.h" +#include "unicode/utf16.h" +#include "utrie2.h" +#include "cstring.h" +#include "writesrc.h" +#include "util.h" + +U_NAMESPACE_BEGIN + +ValueNameGetter::~ValueNameGetter() {} + +U_NAMESPACE_END + +U_NAMESPACE_USE + +static FILE * +usrc_createWithoutHeader(const char *path, const char *filename) { + char buffer[1024]; + const char *p; + char *q; + FILE *f; + char c; + + if(path==nullptr) { + p=filename; + } else { + /* concatenate path and filename, with U_FILE_SEP_CHAR in between if necessary */ + uprv_strcpy(buffer, path); + q=buffer+uprv_strlen(buffer); + if(q>buffer && (c=*(q-1))!=U_FILE_SEP_CHAR && c!=U_FILE_ALT_SEP_CHAR) { + *q++=U_FILE_SEP_CHAR; + } + uprv_strcpy(q, filename); + p=buffer; + } + + f=fopen(p, "w"); + if (f==nullptr) { + fprintf( + stderr, + "usrc_create(%s, %s): unable to create file\n", + path!=nullptr ? 
path : "", filename); + } + return f; +} + +U_CAPI FILE * U_EXPORT2 +usrc_create(const char *path, const char *filename, int32_t copyrightYear, const char *generator) { + FILE *f = usrc_createWithoutHeader(path, filename); + if (f == nullptr) { + return f; + } + usrc_writeCopyrightHeader(f, "//", copyrightYear); + usrc_writeFileNameGeneratedBy(f, "//", filename, generator); + return f; +} + +U_CAPI FILE * U_EXPORT2 +usrc_createTextData(const char *path, const char *filename, int32_t copyrightYear, const char *generator) { + FILE *f = usrc_createWithoutHeader(path, filename); + if (f == nullptr) { + return f; + } + usrc_writeCopyrightHeader(f, "#", copyrightYear); + usrc_writeFileNameGeneratedBy(f, "#", filename, generator); + return f; +} + +U_CAPI void U_EXPORT2 +usrc_writeCopyrightHeader(FILE *f, const char *prefix, int32_t copyrightYear) { + fprintf(f, + "%s Copyright (C) %d and later: Unicode, Inc. and others.\n" + "%s License & terms of use: http://www.unicode.org/copyright.html\n", + prefix, copyrightYear, prefix); + if (copyrightYear <= 2016) { + fprintf(f, + "%s Copyright (C) 1999-2016, International Business Machines\n" + "%s Corporation and others. 
All Rights Reserved.\n", + prefix, prefix); + } +} + +U_CAPI void U_EXPORT2 +usrc_writeFileNameGeneratedBy( + FILE *f, + const char *prefix, + const char *filename, + const char *generator) { + char buffer[1024]; + const struct tm *lt; + time_t t; + + const char *pattern = + "%s\n" + "%s file name: %s\n" + "%s\n" + "%s machine-generated by: %s\n" + "\n"; + + time(&t); + lt=localtime(&t); + if(generator==nullptr) { + strftime(buffer, sizeof(buffer), "%Y-%m-%d", lt); + fprintf(f, pattern, prefix, prefix, filename, prefix, prefix, buffer); + } else { + fprintf(f, pattern, prefix, prefix, filename, prefix, prefix, generator); + } +} + +U_CAPI void U_EXPORT2 +usrc_writeArray(FILE *f, + const char *prefix, + const void *p, int32_t width, int32_t length, + const char *indent, + const char *postfix) { + const uint8_t *p8; + const uint16_t *p16; + const uint32_t *p32; + const int64_t *p64; // Signed due to TOML! + int64_t value; // Signed due to TOML! + int32_t i, col; + + p8=nullptr; + p16=nullptr; + p32=nullptr; + p64=nullptr; + switch(width) { + case 8: + p8=(const uint8_t *)p; + break; + case 16: + p16=(const uint16_t *)p; + break; + case 32: + p32=(const uint32_t *)p; + break; + case 64: + p64=(const int64_t *)p; + break; + default: + fprintf(stderr, "usrc_writeArray(width=%ld) unrecognized width\n", (long)width); + return; + } + if(prefix!=nullptr) { + fprintf(f, prefix, (long)length); + } + for(i=col=0; i<length; ++i, ++col) { + if(i>0) { + if(col<16) { + fputc(',', f); + } else { + fputs(",\n", f); + fputs(indent, f); + col=0; + } + } + switch(width) { + case 8: + value=p8[i]; + break; + case 16: + value=p16[i]; + break; + case 32: + value=p32[i]; + break; + case 64: + value=p64[i]; + break; + default: + value=0; /* unreachable */ + break; + } + fprintf(f, value<=9 ? 
"%" PRId64 : "0x%" PRIx64, value); + } + if(postfix!=nullptr) { + fputs(postfix, f); + } +} + +U_CAPI void U_EXPORT2 +usrc_writeUTrie2Arrays(FILE *f, + const char *indexPrefix, const char *data32Prefix, + const UTrie2 *pTrie, + const char *postfix) { + if(pTrie->data32==nullptr) { + /* 16-bit trie */ + usrc_writeArray(f, indexPrefix, pTrie->index, 16, pTrie->indexLength+pTrie->dataLength, "", postfix); + } else { + /* 32-bit trie */ + usrc_writeArray(f, indexPrefix, pTrie->index, 16, pTrie->indexLength, "", postfix); + usrc_writeArray(f, data32Prefix, pTrie->data32, 32, pTrie->dataLength, "", postfix); + } +} + +U_CAPI void U_EXPORT2 +usrc_writeUTrie2Struct(FILE *f, + const char *prefix, + const UTrie2 *pTrie, + const char *indexName, const char *data32Name, + const char *postfix) { + if(prefix!=nullptr) { + fputs(prefix, f); + } + if(pTrie->data32==nullptr) { + /* 16-bit trie */ + fprintf( + f, + " %s,\n" /* index */ + " %s+%ld,\n" /* data16 */ + " nullptr,\n", /* data32 */ + indexName, + indexName, + (long)pTrie->indexLength); + } else { + /* 32-bit trie */ + fprintf( + f, + " %s,\n" /* index */ + " nullptr,\n" /* data16 */ + " %s,\n", /* data32 */ + indexName, + data32Name); + } + fprintf( + f, + " %ld,\n" /* indexLength */ + " %ld,\n" /* dataLength */ + " 0x%hx,\n" /* index2NullOffset */ + " 0x%hx,\n" /* dataNullOffset */ + " 0x%lx,\n" /* initialValue */ + " 0x%lx,\n" /* errorValue */ + " 0x%lx,\n" /* highStart */ + " 0x%lx,\n" /* highValueIndex */ + " nullptr, 0, false, false, 0, nullptr\n", + (long)pTrie->indexLength, (long)pTrie->dataLength, + (short)pTrie->index2NullOffset, (short)pTrie->dataNullOffset, + (long)pTrie->initialValue, (long)pTrie->errorValue, + (long)pTrie->highStart, (long)pTrie->highValueIndex); + if(postfix!=nullptr) { + fputs(postfix, f); + } +} + +U_CAPI void U_EXPORT2 +usrc_writeUCPTrieArrays(FILE *f, + const char *indexPrefix, const char *dataPrefix, + const UCPTrie *pTrie, + const char *postfix, + UTargetSyntax syntax) { + const char* 
indent = (syntax == UPRV_TARGET_SYNTAX_TOML) ? " " : ""; + usrc_writeArray(f, indexPrefix, pTrie->index, 16, pTrie->indexLength, indent, postfix); + int32_t width= + pTrie->valueWidth==UCPTRIE_VALUE_BITS_16 ? 16 : + pTrie->valueWidth==UCPTRIE_VALUE_BITS_32 ? 32 : + pTrie->valueWidth==UCPTRIE_VALUE_BITS_8 ? 8 : 0; + usrc_writeArray(f, dataPrefix, pTrie->data.ptr0, width, pTrie->dataLength, indent, postfix); +} + +U_CAPI void U_EXPORT2 +usrc_writeUCPTrieStruct(FILE *f, + const char *prefix, + const UCPTrie *pTrie, + const char *indexName, const char *dataName, + const char *postfix, + UTargetSyntax syntax) { + if(prefix!=nullptr) { + fputs(prefix, f); + } + if (syntax == UPRV_TARGET_SYNTAX_CCODE) { + fprintf( + f, + " %s,\n" // index + " { %s },\n", // data (union) + indexName, + dataName); + } + const char* pattern = + (syntax == UPRV_TARGET_SYNTAX_CCODE) ? + " %ld, %ld,\n" // indexLength, dataLength + " 0x%lx, 0x%x,\n" // highStart, shifted12HighStart + " %d, %d,\n" // type, valueWidth + " 0, 0,\n" // reserved32, reserved16 + " 0x%x, 0x%lx,\n" // index3NullOffset, dataNullOffset + " 0x%lx,\n" // nullValue + : + "indexLength = %ld\n" + "dataLength = %ld\n" + "highStart = 0x%lx\n" + "shifted12HighStart = 0x%x\n" + "type = %d\n" + "valueWidth = %d\n" + "index3NullOffset = 0x%x\n" + "dataNullOffset = 0x%lx\n" + "nullValue = 0x%lx\n" + ; + fprintf( + f, + pattern, + (long)pTrie->indexLength, (long)pTrie->dataLength, + (long)pTrie->highStart, pTrie->shifted12HighStart, + pTrie->type, pTrie->valueWidth, + pTrie->index3NullOffset, (long)pTrie->dataNullOffset, + (long)pTrie->nullValue); + if(postfix!=nullptr) { + fputs(postfix, f); + } +} + +U_CAPI void U_EXPORT2 +usrc_writeUCPTrie(FILE *f, const char *name, const UCPTrie *pTrie, UTargetSyntax syntax) { + int32_t width= + pTrie->valueWidth==UCPTRIE_VALUE_BITS_16 ? 16 : + pTrie->valueWidth==UCPTRIE_VALUE_BITS_32 ? 32 : + pTrie->valueWidth==UCPTRIE_VALUE_BITS_8 ? 
8 : 0; + char line[100], line2[100], line3[100], line4[100]; + + switch (syntax) { + case UPRV_TARGET_SYNTAX_CCODE: + snprintf(line, sizeof(line), "static const uint16_t %s_trieIndex[%%ld]={\n", name); + snprintf(line2, sizeof(line2), "static const uint%d_t %s_trieData[%%ld]={\n", (int)width, name); + snprintf(line3, sizeof(line3), "\n};\n\n"); + break; + case UPRV_TARGET_SYNTAX_TOML: + snprintf(line, sizeof(line), "index = [\n "); + snprintf(line2, sizeof(line2), "data_%d = [\n ", (int)width); + snprintf(line3, sizeof(line3), "\n]\n"); + break; + default: + UPRV_UNREACHABLE_EXIT; + } + usrc_writeUCPTrieArrays(f, line, line2, pTrie, line3, syntax); + + switch (syntax) { + case UPRV_TARGET_SYNTAX_CCODE: + snprintf(line, sizeof(line), "static const UCPTrie %s_trie={\n", name); + snprintf(line2, sizeof(line2), "%s_trieIndex", name); + snprintf(line3, sizeof(line3), "%s_trieData", name); + snprintf(line4, sizeof(line4), "};\n\n"); + break; + case UPRV_TARGET_SYNTAX_TOML: + line[0] = 0; + line2[0] = 0; + line3[0] = 0; + line4[0] = 0; + break; + default: + UPRV_UNREACHABLE_EXIT; + } + usrc_writeUCPTrieStruct(f, line, pTrie, line2, line3, line4, syntax); +} + +U_CAPI void U_EXPORT2 +usrc_writeUnicodeSet( + FILE *f, + const USet *pSet, + UTargetSyntax syntax) { + // ccode is not yet supported + U_ASSERT(syntax == UPRV_TARGET_SYNTAX_TOML); + + // Write out a list of ranges + const UnicodeSet* set = UnicodeSet::fromUSet(pSet); + UnicodeSetIterator it(*set); + fprintf(f, "# Inclusive ranges of the code points in the set.\n"); + fprintf(f, "ranges = [\n"); + bool seenFirstString = false; + while (it.nextRange()) { + if (it.isString()) { + if (!seenFirstString) { + seenFirstString = true; + fprintf(f, "]\nstrings = [\n"); + } + const UnicodeString& str = it.getString(); + fprintf(f, " "); + usrc_writeStringAsASCII(f, str.getBuffer(), str.length(), syntax); + fprintf(f, ",\n"); + } else { + U_ASSERT(!seenFirstString); + UChar32 start = it.getCodepoint(); + UChar32 end = 
it.getCodepointEnd(); + fprintf(f, " [0x%x, 0x%x],\n", start, end); + } + } + fprintf(f, "]\n"); +} + +U_CAPI void U_EXPORT2 +usrc_writeUCPMap( + FILE *f, + const UCPMap *pMap, + icu::ValueNameGetter *valueNameGetter, + UTargetSyntax syntax) { + // ccode is not yet supported + U_ASSERT(syntax == UPRV_TARGET_SYNTAX_TOML); + (void) syntax; // silence unused variable errors + + // Print out list of ranges + UChar32 start = 0, end; + uint32_t value; + fprintf(f, "# Code points `a` through `b` have value `v`, corresponding to `name`.\n"); + fprintf(f, "ranges = [\n"); + while ((end = ucpmap_getRange(pMap, start, UCPMAP_RANGE_NORMAL, 0, nullptr, nullptr, &value)) >= 0) { + if (valueNameGetter != nullptr) { + const char *name = valueNameGetter->getName(value); + fprintf(f, " {a=0x%x, b=0x%x, v=%u, name=\"%s\"},\n", start, end, value, name); + } else { + fprintf(f, " {a=0x%x, b=0x%x, v=%u},\n", start, end, value); + } + start = end + 1; + } + fprintf(f, "]\n"); +} + +U_CAPI void U_EXPORT2 +usrc_writeArrayOfMostlyInvChars(FILE *f, + const char *prefix, + const char *p, int32_t length, + const char *postfix) { + int32_t i, col; + int prev2, prev, c; + + if(prefix!=nullptr) { + fprintf(f, prefix, (long)length); + } + prev2=prev=-1; + for(i=col=0; i<length; ++i, ++col) { + c=(uint8_t)p[i]; + if(i>0) { + /* Break long lines. Try to break at interesting places, to minimize revision diffs. */ + if( + /* Very long line. */ + col>=32 || + /* Long line, break after terminating NUL. */ + (col>=24 && prev2>=0x20 && prev==0) || + /* Medium-long line, break before non-NUL, non-character byte. */ + (col>=16 && (prev==0 || prev>=0x20) && 0<c && c<0x20) + ) { + fputs(",\n", f); + col=0; + } else { + fputc(',', f); + } + } + fprintf(f, c<0x20 ? 
"%u" : "'%c'", c); + prev2=prev; + prev=c; + } + if(postfix!=nullptr) { + fputs(postfix, f); + } +} + +U_CAPI void U_EXPORT2 +usrc_writeStringAsASCII(FILE *f, + const char16_t* ptr, int32_t length, + UTargetSyntax) { + // For now, assume all UTargetSyntax values are valid here. + fprintf(f, "\""); + int32_t i = 0; + UChar32 cp; + while (i < length) { + U16_NEXT(ptr, i, length, cp); + if (cp == u'"') { + fprintf(f, "\\\""); + } else if (ICU_Utility::isUnprintable(cp)) { + UnicodeString u16result; + ICU_Utility::escapeUnprintable(u16result, cp); + std::string u8result; + u16result.toUTF8String(u8result); + fprintf(f, "%s", u8result.data()); + } else { + U_ASSERT(cp < 0x80); + char s[2] = {static_cast<char>(cp), 0}; + fprintf(f, "%s", s); + } + } + fprintf(f, "\""); +} diff --git a/intl/icu/source/tools/toolutil/writesrc.h b/intl/icu/source/tools/toolutil/writesrc.h new file mode 100644 index 0000000000..9c0be5a100 --- /dev/null +++ b/intl/icu/source/tools/toolutil/writesrc.h @@ -0,0 +1,198 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2005-2012, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: writesrc.h +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2005apr23 +* created by: Markus W. Scherer +* +* Helper functions for writing source code for data. +*/ + +#ifndef __WRITESRC_H__ +#define __WRITESRC_H__ + +#include <stdio.h> +#include "unicode/utypes.h" +#include "unicode/ucpmap.h" +#include "unicode/ucptrie.h" +#include "unicode/umutablecptrie.h" +#include "unicode/uset.h" +#include "utrie2.h" + +/** + * An input to some of the functions in this file specifying whether to write data + * as C/C++ code initializers or as TOML. 
+ */ +typedef enum UTargetSyntax { + UPRV_TARGET_SYNTAX_CCODE = 0, + UPRV_TARGET_SYNTAX_TOML = 1, +} UTargetSyntax; + +/** + * Creates a source text file and writes a header comment with the ICU copyright. + * Writes a C/Java-style comment with the generator name. + */ +U_CAPI FILE * U_EXPORT2 +usrc_create(const char *path, const char *filename, int32_t copyrightYear, const char *generator); + +/** + * Creates a source text file and writes a header comment with the ICU copyright. + * Writes the comment with # lines, as used in scripts and text data. + */ +U_CAPI FILE * U_EXPORT2 +usrc_createTextData(const char *path, const char *filename, int32_t copyrightYear, const char *generator); + +/** + * Writes the ICU copyright to a file stream, with configurable year and comment style. + */ +U_CAPI void U_EXPORT2 +usrc_writeCopyrightHeader(FILE *f, const char *prefix, int32_t copyrightYear); + +/** + * Writes information about the file being machine-generated. + */ +U_CAPI void U_EXPORT2 +usrc_writeFileNameGeneratedBy( + FILE *f, + const char *prefix, + const char *filename, + const char *generator); + +/** + * Writes the contents of an array of 8/16/32/64-bit words. + * The prefix and postfix are optional (can be NULL) and are written first/last. + * The prefix may contain a %ld or similar field for the array length. + * The {} and declaration etc. need to be included in prefix/postfix or + * printed before and after the array contents. + */ +U_CAPI void U_EXPORT2 +usrc_writeArray(FILE *f, + const char *prefix, + const void *p, int32_t width, int32_t length, + const char *indent, + const char *postfix); + +/** + * Calls usrc_writeArray() for the index and data arrays of a frozen UTrie2. + * Only the index array is written for a 16-bit UTrie2. In this case, dataPrefix + * is ignored and can be NULL. 
+ */ +U_CAPI void U_EXPORT2 +usrc_writeUTrie2Arrays(FILE *f, + const char *indexPrefix, const char *dataPrefix, + const UTrie2 *pTrie, + const char *postfix); + +/** + * Writes the UTrie2 struct values. + * The {} and declaration etc. need to be included in prefix/postfix or + * printed before and after the array contents. + */ +U_CAPI void U_EXPORT2 +usrc_writeUTrie2Struct(FILE *f, + const char *prefix, + const UTrie2 *pTrie, + const char *indexName, const char *dataName, + const char *postfix); + +/** + * Calls usrc_writeArray() for the index and data arrays of a UCPTrie. + */ +U_CAPI void U_EXPORT2 +usrc_writeUCPTrieArrays(FILE *f, + const char *indexPrefix, const char *dataPrefix, + const UCPTrie *pTrie, + const char *postfix, + UTargetSyntax syntax); + +/** + * Writes the UCPTrie struct values. + * The {} and declaration etc. need to be included in prefix/postfix or + * printed before and after the array contents. + */ +U_CAPI void U_EXPORT2 +usrc_writeUCPTrieStruct(FILE *f, + const char *prefix, + const UCPTrie *pTrie, + const char *indexName, const char *dataName, + const char *postfix, + UTargetSyntax syntax); + +/** + * Writes the UCPTrie arrays and struct values. + */ +U_CAPI void U_EXPORT2 +usrc_writeUCPTrie(FILE *f, const char *name, const UCPTrie *pTrie, UTargetSyntax syntax); + +/** + * Writes the UnicodeSet range and string lists. + */ +U_CAPI void U_EXPORT2 +usrc_writeUnicodeSet( + FILE *f, + const USet *pSet, + UTargetSyntax syntax); + +#ifdef __cplusplus + +U_NAMESPACE_BEGIN + +class U_TOOLUTIL_API ValueNameGetter { +public: + virtual ~ValueNameGetter(); + virtual const char *getName(uint32_t value) = 0; +}; + +U_NAMESPACE_END + +/** + * Writes the UCPMap ranges list. + * + * The "valueNameGetter" argument is optional; ignored if nullptr. + * If present, it will be used to look up value name strings. 
+ */ +U_CAPI void U_EXPORT2 +usrc_writeUCPMap( + FILE *f, + const UCPMap *pMap, + icu::ValueNameGetter *valueNameGetter, + UTargetSyntax syntax); + +#endif // __cplusplus + +/** + * Writes the contents of an array of mostly invariant characters. + * Characters 0..0x1f are printed as numbers, + * others as characters with single quotes: '%c'. + * + * The prefix and postfix are optional (can be NULL) and are written first/last. + * The prefix may contain a %ld or similar field for the array length. + * The {} and declaration etc. need to be included in prefix/postfix or + * printed before and after the array contents. + */ +U_CAPI void U_EXPORT2 +usrc_writeArrayOfMostlyInvChars(FILE *f, + const char *prefix, + const char *p, int32_t length, + const char *postfix); + +/** + * Writes a syntactically valid Unicode string in all ASCII, escaping quotes + * and non-ASCII characters. + */ +U_CAPI void U_EXPORT2 +usrc_writeStringAsASCII(FILE *f, + const UChar* ptr, int32_t length, + UTargetSyntax syntax); + +#endif diff --git a/intl/icu/source/tools/toolutil/xmlparser.cpp b/intl/icu/source/tools/toolutil/xmlparser.cpp new file mode 100644 index 0000000000..edb85bdab0 --- /dev/null +++ b/intl/icu/source/tools/toolutil/xmlparser.cpp @@ -0,0 +1,827 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2004-2010, International Business Machines +* Corporation and others. All Rights Reserved. 
+* +******************************************************************************* +* file name: xmlparser.cpp +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2004jul21 +* created by: Andy Heninger +*/ + +#include <stdio.h> +#include "unicode/uchar.h" +#include "unicode/ucnv.h" +#include "unicode/regex.h" +#include "filestrm.h" +#include "xmlparser.h" + +#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION + +// character constants +enum { + x_QUOT=0x22, + x_AMP=0x26, + x_APOS=0x27, + x_LT=0x3c, + x_GT=0x3e, + x_l=0x6c +}; + +#define XML_SPACES "[ \\u0009\\u000d\\u000a]" + +// XML #4 +#define XML_NAMESTARTCHAR "[[A-Z]:_[a-z][\\u00c0-\\u00d6][\\u00d8-\\u00f6]" \ + "[\\u00f8-\\u02ff][\\u0370-\\u037d][\\u037F-\\u1FFF][\\u200C-\\u200D]" \ + "[\\u2070-\\u218F][\\u2C00-\\u2FEF][\\u3001-\\uD7FF][\\uF900-\\uFDCF]" \ + "[\\uFDF0-\\uFFFD][\\U00010000-\\U000EFFFF]]" + +// XML #5 +#define XML_NAMECHAR "[" XML_NAMESTARTCHAR "\\-.[0-9]\\u00b7[\\u0300-\\u036f][\\u203f-\\u2040]]" + +// XML #6 +#define XML_NAME XML_NAMESTARTCHAR "(?:" XML_NAMECHAR ")*" + +U_NAMESPACE_BEGIN + +UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLParser) +UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLElement) + +// +// UXMLParser constructor. Mostly just initializes the ICU regexes that are +// used for parsing. +// +UXMLParser::UXMLParser(UErrorCode &status) : + // XML Declaration. XML Production #23. + // example: "<?xml version=1.0 encoding="utf-16" ?> + // This is a sloppy implementation - just look for the leading <?xml and the closing ?> + // allow for a possible leading BOM. 
+ mXMLDecl(UnicodeString("(?s)\\uFEFF?<\\?xml.+?\\?>", -1, US_INV), 0, status), + + // XML Comment production #15 + // example: "<!-- whatever --> + // note, does not detect an illegal "--" within comments + mXMLComment(UnicodeString("(?s)<!--.+?-->", -1, US_INV), 0, status), + + // XML Spaces + // production [3] + mXMLSP(UnicodeString(XML_SPACES "+", -1, US_INV), 0, status), + + // XML Doctype decl production #28 + // example "<!DOCTYPE foo SYSTEM "somewhere" > + // or "<!DOCTYPE foo [internal dtd]> + // TODO: we don't actually parse the DOCTYPE or internal subsets. + // Some internal dtd subsets could confuse this simple-minded + // attempt at skipping over them, specifically, occurrences + // of closing square brackets. These could appear in comments, + // or in parameter entity declarations, for example. + mXMLDoctype(UnicodeString( + "(?s)<!DOCTYPE.*?(>|\\[.*?\\].*?>)", -1, US_INV + ), 0, status), + + // XML PI production #16 + // example "<?target stuff?> + mXMLPI(UnicodeString("(?s)<\\?.+?\\?>", -1, US_INV), 0, status), + + // XML Element Start Productions #40, #41 + // example <foo att1='abc' att2="d e f" > + // capture #1: the tag name + // + mXMLElemStart (UnicodeString("(?s)<(" XML_NAME ")" // match "<tag_name" + "(?:" + XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*" // match "ATTR_NAME = " + "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))" // match '"attribute value"' + ")*" // * for zero or more attributes. 
+ XML_SPACES "*?>", -1, US_INV), 0, status), // match " >" + + // XML Element End production #42 + // example </foo> + mXMLElemEnd (UnicodeString("</(" XML_NAME ")" XML_SPACES "*>", -1, US_INV), 0, status), + + // XML Element Empty production #44 + // example <foo att1="abc" att2="d e f" /> + mXMLElemEmpty (UnicodeString("(?s)<(" XML_NAME ")" // match "<tag_name" + "(?:" + XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*" // match "ATTR_NAME = " + "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))" // match '"attribute value"' + ")*" // * for zero or more attributes. + XML_SPACES "*?/>", -1, US_INV), 0, status), // match " />" + + + // XMLCharData. Everything but '<'. Note that & will be dealt with later. + mXMLCharData(UnicodeString("(?s)[^<]*", -1, US_INV), 0, status), + + // Attribute name = "value". XML Productions 10, 40/41 + // Capture group 1 is name, + // 2 is the attribute value, including the quotes. + // + // Note that attributes are scanned twice. The first time is with + // the regex for an entire element start. There, the attributes + // are checked syntactically, but not separated out one by one. + // Here, we match a single attribute, and make its name and + // attribute value available to the parser code. + mAttrValue(UnicodeString(XML_SPACES "+(" XML_NAME ")" XML_SPACES "*=" XML_SPACES "*" + "((?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))", -1, US_INV), 0, status), + + + mAttrNormalizer(UnicodeString(XML_SPACES, -1, US_INV), 0, status), + + // Match any of the new-line sequences in content. + // All are changed to \u000a. + mNewLineNormalizer(UnicodeString("\\u000d\\u000a|\\u000d\\u0085|\\u000a|\\u000d|\\u0085|\\u2028", -1, US_INV), 0, status), + + // & char references + // We will figure out what we've got based on which capture group has content. + // The last one is a catchall for unrecognized entity references.. 
+ // 1 2 3 4 5 6 7 8 + mAmps(UnicodeString("&(?:(amp;)|(lt;)|(gt;)|(apos;)|(quot;)|#x([0-9A-Fa-f]{1,8});|#([0-9]{1,8});|(.))"), + 0, status), + + fNames(status), + fElementStack(status), + fOneLF((char16_t)0x0a) // Plain new-line string, used in new line normalization. + { + } + +UXMLParser * +UXMLParser::createParser(UErrorCode &errorCode) { + if (U_FAILURE(errorCode)) { + return nullptr; + } else { + return new UXMLParser(errorCode); + } +} + +UXMLParser::~UXMLParser() {} + +UXMLElement * +UXMLParser::parseFile(const char *filename, UErrorCode &errorCode) { + char bytes[4096], charsetBuffer[100]; + FileStream *f; + const char *charset, *pb; + UnicodeString src; + UConverter *cnv; + char16_t *buffer, *pu; + int32_t fileLength, bytesLength, length, capacity; + UBool flush; + + if(U_FAILURE(errorCode)) { + return nullptr; + } + + f=T_FileStream_open(filename, "rb"); + if(f==nullptr) { + errorCode=U_FILE_ACCESS_ERROR; + return nullptr; + } + + bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes)); + if(bytesLength<(int32_t)sizeof(bytes)) { + // we have already read the entire file + fileLength=bytesLength; + } else { + // get the file length + fileLength=T_FileStream_size(f); + } + + /* + * get the charset: + * 1. Unicode signature + * 2. treat as ISO-8859-1 and read XML encoding="charser" + * 3. 
default to UTF-8 + */ + charset=ucnv_detectUnicodeSignature(bytes, bytesLength, nullptr, &errorCode); + if(U_SUCCESS(errorCode) && charset!=nullptr) { + // open converter according to Unicode signature + cnv=ucnv_open(charset, &errorCode); + } else { + // read as Latin-1 and parse the XML declaration and encoding + cnv=ucnv_open("ISO-8859-1", &errorCode); + if(U_FAILURE(errorCode)) { + // unexpected error opening Latin-1 converter + goto exit; + } + + buffer=toUCharPtr(src.getBuffer(bytesLength)); + if(buffer==nullptr) { + // unexpected failure to reserve some string capacity + errorCode=U_MEMORY_ALLOCATION_ERROR; + goto exit; + } + pb=bytes; + pu=buffer; + ucnv_toUnicode( + cnv, + &pu, buffer+src.getCapacity(), + &pb, bytes+bytesLength, + nullptr, true, &errorCode); + src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0); + ucnv_close(cnv); + cnv=nullptr; + if(U_FAILURE(errorCode)) { + // unexpected error in conversion from Latin-1 + src.remove(); + goto exit; + } + + // parse XML declaration + if(mXMLDecl.reset(src).lookingAt(0, errorCode)) { + int32_t declEnd=mXMLDecl.end(errorCode); + // go beyond <?xml + int32_t pos=src.indexOf((char16_t)x_l)+1; + + mAttrValue.reset(src); + while(pos<declEnd && mAttrValue.lookingAt(pos, errorCode)) { // loop runs once per attribute on this element. + UnicodeString attName = mAttrValue.group(1, errorCode); + UnicodeString attValue = mAttrValue.group(2, errorCode); + + // Trim the quotes from the att value. These are left over from the original regex + // that parsed the attribute, which couldn't conveniently strip them. + attValue.remove(0,1); // one char from the beginning + attValue.truncate(attValue.length()-1); // and one from the end. 
+ + if(attName==UNICODE_STRING("encoding", 8)) { + length=attValue.extract(0, 0x7fffffff, charsetBuffer, (int32_t)sizeof(charsetBuffer)); + charset=charsetBuffer; + break; + } + pos = mAttrValue.end(2, errorCode); + } + + if(charset==nullptr) { + // default to UTF-8 + charset="UTF-8"; + } + cnv=ucnv_open(charset, &errorCode); + } + } + + if(U_FAILURE(errorCode)) { + // unable to open the converter + goto exit; + } + + // convert the file contents + capacity=fileLength; // estimated capacity + src.getBuffer(capacity); + src.releaseBuffer(0); // zero length + flush=false; + for(;;) { + // convert contents of bytes[bytesLength] + pb=bytes; + for(;;) { + length=src.length(); + buffer=toUCharPtr(src.getBuffer(capacity)); + if(buffer==nullptr) { + // unexpected failure to reserve some string capacity + errorCode=U_MEMORY_ALLOCATION_ERROR; + goto exit; + } + + pu=buffer+length; + ucnv_toUnicode( + cnv, &pu, buffer+src.getCapacity(), + &pb, bytes+bytesLength, + nullptr, false, &errorCode); + src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0); + if(errorCode==U_BUFFER_OVERFLOW_ERROR) { + errorCode=U_ZERO_ERROR; + capacity=(3*src.getCapacity())/2; // increase capacity by 50% + } else { + break; + } + } + + if(U_FAILURE(errorCode)) { + break; // conversion error + } + + if(flush) { + break; // completely converted the file + } + + // read next block + bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes)); + if(bytesLength==0) { + // reached end of file, convert once more to flush the converter + flush=true; + } + } + +exit: + ucnv_close(cnv); + T_FileStream_close(f); + + if(U_SUCCESS(errorCode)) { + return parse(src, errorCode); + } else { + return nullptr; + } +} + +UXMLElement * +UXMLParser::parse(const UnicodeString &src, UErrorCode &status) { + if(U_FAILURE(status)) { + return nullptr; + } + + UXMLElement *root = nullptr; + fPos = 0; // TODO use just a local pos variable and pass it into functions + // where necessary? 
+ + // set all matchers to work on the input string + mXMLDecl.reset(src); + mXMLComment.reset(src); + mXMLSP.reset(src); + mXMLDoctype.reset(src); + mXMLPI.reset(src); + mXMLElemStart.reset(src); + mXMLElemEnd.reset(src); + mXMLElemEmpty.reset(src); + mXMLCharData.reset(src); + mAttrValue.reset(src); + mAttrNormalizer.reset(src); + mNewLineNormalizer.reset(src); + mAmps.reset(src); + + // Consume the XML Declaration, if present. + if (mXMLDecl.lookingAt(fPos, status)) { + fPos = mXMLDecl.end(status); + } + + // Consume "misc" [XML production 27] appearing before DocType + parseMisc(status); + + // Consume a DocType declaration, if present. + if (mXMLDoctype.lookingAt(fPos, status)) { + fPos = mXMLDoctype.end(status); + } + + // Consume additional "misc" [XML production 27] appearing after the DocType + parseMisc(status); + + // Get the root element + if (mXMLElemEmpty.lookingAt(fPos, status)) { + // Root is an empty element (no nested elements or content) + root = createElement(mXMLElemEmpty, status); + fPos = mXMLElemEmpty.end(status); + } else { + if (mXMLElemStart.lookingAt(fPos, status) == false) { + error("Root Element expected", status); + goto errorExit; + } + root = createElement(mXMLElemStart, status); + UXMLElement *el = root; + + // + // This is the loop that consumes the root element of the document, + // including all nested content. Nested elements are handled by + // explicit pushes/pops of the element stack; there is no recursion + // in the control flow of this code. + // "el" always refers to the current element, the one to which content + // is being added. It is above the top of the element stack. + for (;;) { + // Nested Element Start + if (mXMLElemStart.lookingAt(fPos, status)) { + UXMLElement *t = createElement(mXMLElemStart, status); + el->fChildren.addElement(t, status); + t->fParent = el; + fElementStack.push(el, status); + el = t; + continue; + } + + // Text Content. 
String is concatenated onto the current node's content, + // but only if it contains something other than spaces. + UnicodeString s = scanContent(status); + if (s.length() > 0) { + mXMLSP.reset(s); + if (mXMLSP.matches(status) == false) { + // This chunk of text contains something other than just + // white space. Make a child node for it. + replaceCharRefs(s, status); + el->fChildren.addElement(s.clone(), status); + } + mXMLSP.reset(src); // The matchers need to stay set to the main input string. + continue; + } + + // Comments. Discard. + if (mXMLComment.lookingAt(fPos, status)) { + fPos = mXMLComment.end(status); + continue; + } + + // PIs. Discard. + if (mXMLPI.lookingAt(fPos, status)) { + fPos = mXMLPI.end(status); + continue; + } + + // Element End + if (mXMLElemEnd.lookingAt(fPos, status)) { + fPos = mXMLElemEnd.end(0, status); + const UnicodeString name = mXMLElemEnd.group(1, status); + if (name != *el->fName) { + error("Element start / end tag mismatch", status); + goto errorExit; + } + if (fElementStack.empty()) { + // Close of the root element. We're done with the doc. + el = nullptr; + break; + } + el = (UXMLElement *)fElementStack.pop(); + continue; + } + + // Empty Element. Stored as a child of the current element, but not stacked. + if (mXMLElemEmpty.lookingAt(fPos, status)) { + UXMLElement *t = createElement(mXMLElemEmpty, status); + el->fChildren.addElement(t, status); + continue; + } + + // Hit something within the document that doesn't match anything. + // It's an error. + error("Unrecognized markup", status); + break; + } + + if (el != nullptr || !fElementStack.empty()) { + // We bailed out early, for some reason. + error("Root element not closed.", status); + goto errorExit; + } + } + + // Root Element parse is complete. + // Consume the annoying xml "Misc" that can appear at the end of the doc. 
+ parseMisc(status); + + // We should have reached the end of the input + if (fPos != src.length()) { + error("Extra content at the end of the document", status); + goto errorExit; + } + + // Success! + return root; + +errorExit: + delete root; + return nullptr; +} + +// +// createElement +// We've just matched an element start tag. Create and fill in a UXMLElement object +// for it. +// +UXMLElement * +UXMLParser::createElement(RegexMatcher &mEl, UErrorCode &status) { + // First capture group is the element's name. + UXMLElement *el = new UXMLElement(this, intern(mEl.group(1, status), status), status); + + // Scan for attributes. + int32_t pos = mEl.end(1, status); // The position after the end of the tag name + + while (mAttrValue.lookingAt(pos, status)) { // loop runs once per attribute on this element. + UnicodeString attName = mAttrValue.group(1, status); + UnicodeString attValue = mAttrValue.group(2, status); + + // Trim the quotes from the att value. These are left over from the original regex + // that parsed the attribute, which couldn't conveniently strip them. + attValue.remove(0,1); // one char from the beginning + attValue.truncate(attValue.length()-1); // and one from the end. + + // XML Attribute value normalization. + // This is one of the really screwy parts of the XML spec. + // See http://www.w3.org/TR/2004/REC-xml11-20040204/#AVNormalize + // Note that non-validating parsers must treat all entities as type CDATA + // which simplifies things some. + + // Att normalization step 1: normalize any newlines in the attribute value + mNewLineNormalizer.reset(attValue); + attValue = mNewLineNormalizer.replaceAll(fOneLF, status); + + // Next change all xml white space chars to plain \u0020 spaces. + mAttrNormalizer.reset(attValue); + UnicodeString oneSpace((char16_t)0x0020); + attValue = mAttrNormalizer.replaceAll(oneSpace, status); + + // Replace character entities. 
+ replaceCharRefs(attValue, status); + + // Save the attribute name and value in our document structure. + el->fAttNames.addElement((void *)intern(attName, status), status); + el->fAttValues.addElement(attValue.clone(), status); + pos = mAttrValue.end(2, status); + } + fPos = mEl.end(0, status); + return el; +} + +// +// parseMisc +// Consume XML "Misc" [production #27] +// which is any combination of space, PI and comments +// Need to watch end-of-input because xml MISC stuff is allowed after +// the document element, so we WILL scan off the end in this function +// +void +UXMLParser::parseMisc(UErrorCode &status) { + for (;;) { + if (fPos >= mXMLPI.input().length()) { + break; + } + if (mXMLPI.lookingAt(fPos, status)) { + fPos = mXMLPI.end(status); + continue; + } + if (mXMLSP.lookingAt(fPos, status)) { + fPos = mXMLSP.end(status); + continue; + } + if (mXMLComment.lookingAt(fPos, status)) { + fPos = mXMLComment.end(status); + continue; + } + break; + } +} + +// +// Scan for document content. +// +UnicodeString +UXMLParser::scanContent(UErrorCode &status) { + UnicodeString result; + if (mXMLCharData.lookingAt(fPos, status)) { + result = mXMLCharData.group((int32_t)0, status); + // Normalize the new-lines. (Before char ref substitution) + mNewLineNormalizer.reset(result); + result = mNewLineNormalizer.replaceAll(fOneLF, status); + + // TODO: handle CDATA + fPos = mXMLCharData.end(0, status); + } + + return result; +} + +// +// replaceCharRefs +// +// replace the char entities < & { ካ etc. in a string +// with the corresponding actual character. +// +void +UXMLParser::replaceCharRefs(UnicodeString &s, UErrorCode &status) { + UnicodeString result; + UnicodeString replacement; + int i; + + mAmps.reset(s); + // See the initialization for the regex matcher mAmps. + // Which entity we've matched is determined by which capture group has content, + // which is flagged by start() of that group not being -1. 
+ while (mAmps.find()) { + if (mAmps.start(1, status) != -1) { + replacement.setTo((char16_t)x_AMP); + } else if (mAmps.start(2, status) != -1) { + replacement.setTo((char16_t)x_LT); + } else if (mAmps.start(3, status) != -1) { + replacement.setTo((char16_t)x_GT); + } else if (mAmps.start(4, status) != -1) { + replacement.setTo((char16_t)x_APOS); + } else if (mAmps.start(5, status) != -1) { + replacement.setTo((char16_t)x_QUOT); + } else if (mAmps.start(6, status) != -1) { + UnicodeString hexString = mAmps.group(6, status); + UChar32 val = 0; + for (i=0; i<hexString.length(); i++) { + val = (val << 4) + u_digit(hexString.charAt(i), 16); + } + // TODO: some verification that the character is valid + replacement.setTo(val); + } else if (mAmps.start(7, status) != -1) { + UnicodeString decimalString = mAmps.group(7, status); + UChar32 val = 0; + for (i=0; i<decimalString.length(); i++) { + val = val*10 + u_digit(decimalString.charAt(i), 10); + } + // TODO: some verification that the character is valid + replacement.setTo(val); + } else { + // An unrecognized &entity; Leave it alone. + // TODO: check that it really looks like an entity, and is not some + // random & in the text. + replacement = mAmps.group((int32_t)0, status); + } + mAmps.appendReplacement(result, replacement, status); + } + mAmps.appendTail(result); + s = result; +} + +void +UXMLParser::error(const char *message, UErrorCode &status) { + // TODO: something better here... 
+ const UnicodeString &src=mXMLDecl.input(); + int line = 0; + int ci = 0; + while (ci < fPos && ci>=0) { + ci = src.indexOf((char16_t)0x0a, ci+1); + line++; + } + fprintf(stderr, "Error: %s at line %d\n", message, line); + if (U_SUCCESS(status)) { + status = U_PARSE_ERROR; + } +} + +// intern strings like in Java + +const UnicodeString * +UXMLParser::intern(const UnicodeString &s, UErrorCode &errorCode) { + const UHashElement *he=fNames.find(s); + if(he!=nullptr) { + // already a known name, return its hashed key pointer + return (const UnicodeString *)he->key.pointer; + } else { + // add this new name and return its hashed key pointer + fNames.puti(s, 1, errorCode); + he=fNames.find(s); + return (const UnicodeString *)he->key.pointer; + } +} + +const UnicodeString * +UXMLParser::findName(const UnicodeString &s) const { + const UHashElement *he=fNames.find(s); + if(he!=nullptr) { + // a known name, return its hashed key pointer + return (const UnicodeString *)he->key.pointer; + } else { + // unknown name + return nullptr; + } +} + +// UXMLElement ------------------------------------------------------------- *** + +UXMLElement::UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode) : + fParser(parser), + fName(name), + fAttNames(errorCode), + fAttValues(errorCode), + fChildren(errorCode), + fParent(nullptr) +{ +} + +UXMLElement::~UXMLElement() { + int i; + // attribute names are owned by the UXMLParser, don't delete them here + for (i=fAttValues.size()-1; i>=0; i--) { + delete (UObject *)fAttValues.elementAt(i); + } + for (i=fChildren.size()-1; i>=0; i--) { + delete (UObject *)fChildren.elementAt(i); + } +} + +const UnicodeString & +UXMLElement::getTagName() const { + return *fName; +} + +UnicodeString +UXMLElement::getText(UBool recurse) const { + UnicodeString text; + appendText(text, recurse); + return text; +} + +void +UXMLElement::appendText(UnicodeString &text, UBool recurse) const { + const UObject *node; + int32_t i, 
count=fChildren.size(); + for(i=0; i<count; ++i) { + node=(const UObject *)fChildren.elementAt(i); + const UnicodeString *s=dynamic_cast<const UnicodeString *>(node); + if(s!=nullptr) { + text.append(*s); + } else if(recurse) /* must be a UXMLElement */ { + ((const UXMLElement *)node)->appendText(text, recurse); + } + } +} + +int32_t +UXMLElement::countAttributes() const { + return fAttNames.size(); +} + +const UnicodeString * +UXMLElement::getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const { + if(0<=i && i<fAttNames.size()) { + name.setTo(*(const UnicodeString *)fAttNames.elementAt(i)); + value.setTo(*(const UnicodeString *)fAttValues.elementAt(i)); + return &value; // or return (UnicodeString *)fAttValues.elementAt(i); + } else { + return nullptr; + } +} + +const UnicodeString * +UXMLElement::getAttribute(const UnicodeString &name) const { + // search for the attribute name by comparing the interned pointer, + // not the string contents + const UnicodeString *p=fParser->findName(name); + if(p==nullptr) { + return nullptr; // no such attribute seen by the parser at all + } + + int32_t i, count=fAttNames.size(); + for(i=0; i<count; ++i) { + if(p==(const UnicodeString *)fAttNames.elementAt(i)) { + return (const UnicodeString *)fAttValues.elementAt(i); + } + } + return nullptr; +} + +int32_t +UXMLElement::countChildren() const { + return fChildren.size(); +} + +const UObject * +UXMLElement::getChild(int32_t i, UXMLNodeType &type) const { + if(0<=i && i<fChildren.size()) { + const UObject *node=(const UObject *)fChildren.elementAt(i); + if(dynamic_cast<const UXMLElement *>(node)!=nullptr) { + type=UXML_NODE_TYPE_ELEMENT; + } else { + type=UXML_NODE_TYPE_STRING; + } + return node; + } else { + return nullptr; + } +} + +const UXMLElement * +UXMLElement::nextChildElement(int32_t &i) const { + if(i<0) { + return nullptr; + } + + const UObject *node; + int32_t count=fChildren.size(); + while(i<count) { + node=(const UObject *)fChildren.elementAt(i++); 
+ const UXMLElement *elem=dynamic_cast<const UXMLElement *>(node); + if(elem!=nullptr) { + return elem; + } + } + return nullptr; +} + +const UXMLElement * +UXMLElement::getChildElement(const UnicodeString &name) const { + // search for the element name by comparing the interned pointer, + // not the string contents + const UnicodeString *p=fParser->findName(name); + if(p==nullptr) { + return nullptr; // no such element seen by the parser at all + } + + const UObject *node; + int32_t i, count=fChildren.size(); + for(i=0; i<count; ++i) { + node=(const UObject *)fChildren.elementAt(i); + const UXMLElement *elem=dynamic_cast<const UXMLElement *>(node); + if(elem!=nullptr) { + if(p==elem->fName) { + return elem; + } + } + } + return nullptr; +} + +U_NAMESPACE_END + +#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */ + diff --git a/intl/icu/source/tools/toolutil/xmlparser.h b/intl/icu/source/tools/toolutil/xmlparser.h new file mode 100644 index 0000000000..d0dcd9a48a --- /dev/null +++ b/intl/icu/source/tools/toolutil/xmlparser.h @@ -0,0 +1,247 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2004-2005, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: xmlparser.h +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2004jul21 +* created by: Andy Heninger +* +* Tiny XML parser using ICU and intended for use in ICU tests and in build tools. +* Not suitable for production use. Not supported. +* Not conformant. Not efficient. +* But very small. 
+*/ + +#ifndef __XMLPARSER_H__ +#define __XMLPARSER_H__ + +#include "unicode/uobject.h" +#include "unicode/unistr.h" +#include "unicode/regex.h" +#include "uvector.h" +#include "hash.h" + +#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION + +enum UXMLNodeType { + /** Node type string (text contents), stored as a UnicodeString. */ + UXML_NODE_TYPE_STRING, + /** Node type element, stored as a UXMLElement. */ + UXML_NODE_TYPE_ELEMENT, + UXML_NODE_TYPE_COUNT +}; + +U_NAMESPACE_BEGIN + +class UXMLParser; + +/** + * This class represents an element node in a parsed XML tree. + */ +class U_TOOLUTIL_API UXMLElement : public UObject { +public: + /** + * Destructor. + */ + virtual ~UXMLElement(); + + /** + * Get the tag name of this element. + */ + const UnicodeString &getTagName() const; + /** + * Get the text contents of the element. + * Append the contents of all text child nodes. + * @param recurse If true, also recursively appends the contents of all + * text child nodes of element children. + * @return The text contents. + */ + UnicodeString getText(UBool recurse) const; + /** + * Get the number of attributes. + */ + int32_t countAttributes() const; + /** + * Get the i-th attribute. + * @param i Index of the attribute. + * @param name Output parameter, receives the attribute name. + * @param value Output parameter, receives the attribute value. + * @return A pointer to the attribute value (may be &value or a pointer to an + * internal string object), or nullptr if i is out of bounds. + */ + const UnicodeString *getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const; + /** + * Get the value of the attribute with the given name. + * @param name Attribute name to be looked up. + * @return A pointer to the attribute value, or nullptr if this element + * does not have this attribute. + */ + const UnicodeString *getAttribute(const UnicodeString &name) const; + /** + * Get the number of child nodes. 
+ */ + int32_t countChildren() const; + /** + * Get the i-th child node. + * @param i Index of the child node. + * @param type The child node type. + * @return A pointer to the child node object, or nullptr if i is out of bounds. + */ + const UObject *getChild(int32_t i, UXMLNodeType &type) const; + /** + * Get the next child element node, skipping non-element child nodes. + * @param i Enumeration index; initialize to 0 before getting the first child element. + * @return A pointer to the next child element, or nullptr if there is none. + */ + const UXMLElement *nextChildElement(int32_t &i) const; + /** + * Get the immediate child element with the given name. + * If there are multiple child elements with this name, then return + * the first one. + * @param name Element name to be looked up. + * @return A pointer to the element node, or nullptr if this element + * does not have this immediate child element. + */ + const UXMLElement *getChildElement(const UnicodeString &name) const; + + /** + * ICU "poor man's RTTI", returns a UClassID for the actual class. + */ + virtual UClassID getDynamicClassID() const override; + + /** + * ICU "poor man's RTTI", returns a UClassID for this class. + */ + static UClassID U_EXPORT2 getStaticClassID(); + +private: + // prevent default construction etc. + UXMLElement(); + UXMLElement(const UXMLElement &other); + UXMLElement &operator=(const UXMLElement &other); + + void appendText(UnicodeString &text, UBool recurse) const; + + friend class UXMLParser; + + UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode); + + const UXMLParser *fParser; + const UnicodeString *fName; // The tag name of this element (owned by the UXMLParser) + UnicodeString fContent; // The text content of this node. 
All element content is + // concatenated even when there are intervening nested elements + // (which doesn't happen with most xml files we care about) + // Sections of content containing only white space are dropped, + // which gets rid the bogus white space content from + // elements which are primarily containers for nested elements. + UVector fAttNames; // A vector containing the names of this element's attributes + // The names are UnicodeString objects, owned by the UXMLParser. + UVector fAttValues; // A vector containing the attribute values for + // this element's attributes. The order is the same + // as that of the attribute name vector. + + UVector fChildren; // The child nodes of this element (a Vector) + + UXMLElement *fParent; // A pointer to the parent element of this element. +}; + +/** + * A simple XML parser; it is neither efficient nor conformant and only useful for + * restricted types of XML documents. + * + * The parse methods parse whole documents and return the parse trees via their + * root elements. + */ +class U_TOOLUTIL_API UXMLParser : public UObject { +public: + /** + * Create an XML parser. + */ + static UXMLParser *createParser(UErrorCode &errorCode); + /** + * Destructor. + */ + virtual ~UXMLParser(); + + /** + * Parse an XML document, create the entire document tree, and + * return a pointer to the root element of the parsed tree. + * The caller must delete the element. + */ + UXMLElement *parse(const UnicodeString &src, UErrorCode &errorCode); + /** + * Parse an XML file, create the entire document tree, and + * return a pointer to the root element of the parsed tree. + * The caller must delete the element. + */ + UXMLElement *parseFile(const char *filename, UErrorCode &errorCode); + + /** + * ICU "poor man's RTTI", returns a UClassID for the actual class. + */ + virtual UClassID getDynamicClassID() const override; + + /** + * ICU "poor man's RTTI", returns a UClassID for this class. 
+ */ + static UClassID U_EXPORT2 getStaticClassID(); + +private: + // prevent default construction etc. + UXMLParser(); + UXMLParser(const UXMLParser &other); + UXMLParser &operator=(const UXMLParser &other); + + // constructor + UXMLParser(UErrorCode &status); + + void parseMisc(UErrorCode &status); + UXMLElement *createElement(RegexMatcher &mEl, UErrorCode &status); + void error(const char *message, UErrorCode &status); + UnicodeString scanContent(UErrorCode &status); + void replaceCharRefs(UnicodeString &s, UErrorCode &status); + + const UnicodeString *intern(const UnicodeString &s, UErrorCode &errorCode); +public: + // public for UXMLElement only + const UnicodeString *findName(const UnicodeString &s) const; +private: + + // There is one ICU regex matcher for each of the major XML syntax items + // that are recognized. + RegexMatcher mXMLDecl; + RegexMatcher mXMLComment; + RegexMatcher mXMLSP; + RegexMatcher mXMLDoctype; + RegexMatcher mXMLPI; + RegexMatcher mXMLElemStart; + RegexMatcher mXMLElemEnd; + RegexMatcher mXMLElemEmpty; + RegexMatcher mXMLCharData; + RegexMatcher mAttrValue; + RegexMatcher mAttrNormalizer; + RegexMatcher mNewLineNormalizer; + RegexMatcher mAmps; + + Hashtable fNames; // interned element/attribute name strings + UStack fElementStack; // Stack holds the parent elements when nested + // elements are being parsed. All items on this + // stack are of type UXMLElement. + int32_t fPos; // String index of the current scan position in + // xml source (in fSrc). + UnicodeString fOneLF; +}; + +U_NAMESPACE_END +#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */ + +#endif diff --git a/intl/icu/source/tools/tzcode/Makefile.in b/intl/icu/source/tools/tzcode/Makefile.in new file mode 100644 index 0000000000..4ba969f42e --- /dev/null +++ b/intl/icu/source/tools/tzcode/Makefile.in @@ -0,0 +1,182 @@ +# Copyright (C) 2016 and later: Unicode, Inc. and others. 
+# License & terms of use: http://www.unicode.org/copyright.html +# Some Portions Copyright (c) 2006-2012 IBM and others. All Rights Reserved. + +srcdir = @srcdir@ +top_srcdir = @top_srcdir@ + +top_builddir = ../.. + +subdir = tools/tzcode + +include $(top_builddir)/icudefs.mk + +ifeq ($(TZDATA),) +TZDATA = $(firstword $(wildcard ./tzdata*.tar.gz) $(wildcard $(srcdir)/tzdata*.tar.gz)) +endif +ifeq ($(TZCODE),) +TZCODE = $(firstword $(wildcard ./tzcode*.tar.gz) $(wildcard $(srcdir)/tzcode*.tar.gz)) +endif + + +PRIMARY_DATA = africa antarctica asia australasia europe northamerica southamerica +SUPPLEMENTAL_DATA = etcetera factory backward +#DEPRECATED_DATA = pacificnew systemv solar87 solar88 solar89 +#TDATA = $(PRIMARY_DATA) $(SUPPLEMENTAL_DATA) $(DEPRECATED_DATA) +TDATA = $(PRIMARY_DATA) $(SUPPLEMENTAL_DATA) + +TZDIR=zoneinfo + +CFLAGS+=-D_POSIX_C_SOURCE +CPPFLAGS+= -DTZDIR=\"$(TZDIR)\" + +# more data +XDATA=zone.tab leapseconds iso3166.tab +ICUDATA=ZoneMetaData.java icu_zone.txt tz2icu zoneinfo64.txt zoneinfo.txt + +VANGUARD_DIR= ./vanguard + +# For TZ DB/ICU comparison +TZORIG=./tzorig +TZORIG_TZDIR=./tzorig/tzdir +TZORIG_ABS := $(shell pwd)/tzorig +TZORIG_TZDIR_ABS := $(TZORIG_ABS)/tzdir +TZORIG_OPTS := CFLAGS="-D_POSIX_C_SOURCE $(TZORIG_EXTRA_CFLAGS)" TZDIR=$(TZORIG_TZDIR_ABS) + + +## Options for building zdumps +ZDUMPOUT=$(shell pwd)/zdumpout +ICUZDUMPOUT=$(shell pwd)/icuzdumpout + +ZDUMP_OPTS= -v -a -d $(ZDUMPOUT) -c 1902,2038 -i +ICUZDUMP_OPTS= -a -d $(ICUZDUMPOUT) + +# Executables & objects +OBJECTS= zic.o localtime.o asctime.o scheck.o ialloc.o +ZICTARG=$(BINDIR)/zic$(EXEEXT) +ZICEXEC=$(TOOLBINDIR)/zic$(TOOLEXEEXT) +TZ2ICUTARG=$(BINDIR)/tz2icu$(EXEEXT) +TZ2ICUEXEC=$(TOOLBINDIR)/tz2icu$(TOOLEXEEXT) +ICUZDUMPTARG=$(BINDIR)/icuzdump$(EXEEXT) +ICUZDUMPEXEC=$(TOOLBINDIR)/icuzdump$(TOOLEXEEXT) + +ifeq ($(TZDATA),) +all: + @echo ERROR "tzdata*.tar.gz" can\'t be found. 
+ @false +else +all: icu_data +endif + +TZCODE_TARGETS= tzorig check-dump + +ifeq ($(TZCODE),) +# we're broken. +$(TZCODE_TARGETS): + @echo ERROR "tzcode*.tar.gz" can\'t be found. + @false + +else +ifeq ($(TZDATA),) +# we're broken. +$(TZCODE_TARGETS): + @echo ERROR "tzdata*.tar.gz" can\'t be found. + @false +else +tzorig: $(TZCODE) $(TZDATA) + -$(RMV) ./tzorig/ + mkdir $@ + mkdir $(TZORIG_TZDIR) + gunzip -d < $(TZDATA) | ( cd $@ ; tar xf - ) + gunzip -d < $(TZCODE) | ( cd $@ ; tar xf - ) + for tzfile in $(TDATA) ; do \ + mv $(TZORIG)/$$tzfile $(TZORIG)/$$tzfile.bak && \ + awk -v DATAFORM=rearguard -f $(TZORIG)/ziguard.awk $(TZORIG)/$$tzfile.bak > $(TZORIG)/$$tzfile; \ + done + -mv $(TZORIG)/zdump.c $(TZORIG)/zdump.c.orig + cp $(srcdir)/zdump.c $(TZORIG)/zdump.c + -mv $(TZORIG)/factory $(TZORIG)/factory.orig + cat $(TZORIG)/factory.orig $(srcdir)/icuzones > $(TZORIG)/factory + -mv $(TZORIG)/zishrink.awk $(TZORIG)/zishrink.awk.orig + sed -e '/if (line ~ \/^R SystemV \/) return/s/^/#/' $(TZORIG)/zishrink.awk.orig > $(TZORIG)/zishrink.awk +# -mv $(TZORIG)/Makefile $(TZORIG)/Makefile.orig +# sed -e "s/^BACKWARD=.*/BACKWARD= backward pacificnew/" $(TZORIG)/Makefile.orig > $(TZORIG)/Makefile + $(MAKE) -C $@ $(TZORIG_OPTS) zdump zones + +$(ZDUMPOUT): tzorig + ( cd $(TZORIG) ; ./zdump$(EXEEXT) $(ZDUMP_OPTS) ) + find $(ZDUMPOUT) -name '*--ICU' -exec sh -c 'mv "$${0}" $${0%--ICU}' {} \; + +dump-out: $(ZDUMPOUT) $(ICUZDUMPOUT) + +check-dump: dump-out + diff -r zdumpout icuzdumpout + +endif +endif + +$(ICUZDUMPOUT): $(ICUZDUMPEXEC) + -$(RMV) $(ICUZDUMPOUT) + -mkdir $(ICUZDUMPOUT) + $(INVOKE) $(ICUZDUMPEXEC) $(ICUZDUMP_OPTS) + + +# +# old 'tz' rules start here +# + + +$(ZICTARG): $(OBJECTS) $(TDATA) $(srcdir)/tz2icu.h + $(CC) $(CFLAGS) $(TZORIG_EXTRA_CFLAGS) $(LFLAGS) -I$(srcdir) $(OBJECTS) $(LDLIBS) -o $@ + +$(TZ2ICUTARG): $(srcdir)/tz2icu.cpp $(srcdir)/tz2icu.h + $(CXX) $(CXXFLAGS) -I$(srcdir) -I$(top_srcdir)/common $(srcdir)/tz2icu.cpp -o $@ + +$(ICUZDUMPTARG): 
$(srcdir)/icuzdump.cpp + $(LINK.cc) -I$(srcdir) -I$(top_srcdir)/common -I$(top_srcdir)/i18n -I$(top_srcdir)/tools/toolutil -I$(top_srcdir)/io -pedantic $(srcdir)/icuzdump.cpp $(LIBICUUC) $(LIBICUDT) $(LIBICUI18N) $(LIBICUIO) $(LIBICUTOOLUTIL) -o $@ + + +$(TDATA): tdatamarker + +tdatamarker: $(TZDATA) + mkdir $(VANGUARD_DIR) + gunzip -d < $(TZDATA) | tar xf - --exclude=Makefile + for tzfile in $(TDATA) ; do \ + mv $$tzfile $(VANGUARD_DIR)/$$tzfile && \ + awk -v DATAFORM=rearguard -f ziguard.awk $(VANGUARD_DIR)/$$tzfile > $$tzfile; \ + done + touch $@ + +posix_only: $(ZICEXEC) $(TDATA) $(srcdir)/icuzones + $(ZICEXEC) -d $(TZDIR) -L /dev/null $(TDATA) $(srcdir)/icuzones + + +icu_data: $(TZ2ICUEXEC) posix_only + $(TZ2ICUEXEC) $(TZDIR) zone.tab `echo $(TZDATA) | sed -e "s/.*\/tzdata//;s/\.tar\.gz$$//"` + $(TZ2ICUEXEC) $(TZDIR) zone.tab `echo $(TZDATA) | sed -e "s/.*\/tzdata//;s/\.tar\.gz$$//"` --old + +clean: + -rm -f core *.o *.out zdump${EXEEXT} $(ZICTARG) date $(TZ2ICUTARG) + @echo ICU specific cleanup: + -rm -f $(ICUDATA) + -rm -rf $(TZDIR) + -rm -rf $(VANGUARD_DIR) + -$(RMV) $(ICUZDUMPTARG) tzorig ./zdumpout/ ./icuzdumpout/ +ifneq ($(TZDATA),) + -rm -rf `gunzip -d < $(TZDATA) | tar tf - --exclude=Makefile | grep -o '[^ ]*$$' | tr '\n' ' '` + -rm -f tdatamarker +endif + +checkclean: + +dataclean: clean + -rm -f $(TDATA) $(XDATA) + +distclean: dataclean clean + -rm -f Makefile + +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + cd $(top_builddir) \ + && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status + + diff --git a/intl/icu/source/tools/tzcode/asctime.c b/intl/icu/source/tools/tzcode/asctime.c new file mode 100644 index 0000000000..152b0db4e5 --- /dev/null +++ b/intl/icu/source/tools/tzcode/asctime.c @@ -0,0 +1,132 @@ +/* +** This file is in the public domain, so clarified as of +** 1996-06-05 by Arthur David Olson. 
+*/ + +/* +** Avoid the temptation to punt entirely to strftime; +** the output of strftime is supposed to be locale specific +** whereas the output of asctime is supposed to be constant. +*/ + +/*LINTLIBRARY*/ + +#include "private.h" +#include "tzfile.h" + +/* +** Some systems only handle "%.2d"; others only handle "%02d"; +** "%02.2d" makes (most) everybody happy. +** At least some versions of gcc warn about the %02.2d; +** we conditionalize below to avoid the warning. +*/ +/* +** All years associated with 32-bit time_t values are exactly four digits long; +** some years associated with 64-bit time_t values are not. +** Vintage programs are coded for years that are always four digits long +** and may assume that the newline always lands in the same place. +** For years that are less than four digits, we pad the output with +** leading zeroes to get the newline in the traditional place. +** The -4 ensures that we get four characters of output even if +** we call a strftime variant that produces fewer characters for some years. +** The ISO C 1999 and POSIX 1003.1-2004 standards prohibit padding the year, +** but many implementations pad anyway; most likely the standards are buggy. +*/ +#ifdef __GNUC__ +#define ASCTIME_FMT "%.3s %.3s%3d %2.2d:%2.2d:%2.2d %-4s\n" +#else /* !defined __GNUC__ */ +#define ASCTIME_FMT "%.3s %.3s%3d %02.2d:%02.2d:%02.2d %-4s\n" +#endif /* !defined __GNUC__ */ +/* +** For years that are more than four digits we put extra spaces before the year +** so that code trying to overwrite the newline won't end up overwriting +** a digit within a year and truncating the year (operating on the assumption +** that no output is better than wrong output). +*/ +#ifdef __GNUC__ +#define ASCTIME_FMT_B "%.3s %.3s%3d %2.2d:%2.2d:%2.2d %s\n" +#else /* !defined __GNUC__ */ +#define ASCTIME_FMT_B "%.3s %.3s%3d %02.2d:%02.2d:%02.2d %s\n" +#endif /* !defined __GNUC__ */ + +#define STD_ASCTIME_BUF_SIZE 26 +/* +** Big enough for something such as +** ??? 
???-2147483648 -2147483648:-2147483648:-2147483648 -2147483648\n +** (two three-character abbreviations, five strings denoting integers, +** seven explicit spaces, two explicit colons, a newline, +** and a trailing ASCII nul). +** The values above are for systems where an int is 32 bits and are provided +** as an example; the define below calculates the maximum for the system at +** hand. +*/ +#define MAX_ASCTIME_BUF_SIZE (2*3+5*INT_STRLEN_MAXIMUM(int)+7+2+1+1) + +static char buf_asctime[MAX_ASCTIME_BUF_SIZE]; + +/* +** A la ISO/IEC 9945-1, ANSI/IEEE Std 1003.1, 2004 Edition. +*/ + +char * +asctime_r(register const struct tm *timeptr, char *buf) +{ + static const char wday_name[][3] = { + "Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat" + }; + static const char mon_name[][3] = { + "Jan", "Feb", "Mar", "Apr", "May", "Jun", + "Jul", "Aug", "Sep", "Oct", "Nov", "Dec" + }; + register const char * wn; + register const char * mn; + char year[INT_STRLEN_MAXIMUM(int) + 2]; + char result[MAX_ASCTIME_BUF_SIZE]; + + if (timeptr == NULL) { + errno = EINVAL; + return strcpy(buf, "??? ??? ?? ??:??:?? ????\n"); + } + if (timeptr->tm_wday < 0 || timeptr->tm_wday >= DAYSPERWEEK) + wn = "???"; + else wn = wday_name[timeptr->tm_wday]; + if (timeptr->tm_mon < 0 || timeptr->tm_mon >= MONSPERYEAR) + mn = "???"; + else mn = mon_name[timeptr->tm_mon]; + /* + ** Use strftime's %Y to generate the year, to avoid overflow problems + ** when computing timeptr->tm_year + TM_YEAR_BASE. + ** Assume that strftime is unaffected by other out-of-range members + ** (e.g., timeptr->tm_mday) when processing "%Y". + */ + (void) strftime(year, sizeof year, "%Y", timeptr); + /* + ** We avoid using snprintf since it's not available on all systems. + */ + (void) sprintf(result, + ((strlen(year) <= 4) ? 
ASCTIME_FMT : ASCTIME_FMT_B), + wn, mn, + timeptr->tm_mday, timeptr->tm_hour, + timeptr->tm_min, timeptr->tm_sec, + year); + if (strlen(result) < STD_ASCTIME_BUF_SIZE || buf == buf_asctime) + return strcpy(buf, result); + else { +#ifdef EOVERFLOW + errno = EOVERFLOW; +#else /* !defined EOVERFLOW */ + errno = EINVAL; +#endif /* !defined EOVERFLOW */ + return NULL; + } +} + +/* +** A la ISO/IEC 9945-1, ANSI/IEEE Std 1003.1, 2004 Edition. +*/ + +char * +asctime(register const struct tm *timeptr) +{ + return asctime_r(timeptr, buf_asctime); +} diff --git a/intl/icu/source/tools/tzcode/ialloc.c b/intl/icu/source/tools/tzcode/ialloc.c new file mode 100644 index 0000000000..b6f018897b --- /dev/null +++ b/intl/icu/source/tools/tzcode/ialloc.c @@ -0,0 +1,32 @@ +/* +** This file is in the public domain, so clarified as of +** 2006-07-17 by Arthur David Olson. +*/ + +/*LINTLIBRARY*/ + +#include "private.h" + +char * +icatalloc(char *const old, const char *const new) +{ + register char * result; + register int oldsize, newsize; + + newsize = (new == NULL) ? 0 : strlen(new); + if (old == NULL) + oldsize = 0; + else if (newsize == 0) + return old; + else oldsize = strlen(old); + if ((result = realloc(old, oldsize + newsize + 1)) != NULL) + if (new != NULL) + (void) strcpy(result + oldsize, new); + return result; +} + +char * +icpyalloc(const char *const string) +{ + return icatalloc(NULL, string); +} diff --git a/intl/icu/source/tools/tzcode/icuregions b/intl/icu/source/tools/tzcode/icuregions new file mode 100644 index 0000000000..2cf5d2f701 --- /dev/null +++ b/intl/icu/source/tools/tzcode/icuregions @@ -0,0 +1,25 @@ +# Copyright (C) 2016 and later: Unicode, Inc. and others. +# License & terms of use: http://www.unicode.org/copyright.html +###################################################################### +# Copyright (C) 2013-2014, International Business Machines +# Corporation and others. All Rights Reserved. 
+###################################################################### +# This is an ICU-specific file including zone/region mapping. +# +# Each line below indicates zone and its region in the syntax below - +# <zone_id> <region_code> +# +Africa/Asmera ER +Africa/Timbuktu ML +America/Coral_Harbour CA +America/Montreal CA +America/Pangnirtung CA +America/Virgin VI +Antarctica/South_Pole AQ +Atlantic/Jan_Mayen SJ +Europe/Simferopol UA +Pacific/Johnston UM +Pacific/Ponape FM +Pacific/Truk FM +Pacific/Yap FM + diff --git a/intl/icu/source/tools/tzcode/icuzdump.cpp b/intl/icu/source/tools/tzcode/icuzdump.cpp new file mode 100644 index 0000000000..c82fc43373 --- /dev/null +++ b/intl/icu/source/tools/tzcode/icuzdump.cpp @@ -0,0 +1,426 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2007-2016, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: icuzdump.cpp +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2007-04-02 +* created by: Yoshito Umaoka +* +* This tool writes out timezone transitions for ICU timezones. This tool +* is used as part of the tzdata update process to check if ICU timezone +* code works as well as the corresponding Olson stock localtime/zdump. 
+*/ + +#include <cstdlib> +#include <cstring> +#include <fstream> +#include <sstream> +#include <iostream> + +#include "unicode/utypes.h" +#include "unicode/ustring.h" +#include "unicode/timezone.h" +#include "unicode/simpletz.h" +#include "unicode/smpdtfmt.h" +#include "unicode/decimfmt.h" +#include "unicode/gregocal.h" +#include "unicode/ustream.h" +#include "unicode/putil.h" + +#include "cmemory.h" +#include "uoptions.h" + +using namespace std; +using namespace icu; + +class DumpFormatter { +public: + DumpFormatter() { + UErrorCode status = U_ZERO_ERROR; + stz = new SimpleTimeZone(0, ""); + sdf = new SimpleDateFormat((UnicodeString)"yyyy-MM-dd EEE HH:mm:ss", Locale::getEnglish(), status); + DecimalFormatSymbols *symbols = new DecimalFormatSymbols(Locale::getEnglish(), status); + decf = new DecimalFormat("00", symbols, status); + } + ~DumpFormatter() { + } + + UnicodeString& format(UDate time, int32_t offset, UBool isDst, UnicodeString& appendTo) { + stz->setRawOffset(offset); + sdf->setTimeZone(*stz); + UnicodeString str = sdf->format(time, appendTo); + if (offset < 0) { + appendTo += "-"; + offset = -offset; + } else { + appendTo += "+"; + } + + int32_t hour, min, sec; + + offset /= 1000; + sec = offset % 60; + offset = (offset - sec) / 60; + min = offset % 60; + hour = offset / 60; + + decf->format(hour, appendTo); + decf->format(min, appendTo); + decf->format(sec, appendTo); + appendTo += "[DST="; + if (isDst) { + appendTo += "1"; + } else { + appendTo += "0"; + } + appendTo += "]"; + return appendTo; + } +private: + SimpleTimeZone* stz; + SimpleDateFormat* sdf; + DecimalFormat* decf; +}; + +class ICUZDump { +public: + ICUZDump() { + formatter = new DumpFormatter(); + loyear = 1902; + hiyear = 2050; + tick = 1000; + linesep = nullptr; + } + + ~ICUZDump() { + } + + void setLowYear(int32_t lo) { + loyear = lo; + } + + void setHighYear(int32_t hi) { + hiyear = hi; + } + + void setTick(int32_t t) { + tick = t; + } + + void setTimeZone(TimeZone* tz) { + timezone = 
tz; + } + + void setDumpFormatter(DumpFormatter* fmt) { + formatter = fmt; + } + + void setLineSeparator(const char* sep) { + linesep = sep; + } + + void dump(ostream& out) { + UErrorCode status = U_ZERO_ERROR; + UDate SEARCH_INCREMENT = 12 * 60 * 60 * 1000; // half day + UDate t, cutlo, cuthi; + int32_t rawOffset, dstOffset; + UnicodeString str; + + getCutOverTimes(cutlo, cuthi); + t = cutlo; + timezone->getOffset(t, false, rawOffset, dstOffset, status); + while (t < cuthi) { + int32_t newRawOffset, newDstOffset; + UDate newt = t + SEARCH_INCREMENT; + + timezone->getOffset(newt, false, newRawOffset, newDstOffset, status); + + UBool bSameOffset = (rawOffset + dstOffset) == (newRawOffset + newDstOffset); + UBool bSameDst = ((dstOffset != 0) && (newDstOffset != 0)) || ((dstOffset == 0) && (newDstOffset == 0)); + + if (!bSameOffset || !bSameDst) { + // find the boundary + UDate lot = t; + UDate hit = newt; + while (true) { + int32_t diff = (int32_t)(hit - lot); + if (diff <= tick) { + break; + } + UDate medt = lot + ((diff / 2) / tick) * tick; + int32_t medRawOffset, medDstOffset; + timezone->getOffset(medt, false, medRawOffset, medDstOffset, status); + + bSameOffset = (rawOffset + dstOffset) == (medRawOffset + medDstOffset); + bSameDst = ((dstOffset != 0) && (medDstOffset != 0)) || ((dstOffset == 0) && (medDstOffset == 0)); + + if (!bSameOffset || !bSameDst) { + hit = medt; + } else { + lot = medt; + } + } + // write out the boundary + str.remove(); + formatter->format(lot, rawOffset + dstOffset, (dstOffset == 0 ? false : true), str); + out << str << " > "; + str.remove(); + formatter->format(hit, newRawOffset + newDstOffset, (newDstOffset == 0 ? 
false : true), str); + out << str; + if (linesep != nullptr) { + out << linesep; + } else { + out << endl; + } + + rawOffset = newRawOffset; + dstOffset = newDstOffset; + } + t = newt; + } + } + +private: + void getCutOverTimes(UDate& lo, UDate& hi) { + UErrorCode status = U_ZERO_ERROR; + GregorianCalendar* gcal = new GregorianCalendar(timezone, Locale::getEnglish(), status); + gcal->clear(); + gcal->set(loyear, 0, 1, 0, 0, 0); + lo = gcal->getTime(status); + gcal->set(hiyear, 0, 1, 0, 0, 0); + hi = gcal->getTime(status); + } + + TimeZone* timezone; + int32_t loyear; + int32_t hiyear; + int32_t tick; + + DumpFormatter* formatter; + const char* linesep; +}; + +class ZoneIterator { +public: + ZoneIterator(UBool bAll = false) { + if (bAll) { + UErrorCode status = U_ZERO_ERROR; + zenum = TimeZone::createEnumeration(status); + // TODO: Add error case handling later. + } + else { + zenum = nullptr; + zids = nullptr; + idx = 0; + numids = 1; + } + } + + ZoneIterator(const char** ids, int32_t num) { + zenum = nullptr; + zids = ids; + idx = 0; + numids = num; + } + + ~ZoneIterator() { + if (zenum != nullptr) { + delete zenum; + } + } + + TimeZone* next() { + TimeZone* tz = nullptr; + if (zenum != nullptr) { + UErrorCode status = U_ZERO_ERROR; + const UnicodeString* zid = zenum->snext(status); + if (zid != nullptr) { + tz = TimeZone::createTimeZone(*zid); + } + } + else { + if (idx < numids) { + if (zids != nullptr) { + tz = TimeZone::createTimeZone((const UnicodeString&)zids[idx]); + } + else { + tz = TimeZone::createDefault(); + } + idx++; + } + } + return tz; + } + +private: + const char** zids; + StringEnumeration* zenum; + int32_t idx; + int32_t numids; +}; + +enum { + kOptHelpH = 0, + kOptHelpQuestionMark, + kOptAllZones, + kOptCutover, + kOptDestDir, + kOptLineSep +}; + +static UOption options[]={ + UOPTION_HELP_H, + UOPTION_HELP_QUESTION_MARK, + UOPTION_DEF("allzones", 'a', UOPT_NO_ARG), + UOPTION_DEF("cutover", 'c', UOPT_REQUIRES_ARG), + UOPTION_DEF("destdir", 'd', 
UOPT_REQUIRES_ARG), + UOPTION_DEF("linesep", 'l', UOPT_REQUIRES_ARG) +}; + +extern int +main(int argc, char *argv[]) { + int32_t low = 1902; + int32_t high = 2038; + UBool bAll = false; + const char *dir = nullptr; + const char *linesep = nullptr; + + U_MAIN_INIT_ARGS(argc, argv); + argc = u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options); + + if (argc < 0) { + cerr << "Illegal command line argument(s)" << endl << endl; + } + + if (argc < 0 || options[kOptHelpH].doesOccur || options[kOptHelpQuestionMark].doesOccur) { + cerr + << "Usage: icuzdump [-options] [zoneid1 zoneid2 ...]" << endl + << endl + << "\tDump all offset transitions for the specified zones." << endl + << endl + << "Options:" << endl + << "\t-a : Dump all available zones." << endl + << "\t-d <dir> : When specified, write transitions in a file under" << endl + << "\t the directory for each zone." << endl + << "\t-l <sep> : New line code type used in file outputs. CR or LF (default)" + << "\t or CRLF." << endl + << "\t-c [<low_year>,]<high_year>" << endl + << "\t : When specified, dump transitions starting <low_year>" << endl + << "\t (inclusive) up to <high_year> (exclusive). The default" << endl + << "\t values are 1902(low) and 2038(high)." << endl; + return argc < 0 ? 
U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; + } + + bAll = options[kOptAllZones].doesOccur; + + if (options[kOptDestDir].doesOccur) { + dir = options[kOptDestDir].value; + } + + if (options[kOptLineSep].doesOccur) { + if (strcmp(options[kOptLineSep].value, "CR") == 0) { + linesep = "\r"; + } else if (strcmp(options[kOptLineSep].value, "CRLF") == 0) { + linesep = "\r\n"; + } else if (strcmp(options[kOptLineSep].value, "LF") == 0) { + linesep = "\n"; + } + } + + if (options[kOptCutover].doesOccur) { + char* comma = (char*)strchr(options[kOptCutover].value, ','); + if (comma == nullptr) { + high = atoi(options[kOptCutover].value); + } else { + *comma = 0; + low = atoi(options[kOptCutover].value); + high = atoi(comma + 1); + } + } + + ICUZDump dumper; + dumper.setLowYear(low); + dumper.setHighYear(high); + if (dir != nullptr && linesep != nullptr) { + // use the specified line separator only for file output + dumper.setLineSeparator((const char*)linesep); + } + + ZoneIterator* zit; + if (bAll) { + zit = new ZoneIterator(true); + } else { + if (argc <= 1) { + zit = new ZoneIterator(); + } else { + zit = new ZoneIterator((const char**)&argv[1], argc - 1); + } + } + + UnicodeString id; + if (dir != nullptr) { + // file output + ostringstream path; + ios::openmode mode = ios::out; + if (linesep != nullptr) { + mode |= ios::binary; + } + for (;;) { + TimeZone* tz = zit->next(); + if (tz == nullptr) { + break; + } + dumper.setTimeZone(tz); + tz->getID(id); + + // target file path + path.str(""); + path << dir << U_FILE_SEP_CHAR; + id = id.findAndReplace("/", "-"); + path << id; + + ofstream* fout = new ofstream(path.str().c_str(), mode); + if (fout->fail()) { + cerr << "Cannot open file " << path.str() << endl; + delete fout; + delete tz; + break; + } + + dumper.dump(*fout); + fout->close(); + delete fout; + delete tz; + } + + } else { + // stdout + UBool bFirst = true; + for (;;) { + TimeZone* tz = zit->next(); + if (tz == nullptr) { + break; + } + dumper.setTimeZone(tz); + 
tz->getID(id); + if (bFirst) { + bFirst = false; + } else { + cout << endl; + } + cout << "ZONE: " << id << endl; + dumper.dump(cout); + delete tz; + } + } + delete zit; +} diff --git a/intl/icu/source/tools/tzcode/icuzdump.vcxproj b/intl/icu/source/tools/tzcode/icuzdump.vcxproj new file mode 100644 index 0000000000..4f7b96fc29 --- /dev/null +++ b/intl/icu/source/tools/tzcode/icuzdump.vcxproj @@ -0,0 +1,110 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <ItemGroup Label="ProjectConfigurations"> + <ProjectConfiguration Include="Debug|Win32"> + <Configuration>Debug</Configuration> + <Platform>Win32</Platform> + </ProjectConfiguration> + <ProjectConfiguration Include="Release|Win32"> + <Configuration>Release</Configuration> + <Platform>Win32</Platform> + </ProjectConfiguration> + </ItemGroup> + <PropertyGroup Label="Globals"> + <ProjectGuid>{655F4481-B461-4DF0-AF10-0D01114A26C1}</ProjectGuid> + <WindowsTargetPlatformVersion>8.1</WindowsTargetPlatformVersion> + <RootNamespace>icuzdump</RootNamespace> + <Keyword>Win32Proj</Keyword> + </PropertyGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> + <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration"> + <ConfigurationType>Application</ConfigurationType> + <CharacterSet>Unicode</CharacterSet> + <WholeProgramOptimization>true</WholeProgramOptimization> + </PropertyGroup> + <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration"> + <ConfigurationType>Application</ConfigurationType> + <CharacterSet>Unicode</CharacterSet> + </PropertyGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> + <ImportGroup Label="ExtensionSettings"> + </ImportGroup> + <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="PropertySheets"> + <Import 
Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> + </ImportGroup> + <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="PropertySheets"> + <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" /> + </ImportGroup> + <PropertyGroup Label="UserMacros" /> + <PropertyGroup> + <_ProjectFileVersion>10.0.40219.1</_ProjectFileVersion> + <OutDir Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">.\x86\Debug\</OutDir> + <IntDir Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">.\x86\Debug\</IntDir> + <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</LinkIncremental> + <OutDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(SolutionDir)$(Configuration)\</OutDir> + <IntDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(Configuration)\</IntDir> + <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">false</LinkIncremental> + </PropertyGroup> + <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> + <CustomBuildStep> + <Command>copy "$(TargetPath)" ..\..\..\bin +</Command> + <Outputs>..\..\..\bin\$(TargetFileName);%(Outputs)</Outputs> + </CustomBuildStep> + <ClCompile> + <Optimization>Disabled</Optimization> + <AdditionalIncludeDirectories>..\..\..\include;..\toolutil;..\..\common;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> + <PreprocessorDefinitions>WIN32;_DEBUG;_CRT_SECURE_NO_DEPRECATE;%(PreprocessorDefinitions)</PreprocessorDefinitions> + <MinimalRebuild>false</MinimalRebuild> + <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks> + <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary> + <DisableLanguageExtensions>true</DisableLanguageExtensions> + <PrecompiledHeader> + 
</PrecompiledHeader> + <PrecompiledHeaderOutputFile>.\x86\Debug/icuzdump.pch</PrecompiledHeaderOutputFile> + <AssemblerListingLocation>.\x86\Debug/</AssemblerListingLocation> + <ObjectFileName>.\x86\Debug/</ObjectFileName> + <ProgramDataBaseFileName>.\x86\Debug/</ProgramDataBaseFileName> + <BrowseInformation>true</BrowseInformation> + <WarningLevel>Level3</WarningLevel> + <DebugInformationFormat>EditAndContinue</DebugInformationFormat> + <CompileAs>Default</CompileAs> + </ClCompile> + <Link> + <AdditionalDependencies>icuucd.lib;icuind.lib;icutud.lib;icuiod.lib;%(AdditionalDependencies)</AdditionalDependencies> + <OutputFile>.\x86\Debug/icuzdump.exe</OutputFile> + <AdditionalLibraryDirectories>..\..\..\lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories> + <GenerateDebugInformation>true</GenerateDebugInformation> + <ProgramDatabaseFile>.\x86\Debug/icuzdump.pdb</ProgramDatabaseFile> + <SubSystem>Console</SubSystem> + <DataExecutionPrevention> + </DataExecutionPrevention> + <TargetMachine>NotSet</TargetMachine> + </Link> + </ItemDefinitionGroup> + <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'"> + <ClCompile> + <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions> + <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary> + <PrecompiledHeader> + </PrecompiledHeader> + <WarningLevel>Level3</WarningLevel> + <DebugInformationFormat>ProgramDatabase</DebugInformationFormat> + </ClCompile> + <Link> + <GenerateDebugInformation>true</GenerateDebugInformation> + <SubSystem>Console</SubSystem> + <OptimizeReferences>true</OptimizeReferences> + <EnableCOMDATFolding>true</EnableCOMDATFolding> + <DataExecutionPrevention> + </DataExecutionPrevention> + <TargetMachine>MachineX86</TargetMachine> + </Link> + </ItemDefinitionGroup> + <ItemGroup> + <ClCompile Include="icuzdump.cpp" /> + </ItemGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> + <ImportGroup Label="ExtensionTargets"> + 
</ImportGroup> +</Project> diff --git a/intl/icu/source/tools/tzcode/icuzdump.vcxproj.filters b/intl/icu/source/tools/tzcode/icuzdump.vcxproj.filters new file mode 100644 index 0000000000..8004a63a1c --- /dev/null +++ b/intl/icu/source/tools/tzcode/icuzdump.vcxproj.filters @@ -0,0 +1,22 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <ItemGroup> + <Filter Include="Source Files"> + <UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier> + <Extensions>cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx</Extensions> + </Filter> + <Filter Include="Header Files"> + <UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier> + <Extensions>h;hpp;hxx;hm;inl;inc;xsd</Extensions> + </Filter> + <Filter Include="Resource Files"> + <UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier> + <Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav</Extensions> + </Filter> + </ItemGroup> + <ItemGroup> + <ClCompile Include="icuzdump.cpp"> + <Filter>Source Files</Filter> + </ClCompile> + </ItemGroup> +</Project>
\ No newline at end of file diff --git a/intl/icu/source/tools/tzcode/icuzones b/intl/icu/source/tools/tzcode/icuzones new file mode 100644 index 0000000000..52f5698cd7 --- /dev/null +++ b/intl/icu/source/tools/tzcode/icuzones @@ -0,0 +1,96 @@ +# Copyright (C) 2016 and later: Unicode, Inc. and others. +# License & terms of use: http://www.unicode.org/copyright.html +###################################################################### +# Copyright (C) 2007-2014, International Business Machines +# Corporation and others. All Rights Reserved. +###################################################################### +# This is an ICU-specific file with the same format as regular +# tzdata time zone files, for consistent parsing by the tools that +# turn "Olson" tzdata into ICU's zoneinfo.txt. +# The purpose of this file is to give ICU a superset of the time zones +# that are in CLDR and also include legacy ICU time zones originally +# in tz.alias for retaining backward compatibility. + +# Add Etc/Unknown, defined by CLDR. Give it Etc/GMT behavior. + +# Zone NAME GMTOFF RULES FORMAT +Zone Etc/Unknown 0 - Unknown + +# SystemV time zones. +# IANA tzdb file 'systemv' had these SystemV/* zones commented out up to 2020a. +# The 'systemv' file was removed in 2020b. We keep them in this supplemental zone data +# file for compatibility purposes. 
+ +# Rule NAME FROM TO TYPE IN ON AT SAVE LETTER/S +Rule SystemV min 1973 - Apr lastSun 2:00 1:00 D +Rule SystemV min 1973 - Oct lastSun 2:00 0 S +Rule SystemV 1974 only - Jan 6 2:00 1:00 D +Rule SystemV 1974 only - Nov lastSun 2:00 0 S +Rule SystemV 1975 only - Feb 23 2:00 1:00 D +Rule SystemV 1975 only - Oct lastSun 2:00 0 S +Rule SystemV 1976 max - Apr lastSun 2:00 1:00 D +Rule SystemV 1976 max - Oct lastSun 2:00 0 S + +# Zone NAME GMTOFF RULES/SAVE FORMAT [UNTIL] +Zone SystemV/AST4ADT -4:00 SystemV A%sT +Zone SystemV/EST5EDT -5:00 SystemV E%sT +Zone SystemV/CST6CDT -6:00 SystemV C%sT +Zone SystemV/MST7MDT -7:00 SystemV M%sT +Zone SystemV/PST8PDT -8:00 SystemV P%sT +Zone SystemV/YST9YDT -9:00 SystemV Y%sT +Zone SystemV/AST4 -4:00 - AST +Zone SystemV/EST5 -5:00 - EST +Zone SystemV/CST6 -6:00 - CST +Zone SystemV/MST7 -7:00 - MST +Zone SystemV/PST8 -8:00 - PST +Zone SystemV/YST9 -9:00 - YST +Zone SystemV/HST10 -10:00 - HST + +# pacificnew +# IANA tzdb file 'pacificnew' used to contain a Link for US/Pacific-New. +# 'pacificnew' file was removed in 2020b. We keep the Link here for compatibility. +Link America/Los_Angeles US/Pacific-New + + +# The list below is for supporting legacy ICU zone aliases. +# These definitions were originally defined in tz.alias. 
+ +#### Aliases that conflict with Olson compatibility Zone definition + +Link Australia/Darwin ACT +Link Australia/Sydney AET +Link America/Argentina/Buenos_Aires AGT +Link Africa/Cairo ART +Link America/Anchorage AST +Link America/Sao_Paulo BET +Link Asia/Dhaka BST +Link Africa/Maputo CAT +Link America/St_Johns CNT +Link America/Chicago CST +Link Asia/Shanghai CTT +Link Africa/Addis_Ababa EAT +Link Europe/Paris ECT +#Link Europe/Istanbul EET # EET is a standard UNIX zone +####Link EST America/New_York EST # Defined as -05:00 +####Link Pacific/Honolulu HST # Defined as -10:00 +Link America/Indiana/Indianapolis IET +Link Asia/Kolkata IST +Link Asia/Tokyo JST +#Link Asia/Tehran MET # MET is a standard UNIX zone +Link Pacific/Apia MIT +####Link America/Denver MST # Defined as -07:00 +Link Asia/Yerevan NET +Link Pacific/Auckland NST +Link Asia/Karachi PLT +Link America/Phoenix PNT +Link America/Puerto_Rico PRT +Link America/Los_Angeles PST +Link Pacific/Guadalcanal SST +#Link Etc/UTC UTC # Olson LINK +Link Asia/Ho_Chi_Minh VST + +# +# Aliases already dropped from the TZ database. +# ICU may also remove these aliases. +# +Link America/Regina Canada/East-Saskatchewan # removed from backward in 2017c diff --git a/intl/icu/source/tools/tzcode/localtime.c b/intl/icu/source/tools/tzcode/localtime.c new file mode 100644 index 0000000000..8d84a92ddd --- /dev/null +++ b/intl/icu/source/tools/tzcode/localtime.c @@ -0,0 +1,2058 @@ +/* +** This file is in the public domain, so clarified as of +** 1996-06-05 by Arthur David Olson. +*/ + +/* +** Leap second handling from Bradley White. +** POSIX-style TZ environment variable handling from Guy Harris. 
+*/ + +/*LINTLIBRARY*/ + +#include <stdbool.h> + +#include "private.h" +#include "tzfile.h" +#include "fcntl.h" + +#ifndef TZ_ABBR_MAX_LEN +#define TZ_ABBR_MAX_LEN 16 +#endif /* !defined TZ_ABBR_MAX_LEN */ + +#ifndef TZ_ABBR_CHAR_SET +#define TZ_ABBR_CHAR_SET \ + "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 :+-._" +#endif /* !defined TZ_ABBR_CHAR_SET */ + +#ifndef TZ_ABBR_ERR_CHAR +#define TZ_ABBR_ERR_CHAR '_' +#endif /* !defined TZ_ABBR_ERR_CHAR */ + +/* +** SunOS 4.1.1 headers lack O_BINARY. +*/ + +#ifdef O_BINARY +#define OPEN_MODE (O_RDONLY | O_BINARY) +#endif /* defined O_BINARY */ +#ifndef O_BINARY +#define OPEN_MODE O_RDONLY +#endif /* !defined O_BINARY */ + +#ifndef WILDABBR +/* +** Someone might make incorrect use of a time zone abbreviation: +** 1. They might reference tzname[0] before calling tzset (explicitly +** or implicitly). +** 2. They might reference tzname[1] before calling tzset (explicitly +** or implicitly). +** 3. They might reference tzname[1] after setting to a time zone +** in which Daylight Saving Time is never observed. +** 4. They might reference tzname[0] after setting to a time zone +** in which Standard Time is never observed. +** 5. They might reference tm.TM_ZONE after calling offtime. +** What's best to do in the above cases is open to debate; +** for now, we just set things up so that in any of the five cases +** WILDABBR is used. Another possibility: initialize tzname[0] to the +** string "tzname[0] used before set", and similarly for the other cases. +** And another: initialize tzname[0] to "ERA", with an explanation in the +** manual page of what this "time zone abbreviation" means (doing this so +** that tzname[0] has the "normal" length of three characters). +*/ +#define WILDABBR " " +#endif /* !defined WILDABBR */ + +static const char wildabbr[] = WILDABBR; + +static const char gmt[] = "GMT"; + +/* +** The DST rules to use if TZ has no rules and we can't load TZDEFRULES. 
+** We default to US rules as of 1999-08-17. +** POSIX 1003.1 section 8.1.1 says that the default DST rules are +** implementation dependent; for historical reasons, US rules are a +** common default. +*/ +#ifndef TZDEFRULESTRING +#define TZDEFRULESTRING ",M4.1.0,M10.5.0" +#endif /* !defined TZDEFDST */ + +struct ttinfo { /* time type information */ + int_fast32_t tt_gmtoff; /* UT offset in seconds */ + int tt_isdst; /* used to set tm_isdst */ + int tt_abbrind; /* abbreviation list index */ + int tt_ttisstd; /* true if transition is std time */ + int tt_ttisgmt; /* true if transition is UT */ +}; + +struct lsinfo { /* leap second information */ + time_t ls_trans; /* transition time */ + int_fast64_t ls_corr; /* correction to apply */ +}; + +#define BIGGEST(a, b) (((a) > (b)) ? (a) : (b)) + +#ifdef TZNAME_MAX +#define MY_TZNAME_MAX TZNAME_MAX +#endif /* defined TZNAME_MAX */ +#ifndef TZNAME_MAX +#define MY_TZNAME_MAX 255 +#endif /* !defined TZNAME_MAX */ + +struct state { + int leapcnt; + int timecnt; + int typecnt; + int charcnt; + int goback; + int goahead; + time_t ats[TZ_MAX_TIMES]; + unsigned char types[TZ_MAX_TIMES]; + struct ttinfo ttis[TZ_MAX_TYPES]; + char chars[BIGGEST(BIGGEST(TZ_MAX_CHARS + 1, sizeof gmt), + (2 * (MY_TZNAME_MAX + 1)))]; + struct lsinfo lsis[TZ_MAX_LEAPS]; + int defaulttype; /* for early times or if no transitions */ +}; + +struct rule { + int r_type; /* type of rule--see below */ + int r_day; /* day number of rule */ + int r_week; /* week number of rule */ + int r_mon; /* month number of rule */ + int_fast32_t r_time; /* transition time of rule */ +}; + +#define JULIAN_DAY 0 /* Jn - Julian day */ +#define DAY_OF_YEAR 1 /* n - day of year */ +#define MONTH_NTH_DAY_OF_WEEK 2 /* Mm.n.d - month, week, day of week */ + +/* +** Prototypes for static functions. 
+*/ + +static int_fast32_t detzcode(const char * codep); +static int_fast64_t detzcode64(const char * codep); +static int differ_by_repeat(time_t t1, time_t t0); +static const char * getzname(const char * strp) ATTRIBUTE_PURE; +static const char * getqzname(const char * strp, const int delim) + ATTRIBUTE_PURE; +static const char * getnum(const char * strp, int * nump, int min, + int max); +static const char * getsecs(const char * strp, int_fast32_t * secsp); +static const char * getoffset(const char * strp, int_fast32_t * offsetp); +static const char * getrule(const char * strp, struct rule * rulep); +static void gmtload(struct state * sp); +static struct tm * gmtsub(const time_t * timep, int_fast32_t offset, + struct tm * tmp); +static struct tm * localsub(const time_t * timep, int_fast32_t offset, + struct tm * tmp); +static int increment_overflow(int * number, int delta); +static int leaps_thru_end_of(int y) ATTRIBUTE_PURE; +static int increment_overflow32(int_fast32_t * number, int delta); +static int increment_overflow_time(time_t *t, int_fast32_t delta); +static int normalize_overflow32(int_fast32_t * tensptr, + int * unitsptr, int base); +static int normalize_overflow(int * tensptr, int * unitsptr, + int base); +static void settzname(void); +static time_t time1(struct tm * tmp, + struct tm * (*funcp)(const time_t *, + int_fast32_t, struct tm *), + int_fast32_t offset); +static time_t time2(struct tm *tmp, + struct tm * (*funcp)(const time_t *, + int_fast32_t, struct tm*), + int_fast32_t offset, int * okayp); +static time_t time2sub(struct tm *tmp, + struct tm * (*funcp)(const time_t *, + int_fast32_t, struct tm*), + int_fast32_t offset, int * okayp, int do_norm_secs); +static struct tm * timesub(const time_t * timep, int_fast32_t offset, + const struct state * sp, struct tm * tmp); +static int tmcomp(const struct tm * atmp, + const struct tm * btmp); +static int_fast32_t transtime(int year, const struct rule * rulep, + int_fast32_t offset) + ATTRIBUTE_PURE; 
+static int typesequiv(const struct state * sp, int a, int b); +static int tzload(const char * name, struct state * sp, + int doextend); +static int tzparse(const char * name, struct state * sp, + int lastditch); + +#ifdef ALL_STATE +static struct state * lclptr; +static struct state * gmtptr; +#endif /* defined ALL_STATE */ + +#ifndef ALL_STATE +static struct state lclmem; +static struct state gmtmem; +#define lclptr (&lclmem) +#define gmtptr (&gmtmem) +#endif /* State Farm */ + +#ifndef TZ_STRLEN_MAX +#define TZ_STRLEN_MAX 255 +#endif /* !defined TZ_STRLEN_MAX */ + +static char lcl_TZname[TZ_STRLEN_MAX + 1]; +static int lcl_is_set; +static int gmt_is_set; + +char * tzname[2] = { + (char *) wildabbr, + (char *) wildabbr +}; + +/* +** Section 4.12.3 of X3.159-1989 requires that +** Except for the strftime function, these functions [asctime, +** ctime, gmtime, localtime] return values in one of two static +** objects: a broken-down time structure and an array of char. +** Thanks to Paul Eggert for noting this. +*/ + +static struct tm tm; + +#ifdef USG_COMPAT +long timezone = 0; +int daylight = 0; +#endif /* defined USG_COMPAT */ + +#ifdef ALTZONE +long altzone = 0; +#endif /* defined ALTZONE */ + +static int_fast32_t +detzcode(const char *const codep) +{ + register int_fast32_t result; + register int i; + + result = (codep[0] & 0x80) ? -1 : 0; + for (i = 0; i < 4; ++i) + result = (result << 8) | (codep[i] & 0xff); + return result; +} + +static int_fast64_t +detzcode64(const char *const codep) +{ + register int_fast64_t result; + register int i; + + result = (codep[0] & 0x80) ? 
-1 : 0; + for (i = 0; i < 8; ++i) + result = (result << 8) | (codep[i] & 0xff); + return result; +} + +static void +settzname(void) +{ + register struct state * const sp = lclptr; + register int i; + + tzname[0] = tzname[1] = (char *) wildabbr; +#ifdef USG_COMPAT + daylight = 0; + timezone = 0; +#endif /* defined USG_COMPAT */ +#ifdef ALTZONE + altzone = 0; +#endif /* defined ALTZONE */ + if (sp == NULL) { + tzname[0] = tzname[1] = (char *) gmt; + return; + } + /* + ** And to get the latest zone names into tzname. . . + */ + for (i = 0; i < sp->typecnt; ++i) { + register const struct ttinfo * const ttisp = &sp->ttis[i]; + + tzname[ttisp->tt_isdst] = &sp->chars[ttisp->tt_abbrind]; + } + for (i = 0; i < sp->timecnt; ++i) { + register const struct ttinfo * const ttisp = + &sp->ttis[ + sp->types[i]]; + + tzname[ttisp->tt_isdst] = + &sp->chars[ttisp->tt_abbrind]; +#ifdef USG_COMPAT + if (ttisp->tt_isdst) + daylight = 1; + if (!ttisp->tt_isdst) + timezone = -(ttisp->tt_gmtoff); +#endif /* defined USG_COMPAT */ +#ifdef ALTZONE + if (ttisp->tt_isdst) + altzone = -(ttisp->tt_gmtoff); +#endif /* defined ALTZONE */ + } + /* + ** Finally, scrub the abbreviations. + ** First, replace bogus characters. + */ + for (i = 0; i < sp->charcnt; ++i) + if (strchr(TZ_ABBR_CHAR_SET, sp->chars[i]) == NULL) + sp->chars[i] = TZ_ABBR_ERR_CHAR; + /* + ** Second, truncate long abbreviations. 
+ */ + for (i = 0; i < sp->typecnt; ++i) { + register const struct ttinfo * const ttisp = &sp->ttis[i]; + register char * cp = &sp->chars[ttisp->tt_abbrind]; + + if (strlen(cp) > TZ_ABBR_MAX_LEN && + strcmp(cp, GRANDPARENTED) != 0) + *(cp + TZ_ABBR_MAX_LEN) = '\0'; + } +} + +static int +differ_by_repeat(const time_t t1, const time_t t0) +{ + if (TYPE_BIT(time_t) - TYPE_SIGNED(time_t) < SECSPERREPEAT_BITS) + return 0; + return t1 - t0 == SECSPERREPEAT; +} + +static int +tzload(register const char *name, register struct state *const sp, + register const int doextend) +{ + register const char * p; + register int i; + register int fid; + register int stored; + register int nread; + typedef union { + struct tzhead tzhead; + char buf[2 * sizeof(struct tzhead) + + 2 * sizeof *sp + + 4 * TZ_MAX_TIMES]; + } u_t; +#ifdef ALL_STATE + register u_t * const up = malloc(sizeof *up); +#else /* !defined ALL_STATE */ + u_t u; + register u_t * const up = &u; +#endif /* !defined ALL_STATE */ + + sp->goback = sp->goahead = false; + + if (up == NULL) + return -1; + + if (name == NULL && (name = TZDEFAULT) == NULL) + goto oops; + { + register int doaccess; + /* + ** Section 4.9.1 of the C standard says that + ** "FILENAME_MAX expands to an integral constant expression + ** that is the size needed for an array of char large enough + ** to hold the longest file name string that the implementation + ** guarantees can be opened." + */ + char fullname[FILENAME_MAX + 1]; + + if (name[0] == ':') + ++name; + doaccess = name[0] == '/'; + if (!doaccess) { + if ((p = TZDIR) == NULL) + goto oops; + if ((strlen(p) + strlen(name) + 1) >= sizeof fullname) + goto oops; + (void) strcpy(fullname, p); + (void) strcat(fullname, "/"); + (void) strcat(fullname, name); + /* + ** Set doaccess if '.' (as in "../") shows up in name. 
+ */ + if (strchr(name, '.') != NULL) + doaccess = true; + name = fullname; + } + if (doaccess && access(name, R_OK) != 0) + goto oops; + if ((fid = open(name, OPEN_MODE)) == -1) + goto oops; + } + nread = read(fid, up->buf, sizeof up->buf); + if (close(fid) < 0 || nread <= 0) + goto oops; + for (stored = 4; stored <= 8; stored *= 2) { + int ttisstdcnt; + int ttisgmtcnt; + int timecnt; + + ttisstdcnt = (int) detzcode(up->tzhead.tzh_ttisstdcnt); + ttisgmtcnt = (int) detzcode(up->tzhead.tzh_ttisgmtcnt); + sp->leapcnt = (int) detzcode(up->tzhead.tzh_leapcnt); + sp->timecnt = (int) detzcode(up->tzhead.tzh_timecnt); + sp->typecnt = (int) detzcode(up->tzhead.tzh_typecnt); + sp->charcnt = (int) detzcode(up->tzhead.tzh_charcnt); + p = up->tzhead.tzh_charcnt + sizeof up->tzhead.tzh_charcnt; + if (sp->leapcnt < 0 || sp->leapcnt > TZ_MAX_LEAPS || + sp->typecnt <= 0 || sp->typecnt > TZ_MAX_TYPES || + sp->timecnt < 0 || sp->timecnt > TZ_MAX_TIMES || + sp->charcnt < 0 || sp->charcnt > TZ_MAX_CHARS || + (ttisstdcnt != sp->typecnt && ttisstdcnt != 0) || + (ttisgmtcnt != sp->typecnt && ttisgmtcnt != 0)) + goto oops; + if (nread - (p - up->buf) < + sp->timecnt * stored + /* ats */ + sp->timecnt + /* types */ + sp->typecnt * 6 + /* ttinfos */ + sp->charcnt + /* chars */ + sp->leapcnt * (stored + 4) + /* lsinfos */ + ttisstdcnt + /* ttisstds */ + ttisgmtcnt) /* ttisgmts */ + goto oops; + timecnt = 0; + for (i = 0; i < sp->timecnt; ++i) { + int_fast64_t at + = stored == 4 ? detzcode(p) : detzcode64(p); + sp->types[i] = ((TYPE_SIGNED(time_t) + ? time_t_min <= at + : 0 <= at) + && at <= time_t_max); + if (sp->types[i]) { + if (i && !timecnt && at != time_t_min) { + /* + ** Keep the earlier record, but tweak + ** it so that it starts with the + ** minimum time_t value. 
+ */ + sp->types[i - 1] = 1; + sp->ats[timecnt++] = time_t_min; + } + sp->ats[timecnt++] = at; + } + p += stored; + } + timecnt = 0; + for (i = 0; i < sp->timecnt; ++i) { + unsigned char typ = *p++; + if (sp->typecnt <= typ) + goto oops; + if (sp->types[i]) + sp->types[timecnt++] = typ; + } + sp->timecnt = timecnt; + for (i = 0; i < sp->typecnt; ++i) { + register struct ttinfo * ttisp; + + ttisp = &sp->ttis[i]; + ttisp->tt_gmtoff = detzcode(p); + p += 4; + ttisp->tt_isdst = (unsigned char) *p++; + if (ttisp->tt_isdst != 0 && ttisp->tt_isdst != 1) + goto oops; + ttisp->tt_abbrind = (unsigned char) *p++; + if (ttisp->tt_abbrind < 0 || + ttisp->tt_abbrind > sp->charcnt) + goto oops; + } + for (i = 0; i < sp->charcnt; ++i) + sp->chars[i] = *p++; + sp->chars[i] = '\0'; /* ensure '\0' at end */ + for (i = 0; i < sp->leapcnt; ++i) { + register struct lsinfo * lsisp; + + lsisp = &sp->lsis[i]; + lsisp->ls_trans = (stored == 4) ? + detzcode(p) : detzcode64(p); + p += stored; + lsisp->ls_corr = detzcode(p); + p += 4; + } + for (i = 0; i < sp->typecnt; ++i) { + register struct ttinfo * ttisp; + + ttisp = &sp->ttis[i]; + if (ttisstdcnt == 0) + ttisp->tt_ttisstd = false; + else { + ttisp->tt_ttisstd = *p++; + if (ttisp->tt_ttisstd != true && + ttisp->tt_ttisstd != false) + goto oops; + } + } + for (i = 0; i < sp->typecnt; ++i) { + register struct ttinfo * ttisp; + + ttisp = &sp->ttis[i]; + if (ttisgmtcnt == 0) + ttisp->tt_ttisgmt = false; + else { + ttisp->tt_ttisgmt = *p++; + if (ttisp->tt_ttisgmt != true && + ttisp->tt_ttisgmt != false) + goto oops; + } + } + /* + ** If this is an old file, we're done. + */ + if (up->tzhead.tzh_version[0] == '\0') + break; + nread -= p - up->buf; + for (i = 0; i < nread; ++i) + up->buf[i] = p[i]; + /* + ** If this is a signed narrow time_t system, we're done. 
+ */ + if (TYPE_SIGNED(time_t) && stored >= (int) sizeof(time_t)) + break; + } + if (doextend && nread > 2 && + up->buf[0] == '\n' && up->buf[nread - 1] == '\n' && + sp->typecnt + 2 <= TZ_MAX_TYPES) { + struct state ts; + register int result; + + up->buf[nread - 1] = '\0'; + result = tzparse(&up->buf[1], &ts, false); + if (result == 0 && ts.typecnt == 2 && + sp->charcnt + ts.charcnt <= TZ_MAX_CHARS) { + for (i = 0; i < 2; ++i) + ts.ttis[i].tt_abbrind += + sp->charcnt; + for (i = 0; i < ts.charcnt; ++i) + sp->chars[sp->charcnt++] = + ts.chars[i]; + i = 0; + while (i < ts.timecnt && + ts.ats[i] <= + sp->ats[sp->timecnt - 1]) + ++i; + while (i < ts.timecnt && + sp->timecnt < TZ_MAX_TIMES) { + sp->ats[sp->timecnt] = + ts.ats[i]; + sp->types[sp->timecnt] = + sp->typecnt + + ts.types[i]; + ++sp->timecnt; + ++i; + } + sp->ttis[sp->typecnt++] = ts.ttis[0]; + sp->ttis[sp->typecnt++] = ts.ttis[1]; + } + } + if (sp->timecnt > 1) { + for (i = 1; i < sp->timecnt; ++i) + if (typesequiv(sp, sp->types[i], sp->types[0]) && + differ_by_repeat(sp->ats[i], sp->ats[0])) { + sp->goback = true; + break; + } + for (i = sp->timecnt - 2; i >= 0; --i) + if (typesequiv(sp, sp->types[sp->timecnt - 1], + sp->types[i]) && + differ_by_repeat(sp->ats[sp->timecnt - 1], + sp->ats[i])) { + sp->goahead = true; + break; + } + } + /* + ** If type 0 is unused in transitions, + ** it's the type to use for early times. + */ + for (i = 0; i < sp->typecnt; ++i) + if (sp->types[i] == 0) + break; + i = (i >= sp->typecnt) ? 0 : -1; + /* + ** Absent the above, + ** if there are transition times + ** and the first transition is to a daylight time + ** find the standard type less than and closest to + ** the type of the first transition. + */ + if (i < 0 && sp->timecnt > 0 && sp->ttis[sp->types[0]].tt_isdst) { + i = sp->types[0]; + while (--i >= 0) + if (!sp->ttis[i].tt_isdst) + break; + } + /* + ** If no result yet, find the first standard type. + ** If there is none, punt to type zero. 
+ */ + if (i < 0) { + i = 0; + while (sp->ttis[i].tt_isdst) + if (++i >= sp->typecnt) { + i = 0; + break; + } + } + sp->defaulttype = i; +#ifdef ALL_STATE + free(up); +#endif /* defined ALL_STATE */ + return 0; +oops: +#ifdef ALL_STATE + free(up); +#endif /* defined ALL_STATE */ + return -1; +} + +static int +typesequiv(const struct state *const sp, const int a, const int b) +{ + register int result; + + if (sp == NULL || + a < 0 || a >= sp->typecnt || + b < 0 || b >= sp->typecnt) + result = false; + else { + register const struct ttinfo * ap = &sp->ttis[a]; + register const struct ttinfo * bp = &sp->ttis[b]; + result = ap->tt_gmtoff == bp->tt_gmtoff && + ap->tt_isdst == bp->tt_isdst && + ap->tt_ttisstd == bp->tt_ttisstd && + ap->tt_ttisgmt == bp->tt_ttisgmt && + strcmp(&sp->chars[ap->tt_abbrind], + &sp->chars[bp->tt_abbrind]) == 0; + } + return result; +} + +static const int mon_lengths[2][MONSPERYEAR] = { + { 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 }, + { 31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 } +}; + +static const int year_lengths[2] = { + DAYSPERNYEAR, DAYSPERLYEAR +}; + +/* +** Given a pointer into a time zone string, scan until a character that is not +** a valid character in a zone name is found. Return a pointer to that +** character. +*/ + +static const char * +getzname(register const char *strp) +{ + register char c; + + while ((c = *strp) != '\0' && !is_digit(c) && c != ',' && c != '-' && + c != '+') + ++strp; + return strp; +} + +/* +** Given a pointer into an extended time zone string, scan until the ending +** delimiter of the zone name is located. Return a pointer to the delimiter. +** +** As with getzname above, the legal character set is actually quite +** restricted, with other characters producing undefined results. +** We don't do any checking here; checking is done later in common-case code. 
+*/ + +static const char * +getqzname(register const char *strp, const int delim) +{ + register int c; + + while ((c = *strp) != '\0' && c != delim) + ++strp; + return strp; +} + +/* +** Given a pointer into a time zone string, extract a number from that string. +** Check that the number is within a specified range; if it is not, return +** NULL. +** Otherwise, return a pointer to the first character not part of the number. +*/ + +static const char * +getnum(register const char *strp, int *const nump, const int min, const int max) +{ + register char c; + register int num; + + if (strp == NULL || !is_digit(c = *strp)) + return NULL; + num = 0; + do { + num = num * 10 + (c - '0'); + if (num > max) + return NULL; /* illegal value */ + c = *++strp; + } while (is_digit(c)); + if (num < min) + return NULL; /* illegal value */ + *nump = num; + return strp; +} + +/* +** Given a pointer into a time zone string, extract a number of seconds, +** in hh[:mm[:ss]] form, from the string. +** If any error occurs, return NULL. +** Otherwise, return a pointer to the first character not part of the number +** of seconds. +*/ + +static const char * +getsecs(register const char *strp, int_fast32_t *const secsp) +{ + int num; + + /* + ** `HOURSPERDAY * DAYSPERWEEK - 1' allows quasi-Posix rules like + ** "M10.4.6/26", which does not conform to Posix, + ** but which specifies the equivalent of + ** ``02:00 on the first Sunday on or after 23 Oct''. + */ + strp = getnum(strp, &num, 0, HOURSPERDAY * DAYSPERWEEK - 1); + if (strp == NULL) + return NULL; + *secsp = num * (int_fast32_t) SECSPERHOUR; + if (*strp == ':') { + ++strp; + strp = getnum(strp, &num, 0, MINSPERHOUR - 1); + if (strp == NULL) + return NULL; + *secsp += num * SECSPERMIN; + if (*strp == ':') { + ++strp; + /* `SECSPERMIN' allows for leap seconds. 
*/ + strp = getnum(strp, &num, 0, SECSPERMIN); + if (strp == NULL) + return NULL; + *secsp += num; + } + } + return strp; +} + +/* +** Given a pointer into a time zone string, extract an offset, in +** [+-]hh[:mm[:ss]] form, from the string. +** If any error occurs, return NULL. +** Otherwise, return a pointer to the first character not part of the time. +*/ + +static const char * +getoffset(register const char *strp, int_fast32_t *const offsetp) +{ + register int neg = 0; + + if (*strp == '-') { + neg = 1; + ++strp; + } else if (*strp == '+') + ++strp; + strp = getsecs(strp, offsetp); + if (strp == NULL) + return NULL; /* illegal time */ + if (neg) + *offsetp = -*offsetp; + return strp; +} + +/* +** Given a pointer into a time zone string, extract a rule in the form +** date[/time]. See POSIX section 8 for the format of "date" and "time". +** If a valid rule is not found, return NULL. +** Otherwise, return a pointer to the first character not part of the rule. +*/ + +static const char * +getrule(const char *strp, register struct rule *const rulep) +{ + if (*strp == 'J') { + /* + ** Julian day. + */ + rulep->r_type = JULIAN_DAY; + ++strp; + strp = getnum(strp, &rulep->r_day, 1, DAYSPERNYEAR); + } else if (*strp == 'M') { + /* + ** Month, week, day. + */ + rulep->r_type = MONTH_NTH_DAY_OF_WEEK; + ++strp; + strp = getnum(strp, &rulep->r_mon, 1, MONSPERYEAR); + if (strp == NULL) + return NULL; + if (*strp++ != '.') + return NULL; + strp = getnum(strp, &rulep->r_week, 1, 5); + if (strp == NULL) + return NULL; + if (*strp++ != '.') + return NULL; + strp = getnum(strp, &rulep->r_day, 0, DAYSPERWEEK - 1); + } else if (is_digit(*strp)) { + /* + ** Day of year. + */ + rulep->r_type = DAY_OF_YEAR; + strp = getnum(strp, &rulep->r_day, 0, DAYSPERLYEAR - 1); + } else return NULL; /* invalid format */ + if (strp == NULL) + return NULL; + if (*strp == '/') { + /* + ** Time specified. 
+ */ + ++strp; + strp = getoffset(strp, &rulep->r_time); + } else rulep->r_time = 2 * SECSPERHOUR; /* default = 2:00:00 */ + return strp; +} + +/* +** Given a year, a rule, and the offset from UT at the time that rule takes +** effect, calculate the year-relative time that rule takes effect. +*/ + +static int_fast32_t +transtime(const int year, register const struct rule *const rulep, + const int_fast32_t offset) +{ + register int leapyear; + register int_fast32_t value; + register int i; + int d, m1, yy0, yy1, yy2, dow; + + INITIALIZE(value); + leapyear = isleap(year); + switch (rulep->r_type) { + + case JULIAN_DAY: + /* + ** Jn - Julian day, 1 == January 1, 60 == March 1 even in leap + ** years. + ** In non-leap years, or if the day number is 59 or less, just + ** add SECSPERDAY times the day number-1 to the time of + ** January 1, midnight, to get the day. + */ + value = (rulep->r_day - 1) * SECSPERDAY; + if (leapyear && rulep->r_day >= 60) + value += SECSPERDAY; + break; + + case DAY_OF_YEAR: + /* + ** n - day of year. + ** Just add SECSPERDAY times the day number to the time of + ** January 1, midnight, to get the day. + */ + value = rulep->r_day * SECSPERDAY; + break; + + case MONTH_NTH_DAY_OF_WEEK: + /* + ** Mm.n.d - nth "dth day" of month m. + */ + + /* + ** Use Zeller's Congruence to get day-of-week of first day of + ** month. + */ + m1 = (rulep->r_mon + 9) % 12 + 1; + yy0 = (rulep->r_mon <= 2) ? (year - 1) : year; + yy1 = yy0 / 100; + yy2 = yy0 % 100; + dow = ((26 * m1 - 2) / 10 + + 1 + yy2 + yy2 / 4 + yy1 / 4 - 2 * yy1) % 7; + if (dow < 0) + dow += DAYSPERWEEK; + + /* + ** "dow" is the day-of-week of the first day of the month. Get + ** the day-of-month (zero-origin) of the first "dow" day of the + ** month. 
+ */ + d = rulep->r_day - dow; + if (d < 0) + d += DAYSPERWEEK; + for (i = 1; i < rulep->r_week; ++i) { + if (d + DAYSPERWEEK >= + mon_lengths[leapyear][rulep->r_mon - 1]) + break; + d += DAYSPERWEEK; + } + + /* + ** "d" is the day-of-month (zero-origin) of the day we want. + */ + value = d * SECSPERDAY; + for (i = 0; i < rulep->r_mon - 1; ++i) + value += mon_lengths[leapyear][i] * SECSPERDAY; + break; + } + + /* + ** "value" is the year-relative time of 00:00:00 UT on the day in + ** question. To get the year-relative time of the specified local + ** time on that day, add the transition time and the current offset + ** from UT. + */ + return value + rulep->r_time + offset; +} + +/* +** Given a POSIX section 8-style TZ string, fill in the rule tables as +** appropriate. +*/ + +static int +tzparse(const char *name, register struct state *const sp, + const int lastditch) +{ + const char * stdname; + const char * dstname; + size_t stdlen; + size_t dstlen; + int_fast32_t stdoffset; + int_fast32_t dstoffset; + register char * cp; + register int load_result; + static struct ttinfo zttinfo; + + INITIALIZE(dstname); + stdname = name; + if (lastditch) { + stdlen = strlen(name); /* length of standard zone name */ + name += stdlen; + if (stdlen >= sizeof sp->chars) + stdlen = (sizeof sp->chars) - 1; + stdoffset = 0; + } else { + if (*name == '<') { + name++; + stdname = name; + name = getqzname(name, '>'); + if (*name != '>') + return (-1); + stdlen = name - stdname; + name++; + } else { + name = getzname(name); + stdlen = name - stdname; + } + if (*name == '\0') + return -1; + name = getoffset(name, &stdoffset); + if (name == NULL) + return -1; + } + load_result = tzload(TZDEFRULES, sp, false); + if (load_result != 0) + sp->leapcnt = 0; /* so, we're off a little */ + if (*name != '\0') { + if (*name == '<') { + dstname = ++name; + name = getqzname(name, '>'); + if (*name != '>') + return -1; + dstlen = name - dstname; + name++; + } else { + dstname = name; + name = 
getzname(name); + dstlen = name - dstname; /* length of DST zone name */ + } + if (*name != '\0' && *name != ',' && *name != ';') { + name = getoffset(name, &dstoffset); + if (name == NULL) + return -1; + } else dstoffset = stdoffset - SECSPERHOUR; + if (*name == '\0' && load_result != 0) + name = TZDEFRULESTRING; + if (*name == ',' || *name == ';') { + struct rule start; + struct rule end; + register int year; + register int yearlim; + register int timecnt; + time_t janfirst; + + ++name; + if ((name = getrule(name, &start)) == NULL) + return -1; + if (*name++ != ',') + return -1; + if ((name = getrule(name, &end)) == NULL) + return -1; + if (*name != '\0') + return -1; + sp->typecnt = 2; /* standard time and DST */ + /* + ** Two transitions per year, from EPOCH_YEAR forward. + */ + sp->ttis[0] = sp->ttis[1] = zttinfo; + sp->ttis[0].tt_gmtoff = -dstoffset; + sp->ttis[0].tt_isdst = 1; + sp->ttis[0].tt_abbrind = stdlen + 1; + sp->ttis[1].tt_gmtoff = -stdoffset; + sp->ttis[1].tt_isdst = 0; + sp->ttis[1].tt_abbrind = 0; + sp->defaulttype = 0; + timecnt = 0; + janfirst = 0; + yearlim = EPOCH_YEAR + YEARSPERREPEAT; + for (year = EPOCH_YEAR; year < yearlim; year++) { + int_fast32_t + starttime = transtime(year, &start, stdoffset), + endtime = transtime(year, &end, dstoffset); + int_fast32_t + yearsecs = (year_lengths[isleap(year)] + * SECSPERDAY); + int reversed = endtime < starttime; + if (reversed) { + int_fast32_t swap = starttime; + starttime = endtime; + endtime = swap; + } + if (reversed + || (starttime < endtime + && (endtime - starttime + < (yearsecs + + (stdoffset - dstoffset))))) { + if (TZ_MAX_TIMES - 2 < timecnt) + break; + yearlim = year + YEARSPERREPEAT + 1; + sp->ats[timecnt] = janfirst; + if (increment_overflow_time + (&sp->ats[timecnt], starttime)) + break; + sp->types[timecnt++] = reversed; + sp->ats[timecnt] = janfirst; + if (increment_overflow_time + (&sp->ats[timecnt], endtime)) + break; + sp->types[timecnt++] = !reversed; + } + if 
(increment_overflow_time(&janfirst, yearsecs)) + break; + } + sp->timecnt = timecnt; + if (!timecnt) + sp->typecnt = 1; /* Perpetual DST. */ + } else { + register int_fast32_t theirstdoffset; + register int_fast32_t theirdstoffset; + register int_fast32_t theiroffset; + register int isdst; + register int i; + register int j; + + if (*name != '\0') + return -1; + /* + ** Initial values of theirstdoffset and theirdstoffset. + */ + theirstdoffset = 0; + for (i = 0; i < sp->timecnt; ++i) { + j = sp->types[i]; + if (!sp->ttis[j].tt_isdst) { + theirstdoffset = + -sp->ttis[j].tt_gmtoff; + break; + } + } + theirdstoffset = 0; + for (i = 0; i < sp->timecnt; ++i) { + j = sp->types[i]; + if (sp->ttis[j].tt_isdst) { + theirdstoffset = + -sp->ttis[j].tt_gmtoff; + break; + } + } + /* + ** Initially we're assumed to be in standard time. + */ + isdst = false; + theiroffset = theirstdoffset; + /* + ** Now juggle transition times and types + ** tracking offsets as you do. + */ + for (i = 0; i < sp->timecnt; ++i) { + j = sp->types[i]; + sp->types[i] = sp->ttis[j].tt_isdst; + if (sp->ttis[j].tt_ttisgmt) { + /* No adjustment to transition time */ + } else { + /* + ** If summer time is in effect, and the + ** transition time was not specified as + ** standard time, add the summer time + ** offset to the transition time; + ** otherwise, add the standard time + ** offset to the transition time. + */ + /* + ** Transitions from DST to DDST + ** will effectively disappear since + ** POSIX provides for only one DST + ** offset. + */ + if (isdst && !sp->ttis[j].tt_ttisstd) { + sp->ats[i] += dstoffset - + theirdstoffset; + } else { + sp->ats[i] += stdoffset - + theirstdoffset; + } + } + theiroffset = -sp->ttis[j].tt_gmtoff; + if (sp->ttis[j].tt_isdst) + theirdstoffset = theiroffset; + else theirstdoffset = theiroffset; + } + /* + ** Finally, fill in ttis. 
+ */ + sp->ttis[0] = sp->ttis[1] = zttinfo; + sp->ttis[0].tt_gmtoff = -stdoffset; + sp->ttis[0].tt_isdst = false; + sp->ttis[0].tt_abbrind = 0; + sp->ttis[1].tt_gmtoff = -dstoffset; + sp->ttis[1].tt_isdst = true; + sp->ttis[1].tt_abbrind = stdlen + 1; + sp->typecnt = 2; + sp->defaulttype = 0; + } + } else { + dstlen = 0; + sp->typecnt = 1; /* only standard time */ + sp->timecnt = 0; + sp->ttis[0] = zttinfo; + sp->ttis[0].tt_gmtoff = -stdoffset; + sp->ttis[0].tt_isdst = 0; + sp->ttis[0].tt_abbrind = 0; + sp->defaulttype = 0; + } + sp->charcnt = stdlen + 1; + if (dstlen != 0) + sp->charcnt += dstlen + 1; + if ((size_t) sp->charcnt > sizeof sp->chars) + return -1; + cp = sp->chars; + (void) strncpy(cp, stdname, stdlen); + cp += stdlen; + *cp++ = '\0'; + if (dstlen != 0) { + (void) strncpy(cp, dstname, dstlen); + *(cp + dstlen) = '\0'; + } + return 0; +} + +static void +gmtload(struct state *const sp) +{ + if (tzload(gmt, sp, true) != 0) + (void) tzparse(gmt, sp, true); +} + +#ifndef STD_INSPIRED +/* +** A non-static declaration of tzsetwall in a system header file +** may cause a warning about this upcoming static declaration... 
+*/ +static +#endif /* !defined STD_INSPIRED */ +void +tzsetwall(void) +{ + if (lcl_is_set < 0) + return; + lcl_is_set = -1; + +#ifdef ALL_STATE + if (lclptr == NULL) { + lclptr = malloc(sizeof *lclptr); + if (lclptr == NULL) { + settzname(); /* all we can do */ + return; + } + } +#endif /* defined ALL_STATE */ + if (tzload(NULL, lclptr, true) != 0) + gmtload(lclptr); + settzname(); +} + +void +tzset(void) +{ + register const char * name; + + name = getenv("TZ"); + if (name == NULL) { + tzsetwall(); + return; + } + + if (lcl_is_set > 0 && strcmp(lcl_TZname, name) == 0) + return; + lcl_is_set = strlen(name) < sizeof lcl_TZname; + if (lcl_is_set) + (void) strcpy(lcl_TZname, name); + +#ifdef ALL_STATE + if (lclptr == NULL) { + lclptr = malloc(sizeof *lclptr); + if (lclptr == NULL) { + settzname(); /* all we can do */ + return; + } + } +#endif /* defined ALL_STATE */ + if (*name == '\0') { + /* + ** User wants it fast rather than right. + */ + lclptr->leapcnt = 0; /* so, we're off a little */ + lclptr->timecnt = 0; + lclptr->typecnt = 0; + lclptr->ttis[0].tt_isdst = 0; + lclptr->ttis[0].tt_gmtoff = 0; + lclptr->ttis[0].tt_abbrind = 0; + (void) strcpy(lclptr->chars, gmt); + } else if (tzload(name, lclptr, true) != 0) + if (name[0] == ':' || tzparse(name, lclptr, false) != 0) + (void) gmtload(lclptr); + settzname(); +} + +/* +** The easy way to behave "as if no library function calls" localtime +** is to not call it--so we drop its guts into "localsub", which can be +** freely called. (And no, the PANS doesn't require the above behavior-- +** but it *is* desirable.) +** +** The unused offset argument is for the benefit of mktime variants. 
+*/ + +/*ARGSUSED*/ +static struct tm * +localsub(const time_t *const timep, const int_fast32_t offset, + struct tm *const tmp) +{ + register struct state * sp; + register const struct ttinfo * ttisp; + register int i; + register struct tm * result; + const time_t t = *timep; + + sp = lclptr; + if (sp == NULL) + return gmtsub(timep, offset, tmp); + if ((sp->goback && t < sp->ats[0]) || + (sp->goahead && t > sp->ats[sp->timecnt - 1])) { + time_t newt = t; + register time_t seconds; + register time_t years; + + if (t < sp->ats[0]) + seconds = sp->ats[0] - t; + else seconds = t - sp->ats[sp->timecnt - 1]; + --seconds; + years = (seconds / SECSPERREPEAT + 1) * YEARSPERREPEAT; + seconds = years * AVGSECSPERYEAR; + if (t < sp->ats[0]) + newt += seconds; + else newt -= seconds; + if (newt < sp->ats[0] || + newt > sp->ats[sp->timecnt - 1]) + return NULL; /* "cannot happen" */ + result = localsub(&newt, offset, tmp); + if (result == tmp) { + register time_t newy; + + newy = tmp->tm_year; + if (t < sp->ats[0]) + newy -= years; + else newy += years; + tmp->tm_year = newy; + if (tmp->tm_year != newy) + return NULL; + } + return result; + } + if (sp->timecnt == 0 || t < sp->ats[0]) { + i = sp->defaulttype; + } else { + register int lo = 1; + register int hi = sp->timecnt; + + while (lo < hi) { + register int mid = (lo + hi) >> 1; + + if (t < sp->ats[mid]) + hi = mid; + else lo = mid + 1; + } + i = (int) sp->types[lo - 1]; + } + ttisp = &sp->ttis[i]; + /* + ** To get (wrong) behavior that's compatible with System V Release 2.0 + ** you'd replace the statement below with + ** t += ttisp->tt_gmtoff; + ** timesub(&t, 0L, sp, tmp); + */ + result = timesub(&t, ttisp->tt_gmtoff, sp, tmp); + tmp->tm_isdst = ttisp->tt_isdst; + tzname[tmp->tm_isdst] = &sp->chars[ttisp->tt_abbrind]; +#ifdef TM_ZONE + tmp->TM_ZONE = &sp->chars[ttisp->tt_abbrind]; +#endif /* defined TM_ZONE */ + return result; +} + +struct tm * +localtime(const time_t *const timep) +{ + tzset(); + return localsub(timep, 0L, 
&tm); +} + +/* +** Re-entrant version of localtime. +*/ + +struct tm * +localtime_r(const time_t *const timep, struct tm *tmp) +{ + return localsub(timep, 0L, tmp); +} + +/* +** gmtsub is to gmtime as localsub is to localtime. +*/ + +static struct tm * +gmtsub(const time_t *const timep, const int_fast32_t offset, + struct tm *const tmp) +{ + register struct tm * result; + + if (!gmt_is_set) { + gmt_is_set = true; +#ifdef ALL_STATE + gmtptr = malloc(sizeof *gmtptr); +#endif /* defined ALL_STATE */ + if (gmtptr != NULL) + gmtload(gmtptr); + } + result = timesub(timep, offset, gmtptr, tmp); +#ifdef TM_ZONE + /* + ** Could get fancy here and deliver something such as + ** "UT+xxxx" or "UT-xxxx" if offset is non-zero, + ** but this is no time for a treasure hunt. + */ + tmp->TM_ZONE = offset ? wildabbr : gmtptr ? gmtptr->chars : gmt; +#endif /* defined TM_ZONE */ + return result; +} + +struct tm * +gmtime(const time_t *const timep) +{ + return gmtsub(timep, 0L, &tm); +} + +/* +* Re-entrant version of gmtime. +*/ + +struct tm * +gmtime_r(const time_t *const timep, struct tm *tmp) +{ + return gmtsub(timep, 0L, tmp); +} + +#ifdef STD_INSPIRED + +struct tm * +offtime(const time_t *const timep, const long offset) +{ + return gmtsub(timep, offset, &tm); +} + +#endif /* defined STD_INSPIRED */ + +/* +** Return the number of leap years through the end of the given year +** where, to make the math easy, the answer for year zero is defined as zero. +*/ + +static int +leaps_thru_end_of(register const int y) +{ + return (y >= 0) ? 
(y / 4 - y / 100 + y / 400) : + -(leaps_thru_end_of(-(y + 1)) + 1); +} + +static struct tm * +timesub(const time_t *const timep, const int_fast32_t offset, + register const struct state *const sp, + register struct tm *const tmp) +{ + register const struct lsinfo * lp; + register time_t tdays; + register int idays; /* unsigned would be so 2003 */ + register int_fast64_t rem; + int y; + register const int * ip; + register int_fast64_t corr; + register int hit; + register int i; + + corr = 0; + hit = 0; + i = (sp == NULL) ? 0 : sp->leapcnt; + while (--i >= 0) { + lp = &sp->lsis[i]; + if (*timep >= lp->ls_trans) { + if (*timep == lp->ls_trans) { + hit = ((i == 0 && lp->ls_corr > 0) || + lp->ls_corr > sp->lsis[i - 1].ls_corr); + if (hit) + while (i > 0 && + sp->lsis[i].ls_trans == + sp->lsis[i - 1].ls_trans + 1 && + sp->lsis[i].ls_corr == + sp->lsis[i - 1].ls_corr + 1) { + ++hit; + --i; + } + } + corr = lp->ls_corr; + break; + } + } + y = EPOCH_YEAR; + tdays = *timep / SECSPERDAY; + rem = *timep - tdays * SECSPERDAY; + while (tdays < 0 || tdays >= year_lengths[isleap(y)]) { + int newy; + register time_t tdelta; + register int idelta; + register int leapdays; + + tdelta = tdays / DAYSPERLYEAR; + if (! ((! TYPE_SIGNED(time_t) || INT_MIN <= tdelta) + && tdelta <= INT_MAX)) + return NULL; + idelta = tdelta; + if (idelta == 0) + idelta = (tdays < 0) ? -1 : 1; + newy = y; + if (increment_overflow(&newy, idelta)) + return NULL; + leapdays = leaps_thru_end_of(newy - 1) - + leaps_thru_end_of(y - 1); + tdays -= ((time_t) newy - y) * DAYSPERNYEAR; + tdays -= leapdays; + y = newy; + } + { + register int_fast32_t seconds; + + seconds = tdays * SECSPERDAY; + tdays = seconds / SECSPERDAY; + rem += seconds - tdays * SECSPERDAY; + } + /* + ** Given the range, we can now fearlessly cast... 
+ */ + idays = tdays; + rem += offset - corr; + while (rem < 0) { + rem += SECSPERDAY; + --idays; + } + while (rem >= SECSPERDAY) { + rem -= SECSPERDAY; + ++idays; + } + while (idays < 0) { + if (increment_overflow(&y, -1)) + return NULL; + idays += year_lengths[isleap(y)]; + } + while (idays >= year_lengths[isleap(y)]) { + idays -= year_lengths[isleap(y)]; + if (increment_overflow(&y, 1)) + return NULL; + } + tmp->tm_year = y; + if (increment_overflow(&tmp->tm_year, -TM_YEAR_BASE)) + return NULL; + tmp->tm_yday = idays; + /* + ** The "extra" mods below avoid overflow problems. + */ + tmp->tm_wday = EPOCH_WDAY + + ((y - EPOCH_YEAR) % DAYSPERWEEK) * + (DAYSPERNYEAR % DAYSPERWEEK) + + leaps_thru_end_of(y - 1) - + leaps_thru_end_of(EPOCH_YEAR - 1) + + idays; + tmp->tm_wday %= DAYSPERWEEK; + if (tmp->tm_wday < 0) + tmp->tm_wday += DAYSPERWEEK; + tmp->tm_hour = (int) (rem / SECSPERHOUR); + rem %= SECSPERHOUR; + tmp->tm_min = (int) (rem / SECSPERMIN); + /* + ** A positive leap second requires a special + ** representation. This uses "... ??:59:60" et seq. + */ + tmp->tm_sec = (int) (rem % SECSPERMIN) + hit; + ip = mon_lengths[isleap(y)]; + for (tmp->tm_mon = 0; idays >= ip[tmp->tm_mon]; ++(tmp->tm_mon)) + idays -= ip[tmp->tm_mon]; + tmp->tm_mday = (int) (idays + 1); + tmp->tm_isdst = 0; +#ifdef TM_GMTOFF + tmp->TM_GMTOFF = offset; +#endif /* defined TM_GMTOFF */ + return tmp; +} + +char * +ctime(const time_t *const timep) +{ +/* +** Section 4.12.3.2 of X3.159-1989 requires that +** The ctime function converts the calendar time pointed to by timer +** to local time in the form of a string. It is equivalent to +** asctime(localtime(timer)) +*/ + return asctime(localtime(timep)); +} + +char * +ctime_r(const time_t *const timep, char *buf) +{ + struct tm mytm; + + return asctime_r(localtime_r(timep, &mytm), buf); +} + +/* +** Adapted from code provided by Robert Elz, who writes: +** The "best" way to do mktime I think is based on an idea of Bob +** Kridle's (so its said...) 
from a long time ago. +** It does a binary search of the time_t space. Since time_t's are +** just 32 bits, its a max of 32 iterations (even at 64 bits it +** would still be very reasonable). +*/ + +#ifndef WRONG +#define WRONG (-1) +#endif /* !defined WRONG */ + +/* +** Normalize logic courtesy Paul Eggert. +*/ + +static int +increment_overflow(int *const ip, int j) +{ + register int const i = *ip; + + /* + ** If i >= 0 there can only be overflow if i + j > INT_MAX + ** or if j > INT_MAX - i; given i >= 0, INT_MAX - i cannot overflow. + ** If i < 0 there can only be overflow if i + j < INT_MIN + ** or if j < INT_MIN - i; given i < 0, INT_MIN - i cannot overflow. + */ + if ((i >= 0) ? (j > INT_MAX - i) : (j < INT_MIN - i)) + return true; + *ip += j; + return false; +} + +static int +increment_overflow32(int_fast32_t *const lp, int const m) +{ + register int_fast32_t const l = *lp; + + if ((l >= 0) ? (m > INT_FAST32_MAX - l) : (m < INT_FAST32_MIN - l)) + return true; + *lp += m; + return false; +} + +static int +increment_overflow_time(time_t *tp, int_fast32_t j) +{ + /* + ** This is like + ** 'if (! (time_t_min <= *tp + j && *tp + j <= time_t_max)) ...', + ** except that it does the right thing even if *tp + j would overflow. + */ + if (! (j < 0 + ? (TYPE_SIGNED(time_t) ? time_t_min - j <= *tp : -1 - j < *tp) + : *tp <= time_t_max - j)) + return true; + *tp += j; + return false; +} + +static int +normalize_overflow(int *const tensptr, int *const unitsptr, const int base) +{ + register int tensdelta; + + tensdelta = (*unitsptr >= 0) ? + (*unitsptr / base) : + (-1 - (-1 - *unitsptr) / base); + *unitsptr -= tensdelta * base; + return increment_overflow(tensptr, tensdelta); +} + +static int +normalize_overflow32(int_fast32_t *const tensptr, int *const unitsptr, + const int base) +{ + register int tensdelta; + + tensdelta = (*unitsptr >= 0) ? 
+ (*unitsptr / base) : + (-1 - (-1 - *unitsptr) / base); + *unitsptr -= tensdelta * base; + return increment_overflow32(tensptr, tensdelta); +} + +static int +tmcomp(register const struct tm *const atmp, + register const struct tm *const btmp) +{ + register int result; + + if (atmp->tm_year != btmp->tm_year) + return atmp->tm_year < btmp->tm_year ? -1 : 1; + if ((result = (atmp->tm_mon - btmp->tm_mon)) == 0 && + (result = (atmp->tm_mday - btmp->tm_mday)) == 0 && + (result = (atmp->tm_hour - btmp->tm_hour)) == 0 && + (result = (atmp->tm_min - btmp->tm_min)) == 0) + result = atmp->tm_sec - btmp->tm_sec; + return result; +} + +static time_t +time2sub(struct tm *const tmp, + struct tm *(*const funcp)(const time_t *, int_fast32_t, struct tm *), + const int_fast32_t offset, + int *const okayp, + const int do_norm_secs) +{ + register const struct state * sp; + register int dir; + register int i, j; + register int saved_seconds; + register int_fast32_t li; + register time_t lo; + register time_t hi; + int_fast32_t y; + time_t newt; + time_t t; + struct tm yourtm, mytm; + + *okayp = false; + yourtm = *tmp; + if (do_norm_secs) { + if (normalize_overflow(&yourtm.tm_min, &yourtm.tm_sec, + SECSPERMIN)) + return WRONG; + } + if (normalize_overflow(&yourtm.tm_hour, &yourtm.tm_min, MINSPERHOUR)) + return WRONG; + if (normalize_overflow(&yourtm.tm_mday, &yourtm.tm_hour, HOURSPERDAY)) + return WRONG; + y = yourtm.tm_year; + if (normalize_overflow32(&y, &yourtm.tm_mon, MONSPERYEAR)) + return WRONG; + /* + ** Turn y into an actual year number for now. + ** It is converted back to an offset from TM_YEAR_BASE later. 
+ */ + if (increment_overflow32(&y, TM_YEAR_BASE)) + return WRONG; + while (yourtm.tm_mday <= 0) { + if (increment_overflow32(&y, -1)) + return WRONG; + li = y + (1 < yourtm.tm_mon); + yourtm.tm_mday += year_lengths[isleap(li)]; + } + while (yourtm.tm_mday > DAYSPERLYEAR) { + li = y + (1 < yourtm.tm_mon); + yourtm.tm_mday -= year_lengths[isleap(li)]; + if (increment_overflow32(&y, 1)) + return WRONG; + } + for ( ; ; ) { + i = mon_lengths[isleap(y)][yourtm.tm_mon]; + if (yourtm.tm_mday <= i) + break; + yourtm.tm_mday -= i; + if (++yourtm.tm_mon >= MONSPERYEAR) { + yourtm.tm_mon = 0; + if (increment_overflow32(&y, 1)) + return WRONG; + } + } + if (increment_overflow32(&y, -TM_YEAR_BASE)) + return WRONG; + yourtm.tm_year = y; + if (yourtm.tm_year != y) + return WRONG; + if (yourtm.tm_sec >= 0 && yourtm.tm_sec < SECSPERMIN) + saved_seconds = 0; + else if (y + TM_YEAR_BASE < EPOCH_YEAR) { + /* + ** We can't set tm_sec to 0, because that might push the + ** time below the minimum representable time. + ** Set tm_sec to 59 instead. + ** This assumes that the minimum representable time is + ** not in the same minute that a leap second was deleted from, + ** which is a safer assumption than using 58 would be. + */ + if (increment_overflow(&yourtm.tm_sec, 1 - SECSPERMIN)) + return WRONG; + saved_seconds = yourtm.tm_sec; + yourtm.tm_sec = SECSPERMIN - 1; + } else { + saved_seconds = yourtm.tm_sec; + yourtm.tm_sec = 0; + } + /* + ** Do a binary search (this works whatever time_t's type is). + */ + if (!TYPE_SIGNED(time_t)) { + lo = 0; + hi = lo - 1; + } else { + lo = 1; + for (i = 0; i < (int) TYPE_BIT(time_t) - 1; ++i) + lo *= 2; + hi = -(lo + 1); + } + for ( ; ; ) { + t = lo / 2 + hi / 2; + if (t < lo) + t = lo; + else if (t > hi) + t = hi; + if ((*funcp)(&t, offset, &mytm) == NULL) { + /* + ** Assume that t is too extreme to be represented in + ** a struct tm; arrange things so that it is less + ** extreme on the next pass. + */ + dir = (t > 0) ? 
1 : -1; + } else dir = tmcomp(&mytm, &yourtm); + if (dir != 0) { + if (t == lo) { + if (t == time_t_max) + return WRONG; + ++t; + ++lo; + } else if (t == hi) { + if (t == time_t_min) + return WRONG; + --t; + --hi; + } + if (lo > hi) + return WRONG; + if (dir > 0) + hi = t; + else lo = t; + continue; + } + if (yourtm.tm_isdst < 0 || mytm.tm_isdst == yourtm.tm_isdst) + break; + /* + ** Right time, wrong type. + ** Hunt for right time, right type. + ** It's okay to guess wrong since the guess + ** gets checked. + */ + sp = (const struct state *) + ((funcp == localsub) ? lclptr : gmtptr); + if (sp == NULL) + return WRONG; + for (i = sp->typecnt - 1; i >= 0; --i) { + if (sp->ttis[i].tt_isdst != yourtm.tm_isdst) + continue; + for (j = sp->typecnt - 1; j >= 0; --j) { + if (sp->ttis[j].tt_isdst == yourtm.tm_isdst) + continue; + newt = t + sp->ttis[j].tt_gmtoff - + sp->ttis[i].tt_gmtoff; + if ((*funcp)(&newt, offset, &mytm) == NULL) + continue; + if (tmcomp(&mytm, &yourtm) != 0) + continue; + if (mytm.tm_isdst != yourtm.tm_isdst) + continue; + /* + ** We have a match. + */ + t = newt; + goto label; + } + } + return WRONG; + } +label: + newt = t + saved_seconds; + if ((newt < t) != (saved_seconds < 0)) + return WRONG; + t = newt; + if ((*funcp)(&t, offset, tmp)) + *okayp = true; + return t; +} + +static time_t +time2(struct tm * const tmp, + struct tm * (*const funcp)(const time_t *, int_fast32_t, struct tm *), + const int_fast32_t offset, + int *const okayp) +{ + time_t t; + + /* + ** First try without normalization of seconds + ** (in case tm_sec contains a value associated with a leap second). + ** If that fails, try with normalization of seconds. + */ + t = time2sub(tmp, funcp, offset, okayp, false); + return *okayp ? 
t : time2sub(tmp, funcp, offset, okayp, true); +} + +static time_t +time1(struct tm *const tmp, + struct tm *(*const funcp) (const time_t *, int_fast32_t, struct tm *), + const int_fast32_t offset) +{ + register time_t t; + register const struct state * sp; + register int samei, otheri; + register int sameind, otherind; + register int i; + register int nseen; + int seen[TZ_MAX_TYPES]; + int types[TZ_MAX_TYPES]; + int okay; + + if (tmp == NULL) { + errno = EINVAL; + return WRONG; + } + if (tmp->tm_isdst > 1) + tmp->tm_isdst = 1; + t = time2(tmp, funcp, offset, &okay); + if (okay) + return t; + if (tmp->tm_isdst < 0) +#ifdef PCTS + /* + ** POSIX Conformance Test Suite code courtesy Grant Sullivan. + */ + tmp->tm_isdst = 0; /* reset to std and try again */ +#else + return t; +#endif /* !defined PCTS */ + /* + ** We're supposed to assume that somebody took a time of one type + ** and did some math on it that yielded a "struct tm" that's bad. + ** We try to divine the type they started from and adjust to the + ** type they need. + */ + sp = (const struct state *) ((funcp == localsub) ? 
lclptr : gmtptr); + if (sp == NULL) + return WRONG; + for (i = 0; i < sp->typecnt; ++i) + seen[i] = false; + nseen = 0; + for (i = sp->timecnt - 1; i >= 0; --i) + if (!seen[sp->types[i]]) { + seen[sp->types[i]] = true; + types[nseen++] = sp->types[i]; + } + for (sameind = 0; sameind < nseen; ++sameind) { + samei = types[sameind]; + if (sp->ttis[samei].tt_isdst != tmp->tm_isdst) + continue; + for (otherind = 0; otherind < nseen; ++otherind) { + otheri = types[otherind]; + if (sp->ttis[otheri].tt_isdst == tmp->tm_isdst) + continue; + tmp->tm_sec += sp->ttis[otheri].tt_gmtoff - + sp->ttis[samei].tt_gmtoff; + tmp->tm_isdst = !tmp->tm_isdst; + t = time2(tmp, funcp, offset, &okay); + if (okay) + return t; + tmp->tm_sec -= sp->ttis[otheri].tt_gmtoff - + sp->ttis[samei].tt_gmtoff; + tmp->tm_isdst = !tmp->tm_isdst; + } + } + return WRONG; +} + +time_t +mktime(struct tm *const tmp) +{ + tzset(); + return time1(tmp, localsub, 0L); +} + +#ifdef STD_INSPIRED + +time_t +timelocal(struct tm *const tmp) +{ + if (tmp != NULL) + tmp->tm_isdst = -1; /* in case it wasn't initialized */ + return mktime(tmp); +} + +time_t +timegm(struct tm *const tmp) +{ + if (tmp != NULL) + tmp->tm_isdst = 0; + return time1(tmp, gmtsub, 0L); +} + +time_t +timeoff(struct tm *const tmp, const long offset) +{ + if (tmp != NULL) + tmp->tm_isdst = 0; + return time1(tmp, gmtsub, offset); +} + +#endif /* defined STD_INSPIRED */ + +#ifdef CMUCS + +/* +** The following is supplied for compatibility with +** previous versions of the CMUCS runtime library. +*/ + +long +gtime(struct tm *const tmp) +{ + const time_t t = mktime(tmp); + + if (t == WRONG) + return -1; + return t; +} + +#endif /* defined CMUCS */ + +/* +** XXX--is the below the right way to conditionalize?? +*/ + +#ifdef STD_INSPIRED + +/* +** IEEE Std 1003.1-1988 (POSIX) legislates that 536457599 +** shall correspond to "Wed Dec 31 23:59:59 UTC 1986", which +** is not the case if we are accounting for leap seconds. 
+** So, we provide the following conversion routines for use +** when exchanging timestamps with POSIX conforming systems. +*/ + +static int_fast64_t +leapcorr(time_t *timep) +{ + register struct state * sp; + register struct lsinfo * lp; + register int i; + + sp = lclptr; + i = sp->leapcnt; + while (--i >= 0) { + lp = &sp->lsis[i]; + if (*timep >= lp->ls_trans) + return lp->ls_corr; + } + return 0; +} + +time_t +time2posix(time_t t) +{ + tzset(); + return t - leapcorr(&t); +} + +time_t +posix2time(time_t t) +{ + time_t x; + time_t y; + + tzset(); + /* + ** For a positive leap second hit, the result + ** is not unique. For a negative leap second + ** hit, the corresponding time doesn't exist, + ** so we return an adjacent second. + */ + x = t + leapcorr(&t); + y = x - leapcorr(&x); + if (y < t) { + do { + x++; + y = x - leapcorr(&x); + } while (y < t); + if (t != y) + return x - 1; + } else if (y > t) { + do { + --x; + y = x - leapcorr(&x); + } while (y > t); + if (t != y) + return x + 1; + } + return x; +} + +#endif /* defined STD_INSPIRED */ diff --git a/intl/icu/source/tools/tzcode/private.h b/intl/icu/source/tools/tzcode/private.h new file mode 100644 index 0000000000..1f35483dc4 --- /dev/null +++ b/intl/icu/source/tools/tzcode/private.h @@ -0,0 +1,415 @@ +#ifndef PRIVATE_H + +#define PRIVATE_H + +/* +** This file is in the public domain, so clarified as of +** 1996-06-05 by Arthur David Olson. +*/ + +/* +** This header is for use ONLY with the time conversion code. +** There is no guarantee that it will remain unchanged, +** or that it will remain at all. +** Do NOT copy it to any system include directory. +** Thank you! +*/ + +#define GRANDPARENTED "Local time zone must be set--see zic manual page" + +/* +** Defaults for preprocessor symbols. +** You can override these in your C compiler options, e.g. `-DHAVE_ADJTIME=0'. 
+*/ + +#ifndef HAVE_ADJTIME +#define HAVE_ADJTIME 1 +#endif /* !defined HAVE_ADJTIME */ + +#ifndef HAVE_GETTEXT +#define HAVE_GETTEXT 0 +#endif /* !defined HAVE_GETTEXT */ + +#ifndef HAVE_INCOMPATIBLE_CTIME_R +#define HAVE_INCOMPATIBLE_CTIME_R 0 +#endif /* !defined INCOMPATIBLE_CTIME_R */ + +#ifndef HAVE_LINK +#define HAVE_LINK 1 +#endif /* !defined HAVE_LINK */ + +#ifndef HAVE_SETTIMEOFDAY +#define HAVE_SETTIMEOFDAY 3 +#endif /* !defined HAVE_SETTIMEOFDAY */ + +#ifndef HAVE_SYMLINK +#define HAVE_SYMLINK 1 +#endif /* !defined HAVE_SYMLINK */ + +#ifndef HAVE_SYS_STAT_H +#define HAVE_SYS_STAT_H 1 +#endif /* !defined HAVE_SYS_STAT_H */ + +#ifndef HAVE_SYS_WAIT_H +#define HAVE_SYS_WAIT_H 1 +#endif /* !defined HAVE_SYS_WAIT_H */ + +#ifndef HAVE_UNISTD_H +#define HAVE_UNISTD_H 1 +#endif /* !defined HAVE_UNISTD_H */ + +#ifndef HAVE_UTMPX_H +#define HAVE_UTMPX_H 0 +#endif /* !defined HAVE_UTMPX_H */ + +#ifndef LOCALE_HOME +#define LOCALE_HOME "/usr/lib/locale" +#endif /* !defined LOCALE_HOME */ + +#if HAVE_INCOMPATIBLE_CTIME_R +#define asctime_r _incompatible_asctime_r +#define ctime_r _incompatible_ctime_r +#endif /* HAVE_INCOMPATIBLE_CTIME_R */ + +/* +** Nested includes +*/ + +#include "sys/types.h" /* for time_t */ +#include "stdio.h" +#include "errno.h" +#include "string.h" +#include "limits.h" /* for CHAR_BIT et al. 
*/ +#include "time.h" +#include "stdlib.h" + +#if HAVE_GETTEXT +#include "libintl.h" +#endif /* HAVE_GETTEXT */ + +#if HAVE_SYS_WAIT_H +#include <sys/wait.h> /* for WIFEXITED and WEXITSTATUS */ +#endif /* HAVE_SYS_WAIT_H */ + +#ifndef WIFEXITED +#define WIFEXITED(status) (((status) & 0xff) == 0) +#endif /* !defined WIFEXITED */ +#ifndef WEXITSTATUS +#define WEXITSTATUS(status) (((status) >> 8) & 0xff) +#endif /* !defined WEXITSTATUS */ + +#if HAVE_UNISTD_H +#include "unistd.h" /* for F_OK, R_OK, and other POSIX goodness */ +#endif /* HAVE_UNISTD_H */ + +#ifndef F_OK +#define F_OK 0 +#endif /* !defined F_OK */ +#ifndef R_OK +#define R_OK 4 +#endif /* !defined R_OK */ + +/* Unlike <ctype.h>'s isdigit, this also works if c < 0 | c > UCHAR_MAX. */ +#define is_digit(c) ((unsigned)(c) - '0' <= 9) + +/* +** Define HAVE_STDINT_H's default value here, rather than at the +** start, since __GLIBC__'s value depends on previously-included +** files. +** (glibc 2.1 and later have stdint.h, even with pre-C99 compilers.) +*/ +#ifndef HAVE_STDINT_H +#define HAVE_STDINT_H \ + (199901 <= __STDC_VERSION__ || \ + 2 < (__GLIBC__ + (0 < __GLIBC_MINOR__))) +#endif /* !defined HAVE_STDINT_H */ + +#if HAVE_STDINT_H +#include "stdint.h" +#endif /* !HAVE_STDINT_H */ + +#ifndef HAVE_INTTYPES_H +# define HAVE_INTTYPES_H HAVE_STDINT_H +#endif +#if HAVE_INTTYPES_H +# include <inttypes.h> +#endif + +#ifndef INT_FAST64_MAX +/* Pre-C99 GCC compilers define __LONG_LONG_MAX__ instead of LLONG_MAX. */ +#if defined LLONG_MAX || defined __LONG_LONG_MAX__ +typedef long long int_fast64_t; +# ifdef LLONG_MAX +# define INT_FAST64_MIN LLONG_MIN +# define INT_FAST64_MAX LLONG_MAX +# else +# define INT_FAST64_MIN __LONG_LONG_MIN__ +# define INT_FAST64_MAX __LONG_LONG_MAX__ +# endif +# define SCNdFAST64 "lld" +#else /* ! 
(defined LLONG_MAX || defined __LONG_LONG_MAX__) */ +#if (LONG_MAX >> 31) < 0xffffffff +Please use a compiler that supports a 64-bit integer type (or wider); +you may need to compile with "-DHAVE_STDINT_H". +#endif /* (LONG_MAX >> 31) < 0xffffffff */ +typedef long int_fast64_t; +# define INT_FAST64_MIN LONG_MIN +# define INT_FAST64_MAX LONG_MAX +# define SCNdFAST64 "ld" +#endif /* ! (defined LLONG_MAX || defined __LONG_LONG_MAX__) */ +#endif /* !defined INT_FAST64_MAX */ + +#ifndef INT_FAST32_MAX +# if INT_MAX >> 31 == 0 +typedef long int_fast32_t; +# else +typedef int int_fast32_t; +# endif +#endif + +#ifndef INTMAX_MAX +# if defined LLONG_MAX || defined __LONG_LONG_MAX__ +typedef long long intmax_t; +# define strtoimax strtoll +# define PRIdMAX "lld" +# ifdef LLONG_MAX +# define INTMAX_MAX LLONG_MAX +# define INTMAX_MIN LLONG_MIN +# else +# define INTMAX_MAX __LONG_LONG_MAX__ +# define INTMAX_MIN __LONG_LONG_MIN__ +# endif +# else +typedef long intmax_t; +# define strtoimax strtol +# define PRIdMAX "ld" +# define INTMAX_MAX LONG_MAX +# define INTMAX_MIN LONG_MIN +# endif +#endif + +#ifndef UINTMAX_MAX +# if defined ULLONG_MAX || defined __LONG_LONG_MAX__ +typedef unsigned long long uintmax_t; +# define PRIuMAX "llu" +# else +typedef unsigned long uintmax_t; +# define PRIuMAX "lu" +# endif +#endif + +#ifndef INT32_MAX +#define INT32_MAX 0x7fffffff +#endif /* !defined INT32_MAX */ +#ifndef INT32_MIN +#define INT32_MIN (-1 - INT32_MAX) +#endif /* !defined INT32_MIN */ + +#ifndef SIZE_MAX +#define SIZE_MAX ((size_t) -1) +#endif + +#if 2 < __GNUC__ + (96 <= __GNUC_MINOR__) +# define ATTRIBUTE_CONST __attribute__ ((const)) +# define ATTRIBUTE_PURE __attribute__ ((__pure__)) +# define ATTRIBUTE_FORMAT(spec) __attribute__ ((__format__ spec)) +#else +# define ATTRIBUTE_CONST /* empty */ +# define ATTRIBUTE_PURE /* empty */ +# define ATTRIBUTE_FORMAT(spec) /* empty */ +#endif + +#if !defined _Noreturn && __STDC_VERSION__ < 201112 +# if 2 < __GNUC__ + (8 <= __GNUC_MINOR__) 
+# define _Noreturn __attribute__ ((__noreturn__)) +# else +# define _Noreturn +# endif +#endif + +#if __STDC_VERSION__ < 199901 && !defined restrict +# define restrict /* empty */ +#endif + +/* +** Workarounds for compilers/systems. +*/ + +/* +** Some time.h implementations don't declare asctime_r. +** Others might define it as a macro. +** Fix the former without affecting the latter. +*/ + +#ifndef asctime_r +extern char * asctime_r(struct tm const *, char *); +#endif + +/* +** Compile with -Dtime_tz=T to build the tz package with a private +** time_t type equivalent to T rather than the system-supplied time_t. +** This debugging feature can test unusual design decisions +** (e.g., time_t wider than 'long', or unsigned time_t) even on +** typical platforms. +*/ +#ifdef time_tz +static time_t sys_time(time_t *x) { return time(x); } + +# undef ctime +# define ctime tz_ctime +# undef ctime_r +# define ctime_r tz_ctime_r +# undef difftime +# define difftime tz_difftime +# undef gmtime +# define gmtime tz_gmtime +# undef gmtime_r +# define gmtime_r tz_gmtime_r +# undef localtime +# define localtime tz_localtime +# undef localtime_r +# define localtime_r tz_localtime_r +# undef mktime +# define mktime tz_mktime +# undef time +# define time tz_time +# undef time_t +# define time_t tz_time_t + +typedef time_tz time_t; + +char *ctime(time_t const *); +char *ctime_r(time_t const *, char *); +double difftime(time_t, time_t); +struct tm *gmtime(time_t const *); +struct tm *gmtime_r(time_t const *restrict, struct tm *restrict); +struct tm *localtime(time_t const *); +struct tm *localtime_r(time_t const *restrict, struct tm *restrict); +time_t mktime(struct tm *); + +static time_t +time(time_t *p) +{ + time_t r = sys_time(0); + if (p) + *p = r; + return r; +} +#endif + +/* +** Private function declarations. 
/*
** Private function declarations.
** These helpers are only declared here; their definitions live in other
** source files of the package.
*/

char *		icatalloc(char * old, const char * new);
char *		icpyalloc(const char * string);
const char *	scheck(const char * string, const char * format);

/*
** Finally, some convenience items.
*/

/* Number of bits in the object representation of TYPE. */
#ifndef TYPE_BIT
#define TYPE_BIT(type)	(sizeof (type) * CHAR_BIT)
#endif /* !defined TYPE_BIT */

/* Nonzero if the integer type TYPE is signed. */
#ifndef TYPE_SIGNED
#define TYPE_SIGNED(type) (((type) -1) < 0)
#endif /* !defined TYPE_SIGNED */

/*
** The minimum and maximum finite time values.
**
** Both limits are assembled from left shifts of the positive value 1 only.
** Left-shifting a negative value, e.g. "(time_t) -1 << (bits - 1)", is
** undefined behaviour in C99 and later, so that form is avoided.
**
** For a signed time_t the maximum is built as (2**(bits-2) - 1) * 2 + 1,
** which never overflows along the way.  The "- (~ 0 < 0)" term contributes
** the extra -1 that only two's complement representations have, so the
** minimum is -max - 1 there and -max elsewhere.
** For an unsigned time_t the limits are 0 and all-bits-one.
*/
static time_t const time_t_min =
  (TYPE_SIGNED(time_t)
   ? (- (~ 0 < 0)
      - ((((time_t) 1 << (TYPE_BIT(time_t) - 2)) - 1) * 2 + 1))
   : 0);
static time_t const time_t_max =
  (TYPE_SIGNED(time_t)
   ? ((((time_t) 1 << (TYPE_BIT(time_t) - 2)) - 1) * 2 + 1)
   : (time_t) -1);

#ifndef INT_STRLEN_MAXIMUM
/*
** Upper bound on the number of characters needed to print a value of the
** integer type TYPE in decimal (not counting a terminating NUL).
**
** 302 / 1000 is log10(2.0) rounded up.
** Subtract one for the sign bit if the type is signed;
** add one for integer division truncation;
** add one more for a minus sign if the type is signed.
*/
#define INT_STRLEN_MAXIMUM(type) \
	((TYPE_BIT(type) - TYPE_SIGNED(type)) * 302 / 1000 + \
	1 + TYPE_SIGNED(type))
#endif /* !defined INT_STRLEN_MAXIMUM */

/*
** INITIALIZE(x)
** Expands to "x = 0" when building with GCC or lint, and to nothing
** otherwise.
*/

#ifndef GNUC_or_lint
#ifdef lint
#define GNUC_or_lint
#endif /* defined lint */
#ifndef lint
#ifdef __GNUC__
#define GNUC_or_lint
#endif /* defined __GNUC__ */
#endif /* !defined lint */
#endif /* !defined GNUC_or_lint */

#ifndef INITIALIZE
#ifdef GNUC_or_lint
#define INITIALIZE(x)	((x) = 0)
#endif /* defined GNUC_or_lint */
#ifndef GNUC_or_lint
#define INITIALIZE(x)
#endif /* !defined GNUC_or_lint */
#endif /* !defined INITIALIZE */
+*/ + +#ifndef _ +#if HAVE_GETTEXT +#define _(msgid) gettext(msgid) +#else /* !HAVE_GETTEXT */ +#define _(msgid) msgid +#endif /* !HAVE_GETTEXT */ +#endif /* !defined _ */ + +#ifndef TZ_DOMAIN +#define TZ_DOMAIN "tz" +#endif /* !defined TZ_DOMAIN */ + +#if HAVE_INCOMPATIBLE_CTIME_R +#undef asctime_r +#undef ctime_r +char *asctime_r(struct tm const *, char *); +char *ctime_r(time_t const *, char *); +#endif /* HAVE_INCOMPATIBLE_CTIME_R */ + +#ifndef YEARSPERREPEAT +#define YEARSPERREPEAT 400 /* years before a Gregorian repeat */ +#endif /* !defined YEARSPERREPEAT */ + +/* +** The Gregorian year averages 365.2425 days, which is 31556952 seconds. +*/ + +#ifndef AVGSECSPERYEAR +#define AVGSECSPERYEAR 31556952L +#endif /* !defined AVGSECSPERYEAR */ + +#ifndef SECSPERREPEAT +#define SECSPERREPEAT ((int_fast64_t) YEARSPERREPEAT * (int_fast64_t) AVGSECSPERYEAR) +#endif /* !defined SECSPERREPEAT */ + +#ifndef SECSPERREPEAT_BITS +#define SECSPERREPEAT_BITS 34 /* ceil(log2(SECSPERREPEAT)) */ +#endif /* !defined SECSPERREPEAT_BITS */ + +/* +** UNIX was a registered trademark of The Open Group in 2003. +*/ + +#endif /* !defined PRIVATE_H */ diff --git a/intl/icu/source/tools/tzcode/readme.txt b/intl/icu/source/tools/tzcode/readme.txt new file mode 100644 index 0000000000..9a3cf97703 --- /dev/null +++ b/intl/icu/source/tools/tzcode/readme.txt @@ -0,0 +1,95 @@ +* Copyright (C) 2016 and later: Unicode, Inc. and others. +* License & terms of use: http://www.unicode.org/copyright.html +********************************************************************** +* Copyright (c) 2003-2014, International Business Machines +* Corporation and others. All Rights Reserved. 
+********************************************************************** +* Author: Alan Liu +* Created: August 18 2003 +* Since: ICU 2.8 +********************************************************************** + +Note: this directory currently contains tzcode as of tzcode2014b.tar.gz + with localtime.c patches from tzcode2014b.tar.gz + + +---------------------------------------------------------------------- +OVERVIEW + +This file describes the tools in icu/source/tools/tzcode + +The purpose of these tools is to process the zoneinfo or "Olson" time +zone database into a form usable by ICU4C (release 2.8 and later). +Unlike earlier releases, ICU4C 2.8 supports historical time zone +behavior, as well as the full set of Olson compatibility IDs. + +References: + +ICU4C: https://icu.unicode.org/ +Olson: ftp://ftp.iana.org/tz/releases/ + +---------------------------------------------------------------------- +ICU4C vs. ICU4J + +For ICU releases >= 2.8, both ICU4C and ICU4J implement full +historical time zones, based on Olson data. The implementations in C +and Java are somewhat different. The C implementation is a +self-contained implementation, whereas ICU4J uses the underlying JDK +1.3 or 1.4 time zone implementation. + +Older versions of ICU (C and Java <= 2.6) implement a "present day +snapshot". This only reflects current time zone behavior, without +historical variation. Furthermore, it lacks the full set of Olson +compatibility IDs. + +---------------------------------------------------------------------- +BACKGROUND + +The zoneinfo or "Olson" time zone package is used by various systems +to describe the behavior of time zones. The package consists of +several parts. E.g.: + + Index of ftp://ftp.iana.org/tz/releases/ + + tzcode2014b.tar.gz 172 KB 3/25/2014 05:11:00 AM + tzdata2014b.tar.gz 216 KB 3/25/2014 05:11:00 AM + +ICU only uses the tzdataYYYYV.tar.gz files, +where YYYY is the year and V is the version letter ('a'...'z'). 
+ +This directory has partial contents of tzcode checked into ICU. + +---------------------------------------------------------------------- +HOWTO + +0. Note, these instructions will only work on POSIX type systems. + +1. Obtain the current version of tzdataYYYYV.tar.gz (aka `tzdata') from + the FTP site given above. Either manually download or use wget: + + $ cd {path_to}/icu/source/tools/tzcode + $ wget "ftp://ftp.iana.org/tz/releases/tzdata*.tar.gz" + +2. Copy only one tzdata*.tar.gz file into the icu/source/tools/tzcode/ + directory (this directory). + + *** Make sure you only have ONE FILE named tzdata*.tar.gz in the + directory. + +3. Build ICU normally. You will see a notice "updating zoneinfo.txt..." + +### The following instructions are for ICU maintainers only ### + +4. Obtain the current version of tzcodeYYYYV.tar.gz from the FTP site and + place it in this directory. + +5. Run the make target "check-dump". This target extracts and builds the + original tzcode, and compiles the original tzdata together with the ICU + supplemental data (icuzones). It then builds zdump / icuzdump and dumps + all time transitions for all ICU time zones to files under the zdumpout / + icuzdumpout directories. If the two tools produce different results, the + target fails with an error. + +6. Don't forget to check in the new zoneinfo64.txt (from its location at + {path_to}/icu/source/data/misc/zoneinfo64.txt) into SVN. + diff --git a/intl/icu/source/tools/tzcode/scheck.c b/intl/icu/source/tools/tzcode/scheck.c new file mode 100644 index 0000000000..8bd01a858f --- /dev/null +++ b/intl/icu/source/tools/tzcode/scheck.c @@ -0,0 +1,64 @@ +/* +** This file is in the public domain, so clarified as of +** 2006-07-17 by Arthur David Olson.
/*
** Build a copy of FORMAT in which every conversion is assignment-suppressed
** (a '*' is inserted after each '%', and after a leading "N$" positional
** prefix when one is present), append a final "%c", and run sscanf on STRING
** with that rewritten format.
**
** Returns "" when STRING or FORMAT is NULL, when the scratch buffer cannot
** be allocated, or when sscanf reports exactly one assignment (the appended
** "%c" was satisfied).  Returns FORMAT in every other case.
**
** Limitations: scansets containing '%' (e.g. "%[%]") and multibyte
** characters containing a non-leading '%' byte are not supported.
*/
const char *
scheck(const char *const string, const char *const format)
{
	const char *	src;		/* read cursor in format */
	char *		scratch;	/* rewritten format */
	char *		dst;		/* write cursor in scratch */
	char		trailing;	/* target of the appended %c */
	int		ch;
	int		assigned;

	if (string == NULL || format == NULL)
		return "";
	/* Each input byte yields at most two output bytes, plus "%c" and NUL. */
	scratch = malloc(2 * strlen(format) + 4);
	if (scratch == NULL)
		return "";

	src = format;
	dst = scratch;
	for ( ; ; ) {
		ch = *src++;
		*dst++ = ch;
		if (ch == '\0')
			break;
		if (ch != '%')
			continue;
		if (is_digit(*src)) {
			/*
			** Tentatively copy the digit run and the byte that
			** follows it; commit the copy only when that byte is
			** '$', i.e. the digits were a positional prefix.
			*/
			const char *	ahead = src;
			char *		out = dst;

			do {
				ch = *ahead++;
				*out++ = ch;
			} while (is_digit(ch));
			if (ch == '$') {
				src = ahead;
				dst = out;
			}
		}
		*dst++ = '*';
		/* The conversion is suppressed now; drop an existing '*'. */
		if (*src == '*')
			++src;
		ch = *src++;
		*dst++ = ch;
		if (ch == '\0')
			break;
	}

	/* Overwrite the copied NUL with the trailing "%c". */
	dst[-1] = '%';
	*dst++ = 'c';
	*dst = '\0';

	assigned = sscanf(string, scratch, &trailing);
	free(scratch);
	return (assigned == 1) ? "" : format;
}
//--------------------------------------------------------------------
// Time utilities
//--------------------------------------------------------------------

const int64_t SECS_PER_YEAR      = 31536000; // 365 days
const int64_t SECS_PER_LEAP_YEAR = 31622400; // 366 days
const int64_t LOWEST_TIME32    = (int64_t)((int32_t)0x80000000);
const int64_t HIGHEST_TIME32    = (int64_t)((int32_t)0x7fffffff);

/**
 * Return true if y is a leap year under the Gregorian rule
 * (divisible by 4, except centuries not divisible by 400).
 */
bool isLeap(int32_t y) {
    return (y%4 == 0) && ((y%100 != 0) || (y%400 == 0)); // Gregorian
}

/**
 * Return the number of seconds in calendar year y
 * (SECS_PER_LEAP_YEAR for leap years, SECS_PER_YEAR otherwise).
 */
int64_t secsPerYear(int32_t y) {
    return isLeap(y) ? SECS_PER_LEAP_YEAR : SECS_PER_YEAR;
}

/**
 * Given a calendar year, return the GMT epoch seconds for midnight
 * GMT of January 1 of that year.  yearToSeconds(1970) == 0.
 * Years before 1970 yield negative values.
 */
int64_t yearToSeconds(int32_t year) {
    // inefficient but foolproof: walk one year at a time from 1970
    int64_t s = 0;
    int32_t y = 1970;
    while (y < year) {
        s += secsPerYear(y++);
    }
    while (y > year) {
        s -= secsPerYear(--y);
    }
    return s;
}

/**
 * Given 1970 GMT epoch seconds, return the calendar year containing
 * that time.  secondsToYear(0) == 1970.
 */
int32_t secondsToYear(int64_t seconds) {
    // inefficient but foolproof: walk one year at a time from 1970
    int32_t y = 1970;
    int64_t s = 0;
    if (seconds >= 0) {
        for (;;) {
            s += secsPerYear(y++);
            if (s > seconds) break;
        }
        --y; // the loop stepped one year past the containing year
    } else {
        for (;;) {
            s -= secsPerYear(--y);
            if (s <= seconds) break;
        }
    }
    return y;
}

//--------------------------------------------------------------------
// Types
//--------------------------------------------------------------------

struct FinalZone;
struct FinalRule;
struct SimplifiedZoneType;

// A transition from one ZoneType to another
struct Transition {
    int64_t time;  // seconds, 1970 epoch
    int32_t type;  // index into 'ZoneInfo.types' 0..255
    Transition(int64_t _time, int32_t _type) : time(_time), type(_type) {}
};

// A behavior mode (what zic calls a 'type') of a time zone.
// SEE: SimplifiedZoneType
struct ZoneType {
    int64_t rawoffset; // raw seconds offset from GMT
    int64_t dstoffset; // dst seconds offset from GMT

    // We don't really need any of the following, but they are
    // retained for possible future use.  See SimplifiedZoneType.
    int32_t abbr;      // index into ZoneInfo.abbrs 0..n-1
    bool isdst;
    bool isstd;
    bool isgmt;

    ZoneType(const SimplifiedZoneType&); // used by optimizeTypeList

    // Every member gets a defined value.  The flags start out false so
    // that a ZoneType can be copied (e.g. pushed into a vector) before
    // all of its flags have been filled in, without reading
    // indeterminate values.
    ZoneType() : rawoffset(-1), dstoffset(-1), abbr(-1),
                 isdst(false), isstd(false), isgmt(false) {}

    // A restricted equality, of just the raw and dst offset
    bool matches(const ZoneType& other) const {
        return rawoffset == other.rawoffset &&
            dstoffset == other.dstoffset;
    }
};
// Read zic-coded 32-bit integer from file.
//
// The value is stored as four bytes, most significant first, and is
// interpreted as a two's complement signed 32-bit number that is widened
// to int64_t.  Throws out_of_range if the decoded value lies outside
// [minv, maxv].
//
// The bytes are assembled in an unsigned 32-bit accumulator and the sign is
// applied explicitly afterwards; shifting a promoted byte >= 0x80 left by 24
// in (signed) int would overflow.  The buffer is zero-initialized so that a
// short or failed read yields a defined value; callers are expected to check
// the stream state themselves.
int64_t readcoded(ifstream& file, int64_t minv=numeric_limits<int64_t>::min(),
                                   int64_t maxv=numeric_limits<int64_t>::max()) {
    unsigned char buf[4] = {0, 0, 0, 0}; // must be UNSIGNED
    file.read((char*)buf, 4);

    uint32_t raw = 0;
    for(int32_t i=0;i<4;++i) {
        raw = (raw << 8) | buf[i];
    }

    // Sign-extend bit 31 into the 64-bit result.
    int64_t val = (int64_t)raw;
    if (raw & 0x80000000u) {
        val -= ((int64_t)1 << 32);
    }

    if (val < minv || val > maxv) {
        ostringstream os;
        os << "coded value out-of-range: " << val << ", expected ["
           << minv << ", " << maxv << "]";
        throw out_of_range(os.str());
    }
    return val;
}
ostringstream os; + os << "coded value out-of-range: " << val << ", expected [" + << minv << ", " << maxv << "]"; + throw out_of_range(os.str()); + } + return val; +} + +// Read a boolean value +bool readbool(ifstream& file) { + char c; + file.read(&c, 1); + if (c!=0 && c!=1) { + ostringstream os; + os << "boolean value out-of-range: " << (int32_t)c; + throw out_of_range(os.str()); + } + return (c!=0); +} + +/** + * Read the zoneinfo file structure (see tzfile.h) into a ZoneInfo + * @param file an already-open file stream + */ +void readzoneinfo(ifstream& file, ZoneInfo& info, bool is64bitData) { + int32_t i; + + // Check for TZ_ICU_MAGIC signature at file start. If we get a + // signature mismatch, it means we're trying to read a file which + // isn't a ICU-modified-zic-created zoneinfo file. Typically this + // means the user is passing in a "normal" zoneinfo directory, or + // a zoneinfo directory that is polluted with other files, or that + // the user passed in the wrong directory. + char buf[32]; + file.read(buf, 4); + if (strncmp(buf, TZ_ICU_MAGIC, 4) != 0) { + throw invalid_argument("TZ_ICU_MAGIC signature missing"); + } + // skip additional Olson byte version + file.read(buf, 1); + // if '\0', we have just one copy of data, if '2' or '3', there is additional + // 64 bit version at the end. + if(buf[0]!=0 && buf[0]!='2' && buf[0]!='3') { + throw invalid_argument("Bad Olson version info"); + } + + // Read reserved bytes. The first of these will be a version byte. + file.read(buf, 15); + if (*(ICUZoneinfoVersion*)&buf != TZ_ICU_VERSION) { + throw invalid_argument("File version mismatch"); + } + + // Read array sizes + int64_t isgmtcnt = readcoded(file, 0); + int64_t isdstcnt = readcoded(file, 0); + int64_t leapcnt = readcoded(file, 0); + int64_t timecnt = readcoded(file, 0); + int64_t typecnt = readcoded(file, 0); + int64_t charcnt = readcoded(file, 0); + + // Confirm sizes that we assume to be equal. 
These assumptions + // are drawn from a reading of the zic source (2003a), so they + // should hold unless the zic source changes. + if (isgmtcnt != typecnt || isdstcnt != typecnt) { + throw invalid_argument("count mismatch between tzh_ttisgmtcnt, tzh_ttisdstcnt, tth_typecnt"); + } + + // Used temporarily to store transition times and types. We need + // to do this because the times and types are stored in two + // separate arrays. + vector<int64_t> transitionTimes(timecnt, -1); // temporary + vector<int32_t> transitionTypes(timecnt, -1); // temporary + + // Read transition times + for (i=0; i<timecnt; ++i) { + if (is64bitData) { + transitionTimes[i] = readcoded64(file); + } else { + transitionTimes[i] = readcoded(file); + } + } + + // Read transition types + for (i=0; i<timecnt; ++i) { + unsigned char c; + file.read((char*) &c, 1); + int32_t t = (int32_t) c; + if (t < 0 || t >= typecnt) { + ostringstream os; + os << "illegal type: " << t << ", expected [0, " << (typecnt-1) << "]"; + throw out_of_range(os.str()); + } + transitionTypes[i] = t; + } + + // Build transitions vector out of corresponding times and types. + bool insertInitial = false; + if (is64bitData && !ICU44PLUS) { + if (timecnt > 0) { + int32_t minidx = -1; + for (i=0; i<timecnt; ++i) { + if (transitionTimes[i] < LOWEST_TIME32) { + if (minidx == -1 || transitionTimes[i] > transitionTimes[minidx]) { + // Preserve the latest transition before the 32bit minimum time + minidx = i; + } + } else if (transitionTimes[i] > HIGHEST_TIME32) { + // Skipping the rest of the transition data. We cannot put such + // transitions into zoneinfo.res, because data is limited to signed + // 32bit int by the ICU resource bundle. 
+ break; + } else { + info.transitions.push_back(Transition(transitionTimes[i], transitionTypes[i])); + } + } + + if (minidx != -1) { + // If there are any transitions before the 32bit minimum time, + // put the type information with the 32bit minimum time + vector<Transition>::iterator itr = info.transitions.begin(); + info.transitions.insert(itr, Transition(LOWEST_TIME32, transitionTypes[minidx])); + } else { + // Otherwise, we need insert the initial type later + insertInitial = true; + } + } + } else { + for (i=0; i<timecnt; ++i) { + info.transitions.push_back(Transition(transitionTimes[i], transitionTypes[i])); + } + } + + // Read types (except for the isdst and isgmt flags, which come later (why??)) + for (i=0; i<typecnt; ++i) { + ZoneType type; + + type.rawoffset = readcoded(file); + type.dstoffset = readcoded(file); + type.isdst = readbool(file); + + unsigned char c; + file.read((char*) &c, 1); + type.abbr = (int32_t) c; + + if (type.isdst != (type.dstoffset != 0)) { + throw invalid_argument("isdst does not reflect dstoffset"); + } + + info.types.push_back(type); + } + + assert(info.types.size() == (unsigned) typecnt); + + if (insertInitial) { + assert(timecnt > 0); + assert(typecnt > 0); + + int32_t initialTypeIdx = -1; + + // Check if the first type is not dst + if (info.types.at(0).dstoffset != 0) { + // Initial type's rawoffset is same with the rawoffset after the + // first transition, but no DST is observed. 
+ int64_t rawoffset0 = (info.types.at(info.transitions.at(0).type)).rawoffset; + // Look for matching type + for (i=0; i<(int32_t)info.types.size(); ++i) { + if (info.types.at(i).rawoffset == rawoffset0 + && info.types.at(i).dstoffset == 0) { + initialTypeIdx = i; + break; + } + } + } else { + initialTypeIdx = 0; + } + assert(initialTypeIdx >= 0); + // Add the initial type associated with the lowest int32 time + vector<Transition>::iterator itr = info.transitions.begin(); + info.transitions.insert(itr, Transition(LOWEST_TIME32, initialTypeIdx)); + } + + + // Read the abbreviation string + if (charcnt) { + // All abbreviations are concatenated together, with a 0 at + // the end of each abbr. + char* str = new char[charcnt + 8]; + file.read(str, charcnt); + + // Split abbreviations apart into individual strings. Record + // offset of each abbr in a vector. + vector<int32_t> abbroffset; + char *limit=str+charcnt; + for (char* p=str; p<limit; ++p) { + char* start = p; + while (*p != 0) ++p; + info.abbrs.push_back(string(start, p-start)); + abbroffset.push_back(start-str); + } + + // Remap all the abbrs. Old value is offset into concatenated + // raw abbr strings. New value is index into vector of + // strings. E.g., 0,5,10,14 => 0,1,2,3. + + // Keep track of which abbreviations get used. + vector<bool> abbrseen(abbroffset.size(), false); + + for (vector<ZoneType>::iterator it=info.types.begin(); + it!=info.types.end(); + ++it) { + vector<int32_t>::const_iterator x= + find(abbroffset.begin(), abbroffset.end(), it->abbr); + if (x==abbroffset.end()) { + // TODO: Modify code to add a new string to the end of + // the abbr list when a middle offset is given, e.g., + // "abc*def*" where * == '\0', take offset of 1 and + // make the array "abc", "def", "bc", and translate 1 + // => 2. NOT CRITICAL since we don't even use the + // abbr at this time. +#if 0 + // TODO: Re-enable this warning if we start using + // the Olson abbr data, or if the above TODO is completed. 
+ ostringstream os; + os << "Warning: unusual abbr offset " << it->abbr + << ", expected one of"; + for (vector<int32_t>::const_iterator y=abbroffset.begin(); + y!=abbroffset.end(); ++y) { + os << ' ' << *y; + } + cerr << os.str() << "; using 0" << endl; +#endif + it->abbr = 0; + } else { + int32_t index = x - abbroffset.begin(); + it->abbr = index; + abbrseen[index] = true; + } + } + + for (int32_t ii=0;ii<(int32_t) abbrseen.size();++ii) { + if (!abbrseen[ii]) { + cerr << "Warning: unused abbreviation: " << ii << endl; + } + } + } + + // Read leap second info, if any. + // *** We discard leap second data. *** + for (i=0; i<leapcnt; ++i) { + readcoded(file); // transition time + readcoded(file); // total correction after above + } + + // Read isstd flags + for (i=0; i<typecnt; ++i) info.types[i].isstd = readbool(file); + + // Read isgmt flags + for (i=0; i<typecnt; ++i) info.types[i].isgmt = readbool(file); +} + +//-------------------------------------------------------------------- +// Directory and file reading +//-------------------------------------------------------------------- + +/** + * Process a single zoneinfo file, adding the data to ZONEINFO + * @param path the full path to the file, e.g., ".\zoneinfo\America\Los_Angeles" + * @param id the zone ID, e.g., "America/Los_Angeles" + */ +void handleFile(string path, string id) { + // Check for duplicate id + if (ZONEINFO.find(id) != ZONEINFO.end()) { + ostringstream os; + os << "duplicate zone ID: " << id; + throw invalid_argument(os.str()); + } + + ifstream file(path.c_str(), ios::in | ios::binary); + if (!file) { + throw invalid_argument("can't open file"); + } + + // eat 32bit data part + ZoneInfo info; + readzoneinfo(file, info, false); + + // Check for errors + if (!file) { + throw invalid_argument("read error"); + } + + // we only use 64bit part + ZoneInfo info64; + readzoneinfo(file, info64, true); + + bool alldone = false; + int64_t eofPos = (int64_t) file.tellg(); + + // '\n' + <envvar string> + '\n' 
after the 64bit version data + char ch = file.get(); + if (ch == 0x0a) { + bool invalidchar = false; + while (file.get(ch)) { + if (ch == 0x0a) { + break; + } + if (ch < 0x20) { + // must be printable ascii + invalidchar = true; + break; + } + } + if (!invalidchar) { + eofPos = (int64_t) file.tellg(); + file.seekg(0, ios::end); + eofPos = eofPos - (int64_t) file.tellg(); + if (eofPos == 0) { + alldone = true; + } + } + } + if (!alldone) { + ostringstream os; + os << (-eofPos) << " unprocessed bytes at end"; + throw invalid_argument(os.str()); + } + + ZONEINFO[id] = info64; +} + +/** + * Recursively scan the given directory, calling handleFile() for each + * file in the tree. The user should call with the root directory and + * a prefix of "". The function will call itself with non-empty + * prefix values. + */ +#ifdef WIN32 + +void scandir(string dirname, string prefix="") { + HANDLE hList; + WIN32_FIND_DATA FileData; + + // Get the first file + hList = FindFirstFile((dirname + "\\*").c_str(), &FileData); + if (hList == INVALID_HANDLE_VALUE) { + cerr << "Error: Invalid directory: " << dirname << endl; + exit(1); + } + for (;;) { + string name(FileData.cFileName); + string path(dirname + "\\" + name); + if (FileData.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) { + if (name != "." && name != "..") { + scandir(path, prefix + name + "/"); + } + } else { + try { + string id = prefix + name; + handleFile(path, id); + } catch (const exception& e) { + cerr << "Error: While processing \"" << path << "\", " + << e.what() << endl; + exit(1); + } + } + + if (!FindNextFile(hList, &FileData)) { + if (GetLastError() == ERROR_NO_MORE_FILES) { + break; + } // else...? 
+ } + } + FindClose(hList); +} + +#else + +void scandir(string dir, string prefix="") { + DIR *dp; + struct dirent *dir_entry; + struct stat stat_info; + char pwd[512]; + vector<string> subdirs; + vector<string> subfiles; + + if ((dp = opendir(dir.c_str())) == nullptr) { + cerr << "Error: Invalid directory: " << dir << endl; + exit(1); + } + if (!getcwd(pwd, sizeof(pwd))) { + cerr << "Error: Directory name too long" << endl; + exit(1); + } + chdir(dir.c_str()); + while ((dir_entry = readdir(dp)) != nullptr) { + string name = dir_entry->d_name; + string path = dir + "/" + name; + lstat(dir_entry->d_name,&stat_info); + if (S_ISDIR(stat_info.st_mode)) { + if (name != "." && name != "..") { + subdirs.push_back(path); + subdirs.push_back(prefix + name + "/"); + // scandir(path, prefix + name + "/"); + } + } else { + try { + string id = prefix + name; + subfiles.push_back(path); + subfiles.push_back(id); + // handleFile(path, id); + } catch (const exception& e) { + cerr << "Error: While processing \"" << path << "\", " + << e.what() << endl; + exit(1); + } + } + } + closedir(dp); + chdir(pwd); + + for(int32_t i=0;i<(int32_t)subfiles.size();i+=2) { + try { + handleFile(subfiles[i], subfiles[i+1]); + } catch (const exception& e) { + cerr << "Error: While processing \"" << subfiles[i] << "\", " + << e.what() << endl; + exit(1); + } + } + for(int32_t i=0;i<(int32_t)subdirs.size();i+=2) { + scandir(subdirs[i], subdirs[i+1]); + } +} + +#endif + +//-------------------------------------------------------------------- +// Final zone and rule info +//-------------------------------------------------------------------- + +/** + * Read and discard the current line. + */ +void consumeLine(istream& in) { + int32_t c; + do { + c = in.get(); + } while (c != EOF && c != '\n'); +} + +enum { + DOM = 0, + DOWGEQ = 1, + DOWLEQ = 2 +}; + +const char* TIME_MODE[] = {"w", "s", "u"}; + +// Allow 29 days in February because zic outputs February 29 +// for rules like "last Sunday in February". 
+const int32_t MONTH_LEN[] = {31,29,31,30,31,30,31,31,30,31,30,31}; + +const int32_t HOUR = 3600; + +struct FinalZone { + int32_t offset; // raw offset + int32_t year; // takes effect for y >= year + string ruleid; + set<string> aliases; + FinalZone(int32_t _offset, int32_t _year, const string& _ruleid) : + offset(_offset), year(_year), ruleid(_ruleid) { + if (offset <= -16*HOUR || offset >= 16*HOUR) { + ostringstream os; + os << "Invalid input offset " << offset + << " for year " << year + << " and rule ID " << ruleid; + throw invalid_argument(os.str()); + } + if (year < 1900) { + ostringstream os; + os << "Invalid input year " << year + << " with offset " << offset + << " and rule ID " << ruleid; + throw invalid_argument(os.str()); + } + } + FinalZone() : offset(-1), year(-1) {} + void addLink(const string& alias) { + if (aliases.find(alias) != aliases.end()) { + ostringstream os; + os << "Duplicate alias " << alias; + throw invalid_argument(os.str()); + } + aliases.insert(alias); + } +}; + +struct FinalRulePart { + int32_t mode; + int32_t month; + int32_t dom; + int32_t dow; + int32_t time; + int32_t offset; // dst offset, usually either 0 or 1:00 + + // Isstd and isgmt only have 3 valid states, corresponding to local + // wall time, local standard time, and GMT standard time. 
+ // Here is how the isstd & isgmt flags are set by zic: + //| case 's': /* Standard */ + //| rp->r_todisstd = true; + //| rp->r_todisgmt = false; + //| case 'w': /* Wall */ + //| rp->r_todisstd = false; + //| rp->r_todisgmt = false; + //| case 'g': /* Greenwich */ + //| case 'u': /* Universal */ + //| case 'z': /* Zulu */ + //| rp->r_todisstd = true; + //| rp->r_todisgmt = true; + bool isstd; + bool isgmt; + + bool isset; // used during building; later ignored + + FinalRulePart() : isset(false) {} + void set(const string& id, + const string& _mode, + int32_t _month, + int32_t _dom, + int32_t _dow, + int32_t _time, + bool _isstd, + bool _isgmt, + int32_t _offset) { + if (isset) { + throw invalid_argument("FinalRulePart set twice"); + } + isset = true; + if (_mode == "DOWLEQ") { + mode = DOWLEQ; + } else if (_mode == "DOWGEQ") { + mode = DOWGEQ; + } else if (_mode == "DOM") { + mode = DOM; + } else { + throw invalid_argument("Unrecognized FinalRulePart mode"); + } + month = _month; + dom = _dom; + dow = _dow; + time = _time; + isstd = _isstd; + isgmt = _isgmt; + offset = _offset; + + ostringstream os; + if (month < 0 || month >= 12) { + os << "Invalid input month " << month; + } + if (dom < 1 || dom > MONTH_LEN[month]) { + os << "Invalid input day of month " << dom; + } + if (mode != DOM && (dow < 0 || dow >= 7)) { + os << "Invalid input day of week " << dow; + } + if (offset < (-1 * HOUR) || offset > (2 * HOUR)) { + os << "Invalid input offset " << offset; + } + if (isgmt && !isstd) { + os << "Invalid input isgmt && !isstd"; + } + if (!os.str().empty()) { + os << " for rule " + << id + << _mode + << month << dom << dow << time + << isstd << isgmt + << offset; + throw invalid_argument(os.str()); + } + } + + /** + * Return the time mode as an ICU SimpleTimeZone int from 0..2; + * see simpletz.h. 
+ */ + int32_t timemode() const { + if (isgmt) { + assert(isstd); + return 2; // gmt standard + } + if (isstd) { + return 1; // local standard + } + return 0; // local wall + } + + // The SimpleTimeZone encoding method for rules is as follows: + // stz_dowim stz_dow + // DOM: dom 0 + // DOWGEQ: dom -(dow+1) + // DOWLEQ: -dom -(dow+1) + // E.g., to encode Mon>=7, use stz_dowim=7, stz_dow=-2 + // to encode Mon<=7, use stz_dowim=-7, stz_dow=-2 + // to encode 7, use stz_dowim=7, stz_dow=0 + // Note that for this program and for SimpleTimeZone, 0==Jan, + // but for this program 0==Sun while for SimpleTimeZone 1==Sun. + + /** + * Return a "dowim" param suitable for SimpleTimeZone. + */ + int32_t stz_dowim() const { + return (mode == DOWLEQ) ? -dom : dom; + } + + /** + * Return a "dow" param suitable for SimpleTimeZone. + */ + int32_t stz_dow() const { + return (mode == DOM) ? 0 : -(dow+1); + } +}; + +struct FinalRule { + FinalRulePart part[2]; + + bool isset() const { + return part[0].isset && part[1].isset; + } + + void print(ostream& os) const; +}; + +map<string,FinalZone> finalZones; +map<string,FinalRule> finalRules; + +map<string, set<string> > links; +map<string, string> reverseLinks; + +/** + * Predicate used to find FinalRule objects that do not have both + * sub-parts set (indicating an error in the input file). + */ +bool isNotSet(const pair<const string,FinalRule>& p) { + return !p.second.isset(); +} + +/** + * Predicate used to find FinalZone objects that do not map to a known + * rule (indicating an error in the input file). + */ +bool mapsToUnknownRule(const pair<const string,FinalZone>& p) { + return finalRules.find(p.second.ruleid) == finalRules.end(); +} + +/** + * This set is used to make sure each rule in finalRules is used at + * least once. First we populate it with all the rules from + * finalRules; then we remove all the rules referred to in + * finaleZones. 
+ */ +set<string> ruleIDset; + +void insertRuleID(const pair<string,FinalRule>& p) { + ruleIDset.insert(p.first); +} + +void eraseRuleID(const pair<string,FinalZone>& p) { + ruleIDset.erase(p.second.ruleid); +} + +/** + * Populate finalZones and finalRules from the given istream. + */ +void readFinalZonesAndRules(istream& in) { + + for (;;) { + string token; + in >> token; + if (in.eof() || !in) { + break; + } else if (token == "zone") { + // zone Africa/Cairo 7200 1995 Egypt # zone Africa/Cairo, offset 7200, year >= 1995, rule Egypt (0) + string id, ruleid; + int32_t offset, year; + in >> id >> offset >> year >> ruleid; + consumeLine(in); + finalZones[id] = FinalZone(offset, year, ruleid); + } else if (token == "rule") { + // rule US DOWGEQ 3 1 0 7200 0 0 3600 # 52: US, file data/northamerica, line 119, mode DOWGEQ, April, dom 1, Sunday, time 7200, isstd 0, isgmt 0, offset 3600 + // rule US DOWLEQ 9 31 0 7200 0 0 0 # 53: US, file data/northamerica, line 114, mode DOWLEQ, October, dom 31, Sunday, time 7200, isstd 0, isgmt 0, offset 0 + string id, mode; + int32_t month, dom, dow, time, offset; + bool isstd, isgmt; + in >> id >> mode >> month >> dom >> dow >> time >> isstd >> isgmt >> offset; + consumeLine(in); + FinalRule& fr = finalRules[id]; + int32_t p = fr.part[0].isset ? 1 : 0; + fr.part[p].set(id, mode, month, dom, dow, time, isstd, isgmt, offset); + } else if (token == "link") { + string fromid, toid; // fromid == "real" zone, toid == alias + in >> fromid >> toid; + // DO NOT consumeLine(in); + if (finalZones.find(toid) != finalZones.end()) { + throw invalid_argument("Bad link: `to' id is a \"real\" zone"); + } + + links[fromid].insert(toid); + reverseLinks[toid] = fromid; + } else if (token.length() > 0 && token[0] == '#') { + consumeLine(in); + } else { + throw invalid_argument("Unrecognized keyword"); + } + } + + if (!in.eof() && !in) { + throw invalid_argument("Parse failure"); + } + + // Perform validity check: Each rule should have data for 2 parts. 
+ if (count_if(finalRules.begin(), finalRules.end(), isNotSet) != 0) { + throw invalid_argument("One or more incomplete rule pairs"); + } + + // Perform validity check: Each zone should map to a known rule. + if (count_if(finalZones.begin(), finalZones.end(), mapsToUnknownRule) != 0) { + throw invalid_argument("One or more zones refers to an unknown rule"); + } + + // Perform validity check: Each rule should be referred to by a zone. + ruleIDset.clear(); + for_each(finalRules.begin(), finalRules.end(), insertRuleID); + for_each(finalZones.begin(), finalZones.end(), eraseRuleID); + if (ruleIDset.size() != 0) { + throw invalid_argument("Unused rules"); + } +} + +//-------------------------------------------------------------------- +// Resource bundle output +//-------------------------------------------------------------------- + +// SEE olsontz.h FOR RESOURCE BUNDLE DATA LAYOUT + +void ZoneInfo::print(ostream& os, const string& id) const { + // Implement compressed format #2: + os << " /* " << id << " */ "; + + if (aliasTo >= 0) { + assert(aliases.size() == 0); + os << ":int { " << aliasTo << " } "; // No endl - save room for comment. 
+ return; + } + + if (ICU44PLUS) { + os << ":table {" << endl; + } else { + os << ":array {" << endl; + } + + vector<Transition>::const_iterator trn; + vector<ZoneType>::const_iterator typ; + + bool first; + + if (ICU44PLUS) { + trn = transitions.begin(); + + // pre 32bit transitions + if (trn != transitions.end() && trn->time < LOWEST_TIME32) { + os << " transPre32:intvector { "; + for (first = true; trn != transitions.end() && trn->time < LOWEST_TIME32; ++trn) { + if (!first) { + os<< ", "; + } + first = false; + os << (int32_t)(trn->time >> 32) << ", " << (int32_t)(trn->time & 0x00000000ffffffff); + } + os << " }" << endl; + } + + // 32bit transitions + if (trn != transitions.end() && trn->time < HIGHEST_TIME32) { + os << " trans:intvector { "; + for (first = true; trn != transitions.end() && trn->time < HIGHEST_TIME32; ++trn) { + if (!first) { + os << ", "; + } + first = false; + os << trn->time; + } + os << " }" << endl; + } + + // post 32bit transitions + if (trn != transitions.end()) { + os << " transPost32:intvector { "; + for (first = true; trn != transitions.end(); ++trn) { + if (!first) { + os<< ", "; + } + first = false; + os << (int32_t)(trn->time >> 32) << ", " << (int32_t)(trn->time & 0x00000000ffffffff); + } + os << " }" << endl; + } + } else { + os << " :intvector { "; + for (trn = transitions.begin(), first = true; trn != transitions.end(); ++trn) { + if (!first) os << ", "; + first = false; + os << trn->time; + } + os << " }" << endl; + } + + + first=true; + if (ICU44PLUS) { + os << " typeOffsets:intvector { "; + } else { + os << " :intvector { "; + } + for (typ = types.begin(); typ != types.end(); ++typ) { + if (!first) os << ", "; + first = false; + os << typ->rawoffset << ", " << typ->dstoffset; + } + os << " }" << endl; + + if (ICU44PLUS) { + if (transitions.size() != 0) { + os << " typeMap:bin { \"" << hex << setfill('0'); + for (trn = transitions.begin(); trn != transitions.end(); ++trn) { + os << setw(2) << trn->type; + } + os << dec << 
"\" }" << endl; + } + } else { + os << " :bin { \"" << hex << setfill('0'); + for (trn = transitions.begin(); trn != transitions.end(); ++trn) { + os << setw(2) << trn->type; + } + os << dec << "\" }" << endl; + } + + // Final zone info, if any + if (finalYear != -1) { + if (ICU44PLUS) { + os << " finalRule { \"" << finalRuleID << "\" }" << endl; + os << " finalRaw:int { " << finalOffset << " }" << endl; + os << " finalYear:int { " << finalYear << " }" << endl; + } else { + os << " \"" << finalRuleID << "\"" << endl; + os << " :intvector { " << finalOffset << ", " + << finalYear << " }" << endl; + } + } + + // Alias list, if any + if (aliases.size() != 0) { + first = true; + if (ICU44PLUS) { + os << " links:intvector { "; + } else { + os << " :intvector { "; + } + for (set<int32_t>::const_iterator i=aliases.begin(); i!=aliases.end(); ++i) { + if (!first) os << ", "; + first = false; + os << *i; + } + os << " }" << endl; + } + + os << " } "; // no trailing 'endl', so comments can be placed. 
+} + +inline ostream& +operator<<(ostream& os, const ZoneMap& zoneinfo) { + int32_t c = 0; + for (ZoneMapIter it = zoneinfo.begin(); + it != zoneinfo.end(); + ++it) { + if(c && !ICU44PLUS) os << ","; + it->second.print(os, it->first); + os << "//Z#" << c++ << endl; + } + return os; +} + +// print the string list +ostream& printStringList( ostream& os, const ZoneMap& zoneinfo) { + int32_t n = 0; // count + int32_t col = 0; // column + os << " Names {" << endl + << " "; + for (ZoneMapIter it = zoneinfo.begin(); + it != zoneinfo.end(); + ++it) { + if(n) { + os << ","; + col ++; + } + const string& id = it->first; + os << "\"" << id << "\""; + col += id.length() + 2; + if(col >= 50) { + os << " // " << n << endl + << " "; + col = 0; + } + n++; + } + os << " // " << (n-1) << endl + << " }" << endl; + + return os; +} + +//-------------------------------------------------------------------- +// main +//-------------------------------------------------------------------- + +// Unary predicate for finding transitions after a given time +bool isAfter(const Transition t, int64_t thresh) { + return t.time >= thresh; +} + +/** + * A zone type that contains only the raw and dst offset. Used by the + * optimizeTypeList() method. + */ +struct SimplifiedZoneType { + int64_t rawoffset; + int64_t dstoffset; + SimplifiedZoneType() : rawoffset(-1), dstoffset(-1) {} + SimplifiedZoneType(const ZoneType& t) : rawoffset(t.rawoffset), + dstoffset(t.dstoffset) {} + bool operator<(const SimplifiedZoneType& t) const { + return rawoffset < t.rawoffset || + (rawoffset == t.rawoffset && + dstoffset < t.dstoffset); + } +}; + +/** + * Construct a ZoneType from a SimplifiedZoneType. Note that this + * discards information; the new ZoneType will have meaningless + * (empty) abbr, isdst, isstd, and isgmt flags; this is appropriate, + * since ignoring these is how we do optimization (we have no use for + * these in historical transitions). 
+ */ +ZoneType::ZoneType(const SimplifiedZoneType& t) : + rawoffset(t.rawoffset), dstoffset(t.dstoffset), + abbr(-1), isdst(false), isstd(false), isgmt(false) {} + +/** + * Optimize the type list to remove excess entries. The type list may + * contain entries that are distinct only in terms of their dst, std, + * or gmt flags. Since we don't care about those flags, we can reduce + * the type list to a set of unique raw/dst offset pairs, and remap + * the type indices in the transition list, which stores, for each + * transition, a transition time and a type index. + */ +void ZoneInfo::optimizeTypeList() { + // Assemble set of unique types; only those in the `transitions' + // list, since there may be unused types in the `types' list + // corresponding to transitions that have been trimmed (during + // merging of final data). + + if (aliasTo >= 0) return; // Nothing to do for aliases + + if (!ICU44PLUS) { + // This is the old logic which has a bug, which occasionally removes + // the type before the first transition. The problem was fixed + // by inserting the dummy transition indirectly. + + // If there are zero transitions and one type, then leave that as-is. 
+ if (transitions.size() == 0) { + if (types.size() != 1) { + cerr << "Error: transition count = 0, type count = " << types.size() << endl; + } + return; + } + + set<SimplifiedZoneType> simpleset; + for (vector<Transition>::const_iterator i=transitions.begin(); + i!=transitions.end(); ++i) { + assert(i->type < (int32_t)types.size()); + simpleset.insert(types[i->type]); + } + + // Map types to integer indices + map<SimplifiedZoneType,int32_t> simplemap; + int32_t n=0; + for (set<SimplifiedZoneType>::const_iterator i=simpleset.begin(); + i!=simpleset.end(); ++i) { + simplemap[*i] = n++; + } + + // Remap transitions + for (vector<Transition>::iterator i=transitions.begin(); + i!=transitions.end(); ++i) { + assert(i->type < (int32_t)types.size()); + ZoneType oldtype = types[i->type]; + SimplifiedZoneType newtype(oldtype); + assert(simplemap.find(newtype) != simplemap.end()); + i->type = simplemap[newtype]; + } + + // Replace type list + types.clear(); + copy(simpleset.begin(), simpleset.end(), back_inserter(types)); + + } else { + if (types.size() > 1) { + // Note: localtime uses the very first non-dst type as initial offsets. + // If all types are DSTs, the very first type is treated as the initial offsets. + + // Decide a type used as the initial offsets. ICU put the type at index 0. 
+ ZoneType initialType = types[0]; + for (vector<ZoneType>::const_iterator i=types.begin(); i!=types.end(); ++i) { + if (i->dstoffset == 0) { + initialType = *i; + break; + } + } + + SimplifiedZoneType initialSimplifiedType(initialType); + + // create a set of unique types, but ignoring fields which we're not interested in + set<SimplifiedZoneType> simpleset; + simpleset.insert(initialSimplifiedType); + for (vector<Transition>::const_iterator i=transitions.begin(); i!=transitions.end(); ++i) { + assert(i->type < (int32_t)types.size()); + simpleset.insert(types[i->type]); + } + + // Map types to integer indices, however, keeping the first type at offset 0 + map<SimplifiedZoneType,int32_t> simplemap; + simplemap[initialSimplifiedType] = 0; + int32_t n = 1; + for (set<SimplifiedZoneType>::const_iterator i=simpleset.begin(); i!=simpleset.end(); ++i) { + if (*i < initialSimplifiedType || initialSimplifiedType < *i) { + simplemap[*i] = n++; + } + } + + // Remap transitions + for (vector<Transition>::iterator i=transitions.begin(); + i!=transitions.end(); ++i) { + assert(i->type < (int32_t)types.size()); + ZoneType oldtype = types[i->type]; + SimplifiedZoneType newtype(oldtype); + assert(simplemap.find(newtype) != simplemap.end()); + i->type = simplemap[newtype]; + } + + // Replace type list + types.clear(); + types.push_back(initialSimplifiedType); + for (set<SimplifiedZoneType>::const_iterator i=simpleset.begin(); i!=simpleset.end(); ++i) { + if (*i < initialSimplifiedType || initialSimplifiedType < *i) { + types.push_back(*i); + } + } + + // Reiterating transitions to remove any transitions which + // do not actually change the raw/dst offsets + int32_t prevTypeIdx = 0; + for (vector<Transition>::iterator i=transitions.begin(); i!=transitions.end();) { + if (i->type == prevTypeIdx) { + // this is not a time transition, probably just name change + // e.g. 
America/Resolute after 2006 in 2010b + transitions.erase(i); + } else { + prevTypeIdx = i->type; + i++; + } + } + } + } + +} + +/** + * Merge final zone data into this zone. + */ +void ZoneInfo::mergeFinalData(const FinalZone& fz) { + int32_t year = fz.year; + int64_t seconds = yearToSeconds(year); + + if (!ICU44PLUS) { + if (seconds > HIGHEST_TIME32) { + // Avoid transitions beyond signed 32bit max second. + // This may result incorrect offset computation around + // HIGHEST_TIME32. This is a limitation of ICU + // before 4.4. + seconds = HIGHEST_TIME32; + } + } + + vector<Transition>::iterator it = + find_if(transitions.begin(), transitions.end(), + bind2nd(ptr_fun(isAfter), seconds)); + transitions.erase(it, transitions.end()); + + if (finalYear != -1) { + throw invalid_argument("Final zone already merged in"); + } + finalYear = fz.year; + finalOffset = fz.offset; + finalRuleID = fz.ruleid; +} + +/** + * Merge the data from the given final zone into the core zone data by + * calling the ZoneInfo member function mergeFinalData. + */ +void mergeOne(const string& zoneid, const FinalZone& fz) { + if (ZONEINFO.find(zoneid) == ZONEINFO.end()) { + throw invalid_argument("Unrecognized final zone ID"); + } + ZONEINFO[zoneid].mergeFinalData(fz); +} + +/** + * Visitor function that merges the final zone data into the main zone + * data structures. It calls mergeOne for each final zone and its + * list of aliases. + */ +void mergeFinalZone(const pair<string,FinalZone>& p) { + const string& id = p.first; + const FinalZone& fz = p.second; + + mergeOne(id, fz); +} + +/** + * Print this rule in resource bundle format to os. ID and enclosing + * braces handled elsewhere. + */ +void FinalRule::print(ostream& os) const { + // First print the rule part that enters DST; then the rule part + // that exits it. + int32_t whichpart = (part[0].offset != 0) ? 
0 : 1; + assert(part[whichpart].offset != 0); + assert(part[1-whichpart].offset == 0); + + os << " "; + for (int32_t i=0; i<2; ++i) { + const FinalRulePart& p = part[whichpart]; + whichpart = 1-whichpart; + os << p.month << ", " << p.stz_dowim() << ", " << p.stz_dow() << ", " + << p.time << ", " << p.timemode() << ", "; + } + os << part[whichpart].offset << endl; +} + +#define ICU_ZONE_OVERRIDE_SUFFIX "--ICU" +#define ICU_ZONE_OVERRIDE_SUFFIX_LEN 5 + +int main(int argc, char *argv[]) { + string rootpath, zonetab, version; + bool validArgs = false; + + if (argc == 4 || argc == 5) { + validArgs = true; + rootpath = argv[1]; + zonetab = argv[2]; + version = argv[3]; + if (argc == 5) { + if (strcmp(argv[4], "--old") == 0) { + ICU44PLUS = false; + TZ_RESOURCE_NAME = ICU_TZ_RESOURCE_OLD; + } else { + validArgs = false; + } + } + } + if (!validArgs) { + cout << "Usage: tz2icu <dir> <cmap> <tzver> [--old]" << endl + << " <dir> path to zoneinfo file tree generated by" << endl + << " ICU-patched version of zic" << endl + << " <cmap> country map, from tzdata archive," << endl + << " typically named \"zone.tab\"" << endl + << " <tzver> version string, such as \"2003e\"" << endl + << " --old generating resource format before ICU4.4" << endl; + exit(1); + } + + cout << "Olson data version: " << version << endl; + cout << "ICU 4.4+ format: " << (ICU44PLUS ? "Yes" : "No") << endl; + + try { + ifstream finals(ICU_ZONE_FILE); + if (finals) { + readFinalZonesAndRules(finals); + + cout << "Finished reading " << finalZones.size() + << " final zones and " << finalRules.size() + << " final rules from " ICU_ZONE_FILE << endl; + } else { + cerr << "Error: Unable to open " ICU_ZONE_FILE << endl; + return 1; + } + } catch (const exception& error) { + cerr << "Error: While reading " ICU_ZONE_FILE ": " << error.what() << endl; + return 1; + } + + try { + // Recursively scan all files below the given path, accumulating + // their data into ZONEINFO. All files must be TZif files. 
Any + // failure along the way will result in a call to exit(1). + scandir(rootpath); + } catch (const exception& error) { + cerr << "Error: While scanning " << rootpath << ": " << error.what() << endl; + return 1; + } + + cout << "Finished reading " << ZONEINFO.size() << " zoneinfo files [" + << (ZONEINFO.begin())->first << ".." + << (--ZONEINFO.end())->first << "]" << endl; + + // Overrides TZ database zones with ICU custom zone definition. + // These ICU zone overrides are defined in icuzones, with suffix --ICU. + // If there is a matching TZ database zone, the zoneinfo is replaced + // with the ICU definition. Then, the zone ID with --ICU suffix + // will be deleted from the final list. + // For example, zoneinfo for Europe/Dublin imported from the TZ database + // will be replaced with the zone definition for Europe/Dublin--ICU + // in icuzones. + + // Collect zone IDs to be modified with ICU definition. + vector<string> customZones; + for (ZoneMapIter i = ZONEINFO.begin(); i != ZONEINFO.end(); ++i) { + const string& id = i->first; + size_t idx = id.rfind(ICU_ZONE_OVERRIDE_SUFFIX); + if (idx != string::npos && idx == id.length() - ICU_ZONE_OVERRIDE_SUFFIX_LEN) { + cout << "ICU zone override: " << id << endl; + customZones.push_back(id.substr(0, idx)); + } + } + + // + // BEGIN ICU Custom ZoneInfo Override Handling + // + + // Replace zoneinfo with ICU definition, then remove ICU zone ID with + // the special suffix. 
+ for (vector<string>::iterator i = customZones.begin(); i != customZones.end(); i++) { + string& origId = *i; + string custId = origId + ICU_ZONE_OVERRIDE_SUFFIX; + + map<string,ZoneInfo>::iterator origZi = ZONEINFO.find(origId); + map<string,ZoneInfo>::iterator custZi = ZONEINFO.find(custId); + if (origZi != ZONEINFO.end() && custZi != ZONEINFO.end()) { + // replace original zone info with custom override, + // then delete one custom ID + cout << "Replacing ZoneInfo " << origId << " with " << custId << endl; + origZi->second = custZi->second; + ZONEINFO.erase(custZi); + } + + // Also replace final rule + map<string,FinalZone>::iterator origFz = finalZones.find(origId); + map<string,FinalZone>::iterator custFz = finalZones.find(custId); + if (origFz != finalZones.end() && custFz != finalZones.end()) { + // replace original final zone with custom override, + // then delete one for custom ID + cout << "Replacing FinalZone for " << origId << " with " << custId << endl; + origFz->second = custFz->second; + finalZones.erase(custFz); + } + } + + // Also remove aliases for ICU custom zoneinfo overrides. + for (map<string,set<string>>::const_iterator i = links.begin(); i != links.end(); ) { + const string& id = i->first; + size_t idx = id.rfind(ICU_ZONE_OVERRIDE_SUFFIX); + if (idx != string::npos && idx == id.length() - ICU_ZONE_OVERRIDE_SUFFIX_LEN) { + const set<string>& aliases = i->second; + // Also remove all revserse links + for (set<string>::const_iterator j = aliases.begin(); j != aliases.end(); j++) { + const string& alias = *j; + cout << "Removing alias " << alias << endl; + reverseLinks.erase(alias); + } + + links.erase(i++); + } else { + i++; + } + } + + + // + // END ICU Custom ZoneInfo Override Handling + // + + try { + for_each(finalZones.begin(), finalZones.end(), mergeFinalZone); + } catch (const exception& error) { + cerr << "Error: While merging final zone data: " << error.what() << endl; + return 1; + } + + // Process links (including ICU aliases). 
For each link set we have + // a canonical ID (e.g., America/Los_Angeles) and a set of one or more + // aliases (e.g., PST, PST8PDT, ...). + + // 1. Add all aliases as zone objects in ZONEINFO + for (map<string,set<string> >::const_iterator i = links.begin(); + i!=links.end(); ++i) { + const string& olson = i->first; + const set<string>& aliases = i->second; + if (ZONEINFO.find(olson) == ZONEINFO.end()) { + cerr << "Error: Invalid 'Link' to non-existent \"" + << olson << "\"" << endl; + return 1; + } + for (set<string>::const_iterator j=aliases.begin(); + j!=aliases.end(); ++j) { + ZONEINFO[*j] = ZoneInfo(); + } + } + + // 2. Create a mapping from zones to index numbers 0..n-1. + map<string,int32_t> zoneIDs; + vector<string> zoneIDlist; + int32_t z=0; + for (ZoneMap::iterator i=ZONEINFO.begin(); i!=ZONEINFO.end(); ++i) { + zoneIDs[i->first] = z++; + zoneIDlist.push_back(i->first); + } + assert(z == (int32_t) ZONEINFO.size()); + + // 3. Merge aliases. Sometimes aliases link to other aliases; we + // resolve these into simplest possible sets. + map<string,set<string> > links2; + map<string,string> reverse2; + for (map<string,set<string> >::const_iterator i = links.begin(); + i!=links.end(); ++i) { + string olson = i->first; + while (reverseLinks.find(olson) != reverseLinks.end()) { + olson = reverseLinks[olson]; + } + for (set<string>::const_iterator j=i->second.begin(); j!=i->second.end(); ++j) { + links2[olson].insert(*j); + reverse2[*j] = olson; + } + } + links = links2; + reverseLinks = reverse2; + + if (false) { // Debugging: Emit link map + for (map<string,set<string> >::const_iterator i = links.begin(); + i!=links.end(); ++i) { + cout << i->first << ": "; + for (set<string>::const_iterator j=i->second.begin(); j!=i->second.end(); ++j) { + cout << *j << ", "; + } + cout << endl; + } + } + + // 4. 
Update aliases + for (map<string,set<string> >::const_iterator i = links.begin(); + i!=links.end(); ++i) { + const string& olson = i->first; + const set<string>& aliases = i->second; + ZONEINFO[olson].clearAliases(); + ZONEINFO[olson].addAlias(zoneIDs[olson]); + for (set<string>::const_iterator j=aliases.begin(); + j!=aliases.end(); ++j) { + assert(zoneIDs.find(olson) != zoneIDs.end()); + assert(zoneIDs.find(*j) != zoneIDs.end()); + assert(ZONEINFO.find(*j) != ZONEINFO.end()); + ZONEINFO[*j].setAliasTo(zoneIDs[olson]); + ZONEINFO[olson].addAlias(zoneIDs[*j]); + } + } + + // Once merging of final data is complete, we can optimize the type list + for (ZoneMap::iterator i=ZONEINFO.begin(); i!=ZONEINFO.end(); ++i) { + i->second.optimizeTypeList(); + } + + // Create the country map + map<string, string> icuRegions; // ICU's custom zone -> country override + map<string, set<string> > countryMap; // country -> set of zones + map<string, string> reverseCountryMap; // zone -> country + + try { + // Read icuregions file to collect ICU's own zone-region mapping data. 
+ ifstream frg(ICU_REGIONS); + if (frg) { + string line; + while (getline(frg, line)) { + if (line[0] == '#') continue; + + string zone, country; + istringstream is(line); + is >> zone >> country; + if (zone.size() == 0) continue; + if (country.size() < 2) { + cerr << "Error: Can't parse " << line << " in " << ICU_REGIONS << endl; + return 1; + } + icuRegions[zone] = country; + } + } else { + cout << "No custom region map [icuregions]" << endl; + } + } catch (const exception& error) { + cerr << "Error: While reading " << ICU_REGIONS << ": " << error.what() << endl; + return 1; + } + + try { + ifstream f(zonetab.c_str()); + if (!f) { + cerr << "Error: Unable to open " << zonetab << endl; + return 1; + } + int32_t n = 0; + string line; + while (getline(f, line)) { + string::size_type lb = line.find('#'); + if (lb != string::npos) { + line.resize(lb); // trim comments + } + string country, coord, zone; + istringstream is(line); + is >> country >> coord >> zone; + if (country.size() == 0) continue; + if (country.size() != 2 || zone.size() < 1) { + cerr << "Error: Can't parse " << line << " in " << zonetab << endl; + return 1; + } + if (ZONEINFO.find(zone) == ZONEINFO.end()) { + cerr << "Error: Country maps to invalid zone " << zone + << " in " << zonetab << endl; + return 1; + } + if (icuRegions.find(zone) != icuRegions.end()) { + // Custom override + string customCountry = icuRegions[zone]; + cout << "Region Mapping: custom override for " << zone + << " " << country << " -> " << customCountry << endl; + country = customCountry; + } + countryMap[country].insert(zone); + reverseCountryMap[zone] = country; + //cerr << (n+1) << ": " << country << " <=> " << zone << endl; + ++n; + } + cout << "Finished reading " << n + << " country entries from " << zonetab << endl; + } catch (const exception& error) { + cerr << "Error: While reading " << zonetab << ": " << error.what() << endl; + return 1; + } + + // Merge ICU's own zone-region mapping data + for 
(map<string,string>::const_iterator i = icuRegions.begin(); + i != icuRegions.end(); ++i) { + const string& zid(i->first); + if (reverseCountryMap.find(zid) != reverseCountryMap.end()) { + continue; + } + cout << "Region Mapping: custom data zone=" << zid + << ", region=" << i->second << endl; + + reverseCountryMap[zid] = i->second; + countryMap[i->second].insert(zid); + } + + // Merge ICU aliases into country map. Don't merge any alias + // that already has a country map, since that doesn't make sense. + // E.g. "Link Europe/Oslo Arctic/Longyearbyen" doesn't mean we + // should cross-map the countries between these two zones. + for (map<string,set<string> >::const_iterator i = links.begin(); + i!=links.end(); ++i) { + const string& olson(i->first); + if (reverseCountryMap.find(olson) == reverseCountryMap.end()) { + continue; + } + string c = reverseCountryMap[olson]; + const set<string>& aliases(i->second); + for (set<string>::const_iterator j=aliases.begin(); + j != aliases.end(); ++j) { + if (reverseCountryMap.find(*j) == reverseCountryMap.end()) { + countryMap[c].insert(*j); + reverseCountryMap[*j] = c; + //cerr << "Aliased country: " << c << " <=> " << *j << endl; + } + } + } + + // Create a pseudo-country containing all zones belonging to no country + set<string> nocountry; + for (ZoneMap::iterator i=ZONEINFO.begin(); i!=ZONEINFO.end(); ++i) { + if (reverseCountryMap.find(i->first) == reverseCountryMap.end()) { + nocountry.insert(i->first); + } + } + countryMap[""] = nocountry; + + // Get local time & year for below + time_t sec; + time(&sec); + struct tm* now = localtime(&sec); + + string filename = TZ_RESOURCE_NAME + ".txt"; + // Write out a resource-bundle source file containing data for + // all zones. + ofstream file(filename.c_str()); + if (file) { + file << "//---------------------------------------------------------" << endl + << "// Copyright (C) 2016 and later: Unicode, Inc. and others." 
<< endl + << "// License & terms of use: http://www.unicode.org/copyright.html" << endl + << "//---------------------------------------------------------" << endl + << "// Build tool: tz2icu" << endl + << "// Build date: " << asctime(now) /* << endl -- asctime emits CR */ + << "// tz database: ftp://ftp.iana.org/tz/" << endl + << "// tz version: " << version << endl + << "// ICU version: " << U_ICU_VERSION << endl + << "//---------------------------------------------------------" << endl + << "// >> !!! >> THIS IS A MACHINE-GENERATED FILE << !!! <<" << endl + << "// >> !!! >>> DO NOT EDIT <<< !!! <<" << endl + << "//---------------------------------------------------------" << endl + << endl + << TZ_RESOURCE_NAME << ":table(nofallback) {" << endl + << " TZVersion { \"" << version << "\" }" << endl + << " Zones:array { " << endl + << ZONEINFO // Zones (the actual data) + << " }" << endl; + + // Names correspond to the Zones list, used for binary searching. + printStringList ( file, ZONEINFO ); // print the Names list + + // Final Rules are used if requested by the zone + file << " Rules { " << endl; + // Emit final rules + int32_t frc = 0; + for(map<string,FinalRule>::iterator i=finalRules.begin(); + i!=finalRules.end(); ++i) { + const string& id = i->first; + const FinalRule& r = i->second; + file << " " << id << ":intvector {" << endl; + r.print(file); + file << " } //_#" << frc++ << endl; + } + file << " }" << endl; + + // Emit country (region) map. 
+ if (ICU44PLUS) { + file << " Regions:array {" << endl; + int32_t zn = 0; + for (ZoneMap::iterator i=ZONEINFO.begin(); i!=ZONEINFO.end(); ++i) { + map<string, string>::iterator cit = reverseCountryMap.find(i->first); + if (cit == reverseCountryMap.end()) { + file << " \"001\","; + } else { + file << " \"" << cit->second << "\", "; + } + file << "//Z#" << zn++ << " " << i->first << endl; + } + file << " }" << endl; + } else { + file << " Regions { " << endl; + int32_t rc = 0; + for (map<string, set<string> >::const_iterator i=countryMap.begin(); + i != countryMap.end(); ++i) { + string country = i->first; + const set<string>& zones(i->second); + file << " "; + if(country[0]==0) { + file << "Default"; + } + file << country << ":intvector { "; + bool first = true; + for (set<string>::const_iterator j=zones.begin(); + j != zones.end(); ++j) { + if (!first) file << ", "; + first = false; + if (zoneIDs.find(*j) == zoneIDs.end()) { + cerr << "Error: Nonexistent zone in country map: " << *j << endl; + return 1; + } + file << zoneIDs[*j]; // emit the zone's index number + } + file << " } //R#" << rc++ << endl; + } + file << " }" << endl; + } + + file << "}" << endl; + } + + file.close(); + + if (file) { // recheck error bit + cout << "Finished writing " << TZ_RESOURCE_NAME << ".txt" << endl; + } else { + cerr << "Error: Unable to open/write to " << TZ_RESOURCE_NAME << ".txt" << endl; + return 1; + } +} +//eof diff --git a/intl/icu/source/tools/tzcode/tz2icu.h b/intl/icu/source/tools/tzcode/tz2icu.h new file mode 100644 index 0000000000..c077c21697 --- /dev/null +++ b/intl/icu/source/tools/tzcode/tz2icu.h @@ -0,0 +1,46 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +********************************************************************** +* Copyright (c) 2003-2013, International Business Machines +* Corporation and others. All Rights Reserved. 
+********************************************************************** +* Author: Alan Liu +* Created: July 10 2003 +* Since: ICU 2.8 +********************************************************************** +*/ + +#ifndef _TZ2ICU_H_ +#define _TZ2ICU_H_ + +/* We have modified the zoneinfo binary format (we write raw offset + * and DST offset separately instead of their sum) so we notate the + * file with a distinct signature. This prevents someone from trying + * to use our output files as normal zoneinfo files, and also prevents + * someone from trying to use normal zoneinfo files for ICU. We also + * use the first byte of the reserved section as a version integer, to + * be incremented each time the data format changes. + */ + +#define TZ_ICU_MAGIC "TZic" /* cf. TZ_MAGIC = "TZif" */ + +typedef unsigned char ICUZoneinfoVersion; + +#define TZ_ICU_VERSION ((ICUZoneinfoVersion) 1) + +/* File into which we will write supplemental ICU data. This allows + * zic to communicate final zone data to tz2icu. */ +#define ICU_ZONE_FILE "icu_zone.txt" + +/* Output resource name. This determines both the file name and the + * resource name within the file. That is, the output will be to the + * file ICU_TZ_RESOURCE ".txt" and the resource within it will be + * ICU_TZ_RESOURCE. */ +#define ICU_TZ_RESOURCE_OLD "zoneinfo" +#define ICU_TZ_RESOURCE "zoneinfo64" + +/* File containing custom zone-region mapping. */ +#define ICU_REGIONS "icuregions" + +#endif diff --git a/intl/icu/source/tools/tzcode/tzfile.h b/intl/icu/source/tools/tzcode/tzfile.h new file mode 100644 index 0000000000..8fa197529e --- /dev/null +++ b/intl/icu/source/tools/tzcode/tzfile.h @@ -0,0 +1,169 @@ +#ifndef TZFILE_H + +#define TZFILE_H + +/* +** This file is in the public domain, so clarified as of +** 1996-06-05 by Arthur David Olson. +*/ + +/* +** This header is for use ONLY with the time conversion code. +** There is no guarantee that it will remain unchanged, +** or that it will remain at all. 
+** Do NOT copy it to any system include directory. +** Thank you! +*/ + +/* +** Information about time zone files. +*/ + +#ifndef TZDIR +#define TZDIR "/usr/local/etc/zoneinfo" /* Time zone object file directory */ +#endif /* !defined TZDIR */ + +#ifndef TZDEFAULT +#define TZDEFAULT "localtime" +#endif /* !defined TZDEFAULT */ + +#ifndef TZDEFRULES +#define TZDEFRULES "posixrules" +#endif /* !defined TZDEFRULES */ + +/* +** Each file begins with. . . +*/ + +#define TZ_MAGIC "TZif" + +struct tzhead { + char tzh_magic[4]; /* TZ_MAGIC */ + char tzh_version[1]; /* '\0' or '2' or '3' as of 2013 */ + char tzh_reserved[15]; /* reserved--must be zero */ + char tzh_ttisgmtcnt[4]; /* coded number of trans. time flags */ + char tzh_ttisstdcnt[4]; /* coded number of trans. time flags */ + char tzh_leapcnt[4]; /* coded number of leap seconds */ + char tzh_timecnt[4]; /* coded number of transition times */ + char tzh_typecnt[4]; /* coded number of local time types */ + char tzh_charcnt[4]; /* coded number of abbr. chars */ +}; + +/* +** . . .followed by. . . 
+** +** tzh_timecnt (char [4])s coded transition times a la time(2) +** tzh_timecnt (unsigned char)s types of local time starting at above +** tzh_typecnt repetitions of +** one (char [4]) coded UT offset in seconds +** one (unsigned char) used to set tm_isdst +** one (unsigned char) that's an abbreviation list index +** tzh_charcnt (char)s '\0'-terminated zone abbreviations +** tzh_leapcnt repetitions of +** one (char [4]) coded leap second transition times +** one (char [4]) total correction after above +** tzh_ttisstdcnt (char)s indexed by type; if true, transition +** time is standard time, if false, +** transition time is wall clock time +** if absent, transition times are +** assumed to be wall clock time +** tzh_ttisgmtcnt (char)s indexed by type; if true, transition +** time is UT, if false, +** transition time is local time +** if absent, transition times are +** assumed to be local time +*/ + +/* +** If tzh_version is '2' or greater, the above is followed by a second instance +** of tzhead and a second instance of the data in which each coded transition +** time uses 8 rather than 4 chars, +** then a newline-enclosed POSIX-TZ-environment-variable-style string for use in handling +** instants after the last transition time stored in the file +** (with nothing between the newlines if there is no POSIX representation for +** such instants). +** +** If tzh_version is '3' or greater, the above is extended as follows. +** First, the POSIX TZ string's hour offset may range from -167 +** through 167 as compared to the POSIX-required 0 through 24. +** Second, its DST start time may be January 1 at 00:00 and its stop +** time December 31 at 24:00 plus the difference between DST and +** standard time, indicating DST all year. +*/ + +/* +** In the current implementation, "tzset()" refuses to deal with files that +** exceed any of the limits below. 
+*/ + +#ifndef TZ_MAX_TIMES +#define TZ_MAX_TIMES 2000 +#endif /* !defined TZ_MAX_TIMES */ + +#ifndef TZ_MAX_TYPES +/* This must be at least 17 for Europe/Samara and Europe/Vilnius. */ +#define TZ_MAX_TYPES 256 /* Limited by what (unsigned char)'s can hold */ +#endif /* !defined TZ_MAX_TYPES */ + +#ifndef TZ_MAX_CHARS +#define TZ_MAX_CHARS 50 /* Maximum number of abbreviation characters */ + /* (limited by what unsigned chars can hold) */ +#endif /* !defined TZ_MAX_CHARS */ + +#ifndef TZ_MAX_LEAPS +#define TZ_MAX_LEAPS 50 /* Maximum number of leap second corrections */ +#endif /* !defined TZ_MAX_LEAPS */ + +#define SECSPERMIN 60 +#define MINSPERHOUR 60 +#define HOURSPERDAY 24 +#define DAYSPERWEEK 7 +#define DAYSPERNYEAR 365 +#define DAYSPERLYEAR 366 +#define SECSPERHOUR (SECSPERMIN * MINSPERHOUR) +#define SECSPERDAY ((int_fast32_t) SECSPERHOUR * HOURSPERDAY) +#define MONSPERYEAR 12 + +#define TM_SUNDAY 0 +#define TM_MONDAY 1 +#define TM_TUESDAY 2 +#define TM_WEDNESDAY 3 +#define TM_THURSDAY 4 +#define TM_FRIDAY 5 +#define TM_SATURDAY 6 + +#define TM_JANUARY 0 +#define TM_FEBRUARY 1 +#define TM_MARCH 2 +#define TM_APRIL 3 +#define TM_MAY 4 +#define TM_JUNE 5 +#define TM_JULY 6 +#define TM_AUGUST 7 +#define TM_SEPTEMBER 8 +#define TM_OCTOBER 9 +#define TM_NOVEMBER 10 +#define TM_DECEMBER 11 + +#define TM_YEAR_BASE 1900 + +#define EPOCH_YEAR 1970 +#define EPOCH_WDAY TM_THURSDAY + +#define isleap(y) (((y) % 4) == 0 && (((y) % 100) != 0 || ((y) % 400) == 0)) + +/* +** Since everything in isleap is modulo 400 (or a factor of 400), we know that +** isleap(y) == isleap(y % 400) +** and so +** isleap(a + b) == isleap((a + b) % 400) +** or +** isleap(a + b) == isleap(a % 400 + b % 400) +** This is true even if % means modulo rather than Fortran remainder +** (which is allowed by C89 but not C99). +** We use this to avoid addition overflow problems. 
+*/ + +#define isleap_sum(a, b) isleap((a) % 400 + (b) % 400) + +#endif /* !defined TZFILE_H */ diff --git a/intl/icu/source/tools/tzcode/tzselect.ksh b/intl/icu/source/tools/tzcode/tzselect.ksh new file mode 100644 index 0000000000..26dfa98476 --- /dev/null +++ b/intl/icu/source/tools/tzcode/tzselect.ksh @@ -0,0 +1,308 @@ +#! /bin/ksh + +# '@(#)tzselect.ksh 8.1' + +# Ask the user about the time zone, and output the resulting TZ value to stdout. +# Interact with the user via stderr and stdin. + +# Contributed by Paul Eggert. + +# Porting notes: +# +# This script requires several features of the Korn shell. +# If your host lacks the Korn shell, +# you can use either of the following free programs instead: +# +# <a href=ftp://ftp.gnu.org/pub/gnu/> +# Bourne-Again shell (bash) +# </a> +# +# <a href=ftp://ftp.cs.mun.ca/pub/pdksh/pdksh.tar.gz> +# Public domain ksh +# </a> +# +# This script also uses several features of modern awk programs. +# If your host lacks awk, or has an old awk that does not conform to Posix.2, +# you can use either of the following free programs instead: +# +# <a href=ftp://ftp.gnu.org/pub/gnu/> +# GNU awk (gawk) +# </a> +# +# <a href=ftp://ftp.whidbey.net/pub/brennan/> +# mawk +# </a> + + +# Specify default values for environment variables if they are unset. +: ${AWK=awk} +: ${TZDIR=$(pwd)} + +# Check for awk Posix compliance. +($AWK -v x=y 'BEGIN { exit 123 }') </dev/null >/dev/null 2>&1 +[ $? = 123 ] || { + echo >&2 "$0: Sorry, your \`$AWK' program is not Posix compatible." + exit 1 +} + +# Make sure the tables are readable. +TZ_COUNTRY_TABLE=$TZDIR/iso3166.tab +TZ_ZONE_TABLE=$TZDIR/zone.tab +for f in $TZ_COUNTRY_TABLE $TZ_ZONE_TABLE +do + <$f || { + echo >&2 "$0: time zone files are not set up correctly" + exit 1 + } +done + +newline=' +' +IFS=$newline + + +# Work around a bug in bash 1.14.7 and earlier, where $PS3 is sent to stdout. +case $(echo 1 | (select x in x; do break; done) 2>/dev/null) in +?*) PS3= +esac + + +# Begin the main loop. 
We come back here if the user wants to retry. +while + + echo >&2 'Please identify a location' \ + 'so that time zone rules can be set correctly.' + + continent= + country= + region= + + + # Ask the user for continent or ocean. + + echo >&2 'Please select a continent or ocean.' + + select continent in \ + Africa \ + Americas \ + Antarctica \ + 'Arctic Ocean' \ + Asia \ + 'Atlantic Ocean' \ + Australia \ + Europe \ + 'Indian Ocean' \ + 'Pacific Ocean' \ + 'none - I want to specify the time zone using the Posix TZ format.' + do + case $continent in + '') + echo >&2 'Please enter a number in range.';; + ?*) + case $continent in + Americas) continent=America;; + *' '*) continent=$(expr "$continent" : '\([^ ]*\)') + esac + break + esac + done + case $continent in + '') + exit 1;; + none) + # Ask the user for a Posix TZ string. Check that it conforms. + while + echo >&2 'Please enter the desired value' \ + 'of the TZ environment variable.' + echo >&2 'For example, GST-10 is a zone named GST' \ + 'that is 10 hours ahead (east) of UTC.' + read TZ + $AWK -v TZ="$TZ" 'BEGIN { + tzname = "[^-+,0-9][^-+,0-9][^-+,0-9]+" + time = "[0-2]?[0-9](:[0-5][0-9](:[0-5][0-9])?)?" + offset = "[-+]?" time + date = "(J?[0-9]+|M[0-9]+\.[0-9]+\.[0-9]+)" + datetime = "," date "(/" time ")?" + tzpattern = "^(:.*|" tzname offset "(" tzname \ + "(" offset ")?(" datetime datetime ")?)?)$" + if (TZ ~ tzpattern) exit 1 + exit 0 + }' + do + echo >&2 "\`$TZ' is not a conforming" \ + 'Posix time zone string.' + done + TZ_for_date=$TZ;; + *) + # Get list of names of countries in the continent or ocean. 
+ countries=$($AWK -F'\t' \ + -v continent="$continent" \ + -v TZ_COUNTRY_TABLE="$TZ_COUNTRY_TABLE" \ + ' + /^#/ { next } + $3 ~ ("^" continent "/") { + if (!cc_seen[$1]++) cc_list[++ccs] = $1 + } + END { + while (getline <TZ_COUNTRY_TABLE) { + if ($0 !~ /^#/) cc_name[$1] = $2 + } + for (i = 1; i <= ccs; i++) { + country = cc_list[i] + if (cc_name[country]) { + country = cc_name[country] + } + print country + } + } + ' <$TZ_ZONE_TABLE | sort -f) + + + # If there's more than one country, ask the user which one. + case $countries in + *"$newline"*) + echo >&2 'Please select a country.' + select country in $countries + do + case $country in + '') echo >&2 'Please enter a number in range.';; + ?*) break + esac + done + + case $country in + '') exit 1 + esac;; + *) + country=$countries + esac + + + # Get list of names of time zone rule regions in the country. + regions=$($AWK -F'\t' \ + -v country="$country" \ + -v TZ_COUNTRY_TABLE="$TZ_COUNTRY_TABLE" \ + ' + BEGIN { + cc = country + while (getline <TZ_COUNTRY_TABLE) { + if ($0 !~ /^#/ && country == $2) { + cc = $1 + break + } + } + } + $1 == cc { print $4 } + ' <$TZ_ZONE_TABLE) + + + # If there's more than one region, ask the user which one. + case $regions in + *"$newline"*) + echo >&2 'Please select one of the following' \ + 'time zone regions.' + select region in $regions + do + case $region in + '') echo >&2 'Please enter a number in range.';; + ?*) break + esac + done + case $region in + '') exit 1 + esac;; + *) + region=$regions + esac + + # Determine TZ from country and region. + TZ=$($AWK -F'\t' \ + -v country="$country" \ + -v region="$region" \ + -v TZ_COUNTRY_TABLE="$TZ_COUNTRY_TABLE" \ + ' + BEGIN { + cc = country + while (getline <TZ_COUNTRY_TABLE) { + if ($0 !~ /^#/ && country == $2) { + cc = $1 + break + } + } + } + $1 == cc && $4 == region { print $3 } + ' <$TZ_ZONE_TABLE) + + # Make sure the corresponding zoneinfo file exists. 
+ TZ_for_date=$TZDIR/$TZ + <$TZ_for_date || { + echo >&2 "$0: time zone files are not set up correctly" + exit 1 + } + esac + + + # Use the proposed TZ to output the current date relative to UTC. + # Loop until they agree in seconds. + # Give up after 8 unsuccessful tries. + + extra_info= + for i in 1 2 3 4 5 6 7 8 + do + TZdate=$(LANG=C TZ="$TZ_for_date" date) + UTdate=$(LANG=C TZ=UTC0 date) + TZsec=$(expr "$TZdate" : '.*:\([0-5][0-9]\)') + UTsec=$(expr "$UTdate" : '.*:\([0-5][0-9]\)') + case $TZsec in + $UTsec) + extra_info=" +Local time is now: $TZdate. +Universal Time is now: $UTdate." + break + esac + done + + + # Output TZ info and ask the user to confirm. + + echo >&2 "" + echo >&2 "The following information has been given:" + echo >&2 "" + case $country+$region in + ?*+?*) echo >&2 " $country$newline $region";; + ?*+) echo >&2 " $country";; + +) echo >&2 " TZ='$TZ'" + esac + echo >&2 "" + echo >&2 "Therefore TZ='$TZ' will be used.$extra_info" + echo >&2 "Is the above information OK?" + + ok= + select ok in Yes No + do + case $ok in + '') echo >&2 'Please enter 1 for Yes, or 2 for No.';; + ?*) break + esac + done + case $ok in + '') exit 1;; + Yes) break + esac +do : +done + +case $SHELL in +*csh) file=.login line="setenv TZ '$TZ'";; +*) file=.profile line="TZ='$TZ'; export TZ" +esac + +echo >&2 " +You can make this change permanent for yourself by appending the line + $line +to the file '$file' in your home directory; then log out and log in again. + +Here is that TZ value again, this time on standard output so that you +can use the $0 command in shell scripts:" + +echo "$TZ" diff --git a/intl/icu/source/tools/tzcode/zdump.c b/intl/icu/source/tools/tzcode/zdump.c new file mode 100644 index 0000000000..ebd7a5ce32 --- /dev/null +++ b/intl/icu/source/tools/tzcode/zdump.c @@ -0,0 +1,1089 @@ +/* +** This file is in the public domain, so clarified as of +** 2009-05-17 by Arthur David Olson. 
+*/ + +#include "version.h" + +/* +** This code has been made independent of the rest of the time +** conversion package to increase confidence in the verification it provides. +** You can use this code to help in verifying other implementations. +** +** However, include private.h when debugging, so that it overrides +** time_t consistently with the rest of the package. +*/ + +#ifdef time_tz +# include "private.h" +#endif + +#include <stdbool.h> + +#include "stdio.h" /* for stdout, stderr, perror */ +#include "string.h" /* for strcpy */ +#include "sys/types.h" /* for time_t */ +#include "time.h" /* for struct tm */ +#include "stdlib.h" /* for exit, malloc, atoi */ +#include "limits.h" /* for CHAR_BIT, LLONG_MAX */ +#include "ctype.h" /* for isalpha et al. */ + +/* Enable extensions and modifications for ICU. */ +#define ICU + +#ifdef ICU +#include "dirent.h" +#include "sys/stat.h" +#endif + +#ifndef isascii +#define isascii(x) 1 +#endif /* !defined isascii */ + +/* +** Substitutes for pre-C99 compilers. +** Much of this section of code is stolen from private.h. 
+*/ + +#ifndef HAVE_STDINT_H +# define HAVE_STDINT_H \ + (199901 <= __STDC_VERSION__ || 2 < (__GLIBC__ + (0 < __GLIBC_MINOR__))) +#endif +#if HAVE_STDINT_H +# include "stdint.h" +#endif +#ifndef HAVE_INTTYPES_H +# define HAVE_INTTYPES_H HAVE_STDINT_H +#endif +#if HAVE_INTTYPES_H +# include <inttypes.h> +#endif + +#ifndef INT_FAST32_MAX +# if INT_MAX >> 31 == 0 +typedef long int_fast32_t; +# else +typedef int int_fast32_t; +# endif +#endif + +#ifndef INTMAX_MAX +# if defined LLONG_MAX || defined __LONG_LONG_MAX__ +typedef long long intmax_t; +# define strtoimax strtoll +# define PRIdMAX "lld" +# ifdef LLONG_MAX +# define INTMAX_MAX LLONG_MAX +# else +# define INTMAX_MAX __LONG_LONG_MAX__ +# endif +# else +typedef long intmax_t; +# define strtoimax strtol +# define PRIdMAX "ld" +# define INTMAX_MAX LONG_MAX +# endif +#endif + + +#ifndef ZDUMP_LO_YEAR +#define ZDUMP_LO_YEAR (-500) +#endif /* !defined ZDUMP_LO_YEAR */ + +#ifndef ZDUMP_HI_YEAR +#define ZDUMP_HI_YEAR 2500 +#endif /* !defined ZDUMP_HI_YEAR */ + +#ifndef MAX_STRING_LENGTH +#define MAX_STRING_LENGTH 1024 +#endif /* !defined MAX_STRING_LENGTH */ + +#ifndef EXIT_SUCCESS +#define EXIT_SUCCESS 0 +#endif /* !defined EXIT_SUCCESS */ + +#ifndef EXIT_FAILURE +#define EXIT_FAILURE 1 +#endif /* !defined EXIT_FAILURE */ + +#ifndef SECSPERMIN +#define SECSPERMIN 60 +#endif /* !defined SECSPERMIN */ + +#ifndef MINSPERHOUR +#define MINSPERHOUR 60 +#endif /* !defined MINSPERHOUR */ + +#ifndef SECSPERHOUR +#define SECSPERHOUR (SECSPERMIN * MINSPERHOUR) +#endif /* !defined SECSPERHOUR */ + +#ifndef HOURSPERDAY +#define HOURSPERDAY 24 +#endif /* !defined HOURSPERDAY */ + +#ifndef EPOCH_YEAR +#define EPOCH_YEAR 1970 +#endif /* !defined EPOCH_YEAR */ + +#ifndef TM_YEAR_BASE +#define TM_YEAR_BASE 1900 +#endif /* !defined TM_YEAR_BASE */ + +#ifndef DAYSPERNYEAR +#define DAYSPERNYEAR 365 +#endif /* !defined DAYSPERNYEAR */ + +#ifndef isleap +#define isleap(y) (((y) % 4) == 0 && (((y) % 100) != 0 || ((y) % 400) == 0)) +#endif /* 
!defined isleap */ + +#ifndef isleap_sum +/* +** See tzfile.h for details on isleap_sum. +*/ +#define isleap_sum(a, b) isleap((a) % 400 + (b) % 400) +#endif /* !defined isleap_sum */ + +#define SECSPERDAY ((int_fast32_t) SECSPERHOUR * HOURSPERDAY) +#define SECSPERNYEAR (SECSPERDAY * DAYSPERNYEAR) +#define SECSPERLYEAR (SECSPERNYEAR + SECSPERDAY) +#define SECSPER400YEARS (SECSPERNYEAR * (intmax_t) (300 + 3) \ + + SECSPERLYEAR * (intmax_t) (100 - 3)) + +/* +** True if SECSPER400YEARS is known to be representable as an +** intmax_t. It's OK that SECSPER400YEARS_FITS can in theory be false +** even if SECSPER400YEARS is representable, because when that happens +** the code merely runs a bit more slowly, and this slowness doesn't +** occur on any practical platform. +*/ +enum { SECSPER400YEARS_FITS = SECSPERLYEAR <= INTMAX_MAX / 400 }; + +#ifndef HAVE_GETTEXT +#define HAVE_GETTEXT 0 +#endif +#if HAVE_GETTEXT +#include "locale.h" /* for setlocale */ +#include "libintl.h" +#endif /* HAVE_GETTEXT */ + +#ifndef GNUC_or_lint +#ifdef lint +#define GNUC_or_lint +#else /* !defined lint */ +#ifdef __GNUC__ +#define GNUC_or_lint +#endif /* defined __GNUC__ */ +#endif /* !defined lint */ +#endif /* !defined GNUC_or_lint */ + +#if 2 < __GNUC__ || (__GNUC__ == 2 && 96 <= __GNUC_MINOR__) +# define ATTRIBUTE_PURE __attribute__ ((__pure__)) +#else +# define ATTRIBUTE_PURE /* empty */ +#endif + +/* +** For the benefit of GNU folk... +** `_(MSGID)' uses the current locale's message library string for MSGID. +** The default is to use gettext if available, and use MSGID otherwise. 
+*/ + +#ifndef _ +#if HAVE_GETTEXT +#define _(msgid) gettext(msgid) +#else /* !HAVE_GETTEXT */ +#define _(msgid) msgid +#endif /* !HAVE_GETTEXT */ +#endif /* !defined _ */ + +#ifndef TZ_DOMAIN +#define TZ_DOMAIN "tz" +#endif /* !defined TZ_DOMAIN */ + +extern char ** environ; +extern int getopt(int argc, char * const argv[], + const char * options); +extern char * optarg; +extern int optind; +extern char * tzname[2]; + +/* The minimum and maximum finite time values. */ +static time_t const absolute_min_time = + ((time_t) -1 < 0 + ? (time_t) -1 << (CHAR_BIT * sizeof (time_t) - 1) + : 0); +static time_t const absolute_max_time = + ((time_t) -1 < 0 + ? - (~ 0 < 0) - ((time_t) -1 << (CHAR_BIT * sizeof (time_t) - 1)) + : -1); +static size_t longest; +static char * progname; +static int warned; + +static char * abbr(struct tm * tmp); +static void abbrok(const char * abbrp, const char * zone); +static intmax_t delta(struct tm * newp, struct tm * oldp) ATTRIBUTE_PURE; +static void dumptime(const struct tm * tmp); +static time_t hunt(char * name, time_t lot, time_t hit); +static void show(char * zone, time_t t, int v); +static const char * tformat(void); +static time_t yeartot(intmax_t y) ATTRIBUTE_PURE; +#ifdef ICU +typedef struct listentry { + char * name; + struct listentry * next; +} listentry; + +static time_t huntICU(char * name, time_t lot, time_t hit, FILE *fp); +static void dumptimeICU(FILE * fp, time_t t); +static void showICU(FILE * fp, char * zone, time_t t1, time_t t2); +static int getall(struct listentry ** namelist); +static void getzones(char * basedir, char * subdir, struct listentry ** last, int * count); +#endif + +#ifndef TYPECHECK +#define my_localtime localtime +#else /* !defined TYPECHECK */ +static struct tm * +my_localtime(time_t *tp) +{ + register struct tm * tmp; + + tmp = localtime(tp); + if (tp != NULL && tmp != NULL) { + struct tm tm; + register time_t t; + + tm = *tmp; + t = mktime(&tm); + if (t != *tp) { + (void) fflush(stdout); + (void) 
fprintf(stderr, "\n%s: ", progname); + (void) fprintf(stderr, tformat(), *tp); + (void) fprintf(stderr, " ->"); + (void) fprintf(stderr, " year=%d", tmp->tm_year); + (void) fprintf(stderr, " mon=%d", tmp->tm_mon); + (void) fprintf(stderr, " mday=%d", tmp->tm_mday); + (void) fprintf(stderr, " hour=%d", tmp->tm_hour); + (void) fprintf(stderr, " min=%d", tmp->tm_min); + (void) fprintf(stderr, " sec=%d", tmp->tm_sec); + (void) fprintf(stderr, " isdst=%d", tmp->tm_isdst); + (void) fprintf(stderr, " -> "); + (void) fprintf(stderr, tformat(), t); + (void) fprintf(stderr, "\n"); + } + } + return tmp; +} +#endif /* !defined TYPECHECK */ + +static void +abbrok(const char *const abbrp, const char *const zone) +{ + register const char * cp; + register const char * wp; + + if (warned) + return; + cp = abbrp; + wp = NULL; + while (isascii((unsigned char) *cp) && isalpha((unsigned char) *cp)) + ++cp; + if (cp - abbrp == 0) + wp = _("lacks alphabetic at start"); + else if (cp - abbrp < 3) + wp = _("has fewer than 3 alphabetics"); + else if (cp - abbrp > 6) + wp = _("has more than 6 alphabetics"); + if (wp == NULL && (*cp == '+' || *cp == '-')) { + ++cp; + if (isascii((unsigned char) *cp) && + isdigit((unsigned char) *cp)) + if (*cp++ == '1' && *cp >= '0' && *cp <= '4') + ++cp; + if (*cp != '\0') + wp = _("differs from POSIX standard"); + } + if (wp == NULL) + return; + (void) fflush(stdout); + (void) fprintf(stderr, + _("%s: warning: zone \"%s\" abbreviation \"%s\" %s\n"), + progname, zone, abbrp, wp); + warned = true; +} + +static void +usage(FILE * const stream, const int status) +{ + (void) fprintf(stream, +_("%s: usage: %s [--version] [--help] [-{vV}] [-{ct} [lo,]hi] zonename ...\n" + "\n" + "Report bugs to %s.\n"), + progname, progname, REPORT_BUGS_TO); + exit(status); +} + +int +main(int argc, char *argv[]) +{ + register int i; + register int vflag; + register int Vflag; + register char * cutarg; + register char * cuttimes; + register time_t cutlotime; + register time_t 
cuthitime; + register char ** fakeenv; + time_t now; + time_t t; + time_t newt; + struct tm tm; + struct tm newtm; + register struct tm * tmp; + register struct tm * newtmp; +#ifdef ICU + int nextopt; + char * dirarg; + int aflag; + int iflag; + listentry * namelist = NULL; + FILE * fp = stdout; +#endif + + cutlotime = absolute_min_time; + cuthitime = absolute_max_time; +#if HAVE_GETTEXT + (void) setlocale(LC_ALL, ""); +#ifdef TZ_DOMAINDIR + (void) bindtextdomain(TZ_DOMAIN, TZ_DOMAINDIR); +#endif /* defined TEXTDOMAINDIR */ + (void) textdomain(TZ_DOMAIN); +#endif /* HAVE_GETTEXT */ + progname = argv[0]; + for (i = 1; i < argc; ++i) + if (strcmp(argv[i], "--version") == 0) { + (void) printf("zdump %s%s\n", PKGVERSION, TZVERSION); + exit(EXIT_SUCCESS); + } else if (strcmp(argv[i], "--help") == 0) { + usage(stdout, EXIT_SUCCESS); + } + vflag = Vflag = 0; + cutarg = cuttimes = NULL; +#ifdef ICU + aflag = 0; + iflag = 0; + dirarg = NULL; + for (;;) + switch(getopt(argc, argv, "ac:d:it:vV")) { + case 'a': aflag = 1; break; + case 'c': cutarg = optarg; break; + case 'd': dirarg = optarg; break; + case 'i': iflag = 1; break; + case 't': cuttimes = optarg; break; + case 'v': vflag = 1; break; + case 'V': Vflag = 1; break; + case -1: + if (! (optind == argc - 1 && strcmp(argv[optind], "=") == 0)) + goto arg_processing_done; + /* Fall through. */ + default: + (void) fprintf(stderr, + _("%s: usage is %s [ --version ] [ -a ] [ -v ] [ -V ] [ -i ] [ -c [loyear,]hiyear ] [ -t [lotime,]hitime] ][ -d dir ] [ zonename ... ]\n"), + progname, progname); + exit(EXIT_FAILURE); + } +#else + for (;;) + switch (getopt(argc, argv, "c:t:vV")) { + case 'c': cutarg = optarg; break; + case 't': cuttimes = optarg; break; + case 'v': vflag = 1; break; + case 'V': Vflag = 1; break; + case -1: + if (! (optind == argc - 1 && strcmp(argv[optind], "=") == 0)) + goto arg_processing_done; + /* Fall through. 
*/ + default: + usage(stderr, EXIT_FAILURE); + } +#endif + arg_processing_done:; + +#ifdef ICU + if (dirarg != NULL) { + DIR * dp; + /* create the output directory */ + mkdir(dirarg, 0777); + if ((dp = opendir(dirarg)) == NULL) { + fprintf(stderr, "cannot create the target directory"); + exit(EXIT_FAILURE); + } + closedir(dp); + } +#endif + + if (vflag | Vflag) { + intmax_t lo; + intmax_t hi; + char *loend, *hiend; + register intmax_t cutloyear = ZDUMP_LO_YEAR; + register intmax_t cuthiyear = ZDUMP_HI_YEAR; + if (cutarg != NULL) { + lo = strtoimax(cutarg, &loend, 10); + if (cutarg != loend && !*loend) { + hi = lo; + cuthiyear = hi; + } else if (cutarg != loend && *loend == ',' + && (hi = strtoimax(loend + 1, &hiend, 10), + loend + 1 != hiend && !*hiend)) { + cutloyear = lo; + cuthiyear = hi; + } else { +(void) fprintf(stderr, _("%s: wild -c argument %s\n"), + progname, cutarg); + exit(EXIT_FAILURE); + } + } + if (cutarg != NULL || cuttimes == NULL) { + cutlotime = yeartot(cutloyear); + cuthitime = yeartot(cuthiyear); + } + if (cuttimes != NULL) { + lo = strtoimax(cuttimes, &loend, 10); + if (cuttimes != loend && !*loend) { + hi = lo; + if (hi < cuthitime) { + if (hi < absolute_min_time) + hi = absolute_min_time; + cuthitime = hi; + } + } else if (cuttimes != loend && *loend == ',' + && (hi = strtoimax(loend + 1, &hiend, 10), + loend + 1 != hiend && !*hiend)) { + if (cutlotime < lo) { + if (absolute_max_time < lo) + lo = absolute_max_time; + cutlotime = lo; + } + if (hi < cuthitime) { + if (hi < absolute_min_time) + hi = absolute_min_time; + cuthitime = hi; + } + } else { + (void) fprintf(stderr, + _("%s: wild -t argument %s\n"), + progname, cuttimes); + exit(EXIT_FAILURE); + } + } + } + +#ifdef ICU + if (aflag) { + /* get all available zones */ + char ** fakeargv; + int i; + int count; + + count = getall(&namelist); + fakeargv = (char **) malloc((size_t) (argc + count) * sizeof *argv); + /* + if ((fakeargv = (char **) malloc((size_t) (argc + count) * sizeof *argv)) 
== NULL) { + exit(EXIT_FAILURE); + } + */ + for (i = 0; i < argc; i++) { + fakeargv[i] = argv[i]; + } + for (i = 0; i < count; i++) { + fakeargv[i + argc] = namelist->name; + namelist = namelist->next; + } + argv = fakeargv; + argc += count; + } +#endif + (void) time(&now); + longest = 0; + for (i = optind; i < argc; ++i) + if (strlen(argv[i]) > longest) + longest = strlen(argv[i]); + { + register int from; + register int to; + + for (i = 0; environ[i] != NULL; ++i) + continue; + fakeenv = malloc((i + 2) * sizeof *fakeenv); + if (fakeenv == NULL + || (fakeenv[0] = malloc(longest + 4)) == NULL) { + (void) perror(progname); + exit(EXIT_FAILURE); + } + to = 0; + (void) strcpy(fakeenv[to++], "TZ="); + for (from = 0; environ[from] != NULL; ++from) + if (strncmp(environ[from], "TZ=", 3) != 0) + fakeenv[to++] = environ[from]; + fakeenv[to] = NULL; + environ = fakeenv; + } + for (i = optind; i < argc; ++i) { + static char buf[MAX_STRING_LENGTH]; + + (void) strcpy(&fakeenv[0][3], argv[i]); + if (! 
(vflag | Vflag)) { + show(argv[i], now, false); + continue; + } +#ifdef ICU + fp = NULL; + if (iflag) { + if (dirarg == NULL) { + /* we want to display a zone name here */ + if (i != optind) { + printf("\n"); + } + printf("ZONE: %s\n", argv[i]); + } else { + int zstart; + char path[FILENAME_MAX + 1]; + strcpy(path, dirarg); + strcat(path, "/"); + zstart = strlen(path); + strcat(path, argv[i]); + /* replace '/' with '-' */ + while(path[++zstart] != 0) { + if (path[zstart] == '/') { + path[zstart] = '-'; + } + } + if ((fp = fopen(path, "w")) == NULL) { + fprintf(stderr, "cannot create output file %s\n", path); + exit(EXIT_FAILURE); + } + } + } +#endif + warned = false; + t = absolute_min_time; +#ifdef ICU + /* skip displaying info for the lowest time, which is actually not + * a transition when -i option is set */ + if (!iflag) { +#endif + if (!Vflag) { + show(argv[i], t, true); + t += SECSPERDAY; + show(argv[i], t, true); + } +#ifdef ICU + } +#endif + if (t < cutlotime) + t = cutlotime; + tmp = my_localtime(&t); + if (tmp != NULL) { + tm = *tmp; + (void) strncpy(buf, abbr(&tm), (sizeof buf) - 1); + } + for ( ; ; ) { + newt = (t < absolute_max_time - SECSPERDAY / 2 + ? t + SECSPERDAY / 2 + : absolute_max_time); + if (cuthitime <= newt) + break; + newtmp = localtime(&newt); + if (newtmp != NULL) + newtm = *newtmp; +#ifdef ICU + if (iflag) { + /* We do not want to capture transitions just for + * abbreviated zone name changes */ + if ((tmp == NULL || newtmp == NULL) ? (tmp != newtmp) : + (delta(&newtm, &tm) != (newt - t) || + newtm.tm_isdst != tm.tm_isdst)) { + newt = huntICU(argv[i], t, newt, fp); + newtmp = localtime(&newt); + if (newtmp != NULL) { + newtm = *newtmp; + (void) strncpy(buf, + abbr(&newtm), + (sizeof buf) - 1); + } + } + } else { +#endif + if ((tmp == NULL || newtmp == NULL) ? 
(tmp != newtmp) : + (delta(&newtm, &tm) != (newt - t) || + newtm.tm_isdst != tm.tm_isdst || + strcmp(abbr(&newtm), buf) != 0)) { + newt = hunt(argv[i], t, newt); + newtmp = localtime(&newt); + if (newtmp != NULL) { + newtm = *newtmp; + (void) strncpy(buf, + abbr(&newtm), + (sizeof buf) - 1); + } + } +#ifdef ICU + } +#endif + t = newt; + tm = newtm; + tmp = newtmp; + } +#ifdef ICU + if (!iflag) { + /* skip displaying info for the highest time, which is actually not + * a transition when -i option is used*/ +#endif + if (!Vflag) { + t = absolute_max_time; + t -= SECSPERDAY; + show(argv[i], t, true); + t += SECSPERDAY; + show(argv[i], t, true); + } +#ifdef ICU + } + /* close file */ + if (fp != NULL) { + fclose(fp); + } +#endif + } + if (fflush(stdout) || ferror(stdout)) { + (void) fprintf(stderr, "%s: ", progname); + (void) perror(_("Error writing to standard output")); + exit(EXIT_FAILURE); + } +#ifdef ICU + if (aflag) { + struct listentry * entry = namelist; + struct listentry * next; + while (entry != NULL) { + free(entry->name); + next = entry->next; + free(entry); + entry = next; + } + } +#endif + exit(EXIT_SUCCESS); + /* If exit fails to exit... */ + return EXIT_FAILURE; +} + +static time_t +yeartot(const intmax_t y) +{ + register intmax_t myy, seconds, years; + register time_t t; + + myy = EPOCH_YEAR; + t = 0; + while (myy < y) { + if (SECSPER400YEARS_FITS && 400 <= y - myy) { + intmax_t diff400 = (y - myy) / 400; + if (INTMAX_MAX / SECSPER400YEARS < diff400) + return absolute_max_time; + seconds = diff400 * SECSPER400YEARS; + years = diff400 * 400; + } else { + seconds = isleap(myy) ? 
SECSPERLYEAR : SECSPERNYEAR; + years = 1; + } + myy += years; + if (t > absolute_max_time - seconds) + return absolute_max_time; + t += seconds; + } + while (y < myy) { + if (SECSPER400YEARS_FITS && y + 400 <= myy && myy < 0) { + intmax_t diff400 = (myy - y) / 400; + if (INTMAX_MAX / SECSPER400YEARS < diff400) + return absolute_min_time; + seconds = diff400 * SECSPER400YEARS; + years = diff400 * 400; + } else { + seconds = isleap(myy - 1) ? SECSPERLYEAR : SECSPERNYEAR; + years = 1; + } + myy -= years; + if (t < absolute_min_time + seconds) + return absolute_min_time; + t -= seconds; + } + return t; +} + +static time_t +hunt(char *name, time_t lot, time_t hit) +{ + time_t t; + struct tm lotm; + register struct tm * lotmp; + struct tm tm; + register struct tm * tmp; + char loab[MAX_STRING_LENGTH]; + + lotmp = my_localtime(&lot); + if (lotmp != NULL) { + lotm = *lotmp; + (void) strncpy(loab, abbr(&lotm), (sizeof loab) - 1); + } + for ( ; ; ) { + time_t diff = hit - lot; + if (diff < 2) + break; + t = lot; + t += diff / 2; + if (t <= lot) + ++t; + else if (t >= hit) + --t; + tmp = my_localtime(&t); + if (tmp != NULL) + tm = *tmp; + if ((lotmp == NULL || tmp == NULL) ? (lotmp == tmp) : + (delta(&tm, &lotm) == (t - lot) && + tm.tm_isdst == lotm.tm_isdst && + strcmp(abbr(&tm), loab) == 0)) { + lot = t; + lotm = tm; + lotmp = tmp; + } else hit = t; + } + show(name, lot, true); + show(name, hit, true); + return hit; +} + +/* +** Thanks to Paul Eggert for logic used in delta. 
+*/ +/* +** delta: return the signed number of seconds from *oldp to *newp, computed +** from the tm_year, tm_yday, tm_hour, tm_min and tm_sec fields only +** (tm_mon, tm_mday and tm_isdst are not consulted). +*/ +static intmax_t +delta(struct tm * newp, struct tm *oldp) +{ + register intmax_t result; + register int tmy; + + if (newp->tm_year < oldp->tm_year) + return -delta(oldp, newp); /* swap operands so the year loop below only counts upward */ + result = 0; + for (tmy = oldp->tm_year; tmy < newp->tm_year; ++tmy) + result += DAYSPERNYEAR + isleap_sum(tmy, TM_YEAR_BASE); /* whole years in days; one extra day when tmy + TM_YEAR_BASE is a leap year */ + result += newp->tm_yday - oldp->tm_yday; + result *= HOURSPERDAY; /* days -> hours */ + result += newp->tm_hour - oldp->tm_hour; + result *= MINSPERHOUR; /* hours -> minutes */ + result += newp->tm_min - oldp->tm_min; + result *= SECSPERMIN; /* minutes -> seconds */ + result += newp->tm_sec - oldp->tm_sec; + return result; +} +/* +** show: print one line on stdout describing time t. +** zone is used only as a label (padded to the file-scope width `longest') +** and as the name passed to abbrok(); the local time itself comes from +** my_localtime(). +** When v is nonzero the line also carries the UT rendering of t followed +** by " = ", and the isdst value (plus gmtoff when TM_GMTOFF is defined). +*/ +static void +show(char *zone, time_t t, int v) +{ + register struct tm * tmp; + + (void) printf("%-*s ", (int) longest, zone); + if (v) { + tmp = gmtime(&t); + if (tmp == NULL) { + (void) printf(tformat(), t); /* gmtime could not convert t: print the raw time_t value instead */ + } else { + dumptime(tmp); + (void) printf(" UT"); + } + (void) printf(" = "); + } + tmp = my_localtime(&t); + dumptime(tmp); /* called even when tmp is NULL */ + if (tmp != NULL) { + if (*abbr(tmp) != '\0') + (void) printf(" %s", abbr(tmp)); + if (v) { + (void) printf(" isdst=%d", tmp->tm_isdst); +#ifdef TM_GMTOFF + (void) printf(" gmtoff=%ld", tmp->TM_GMTOFF); +#endif /* defined TM_GMTOFF */ + } + } + (void) printf("\n"); + if (tmp != NULL && *abbr(tmp) != '\0') + abbrok(abbr(tmp), zone); /* check the non-empty abbreviation once the line has been printed */ +} +/* +** abbr: return the time zone abbreviation for *tmp, i.e. +** tzname[tmp->tm_isdst] when tm_isdst is 0 or 1 and that entry is not NULL; +** otherwise return a pointer to an empty string. Never returns NULL. +*/ +static char * +abbr(struct tm *tmp) +{ + register char * result; + static char nada; /* zero-initialized, so &nada is an empty string */ + + if (tmp->tm_isdst != 0 && tmp->tm_isdst != 1) + return &nada; /* tm_isdst cannot be used as a tzname index */ + result = tzname[tmp->tm_isdst]; + return (result == NULL) ? &nada : result; +} + +/* +** The code below can fail on certain theoretical systems; +** it works on all known real-world systems as of 2004-12-30. 
+*/ + +static const char * +tformat(void) +{ + if (0 > (time_t) -1) { /* signed */ + if (sizeof (time_t) == sizeof (intmax_t)) + return "%"PRIdMAX; + if (sizeof (time_t) > sizeof (long)) + return "%lld"; + if (sizeof (time_t) > sizeof (int)) + return "%ld"; + return "%d"; + } +#ifdef PRIuMAX + if (sizeof (time_t) == sizeof (uintmax_t)) + return "%"PRIuMAX; +#endif + if (sizeof (time_t) > sizeof (unsigned long)) + return "%llu"; + if (sizeof (time_t) > sizeof (unsigned int)) + return "%lu"; + return "%u"; +} + +static void +dumptime(register const struct tm *timeptr) +{ + static const char wday_name[][3] = { + "Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat" + }; + static const char mon_name[][3] = { + "Jan", "Feb", "Mar", "Apr", "May", "Jun", + "Jul", "Aug", "Sep", "Oct", "Nov", "Dec" + }; + register const char * wn; + register const char * mn; + register int lead; + register int trail; + + if (timeptr == NULL) { + (void) printf("NULL"); + return; + } + /* + ** The packaged versions of localtime and gmtime never put out-of-range + ** values in tm_wday or tm_mon, but since this code might be compiled + ** with other (perhaps experimental) versions, paranoia is in order. 
+ */ + if (timeptr->tm_wday < 0 || timeptr->tm_wday >= + (int) (sizeof wday_name / sizeof wday_name[0])) + wn = "???"; + else wn = wday_name[timeptr->tm_wday]; + if (timeptr->tm_mon < 0 || timeptr->tm_mon >= + (int) (sizeof mon_name / sizeof mon_name[0])) + mn = "???"; + else mn = mon_name[timeptr->tm_mon]; + (void) printf("%.3s %.3s%3d %.2d:%.2d:%.2d ", + wn, mn, + timeptr->tm_mday, timeptr->tm_hour, + timeptr->tm_min, timeptr->tm_sec); +#define DIVISOR 10 + trail = timeptr->tm_year % DIVISOR + TM_YEAR_BASE % DIVISOR; + lead = timeptr->tm_year / DIVISOR + TM_YEAR_BASE / DIVISOR + + trail / DIVISOR; + trail %= DIVISOR; + if (trail < 0 && lead > 0) { + trail += DIVISOR; + --lead; + } else if (lead < 0 && trail > 0) { + trail -= DIVISOR; + ++lead; + } + if (lead == 0) + (void) printf("%d", trail); + else (void) printf("%d%d", lead, ((trail < 0) ? -trail : trail)); +} + +#ifdef ICU +static time_t +huntICU(char *name, time_t lot, time_t hit, FILE * fp) +{ + time_t t; + long diff; + struct tm lotm; + register struct tm * lotmp; + struct tm tm; + register struct tm * tmp; + char loab[MAX_STRING_LENGTH]; + + lotmp = my_localtime(&lot); + if (lotmp != NULL) { + lotm = *lotmp; + (void) strncpy(loab, abbr(&lotm), (sizeof loab) - 1); + } + for ( ; ; ) { + diff = (long) (hit - lot); + if (diff < 2) + break; + t = lot; + t += diff / 2; + if (t <= lot) + ++t; + else if (t >= hit) + --t; + tmp = my_localtime(&t); + if (tmp != NULL) + tm = *tmp; + /* We do not want to capture transitions just for + * abbreviated zone name changes */ + if ((lotmp == NULL || tmp == NULL) ? 
(lotmp == tmp) : + (delta(&tm, &lotm) == (t - lot) && + tm.tm_isdst == lotm.tm_isdst)) { + lot = t; + lotm = tm; + lotmp = tmp; + } else hit = t; + } + showICU(fp, name, lot, hit); + return hit; +} + +static void showICU(FILE * fp, char *zone, time_t t1, time_t t2) +{ + if (fp == NULL) { + fp = stdout; + } + dumptimeICU(fp, t1); + fprintf(fp, " > "); + dumptimeICU(fp, t2); + fprintf(fp, "\n"); +} + +static void dumptimeICU(FILE * fp, time_t t) +{ + static const char wday_name[][3] = { + "Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat" + }; + struct tm gmt; + struct tm loc; + register int lead; + register int trail; + long offset; + long hour, min, sec; + + loc = *my_localtime(&t); + + trail = loc.tm_year % DIVISOR + TM_YEAR_BASE % DIVISOR; + lead = loc.tm_year / DIVISOR + TM_YEAR_BASE / DIVISOR + trail / DIVISOR; + trail %= DIVISOR; + if (trail < 0 && lead > 0) { + trail += DIVISOR; + --lead; + } else if (lead < 0 && trail > 0) { + trail -= DIVISOR; + ++lead; + } + + fprintf(fp, "%04d-%02d-%02d", lead * DIVISOR + trail, loc.tm_mon + 1, loc.tm_mday); + fprintf(fp, " %.3s ", wday_name[loc.tm_wday]); + fprintf(fp, "%02d:%02d:%02d", loc.tm_hour, loc.tm_min, loc.tm_sec); + + gmt = *gmtime(&t); + offset = delta(&loc, &gmt); + if (offset < 0) { + offset = -offset; + fprintf(fp, "-"); + } else { + fprintf(fp, "+"); + } + + sec = offset % 60; + offset = (offset - sec) / 60; + min = offset % 60; + hour = offset / 60; + + fprintf(fp, "%02ld", hour); + fprintf(fp, "%02ld", min); + fprintf(fp, "%02ld", sec); + fprintf(fp, "[DST=%d]", loc.tm_isdst); +} + +static int getall(struct listentry ** namelist) { + int count = 0; + struct listentry dummyentry; + struct listentry * last = &dummyentry; + + getzones(TZDIR, NULL, &last, &count); + if (count > 0) { + *namelist = dummyentry.next; + } + + return count; +} + +static void getzones(char * basedir, char * relpath, struct listentry ** last, int * count) { + char path[FILENAME_MAX + 1]; + struct dirent * dir; + DIR * dp; + + 
strcpy(path, basedir); + if (relpath != NULL) { + strcat(path, "/"); + strcat(path, relpath); + } + + if ((dp = opendir(path)) == NULL) { + /* file */ + if (strstr(relpath, ".tab") == NULL && strcmp(relpath, "Etc/Unknown") != 0) { + char * pzonename; + listentry * pentry; + + if ((pzonename = malloc(strlen(relpath) + 1)) == NULL) { + exit(EXIT_FAILURE); + } + strcpy(pzonename, relpath); + + if ((pentry = malloc(sizeof(listentry))) == NULL) { + exit(EXIT_FAILURE); + } + + pentry->name = pzonename; + pentry->next = NULL; + (*last)->next = pentry; + *last = pentry; + (*count)++; + } + } else { + /* directory */ + while ((dir = readdir(dp)) != NULL) { + char subpath[FILENAME_MAX + 1]; + + if (strcmp(dir->d_name, ".") == 0 + || strcmp(dir->d_name, "..") == 0) { + continue; + } + if (relpath != NULL) { + strcpy(subpath, relpath); + strcat(subpath, "/"); + strcat(subpath, dir->d_name); + } else { + strcpy(subpath, dir->d_name); + } + getzones(basedir, subpath, last, count); + } + closedir(dp); + } +} +#endif diff --git a/intl/icu/source/tools/tzcode/zic.c b/intl/icu/source/tools/tzcode/zic.c new file mode 100644 index 0000000000..54576780d5 --- /dev/null +++ b/intl/icu/source/tools/tzcode/zic.c @@ -0,0 +1,3156 @@ +/* +** This file is in the public domain, so clarified as of +** 2006-07-17 by Arthur David Olson. +*/ + +/* Enable extensions and modifications for ICU. */ +#define ICU + +/* Continue executing after link failure. Even if ICU is undefined + * (for vanilla zic behavior), ICU_LINKS should be defined, since zic + * appears to fail on the 2003 data the first time through during the + * linking phase. Running zic twice, with ICU_LINKS defined, causes + * links to be handled correctly. */ +#define ICU_LINKS + +#define LEAVE_SOME_PRE_2011_SYSTEMS_IN_THE_LURCH + +#ifdef ICU +/* These constants are embedded in dynamically generated header + * version.h in the standard tzcode distribution. 
*/ +static char const PKGVERSION[]="N/A"; +static char const TZVERSION[]="N/A"; +static char const REPORT_BUGS_TO[]="N/A"; +#else +#include "version.h" +#endif +#include "private.h" +#include "locale.h" +#include "tzfile.h" + +#include <stdarg.h> +#include <stdbool.h> + +#define ZIC_VERSION_PRE_2013 '2' +#define ZIC_VERSION '3' + +typedef int_fast64_t zic_t; +#define ZIC_MIN INT_FAST64_MIN +#define ZIC_MAX INT_FAST64_MAX +#define SCNdZIC SCNdFAST64 + +#ifndef ZIC_MAX_ABBR_LEN_WO_WARN +#define ZIC_MAX_ABBR_LEN_WO_WARN 6 +#endif /* !defined ZIC_MAX_ABBR_LEN_WO_WARN */ + +#if HAVE_SYS_STAT_H +#include "sys/stat.h" +#endif +#ifdef S_IRUSR +#define MKDIR_UMASK (S_IRUSR|S_IWUSR|S_IXUSR|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH) +#else +#define MKDIR_UMASK 0755 +#endif + +#ifdef ICU +#include "tz2icu.h" +#endif + +/* +** On some ancient hosts, predicates like `isspace(C)' are defined +** only if isascii(C) || C == EOF. Modern hosts obey the C Standard, +** which says they are defined only if C == ((unsigned char) C) || C == EOF. +** Neither the C Standard nor Posix require that `isascii' exist. +** For portability, we check both ancient and modern requirements. +** If isascii is not defined, the isascii check succeeds trivially. 
+*/ +#include "ctype.h" +#ifndef isascii +#define isascii(x) 1 +#endif + +#define end(cp) (strchr((cp), '\0')) + +struct rule { + const char * r_filename; + int r_linenum; + const char * r_name; + + zic_t r_loyear; /* for example, 1986 */ + zic_t r_hiyear; /* for example, 1986 */ + const char * r_yrtype; + int r_lowasnum; + int r_hiwasnum; + + int r_month; /* 0..11 */ + + int r_dycode; /* see below */ + int r_dayofmonth; + int r_wday; + + zic_t r_tod; /* time from midnight */ + int r_todisstd; /* above is standard time if true */ + /* or wall clock time if false */ + int r_todisgmt; /* above is GMT if true */ + /* or local time if false */ + zic_t r_stdoff; /* offset from standard time */ + const char * r_abbrvar; /* variable part of abbreviation */ + + int r_todo; /* a rule to do (used in outzone) */ + zic_t r_temp; /* used in outzone */ +}; + +/* +** r_dycode r_dayofmonth r_wday +*/ + +#define DC_DOM 0 /* 1..31 */ /* unused */ +#define DC_DOWGEQ 1 /* 1..31 */ /* 0..6 (Sun..Sat) */ +#define DC_DOWLEQ 2 /* 1..31 */ /* 0..6 (Sun..Sat) */ + +struct zone { + const char * z_filename; + int z_linenum; + + const char * z_name; + zic_t z_gmtoff; + const char * z_rule; + const char * z_format; + + zic_t z_stdoff; + + struct rule * z_rules; + int z_nrules; + + struct rule z_untilrule; + zic_t z_untiltime; +}; + +extern int getopt(int argc, char * const argv[], + const char * options); +extern int link(const char * fromname, const char * toname); +extern char * optarg; +extern int optind; + +#if ! HAVE_LINK +# define link(from, to) (-1) +#endif +#if ! 
HAVE_SYMLINK +# define symlink(from, to) (-1) +#endif + +static void addtt(zic_t starttime, int type); +#ifdef ICU +static int addtype(const zic_t gmtoff, const zic_t rawoff, const zic_t dstoff, + char *const abbr, int isdst, + int ttisstd, int ttisgmt); +#else +static int addtype(zic_t gmtoff, const char * abbr, int isdst, + int ttisstd, int ttisgmt); +#endif +static void leapadd(zic_t t, int positive, int rolling, int count); +static void adjleap(void); +static void associate(void); +static void dolink(const char * fromfield, const char * tofield); +static char ** getfields(char * buf); +static zic_t gethms(const char * string, const char * errstrng, + int signable); +static void infile(const char * filename); +static void inleap(char ** fields, int nfields); +static void inlink(char ** fields, int nfields); +static void inrule(char ** fields, int nfields); +static int inzcont(char ** fields, int nfields); +static int inzone(char ** fields, int nfields); +static int inzsub(char ** fields, int nfields, int iscont); +static int itsdir(const char * name); +static int lowerit(int c); +static int mkdirs(char * filename); +static void newabbr(const char * abbr); +static zic_t oadd(zic_t t1, zic_t t2); +static void outzone(const struct zone * zp, int ntzones); +static zic_t rpytime(const struct rule * rp, zic_t wantedy); +static void rulesub(struct rule * rp, + const char * loyearp, const char * hiyearp, + const char * typep, const char * monthp, + const char * dayp, const char * timep); +static zic_t tadd(zic_t t1, zic_t t2); +static int yearistype(int year, const char * type); +#ifdef ICU +static void emit_icu_zone(FILE* f, const char* zoneName, int zoneOffset, + const struct rule* rule, + int ruleIndex, int startYear); +static void emit_icu_link(FILE* f, const char* from, const char* to); +static void emit_icu_rule(FILE* f, const struct rule* r, int ruleIndex); +static int add_icu_final_rules(const struct rule* r1, const struct rule* r2); +#endif + +static int 
charcnt; +static int errors; +static const char * filename; +static int leapcnt; +static int leapseen; +static zic_t leapminyear; +static zic_t leapmaxyear; +static int linenum; +static int max_abbrvar_len; +static int max_format_len; +static zic_t max_year; +static zic_t min_year; +static int noise; +static const char * rfilename; +static int rlinenum; +static const char * progname; +static int timecnt; +static int timecnt_alloc; +static int typecnt; + +/* +** Line codes. +*/ + +#define LC_RULE 0 +#define LC_ZONE 1 +#define LC_LINK 2 +#define LC_LEAP 3 + +/* +** Which fields are which on a Zone line. +*/ + +#define ZF_NAME 1 +#define ZF_GMTOFF 2 +#define ZF_RULE 3 +#define ZF_FORMAT 4 +#define ZF_TILYEAR 5 +#define ZF_TILMONTH 6 +#define ZF_TILDAY 7 +#define ZF_TILTIME 8 +#define ZONE_MINFIELDS 5 +#define ZONE_MAXFIELDS 9 + +/* +** Which fields are which on a Zone continuation line. +*/ + +#define ZFC_GMTOFF 0 +#define ZFC_RULE 1 +#define ZFC_FORMAT 2 +#define ZFC_TILYEAR 3 +#define ZFC_TILMONTH 4 +#define ZFC_TILDAY 5 +#define ZFC_TILTIME 6 +#define ZONEC_MINFIELDS 3 +#define ZONEC_MAXFIELDS 7 + +/* +** Which files are which on a Rule line. +*/ + +#define RF_NAME 1 +#define RF_LOYEAR 2 +#define RF_HIYEAR 3 +#define RF_COMMAND 4 +#define RF_MONTH 5 +#define RF_DAY 6 +#define RF_TOD 7 +#define RF_STDOFF 8 +#define RF_ABBRVAR 9 +#define RULE_FIELDS 10 + +/* +** Which fields are which on a Link line. +*/ + +#define LF_FROM 1 +#define LF_TO 2 +#define LINK_FIELDS 3 + +/* +** Which fields are which on a Leap line. +*/ + +#define LP_YEAR 1 +#define LP_MONTH 2 +#define LP_DAY 3 +#define LP_TIME 4 +#define LP_CORR 5 +#define LP_ROLL 6 +#define LEAP_FIELDS 7 + +/* +** Year synonyms. 
+*/ + +#define YR_MINIMUM 0 +#define YR_MAXIMUM 1 +#define YR_ONLY 2 + +static struct rule * rules; +static int nrules; /* number of rules */ +static int nrules_alloc; + +static struct zone * zones; +static int nzones; /* number of zones */ +static int nzones_alloc; + +struct link { + const char * l_filename; + int l_linenum; + const char * l_from; + const char * l_to; +}; + +static struct link * links; +static int nlinks; +static int nlinks_alloc; + +struct lookup { + const char * l_word; + const int l_value; +}; + +#ifdef ICU +/* Indices into rules[] for final rules. They will occur in pairs, + * with finalRules[i] occurring before finalRules[i+1] in the year. + * Each zone need only store a start year, a standard offset, and an + * index into finalRules[]. FinalRules[] are aliases into rules[]. */ +static const struct rule ** finalRules = NULL; +static int finalRulesCount = 0; +#endif + +static struct lookup const * byword(const char * string, + const struct lookup * lp); + +static struct lookup const line_codes[] = { + { "Rule", LC_RULE }, + { "Zone", LC_ZONE }, + { "Link", LC_LINK }, + { "Leap", LC_LEAP }, + { NULL, 0} +}; + +static struct lookup const mon_names[] = { + { "January", TM_JANUARY }, + { "February", TM_FEBRUARY }, + { "March", TM_MARCH }, + { "April", TM_APRIL }, + { "May", TM_MAY }, + { "June", TM_JUNE }, + { "July", TM_JULY }, + { "August", TM_AUGUST }, + { "September", TM_SEPTEMBER }, + { "October", TM_OCTOBER }, + { "November", TM_NOVEMBER }, + { "December", TM_DECEMBER }, + { NULL, 0 } +}; + +static struct lookup const wday_names[] = { + { "Sunday", TM_SUNDAY }, + { "Monday", TM_MONDAY }, + { "Tuesday", TM_TUESDAY }, + { "Wednesday", TM_WEDNESDAY }, + { "Thursday", TM_THURSDAY }, + { "Friday", TM_FRIDAY }, + { "Saturday", TM_SATURDAY }, + { NULL, 0 } +}; + +static struct lookup const lasts[] = { + { "last-Sunday", TM_SUNDAY }, + { "last-Monday", TM_MONDAY }, + { "last-Tuesday", TM_TUESDAY }, + { "last-Wednesday", TM_WEDNESDAY }, + { 
"last-Thursday", TM_THURSDAY }, + { "last-Friday", TM_FRIDAY }, + { "last-Saturday", TM_SATURDAY }, + { NULL, 0 } +}; + +static struct lookup const begin_years[] = { + { "minimum", YR_MINIMUM }, + { "maximum", YR_MAXIMUM }, + { NULL, 0 } +}; + +static struct lookup const end_years[] = { + { "minimum", YR_MINIMUM }, + { "maximum", YR_MAXIMUM }, + { "only", YR_ONLY }, + { NULL, 0 } +}; + +static struct lookup const leap_types[] = { + { "Rolling", true }, + { "Stationary", false }, + { NULL, 0 } +}; + +static const int len_months[2][MONSPERYEAR] = { + { 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 }, + { 31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 } +}; + +static const int len_years[2] = { + DAYSPERNYEAR, DAYSPERLYEAR +}; + +static struct attype { + zic_t at; + unsigned char type; +} * attypes; +static zic_t gmtoffs[TZ_MAX_TYPES]; +#ifdef ICU +/* gmtoffs[i] = rawoffs[i] + dstoffs[i] */ +static zic_t rawoffs[TZ_MAX_TYPES]; +static zic_t dstoffs[TZ_MAX_TYPES]; +#endif +static char isdsts[TZ_MAX_TYPES]; +static unsigned char abbrinds[TZ_MAX_TYPES]; +static char ttisstds[TZ_MAX_TYPES]; +static char ttisgmts[TZ_MAX_TYPES]; +static char chars[TZ_MAX_CHARS]; +static zic_t trans[TZ_MAX_LEAPS]; +static zic_t corr[TZ_MAX_LEAPS]; +static char roll[TZ_MAX_LEAPS]; + +/* +** Memory allocation. 
+*/ + +static _Noreturn void +memory_exhausted(const char *msg) +{ + fprintf(stderr, _("%s: Memory exhausted: %s\n"), progname, msg); + exit(EXIT_FAILURE); +} + +static ATTRIBUTE_PURE size_t +size_product(size_t nitems, size_t itemsize) +{ + if (SIZE_MAX / itemsize < nitems) + memory_exhausted("size overflow"); + return nitems * itemsize; +} + +static ATTRIBUTE_PURE void * +memcheck(void *const ptr) +{ + if (ptr == NULL) + memory_exhausted(strerror(errno)); + return ptr; +} + +#define emalloc(size) memcheck(malloc(size)) +#define erealloc(ptr, size) memcheck(realloc(ptr, size)) +#define ecpyalloc(ptr) memcheck(icpyalloc(ptr)) +#define ecatalloc(oldp, newp) memcheck(icatalloc((oldp), (newp))) + +static void * +growalloc(void *ptr, size_t itemsize, int nitems, int *nitems_alloc) +{ + if (nitems < *nitems_alloc) + return ptr; + else { + int amax = INT_MAX < SIZE_MAX ? INT_MAX : SIZE_MAX; + if ((amax - 1) / 3 * 2 < *nitems_alloc) + memory_exhausted("int overflow"); + *nitems_alloc = *nitems_alloc + (*nitems_alloc >> 1) + 1; + return erealloc(ptr, size_product(*nitems_alloc, itemsize)); + } +} + +/* +** Error handling. +*/ + +static void +eats(const char *const name, const int num, const char *const rname, + const int rnum) +{ + filename = name; + linenum = num; + rfilename = rname; + rlinenum = rnum; +} + +static void +eat(const char *const name, const int num) +{ + eats(name, num, NULL, -1); +} + +static void ATTRIBUTE_FORMAT((printf, 1, 0)) +verror(const char *const string, va_list args) +{ + /* + ** Match the format of "cc" to allow sh users to + ** zic ... 2>&1 | error -t "*" -v + ** on BSD systems. + */ + fprintf(stderr, _("\"%s\", line %d: "), filename, linenum); + vfprintf(stderr, string, args); + if (rfilename != NULL) + (void) fprintf(stderr, _(" (rule from \"%s\", line %d)"), + rfilename, rlinenum); + (void) fprintf(stderr, "\n"); + ++errors; +} + +static void ATTRIBUTE_FORMAT((printf, 1, 2)) +error(const char *const string, ...) 
+{ + va_list args; + va_start(args, string); + verror(string, args); + va_end(args); +} + +static void ATTRIBUTE_FORMAT((printf, 1, 2)) +warning(const char *const string, ...) +{ + va_list args; + fprintf(stderr, _("warning: ")); + va_start(args, string); + verror(string, args); + va_end(args); + --errors; +} + +static _Noreturn void +usage(FILE *stream, int status) +{ + (void) fprintf(stream, _("%s: usage is %s \ +[ --version ] [ --help ] [ -v ] [ -l localtime ] [ -p posixrules ] \\\n\ +\t[ -d directory ] [ -L leapseconds ] [ -y yearistype ] [ filename ... ]\n\ +\n\ +Report bugs to %s.\n"), + progname, progname, REPORT_BUGS_TO); + exit(status); +} + +#ifdef ICU +/* File into which we will write supplemental ICU data. */ +static FILE * icuFile; + +static void +emit_icu_zone(FILE* f, const char* zoneName, int zoneOffset, + const struct rule* rule, + int ruleIndex, int startYear) { + /* machine-readable section */ + fprintf(f, "zone %s %d %d %s", zoneName, zoneOffset, startYear, rule->r_name); + + /* human-readable section */ + fprintf(f, " # zone %s, offset %d, year >= %d, rule %s (%d)\n", + zoneName, zoneOffset, startYear, + rule->r_name, ruleIndex); +} + +static void +emit_icu_link(FILE* f, const char* from, const char* to) { + /* machine-readable section */ + fprintf(f, "link %s %s\n", from, to); +} + +static const char* DYCODE[] = {"DOM", "DOWGEQ", "DOWLEQ"}; + +static void +emit_icu_rule(FILE* f, const struct rule* r, int ruleIndex) { + if (r->r_yrtype != NULL) { + warning("year types not supported by ICU"); + fprintf(stderr, "rule %s, file %s, line %d\n", + r->r_name, r->r_filename, r->r_linenum); + } + + /* machine-readable section */ + fprintf(f, "rule %s %s %d %d %d %lld %d %d %lld", + r->r_name, DYCODE[r->r_dycode], + r->r_month, r->r_dayofmonth, + (r->r_dycode == DC_DOM ? 
-1 : r->r_wday), + r->r_tod, r->r_todisstd, r->r_todisgmt, r->r_stdoff + ); + + /* human-readable section */ + fprintf(f, " # %d: %s, file %s, line %d", + ruleIndex, r->r_name, r->r_filename, r->r_linenum); + fprintf(f, ", mode %s", DYCODE[r->r_dycode]); + fprintf(f, ", %s, dom %d", mon_names[r->r_month].l_word, r->r_dayofmonth); + if (r->r_dycode != DC_DOM) { + fprintf(f, ", %s", wday_names[r->r_wday].l_word); + } + fprintf(f, ", time %lld", r->r_tod); + fprintf(f, ", isstd %d", r->r_todisstd); + fprintf(f, ", isgmt %d", r->r_todisgmt); + fprintf(f, ", offset %lld", r->r_stdoff); + fprintf(f, "\n"); +} + +static int +add_icu_final_rules(const struct rule* r1, const struct rule* r2) { + int i; + + for (i=0; i<finalRulesCount; ++i) { /* i+=2 should work too */ + if (r1==finalRules[i]) return i; /* [sic] pointer comparison */ + } + + finalRules = (const struct rule**) (void*) erealloc((char *) finalRules, + (finalRulesCount + 2) * sizeof(*finalRules)); + finalRules[finalRulesCount++] = r1; + finalRules[finalRulesCount++] = r2; + return finalRulesCount - 2; +} +#endif + +static const char * psxrules; +static const char * lcltime; +static const char * directory; +static const char * leapsec; +static const char * yitcommand; + +int +main(int argc, char **argv) +{ + register int i; + register int j; + register int c; + +#ifdef S_IWGRP + (void) umask(umask(S_IWGRP | S_IWOTH) | (S_IWGRP | S_IWOTH)); +#endif +#if HAVE_GETTEXT + (void) setlocale(LC_ALL, ""); +#ifdef TZ_DOMAINDIR + (void) bindtextdomain(TZ_DOMAIN, TZ_DOMAINDIR); +#endif /* defined TEXTDOMAINDIR */ + (void) textdomain(TZ_DOMAIN); +#endif /* HAVE_GETTEXT */ + progname = argv[0]; + if (TYPE_BIT(zic_t) < 64) { + (void) fprintf(stderr, "%s: %s\n", progname, + _("wild compilation-time specification of zic_t")); + exit(EXIT_FAILURE); + } + for (i = 1; i < argc; ++i) + if (strcmp(argv[i], "--version") == 0) { + (void) printf("zic %s%s\n", PKGVERSION, TZVERSION); + exit(EXIT_SUCCESS); + } else if (strcmp(argv[i], 
"--help") == 0) { + usage(stdout, EXIT_SUCCESS); + } + while ((c = getopt(argc, argv, "d:l:p:L:vsy:")) != EOF && c != -1) + switch (c) { + default: + usage(stderr, EXIT_FAILURE); + case 'd': + if (directory == NULL) + directory = optarg; + else { + (void) fprintf(stderr, +_("%s: More than one -d option specified\n"), + progname); + exit(EXIT_FAILURE); + } + break; + case 'l': + if (lcltime == NULL) + lcltime = optarg; + else { + (void) fprintf(stderr, +_("%s: More than one -l option specified\n"), + progname); + exit(EXIT_FAILURE); + } + break; + case 'p': + if (psxrules == NULL) + psxrules = optarg; + else { + (void) fprintf(stderr, +_("%s: More than one -p option specified\n"), + progname); + exit(EXIT_FAILURE); + } + break; + case 'y': + if (yitcommand == NULL) + yitcommand = optarg; + else { + (void) fprintf(stderr, +_("%s: More than one -y option specified\n"), + progname); + exit(EXIT_FAILURE); + } + break; + case 'L': + if (leapsec == NULL) + leapsec = optarg; + else { + (void) fprintf(stderr, +_("%s: More than one -L option specified\n"), + progname); + exit(EXIT_FAILURE); + } + break; + case 'v': + noise = true; + break; + case 's': + (void) printf("%s: -s ignored\n", progname); + break; + } + if (optind == argc - 1 && strcmp(argv[optind], "=") == 0) + usage(stderr, EXIT_FAILURE); /* usage message by request */ + if (directory == NULL) + directory = TZDIR; + if (yitcommand == NULL) + yitcommand = "yearistype"; + + if (optind < argc && leapsec != NULL) { + infile(leapsec); + adjleap(); + } + +#ifdef ICU + if ((icuFile = fopen(ICU_ZONE_FILE, "w")) == NULL) { + const char *e = strerror(errno); + (void) fprintf(stderr, _("%s: Can't open %s: %s\n"), + progname, ICU_ZONE_FILE, e); + (void) exit(EXIT_FAILURE); + } +#endif + for (i = optind; i < argc; ++i) + infile(argv[i]); + if (errors) + exit(EXIT_FAILURE); + associate(); + for (i = 0; i < nzones; i = j) { + /* + ** Find the next non-continuation zone entry. 
+ */ + for (j = i + 1; j < nzones && zones[j].z_name == NULL; ++j) + continue; + outzone(&zones[i], j - i); + } + /* + ** Make links. + */ + for (i = 0; i < nlinks; ++i) { + eat(links[i].l_filename, links[i].l_linenum); + dolink(links[i].l_from, links[i].l_to); +#ifdef ICU + emit_icu_link(icuFile, links[i].l_from, links[i].l_to); +#endif + if (noise) + for (j = 0; j < nlinks; ++j) + if (strcmp(links[i].l_to, + links[j].l_from) == 0) + warning(_("link to link")); + } + if (lcltime != NULL) { + eat("command line", 1); + dolink(lcltime, TZDEFAULT); + } + if (psxrules != NULL) { + eat("command line", 1); + dolink(psxrules, TZDEFRULES); + } +#ifdef ICU + for (i=0; i<finalRulesCount; ++i) { + emit_icu_rule(icuFile, finalRules[i], i); + } +#endif /*ICU*/ + return (errors == 0) ? EXIT_SUCCESS : EXIT_FAILURE; +} + +static void +dolink(const char *const fromfield, const char *const tofield) +{ + register char * fromname; + register char * toname; + + if (fromfield[0] == '/') + fromname = ecpyalloc(fromfield); + else { + fromname = ecpyalloc(directory); + fromname = ecatalloc(fromname, "/"); + fromname = ecatalloc(fromname, fromfield); + } + if (tofield[0] == '/') + toname = ecpyalloc(tofield); + else { + toname = ecpyalloc(directory); + toname = ecatalloc(toname, "/"); + toname = ecatalloc(toname, tofield); + } + /* + ** We get to be careful here since + ** there's a fair chance of root running us. + */ + if (!itsdir(toname)) + (void) remove(toname); + if (link(fromname, toname) != 0 + && access(fromname, F_OK) == 0 && !itsdir(fromname)) { + int result; + + if (mkdirs(toname) != 0) + exit(EXIT_FAILURE); + + result = link(fromname, toname); + if (result != 0) { + const char *s = fromfield; + const char *t; + register char * symlinkcontents = NULL; + + do + t = s; + while ((s = strchr(s, '/')) + && ! 
strncmp (fromfield, tofield, + ++s - fromfield)); + + for (s = tofield + (t - fromfield); + (s = strchr(s, '/')); + s++) + symlinkcontents = + ecatalloc(symlinkcontents, + "../"); + symlinkcontents = ecatalloc(symlinkcontents, t); + result = symlink(symlinkcontents, toname); + if (result == 0) +warning(_("hard link failed, symbolic link used")); + free(symlinkcontents); + } + if (result != 0) { + FILE *fp, *tp; + int c; + fp = fopen(fromname, "rb"); + if (!fp) { + const char *e = strerror(errno); + (void) fprintf(stderr, + _("%s: Can't read %s: %s\n"), + progname, fromname, e); + exit(EXIT_FAILURE); + } + tp = fopen(toname, "wb"); + if (!tp) { + const char *e = strerror(errno); + (void) fprintf(stderr, + _("%s: Can't create %s: %s\n"), + progname, toname, e); + exit(EXIT_FAILURE); + } + while ((c = getc(fp)) != EOF) + putc(c, tp); + if (ferror(fp) || fclose(fp)) { + (void) fprintf(stderr, + _("%s: Error reading %s\n"), + progname, fromname); + exit(EXIT_FAILURE); + } + if (ferror(tp) || fclose(tp)) { + (void) fprintf(stderr, + _("%s: Error writing %s\n"), + progname, toname); + exit(EXIT_FAILURE); + } + warning(_("link failed, copy used")); +#ifndef ICU_LINKS + exit(EXIT_FAILURE); +#endif + } + } + free(fromname); + free(toname); +} + +#define TIME_T_BITS_IN_FILE 64 + +static const zic_t min_time = (zic_t) -1 << (TIME_T_BITS_IN_FILE - 1); +static const zic_t max_time = -1 - ((zic_t) -1 << (TIME_T_BITS_IN_FILE - 1)); + +static int +itsdir(const char *const name) +{ + register char * myname; + register int accres; + + myname = ecpyalloc(name); + myname = ecatalloc(myname, "/."); + accres = access(myname, F_OK); + free(myname); + return accres == 0; +} + +/* +** Associate sets of rules with zones. +*/ + +/* +** Sort by rule name. 
+*/ + +static int +rcomp(const void *cp1, const void *cp2) +{ + return strcmp(((const struct rule *) cp1)->r_name, + ((const struct rule *) cp2)->r_name); +} + +static void +associate(void) +{ + register struct zone * zp; + register struct rule * rp; + register int base, out; + register int i, j; + + if (nrules != 0) { + (void) qsort(rules, nrules, sizeof *rules, rcomp); + for (i = 0; i < nrules - 1; ++i) { + if (strcmp(rules[i].r_name, + rules[i + 1].r_name) != 0) + continue; + if (strcmp(rules[i].r_filename, + rules[i + 1].r_filename) == 0) + continue; + eat(rules[i].r_filename, rules[i].r_linenum); + warning(_("same rule name in multiple files")); + eat(rules[i + 1].r_filename, rules[i + 1].r_linenum); + warning(_("same rule name in multiple files")); + for (j = i + 2; j < nrules; ++j) { + if (strcmp(rules[i].r_name, + rules[j].r_name) != 0) + break; + if (strcmp(rules[i].r_filename, + rules[j].r_filename) == 0) + continue; + if (strcmp(rules[i + 1].r_filename, + rules[j].r_filename) == 0) + continue; + break; + } + i = j - 1; + } + } + for (i = 0; i < nzones; ++i) { + zp = &zones[i]; + zp->z_rules = NULL; + zp->z_nrules = 0; + } + for (base = 0; base < nrules; base = out) { + rp = &rules[base]; + for (out = base + 1; out < nrules; ++out) + if (strcmp(rp->r_name, rules[out].r_name) != 0) + break; + for (i = 0; i < nzones; ++i) { + zp = &zones[i]; + if (strcmp(zp->z_rule, rp->r_name) != 0) + continue; + zp->z_rules = rp; + zp->z_nrules = out - base; + } + } + for (i = 0; i < nzones; ++i) { + zp = &zones[i]; + if (zp->z_nrules == 0) { + /* + ** Maybe we have a local standard time offset. + */ + eat(zp->z_filename, zp->z_linenum); + zp->z_stdoff = gethms(zp->z_rule, _("unruly zone"), + true); + /* + ** Note, though, that if there's no rule, + ** a '%s' in the format is a bad thing. 
+ */ + if (strchr(zp->z_format, '%') != 0) + error("%s", _("%s in ruleless zone")); + } + } + if (errors) + exit(EXIT_FAILURE); +} + +static void +infile(const char *name) +{ + register FILE * fp; + register char ** fields; + register char * cp; + register const struct lookup * lp; + register int nfields; + register int wantcont; + register int num; + char buf[BUFSIZ]; + + if (strcmp(name, "-") == 0) { + name = _("standard input"); + fp = stdin; + } else if ((fp = fopen(name, "r")) == NULL) { + const char *e = strerror(errno); + + (void) fprintf(stderr, _("%s: Can't open %s: %s\n"), + progname, name, e); + exit(EXIT_FAILURE); + } + wantcont = false; + for (num = 1; ; ++num) { + eat(name, num); + if (fgets(buf, sizeof buf, fp) != buf) + break; + cp = strchr(buf, '\n'); + if (cp == NULL) { + error(_("line too long")); + exit(EXIT_FAILURE); + } + *cp = '\0'; + fields = getfields(buf); + nfields = 0; + while (fields[nfields] != NULL) { + static char nada; + + if (strcmp(fields[nfields], "-") == 0) + fields[nfields] = &nada; + ++nfields; + } + if (nfields == 0) { + /* nothing to do */ + } else if (wantcont) { + wantcont = inzcont(fields, nfields); + } else { + lp = byword(fields[0], line_codes); + if (lp == NULL) + error(_("input line of unknown type")); + else switch ((int) (lp->l_value)) { + case LC_RULE: + inrule(fields, nfields); + wantcont = false; + break; + case LC_ZONE: + wantcont = inzone(fields, nfields); + break; + case LC_LINK: + inlink(fields, nfields); + wantcont = false; + break; + case LC_LEAP: + if (name != leapsec) + (void) fprintf(stderr, +_("%s: Leap line in non leap seconds file %s\n"), + progname, name); + else inleap(fields, nfields); + wantcont = false; + break; + default: /* "cannot happen" */ + (void) fprintf(stderr, +_("%s: panic: Invalid l_value %d\n"), + progname, lp->l_value); + exit(EXIT_FAILURE); + } + } + free(fields); + } + if (ferror(fp)) { + (void) fprintf(stderr, _("%s: Error reading %s\n"), + progname, filename); + 
exit(EXIT_FAILURE); + } + if (fp != stdin && fclose(fp)) { + const char *e = strerror(errno); + + (void) fprintf(stderr, _("%s: Error closing %s: %s\n"), + progname, filename, e); + exit(EXIT_FAILURE); + } + if (wantcont) + error(_("expected continuation line not found")); +} + +/* +** Convert a string of one of the forms +** h -h hh:mm -hh:mm hh:mm:ss -hh:mm:ss +** into a number of seconds. +** A null string maps to zero. +** Call error with errstring and return zero on errors. +*/ + +static zic_t +gethms(const char *string, const char *const errstring, const int signable) +{ + zic_t hh; + int mm, ss, sign; + + if (string == NULL || *string == '\0') + return 0; + if (!signable) + sign = 1; + else if (*string == '-') { + sign = -1; + ++string; + } else sign = 1; + if (sscanf(string, scheck(string, "%"SCNdZIC), &hh) == 1) + mm = ss = 0; + else if (sscanf(string, scheck(string, "%"SCNdZIC":%d"), &hh, &mm) == 2) + ss = 0; + else if (sscanf(string, scheck(string, "%"SCNdZIC":%d:%d"), + &hh, &mm, &ss) != 3) { + error("%s", errstring); + return 0; + } + if (hh < 0 || + mm < 0 || mm >= MINSPERHOUR || + ss < 0 || ss > SECSPERMIN) { + error("%s", errstring); + return 0; + } + if (ZIC_MAX / SECSPERHOUR < hh) { + error(_("time overflow")); + return 0; + } + if (noise && hh == HOURSPERDAY && mm == 0 && ss == 0) + warning(_("24:00 not handled by pre-1998 versions of zic")); + if (noise && (hh > HOURSPERDAY || + (hh == HOURSPERDAY && (mm != 0 || ss != 0)))) +warning(_("values over 24 hours not handled by pre-2007 versions of zic")); + return oadd(sign * hh * SECSPERHOUR, + sign * (mm * SECSPERMIN + ss)); +} + +static void +inrule(register char **const fields, const int nfields) +{ + static struct rule r; + + if (nfields != RULE_FIELDS) { + error(_("wrong number of fields on Rule line")); + return; + } + if (*fields[RF_NAME] == '\0') { + error(_("nameless rule")); + return; + } + r.r_filename = filename; + r.r_linenum = linenum; + r.r_stdoff = gethms(fields[RF_STDOFF], _("invalid 
saved time"), true); + rulesub(&r, fields[RF_LOYEAR], fields[RF_HIYEAR], fields[RF_COMMAND], + fields[RF_MONTH], fields[RF_DAY], fields[RF_TOD]); + r.r_name = ecpyalloc(fields[RF_NAME]); + r.r_abbrvar = ecpyalloc(fields[RF_ABBRVAR]); + if (max_abbrvar_len < strlen(r.r_abbrvar)) + max_abbrvar_len = strlen(r.r_abbrvar); + rules = growalloc(rules, sizeof *rules, nrules, &nrules_alloc); + rules[nrules++] = r; +} + +static int +inzone(register char **const fields, const int nfields) +{ + register int i; + + if (nfields < ZONE_MINFIELDS || nfields > ZONE_MAXFIELDS) { + error(_("wrong number of fields on Zone line")); + return false; + } + if (strcmp(fields[ZF_NAME], TZDEFAULT) == 0 && lcltime != NULL) { + error( +_("\"Zone %s\" line and -l option are mutually exclusive"), + TZDEFAULT); + return false; + } + if (strcmp(fields[ZF_NAME], TZDEFRULES) == 0 && psxrules != NULL) { + error( +_("\"Zone %s\" line and -p option are mutually exclusive"), + TZDEFRULES); + return false; + } + for (i = 0; i < nzones; ++i) + if (zones[i].z_name != NULL && + strcmp(zones[i].z_name, fields[ZF_NAME]) == 0) { + error( +_("duplicate zone name %s (file \"%s\", line %d)"), + fields[ZF_NAME], + zones[i].z_filename, + zones[i].z_linenum); + return false; + } + return inzsub(fields, nfields, false); +} + +static int +inzcont(register char **const fields, const int nfields) +{ + if (nfields < ZONEC_MINFIELDS || nfields > ZONEC_MAXFIELDS) { + error(_("wrong number of fields on Zone continuation line")); + return false; + } + return inzsub(fields, nfields, true); +} + +static int +inzsub(register char **const fields, const int nfields, const int iscont) +{ + register char * cp; + static struct zone z; + register int i_gmtoff, i_rule, i_format; + register int i_untilyear, i_untilmonth; + register int i_untilday, i_untiltime; + register int hasuntil; + + if (iscont) { + i_gmtoff = ZFC_GMTOFF; + i_rule = ZFC_RULE; + i_format = ZFC_FORMAT; + i_untilyear = ZFC_TILYEAR; + i_untilmonth = ZFC_TILMONTH; + 
i_untilday = ZFC_TILDAY; + i_untiltime = ZFC_TILTIME; + z.z_name = NULL; + } else { + i_gmtoff = ZF_GMTOFF; + i_rule = ZF_RULE; + i_format = ZF_FORMAT; + i_untilyear = ZF_TILYEAR; + i_untilmonth = ZF_TILMONTH; + i_untilday = ZF_TILDAY; + i_untiltime = ZF_TILTIME; + z.z_name = ecpyalloc(fields[ZF_NAME]); + } + z.z_filename = filename; + z.z_linenum = linenum; + z.z_gmtoff = gethms(fields[i_gmtoff], _("invalid UT offset"), true); + if ((cp = strchr(fields[i_format], '%')) != 0) { + if (*++cp != 's' || strchr(cp, '%') != 0) { + error(_("invalid abbreviation format")); + return false; + } + } + z.z_rule = ecpyalloc(fields[i_rule]); + z.z_format = ecpyalloc(fields[i_format]); + if (max_format_len < strlen(z.z_format)) + max_format_len = strlen(z.z_format); + hasuntil = nfields > i_untilyear; + if (hasuntil) { + z.z_untilrule.r_filename = filename; + z.z_untilrule.r_linenum = linenum; + rulesub(&z.z_untilrule, + fields[i_untilyear], + "only", + "", + (nfields > i_untilmonth) ? + fields[i_untilmonth] : "Jan", + (nfields > i_untilday) ? fields[i_untilday] : "1", + (nfields > i_untiltime) ? fields[i_untiltime] : "0"); + z.z_untiltime = rpytime(&z.z_untilrule, + z.z_untilrule.r_loyear); + if (iscont && nzones > 0 && + z.z_untiltime > min_time && + z.z_untiltime < max_time && + zones[nzones - 1].z_untiltime > min_time && + zones[nzones - 1].z_untiltime < max_time && + zones[nzones - 1].z_untiltime >= z.z_untiltime) { + error(_( +"Zone continuation line end time is not after end time of previous line" + )); + return false; + } + } + zones = growalloc(zones, sizeof *zones, nzones, &nzones_alloc); + zones[nzones++] = z; + /* + ** If there was an UNTIL field on this line, + ** there's more information about the zone on the next line. 
+ */ + return hasuntil; +} + +static void +inleap(register char ** const fields, const int nfields) +{ + register const char * cp; + register const struct lookup * lp; + register int i, j; + zic_t year; + int month, day; + zic_t dayoff, tod; + zic_t t; + + if (nfields != LEAP_FIELDS) { + error(_("wrong number of fields on Leap line")); + return; + } + dayoff = 0; + cp = fields[LP_YEAR]; + if (sscanf(cp, scheck(cp, "%"SCNdZIC), &year) != 1) { + /* + ** Leapin' Lizards! + */ + error(_("invalid leaping year")); + return; + } + if (!leapseen || leapmaxyear < year) + leapmaxyear = year; + if (!leapseen || leapminyear > year) + leapminyear = year; + leapseen = true; + j = EPOCH_YEAR; + while (j != year) { + if (year > j) { + i = len_years[isleap(j)]; + ++j; + } else { + --j; + i = -len_years[isleap(j)]; + } + dayoff = oadd(dayoff, i); + } + if ((lp = byword(fields[LP_MONTH], mon_names)) == NULL) { + error(_("invalid month name")); + return; + } + month = lp->l_value; + j = TM_JANUARY; + while (j != month) { + i = len_months[isleap(year)][j]; + dayoff = oadd(dayoff, i); + ++j; + } + cp = fields[LP_DAY]; + if (sscanf(cp, scheck(cp, "%d"), &day) != 1 || + day <= 0 || day > len_months[isleap(year)][month]) { + error(_("invalid day of month")); + return; + } + dayoff = oadd(dayoff, day - 1); + if (dayoff < 0 && !TYPE_SIGNED(zic_t)) { + error(_("time before zero")); + return; + } + if (dayoff < min_time / SECSPERDAY) { + error(_("time too small")); + return; + } + if (dayoff > max_time / SECSPERDAY) { + error(_("time too large")); + return; + } + t = (zic_t) dayoff * SECSPERDAY; + tod = gethms(fields[LP_TIME], _("invalid time of day"), false); + cp = fields[LP_CORR]; + { + register int positive; + int count; + + if (strcmp(cp, "") == 0) { /* infile() turns "-" into "" */ + positive = false; + count = 1; + } else if (strcmp(cp, "--") == 0) { + positive = false; + count = 2; + } else if (strcmp(cp, "+") == 0) { + positive = true; + count = 1; + } else if (strcmp(cp, "++") == 0) 
{ + positive = true; + count = 2; + } else { + error(_("illegal CORRECTION field on Leap line")); + return; + } + if ((lp = byword(fields[LP_ROLL], leap_types)) == NULL) { + error(_( + "illegal Rolling/Stationary field on Leap line" + )); + return; + } + leapadd(tadd(t, tod), positive, lp->l_value, count); + } +} + +static void +inlink(register char **const fields, const int nfields) +{ + struct link l; + + if (nfields != LINK_FIELDS) { + error(_("wrong number of fields on Link line")); + return; + } + if (*fields[LF_FROM] == '\0') { + error(_("blank FROM field on Link line")); + return; + } + if (*fields[LF_TO] == '\0') { + error(_("blank TO field on Link line")); + return; + } + l.l_filename = filename; + l.l_linenum = linenum; + l.l_from = ecpyalloc(fields[LF_FROM]); + l.l_to = ecpyalloc(fields[LF_TO]); + links = growalloc(links, sizeof *links, nlinks, &nlinks_alloc); + links[nlinks++] = l; +} + +static void +rulesub(register struct rule *const rp, + const char *const loyearp, + const char *const hiyearp, + const char *const typep, + const char *const monthp, + const char *const dayp, + const char *const timep) +{ + register const struct lookup * lp; + register const char * cp; + register char * dp; + register char * ep; + + if ((lp = byword(monthp, mon_names)) == NULL) { + error(_("invalid month name")); + return; + } + rp->r_month = lp->l_value; + rp->r_todisstd = false; + rp->r_todisgmt = false; + dp = ecpyalloc(timep); + if (*dp != '\0') { + ep = dp + strlen(dp) - 1; + switch (lowerit(*ep)) { + case 's': /* Standard */ + rp->r_todisstd = true; + rp->r_todisgmt = false; + *ep = '\0'; + break; + case 'w': /* Wall */ + rp->r_todisstd = false; + rp->r_todisgmt = false; + *ep = '\0'; + break; + case 'g': /* Greenwich */ + case 'u': /* Universal */ + case 'z': /* Zulu */ + rp->r_todisstd = true; + rp->r_todisgmt = true; + *ep = '\0'; + break; + } + } + rp->r_tod = gethms(dp, _("invalid time of day"), false); + free(dp); + /* + ** Year work. 
+ */ + cp = loyearp; + lp = byword(cp, begin_years); + rp->r_lowasnum = lp == NULL; + if (!rp->r_lowasnum) switch ((int) lp->l_value) { + case YR_MINIMUM: + rp->r_loyear = ZIC_MIN; + break; + case YR_MAXIMUM: + rp->r_loyear = ZIC_MAX; + break; + default: /* "cannot happen" */ + (void) fprintf(stderr, + _("%s: panic: Invalid l_value %d\n"), + progname, lp->l_value); + exit(EXIT_FAILURE); + } else if (sscanf(cp, scheck(cp, "%"SCNdZIC), &rp->r_loyear) != 1) { + error(_("invalid starting year")); + return; + } + cp = hiyearp; + lp = byword(cp, end_years); + rp->r_hiwasnum = lp == NULL; + if (!rp->r_hiwasnum) switch ((int) lp->l_value) { + case YR_MINIMUM: + rp->r_hiyear = ZIC_MIN; + break; + case YR_MAXIMUM: + rp->r_hiyear = ZIC_MAX; + break; + case YR_ONLY: + rp->r_hiyear = rp->r_loyear; + break; + default: /* "cannot happen" */ + (void) fprintf(stderr, + _("%s: panic: Invalid l_value %d\n"), + progname, lp->l_value); + exit(EXIT_FAILURE); + } else if (sscanf(cp, scheck(cp, "%"SCNdZIC), &rp->r_hiyear) != 1) { + error(_("invalid ending year")); + return; + } + if (rp->r_loyear > rp->r_hiyear) { + error(_("starting year greater than ending year")); + return; + } + if (*typep == '\0') + rp->r_yrtype = NULL; + else { + if (rp->r_loyear == rp->r_hiyear) { + error(_("typed single year")); + return; + } + rp->r_yrtype = ecpyalloc(typep); + } + /* + ** Day work. 
+ ** Accept things such as: + ** 1 + ** last-Sunday + ** Sun<=20 + ** Sun>=7 + */ + dp = ecpyalloc(dayp); + if ((lp = byword(dp, lasts)) != NULL) { + rp->r_dycode = DC_DOWLEQ; + rp->r_wday = lp->l_value; + rp->r_dayofmonth = len_months[1][rp->r_month]; + } else { + if ((ep = strchr(dp, '<')) != 0) + rp->r_dycode = DC_DOWLEQ; + else if ((ep = strchr(dp, '>')) != 0) + rp->r_dycode = DC_DOWGEQ; + else { + ep = dp; + rp->r_dycode = DC_DOM; + } + if (rp->r_dycode != DC_DOM) { + *ep++ = 0; + if (*ep++ != '=') { + error(_("invalid day of month")); + free(dp); + return; + } + if ((lp = byword(dp, wday_names)) == NULL) { + error(_("invalid weekday name")); + free(dp); + return; + } + rp->r_wday = lp->l_value; + } + if (sscanf(ep, scheck(ep, "%d"), &rp->r_dayofmonth) != 1 || + rp->r_dayofmonth <= 0 || + (rp->r_dayofmonth > len_months[1][rp->r_month])) { + error(_("invalid day of month")); + free(dp); + return; + } + } + free(dp); +} + +static void +convert(const int_fast32_t val, char *const buf) +{ + register int i; + register int shift; + unsigned char *const b = (unsigned char *) buf; + + for (i = 0, shift = 24; i < 4; ++i, shift -= 8) + b[i] = val >> shift; +} + +static void +convert64(const zic_t val, char *const buf) +{ + register int i; + register int shift; + unsigned char *const b = (unsigned char *) buf; + + for (i = 0, shift = 56; i < 8; ++i, shift -= 8) + b[i] = val >> shift; +} + +static void +puttzcode(const int_fast32_t val, FILE *const fp) +{ + char buf[4]; + + convert(val, buf); + (void) fwrite(buf, sizeof buf, 1, fp); +} + +static void +puttzcode64(const zic_t val, FILE *const fp) +{ + char buf[8]; + + convert64(val, buf); + (void) fwrite(buf, sizeof buf, 1, fp); +} + +static int +atcomp(const void *avp, const void *bvp) +{ + const zic_t a = ((const struct attype *) avp)->at; + const zic_t b = ((const struct attype *) bvp)->at; + + return (a < b) ? 
-1 : (a > b); +} + +static int +is32(const zic_t x) +{ + return INT32_MIN <= x && x <= INT32_MAX; +} + +static void +writezone(const char *const name, const char *const string, char version) +{ + register FILE * fp; + register int i, j; + register int leapcnt32, leapi32; + register int timecnt32, timei32; + register int pass; + static char * fullname; + static const struct tzhead tzh0; + static struct tzhead tzh; + zic_t *ats = emalloc(size_product(timecnt, sizeof *ats + 1)); + void *typesptr = ats + timecnt; + unsigned char *types = typesptr; + + /* + ** Sort. + */ + if (timecnt > 1) + (void) qsort(attypes, timecnt, sizeof *attypes, atcomp); + /* + ** Optimize. + */ + { + int fromi; + int toi; + + toi = 0; + fromi = 0; + while (fromi < timecnt && attypes[fromi].at < min_time) + ++fromi; + /* + ** Remember that type 0 is reserved. + */ + if (isdsts[1] == 0) + while (fromi < timecnt && attypes[fromi].type == 1) + ++fromi; /* handled by default rule */ + for ( ; fromi < timecnt; ++fromi) { + if (toi != 0 && ((attypes[fromi].at + + gmtoffs[attypes[toi - 1].type]) <= + (attypes[toi - 1].at + gmtoffs[toi == 1 ? 0 + : attypes[toi - 2].type]))) { + attypes[toi - 1].type = + attypes[fromi].type; + continue; + } + if (toi == 0 || + attypes[toi - 1].type != attypes[fromi].type) + attypes[toi++] = attypes[fromi]; + } + timecnt = toi; + } + /* + ** Transfer. + */ + for (i = 0; i < timecnt; ++i) { + ats[i] = attypes[i].at; + types[i] = attypes[i].type; + } + /* + ** Correct for leap seconds. + */ + for (i = 0; i < timecnt; ++i) { + j = leapcnt; + while (--j >= 0) + if (ats[i] > trans[j] - corr[j]) { + ats[i] = tadd(ats[i], corr[j]); + break; + } + } + /* + ** Figure out 32-bit-limited starts and counts. 
+ */ + timecnt32 = timecnt; + timei32 = 0; + leapcnt32 = leapcnt; + leapi32 = 0; + while (timecnt32 > 0 && !is32(ats[timecnt32 - 1])) + --timecnt32; + while (timecnt32 > 0 && !is32(ats[timei32])) { + --timecnt32; + ++timei32; + } + while (leapcnt32 > 0 && !is32(trans[leapcnt32 - 1])) + --leapcnt32; + while (leapcnt32 > 0 && !is32(trans[leapi32])) { + --leapcnt32; + ++leapi32; + } + fullname = erealloc(fullname, + strlen(directory) + 1 + strlen(name) + 1); + (void) sprintf(fullname, "%s/%s", directory, name); + /* + ** Remove old file, if any, to snap links. + */ + if (!itsdir(fullname) && remove(fullname) != 0 && errno != ENOENT) { + const char *e = strerror(errno); + + (void) fprintf(stderr, _("%s: Can't remove %s: %s\n"), + progname, fullname, e); + exit(EXIT_FAILURE); + } + if ((fp = fopen(fullname, "wb")) == NULL) { + if (mkdirs(fullname) != 0) + exit(EXIT_FAILURE); + if ((fp = fopen(fullname, "wb")) == NULL) { + const char *e = strerror(errno); + + (void) fprintf(stderr, _("%s: Can't create %s: %s\n"), + progname, fullname, e); + exit(EXIT_FAILURE); + } + } + for (pass = 1; pass <= 2; ++pass) { + register int thistimei, thistimecnt; + register int thisleapi, thisleapcnt; + register int thistimelim, thisleaplim; + int writetype[TZ_MAX_TYPES]; + int typemap[TZ_MAX_TYPES]; + register int thistypecnt; + char thischars[TZ_MAX_CHARS]; + char thischarcnt; + int indmap[TZ_MAX_CHARS]; + + if (pass == 1) { + thistimei = timei32; + thistimecnt = timecnt32; + thisleapi = leapi32; + thisleapcnt = leapcnt32; + } else { + thistimei = 0; + thistimecnt = timecnt; + thisleapi = 0; + thisleapcnt = leapcnt; + } + thistimelim = thistimei + thistimecnt; + thisleaplim = thisleapi + thisleapcnt; + /* + ** Remember that type 0 is reserved. + */ + writetype[0] = false; + for (i = 1; i < typecnt; ++i) + writetype[i] = thistimecnt == timecnt; + if (thistimecnt == 0) { + /* + ** No transition times fall in the current + ** (32- or 64-bit) window. 
+ */ + if (typecnt != 0) + writetype[typecnt - 1] = true; + } else { + for (i = thistimei - 1; i < thistimelim; ++i) + if (i >= 0) + writetype[types[i]] = true; + /* + ** For America/Godthab and Antarctica/Palmer + */ + /* + ** Remember that type 0 is reserved. + */ + if (thistimei == 0) + writetype[1] = true; + } +#ifndef LEAVE_SOME_PRE_2011_SYSTEMS_IN_THE_LURCH + /* + ** For some pre-2011 systems: if the last-to-be-written + ** standard (or daylight) type has an offset different from the + ** most recently used offset, + ** append an (unused) copy of the most recently used type + ** (to help get global "altzone" and "timezone" variables + ** set correctly). + */ + { + register int mrudst, mrustd, hidst, histd, type; + + hidst = histd = mrudst = mrustd = -1; + for (i = thistimei; i < thistimelim; ++i) + if (isdsts[types[i]]) + mrudst = types[i]; + else mrustd = types[i]; + for (i = 0; i < typecnt; ++i) + if (writetype[i]) { + if (isdsts[i]) + hidst = i; + else histd = i; + } + if (hidst >= 0 && mrudst >= 0 && hidst != mrudst && + gmtoffs[hidst] != gmtoffs[mrudst]) { + isdsts[mrudst] = -1; + type = addtype(gmtoffs[mrudst], +#ifdef ICU + rawoffs[mrudst], dstoffs[mrudst], +#endif + &chars[abbrinds[mrudst]], + true, + ttisstds[mrudst], + ttisgmts[mrudst]); + isdsts[mrudst] = true; + writetype[type] = true; + } + if (histd >= 0 && mrustd >= 0 && histd != mrustd && + gmtoffs[histd] != gmtoffs[mrustd]) { + /* + ** Copy the most recently used standard type: every + ** argument, the ICU raw/DST offsets included, must be + ** indexed by mrustd (mrudst can still be -1 here). + */ + isdsts[mrustd] = -1; + type = addtype(gmtoffs[mrustd], +#ifdef ICU + rawoffs[mrustd], dstoffs[mrustd], +#endif + &chars[abbrinds[mrustd]], + false, + ttisstds[mrustd], + ttisgmts[mrustd]); + isdsts[mrustd] = false; + writetype[type] = true; + } + } +#endif /* !defined LEAVE_SOME_PRE_2011_SYSTEMS_IN_THE_LURCH */ + thistypecnt = 0; + /* + ** Potentially, set type 0 to that of lowest-valued time. 
+ */ + if (thistimei > 0) { + for (i = 1; i < typecnt; ++i) + if (writetype[i] && !isdsts[i]) + break; + if (i != types[thistimei - 1]) { + i = types[thistimei - 1]; + gmtoffs[0] = gmtoffs[i]; + isdsts[0] = isdsts[i]; + ttisstds[0] = ttisstds[i]; + ttisgmts[0] = ttisgmts[i]; + abbrinds[0] = abbrinds[i]; + writetype[0] = true; + writetype[i] = false; + } + } + for (i = 0; i < typecnt; ++i) + typemap[i] = writetype[i] ? thistypecnt++ : 0; + for (i = 0; i < sizeof indmap / sizeof indmap[0]; ++i) + indmap[i] = -1; + thischarcnt = 0; + for (i = 0; i < typecnt; ++i) { + register char * thisabbr; + + if (!writetype[i]) + continue; + if (indmap[abbrinds[i]] >= 0) + continue; + thisabbr = &chars[abbrinds[i]]; + for (j = 0; j < thischarcnt; ++j) + if (strcmp(&thischars[j], thisabbr) == 0) + break; + if (j == thischarcnt) { + (void) strcpy(&thischars[(int) thischarcnt], + thisabbr); + thischarcnt += strlen(thisabbr) + 1; + } + indmap[abbrinds[i]] = j; + } +#define DO(field) ((void) fwrite(tzh.field, sizeof tzh.field, 1, fp)) + tzh = tzh0; +#ifdef ICU + * (ICUZoneinfoVersion*) &tzh.tzh_reserved = TZ_ICU_VERSION; + (void) strncpy(tzh.tzh_magic, TZ_ICU_MAGIC, sizeof tzh.tzh_magic); +#else + (void) strncpy(tzh.tzh_magic, TZ_MAGIC, sizeof tzh.tzh_magic); +#endif + tzh.tzh_version[0] = version; + convert(thistypecnt, tzh.tzh_ttisgmtcnt); + convert(thistypecnt, tzh.tzh_ttisstdcnt); + convert(thisleapcnt, tzh.tzh_leapcnt); + convert(thistimecnt, tzh.tzh_timecnt); + convert(thistypecnt, tzh.tzh_typecnt); + convert(thischarcnt, tzh.tzh_charcnt); + DO(tzh_magic); + DO(tzh_version); + DO(tzh_reserved); + DO(tzh_ttisgmtcnt); + DO(tzh_ttisstdcnt); + DO(tzh_leapcnt); + DO(tzh_timecnt); + DO(tzh_typecnt); + DO(tzh_charcnt); +#undef DO + for (i = thistimei; i < thistimelim; ++i) + if (pass == 1) + puttzcode(ats[i], fp); + else puttzcode64(ats[i], fp); + for (i = thistimei; i < thistimelim; ++i) { + unsigned char uc; + + uc = typemap[types[i]]; + (void) fwrite(&uc, sizeof uc, 1, fp); + } + for 
(i = 0; i < typecnt; ++i) + if (writetype[i]) { +#ifdef ICU + puttzcode(rawoffs[i], fp); + puttzcode(dstoffs[i], fp); +#else + puttzcode(gmtoffs[i], fp); +#endif + (void) putc(isdsts[i], fp); + (void) putc((unsigned char) indmap[abbrinds[i]], fp); + } + if (thischarcnt != 0) + (void) fwrite(thischars, sizeof thischars[0], + thischarcnt, fp); + for (i = thisleapi; i < thisleaplim; ++i) { + register zic_t todo; + + if (roll[i]) { + if (timecnt == 0 || trans[i] < ats[0]) { + j = 0; + while (isdsts[j]) + if (++j >= typecnt) { + j = 0; + break; + } + } else { + j = 1; + while (j < timecnt && + trans[i] >= ats[j]) + ++j; + j = types[j - 1]; + } + todo = tadd(trans[i], -gmtoffs[j]); + } else todo = trans[i]; + if (pass == 1) + puttzcode(todo, fp); + else puttzcode64(todo, fp); + puttzcode(corr[i], fp); + } + for (i = 0; i < typecnt; ++i) + if (writetype[i]) + (void) putc(ttisstds[i], fp); + for (i = 0; i < typecnt; ++i) + if (writetype[i]) + (void) putc(ttisgmts[i], fp); + } + (void) fprintf(fp, "\n%s\n", string); + if (ferror(fp) || fclose(fp)) { + (void) fprintf(stderr, _("%s: Error writing %s\n"), + progname, fullname); + exit(EXIT_FAILURE); + } + free(ats); +} + +static void +doabbr(char *const abbr, const char *const format, const char *const letters, + const int isdst, const int doquotes) +{ + register char * cp; + register char * slashp; + register int len; + + slashp = strchr(format, '/'); + if (slashp == NULL) { + if (letters == NULL) + (void) strcpy(abbr, format); + else (void) sprintf(abbr, format, letters); + } else if (isdst) { + (void) strcpy(abbr, slashp + 1); + } else { + if (slashp > format) + (void) strncpy(abbr, format, slashp - format); + abbr[slashp - format] = '\0'; + } + if (!doquotes) + return; + for (cp = abbr; *cp != '\0'; ++cp) + if (strchr("ABCDEFGHIJKLMNOPQRSTUVWXYZ", *cp) == NULL && + strchr("abcdefghijklmnopqrstuvwxyz", *cp) == NULL) + break; + len = strlen(abbr); + if (len > 0 && *cp == '\0') + return; + abbr[len + 2] = '\0'; + abbr[len + 
1] = '>'; + for ( ; len > 0; --len) + abbr[len] = abbr[len - 1]; + abbr[0] = '<'; +} + +static void +updateminmax(const zic_t x) +{ + if (min_year > x) + min_year = x; + if (max_year < x) + max_year = x; +} + +static int +stringoffset(char *result, zic_t offset) +{ + register int hours; + register int minutes; + register int seconds; + + result[0] = '\0'; + if (offset < 0) { + (void) strcpy(result, "-"); + offset = -offset; + } + seconds = offset % SECSPERMIN; + offset /= SECSPERMIN; + minutes = offset % MINSPERHOUR; + offset /= MINSPERHOUR; + hours = offset; + if (hours >= HOURSPERDAY * DAYSPERWEEK) { + result[0] = '\0'; + return -1; + } + (void) sprintf(end(result), "%d", hours); + if (minutes != 0 || seconds != 0) { + (void) sprintf(end(result), ":%02d", minutes); + if (seconds != 0) + (void) sprintf(end(result), ":%02d", seconds); + } + return 0; +} + +static int +stringrule(char *result, const struct rule *const rp, const zic_t dstoff, + const zic_t gmtoff) +{ + register zic_t tod = rp->r_tod; + register int compat = 0; + + result = end(result); + if (rp->r_dycode == DC_DOM) { + register int month, total; + + if (rp->r_dayofmonth == 29 && rp->r_month == TM_FEBRUARY) + return -1; + total = 0; + for (month = 0; month < rp->r_month; ++month) + total += len_months[0][month]; + /* Omit the "J" in Jan and Feb, as that's shorter. 
*/ + if (rp->r_month <= 1) + (void) sprintf(result, "%d", total + rp->r_dayofmonth - 1); + else + (void) sprintf(result, "J%d", total + rp->r_dayofmonth); + } else { + register int week; + register int wday = rp->r_wday; + register int wdayoff; + + if (rp->r_dycode == DC_DOWGEQ) { + wdayoff = (rp->r_dayofmonth - 1) % DAYSPERWEEK; + if (wdayoff) + compat = 2013; + wday -= wdayoff; + tod += wdayoff * SECSPERDAY; + week = 1 + (rp->r_dayofmonth - 1) / DAYSPERWEEK; + } else if (rp->r_dycode == DC_DOWLEQ) { + if (rp->r_dayofmonth == len_months[1][rp->r_month]) + week = 5; + else { + wdayoff = rp->r_dayofmonth % DAYSPERWEEK; + if (wdayoff) + compat = 2013; + wday -= wdayoff; + tod += wdayoff * SECSPERDAY; + week = rp->r_dayofmonth / DAYSPERWEEK; + } + } else return -1; /* "cannot happen" */ + if (wday < 0) + wday += DAYSPERWEEK; + (void) sprintf(result, "M%d.%d.%d", + rp->r_month + 1, week, wday); + } + if (rp->r_todisgmt) + tod += gmtoff; + if (rp->r_todisstd && rp->r_stdoff == 0) + tod += dstoff; + if (tod != 2 * SECSPERMIN * MINSPERHOUR) { + (void) strcat(result, "/"); + if (stringoffset(end(result), tod) != 0) + return -1; + if (tod < 0) { + if (compat < 2013) + compat = 2013; + } else if (SECSPERDAY <= tod) { + if (compat < 1994) + compat = 1994; + } + } + return compat; +} + +static int +rule_cmp(struct rule const *a, struct rule const *b) +{ + if (!a) + return -!!b; + if (!b) + return 1; + if (a->r_hiyear != b->r_hiyear) + return a->r_hiyear < b->r_hiyear ? 
-1 : 1; + if (a->r_month - b->r_month != 0) + return a->r_month - b->r_month; + return a->r_dayofmonth - b->r_dayofmonth; +} + +enum { YEAR_BY_YEAR_ZONE = 1 }; + +static int +stringzone(char *result, const struct zone *const zpfirst, const int zonecount) +{ + register const struct zone * zp; + register struct rule * rp; + register struct rule * stdrp; + register struct rule * dstrp; + register int i; + register const char * abbrvar; + register int compat = 0; + register int c; + struct rule stdr, dstr; + + result[0] = '\0'; + zp = zpfirst + zonecount - 1; + stdrp = dstrp = NULL; + for (i = 0; i < zp->z_nrules; ++i) { + rp = &zp->z_rules[i]; + if (rp->r_hiwasnum || rp->r_hiyear != ZIC_MAX) + continue; + if (rp->r_yrtype != NULL) + continue; + if (rp->r_stdoff == 0) { + if (stdrp == NULL) + stdrp = rp; + else return -1; + } else { + if (dstrp == NULL) + dstrp = rp; + else return -1; + } + } + if (stdrp == NULL && dstrp == NULL) { + /* + ** There are no rules running through "max". + ** Find the latest std rule in stdabbrrp + ** and latest rule of any type in stdrp. + */ + register struct rule *stdabbrrp = NULL; + for (i = 0; i < zp->z_nrules; ++i) { + rp = &zp->z_rules[i]; + if (rp->r_stdoff == 0 && rule_cmp(stdabbrrp, rp) < 0) + stdabbrrp = rp; + if (rule_cmp(stdrp, rp) < 0) + stdrp = rp; + } + /* + ** Horrid special case: if year is 2037, + ** presume this is a zone handled on a year-by-year basis; + ** do not try to apply a rule to the zone. + */ + if (stdrp != NULL && stdrp->r_hiyear == 2037) + return YEAR_BY_YEAR_ZONE; + + if (stdrp != NULL && stdrp->r_stdoff != 0) { + /* Perpetual DST. 
*/ + dstr.r_month = TM_JANUARY; + dstr.r_dycode = DC_DOM; + dstr.r_dayofmonth = 1; + dstr.r_tod = 0; + dstr.r_todisstd = dstr.r_todisgmt = false; + dstr.r_stdoff = stdrp->r_stdoff; + dstr.r_abbrvar = stdrp->r_abbrvar; + stdr.r_month = TM_DECEMBER; + stdr.r_dycode = DC_DOM; + stdr.r_dayofmonth = 31; + stdr.r_tod = SECSPERDAY + stdrp->r_stdoff; + stdr.r_todisstd = stdr.r_todisgmt = false; + stdr.r_stdoff = 0; + stdr.r_abbrvar + = (stdabbrrp ? stdabbrrp->r_abbrvar : ""); + dstrp = &dstr; + stdrp = &stdr; + } + } + if (stdrp == NULL && (zp->z_nrules != 0 || zp->z_stdoff != 0)) + return -1; + abbrvar = (stdrp == NULL) ? "" : stdrp->r_abbrvar; + doabbr(result, zp->z_format, abbrvar, false, true); + if (stringoffset(end(result), -zp->z_gmtoff) != 0) { + result[0] = '\0'; + return -1; + } + if (dstrp == NULL) + return compat; + doabbr(end(result), zp->z_format, dstrp->r_abbrvar, true, true); + if (dstrp->r_stdoff != SECSPERMIN * MINSPERHOUR) + if (stringoffset(end(result), + -(zp->z_gmtoff + dstrp->r_stdoff)) != 0) { + result[0] = '\0'; + return -1; + } + (void) strcat(result, ","); + c = stringrule(result, dstrp, dstrp->r_stdoff, zp->z_gmtoff); + if (c < 0) { + result[0] = '\0'; + return -1; + } + if (compat < c) + compat = c; + (void) strcat(result, ","); + c = stringrule(result, stdrp, dstrp->r_stdoff, zp->z_gmtoff); + if (c < 0) { + result[0] = '\0'; + return -1; + } + if (compat < c) + compat = c; + return compat; +} + +static void +outzone(const struct zone * const zpfirst, const int zonecount) +{ + register const struct zone * zp; + register struct rule * rp; + register int i, j; + register int usestart, useuntil; + register zic_t starttime, untiltime; + register zic_t gmtoff; + register zic_t stdoff; + register zic_t year; + register zic_t startoff; + register int startttisstd; + register int startttisgmt; + register int type; + register char * startbuf; + register char * ab; + register char * envvar; + register int max_abbr_len; + register int max_envvar_len; + 
register int prodstic; /* all rules are min to max */ + register int compat; + register int do_extend; + register char version; +#ifdef ICU + int finalRuleYear, finalRuleIndex; + const struct rule* finalRule1; + const struct rule* finalRule2; +#endif + + max_abbr_len = 2 + max_format_len + max_abbrvar_len; + max_envvar_len = 2 * max_abbr_len + 5 * 9; + startbuf = emalloc(max_abbr_len + 1); + ab = emalloc(max_abbr_len + 1); + envvar = emalloc(max_envvar_len + 1); + INITIALIZE(untiltime); + INITIALIZE(starttime); + /* + ** Now. . .finally. . .generate some useful data! + */ + timecnt = 0; + typecnt = 0; + charcnt = 0; + prodstic = zonecount == 1; + /* + ** Thanks to Earl Chew + ** for noting the need to unconditionally initialize startttisstd. + */ + startttisstd = false; + startttisgmt = false; + min_year = max_year = EPOCH_YEAR; + if (leapseen) { + updateminmax(leapminyear); + updateminmax(leapmaxyear + (leapmaxyear < ZIC_MAX)); + } + /* + ** Reserve type 0. + */ + gmtoffs[0] = isdsts[0] = ttisstds[0] = ttisgmts[0] = abbrinds[0] = -1; + typecnt = 1; + for (i = 0; i < zonecount; ++i) { + zp = &zpfirst[i]; + if (i < zonecount - 1) + updateminmax(zp->z_untilrule.r_loyear); + for (j = 0; j < zp->z_nrules; ++j) { + rp = &zp->z_rules[j]; + if (rp->r_lowasnum) + updateminmax(rp->r_loyear); + if (rp->r_hiwasnum) + updateminmax(rp->r_hiyear); + if (rp->r_lowasnum || rp->r_hiwasnum) + prodstic = false; + } + } + /* + ** Generate lots of data if a rule can't cover all future times. + */ + compat = stringzone(envvar, zpfirst, zonecount); + version = compat < 2013 ? 
ZIC_VERSION_PRE_2013 : ZIC_VERSION; + do_extend = compat < 0 || compat == YEAR_BY_YEAR_ZONE; +#ifdef ICU + do_extend = 0; +#endif + if (noise) { + if (!*envvar) + warning("%s %s", + _("no POSIX environment variable for zone"), + zpfirst->z_name); + else if (compat != 0 && compat != YEAR_BY_YEAR_ZONE) { + /* Circa-COMPAT clients, and earlier clients, might + not work for this zone when given dates before + 1970 or after 2038. */ + warning(_("%s: pre-%d clients may mishandle" + " distant timestamps"), + zpfirst->z_name, compat); + } + } + if (do_extend) { + /* + ** Search through a couple of extra years past the obvious + ** 400, to avoid edge cases. For example, suppose a non-POSIX + ** rule applies from 2012 onwards and has transitions in March + ** and September, plus some one-off transitions in November + ** 2013. If zic looked only at the last 400 years, it would + ** set max_year=2413, with the intent that the 400 years 2014 + ** through 2413 will be repeated. The last transition listed + ** in the tzfile would be in 2413-09, less than 400 years + ** after the last one-off transition in 2013-11. Two years + ** might be overkill, but with the kind of edge cases + ** available we're not sure that one year would suffice. + */ + enum { years_of_observations = YEARSPERREPEAT + 2 }; + + if (min_year >= ZIC_MIN + years_of_observations) + min_year -= years_of_observations; + else min_year = ZIC_MIN; + if (max_year <= ZIC_MAX - years_of_observations) + max_year += years_of_observations; + else max_year = ZIC_MAX; + /* + ** Regardless of any of the above, + ** for a "proDSTic" zone which specifies that its rules + ** always have and always will be in effect, + ** we only need one cycle to define the zone. + */ + if (prodstic) { + min_year = 1900; + max_year = min_year + years_of_observations; + } + } + /* + ** For the benefit of older systems, + ** generate data from 1900 through 2037. 
+ */ + if (min_year > 1900) + min_year = 1900; + if (max_year < 2037) + max_year = 2037; + for (i = 0; i < zonecount; ++i) { + /* + ** A guess that may well be corrected later. + */ + stdoff = 0; + zp = &zpfirst[i]; + usestart = i > 0 && (zp - 1)->z_untiltime > min_time; + useuntil = i < (zonecount - 1); + if (useuntil && zp->z_untiltime <= min_time) + continue; + gmtoff = zp->z_gmtoff; + eat(zp->z_filename, zp->z_linenum); + *startbuf = '\0'; + startoff = zp->z_gmtoff; +#ifdef ICU + finalRuleYear = finalRuleIndex = -1; + finalRule1 = finalRule2 = NULL; + if (i == (zonecount - 1)) { /* !useuntil */ + /* Look for exactly 2 rules that end at 'max' and + * note them. Determine max(r_loyear) for the 2 of + * them. */ + for (j=0; j<zp->z_nrules; ++j) { + rp = &zp->z_rules[j]; + if (rp->r_hiyear == ZIC_MAX) { + if (rp->r_loyear > finalRuleYear) { + finalRuleYear = rp->r_loyear; + } + if (finalRule1 == NULL) { + finalRule1 = rp; + } else if (finalRule2 == NULL) { + finalRule2 = rp; + } else { + error("more than two max rules found (ICU)"); + exit(EXIT_FAILURE); + } + } else if (rp->r_hiyear >= finalRuleYear) { + /* There might be an overriding non-max rule + * to be applied to a specific year after one of + * max rule's start year. For example, + * + * Rule Foo 2010 max ... + * Rule Foo 2015 only ... + * + * In this case, we need to change the start year of + * the final (max) rules to the next year. */ + finalRuleYear = rp->r_hiyear + 1; + + /* When above adjustment is done, max_year might need + * to be adjusted, so the final rule will be properly + * evaluated and emitted by the later code block. + * + * Note: This may push the start year of the final + * rules ahead by 1 year unnecessarily. For example, + * If there are two rules, non-max rule and max rule + * starting in the same year, such as + * + * Rule Foo 2010 only .... + * Rule Foo 2010 max .... + * + * In this case, the final (max) rule actually starts + * in 2010, instead of 2011. 
We could make this tool + * more intelligent to detect such situation. But pushing + * final rule start year to 1 year ahead (in the worst case) + * will just populate a few extra transitions, and it still + * works fine. So for now, we're not trying to put additional + * logic to optimize the case. + */ + if (max_year < finalRuleYear) { + max_year = finalRuleYear; + } + } + } + if (finalRule1 != NULL) { + if (finalRule2 == NULL) { + warning("only one max rule found (ICU)"); + finalRuleYear = finalRuleIndex = -1; + finalRule1 = NULL; + } else { + if (finalRule1->r_stdoff == finalRule2->r_stdoff) { + /* America/Resolute in 2009a uses a pair of rules + * which does not change the offset. ICU ignores + * such rules without actual time transitions. */ + finalRuleYear = finalRuleIndex = -1; + finalRule1 = finalRule2 = NULL; + } else { + /* Swap if necessary so finalRule1 occurs before + * finalRule2 */ + if (finalRule1->r_month > finalRule2->r_month) { + const struct rule* t = finalRule1; + finalRule1 = finalRule2; + finalRule2 = t; + } + /* Add final rule to our list */ + finalRuleIndex = add_icu_final_rules(finalRule1, finalRule2); + } + } + } + } +#endif + + if (zp->z_nrules == 0) { + stdoff = zp->z_stdoff; + doabbr(startbuf, zp->z_format, + NULL, stdoff != 0, false); + type = addtype(oadd(zp->z_gmtoff, stdoff), +#ifdef ICU + zp->z_gmtoff, stdoff, +#endif + startbuf, stdoff != 0, startttisstd, + startttisgmt); + if (usestart) { + addtt(starttime, type); + usestart = false; + } else if (stdoff != 0) + addtt(min_time, type); + } else for (year = min_year; year <= max_year; ++year) { + if (useuntil && year > zp->z_untilrule.r_hiyear) + break; + /* + ** Mark which rules to do in the current year. 
+ ** For those to do, calculate rpytime(rp, year); + */ + for (j = 0; j < zp->z_nrules; ++j) { + rp = &zp->z_rules[j]; + eats(zp->z_filename, zp->z_linenum, + rp->r_filename, rp->r_linenum); + rp->r_todo = year >= rp->r_loyear && + year <= rp->r_hiyear && + yearistype(year, rp->r_yrtype); + if (rp->r_todo) + rp->r_temp = rpytime(rp, year); + } + for ( ; ; ) { + register int k; + register zic_t jtime, ktime; + register zic_t offset; + + INITIALIZE(ktime); + if (useuntil) { + /* + ** Turn untiltime into UT + ** assuming the current gmtoff and + ** stdoff values. + */ + untiltime = zp->z_untiltime; + if (!zp->z_untilrule.r_todisgmt) + untiltime = tadd(untiltime, + -gmtoff); + if (!zp->z_untilrule.r_todisstd) + untiltime = tadd(untiltime, + -stdoff); + } + /* + ** Find the rule (of those to do, if any) + ** that takes effect earliest in the year. + */ + k = -1; + for (j = 0; j < zp->z_nrules; ++j) { + rp = &zp->z_rules[j]; + if (!rp->r_todo) + continue; + eats(zp->z_filename, zp->z_linenum, + rp->r_filename, rp->r_linenum); + offset = rp->r_todisgmt ? 
0 : gmtoff; + if (!rp->r_todisstd) + offset = oadd(offset, stdoff); + jtime = rp->r_temp; + if (jtime == min_time || + jtime == max_time) + continue; + jtime = tadd(jtime, -offset); + if (k < 0 || jtime < ktime) { + k = j; + ktime = jtime; + } + } + if (k < 0) + break; /* go on to next year */ + rp = &zp->z_rules[k]; + rp->r_todo = false; + if (useuntil && ktime >= untiltime) + break; + stdoff = rp->r_stdoff; + if (usestart && ktime == starttime) + usestart = false; + if (usestart) { + if (ktime < starttime) { + startoff = oadd(zp->z_gmtoff, + stdoff); + doabbr(startbuf, zp->z_format, + rp->r_abbrvar, + rp->r_stdoff != 0, + false); + continue; + } + if (*startbuf == '\0' && + startoff == oadd(zp->z_gmtoff, + stdoff)) { + doabbr(startbuf, + zp->z_format, + rp->r_abbrvar, + rp->r_stdoff != + 0, + false); + } + } +#ifdef ICU + if (year >= finalRuleYear && rp == finalRule1) { + /* We want to shift final year 1 year after + * the actual final rule takes effect (year + 1), + * because the previous type is valid until the first + * transition defined by the final rule. Otherwise + * we may see unexpected offset shift at the + * beginning of the year when the final rule takes + * effect. + * + * Note: This may results some 64bit second transitions + * at the very end (year 2038). ICU 4.2 or older releases + * cannot handle 64bit second transitions and they are + * dropped from zoneinfo.txt. 
*/ + emit_icu_zone(icuFile, + zpfirst->z_name, zp->z_gmtoff, + rp, finalRuleIndex, year + 1); + /* only emit this for the first year */ + finalRule1 = NULL; + } +#endif + eats(zp->z_filename, zp->z_linenum, + rp->r_filename, rp->r_linenum); + doabbr(ab, zp->z_format, rp->r_abbrvar, + rp->r_stdoff != 0, false); + offset = oadd(zp->z_gmtoff, rp->r_stdoff); +#ifdef ICU + type = addtype(offset, zp->z_gmtoff, rp->r_stdoff, + ab, rp->r_stdoff != 0, + rp->r_todisstd, rp->r_todisgmt); +#else + type = addtype(offset, ab, rp->r_stdoff != 0, + rp->r_todisstd, rp->r_todisgmt); +#endif + addtt(ktime, type); + } + } + if (usestart) { + if (*startbuf == '\0' && + zp->z_format != NULL && + strchr(zp->z_format, '%') == NULL && + strchr(zp->z_format, '/') == NULL) + (void) strcpy(startbuf, zp->z_format); + eat(zp->z_filename, zp->z_linenum); + if (*startbuf == '\0') +error(_("can't determine time zone abbreviation to use just after until time")); + else addtt(starttime, +#ifdef ICU + addtype(startoff, + zp->z_gmtoff, startoff - zp->z_gmtoff, + startbuf, + startoff != zp->z_gmtoff, + startttisstd, + startttisgmt)); +#else + addtype(startoff, startbuf, + startoff != zp->z_gmtoff, + startttisstd, + startttisgmt)); +#endif + } + /* + ** Now we may get to set starttime for the next zone line. + */ + if (useuntil) { + startttisstd = zp->z_untilrule.r_todisstd; + startttisgmt = zp->z_untilrule.r_todisgmt; + starttime = zp->z_untiltime; + if (!startttisstd) + starttime = tadd(starttime, -stdoff); + if (!startttisgmt) + starttime = tadd(starttime, -gmtoff); + } + } + if (do_extend) { + /* + ** If we're extending the explicitly listed observations + ** for 400 years because we can't fill the POSIX-TZ field, + ** check whether we actually ended up explicitly listing + ** observations through that period. 
If there aren't any + ** near the end of the 400-year period, add a redundant + ** one at the end of the final year, to make it clear + ** that we are claiming to have definite knowledge of + ** the lack of transitions up to that point. + */ + struct rule xr; + struct attype *lastat; + xr.r_month = TM_JANUARY; + xr.r_dycode = DC_DOM; + xr.r_dayofmonth = 1; + xr.r_tod = 0; + for (lastat = &attypes[0], i = 1; i < timecnt; i++) + if (attypes[i].at > lastat->at) + lastat = &attypes[i]; + if (lastat->at < rpytime(&xr, max_year - 1)) { + /* + ** Create new type code for the redundant entry, + ** to prevent it being optimised away. + */ + if (typecnt >= TZ_MAX_TYPES) { + error(_("too many local time types")); + exit(EXIT_FAILURE); + } + gmtoffs[typecnt] = gmtoffs[lastat->type]; + isdsts[typecnt] = isdsts[lastat->type]; + ttisstds[typecnt] = ttisstds[lastat->type]; + ttisgmts[typecnt] = ttisgmts[lastat->type]; + abbrinds[typecnt] = abbrinds[lastat->type]; + ++typecnt; + addtt(rpytime(&xr, max_year + 1), typecnt-1); + } + } + writezone(zpfirst->z_name, envvar, version); + free(startbuf); + free(ab); + free(envvar); +} + +static void +addtt(const zic_t starttime, int type) +{ + if (starttime <= min_time || + (timecnt == 1 && attypes[0].at < min_time)) { + gmtoffs[0] = gmtoffs[type]; +#ifdef ICU + rawoffs[0] = rawoffs[type]; + dstoffs[0] = dstoffs[type]; +#endif + isdsts[0] = isdsts[type]; + ttisstds[0] = ttisstds[type]; + ttisgmts[0] = ttisgmts[type]; + if (abbrinds[type] != 0) + (void) strcpy(chars, &chars[abbrinds[type]]); + abbrinds[0] = 0; + charcnt = strlen(chars) + 1; + typecnt = 1; + timecnt = 0; + type = 0; + } + attypes = growalloc(attypes, sizeof *attypes, timecnt, &timecnt_alloc); + attypes[timecnt].at = starttime; + attypes[timecnt].type = type; + ++timecnt; +} + +static int +#ifdef ICU +addtype(const zic_t gmtoff, const zic_t rawoff, const zic_t dstoff, char *const abbr, const int isdst, + const int ttisstd, const int ttisgmt) +#else +addtype(const zic_t gmtoff, 
const char *const abbr, const int isdst, + const int ttisstd, const int ttisgmt) +#endif +{ + register int i, j; + + if (isdst != true && isdst != false) { + error(_("internal error - addtype called with bad isdst")); + exit(EXIT_FAILURE); + } + if (ttisstd != true && ttisstd != false) { + error(_("internal error - addtype called with bad ttisstd")); + exit(EXIT_FAILURE); + } + if (ttisgmt != true && ttisgmt != false) { + error(_("internal error - addtype called with bad ttisgmt")); + exit(EXIT_FAILURE); + } +#ifdef ICU + if (isdst != (dstoff != 0)) { + error(_("internal error - addtype called with bad isdst/dstoff")); + exit(EXIT_FAILURE); + } + if (gmtoff != (rawoff + dstoff)) { + error(_("internal error - addtype called with bad gmt/raw/dstoff")); + exit(EXIT_FAILURE); + } +#endif + /* + ** See if there's already an entry for this zone type. + ** If so, just return its index. + */ + for (i = 0; i < typecnt; ++i) { + if (gmtoff == gmtoffs[i] && isdst == isdsts[i] && +#ifdef ICU + rawoff == rawoffs[i] && dstoff == dstoffs[i] && +#endif + strcmp(abbr, &chars[abbrinds[i]]) == 0 && + ttisstd == ttisstds[i] && + ttisgmt == ttisgmts[i]) + return i; + } + /* + ** There isn't one; add a new one, unless there are already too + ** many. + */ + if (typecnt >= TZ_MAX_TYPES) { + error(_("too many local time types")); + exit(EXIT_FAILURE); + } + if (! (-1L - 2147483647L <= gmtoff && gmtoff <= 2147483647L)) { + error(_("UT offset out of range")); + exit(EXIT_FAILURE); + } + gmtoffs[i] = gmtoff; +#ifdef ICU + rawoffs[i] = rawoff; + dstoffs[i] = dstoff; +#endif + isdsts[i] = isdst; + ttisstds[i] = ttisstd; + ttisgmts[i] = ttisgmt; + + for (j = 0; j < charcnt; ++j) + if (strcmp(&chars[j], abbr) == 0) + break; + if (j == charcnt) + newabbr(abbr); + abbrinds[i] = j; + ++typecnt; + return i; +} + +static void +leapadd(const zic_t t, const int positive, const int rolling, int count) +{ + register int i, j; + + if (leapcnt + (positive ? 
count : 1) > TZ_MAX_LEAPS) { + error(_("too many leap seconds")); + exit(EXIT_FAILURE); + } + for (i = 0; i < leapcnt; ++i) + if (t <= trans[i]) { + if (t == trans[i]) { + error(_("repeated leap second moment")); + exit(EXIT_FAILURE); + } + break; + } + do { + for (j = leapcnt; j > i; --j) { + trans[j] = trans[j - 1]; + corr[j] = corr[j - 1]; + roll[j] = roll[j - 1]; + } + trans[i] = t; + corr[i] = positive ? 1 : -count; + roll[i] = rolling; + ++leapcnt; + } while (positive && --count != 0); +} + +static void +adjleap(void) +{ + register int i; + register zic_t last = 0; + + /* + ** propagate leap seconds forward + */ + for (i = 0; i < leapcnt; ++i) { + trans[i] = tadd(trans[i], last); + last = corr[i] += last; + } +} + +static int +yearistype(const int year, const char *const type) +{ + static char * buf; + int result; + + if (type == NULL || *type == '\0') + return true; + buf = erealloc(buf, 132 + strlen(yitcommand) + strlen(type)); + (void) sprintf(buf, "%s %d %s", yitcommand, year, type); + result = system(buf); + if (WIFEXITED(result)) switch (WEXITSTATUS(result)) { + case 0: + return true; + case 1: + return false; + } + error(_("Wild result from command execution")); + (void) fprintf(stderr, _("%s: command was '%s', result was %d\n"), + progname, buf, result); + for ( ; ; ) + exit(EXIT_FAILURE); +} + +static int +lowerit(int a) +{ + a = (unsigned char) a; + return (isascii(a) && isupper(a)) ? 
tolower(a) : a; +} + +/* case-insensitive equality */ +static ATTRIBUTE_PURE int +ciequal(register const char *ap, register const char *bp) +{ + while (lowerit(*ap) == lowerit(*bp++)) + if (*ap++ == '\0') + return true; + return false; +} + +static ATTRIBUTE_PURE int +itsabbr(register const char *abbr, register const char *word) +{ + if (lowerit(*abbr) != lowerit(*word)) + return false; + ++word; + while (*++abbr != '\0') + do { + if (*word == '\0') + return false; + } while (lowerit(*word++) != lowerit(*abbr)); + return true; +} + +static ATTRIBUTE_PURE const struct lookup * +byword(register const char *const word, + register const struct lookup *const table) +{ + register const struct lookup * foundlp; + register const struct lookup * lp; + + if (word == NULL || table == NULL) + return NULL; + /* + ** Look for exact match. + */ + for (lp = table; lp->l_word != NULL; ++lp) + if (ciequal(word, lp->l_word)) + return lp; + /* + ** Look for inexact match. + */ + foundlp = NULL; + for (lp = table; lp->l_word != NULL; ++lp) + if (itsabbr(word, lp->l_word)) { + if (foundlp == NULL) + foundlp = lp; + else return NULL; /* multiple inexact matches */ + } + return foundlp; +} + +static char ** +getfields(register char *cp) +{ + register char * dp; + register char ** array; + register int nsubs; + + if (cp == NULL) + return NULL; + array = emalloc(size_product(strlen(cp) + 1, sizeof *array)); + nsubs = 0; + for ( ; ; ) { + while (isascii((unsigned char) *cp) && + isspace((unsigned char) *cp)) + ++cp; + if (*cp == '\0' || *cp == '#') + break; + array[nsubs++] = dp = cp; + do { + if ((*dp = *cp++) != '"') + ++dp; + else while ((*dp = *cp++) != '"') + if (*dp != '\0') + ++dp; + else { + error(_( + "Odd number of quotation marks" + )); + exit(1); + } + } while (*cp != '\0' && *cp != '#' && + (!isascii(*cp) || !isspace((unsigned char) *cp))); + if (isascii(*cp) && isspace((unsigned char) *cp)) + ++cp; + *dp = '\0'; + } + array[nsubs] = NULL; + return array; +} + +static 
ATTRIBUTE_PURE zic_t +oadd(const zic_t t1, const zic_t t2) +{ + if (t1 < 0 ? t2 < ZIC_MIN - t1 : ZIC_MAX - t1 < t2) { + error(_("time overflow")); + exit(EXIT_FAILURE); + } + return t1 + t2; +} + +static ATTRIBUTE_PURE zic_t +tadd(const zic_t t1, const zic_t t2) +{ + if (t1 == max_time && t2 > 0) + return max_time; + if (t1 == min_time && t2 < 0) + return min_time; + if (t1 < 0 ? t2 < min_time - t1 : max_time - t1 < t2) { + error(_("time overflow")); + exit(EXIT_FAILURE); + } + return t1 + t2; +} + +/* +** Given a rule, and a year, compute the date - in seconds since January 1, +** 1970, 00:00 LOCAL time - in that year that the rule refers to. +*/ + +static zic_t +rpytime(register const struct rule *const rp, register const zic_t wantedy) +{ + register int m, i; + register zic_t dayoff; /* with a nod to Margaret O. */ + register zic_t t, y; + + if (wantedy == ZIC_MIN) + return min_time; + if (wantedy == ZIC_MAX) + return max_time; + dayoff = 0; + m = TM_JANUARY; + y = EPOCH_YEAR; + while (wantedy != y) { + if (wantedy > y) { + i = len_years[isleap(y)]; + ++y; + } else { + --y; + i = -len_years[isleap(y)]; + } + dayoff = oadd(dayoff, i); + } + while (m != rp->r_month) { + i = len_months[isleap(y)][m]; + dayoff = oadd(dayoff, i); + ++m; + } + i = rp->r_dayofmonth; + if (m == TM_FEBRUARY && i == 29 && !isleap(y)) { + if (rp->r_dycode == DC_DOWLEQ) + --i; + else { + error(_("use of 2/29 in non leap-year")); + exit(EXIT_FAILURE); + } + } + --i; + dayoff = oadd(dayoff, i); + if (rp->r_dycode == DC_DOWGEQ || rp->r_dycode == DC_DOWLEQ) { + register zic_t wday; + +#define LDAYSPERWEEK ((zic_t) DAYSPERWEEK) + wday = EPOCH_WDAY; + /* + ** Don't trust mod of negative numbers. 
+ */ + if (dayoff >= 0) + wday = (wday + dayoff) % LDAYSPERWEEK; + else { + wday -= ((-dayoff) % LDAYSPERWEEK); + if (wday < 0) + wday += LDAYSPERWEEK; + } + while (wday != rp->r_wday) + if (rp->r_dycode == DC_DOWGEQ) { + dayoff = oadd(dayoff, 1); + if (++wday >= LDAYSPERWEEK) + wday = 0; + ++i; + } else { + dayoff = oadd(dayoff, -1); + if (--wday < 0) + wday = LDAYSPERWEEK - 1; + --i; + } + if (i < 0 || i >= len_months[isleap(y)][m]) { + if (noise) + warning(_("rule goes past start/end of month--\ +will not work with pre-2004 versions of zic")); + } + } + if (dayoff < min_time / SECSPERDAY) + return min_time; + if (dayoff > max_time / SECSPERDAY) + return max_time; + t = (zic_t) dayoff * SECSPERDAY; + return tadd(t, rp->r_tod); +} + +static void +newabbr(const char *const string) +{ + register int i; + + if (strcmp(string, GRANDPARENTED) != 0) { + register const char * cp; + const char * mp; + + /* + ** Want one to ZIC_MAX_ABBR_LEN_WO_WARN alphabetics + ** optionally followed by a + or - and a number from 1 to 14. 
+ */ + cp = string; + mp = NULL; + while (isascii((unsigned char) *cp) && + isalpha((unsigned char) *cp)) + ++cp; + if (cp - string == 0) +mp = _("time zone abbreviation lacks alphabetic at start"); + if (noise && cp - string < 3) +mp = _("time zone abbreviation has fewer than 3 alphabetics"); + if (cp - string > ZIC_MAX_ABBR_LEN_WO_WARN) +mp = _("time zone abbreviation has too many alphabetics"); + if (mp == NULL && (*cp == '+' || *cp == '-')) { + ++cp; + if (isascii((unsigned char) *cp) && + isdigit((unsigned char) *cp)) + if (*cp++ == '1' && + *cp >= '0' && *cp <= '4') + ++cp; + } + if (*cp != '\0') +mp = _("time zone abbreviation differs from POSIX standard"); + if (mp != NULL) + warning("%s (%s)", mp, string); + } + i = strlen(string) + 1; + if (charcnt + i > TZ_MAX_CHARS) { + error(_("too many, or too long, time zone abbreviations")); + exit(EXIT_FAILURE); + } + (void) strcpy(&chars[charcnt], string); + charcnt += i; +} + +static int +mkdirs(char *argname) +{ + register char * name; + register char * cp; + + if (argname == NULL || *argname == '\0') + return 0; + cp = name = ecpyalloc(argname); + while ((cp = strchr(cp + 1, '/')) != 0) { + *cp = '\0'; +#ifdef HAVE_DOS_FILE_NAMES + /* + ** DOS drive specifier? + */ + if (isalpha((unsigned char) name[0]) && + name[1] == ':' && name[2] == '\0') { + *cp = '/'; + continue; + } +#endif + if (!itsdir(name)) { + /* + ** It doesn't seem to exist, so we try to create it. + ** Creation may fail because of the directory being + ** created by some other multiprocessor, so we get + ** to do extra checking. + */ + if (mkdir(name, MKDIR_UMASK) != 0) { + const char *e = strerror(errno); + + if (errno != EEXIST || !itsdir(name)) { + (void) fprintf(stderr, +_("%s: Can't create directory %s: %s\n"), + progname, name, e); + free(name); + return -1; + } + } + } + *cp = '/'; + } + free(name); + return 0; +} + +/* +** UNIX was a registered trademark of The Open Group in 2003. +*/ |