diff options
Diffstat (limited to 'intl/icu/source/tools/toolutil')
50 files changed, 16359 insertions, 0 deletions
diff --git a/intl/icu/source/tools/toolutil/BUILD.bazel b/intl/icu/source/tools/toolutil/BUILD.bazel new file mode 100644 index 0000000000..276c857f12 --- /dev/null +++ b/intl/icu/source/tools/toolutil/BUILD.bazel @@ -0,0 +1,126 @@ +# © 2021 and later: Unicode, Inc. and others. +# License & terms of use: http://www.unicode.org/copyright.html + +# This Bazel build file defines targets that are dependencies for building +# the gennorm2 and genprops binaries. + +load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library") + +package( + default_visibility = ["//visibility:public"], +) + +cc_library( + name = "toolutil", + includes = ["."], + hdrs = ["toolutil.h"], + srcs = ["toolutil.cpp"], + local_defines = [ + "U_TOOLUTIL_IMPLEMENTATION", + ], + deps = ["//icu4c/source/common:platform"], +) + +cc_library( + name = "unewdata", + includes = ["."], + hdrs = ["unewdata.h"], + srcs = ["unewdata.cpp"], + local_defines = [ + "U_TOOLUTIL_IMPLEMENTATION", + ], + deps = [ + ":filestrm", + "//icu4c/source/common:platform", + ], +) + +cc_library( + name = "uoptions", + includes = ["."], + hdrs = ["uoptions.h"], + srcs = ["uoptions.cpp"], + local_defines = [ + "U_TOOLUTIL_IMPLEMENTATION", + ], + deps = ["//icu4c/source/common:platform"], +) + +cc_library( + name = "writesrc", + includes = ["."], + hdrs = ["writesrc.h"], + srcs = ["writesrc.cpp"], + local_defines = [ + "U_TOOLUTIL_IMPLEMENTATION", + ], + deps = [ + "//icu4c/source/common:bytestream", + "//icu4c/source/common:platform", + "//icu4c/source/common:uniset_core", + ], +) + +cc_library( + name = "uparse", + includes = ["."], + hdrs = ["uparse.h"], + srcs = ["uparse.cpp"], + local_defines = [ + "U_TOOLUTIL_IMPLEMENTATION", + ], + deps = [ + ":filestrm", + "//icu4c/source/common:platform", + ], +) + +cc_library( + name = "filestrm", + includes = ["."], + hdrs = ["filestrm.h"], + srcs = ["filestrm.cpp"], + local_defines = [ + "U_TOOLUTIL_IMPLEMENTATION", + ], + deps = ["//icu4c/source/common:platform"], +) + +cc_library( + 
name = "ppucd", + includes = ["."], + hdrs = ["ppucd.h"], + srcs = ["ppucd.cpp"], + local_defines = [ + "U_TOOLUTIL_IMPLEMENTATION", + ], + deps = [ + ":uparse", + "//icu4c/source/common:platform", + ], +) + +cc_library( + name = "denseranges", + includes = ["."], + hdrs = ["denseranges.h"], + srcs = ["denseranges.cpp"], + local_defines = [ + "U_TOOLUTIL_IMPLEMENTATION", + ], + deps = ["//icu4c/source/common:platform"], +) + +cc_library( + name = "collationinfo", + includes = ["."], + hdrs = ["collationinfo.h"], + srcs = ["collationinfo.cpp"], + local_defines = [ + "U_TOOLUTIL_IMPLEMENTATION", + ], + deps = [ + "//icu4c/source/common:platform", + "//icu4c/source/i18n:headers", + ], +) diff --git a/intl/icu/source/tools/toolutil/Makefile.in b/intl/icu/source/tools/toolutil/Makefile.in new file mode 100644 index 0000000000..c9fd89b0f0 --- /dev/null +++ b/intl/icu/source/tools/toolutil/Makefile.in @@ -0,0 +1,155 @@ +# Copyright (C) 2016 and later: Unicode, Inc. and others. +# License & terms of use: http://www.unicode.org/copyright.html +#****************************************************************************** +# +# Copyright (C) 1999-2014, International Business Machines +# Corporation and others. All Rights Reserved. +# +#****************************************************************************** +## Makefile.in for ICU - tools/toolutil +## Steven R. Loomis + +## Source directory information +srcdir = @srcdir@ +top_srcdir = @top_srcdir@ + +top_builddir = ../.. + +## All the flags and other definitions are included here. 
+include $(top_builddir)/icudefs.mk + +## Build directory information +subdir = tools/toolutil + +## Extra files to remove for 'make clean' +CLEANFILES = *~ $(DEPS) $(IMPORT_LIB) $(MIDDLE_IMPORT_LIB) $(FINAL_IMPORT_LIB) + +## Target information + +TARGET_STUBNAME=$(TOOLUTIL_STUBNAME) + +ifneq ($(ENABLE_STATIC),) +TARGET = $(LIBDIR)/$(LIBSICU)$(TARGET_STUBNAME)$(ICULIBSUFFIX).$(A) +endif + +ifneq ($(ENABLE_SHARED),) +SO_TARGET = $(LIBDIR)/$(LIBICU)$(TARGET_STUBNAME)$(ICULIBSUFFIX).$(SO) +ALL_SO_TARGETS = $(SO_TARGET) $(MIDDLE_SO_TARGET) $(FINAL_SO_TARGET) $(SHARED_OBJECT) +endif + +ALL_TARGETS = $(TARGET) $(ALL_SO_TARGETS) + +DYNAMICCPPFLAGS = $(SHAREDLIBCPPFLAGS) +DYNAMICCFLAGS = $(SHAREDLIBCFLAGS) +DYNAMICCXXFLAGS = $(SHAREDLIBCXXFLAGS) +CFLAGS += $(LIBCFLAGS) +CXXFLAGS += $(LIBCXXFLAGS) + +CPPFLAGS += -I$(srcdir) -I$(top_srcdir)/common -I$(top_srcdir)/i18n $(LIBCPPFLAGS) + +# from icuinfo +CPPFLAGS+= "-DU_BUILD=\"@build@\"" "-DU_HOST=\"@host@\"" "-DU_CC=\"@CC@\"" "-DU_CXX=\"@CXX@\"" +CPPFLAGS += -DUNISTR_FROM_CHAR_EXPLICIT=explicit -DUNISTR_FROM_STRING_EXPLICIT=explicit + +DEFS += -DU_TOOLUTIL_IMPLEMENTATION +LDFLAGS += $(LDFLAGSICUTOOLUTIL) +LIBS = $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS) +# Read sources.txt once at parse time (simple expansion) rather than re-running cat on every reference. +SOURCES := $(shell cat $(srcdir)/sources.txt) +OBJECTS = $(SOURCES:.cpp=.o) + +STATIC_OBJECTS = $(OBJECTS:.o=.$(STATIC_O)) + +DEPS = $(OBJECTS:.o=.d) + +-include Makefile.local + +## List of phony targets +.PHONY : all all-local install install-local clean clean-local \ +distclean distclean-local install-library dist \ +dist-local check check-local + +## Clear suffix list +.SUFFIXES : + +## List of standard targets +all: all-local +install: install-local +clean: clean-local +distclean : distclean-local +dist: dist-local +check: all check-local + +all-local: $(ALL_TARGETS) + +install-local: install-library + +install-library: all-local + $(MKINSTALLDIRS) $(DESTDIR)$(libdir) +ifneq ($(ENABLE_STATIC),) + $(INSTALL-L) $(TARGET) $(DESTDIR)$(libdir) +endif +ifneq 
($(ENABLE_SHARED),) +# For MinGW, do we want the DLL to go in the bin location? +ifeq ($(MINGW_MOVEDLLSTOBINDIR),YES) + $(MKINSTALLDIRS) $(DESTDIR)$(bindir) + $(INSTALL-L) $(FINAL_SO_TARGET) $(DESTDIR)$(bindir) +else + $(INSTALL-L) $(FINAL_SO_TARGET) $(DESTDIR)$(libdir) +ifneq ($(FINAL_SO_TARGET),$(SO_TARGET)) + cd $(DESTDIR)$(libdir) && $(RM) $(notdir $(SO_TARGET)) && ln -s $(notdir $(FINAL_SO_TARGET)) $(notdir $(SO_TARGET)) +ifneq ($(FINAL_SO_TARGET),$(MIDDLE_SO_TARGET)) + cd $(DESTDIR)$(libdir) && $(RM) $(notdir $(MIDDLE_SO_TARGET)) && ln -s $(notdir $(FINAL_SO_TARGET)) $(notdir $(MIDDLE_SO_TARGET)) +endif +endif +endif +ifneq ($(IMPORT_LIB_EXT),) + $(INSTALL-L) $(FINAL_IMPORT_LIB) $(DESTDIR)$(libdir) +ifneq ($(IMPORT_LIB),$(FINAL_IMPORT_LIB)) + cd $(DESTDIR)$(libdir) && $(RM) $(notdir $(IMPORT_LIB)) && ln -s $(notdir $(FINAL_IMPORT_LIB)) $(notdir $(IMPORT_LIB)) +endif +ifneq ($(MIDDLE_IMPORT_LIB),$(FINAL_IMPORT_LIB)) + cd $(DESTDIR)$(libdir) && $(RM) $(notdir $(MIDDLE_IMPORT_LIB)) && ln -s $(notdir $(FINAL_IMPORT_LIB)) $(notdir $(MIDDLE_IMPORT_LIB)) +endif +endif +endif + +dist-local: + +clean-local: + test -z "$(CLEANFILES)" || $(RMV) $(CLEANFILES) + $(RMV) $(OBJECTS) $(STATIC_OBJECTS) $(ALL_TARGETS) + +distclean-local: clean-local + $(RMV) Makefile + +check-local: all-local + +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + cd $(top_builddir) \ + && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status + +ifneq ($(ENABLE_STATIC),) +$(TARGET): $(STATIC_OBJECTS) + $(AR) $(ARFLAGS) $(AR_OUTOPT)$@ $^ + $(RANLIB) $@ +endif + +ifneq ($(ENABLE_SHARED),) +$(SHARED_OBJECT): $(OBJECTS) + $(SHLIB.cc) $(LD_SONAME) $(OUTOPT)$@ $^ $(LIBS) +ifeq ($(ENABLE_RPATH),YES) +ifneq ($(wildcard $(libdir)/$(MIDDLE_SO_TARGET)),) + $(warning RPATH warning: --enable-rpath means test programs may use existing $(libdir)/$(MIDDLE_SO_TARGET)) +endif +endif +endif + +ifeq (,$(MAKECMDGOALS)) +-include $(DEPS) +else +ifneq ($(patsubst %clean,,$(MAKECMDGOALS)),) 
+-include $(DEPS) +endif +endif + diff --git a/intl/icu/source/tools/toolutil/collationinfo.cpp b/intl/icu/source/tools/toolutil/collationinfo.cpp new file mode 100644 index 0000000000..6bad90e133 --- /dev/null +++ b/intl/icu/source/tools/toolutil/collationinfo.cpp @@ -0,0 +1,152 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* Copyright (C) 2013-2015, International Business Machines +* Corporation and others. All Rights Reserved. +******************************************************************************* +* collationinfo.cpp +* +* created on: 2013aug05 +* created by: Markus W. Scherer +*/ + +#include <stdio.h> +#include <string.h> + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_COLLATION + +#include "collationdata.h" +#include "collationdatareader.h" +#include "collationinfo.h" +#include "uassert.h" +#include "uvectr32.h" + +U_NAMESPACE_BEGIN + +void +CollationInfo::printSizes(int32_t sizeWithHeader, const int32_t indexes[]) { + int32_t totalSize = indexes[CollationDataReader::IX_TOTAL_SIZE]; + if(sizeWithHeader > totalSize) { + printf(" header size: %6ld\n", (long)(sizeWithHeader - totalSize)); + } + + int32_t length = indexes[CollationDataReader::IX_INDEXES_LENGTH]; + printf(" indexes: %6ld *4 = %6ld\n", (long)length, (long)length * 4); + + length = getDataLength(indexes, CollationDataReader::IX_REORDER_CODES_OFFSET); + if(length != 0) { + printf(" reorder codes: %6ld *4 = %6ld\n", (long)length / 4, (long)length); + } + + length = getDataLength(indexes, CollationDataReader::IX_REORDER_TABLE_OFFSET); + if(length != 0) { + U_ASSERT(length >= 256); + printf(" reorder table: %6ld\n", (long)length); + } + + length = getDataLength(indexes, CollationDataReader::IX_TRIE_OFFSET); + if(length != 0) { + printf(" trie size: %6ld\n", (long)length); + } + + length = getDataLength(indexes, 
CollationDataReader::IX_RESERVED8_OFFSET); + if(length != 0) { + printf(" reserved (offset 8): %6ld\n", (long)length); + } + + length = getDataLength(indexes, CollationDataReader::IX_CES_OFFSET); + if(length != 0) { + printf(" CEs: %6ld *8 = %6ld\n", (long)length / 8, (long)length); + } + + length = getDataLength(indexes, CollationDataReader::IX_RESERVED10_OFFSET); + if(length != 0) { + printf(" reserved (offset 10): %6ld\n", (long)length); + } + + length = getDataLength(indexes, CollationDataReader::IX_CE32S_OFFSET); + if(length != 0) { + printf(" CE32s: %6ld *4 = %6ld\n", (long)length / 4, (long)length); + } + + length = getDataLength(indexes, CollationDataReader::IX_ROOT_ELEMENTS_OFFSET); + if(length != 0) { + printf(" rootElements: %6ld *4 = %6ld\n", (long)length / 4, (long)length); + } + + length = getDataLength(indexes, CollationDataReader::IX_CONTEXTS_OFFSET); + if(length != 0) { + printf(" contexts: %6ld *2 = %6ld\n", (long)length / 2, (long)length); + } + + length = getDataLength(indexes, CollationDataReader::IX_UNSAFE_BWD_OFFSET); + if(length != 0) { + printf(" unsafeBwdSet: %6ld *2 = %6ld\n", (long)length / 2, (long)length); + } + + length = getDataLength(indexes, CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET); + if(length != 0) { + printf(" fastLatin table: %6ld *2 = %6ld\n", (long)length / 2, (long)length); + } + + length = getDataLength(indexes, CollationDataReader::IX_SCRIPTS_OFFSET); + if(length != 0) { + printf(" scripts data: %6ld *2 = %6ld\n", (long)length / 2, (long)length); + } + + length = getDataLength(indexes, CollationDataReader::IX_COMPRESSIBLE_BYTES_OFFSET); + if(length != 0) { + U_ASSERT(length >= 256); + printf(" compressibleBytes: %6ld\n", (long)length); + } + + length = getDataLength(indexes, CollationDataReader::IX_RESERVED18_OFFSET); + if(length != 0) { + printf(" reserved (offset 18): %6ld\n", (long)length); + } + + printf(" collator binary total size: %6ld\n", (long)sizeWithHeader); +} + +int32_t 
+CollationInfo::getDataLength(const int32_t indexes[], int32_t startIndex) { + return indexes[startIndex + 1] - indexes[startIndex]; +} + +void +CollationInfo::printReorderRanges(const CollationData &data, const int32_t *codes, int32_t length) { + UErrorCode errorCode = U_ZERO_ERROR; + UVector32 ranges(errorCode); + data.makeReorderRanges(codes, length, ranges, errorCode); + if(U_FAILURE(errorCode)) { + printf(" error building reorder ranges: %s\n", u_errorName(errorCode)); + return; + } + + int32_t start = 0; + for(int32_t i = 0; i < ranges.size(); ++i) { + int32_t pair = ranges.elementAti(i); + int32_t limit = (pair >> 16) & 0xffff; + int16_t offset = (int16_t)pair; + if(offset == 0) { + // [inclusive-start, exclusive-limit[ + printf(" [%04x, %04x[\n", start, limit); + } else if(offset > 0) { + printf(" reorder [%04x, %04x[ by offset %02x to [%04x, %04x[\n", + start, limit, offset, + start + (offset << 8), limit + (offset << 8)); + } else /* offset < 0 */ { + printf(" reorder [%04x, %04x[ by offset -%02x to [%04x, %04x[\n", + start, limit, -offset, + start + (offset << 8), limit + (offset << 8)); + } + start = limit; + } +} + +U_NAMESPACE_END + +#endif // !UCONFIG_NO_COLLATION diff --git a/intl/icu/source/tools/toolutil/collationinfo.h b/intl/icu/source/tools/toolutil/collationinfo.h new file mode 100644 index 0000000000..815b89d40d --- /dev/null +++ b/intl/icu/source/tools/toolutil/collationinfo.h @@ -0,0 +1,42 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* Copyright (C) 2013-2015, International Business Machines +* Corporation and others. All Rights Reserved. +******************************************************************************* +* collationinfo.h +* +* created on: 2013aug05 +* created by: Markus W. 
Scherer +*/ + +#ifndef __COLLATIONINFO_H__ +#define __COLLATIONINFO_H__ + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_COLLATION + +U_NAMESPACE_BEGIN + +struct CollationData; + +/** + * Collation-related code for tools & demos. + */ +class U_TOOLUTIL_API CollationInfo /* all static */ { +public: + static void printSizes(int32_t sizeWithHeader, const int32_t indexes[]); + static void printReorderRanges(const CollationData &data, const int32_t *codes, int32_t length); + +private: + CollationInfo(); // no constructor + + static int32_t getDataLength(const int32_t indexes[], int32_t startIndex); +}; + +U_NAMESPACE_END + +#endif // !UCONFIG_NO_COLLATION +#endif // __COLLATIONINFO_H__ diff --git a/intl/icu/source/tools/toolutil/dbgutil.cpp b/intl/icu/source/tools/toolutil/dbgutil.cpp new file mode 100644 index 0000000000..d42b267f73 --- /dev/null +++ b/intl/icu/source/tools/toolutil/dbgutil.cpp @@ -0,0 +1,160 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/******************************************************************** + * COPYRIGHT: + * Copyright (c) 2007-2012, International Business Machines Corporation and + * others. All Rights Reserved. 
+ ********************************************************************/ + +#include "udbgutil.h" +#include "dbgutil.h" + +#if !UCONFIG_NO_FORMATTING + +#include "unicode/unistr.h" +#include "unicode/ustring.h" +#include "util.h" +#include "ucln.h" + +#include <stdio.h> +#include <string.h> +#include <stdlib.h> + +U_NAMESPACE_USE + +static UnicodeString **strs = nullptr; + +static const UnicodeString& _fieldString(UDebugEnumType type, int32_t field, UnicodeString& fillin) { + const char *str = udbg_enumName(type, field); + if(str == nullptr) { + return fillin.remove(); + } else { + return fillin = UnicodeString(str, -1, US_INV); + } +} + +U_CDECL_BEGIN +static void udbg_cleanup() { + if(strs != nullptr) { + for(int t=0;t<=UDBG_ENUM_COUNT;t++) { + delete [] strs[t]; + } + delete[] strs; + strs = nullptr; + } +} + +static UBool tu_cleanup() +{ + udbg_cleanup(); + return true; +} + +static void udbg_register_cleanup() { + ucln_registerCleanup(UCLN_TOOLUTIL, tu_cleanup); +} +U_CDECL_END + +static void udbg_setup() { + if(strs == nullptr) { + udbg_register_cleanup(); + //fprintf(stderr,"Initializing string cache..\n"); + //fflush(stderr); + UnicodeString **newStrs = new UnicodeString*[UDBG_ENUM_COUNT+1]; + for(int t=0;t<UDBG_ENUM_COUNT;t++) { + int32_t c = udbg_enumCount((UDebugEnumType)t); + newStrs[t] = new UnicodeString[c+1]; + for(int f=0;f<=c;f++) { + _fieldString((UDebugEnumType)t, f, newStrs[t][f]); + } + } + newStrs[UDBG_ENUM_COUNT] = new UnicodeString[1]; // empty string + + strs = newStrs; + } +} + + + +U_TOOLUTIL_API const UnicodeString& U_EXPORT2 udbg_enumString(UDebugEnumType type, int32_t field) { + if(strs == nullptr ) { + udbg_setup(); + } + if(type<0||type>=UDBG_ENUM_COUNT) { + // use UDBG_ENUM_COUNT,0 to mean an empty string + //fprintf(stderr, "** returning out of range on %d\n",type); + //fflush(stderr); + return strs[UDBG_ENUM_COUNT][0]; + } + int32_t count = udbg_enumCount(type); + //fprintf(stderr, "enumString [%d,%d]: typecount %d, fieldcount 
%d\n", type,field,UDBG_ENUM_COUNT,count); + //fflush(stderr); + if(field<0 || field > count) { + return strs[type][count]; + } else { return strs[type][field]; + } +} + +U_CAPI int32_t U_EXPORT2 udbg_enumByString(UDebugEnumType type, const UnicodeString& string) { + if(type<0||type>=UDBG_ENUM_COUNT) { + return -1; + } + // initialize array + udbg_enumString(type,0); + // search + /// printf("type=%d\n", type); fflush(stdout); + for(int i=0;i<udbg_enumCount(type);i++) { +// printf("i=%d/%d\n", i, udbg_enumCount(type)); fflush(stdout); + if(string == (strs[type][i])) { + return i; + } + } + return -1; +} + +// from DataMap::utoi +U_CAPI int32_t +udbg_stoi(const UnicodeString &s) +{ + char ch[256]; + const char16_t *u = toUCharPtr(s.getBuffer()); + int32_t len = s.length() < (int32_t)sizeof(ch) ? s.length() : (int32_t)sizeof(ch) - 1; /* clamp: ch[len] below must stay inside ch[] */ + u_UCharsToChars(u, ch, len); + ch[len] = 0; /* include terminating \0 */ + return atoi(ch); +} + + +U_CAPI double +udbg_stod(const UnicodeString &s) +{ + char ch[256]; + const char16_t *u = toUCharPtr(s.getBuffer()); + int32_t len = s.length() < (int32_t)sizeof(ch) ? s.length() : (int32_t)sizeof(ch) - 1; /* clamp: ch[len] below must stay inside ch[] */ + u_UCharsToChars(u, ch, len); + ch[len] = 0; /* include terminating \0 */ + return atof(ch); +} + +U_CAPI UnicodeString * +udbg_escape(const UnicodeString &src, UnicodeString *dst) +{ + dst->remove(); + for (int32_t i = 0; i < src.length(); ++i) { + char16_t c = src[i]; + if(ICU_Utility::isUnprintable(c)) { + *dst += UnicodeString("["); + ICU_Utility::escapeUnprintable(*dst, c); + *dst += UnicodeString("]"); + } + else { + *dst += c; + } + } + + return dst; +} + + + +#endif diff --git a/intl/icu/source/tools/toolutil/dbgutil.h b/intl/icu/source/tools/toolutil/dbgutil.h new file mode 100644 index 0000000000..43fe2171b4 --- /dev/null +++ b/intl/icu/source/tools/toolutil/dbgutil.h @@ -0,0 +1,45 @@ +// © 2016 and later: Unicode, Inc. and others. 
+// License & terms of use: http://www.unicode.org/copyright.html + +/* +************************************************************************ +* Copyright (c) 2007-2012, International Business Machines +* Corporation and others. All Rights Reserved. +************************************************************************ +*/ + +/** C++ Utilities to aid in debugging **/ + +#ifndef _DBGUTIL_H +#define _DBGUTIL_H + +#include "unicode/utypes.h" +#include "udbgutil.h" +#include "unicode/unistr.h" + +#if !UCONFIG_NO_FORMATTING + +U_TOOLUTIL_API const icu::UnicodeString& U_EXPORT2 +udbg_enumString(UDebugEnumType type, int32_t field); + +/** + * @return enum offset, or UDBG_INVALID_ENUM on error + */ +U_CAPI int32_t U_EXPORT2 +udbg_enumByString(UDebugEnumType type, const icu::UnicodeString& string); + +/** + * Convert a UnicodeString (with ascii digits) into a number. + * @param s string + * @return numerical value, or 0 on error + */ +U_CAPI int32_t U_EXPORT2 udbg_stoi(const icu::UnicodeString &s); + +U_CAPI double U_EXPORT2 udbg_stod(const icu::UnicodeString &s); + +U_CAPI icu::UnicodeString * U_EXPORT2 +udbg_escape(const icu::UnicodeString &s, icu::UnicodeString *dst); + +#endif + +#endif diff --git a/intl/icu/source/tools/toolutil/denseranges.cpp b/intl/icu/source/tools/toolutil/denseranges.cpp new file mode 100644 index 0000000000..f5e52b1bbb --- /dev/null +++ b/intl/icu/source/tools/toolutil/denseranges.cpp @@ -0,0 +1,160 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* Copyright (C) 2010, International Business Machines +* Corporation and others. All Rights Reserved. +******************************************************************************* +* file name: denseranges.cpp +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2010sep25 +* created by: Markus W. 
Scherer +* +* Helper code for finding a small number of dense ranges. +*/ + +#include "unicode/utypes.h" +#include "denseranges.h" + +// Definitions in the anonymous namespace are invisible outside this file. +namespace { + +/** + * Collect up to 15 range gaps and keep them sorted by descending gap size (largest gap at index 0). + */ +class LargestGaps { +public: + LargestGaps(int32_t max) : maxLength(max<=kCapacity ? max : kCapacity), length(0) {} + + void add(int32_t gapStart, int64_t gapLength) { + int32_t i=length; + while(i>0 && gapLength>gapLengths[i-1]) { + --i; + } + if(i<maxLength) { + // The new gap is now one of the maxLength largest. + // Insert the new gap, moving up smaller ones of the previous + // length largest. + int32_t j= length<maxLength ? length++ : maxLength-1; + while(j>i) { + gapStarts[j]=gapStarts[j-1]; + gapLengths[j]=gapLengths[j-1]; + --j; + } + gapStarts[i]=gapStart; + gapLengths[i]=gapLength; + } + } + + void truncate(int32_t newLength) { + if(newLength<length) { + length=newLength; + } + } + + int32_t count() const { return length; } + int32_t gapStart(int32_t i) const { return gapStarts[i]; } + int64_t gapLength(int32_t i) const { return gapLengths[i]; } + + int32_t firstAfter(int32_t value) const { + if(length==0) { + return -1; + } + int32_t minValue=0; + int32_t minIndex=-1; + for(int32_t i=0; i<length; ++i) { + if(value<gapStarts[i] && (minIndex<0 || gapStarts[i]<minValue)) { + minValue=gapStarts[i]; + minIndex=i; + } + } + return minIndex; + } + +private: + static const int32_t kCapacity=15; + + int32_t maxLength; + int32_t length; + int32_t gapStarts[kCapacity]; + int64_t gapLengths[kCapacity]; +}; + +} // namespace + +/** + * Does it make sense to write 1..capacity ranges? + * Returns 0 if not, otherwise the number of ranges. + * @param values Sorted array of signed-integer values. + * @param length Number of values. + * @param density Minimum average range density, in 256th. (0x100=100%=perfectly dense.) + * Should be 0x80..0x100, must be 1..0x100. 
+ * @param ranges Output ranges array. + * @param capacity Maximum number of ranges. + * @return Minimum number of ranges (at most capacity) that have the desired density, + * or 0 if that density cannot be achieved. + */ +U_CAPI int32_t U_EXPORT2 +uprv_makeDenseRanges(const int32_t values[], int32_t length, + int32_t density, + int32_t ranges[][2], int32_t capacity) { + if(length<=2) { + return 0; + } + int32_t minValue=values[0]; + int32_t maxValue=values[length-1]; // Assume minValue<=maxValue. + // Use int64_t variables for intermediate-value precision and to avoid + // signed-int32_t overflow of maxValue-minValue. + int64_t maxLength=(int64_t)maxValue-(int64_t)minValue+1; + if(length>=(density*maxLength)/0x100) { + // Use one range. + ranges[0][0]=minValue; + ranges[0][1]=maxValue; + return 1; + } + if(length<=4) { + return 0; + } + // See if we can split [minValue, maxValue] into 2..capacity ranges, + // divided by the 1..(capacity-1) largest gaps. + LargestGaps gaps(capacity-1); + int32_t i; + int32_t expectedValue=minValue; + for(i=1; i<length; ++i) { + ++expectedValue; + int32_t actualValue=values[i]; + if(expectedValue!=actualValue) { + gaps.add(expectedValue, (int64_t)actualValue-(int64_t)expectedValue); + expectedValue=actualValue; + } + } + // We know gaps.count()>=1 because we have fewer values (length) than + // the length of the [minValue..maxValue] range (maxLength). + // (Otherwise we would have returned with the one range above.) + int32_t num; + for(i=0, num=2;; ++i, ++num) { + if(i>=gaps.count()) { + // The values are too sparse for capacity or fewer ranges + // of the requested density. + return 0; + } + maxLength-=gaps.gapLength(i); + if(length>num*2 && length>=(density*maxLength)/0x100) { + break; + } + } + // Use the num ranges with the num-1 largest gaps. 
+ gaps.truncate(num-1); + ranges[0][0]=minValue; + for(i=0; i<=num-2; ++i) { + int32_t gapIndex=gaps.firstAfter(minValue); + int32_t gapStart=gaps.gapStart(gapIndex); + ranges[i][1]=gapStart-1; + ranges[i+1][0]=minValue=(int32_t)(gapStart+gaps.gapLength(gapIndex)); + } + ranges[num-1][1]=maxValue; + return num; +} diff --git a/intl/icu/source/tools/toolutil/denseranges.h b/intl/icu/source/tools/toolutil/denseranges.h new file mode 100644 index 0000000000..c489ca47d8 --- /dev/null +++ b/intl/icu/source/tools/toolutil/denseranges.h @@ -0,0 +1,41 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* Copyright (C) 2010, International Business Machines +* Corporation and others. All Rights Reserved. +******************************************************************************* +* file name: denseranges.h +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2010sep25 +* created by: Markus W. Scherer +* +* Helper code for finding a small number of dense ranges. +*/ + +#ifndef __DENSERANGES_H__ +#define __DENSERANGES_H__ + +#include "unicode/utypes.h" + +/** + * Does it make sense to write 1..capacity ranges? + * Returns 0 if not, otherwise the number of ranges. + * @param values Sorted array of signed-integer values. + * @param length Number of values. + * @param density Minimum average range density, in 256th. (0x100=100%=perfectly dense.) + * Should be 0x80..0x100, must be 1..0x100. + * @param ranges Output ranges array. + * @param capacity Maximum number of ranges. + * @return Minimum number of ranges (at most capacity) that have the desired density, + * or 0 if that density cannot be achieved. 
+ */ +U_CAPI int32_t U_EXPORT2 +uprv_makeDenseRanges(const int32_t values[], int32_t length, + int32_t density, + int32_t ranges[][2], int32_t capacity); + +#endif // __DENSERANGES_H__ diff --git a/intl/icu/source/tools/toolutil/filestrm.cpp b/intl/icu/source/tools/toolutil/filestrm.cpp new file mode 100644 index 0000000000..9a2695197a --- /dev/null +++ b/intl/icu/source/tools/toolutil/filestrm.cpp @@ -0,0 +1,227 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +****************************************************************************** +* +* Copyright (C) 1997-2011, International Business Machines +* Corporation and others. All Rights Reserved. +* +****************************************************************************** +* +* File FILESTRM.C +* +* @author Glenn Marcy +* +* Modification History: +* +* Date Name Description +* 5/8/98 gm Created +* 03/02/99 stephen Reordered params in ungetc to match stdio +* Added wopen +* 3/29/99 helena Merged Stephen and Bertrand's changes. +* +****************************************************************************** +*/ + +#include "filestrm.h" + +#include "cmemory.h" + +#include <stdio.h> + +U_CAPI FileStream* U_EXPORT2 +T_FileStream_open(const char* filename, const char* mode) +{ + if(filename != nullptr && *filename != 0 && mode != nullptr && *mode != 0) { + FILE *file = fopen(filename, mode); + return (FileStream*)file; + } else { + return nullptr; + } +} + +/* +U_CAPI FileStream* U_EXPORT2 +T_FileStream_wopen(const wchar_t* filename, const wchar_t* mode) +{ + // TBD: _wfopen is believed to be MS-specific? 
+#if U_PLATFORM_USES_ONLY_WIN32_API + FILE* result = _wfopen(filename, mode); + return (FileStream*)result; +#else + size_t fnMbsSize, mdMbsSize; + char *fn, *md; + FILE *result; + + // convert from wchar_t to char + fnMbsSize = wcstombs(nullptr, filename, ((size_t)-1) >> 1); + fn = (char*)uprv_malloc(fnMbsSize+2); + wcstombs(fn, filename, fnMbsSize); + fn[fnMbsSize] = 0; + + mdMbsSize = wcstombs(nullptr, mode, ((size_t)-1) >> 1); + md = (char*)uprv_malloc(mdMbsSize+2); + wcstombs(md, mode, mdMbsSize); + md[mdMbsSize] = 0; + + result = fopen(fn, md); + uprv_free(fn); + uprv_free(md); + return (FileStream*)result; +#endif +} +*/ +U_CAPI void U_EXPORT2 +T_FileStream_close(FileStream* fileStream) +{ + if (fileStream != 0) + fclose((FILE*)fileStream); +} + +U_CAPI UBool U_EXPORT2 +T_FileStream_file_exists(const char* filename) +{ + FILE* temp = fopen(filename, "r"); + if (temp) { + fclose(temp); + return true; + } else + return false; +} + +/*static const int32_t kEOF; +const int32_t FileStream::kEOF = EOF;*/ + +/* +U_CAPI FileStream* +T_FileStream_tmpfile() +{ + FILE* file = tmpfile(); + return (FileStream*)file; +} +*/ + +U_CAPI int32_t U_EXPORT2 +T_FileStream_read(FileStream* fileStream, void* addr, int32_t len) +{ + return static_cast<int32_t>(fread(addr, 1, len, (FILE*)fileStream)); +} + +U_CAPI int32_t U_EXPORT2 +T_FileStream_write(FileStream* fileStream, const void* addr, int32_t len) +{ + + return static_cast<int32_t>(fwrite(addr, 1, len, (FILE*)fileStream)); +} + +U_CAPI void U_EXPORT2 +T_FileStream_rewind(FileStream* fileStream) +{ + rewind((FILE*)fileStream); +} + +U_CAPI int32_t U_EXPORT2 +T_FileStream_putc(FileStream* fileStream, int32_t ch) +{ + int32_t c = fputc(ch, (FILE*)fileStream); + return c; +} + +U_CAPI int U_EXPORT2 +T_FileStream_getc(FileStream* fileStream) +{ + int c = fgetc((FILE*)fileStream); + return c; +} + +U_CAPI int32_t U_EXPORT2 +T_FileStream_ungetc(int32_t ch, FileStream* fileStream) +{ + + int32_t c = ungetc(ch, (FILE*)fileStream); + 
return c; +} + +U_CAPI int32_t U_EXPORT2 +T_FileStream_peek(FileStream* fileStream) +{ + int32_t c = fgetc((FILE*)fileStream); + return ungetc(c, (FILE*)fileStream); +} + +U_CAPI char* U_EXPORT2 +T_FileStream_readLine(FileStream* fileStream, char* buffer, int32_t length) +{ + return fgets(buffer, length, (FILE*)fileStream); +} + +U_CAPI int32_t U_EXPORT2 +T_FileStream_writeLine(FileStream* fileStream, const char* buffer) +{ + return fputs(buffer, (FILE*)fileStream); +} + +U_CAPI int32_t U_EXPORT2 +T_FileStream_size(FileStream* fileStream) +{ + int32_t savedPos = ftell((FILE*)fileStream); + int32_t size = 0; + + /*Changes by Bertrand A. D. doesn't affect the current position + goes to the end of the file before ftell*/ + fseek((FILE*)fileStream, 0, SEEK_END); + size = (int32_t)ftell((FILE*)fileStream); + fseek((FILE*)fileStream, savedPos, SEEK_SET); + return size; +} + +U_CAPI int U_EXPORT2 +T_FileStream_eof(FileStream* fileStream) +{ + return feof((FILE*)fileStream); +} + +/* + Warning + This function may not work consistently on all platforms + (e.g. HP-UX, FreeBSD and MacOSX don't return an error when + putc is used on a file opened as readonly) +*/ +U_CAPI int U_EXPORT2 +T_FileStream_error(FileStream* fileStream) +{ + return (fileStream == 0 || ferror((FILE*)fileStream)); +} + +/* This function doesn't work. 
*/ +/* force the stream to set its error flag*/ +/*U_CAPI void U_EXPORT2 +T_FileStream_setError(FileStream* fileStream) +{ + fseek((FILE*)fileStream, 99999, SEEK_SET); +} +*/ + +U_CAPI FileStream* U_EXPORT2 +T_FileStream_stdin() +{ + return (FileStream*)stdin; +} + +U_CAPI FileStream* U_EXPORT2 +T_FileStream_stdout() +{ + return (FileStream*)stdout; +} + + +U_CAPI FileStream* U_EXPORT2 +T_FileStream_stderr() +{ + return (FileStream*)stderr; +} + +U_CAPI UBool U_EXPORT2 +T_FileStream_remove(const char* fileName){ + return (remove(fileName) == 0); +} diff --git a/intl/icu/source/tools/toolutil/filestrm.h b/intl/icu/source/tools/toolutil/filestrm.h new file mode 100644 index 0000000000..86fac3063f --- /dev/null +++ b/intl/icu/source/tools/toolutil/filestrm.h @@ -0,0 +1,106 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +****************************************************************************** +* +* Copyright (C) 1997-2005, International Business Machines +* Corporation and others. All Rights Reserved. +* +****************************************************************************** +* +* File FILESTRM.H +* +* Contains FileStream interface +* +* @author Glenn Marcy +* +* Modification History: +* +* Date Name Description +* 5/8/98 gm Created. 
+* 03/02/99 stephen Reordered params in ungetc to match stdio +* Added wopen +* +****************************************************************************** +*/ + +#ifndef FILESTRM_H +#define FILESTRM_H + +#include "unicode/utypes.h" + +typedef struct _FileStream FileStream; + +U_CAPI FileStream* U_EXPORT2 +T_FileStream_open(const char* filename, const char* mode); + +/* +U_CAPI FileStream* U_EXPORT2 +T_FileStream_wopen(const wchar_t* filename, const wchar_t* mode); +*/ +U_CAPI void U_EXPORT2 +T_FileStream_close(FileStream* fileStream); + +U_CAPI UBool U_EXPORT2 +T_FileStream_file_exists(const char* filename); + +/* +U_CAPI FileStream* U_EXPORT2 +T_FileStream_tmpfile(void); +*/ + +U_CAPI int32_t U_EXPORT2 +T_FileStream_read(FileStream* fileStream, void* addr, int32_t len); + +U_CAPI int32_t U_EXPORT2 +T_FileStream_write(FileStream* fileStream, const void* addr, int32_t len); + +U_CAPI void U_EXPORT2 +T_FileStream_rewind(FileStream* fileStream); + +/*Added by Bertrand A. D. */ +U_CAPI char * U_EXPORT2 +T_FileStream_readLine(FileStream* fileStream, char* buffer, int32_t length); + +U_CAPI int32_t U_EXPORT2 +T_FileStream_writeLine(FileStream* fileStream, const char* buffer); + +U_CAPI int32_t U_EXPORT2 +T_FileStream_putc(FileStream* fileStream, int32_t ch); + +U_CAPI int U_EXPORT2 +T_FileStream_getc(FileStream* fileStream); + +U_CAPI int32_t U_EXPORT2 +T_FileStream_ungetc(int32_t ch, FileStream *fileStream); + +U_CAPI int32_t U_EXPORT2 +T_FileStream_peek(FileStream* fileStream); + +U_CAPI int32_t U_EXPORT2 +T_FileStream_size(FileStream* fileStream); + +U_CAPI int U_EXPORT2 +T_FileStream_eof(FileStream* fileStream); + +U_CAPI int U_EXPORT2 +T_FileStream_error(FileStream* fileStream); + +/* +U_CAPI void U_EXPORT2 +T_FileStream_setError(FileStream* fileStream); +*/ + +U_CAPI FileStream* U_EXPORT2 +T_FileStream_stdin(void); + +U_CAPI FileStream* U_EXPORT2 +T_FileStream_stdout(void); + +U_CAPI FileStream* U_EXPORT2 +T_FileStream_stderr(void); + +U_CAPI UBool U_EXPORT2 
+T_FileStream_remove(const char* fileName); + +#endif /* _FILESTRM*/ diff --git a/intl/icu/source/tools/toolutil/filetools.cpp b/intl/icu/source/tools/toolutil/filetools.cpp new file mode 100644 index 0000000000..994d8e31f0 --- /dev/null +++ b/intl/icu/source/tools/toolutil/filetools.cpp @@ -0,0 +1,140 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/****************************************************************************** + * Copyright (C) 2009-2013, International Business Machines + * Corporation and others. All Rights Reserved. + ******************************************************************************* + */ + +#include "unicode/platform.h" +#if U_PLATFORM == U_PF_MINGW +// *cough* - for struct stat +#ifdef __STRICT_ANSI__ +#undef __STRICT_ANSI__ +#endif +#endif + +#include "filetools.h" +#include "filestrm.h" +#include "charstr.h" +#include "cstring.h" +#include "unicode/putil.h" +#include "putilimp.h" + +#include <stdio.h> +#include <stdlib.h> +#include <sys/stat.h> +#include <time.h> +#include <string.h> + +#if U_HAVE_DIRENT_H +#include <dirent.h> +typedef struct dirent DIRENT; + +#define SKIP1 "." +#define SKIP2 ".." +#endif + +static int32_t whichFileModTimeIsLater(const char *file1, const char *file2); + +/* + * Goes through the given directory recursive to compare each file's modification time with that of the file given. + * Also can be given just one file to check against. Default value for isDir is false. 
+ */ +U_CAPI UBool U_EXPORT2 +isFileModTimeLater(const char *filePath, const char *checkAgainst, UBool isDir) { + UBool isLatest = true; + + if (filePath == nullptr || checkAgainst == nullptr) { + return false; + } + + if (isDir == true) { +#if U_HAVE_DIRENT_H + DIR *pDir = nullptr; + if ((pDir= opendir(checkAgainst)) != nullptr) { + DIR *subDirp = nullptr; + DIRENT *dirEntry = nullptr; + + while ((dirEntry = readdir(pDir)) != nullptr) { + if (uprv_strcmp(dirEntry->d_name, SKIP1) != 0 && uprv_strcmp(dirEntry->d_name, SKIP2) != 0) { + UErrorCode status = U_ZERO_ERROR; + icu::CharString newpath(checkAgainst, -1, status); + newpath.append(U_FILE_SEP_STRING, -1, status); + newpath.append(dirEntry->d_name, -1, status); + if (U_FAILURE(status)) { + fprintf(stderr, "%s:%d: %s\n", __FILE__, __LINE__, u_errorName(status)); + return false; + } + + if ((subDirp = opendir(newpath.data())) != nullptr) { + /* If this new path is a directory, make a recursive call with the newpath. */ + closedir(subDirp); + isLatest = isFileModTimeLater(filePath, newpath.data(), isDir); + if (!isLatest) { + break; + } + } else { + int32_t latest = whichFileModTimeIsLater(filePath, newpath.data()); + if (latest < 0 || latest == 2) { + isLatest = false; + break; + } + } + + } + } + closedir(pDir); + } else { + fprintf(stderr, "Unable to open directory: %s\n", checkAgainst); + return false; + } +#endif + } else { + if (T_FileStream_file_exists(checkAgainst)) { + int32_t latest = whichFileModTimeIsLater(filePath, checkAgainst); + if (latest < 0 || latest == 2) { + isLatest = false; + } + } else { + isLatest = false; + } + } + + return isLatest; +} + +/* Compares the mod time of both files returning a number indicating which one is later. -1 if error ocurs. 
*/ +static int32_t whichFileModTimeIsLater(const char *file1, const char *file2) { + int32_t result = 0; + struct stat stbuf1, stbuf2; + + if (stat(file1, &stbuf1) == 0 && stat(file2, &stbuf2) == 0) { + time_t modtime1, modtime2; + double diff; + + modtime1 = stbuf1.st_mtime; + modtime2 = stbuf2.st_mtime; + + diff = difftime(modtime1, modtime2); + if (diff < 0.0) { + result = 2; + } else if (diff > 0.0) { + result = 1; + } + + } else { + fprintf(stderr, "Unable to get stats from file: %s or %s\n", file1, file2); + result = -1; + } + + return result; +} + +/* Swap the file separater character given with the new one in the file path. */ +U_CAPI void U_EXPORT2 +swapFileSepChar(char *filePath, const char oldFileSepChar, const char newFileSepChar) { + for (int32_t i = 0, length = static_cast<int32_t>(uprv_strlen(filePath)); i < length; i++) { + filePath[i] = (filePath[i] == oldFileSepChar ) ? newFileSepChar : filePath[i]; + } +} diff --git a/intl/icu/source/tools/toolutil/filetools.h b/intl/icu/source/tools/toolutil/filetools.h new file mode 100644 index 0000000000..40a606a7d4 --- /dev/null +++ b/intl/icu/source/tools/toolutil/filetools.h @@ -0,0 +1,34 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2009, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: filetools.h +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2009jan09 +* created by: Michael Ow +* +* Contains various functions to handle files. +* Not suitable for production use. Not supported. +* Not conformant. Not efficient. 
+*/ + +#ifndef __FILETOOLS_H__ +#define __FILETOOLS_H__ + +#include "unicode/utypes.h" + +U_CAPI UBool U_EXPORT2 +isFileModTimeLater(const char *filePath, const char *checkAgainst, UBool isDir=false); + +U_CAPI void U_EXPORT2 +swapFileSepChar(char *filePath, const char oldFileSepChar, const char newFileSepChar); + +#endif diff --git a/intl/icu/source/tools/toolutil/flagparser.cpp b/intl/icu/source/tools/toolutil/flagparser.cpp new file mode 100644 index 0000000000..8bbceb4f73 --- /dev/null +++ b/intl/icu/source/tools/toolutil/flagparser.cpp @@ -0,0 +1,180 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/****************************************************************************** + * Copyright (C) 2009-2015, International Business Machines + * Corporation and others. All Rights Reserved. + ******************************************************************************* + */ + +#include "flagparser.h" +#include "filestrm.h" +#include "cstring.h" +#include "cmemory.h" + +#define DEFAULT_BUFFER_SIZE 512 + +static int32_t currentBufferSize = DEFAULT_BUFFER_SIZE; + +static int32_t extractFlag(char* buffer, int32_t bufferSize, char* flag, int32_t flagSize, const char ** flagNames, int32_t numOfFlags, UErrorCode *status); +static int32_t getFlagOffset(const char *buffer, int32_t bufferSize); + +/* + * Opens the given fileName and reads in the information storing the data in flagBuffer. 
+ */ +U_CAPI int32_t U_EXPORT2 +parseFlagsFile(const char *fileName, char **flagBuffer, int32_t flagBufferSize, const char ** flagNames, int32_t numOfFlags, UErrorCode *status) { + char* buffer = nullptr; + char* tmpFlagBuffer = nullptr; + UBool allocateMoreSpace = false; + int32_t idx, i; + int32_t result = 0; + + FileStream *f = T_FileStream_open(fileName, "r"); + if (f == nullptr) { + *status = U_FILE_ACCESS_ERROR; + goto parseFlagsFile_cleanup; + } + + buffer = (char *)uprv_malloc(sizeof(char) * currentBufferSize); + tmpFlagBuffer = (char *)uprv_malloc(sizeof(char) * flagBufferSize); + + if (buffer == nullptr || tmpFlagBuffer == nullptr) { + *status = U_MEMORY_ALLOCATION_ERROR; + goto parseFlagsFile_cleanup; + } + + do { + if (allocateMoreSpace) { + allocateMoreSpace = false; + currentBufferSize *= 2; + uprv_free(buffer); + buffer = (char *)uprv_malloc(sizeof(char) * currentBufferSize); + if (buffer == nullptr) { + *status = U_MEMORY_ALLOCATION_ERROR; + goto parseFlagsFile_cleanup; + } + } + for (i = 0; i < numOfFlags;) { + if (T_FileStream_readLine(f, buffer, currentBufferSize) == nullptr) { + /* End of file reached. */ + break; + } + if (buffer[0] == '#') { + continue; + } + + if ((int32_t)uprv_strlen(buffer) == (currentBufferSize - 1) && buffer[currentBufferSize-2] != '\n') { + /* Allocate more space for buffer if it did not read the entire line */ + allocateMoreSpace = true; + T_FileStream_rewind(f); + break; + } else { + idx = extractFlag(buffer, currentBufferSize, tmpFlagBuffer, flagBufferSize, flagNames, numOfFlags, status); + if (U_FAILURE(*status)) { + if (*status == U_BUFFER_OVERFLOW_ERROR) { + result = currentBufferSize; + } else { + result = -1; + } + break; + } else { + if (flagNames != nullptr) { + if (idx >= 0) { + uprv_strcpy(flagBuffer[idx], tmpFlagBuffer); + } else { + /* No match found. Skip it. 
*/ + continue; + } + } else { + uprv_strcpy(flagBuffer[i++], tmpFlagBuffer); + } + } + } + } + } while (allocateMoreSpace && U_SUCCESS(*status)); + +parseFlagsFile_cleanup: + uprv_free(tmpFlagBuffer); + uprv_free(buffer); + + T_FileStream_close(f); + + if (U_FAILURE(*status) && *status != U_BUFFER_OVERFLOW_ERROR) { + return -1; + } + + if (U_SUCCESS(*status) && result == 0) { + currentBufferSize = DEFAULT_BUFFER_SIZE; + } + + return result; +} + + +/* + * Extract the setting after the '=' and store it in flag excluding the newline character. + */ +static int32_t extractFlag(char* buffer, int32_t bufferSize, char* flag, int32_t flagSize, const char **flagNames, int32_t numOfFlags, UErrorCode *status) { + int32_t i, idx = -1; + char *pBuffer; + int32_t offset=0; + UBool bufferWritten = false; + + if (buffer[0] != 0) { + /* Get the offset (i.e. position after the '=') */ + offset = getFlagOffset(buffer, bufferSize); + pBuffer = buffer+offset; + for(i = 0;;i++) { + if (i >= flagSize) { + *status = U_BUFFER_OVERFLOW_ERROR; + return -1; + } + if (pBuffer[i+1] == 0) { + /* Indicates a new line character. End here. */ + flag[i] = 0; + break; + } + + flag[i] = pBuffer[i]; + if (i == 0) { + bufferWritten = true; + } + } + } + + if (!bufferWritten) { + flag[0] = 0; + } + + if (flagNames != nullptr && offset>0) { + offset--; /* Move offset back 1 because of '='*/ + for (i = 0; i < numOfFlags; i++) { + if (uprv_strncmp(buffer, flagNames[i], offset) == 0) { + idx = i; + break; + } + } + } + + return idx; +} + +/* + * Get the position after the '=' character. 
*/
/*
 * buffer holds a NUL-terminated line; bufferSize is the capacity of buffer.
 * Returns the index just past the first '=' of the line, or 0 when the line has
 * no '=' or when the '=' sits in the very last slot of the buffer.
 */
static int32_t getFlagOffset(const char *buffer, int32_t bufferSize) {
    int32_t offset;

    /* Only the current line is scanned. Bytes after the terminator may be left
       over from an earlier, longer line read into the same buffer, and a stale
       '=' found there must not be reported. */
    for (offset = 0; offset < bufferSize && buffer[offset] != 0; offset++) {
        if (buffer[offset] == '=') {
            /* An '=' in the last slot leaves no room for a value. */
            return (offset + 1 < bufferSize) ? (offset + 1) : 0;
        }
    }

    return 0;
}
diff --git a/intl/icu/source/tools/toolutil/flagparser.h b/intl/icu/source/tools/toolutil/flagparser.h new file mode 100644 index 0000000000..aa42547164 --- /dev/null +++ b/intl/icu/source/tools/toolutil/flagparser.h @@ -0,0 +1,32 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2009-2011, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: flagparser.h +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2009jan08 +* created by: Michael Ow +* +* Tiny flag file parser using ICU and intended for use in ICU tests and in build tools. +* Not suitable for production use. Not supported. +* Not conformant. Not efficient. +* But very small. +*/ + +#ifndef __FLAGPARSER_H__ +#define __FLAGPARSER_H__ + +#include "unicode/utypes.h" + +U_CAPI int32_t U_EXPORT2 +parseFlagsFile(const char *fileName, char **flagBuffer, int32_t flagBufferSize, const char ** flagNames, int32_t numOfFlags, UErrorCode *status); + +#endif diff --git a/intl/icu/source/tools/toolutil/package.cpp b/intl/icu/source/tools/toolutil/package.cpp new file mode 100644 index 0000000000..3098f5d57d --- /dev/null +++ b/intl/icu/source/tools/toolutil/package.cpp @@ -0,0 +1,1311 @@ +// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 1999-2015, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: package.cpp +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2005aug25 +* created by: Markus W. Scherer +* +* Read, modify, and write ICU .dat data package files. +* This is an integral part of the icupkg tool, moved to the toolutil library +* because parts of tool implementations tend to be later shared by +* other tools. +* Subsumes functionality and implementation code from +* gencmn, decmn, and icuswap tools. +*/ + +#include "unicode/utypes.h" +#include "unicode/putil.h" +#include "unicode/udata.h" +#include "cstring.h" +#include "uarrsort.h" +#include "ucmndata.h" +#include "udataswp.h" +#include "swapimpl.h" +#include "toolutil.h" +#include "package.h" +#include "cmemory.h" + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + + +static const int32_t kItemsChunk = 256; /* How much to increase the filesarray by each time */ + +// general definitions ----------------------------------------------------- *** + +/* UDataInfo cf. 
udata.h */ +static const UDataInfo dataInfo={ + (uint16_t)sizeof(UDataInfo), + 0, + + U_IS_BIG_ENDIAN, + U_CHARSET_FAMILY, + (uint8_t)sizeof(char16_t), + 0, + + {0x43, 0x6d, 0x6e, 0x44}, /* dataFormat="CmnD" */ + {1, 0, 0, 0}, /* formatVersion */ + {3, 0, 0, 0} /* dataVersion */ +}; + +U_CDECL_BEGIN +static void U_CALLCONV +printPackageError(void *context, const char *fmt, va_list args) { + vfprintf((FILE *)context, fmt, args); +} +U_CDECL_END + +static uint16_t +readSwapUInt16(uint16_t x) { + return (uint16_t)((x<<8)|(x>>8)); +} + +// platform types ---------------------------------------------------------- *** + +static const char *types="lb?e"; + +enum { TYPE_L, TYPE_B, TYPE_LE, TYPE_E, TYPE_COUNT }; + +static inline int32_t +makeTypeEnum(uint8_t charset, UBool isBigEndian) { + return 2*(int32_t)charset+isBigEndian; +} + +static inline int32_t +makeTypeEnum(char type) { + return + type == 'l' ? TYPE_L : + type == 'b' ? TYPE_B : + type == 'e' ? TYPE_E : + -1; +} + +static inline char +makeTypeLetter(uint8_t charset, UBool isBigEndian) { + return types[makeTypeEnum(charset, isBigEndian)]; +} + +static inline char +makeTypeLetter(int32_t typeEnum) { + return types[typeEnum]; +} + +static void +makeTypeProps(char type, uint8_t &charset, UBool &isBigEndian) { + int32_t typeEnum=makeTypeEnum(type); + charset=(uint8_t)(typeEnum>>1); + isBigEndian=(UBool)(typeEnum&1); +} + +U_CFUNC const UDataInfo * +getDataInfo(const uint8_t *data, int32_t length, + int32_t &infoLength, int32_t &headerLength, + UErrorCode *pErrorCode) { + const DataHeader *pHeader; + const UDataInfo *pInfo; + + if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) { + return nullptr; + } + if( data==nullptr || + (length>=0 && length<(int32_t)sizeof(DataHeader)) + ) { + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; + return nullptr; + } + + pHeader=(const DataHeader *)data; + pInfo=&pHeader->info; + if( (length>=0 && length<(int32_t)sizeof(DataHeader)) || + pHeader->dataHeader.magic1!=0xda || + 
pHeader->dataHeader.magic2!=0x27 || + pInfo->sizeofUChar!=2 + ) { + *pErrorCode=U_UNSUPPORTED_ERROR; + return nullptr; + } + + if(pInfo->isBigEndian==U_IS_BIG_ENDIAN) { + headerLength=pHeader->dataHeader.headerSize; + infoLength=pInfo->size; + } else { + headerLength=readSwapUInt16(pHeader->dataHeader.headerSize); + infoLength=readSwapUInt16(pInfo->size); + } + + if( headerLength<(int32_t)sizeof(DataHeader) || + infoLength<(int32_t)sizeof(UDataInfo) || + headerLength<(int32_t)(sizeof(pHeader->dataHeader)+infoLength) || + (length>=0 && length<headerLength) + ) { + *pErrorCode=U_UNSUPPORTED_ERROR; + return nullptr; + } + + return pInfo; +} + +static int32_t +getTypeEnumForInputData(const uint8_t *data, int32_t length, + UErrorCode *pErrorCode) { + const UDataInfo *pInfo; + int32_t infoLength, headerLength; + + /* getDataInfo() checks for illegal arguments */ + pInfo=getDataInfo(data, length, infoLength, headerLength, pErrorCode); + if(pInfo==nullptr) { + return -1; + } + + return makeTypeEnum(pInfo->charsetFamily, (UBool)pInfo->isBigEndian); +} + +// file handling ----------------------------------------------------------- *** + +static void +extractPackageName(const char *filename, + char pkg[], int32_t capacity) { + const char *basename; + int32_t len; + + basename=findBasename(filename); + len=(int32_t)strlen(basename)-4; /* -4: subtract the length of ".dat" */ + + if(len<=0 || 0!=strcmp(basename+len, ".dat")) { + fprintf(stderr, "icupkg: \"%s\" is not recognized as a package filename (must end with .dat)\n", + basename); + exit(U_ILLEGAL_ARGUMENT_ERROR); + } + + if(len>=capacity) { + fprintf(stderr, "icupkg: the package name \"%s\" is too long (>=%ld)\n", + basename, (long)capacity); + exit(U_ILLEGAL_ARGUMENT_ERROR); + } + + memcpy(pkg, basename, len); + pkg[len]=0; +} + +static int32_t +getFileLength(FILE *f) { + int32_t length; + + fseek(f, 0, SEEK_END); + length=(int32_t)ftell(f); + fseek(f, 0, SEEK_SET); + return length; +} + +/* + * Turn tree separators and 
alternate file separators into normal file separators. + */ +#if U_TREE_ENTRY_SEP_CHAR==U_FILE_SEP_CHAR && U_FILE_ALT_SEP_CHAR==U_FILE_SEP_CHAR +#define treeToPath(s) +#else +static void +treeToPath(char *s) { + char *t; + + for(t=s; *t!=0; ++t) { + if(*t==U_TREE_ENTRY_SEP_CHAR || *t==U_FILE_ALT_SEP_CHAR) { + *t=U_FILE_SEP_CHAR; + } + } +} +#endif + +/* + * Turn file separators into tree separators. + */ +#if U_TREE_ENTRY_SEP_CHAR==U_FILE_SEP_CHAR && U_FILE_ALT_SEP_CHAR==U_FILE_SEP_CHAR +#define pathToTree(s) +#else +static void +pathToTree(char *s) { + char *t; + + for(t=s; *t!=0; ++t) { + if(*t==U_FILE_SEP_CHAR || *t==U_FILE_ALT_SEP_CHAR) { + *t=U_TREE_ENTRY_SEP_CHAR; + } + } +} +#endif + +/* + * Prepend the path (if any) to the name and run the name through treeToName(). + */ +static void +makeFullFilename(const char *path, const char *name, + char *filename, int32_t capacity) { + char *s; + + // prepend the path unless nullptr or empty + if(path!=nullptr && path[0]!=0) { + if((int32_t)(strlen(path)+1)>=capacity) { + fprintf(stderr, "pathname too long: \"%s\"\n", path); + exit(U_BUFFER_OVERFLOW_ERROR); + } + strcpy(filename, path); + + // make sure the path ends with a file separator + s=strchr(filename, 0); + if(*(s-1)!=U_FILE_SEP_CHAR && *(s-1)!=U_FILE_ALT_SEP_CHAR) { + *s++=U_FILE_SEP_CHAR; + } + } else { + s=filename; + } + + // turn the name into a filename, turn tree separators into file separators + if((int32_t)((s-filename)+strlen(name))>=capacity) { + fprintf(stderr, "path/filename too long: \"%s%s\"\n", filename, name); + exit(U_BUFFER_OVERFLOW_ERROR); + } + strcpy(s, name); + treeToPath(s); +} + +static void +makeFullFilenameAndDirs(const char *path, const char *name, + char *filename, int32_t capacity) { + char *sep; + UErrorCode errorCode; + + makeFullFilename(path, name, filename, capacity); + + // make tree directories + errorCode=U_ZERO_ERROR; + sep=strchr(filename, 0)-strlen(name); + while((sep=strchr(sep, U_FILE_SEP_CHAR))!=nullptr) { + 
if(sep!=filename) { + *sep=0; // truncate temporarily + uprv_mkdir(filename, &errorCode); + if(U_FAILURE(errorCode)) { + fprintf(stderr, "icupkg: unable to create tree directory \"%s\"\n", filename); + exit(U_FILE_ACCESS_ERROR); + } + } + *sep++=U_FILE_SEP_CHAR; // restore file separator character + } +} + +static uint8_t * +readFile(const char *path, const char *name, int32_t &length, char &type) { + char filename[1024]; + FILE *file; + UErrorCode errorCode; + int32_t fileLength, typeEnum; + + makeFullFilename(path, name, filename, (int32_t)sizeof(filename)); + + /* open the input file, get its length, allocate memory for it, read the file */ + file=fopen(filename, "rb"); + if(file==nullptr) { + fprintf(stderr, "icupkg: unable to open input file \"%s\"\n", filename); + exit(U_FILE_ACCESS_ERROR); + } + + /* get the file length */ + fileLength=getFileLength(file); + if(ferror(file) || fileLength<=0) { + fprintf(stderr, "icupkg: empty input file \"%s\"\n", filename); + fclose(file); + exit(U_FILE_ACCESS_ERROR); + } + + /* allocate the buffer, pad to multiple of 16 */ + length=(fileLength+0xf)&~0xf; + icu::LocalMemory<uint8_t> data((uint8_t *)uprv_malloc(length)); + if(data.isNull()) { + fclose(file); + fprintf(stderr, "icupkg: malloc error allocating %d bytes.\n", (int)length); + exit(U_MEMORY_ALLOCATION_ERROR); + } + + /* read the file */ + if(fileLength!=(int32_t)fread(data.getAlias(), 1, fileLength, file)) { + fprintf(stderr, "icupkg: error reading \"%s\"\n", filename); + fclose(file); + exit(U_FILE_ACCESS_ERROR); + } + + /* pad the file to a multiple of 16 using the usual padding byte */ + if(fileLength<length) { + memset(data.getAlias()+fileLength, 0xaa, length-fileLength); + } + + fclose(file); + + // minimum check for ICU-format data + errorCode=U_ZERO_ERROR; + typeEnum=getTypeEnumForInputData(data.getAlias(), length, &errorCode); + if(typeEnum<0 || U_FAILURE(errorCode)) { + fprintf(stderr, "icupkg: not an ICU data file: \"%s\"\n", filename); +#if 
!UCONFIG_NO_LEGACY_CONVERSION + exit(U_INVALID_FORMAT_ERROR); +#else + fprintf(stderr, "U_INVALID_FORMAT_ERROR occurred but UCONFIG_NO_LEGACY_CONVERSION is on so this is expected.\n"); + exit(0); +#endif + } + type=makeTypeLetter(typeEnum); + + return data.orphan(); +} + +// .dat package file representation ---------------------------------------- *** + +U_CDECL_BEGIN + +static int32_t U_CALLCONV +compareItems(const void * /*context*/, const void *left, const void *right) { + U_NAMESPACE_USE + + return (int32_t)strcmp(((Item *)left)->name, ((Item *)right)->name); +} + +U_CDECL_END + +U_NAMESPACE_BEGIN + +Package::Package() + : doAutoPrefix(false), prefixEndsWithType(false) { + inPkgName[0]=0; + pkgPrefix[0]=0; + inData=nullptr; + inLength=0; + inCharset=U_CHARSET_FAMILY; + inIsBigEndian=U_IS_BIG_ENDIAN; + + itemCount=0; + itemMax=0; + items=nullptr; + + inStringTop=outStringTop=0; + + matchMode=0; + findPrefix=findSuffix=nullptr; + findPrefixLength=findSuffixLength=0; + findNextIndex=-1; + + // create a header for an empty package + DataHeader *pHeader; + pHeader=(DataHeader *)header; + pHeader->dataHeader.magic1=0xda; + pHeader->dataHeader.magic2=0x27; + memcpy(&pHeader->info, &dataInfo, sizeof(dataInfo)); + headerLength=(int32_t)(4+sizeof(dataInfo)); + if(headerLength&0xf) { + /* NUL-pad the header to a multiple of 16 */ + int32_t length=(headerLength+0xf)&~0xf; + memset(header+headerLength, 0, length-headerLength); + headerLength=length; + } + pHeader->dataHeader.headerSize=(uint16_t)headerLength; +} + +Package::~Package() { + int32_t idx; + + uprv_free(inData); + + for(idx=0; idx<itemCount; ++idx) { + if(items[idx].isDataOwned) { + uprv_free(items[idx].data); + } + } + + uprv_free((void*)items); +} + +void +Package::setPrefix(const char *p) { + if(strlen(p)>=sizeof(pkgPrefix)) { + fprintf(stderr, "icupkg: --toc_prefix %s too long\n", p); + exit(U_ILLEGAL_ARGUMENT_ERROR); + } + strcpy(pkgPrefix, p); +} + +void +Package::readPackage(const char *filename) { + 
UDataSwapper *ds; + const UDataInfo *pInfo; + UErrorCode errorCode; + + const uint8_t *inBytes; + + int32_t length, offset, i; + int32_t itemLength, typeEnum; + char type; + + const UDataOffsetTOCEntry *inEntries; + + extractPackageName(filename, inPkgName, (int32_t)sizeof(inPkgName)); + + /* read the file */ + inData=readFile(nullptr, filename, inLength, type); + length=inLength; + + /* + * swap the header - even if the swapping itself is a no-op + * because it tells us the header length + */ + errorCode=U_ZERO_ERROR; + makeTypeProps(type, inCharset, inIsBigEndian); + ds=udata_openSwapper(inIsBigEndian, inCharset, U_IS_BIG_ENDIAN, U_CHARSET_FAMILY, &errorCode); + if(U_FAILURE(errorCode)) { + fprintf(stderr, "icupkg: udata_openSwapper(\"%s\") failed - %s\n", + filename, u_errorName(errorCode)); + exit(errorCode); + } + + ds->printError=printPackageError; + ds->printErrorContext=stderr; + + headerLength=sizeof(header); + if(length<headerLength) { + headerLength=length; + } + headerLength=udata_swapDataHeader(ds, inData, headerLength, header, &errorCode); + if(U_FAILURE(errorCode)) { + exit(errorCode); + } + + /* check data format and format version */ + pInfo=(const UDataInfo *)((const char *)inData+4); + if(!( + pInfo->dataFormat[0]==0x43 && /* dataFormat="CmnD" */ + pInfo->dataFormat[1]==0x6d && + pInfo->dataFormat[2]==0x6e && + pInfo->dataFormat[3]==0x44 && + pInfo->formatVersion[0]==1 + )) { + fprintf(stderr, "icupkg: data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as an ICU .dat package\n", + pInfo->dataFormat[0], pInfo->dataFormat[1], + pInfo->dataFormat[2], pInfo->dataFormat[3], + pInfo->formatVersion[0]); + exit(U_UNSUPPORTED_ERROR); + } + inIsBigEndian=(UBool)pInfo->isBigEndian; + inCharset=pInfo->charsetFamily; + + inBytes=(const uint8_t *)inData+headerLength; + inEntries=(const UDataOffsetTOCEntry *)(inBytes+4); + + /* check that the itemCount fits, then the ToC table, then at least the header of the last item */ + 
length-=headerLength; + if(length<4) { + /* itemCount does not fit */ + offset=0x7fffffff; + } else { + itemCount=udata_readInt32(ds, *(const int32_t *)inBytes); + setItemCapacity(itemCount); /* resize so there's space */ + if(itemCount==0) { + offset=4; + } else if(length<(4+8*itemCount)) { + /* ToC table does not fit */ + offset=0x7fffffff; + } else { + /* offset of the last item plus at least 20 bytes for its header */ + offset=20+(int32_t)ds->readUInt32(inEntries[itemCount-1].dataOffset); + } + } + if(length<offset) { + fprintf(stderr, "icupkg: too few bytes (%ld after header) for a .dat package\n", + (long)length); + exit(U_INDEX_OUTOFBOUNDS_ERROR); + } + /* do not modify the package length variable until the last item's length is set */ + + if(itemCount<=0) { + if(doAutoPrefix) { + fprintf(stderr, "icupkg: --auto_toc_prefix[_with_type] but the input package is empty\n"); + exit(U_INVALID_FORMAT_ERROR); + } + } else { + char prefix[MAX_PKG_NAME_LENGTH+4]; + char *s, *inItemStrings; + + if(itemCount>itemMax) { + fprintf(stderr, "icupkg: too many items, maximum is %d\n", itemMax); + exit(U_BUFFER_OVERFLOW_ERROR); + } + + /* swap the item name strings */ + int32_t stringsOffset=4+8*itemCount; + itemLength=(int32_t)(ds->readUInt32(inEntries[0].dataOffset))-stringsOffset; + + // don't include padding bytes at the end of the item names + while(itemLength>0 && inBytes[stringsOffset+itemLength-1]!=0) { + --itemLength; + } + + if((inStringTop+itemLength)>STRING_STORE_SIZE) { + fprintf(stderr, "icupkg: total length of item name strings too long\n"); + exit(U_BUFFER_OVERFLOW_ERROR); + } + + inItemStrings=inStrings+inStringTop; + ds->swapInvChars(ds, inBytes+stringsOffset, itemLength, inItemStrings, &errorCode); + if(U_FAILURE(errorCode)) { + fprintf(stderr, "icupkg failed to swap the input .dat package item name strings\n"); + exit(U_INVALID_FORMAT_ERROR); + } + inStringTop+=itemLength; + + // reset the Item entries + memset(items, 0, itemCount*sizeof(Item)); + + /* + * 
Get the common prefix of the items. + * New-style ICU .dat packages use tree separators ('/') between package names, + * tree names, and item names, + * while old-style ICU .dat packages (before multi-tree support) + * use an underscore ('_') between package and item names. + */ + offset=(int32_t)ds->readUInt32(inEntries[0].nameOffset)-stringsOffset; + s=inItemStrings+offset; // name of the first entry + int32_t prefixLength; + if(doAutoPrefix) { + // Use the first entry's prefix. Must be a new-style package. + const char *prefixLimit=strchr(s, U_TREE_ENTRY_SEP_CHAR); + if(prefixLimit==nullptr) { + fprintf(stderr, + "icupkg: --auto_toc_prefix[_with_type] but " + "the first entry \"%s\" does not contain a '%c'\n", + s, U_TREE_ENTRY_SEP_CHAR); + exit(U_INVALID_FORMAT_ERROR); + } + prefixLength=(int32_t)(prefixLimit-s); + if(prefixLength==0 || prefixLength>=UPRV_LENGTHOF(pkgPrefix)) { + fprintf(stderr, + "icupkg: --auto_toc_prefix[_with_type] but " + "the prefix of the first entry \"%s\" is empty or too long\n", + s); + exit(U_INVALID_FORMAT_ERROR); + } + if(prefixEndsWithType && s[prefixLength-1]!=type) { + fprintf(stderr, + "icupkg: --auto_toc_prefix_with_type but " + "the prefix of the first entry \"%s\" does not end with '%c'\n", + s, type); + exit(U_INVALID_FORMAT_ERROR); + } + memcpy(pkgPrefix, s, prefixLength); + pkgPrefix[prefixLength]=0; + memcpy(prefix, s, ++prefixLength); // include the / + } else { + // Use the package basename as prefix. 
+ int32_t inPkgNameLength= static_cast<int32_t>(strlen(inPkgName)); + memcpy(prefix, inPkgName, inPkgNameLength); + prefixLength=inPkgNameLength; + + if( (int32_t)strlen(s)>=(inPkgNameLength+2) && + 0==memcmp(s, inPkgName, inPkgNameLength) && + s[inPkgNameLength]=='_' + ) { + // old-style .dat package + prefix[prefixLength++]='_'; + } else { + // new-style .dat package + prefix[prefixLength++]=U_TREE_ENTRY_SEP_CHAR; + // if it turns out to not contain U_TREE_ENTRY_SEP_CHAR + // then the test in the loop below will fail + } + } + prefix[prefixLength]=0; + + /* read the ToC table */ + for(i=0; i<itemCount; ++i) { + // skip the package part of the item name, error if it does not match the actual package name + // or if nothing follows the package name + offset=(int32_t)ds->readUInt32(inEntries[i].nameOffset)-stringsOffset; + s=inItemStrings+offset; + if(0!=strncmp(s, prefix, prefixLength) || s[prefixLength]==0) { + fprintf(stderr, "icupkg: input .dat item name \"%s\" does not start with \"%s\"\n", + s, prefix); + exit(U_INVALID_FORMAT_ERROR); + } + items[i].name=s+prefixLength; + + // set the item's data + items[i].data=(uint8_t *)inBytes+ds->readUInt32(inEntries[i].dataOffset); + if(i>0) { + items[i-1].length=(int32_t)(items[i].data-items[i-1].data); + + // set the previous item's platform type + typeEnum=getTypeEnumForInputData(items[i-1].data, items[i-1].length, &errorCode); + if(typeEnum<0 || U_FAILURE(errorCode)) { + fprintf(stderr, "icupkg: not an ICU data file: item \"%s\" in \"%s\"\n", items[i-1].name, filename); + exit(U_INVALID_FORMAT_ERROR); + } + items[i-1].type=makeTypeLetter(typeEnum); + } + items[i].isDataOwned=false; + } + // set the last item's length + items[itemCount-1].length=length-ds->readUInt32(inEntries[itemCount-1].dataOffset); + + // set the last item's platform type + typeEnum=getTypeEnumForInputData(items[itemCount-1].data, items[itemCount-1].length, &errorCode); + if(typeEnum<0 || U_FAILURE(errorCode)) { + fprintf(stderr, "icupkg: not an 
ICU data file: item \"%s\" in \"%s\"\n", items[itemCount-1].name, filename); + exit(U_INVALID_FORMAT_ERROR); + } + items[itemCount-1].type=makeTypeLetter(typeEnum); + + if(type!=U_ICUDATA_TYPE_LETTER[0]) { + // sort the item names for the local charset + sortItems(); + } + } + + udata_closeSwapper(ds); +} + +char +Package::getInType() { + return makeTypeLetter(inCharset, inIsBigEndian); +} + +void +Package::writePackage(const char *filename, char outType, const char *comment) { + char prefix[MAX_PKG_NAME_LENGTH+4]; + UDataOffsetTOCEntry entry; + UDataSwapper *dsLocalToOut, *ds[TYPE_COUNT]; + FILE *file; + Item *pItem; + char *name; + UErrorCode errorCode; + int32_t i, length, prefixLength, maxItemLength, basenameOffset, offset, outInt32; + uint8_t outCharset; + UBool outIsBigEndian; + + extractPackageName(filename, prefix, MAX_PKG_NAME_LENGTH); + + // if there is an explicit comment, then use it, else use what's in the current header + if(comment!=nullptr) { + /* get the header size minus the current comment */ + DataHeader *pHeader; + int32_t length; + + pHeader=(DataHeader *)header; + headerLength=4+pHeader->info.size; + length=(int32_t)strlen(comment); + if((int32_t)(headerLength+length)>=(int32_t)sizeof(header)) { + fprintf(stderr, "icupkg: comment too long\n"); + exit(U_BUFFER_OVERFLOW_ERROR); + } + memcpy(header+headerLength, comment, length+1); + headerLength+=length; + if(headerLength&0xf) { + /* NUL-pad the header to a multiple of 16 */ + length=(headerLength+0xf)&~0xf; + memset(header+headerLength, 0, length-headerLength); + headerLength=length; + } + pHeader->dataHeader.headerSize=(uint16_t)headerLength; + } + + makeTypeProps(outType, outCharset, outIsBigEndian); + + // open (TYPE_COUNT-2) swappers + // one is a no-op for local type==outType + // one type (TYPE_LE) is bogus + errorCode=U_ZERO_ERROR; + i=makeTypeEnum(outType); + ds[TYPE_B]= i==TYPE_B ? 
nullptr : udata_openSwapper(true, U_ASCII_FAMILY, outIsBigEndian, outCharset, &errorCode); + ds[TYPE_L]= i==TYPE_L ? nullptr : udata_openSwapper(false, U_ASCII_FAMILY, outIsBigEndian, outCharset, &errorCode); + ds[TYPE_LE]=nullptr; + ds[TYPE_E]= i==TYPE_E ? nullptr : udata_openSwapper(true, U_EBCDIC_FAMILY, outIsBigEndian, outCharset, &errorCode); + if(U_FAILURE(errorCode)) { + fprintf(stderr, "icupkg: udata_openSwapper() failed - %s\n", u_errorName(errorCode)); + exit(errorCode); + } + for(i=0; i<TYPE_COUNT; ++i) { + if(ds[i]!=nullptr) { + ds[i]->printError=printPackageError; + ds[i]->printErrorContext=stderr; + } + } + + dsLocalToOut=ds[makeTypeEnum(U_CHARSET_FAMILY, U_IS_BIG_ENDIAN)]; + + // create the file and write its contents + file=fopen(filename, "wb"); + if(file==nullptr) { + fprintf(stderr, "icupkg: unable to create file \"%s\"\n", filename); + exit(U_FILE_ACCESS_ERROR); + } + + // swap and write the header + if(dsLocalToOut!=nullptr) { + udata_swapDataHeader(dsLocalToOut, header, headerLength, header, &errorCode); + if(U_FAILURE(errorCode)) { + fprintf(stderr, "icupkg: udata_swapDataHeader(local to out) failed - %s\n", u_errorName(errorCode)); + exit(errorCode); + } + } + length=(int32_t)fwrite(header, 1, headerLength, file); + if(length!=headerLength) { + fprintf(stderr, "icupkg: unable to write complete header to file \"%s\"\n", filename); + exit(U_FILE_ACCESS_ERROR); + } + + // prepare and swap the package name with a tree separator + // for prepending to item names + if(pkgPrefix[0]==0) { + prefixLength=(int32_t)strlen(prefix); + } else { + prefixLength=(int32_t)strlen(pkgPrefix); + memcpy(prefix, pkgPrefix, prefixLength); + if(prefixEndsWithType) { + prefix[prefixLength-1]=outType; + } + } + prefix[prefixLength++]=U_TREE_ENTRY_SEP_CHAR; + prefix[prefixLength]=0; + if(dsLocalToOut!=nullptr) { + dsLocalToOut->swapInvChars(dsLocalToOut, prefix, prefixLength, prefix, &errorCode); + if(U_FAILURE(errorCode)) { + fprintf(stderr, "icupkg: 
swapInvChars(output package name) failed - %s\n", u_errorName(errorCode)); + exit(errorCode); + } + + // swap and sort the item names (sorting needs to be done in the output charset) + dsLocalToOut->swapInvChars(dsLocalToOut, inStrings, inStringTop, inStrings, &errorCode); + if(U_FAILURE(errorCode)) { + fprintf(stderr, "icupkg: swapInvChars(item names) failed - %s\n", u_errorName(errorCode)); + exit(errorCode); + } + sortItems(); + } + + // create the output item names in sorted order, with the package name prepended to each + for(i=0; i<itemCount; ++i) { + length=(int32_t)strlen(items[i].name); + name=allocString(false, length+prefixLength); + memcpy(name, prefix, prefixLength); + memcpy(name+prefixLength, items[i].name, length+1); + items[i].name=name; + } + + // calculate offsets for item names and items, pad to 16-align items + // align only the first item; each item's length is a multiple of 16 + basenameOffset=4+8*itemCount; + offset=basenameOffset+outStringTop; + if((length=(offset&15))!=0) { + length=16-length; + memset(allocString(false, length-1), 0xaa, length); + offset+=length; + } + + // write the table of contents + // first the itemCount + outInt32=itemCount; + if(dsLocalToOut!=nullptr) { + dsLocalToOut->swapArray32(dsLocalToOut, &outInt32, 4, &outInt32, &errorCode); + if(U_FAILURE(errorCode)) { + fprintf(stderr, "icupkg: swapArray32(item count) failed - %s\n", u_errorName(errorCode)); + exit(errorCode); + } + } + length=(int32_t)fwrite(&outInt32, 1, 4, file); + if(length!=4) { + fprintf(stderr, "icupkg: unable to write complete item count to file \"%s\"\n", filename); + exit(U_FILE_ACCESS_ERROR); + } + + // then write the item entries (and collect the maxItemLength) + maxItemLength=0; + for(i=0; i<itemCount; ++i) { + entry.nameOffset=(uint32_t)(basenameOffset+(items[i].name-outStrings)); + entry.dataOffset=(uint32_t)offset; + if(dsLocalToOut!=nullptr) { + dsLocalToOut->swapArray32(dsLocalToOut, &entry, 8, &entry, &errorCode); + 
if(U_FAILURE(errorCode)) { + fprintf(stderr, "icupkg: swapArray32(item entry %ld) failed - %s\n", (long)i, u_errorName(errorCode)); + exit(errorCode); + } + } + length=(int32_t)fwrite(&entry, 1, 8, file); + if(length!=8) { + fprintf(stderr, "icupkg: unable to write complete item entry %ld to file \"%s\"\n", (long)i, filename); + exit(U_FILE_ACCESS_ERROR); + } + + length=items[i].length; + if(length>maxItemLength) { + maxItemLength=length; + } + offset+=length; + } + + // write the item names + length=(int32_t)fwrite(outStrings, 1, outStringTop, file); + if(length!=outStringTop) { + fprintf(stderr, "icupkg: unable to write complete item names to file \"%s\"\n", filename); + exit(U_FILE_ACCESS_ERROR); + } + + // write the items + for(pItem=items, i=0; i<itemCount; ++pItem, ++i) { + int32_t type=makeTypeEnum(pItem->type); + if(ds[type]!=nullptr) { + // swap each item from its platform properties to the desired ones + udata_swap( + ds[type], + pItem->data, pItem->length, pItem->data, + &errorCode); + if(U_FAILURE(errorCode)) { + fprintf(stderr, "icupkg: udata_swap(item %ld) failed - %s\n", (long)i, u_errorName(errorCode)); + exit(errorCode); + } + } + length=(int32_t)fwrite(pItem->data, 1, pItem->length, file); + if(length!=pItem->length) { + fprintf(stderr, "icupkg: unable to write complete item %ld to file \"%s\"\n", (long)i, filename); + exit(U_FILE_ACCESS_ERROR); + } + } + + if(ferror(file)) { + fprintf(stderr, "icupkg: unable to write complete file \"%s\"\n", filename); + exit(U_FILE_ACCESS_ERROR); + } + + fclose(file); + for(i=0; i<TYPE_COUNT; ++i) { + udata_closeSwapper(ds[i]); + } +} + +int32_t +Package::findItem(const char *name, int32_t length) const { + int32_t i, start, limit; + int result; + + /* do a binary search for the string */ + start=0; + limit=itemCount; + while(start<limit) { + i=(start+limit)/2; + if(length>=0) { + result=strncmp(name, items[i].name, length); + } else { + result=strcmp(name, items[i].name); + } + + if(result==0) { + /* found */ + 
if(length>=0) { + /* + * if we compared just prefixes, then we may need to back up + * to the first item with this prefix + */ + while(i>0 && 0==strncmp(name, items[i-1].name, length)) { + --i; + } + } + return i; + } else if(result<0) { + limit=i; + } else /* result>0 */ { + start=i+1; + } + } + + return ~start; /* not found, return binary-not of the insertion point */ +} + +void +Package::findItems(const char *pattern) { + const char *wild; + + if(pattern==nullptr || *pattern==0) { + findNextIndex=-1; + return; + } + + findPrefix=pattern; + findSuffix=nullptr; + findSuffixLength=0; + + wild=strchr(pattern, '*'); + if(wild==nullptr) { + // no wildcard + findPrefixLength=(int32_t)strlen(pattern); + } else { + // one wildcard + findPrefixLength=(int32_t)(wild-pattern); + findSuffix=wild+1; + findSuffixLength=(int32_t)strlen(findSuffix); + if(nullptr!=strchr(findSuffix, '*')) { + // two or more wildcards + fprintf(stderr, "icupkg: syntax error (more than one '*') in item pattern \"%s\"\n", pattern); + exit(U_PARSE_ERROR); + } + } + + if(findPrefixLength==0) { + findNextIndex=0; + } else { + findNextIndex=findItem(findPrefix, findPrefixLength); + } +} + +int32_t +Package::findNextItem() { + const char *name, *middle, *treeSep; + int32_t idx, nameLength, middleLength; + + if(findNextIndex<0) { + return -1; + } + + while(findNextIndex<itemCount) { + idx=findNextIndex++; + name=items[idx].name; + nameLength=(int32_t)strlen(name); + if(nameLength<(findPrefixLength+findSuffixLength)) { + // item name too short for prefix & suffix + continue; + } + if(findPrefixLength>0 && 0!=memcmp(findPrefix, name, findPrefixLength)) { + // left the range of names with this prefix + break; + } + middle=name+findPrefixLength; + middleLength=nameLength-findPrefixLength-findSuffixLength; + if(findSuffixLength>0 && 0!=memcmp(findSuffix, name+(nameLength-findSuffixLength), findSuffixLength)) { + // suffix does not match + continue; + } + // prefix & suffix match + + if(matchMode&MATCH_NOSLASH) 
{ + treeSep=strchr(middle, U_TREE_ENTRY_SEP_CHAR); + if(treeSep!=nullptr && (treeSep-middle)<middleLength) { + // the middle (matching the * wildcard) contains a tree separator / + continue; + } + } + + // found a matching item + return idx; + } + + // no more items + findNextIndex=-1; + return -1; +} + +void +Package::setMatchMode(uint32_t mode) { + matchMode=mode; +} + +void +Package::addItem(const char *name) { + addItem(name, nullptr, 0, false, U_ICUDATA_TYPE_LETTER[0]); +} + +void +Package::addItem(const char *name, uint8_t *data, int32_t length, UBool isDataOwned, char type) { + int32_t idx; + + idx=findItem(name); + if(idx<0) { + // new item, make space at the insertion point + ensureItemCapacity(); + // move the following items down + idx=~idx; + if(idx<itemCount) { + memmove(items+idx+1, items+idx, (itemCount-idx)*sizeof(Item)); + } + ++itemCount; + + // reset this Item entry + memset(items+idx, 0, sizeof(Item)); + + // copy the item's name + items[idx].name=allocString(true, static_cast<int32_t>(strlen(name))); + strcpy(items[idx].name, name); + pathToTree(items[idx].name); + } else { + // same-name item found, replace it + if(items[idx].isDataOwned) { + uprv_free(items[idx].data); + } + + // keep the item's name since it is the same + } + + // set the item's data + items[idx].data=data; + items[idx].length=length; + items[idx].isDataOwned=isDataOwned; + items[idx].type=type; +} + +void +Package::addFile(const char *filesPath, const char *name) { + uint8_t *data; + int32_t length; + char type; + + data=readFile(filesPath, name, length, type); + // readFile() exits the tool if it fails + addItem(name, data, length, true, type); +} + +void +Package::addItems(const Package &listPkg) { + const Item *pItem; + int32_t i; + + for(pItem=listPkg.items, i=0; i<listPkg.itemCount; ++pItem, ++i) { + addItem(pItem->name, pItem->data, pItem->length, false, pItem->type); + } +} + +void +Package::removeItem(int32_t idx) { + if(idx>=0) { + // remove the item + 
if(items[idx].isDataOwned) { + uprv_free(items[idx].data); + } + + // move the following items up + if((idx+1)<itemCount) { + memmove(items+idx, items+idx+1, (itemCount-(idx+1))*sizeof(Item)); + } + --itemCount; + + if(idx<=findNextIndex) { + --findNextIndex; + } + } +} + +void +Package::removeItems(const char *pattern) { + int32_t idx; + + findItems(pattern); + while((idx=findNextItem())>=0) { + removeItem(idx); + } +} + +void +Package::removeItems(const Package &listPkg) { + const Item *pItem; + int32_t i; + + for(pItem=listPkg.items, i=0; i<listPkg.itemCount; ++pItem, ++i) { + removeItems(pItem->name); + } +} + +void +Package::extractItem(const char *filesPath, const char *outName, int32_t idx, char outType) { + char filename[1024]; + UDataSwapper *ds; + FILE *file; + Item *pItem; + int32_t fileLength; + uint8_t itemCharset, outCharset; + UBool itemIsBigEndian, outIsBigEndian; + + if(idx<0 || itemCount<=idx) { + return; + } + pItem=items+idx; + + // swap the data to the outType + // outType==0: don't swap + if(outType!=0 && pItem->type!=outType) { + // open the swapper + UErrorCode errorCode=U_ZERO_ERROR; + makeTypeProps(pItem->type, itemCharset, itemIsBigEndian); + makeTypeProps(outType, outCharset, outIsBigEndian); + ds=udata_openSwapper(itemIsBigEndian, itemCharset, outIsBigEndian, outCharset, &errorCode); + if(U_FAILURE(errorCode)) { + fprintf(stderr, "icupkg: udata_openSwapper(item %ld) failed - %s\n", + (long)idx, u_errorName(errorCode)); + exit(errorCode); + } + + ds->printError=printPackageError; + ds->printErrorContext=stderr; + + // swap the item from its platform properties to the desired ones + udata_swap(ds, pItem->data, pItem->length, pItem->data, &errorCode); + if(U_FAILURE(errorCode)) { + fprintf(stderr, "icupkg: udata_swap(item %ld) failed - %s\n", (long)idx, u_errorName(errorCode)); + exit(errorCode); + } + udata_closeSwapper(ds); + pItem->type=outType; + } + + // create the file and write its contents + makeFullFilenameAndDirs(filesPath, 
outName, filename, (int32_t)sizeof(filename)); + file=fopen(filename, "wb"); + if(file==nullptr) { + fprintf(stderr, "icupkg: unable to create file \"%s\"\n", filename); + exit(U_FILE_ACCESS_ERROR); + } + fileLength=(int32_t)fwrite(pItem->data, 1, pItem->length, file); + + if(ferror(file) || fileLength!=pItem->length) { + fprintf(stderr, "icupkg: unable to write complete file \"%s\"\n", filename); + exit(U_FILE_ACCESS_ERROR); + } + fclose(file); +} + +void +Package::extractItem(const char *filesPath, int32_t idx, char outType) { + extractItem(filesPath, items[idx].name, idx, outType); +} + +void +Package::extractItems(const char *filesPath, const char *pattern, char outType) { + int32_t idx; + + findItems(pattern); + while((idx=findNextItem())>=0) { + extractItem(filesPath, idx, outType); + } +} + +void +Package::extractItems(const char *filesPath, const Package &listPkg, char outType) { + const Item *pItem; + int32_t i; + + for(pItem=listPkg.items, i=0; i<listPkg.itemCount; ++pItem, ++i) { + extractItems(filesPath, pItem->name, outType); + } +} + +int32_t +Package::getItemCount() const { + return itemCount; +} + +const Item * +Package::getItem(int32_t idx) const { + if (0 <= idx && idx < itemCount) { + return &items[idx]; + } + return nullptr; +} + +void +Package::checkDependency(void *context, const char *itemName, const char *targetName) { + // check dependency: make sure the target item is in the package + Package *me=(Package *)context; + if(me->findItem(targetName)<0) { + me->isMissingItems=true; + fprintf(stderr, "Item %s depends on missing item %s\n", itemName, targetName); + } +} + +UBool +Package::checkDependencies() { + isMissingItems=false; + enumDependencies(this, checkDependency); + return (UBool)!isMissingItems; +} + +void +Package::enumDependencies(void *context, CheckDependency check) { + int32_t i; + + for(i=0; i<itemCount; ++i) { + enumDependencies(items+i, context, check); + } +} + +char * +Package::allocString(UBool in, int32_t length) { + char 
*p; + int32_t top; + + if(in) { + top=inStringTop; + p=inStrings+top; + } else { + top=outStringTop; + p=outStrings+top; + } + top+=length+1; + + if(top>STRING_STORE_SIZE) { + fprintf(stderr, "icupkg: string storage overflow\n"); + exit(U_BUFFER_OVERFLOW_ERROR); + } + if(in) { + inStringTop=top; + } else { + outStringTop=top; + } + return p; +} + +void +Package::sortItems() { + UErrorCode errorCode=U_ZERO_ERROR; + uprv_sortArray(items, itemCount, (int32_t)sizeof(Item), compareItems, nullptr, false, &errorCode); + if(U_FAILURE(errorCode)) { + fprintf(stderr, "icupkg: sorting item names failed - %s\n", u_errorName(errorCode)); + exit(errorCode); + } +} + +void Package::setItemCapacity(int32_t max) +{ + if(max<=itemMax) { + return; + } + Item *newItems = (Item*)uprv_malloc(max * sizeof(items[0])); + Item *oldItems = items; + if(newItems == nullptr) { + fprintf(stderr, "icupkg: Out of memory trying to allocate %lu bytes for %d items\n", + (unsigned long)(max*sizeof(items[0])), max); + exit(U_MEMORY_ALLOCATION_ERROR); + } + if(items && itemCount>0) { + uprv_memcpy(newItems, items, (size_t)itemCount*sizeof(items[0])); + } + itemMax = max; + items = newItems; + uprv_free(oldItems); +} + +void Package::ensureItemCapacity() +{ + if((itemCount+1)>itemMax) { + setItemCapacity(itemCount+kItemsChunk); + } +} + +U_NAMESPACE_END diff --git a/intl/icu/source/tools/toolutil/package.h b/intl/icu/source/tools/toolutil/package.h new file mode 100644 index 0000000000..ea60c13a74 --- /dev/null +++ b/intl/icu/source/tools/toolutil/package.h @@ -0,0 +1,203 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2005-2014, International Business Machines +* Corporation and others. All Rights Reserved. 
+* +******************************************************************************* +* file name: package.h +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2005aug25 +* created by: Markus W. Scherer +* +* Read, modify, and write ICU .dat data package files. +*/ + +#ifndef __PACKAGE_H__ +#define __PACKAGE_H__ + +#include "unicode/utypes.h" + +#include <stdio.h> + +// .dat package file representation ---------------------------------------- *** + +#define STRING_STORE_SIZE 100000 +#define MAX_PKG_NAME_LENGTH 64 + +typedef void CheckDependency(void *context, const char *itemName, const char *targetName); + +U_NAMESPACE_BEGIN + +struct Item { + char *name; + uint8_t *data; + int32_t length; + UBool isDataOwned; + char type; +}; + +class U_TOOLUTIL_API Package { +public: + /* + * Constructor. + * Prepare this object for a new, empty package. + */ + Package(); + + /* Destructor. */ + ~Package(); + + /** + * Uses the prefix of the first entry of the package in readPackage(), + * rather than the package basename. + */ + void setAutoPrefix() { doAutoPrefix=true; } + /** + * Same as setAutoPrefix(), plus the prefix must end with the platform type letter. + */ + void setAutoPrefixWithType() { + doAutoPrefix=true; + prefixEndsWithType=true; + } + void setPrefix(const char *p); + + /* + * Read an existing .dat package file. + * The header and item name strings are swapped into this object, + * but the items are left unswapped. + */ + void readPackage(const char *filename); + /* + * Write a .dat package file with the items in this object. + * Swap all pieces to the desired output platform properties. + * The package becomes unusable: + * The item names are swapped and sorted in the outCharset rather than the local one. + * Also, the items themselves are swapped in-place + */ + void writePackage(const char *filename, char outType, const char *comment); + + /* + * Return the input data type letter (l, b, or e). 
+ */ + char getInType(); + + // find the item in items[], return the non-negative index if found, else the binary-not of the insertion point + int32_t findItem(const char *name, int32_t length=-1) const; + + /* + * Set internal state for following calls to findNextItem() which will return + * indexes for items whose names match the pattern. + */ + void findItems(const char *pattern); + int32_t findNextItem(); + /* + * Set the match mode for findItems() & findNextItem(). + * @param mode 0=default + * MATCH_NOSLASH * does not match a '/' + */ + void setMatchMode(uint32_t mode); + + enum { + MATCH_NOSLASH=1 + }; + + void addItem(const char *name); + void addItem(const char *name, uint8_t *data, int32_t length, UBool isDataOwned, char type); + void addFile(const char *filesPath, const char *name); + void addItems(const Package &listPkg); + + void removeItem(int32_t itemIndex); + void removeItems(const char *pattern); + void removeItems(const Package &listPkg); + + /* The extractItem() functions accept outputType=0 to mean "don't swap the item". */ + void extractItem(const char *filesPath, int32_t itemIndex, char outType); + void extractItems(const char *filesPath, const char *pattern, char outType); + void extractItems(const char *filesPath, const Package &listPkg, char outType); + + /* This variant extracts an item to a specific filename. */ + void extractItem(const char *filesPath, const char *outName, int32_t itemIndex, char outType); + + int32_t getItemCount() const; + const Item *getItem(int32_t idx) const; + + /* + * Check dependencies and return true if all dependencies are fulfilled. 
+ */ + UBool checkDependencies(); + + /* + * Enumerate all the dependencies and give the results to context and call CheckDependency callback + * @param context user context (will be passed to check function) + * @param check will be called with context and any missing items + */ + void enumDependencies(void *context, CheckDependency check); + +private: + void enumDependencies(Item *pItem, void *context, CheckDependency check); + + /** + * Default CheckDependency function used by checkDependencies() + */ + static void checkDependency(void *context, const char *itemName, const char *targetName); + + /* + * Allocate a string in inStrings or outStrings. + * The length does not include the terminating NUL. + */ + char *allocString(UBool in, int32_t length); + + void sortItems(); + + // data fields + char inPkgName[MAX_PKG_NAME_LENGTH]; + char pkgPrefix[MAX_PKG_NAME_LENGTH]; + + uint8_t *inData; + uint8_t header[1024]; + int32_t inLength, headerLength; + uint8_t inCharset; + UBool inIsBigEndian; + UBool doAutoPrefix; + UBool prefixEndsWithType; + + int32_t itemCount; + int32_t itemMax; + Item *items; + + int32_t inStringTop, outStringTop; + char inStrings[STRING_STORE_SIZE], outStrings[STRING_STORE_SIZE]; + + // match mode for findItems(pattern) and findNextItem() + uint32_t matchMode; + + // state for findItems(pattern) and findNextItem() + const char *findPrefix, *findSuffix; + int32_t findPrefixLength, findSuffixLength; + int32_t findNextIndex; + + // state for checkDependencies() + UBool isMissingItems; + + /** + * Grow itemMax to new value + */ + void setItemCapacity(int32_t max); + + /** + * Grow itemMax to at least itemCount+1 + */ + void ensureItemCapacity(); +}; + +U_NAMESPACE_END + +#endif + + diff --git a/intl/icu/source/tools/toolutil/pkg_genc.cpp b/intl/icu/source/tools/toolutil/pkg_genc.cpp new file mode 100644 index 0000000000..741a8a5228 --- /dev/null +++ b/intl/icu/source/tools/toolutil/pkg_genc.cpp @@ -0,0 +1,1396 @@ +// © 2016 and later: Unicode, Inc. 
and others. +// License & terms of use: http://www.unicode.org/copyright.html +/****************************************************************************** + * Copyright (C) 2009-2016, International Business Machines + * Corporation and others. All Rights Reserved. + ******************************************************************************* + */ +#include "unicode/utypes.h" + +#if U_PLATFORM_HAS_WIN32_API +# define VC_EXTRALEAN +# define WIN32_LEAN_AND_MEAN +# define NOUSER +# define NOSERVICE +# define NOIME +# define NOMCX +#include <windows.h> +#include <time.h> +# ifdef __GNUC__ +# define WINDOWS_WITH_GNUC +# endif +#endif + +#if U_PLATFORM_IS_LINUX_BASED && U_HAVE_ELF_H +# define U_ELF +#endif + +#ifdef U_ELF +# include <elf.h> +# if defined(ELFCLASS64) +# define U_ELF64 +# endif + /* Old elf.h headers may not have EM_X86_64, or have EM_X8664 instead. */ +# ifndef EM_X86_64 +# define EM_X86_64 62 +# endif +# define ICU_ENTRY_OFFSET 0 +#endif + +#include <stdio.h> +#include <stdlib.h> +#include "unicode/putil.h" +#include "cmemory.h" +#include "cstring.h" +#include "filestrm.h" +#include "toolutil.h" +#include "unicode/uclean.h" +#include "uoptions.h" +#include "pkg_genc.h" +#include "filetools.h" +#include "charstr.h" +#include "unicode/errorcode.h" + +#define MAX_COLUMN ((uint32_t)(0xFFFFFFFFU)) + +#define HEX_0X 0 /* 0x1234 */ +#define HEX_0H 1 /* 01234h */ + +/* prototypes --------------------------------------------------------------- */ +static void +getOutFilename( + const char *inFilename, + const char *destdir, + char *outFilename, + int32_t outFilenameCapacity, + char *entryName, + int32_t entryNameCapacity, + const char *newSuffix, + const char *optFilename); + +static uint32_t +write8(FileStream *out, uint8_t byte, uint32_t column); + +static uint32_t +write32(FileStream *out, uint32_t byte, uint32_t column); + +#if U_PLATFORM == U_PF_OS400 +static uint32_t +write8str(FileStream *out, uint8_t byte, uint32_t column); +#endif +/* 
-------------------------------------------------------------------------- */ + +/* +Creating Template Files for New Platforms + +Let the cc compiler help you get started. +Compile this program + const unsigned int x[5] = {1, 2, 0xdeadbeef, 0xffffffff, 16}; +with the -S option to produce assembly output. + +For example, this will generate array.s: +gcc -S array.c + +This will produce a .s file that may look like this: + + .file "array.c" + .version "01.01" +gcc2_compiled.: + .globl x + .section .rodata + .align 4 + .type x,@object + .size x,20 +x: + .long 1 + .long 2 + .long -559038737 + .long -1 + .long 16 + .ident "GCC: (GNU) 2.96 20000731 (Red Hat Linux 7.1 2.96-85)" + +which gives a starting point that will compile, and can be transformed +to become the template, generally with some consulting of as docs and +some experimentation. + +If you want ICU to automatically use this assembly, you should +specify "GENCCODE_ASSEMBLY=-a name" in the specific config/mh-* file, +where the name is the compiler or platform that you used in this +assemblyHeader data structure. +*/ +static const struct AssemblyType { + const char *name; + const char *header; + const char *beginLine; + const char *footer; + int8_t hexType; /* HEX_0X or HEX_0h */ +} assemblyHeader[] = { + /* For gcc assemblers, the meaning of .align changes depending on the */ + /* hardware, so we use .balign 16 which always means 16 bytes. 
*/ + /* https://sourceware.org/binutils/docs/as/Pseudo-Ops.html */ + {"gcc", + ".globl %s\n" + "\t.section .note.GNU-stack,\"\",%%progbits\n" + "#ifdef __CET__\n" + "# include <cet.h>\n" + "#endif\n" + "\t.section .rodata\n" + "\t.balign 16\n" + "#ifdef U_HIDE_DATA_SYMBOL\n" + "\t.hidden %s\n" + "#endif\n" + "\t.type %s,%%object\n" + "%s:\n\n", + + ".long ",".size %s, .-%s\n",HEX_0X + }, + {"gcc-darwin", + /*"\t.section __TEXT,__text,regular,pure_instructions\n" + "\t.section __TEXT,__picsymbolstub1,symbol_stubs,pure_instructions,32\n"*/ + ".globl _%s\n" + "#ifdef U_HIDE_DATA_SYMBOL\n" + "\t.private_extern _%s\n" + "#endif\n" + "\t.data\n" + "\t.const\n" + "\t.balign 16\n" + "_%s:\n\n", + + ".long ","",HEX_0X + }, + /* macOS PPC should use `.p2align 4` instead `.balign 16` because is + * unknown pseudo ops for such legacy system*/ + {"gcc-darwin-ppc", + /*"\t.section __TEXT,__text,regular,pure_instructions\n" + "\t.section __TEXT,__picsymbolstub1,symbol_stubs,pure_instructions,32\n"*/ + ".globl _%s\n" + "#ifdef U_HIDE_DATA_SYMBOL\n" + "\t.private_extern _%s\n" + "#endif\n" + "\t.data\n" + "\t.const\n" + "\t.p2align 4\n" + "_%s:\n\n", + + ".long ","",HEX_0X + }, + {"gcc-cygwin", + ".globl _%s\n" + "\t.section .rodata\n" + "\t.balign 16\n" + "_%s:\n\n", + + ".long ","",HEX_0X + }, + {"gcc-mingw64", + ".globl %s\n" + "\t.section .rodata\n" + "\t.balign 16\n" + "%s:\n\n", + + ".long ","",HEX_0X + }, +/* 16 bytes alignment. */ +/* http://docs.oracle.com/cd/E19641-01/802-1947/802-1947.pdf */ + {"sun", + "\t.section \".rodata\"\n" + "\t.align 16\n" + ".globl %s\n" + "%s:\n", + + ".word ","",HEX_0X + }, +/* 16 bytes alignment for sun-x86. */ +/* http://docs.oracle.com/cd/E19963-01/html/821-1608/eoiyg.html */ + {"sun-x86", + "Drodata.rodata:\n" + "\t.type Drodata.rodata,@object\n" + "\t.size Drodata.rodata,0\n" + "\t.globl %s\n" + "\t.align 16\n" + "%s:\n", + + ".4byte ","",HEX_0X + }, +/* 1<<4 bit alignment for aix. 
*/ +/* http://pic.dhe.ibm.com/infocenter/aix/v6r1/index.jsp?topic=%2Fcom.ibm.aix.aixassem%2Fdoc%2Falangref%2Fidalangref_csect_pseudoop.htm */ + {"xlc", + ".globl %s{RO}\n" + "\t.toc\n" + "%s:\n" + "\t.csect %s{RO}, 4\n", + + ".long ","",HEX_0X + }, + {"aCC-ia64", + "\t.file \"%s.s\"\n" + "\t.type %s,@object\n" + "\t.global %s\n" + "\t.secalias .abe$0.rodata, \".rodata\"\n" + "\t.section .abe$0.rodata = \"a\", \"progbits\"\n" + "\t.align 16\n" + "%s::\t", + + "data4 ","",HEX_0X + }, + {"aCC-parisc", + "\t.SPACE $TEXT$\n" + "\t.SUBSPA $LIT$\n" + "%s\n" + "\t.EXPORT %s\n" + "\t.ALIGN 16\n", + + ".WORD ","",HEX_0X + }, +/* align 16 bytes */ +/* http://msdn.microsoft.com/en-us/library/dwa9fwef.aspx */ + {"nasm", + "global %s\n" +#if defined(_WIN32) + "section .rdata align=16\n" +#else + "section .rodata align=16\n" +#endif + "%s:\n", + " dd ","",HEX_0X + }, + { "masm", + "\tTITLE %s\n" + "; generated by genccode\n" + ".386\n" + ".model flat\n" + "\tPUBLIC _%s\n" + "ICUDATA_%s\tSEGMENT READONLY PARA PUBLIC FLAT 'DATA'\n" + "\tALIGN 16\n" + "_%s\tLABEL DWORD\n", + "\tDWORD ","\nICUDATA_%s\tENDS\n\tEND\n",HEX_0H + }, + { "masm64", + "\tTITLE %s\n" + "; generated by genccode\n" + "\tPUBLIC _%s\n" + "ICUDATA_%s\tSEGMENT READONLY 'DATA'\n" + "\tALIGN 16\n" + "_%s\tLABEL DWORD\n", + "\tDWORD ","\nICUDATA_%s\tENDS\n\tEND\n",HEX_0H + } +}; + +static int32_t assemblyHeaderIndex = -1; +static int32_t hexType = HEX_0X; + +U_CAPI UBool U_EXPORT2 +checkAssemblyHeaderName(const char* optAssembly) { + int32_t idx; + assemblyHeaderIndex = -1; + for (idx = 0; idx < UPRV_LENGTHOF(assemblyHeader); idx++) { + if (uprv_strcmp(optAssembly, assemblyHeader[idx].name) == 0) { + assemblyHeaderIndex = idx; + hexType = assemblyHeader[idx].hexType; /* set the hex type */ + return true; + } + } + + return false; +} + + +U_CAPI void U_EXPORT2 +printAssemblyHeadersToStdErr() { + int32_t idx; + fprintf(stderr, "%s", assemblyHeader[0].name); + for (idx = 1; idx < UPRV_LENGTHOF(assemblyHeader); idx++) { + 
fprintf(stderr, ", %s", assemblyHeader[idx].name); + } + fprintf(stderr, + ")\n"); +} + +U_CAPI void U_EXPORT2 +writeAssemblyCode( + const char *filename, + const char *destdir, + const char *optEntryPoint, + const char *optFilename, + char *outFilePath, + size_t outFilePathCapacity) { + uint32_t column = MAX_COLUMN; + char entry[96]; + union { + uint32_t uint32s[1024]; + char chars[4096]; + } buffer; + FileStream *in, *out; + size_t i, length, count; + + in=T_FileStream_open(filename, "rb"); + if(in==nullptr) { + fprintf(stderr, "genccode: unable to open input file %s\n", filename); + exit(U_FILE_ACCESS_ERROR); + } + + const char* newSuffix = nullptr; + + if (uprv_strcmp(assemblyHeader[assemblyHeaderIndex].name, "masm") == 0) { + newSuffix = ".masm"; + } + else if (uprv_strcmp(assemblyHeader[assemblyHeaderIndex].name, "nasm") == 0) { + newSuffix = ".asm"; + } else { + newSuffix = ".S"; + } + + getOutFilename( + filename, + destdir, + buffer.chars, + sizeof(buffer.chars), + entry, + sizeof(entry), + newSuffix, + optFilename); + out=T_FileStream_open(buffer.chars, "w"); + if(out==nullptr) { + fprintf(stderr, "genccode: unable to open output file %s\n", buffer.chars); + exit(U_FILE_ACCESS_ERROR); + } + + if (outFilePath != nullptr) { + if (uprv_strlen(buffer.chars) >= outFilePathCapacity) { + fprintf(stderr, "genccode: filename too long\n"); + exit(U_ILLEGAL_ARGUMENT_ERROR); + } + uprv_strcpy(outFilePath, buffer.chars); +#if defined (WINDOWS_WITH_GNUC) && U_PLATFORM != U_PF_CYGWIN + /* Need to fix the file separator character when using MinGW. 
*/ + swapFileSepChar(outFilePath, U_FILE_SEP_CHAR, '/'); +#endif + } + + if(optEntryPoint != nullptr) { + /* optEntryPoint, "_dat" (4 chars) and the terminating NUL must all fit into entry[] */ + if (uprv_strlen(optEntryPoint) + 5 > sizeof(entry)) { + fprintf(stderr, "genccode: entry name too long (long filename?)\n"); + exit(U_ILLEGAL_ARGUMENT_ERROR); + } + uprv_strcpy(entry, optEntryPoint); + uprv_strcat(entry, "_dat"); + } + + /* turn dashes or dots in the entry name into underscores */ + length=uprv_strlen(entry); + for(i=0; i<length; ++i) { + if(entry[i]=='-' || entry[i]=='.') { + entry[i]='_'; + } + } + + /* the header format uses the entry name several times; surplus snprintf arguments are ignored */ + count = snprintf( + buffer.chars, sizeof(buffer.chars), + assemblyHeader[assemblyHeaderIndex].header, + entry, entry, entry, entry, + entry, entry, entry, entry); + if (count >= sizeof(buffer.chars)) { + fprintf(stderr, "genccode: entry name too long (long filename?)\n"); + exit(U_ILLEGAL_ARGUMENT_ERROR); + } + T_FileStream_writeLine(out, buffer.chars); + T_FileStream_writeLine(out, assemblyHeader[assemblyHeaderIndex].beginLine); + + for(;;) { + /* zero the buffer so that a trailing partial word is padded with 0 bytes */ + memset(buffer.uint32s, 0, sizeof(buffer.uint32s)); + length=T_FileStream_read(in, buffer.uint32s, sizeof(buffer.uint32s)); + if(length==0) { + break; + } + /* round the word count up so that 1..3 trailing bytes are not silently dropped */ + for(i=0; i<((length+sizeof(buffer.uint32s[0])-1)/sizeof(buffer.uint32s[0])); i++) { + /* a trailing partial word is written too; its missing bytes are 0 from the memset above */
+ column = write32(out, buffer.uint32s[i], column); + } + } + + T_FileStream_writeLine(out, "\n"); + /* format the footer; for some flavors it references the entry name again */ + count = snprintf( + buffer.chars, sizeof(buffer.chars), + assemblyHeader[assemblyHeaderIndex].footer, + entry, entry, entry, entry, + entry, entry, entry, entry); + if (count >= sizeof(buffer.chars)) { + fprintf(stderr, "genccode: entry name too long (long filename?)\n"); + exit(U_ILLEGAL_ARGUMENT_ERROR); + } + T_FileStream_writeLine(out, buffer.chars); + + if(T_FileStream_error(in)) { + fprintf(stderr, "genccode: file read error while generating from file %s\n", filename); + exit(U_FILE_ACCESS_ERROR); + } + + if(T_FileStream_error(out)) { + fprintf(stderr, "genccode: file write error while generating from file %s\n", filename); + exit(U_FILE_ACCESS_ERROR); + } + + T_FileStream_close(out); + T_FileStream_close(in); +} +/** + * Converts the binary file `filename` into a C source file that defines a + * const struct (a leading double followed by the data bytes) named after the + * entry point. + * optName, if not null, is prepended to the entry name followed by '_'. + * optEntryPoint, if not null, replaces the whole entry name with + * optEntryPoint + "_dat". optFilename, if not null, replaces the file name + * part of the output path. If outFilePath is not null it receives the output + * path (at most outFilePathCapacity bytes including the NUL). + * Every failure prints a message to stderr and calls exit(). + */ +U_CAPI void U_EXPORT2 +writeCCode( + const char *filename, + const char *destdir, + const char *optEntryPoint, + const char *optName, + const char *optFilename, + char *outFilePath, + size_t outFilePathCapacity) { + uint32_t column = MAX_COLUMN; + char buffer[4096], entry[96]; + FileStream *in, *out; + size_t i, length, count; + + in=T_FileStream_open(filename, "rb"); + if(in==nullptr) { + fprintf(stderr, "genccode: unable to open input file %s\n", filename); + exit(U_FILE_ACCESS_ERROR); + } + + if(optName != nullptr) { /* prepend 'icudt28_' */ + // +2 includes the _ and the NUL + if (uprv_strlen(optName) + 2 > sizeof(entry)) { + fprintf(stderr, "genccode: entry name too long (long filename?)\n"); + exit(U_ILLEGAL_ARGUMENT_ERROR); + } + strcpy(entry, optName); + strcat(entry, "_"); + } else { + entry[0] = 0; + } + + getOutFilename( + filename, + destdir, + buffer, + static_cast<int32_t>(sizeof(buffer)), + entry + uprv_strlen(entry), + static_cast<int32_t>(sizeof(entry) - uprv_strlen(entry)), + ".c", + optFilename); + + if (outFilePath != nullptr) { + if (uprv_strlen(buffer) >= outFilePathCapacity) { + fprintf(stderr, "genccode: filename too 
long\n"); + exit(U_ILLEGAL_ARGUMENT_ERROR); + } + uprv_strcpy(outFilePath, buffer); +#if defined (WINDOWS_WITH_GNUC) && U_PLATFORM != U_PF_CYGWIN + /* Need to fix the file separator character when using MinGW. */ + swapFileSepChar(outFilePath, U_FILE_SEP_CHAR, '/'); +#endif + } + + out=T_FileStream_open(buffer, "w"); + if(out==nullptr) { + fprintf(stderr, "genccode: unable to open output file %s\n", buffer); + exit(U_FILE_ACCESS_ERROR); + } + + if(optEntryPoint != nullptr) { + /* optEntryPoint, "_dat" (4 chars) and the terminating NUL must all fit into entry[] */ + if (uprv_strlen(optEntryPoint) + 5 > sizeof(entry)) { + fprintf(stderr, "genccode: entry name too long (long filename?)\n"); + exit(U_ILLEGAL_ARGUMENT_ERROR); + } + /* an explicit entry point replaces the whole name, including any optName prefix */ + uprv_strcpy(entry, optEntryPoint); + uprv_strcat(entry, "_dat"); + } + + /* turn dashes or dots in the entry name into underscores */ + length=uprv_strlen(entry); + for(i=0; i<length; ++i) { + if(entry[i]=='-' || entry[i]=='.') { + entry[i]='_'; + } + } + +#if U_PLATFORM == U_PF_OS400 + /* + TODO: Fix this once the compiler implements this feature. Keep in sync with udatamem.c + + This is here because this platform can't currently put + const data into the read-only pages of an object or + shared library (service program). Only strings are allowed in read-only + pages, so we use char * strings to store the data. + + In order to prevent the beginning of the data from ever matching the + magic numbers we must still use the initial double. 
+ [grhoten 4/24/2003] + */ + count = snprintf(buffer, sizeof(buffer), + "#ifndef IN_GENERATED_CCODE\n" + "#define IN_GENERATED_CCODE\n" + "#define U_DISABLE_RENAMING 1\n" + "#include \"unicode/umachine.h\"\n" + "#endif\n" + "U_CDECL_BEGIN\n" + "const struct {\n" + " double bogus;\n" + " const char *bytes; \n" + "} %s={ 0.0, \n", + entry); + if (count >= sizeof(buffer)) { + fprintf(stderr, "genccode: entry name too long (long filename?)\n"); + exit(U_ILLEGAL_ARGUMENT_ERROR); + } + T_FileStream_writeLine(out, buffer); + /* emit every input byte as an escape sequence inside string literals (write8str) */ + for(;;) { + length=T_FileStream_read(in, buffer, sizeof(buffer)); + if(length==0) { + break; + } + for(i=0; i<length; ++i) { + column = write8str(out, (uint8_t)buffer[i], column); + } + } + + T_FileStream_writeLine(out, "\"\n};\nU_CDECL_END\n"); +#else + /* Function renaming shouldn't be done in data */ + count = snprintf(buffer, sizeof(buffer), + "#ifndef IN_GENERATED_CCODE\n" + "#define IN_GENERATED_CCODE\n" + "#define U_DISABLE_RENAMING 1\n" + "#include \"unicode/umachine.h\"\n" + "#endif\n" + "U_CDECL_BEGIN\n" + "const struct {\n" + " double bogus;\n" + " uint8_t bytes[%ld]; \n" + "} %s={ 0.0, {\n", + (long)T_FileStream_size(in), entry); + if (count >= sizeof(buffer)) { + fprintf(stderr, "genccode: entry name too long (long filename?)\n"); + exit(U_ILLEGAL_ARGUMENT_ERROR); + } + T_FileStream_writeLine(out, buffer); + /* emit every input byte as a decimal array element (write8); buffer is reused for the data from here on */ + for(;;) { + length=T_FileStream_read(in, buffer, sizeof(buffer)); + if(length==0) { + break; + } + for(i=0; i<length; ++i) { + column = write8(out, (uint8_t)buffer[i], column); + } + } + + T_FileStream_writeLine(out, "\n}\n};\nU_CDECL_END\n"); +#endif + /* stream errors are only checked here, after the copy loop has finished */ + if(T_FileStream_error(in)) { + fprintf(stderr, "genccode: file read error while generating from file %s\n", filename); + exit(U_FILE_ACCESS_ERROR); + } + + if(T_FileStream_error(out)) { + fprintf(stderr, "genccode: file write error while generating from file %s\n", filename); + exit(U_FILE_ACCESS_ERROR); + } + + T_FileStream_close(out); + T_FileStream_close(in); +} + +static 
uint32_t +write32(FileStream *out, uint32_t bitField, uint32_t column) { /* + * Writes one 32-bit word of the data as an assembler literal. + * column counts the values already written on the current output line; + * MAX_COLUMN marks the very first value. Values are separated by commas, + * and after 32 values a new line is started with the beginLine directive + * of the selected assembler flavor. Returns the updated column. + */ + int32_t i; + char bitFieldStr[64]; /* This is more bits than needed for a 32-bit number */ + char *s = bitFieldStr; + uint8_t *ptrIdx = (uint8_t *)&bitField; + static const char hexToStr[16] = { + '0','1','2','3', + '4','5','6','7', + '8','9','A','B', + 'C','D','E','F' + }; + + /* write the value, possibly with comma and newline */ + if(column==MAX_COLUMN) { + /* first byte */ + column=1; + } else if(column<32) { + *(s++)=','; + ++column; + } else { + *(s++)='\n'; + uprv_strcpy(s, assemblyHeader[assemblyHeaderIndex].beginLine); + s+=uprv_strlen(s); + column=1; + } + + if (bitField < 10) { + /* It's a small number. Don't waste the space for 0x */ + *(s++)=hexToStr[bitField]; + } + else { + int seenNonZero = 0; /* This is used to remove leading zeros */ + + if(hexType==HEX_0X) { + *(s++)='0'; + *(s++)='x'; + } else if(hexType==HEX_0H) { + *(s++)='0'; + } + + /* This creates a 32-bit field: bytes are visited from most to least significant, leading zero bytes are skipped */ +#if U_IS_BIG_ENDIAN + for (i = 0; i < sizeof(uint32_t); i++) +#else + for (i = sizeof(uint32_t)-1; i >= 0 ; i--) +#endif + { + uint8_t value = ptrIdx[i]; + if (value || seenNonZero) { + *(s++)=hexToStr[value>>4]; + *(s++)=hexToStr[value&0xF]; + seenNonZero = 1; + } + } + if(hexType==HEX_0H) { + *(s++)='h'; + } + } + + *(s++)=0; + T_FileStream_writeLine(out, bitFieldStr); + return column; +} +/* + * Writes one byte as a decimal number for a C array initializer. + * Values are separated by commas, with a line break after every 16 values; + * column==MAX_COLUMN marks the very first value. Returns the updated column. + */ +static uint32_t +write8(FileStream *out, uint8_t byte, uint32_t column) { + char s[4]; + int i=0; + + /* convert the byte value to a string */ + if(byte>=100) { + s[i++]=(char)('0'+byte/100); + byte%=100; + } + if(i>0 || byte>=10) { + s[i++]=(char)('0'+byte/10); + byte%=10; + } + s[i++]=(char)('0'+byte); + s[i]=0; + + /* write the value, possibly with comma and newline */ + if(column==MAX_COLUMN) { + /* first byte */ + column=1; + } else if(column<16) { + T_FileStream_writeLine(out, ","); + ++column; + } else { + T_FileStream_writeLine(out, ",\n"); + column=1; + } + T_FileStream_writeLine(out, s); + return column; 
+} +/* + * OS/400 only: writes one byte as an escape sequence inside a C string + * literal. Values 0..7 use a one-digit escape, all others a \x hex escape. + * The literal is opened before the first byte and closed/reopened on a new + * line after every 24 bytes; the caller writes the final closing quote. + */ +#if U_PLATFORM == U_PF_OS400 +static uint32_t +write8str(FileStream *out, uint8_t byte, uint32_t column) { + char s[8]; + + if (byte > 7) + snprintf(s, sizeof(s), "\\x%X", byte); + else + snprintf(s, sizeof(s), "\\%X", byte); + + /* write the value, possibly opening the literal or continuing it on a new line */ + if(column==MAX_COLUMN) { + /* first byte */ + column=1; + T_FileStream_writeLine(out, "\""); + } else if(column<24) { + ++column; + } else { + T_FileStream_writeLine(out, "\"\n\""); + column=1; + } + T_FileStream_writeLine(out, s); + return column; +} +#endif +/** + * Builds the output file path and the default entry point name from the + * input file name. + * Path: destdir (with a trailing file separator ensured) if it is not empty, + * otherwise the directory part of inFilename. + * Name: the basename of inFilename with every '-' before the last '.' turned + * into '_' and that last '.' turned into '_'; a basename without '.' is used + * as is. The same text becomes the entry name. + * optFilename, if not null, replaces the name part of the output file only. + * newSuffix is appended to the output file name. + * Exits the process if building fails or a result does not fit its capacity. + */ +static void +getOutFilename( + const char *inFilename, + const char *destdir, + char *outFilename, + int32_t outFilenameCapacity, + char *entryName, + int32_t entryNameCapacity, + const char *newSuffix, + const char *optFilename) { + const char *basename=findBasename(inFilename), *suffix=uprv_strrchr(basename, '.'); + + icu::CharString outFilenameBuilder; + icu::CharString entryNameBuilder; + icu::ErrorCode status; + + /* copy path */ + if(destdir!=nullptr && *destdir!=0) { + outFilenameBuilder.append(destdir, status); + outFilenameBuilder.ensureEndsWithFileSeparator(status); + } else { + outFilenameBuilder.append(inFilename, static_cast<int32_t>(basename - inFilename), status); + } + inFilename=basename; + + if(suffix==nullptr) { + /* the filename does not have a suffix */ + entryNameBuilder.append(inFilename, status); + if(optFilename != nullptr) { + outFilenameBuilder.append(optFilename, status); + } else { + outFilenameBuilder.append(inFilename, status); + } + outFilenameBuilder.append(newSuffix, status); + } else { + int32_t saveOutFilenameLength = outFilenameBuilder.length(); + /* copy basename */ + while(inFilename<suffix) { + // iSeries cannot have '-' in the .o objects. + char c = (*inFilename=='-') ? '_' : *inFilename; + outFilenameBuilder.append(c, status); + entryNameBuilder.append(c, status); + inFilename++; + } + + /* replace '.' 
by '_' */ + outFilenameBuilder.append('_', status); + entryNameBuilder.append('_', status); + ++inFilename; + + /* copy suffix */ + outFilenameBuilder.append(inFilename, status); + entryNameBuilder.append(inFilename, status); + + if(optFilename != nullptr) { + outFilenameBuilder.truncate(saveOutFilenameLength); + outFilenameBuilder.append(optFilename, status); + } + // add ".c" + outFilenameBuilder.append(newSuffix, status); + } + + if (status.isFailure()) { + fprintf(stderr, "genccode: error building filename or entrypoint\n"); + exit(status.get()); + } + + if (outFilenameBuilder.length() >= outFilenameCapacity) { + fprintf(stderr, "genccode: output filename too long\n"); + exit(U_ILLEGAL_ARGUMENT_ERROR); + } + + if (entryNameBuilder.length() >= entryNameCapacity) { + fprintf(stderr, "genccode: entry name too long (long filename?)\n"); + exit(U_ILLEGAL_ARGUMENT_ERROR); + } + + outFilenameBuilder.extract(outFilename, outFilenameCapacity, status); + entryNameBuilder.extract(entryName, entryNameCapacity, status); +} + +#ifdef CAN_GENERATE_OBJECTS +static void +getArchitecture(uint16_t *pCPU, uint16_t *pBits, UBool *pIsBigEndian, const char *optMatchArch) { + union { + char bytes[2048]; +#ifdef U_ELF + Elf32_Ehdr header32; + /* Elf32_Ehdr and ELF64_Ehdr are identical for the necessary fields. */ +#elif U_PLATFORM_HAS_WIN32_API + IMAGE_FILE_HEADER header; +#endif + } buffer; + + const char *filename; + FileStream *in; + int32_t length; + +#ifdef U_ELF + +#elif U_PLATFORM_HAS_WIN32_API + const IMAGE_FILE_HEADER *pHeader; +#else +# error "Unknown platform for CAN_GENERATE_OBJECTS." +#endif + + if(optMatchArch != nullptr) { + filename=optMatchArch; + } else { + /* set defaults */ +#ifdef U_ELF + /* set EM_386 because elf.h does not provide better defaults */ + *pCPU=EM_386; + *pBits=32; + *pIsBigEndian=(UBool)(U_IS_BIG_ENDIAN ? ELFDATA2MSB : ELFDATA2LSB); +#elif U_PLATFORM_HAS_WIN32_API + // Windows always runs in little-endian mode. 
+ *pIsBigEndian = false; + + // Note: The various _M_<arch> macros are predefined by the MSVC compiler based + // on the target compilation architecture. + // https://docs.microsoft.com/cpp/preprocessor/predefined-macros + + // link.exe will link an IMAGE_FILE_MACHINE_UNKNOWN data-only .obj file + // no matter what architecture it is targeting (though other values are + // required to match). Unfortunately, the variable name decoration/mangling + // is slightly different on x86, which means we can't use the UNKNOWN type + // for all architectures though. +# if defined(_M_IX86) + *pCPU = IMAGE_FILE_MACHINE_I386; +# else + *pCPU = IMAGE_FILE_MACHINE_UNKNOWN; +# endif +# if defined(_M_IA64) || defined(_M_AMD64) || defined (_M_ARM64) + *pBits = 64; // Doesn't seem to be used for anything interesting though? +# elif defined(_M_IX86) || defined(_M_ARM) + *pBits = 32; +# else +# error "Unknown platform for CAN_GENERATE_OBJECTS." +# endif +#else +# error "Unknown platform for CAN_GENERATE_OBJECTS." +#endif + return; + } + /* match-arch mode: take the architecture from the header of the given file */ + in=T_FileStream_open(filename, "rb"); + if(in==nullptr) { + fprintf(stderr, "genccode: unable to open match-arch file %s\n", filename); + exit(U_FILE_ACCESS_ERROR); + } + length=T_FileStream_read(in, buffer.bytes, sizeof(buffer.bytes)); +/* only the leading bytes of the file are read; the checks below need just the file header */ +#ifdef U_ELF + if(length<(int32_t)sizeof(Elf32_Ehdr)) { + fprintf(stderr, "genccode: match-arch file %s is too short\n", filename); + exit(U_UNSUPPORTED_ERROR); + } + if( + buffer.header32.e_ident[0]!=ELFMAG0 || + buffer.header32.e_ident[1]!=ELFMAG1 || + buffer.header32.e_ident[2]!=ELFMAG2 || + buffer.header32.e_ident[3]!=ELFMAG3 || + buffer.header32.e_ident[EI_CLASS]<ELFCLASS32 || buffer.header32.e_ident[EI_CLASS]>ELFCLASS64 + ) { + fprintf(stderr, "genccode: match-arch file %s is not an ELF object file, or not supported\n", filename); + exit(U_UNSUPPORTED_ERROR); + } + /* the ELF class selects between the 32-bit and the 64-bit object layout */ + *pBits= buffer.header32.e_ident[EI_CLASS]==ELFCLASS32 ? 
32 : 64; /* only 32 or 64: see check above */ +#ifdef U_ELF64 /* defensive only: *pBits was just set to 32 or 64 */ + if(*pBits!=32 && *pBits!=64) { + fprintf(stderr, "genccode: currently only supports 32-bit and 64-bit ELF format\n"); + exit(U_UNSUPPORTED_ERROR); + } +#else + if(*pBits!=32) { + fprintf(stderr, "genccode: built with elf.h missing 64-bit definitions\n"); + exit(U_UNSUPPORTED_ERROR); + } +#endif + /* a byte order different from the host's is rejected; see the byte swapping TODO below */ + *pIsBigEndian=(UBool)(buffer.header32.e_ident[EI_DATA]==ELFDATA2MSB); + if(*pIsBigEndian!=U_IS_BIG_ENDIAN) { + fprintf(stderr, "genccode: currently only same-endianness ELF formats are supported\n"); + exit(U_UNSUPPORTED_ERROR); + } + /* TODO: Support byte swapping */ + /* e_machine is read through the 32-bit header layout for both ELF classes */ + *pCPU=buffer.header32.e_machine; +#elif U_PLATFORM_HAS_WIN32_API + if(length<sizeof(IMAGE_FILE_HEADER)) { + fprintf(stderr, "genccode: match-arch file %s is too short\n", filename); + exit(U_UNSUPPORTED_ERROR); + } + /* TODO: Use buffer.header. Keep aliasing legal. */ + pHeader=(const IMAGE_FILE_HEADER *)buffer.bytes; + *pCPU=pHeader->Machine; + /* + * The number of bits is implicit with the Machine value. + * *pBits is ignored in the calling code, so this need not be precise. + */ + *pBits= *pCPU==IMAGE_FILE_MACHINE_I386 ? 32 : 64; + /* Windows always runs on little-endian CPUs. */ + *pIsBigEndian=false; +#else +# error "Unknown platform for CAN_GENERATE_OBJECTS." 
+#endif + + T_FileStream_close(in); +} +/** + * Writes the binary file `filename` as an object file (ELF ".o", or ".obj" + * on Windows) with one read-only data section and one global symbol for the + * entry point. + * The target architecture comes from getArchitecture(optMatchArch). + * If optEntryPoint is not null the symbol name is optEntryPoint + "_dat"; + * dashes in the symbol name are turned into underscores. + * If outFilePath is not null it receives the output path (at most + * outFilePathCapacity bytes including the NUL). + * optWinDllExport (Windows only) adds an "-export:" linker directive for the + * symbol. Every failure prints a message to stderr and calls exit(). + */ +U_CAPI void U_EXPORT2 +writeObjectCode( + const char *filename, + const char *destdir, + const char *optEntryPoint, + const char *optMatchArch, + const char *optFilename, + char *outFilePath, + size_t outFilePathCapacity, + UBool optWinDllExport) { + /* common variables */ + char buffer[4096], entry[96]={ 0 }; + FileStream *in, *out; + const char *newSuffix; + int32_t i, entryLength, length, size, entryOffset=0, entryLengthOffset=0; + + uint16_t cpu, bits; + UBool makeBigEndian; + + (void)optWinDllExport; /* unused except Windows */ + + /* platform-specific variables and initialization code */ +#ifdef U_ELF + /* 32-bit Elf file header */ + static Elf32_Ehdr header32={ + { + /* e_ident[] */ + ELFMAG0, ELFMAG1, ELFMAG2, ELFMAG3, + ELFCLASS32, + U_IS_BIG_ENDIAN ? ELFDATA2MSB : ELFDATA2LSB, + EV_CURRENT /* EI_VERSION */ + }, + ET_REL, /* e_type */ + EM_386, /* e_machine: default, replaced with the target cpu before writing */ + EV_CURRENT, /* e_version */ + 0, /* e_entry */ + 0, /* e_phoff */ + (Elf32_Off)sizeof(Elf32_Ehdr), /* e_shoff */ + 0, /* e_flags */ + (Elf32_Half)sizeof(Elf32_Ehdr), /* e_ehsize */ + 0, /* e_phentsize */ + 0, /* e_phnum */ + (Elf32_Half)sizeof(Elf32_Shdr), /* e_shentsize */ + 5, /* e_shnum */ + 2 /* e_shstrndx */ + }; + + /* 32-bit Elf section header table */ + static Elf32_Shdr sectionHeaders32[5]={ + { /* SHN_UNDEF */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + }, + { /* .symtab */ + 1, /* sh_name */ + SHT_SYMTAB, /* sh_type */ + 0, /* sh_flags */ + 0, /* sh_addr */ + (Elf32_Off)(sizeof(header32)+sizeof(sectionHeaders32)), /* sh_offset */ + (Elf32_Word)(2*sizeof(Elf32_Sym)), /* sh_size */ + 3, /* sh_link=sect hdr index of .strtab */ + 1, /* sh_info=One greater than the symbol table index of the last + * local symbol (with STB_LOCAL). 
*/ + 4, /* sh_addralign */ + (Elf32_Word)(sizeof(Elf32_Sym)) /* sh_entsize */ + }, + { /* .shstrtab */ + 9, /* sh_name */ + SHT_STRTAB, /* sh_type */ + 0, /* sh_flags */ + 0, /* sh_addr */ + (Elf32_Off)(sizeof(header32)+sizeof(sectionHeaders32)+2*sizeof(Elf32_Sym)), /* sh_offset */ + 40, /* sh_size */ + 0, /* sh_link */ + 0, /* sh_info */ + 1, /* sh_addralign */ + 0 /* sh_entsize */ + }, + { /* .strtab */ + 19, /* sh_name */ + SHT_STRTAB, /* sh_type */ + 0, /* sh_flags */ + 0, /* sh_addr */ + (Elf32_Off)(sizeof(header32)+sizeof(sectionHeaders32)+2*sizeof(Elf32_Sym)+40), /* sh_offset */ + (Elf32_Word)sizeof(entry), /* sh_size */ + 0, /* sh_link */ + 0, /* sh_info */ + 1, /* sh_addralign */ + 0 /* sh_entsize */ + }, + { /* .rodata */ + 27, /* sh_name */ + SHT_PROGBITS, /* sh_type */ + SHF_ALLOC, /* sh_flags */ + 0, /* sh_addr */ + (Elf32_Off)(sizeof(header32)+sizeof(sectionHeaders32)+2*sizeof(Elf32_Sym)+40+sizeof(entry)), /* sh_offset */ + 0, /* sh_size: set to the input file size before writing */ + 0, /* sh_link */ + 0, /* sh_info */ + 16, /* sh_addralign */ + 0 /* sh_entsize */ + } + }; + + /* symbol table */ + static Elf32_Sym symbols32[2]={ + { /* STN_UNDEF */ + 0, 0, 0, 0, 0, 0 + }, + { /* data entry point */ + 1, /* st_name */ + 0, /* st_value */ + 0, /* st_size: set to the input file size before writing */ + ELF64_ST_INFO(STB_GLOBAL, STT_OBJECT), /* st_info */ + 0, /* st_other */ + 4 /* st_shndx=index of related section table entry */ + } + }; + + /* section header string table, with decimal string offsets */ + static const char sectionStrings[40]= + /* 0 */ "\0" + /* 1 */ ".symtab\0" + /* 9 */ ".shstrtab\0" + /* 19 */ ".strtab\0" + /* 27 */ ".rodata\0" + /* 35 */ "\0\0\0\0"; /* contains terminating NUL */ + /* 40: padded to multiple of 8 bytes */ + + /* + * Use entry[] for the string table which will contain only the + * entry point name. + * entry[0] must be 0 (NUL) + * The entry point name can be up to sizeof(entry)-2 characters long + * (94 with the 96-byte entry[] buffer): one byte each is needed for + * the leading and the terminating NUL. 
+ */ + + /* 16-align .rodata in the .o file, just in case */ + static const char padding[16]={ 0 }; + int32_t paddingSize; /* number of padding bytes written in front of the data */ + +#ifdef U_ELF64 + /* 64-bit Elf file header */ + static Elf64_Ehdr header64={ + { + /* e_ident[] */ + ELFMAG0, ELFMAG1, ELFMAG2, ELFMAG3, + ELFCLASS64, + U_IS_BIG_ENDIAN ? ELFDATA2MSB : ELFDATA2LSB, + EV_CURRENT /* EI_VERSION */ + }, + ET_REL, /* e_type */ + EM_X86_64, /* e_machine: default, replaced with the target cpu before writing */ + EV_CURRENT, /* e_version */ + 0, /* e_entry */ + 0, /* e_phoff */ + (Elf64_Off)sizeof(Elf64_Ehdr), /* e_shoff */ + 0, /* e_flags */ + (Elf64_Half)sizeof(Elf64_Ehdr), /* e_ehsize */ + 0, /* e_phentsize */ + 0, /* e_phnum */ + (Elf64_Half)sizeof(Elf64_Shdr), /* e_shentsize */ + 5, /* e_shnum */ + 2 /* e_shstrndx */ + }; + + /* 64-bit Elf section header table */ + static Elf64_Shdr sectionHeaders64[5]={ + { /* SHN_UNDEF */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + }, + { /* .symtab */ + 1, /* sh_name */ + SHT_SYMTAB, /* sh_type */ + 0, /* sh_flags */ + 0, /* sh_addr */ + (Elf64_Off)(sizeof(header64)+sizeof(sectionHeaders64)), /* sh_offset */ + (Elf64_Xword)(2*sizeof(Elf64_Sym)), /* sh_size */ + 3, /* sh_link=sect hdr index of .strtab */ + 1, /* sh_info=One greater than the symbol table index of the last + * local symbol (with STB_LOCAL). 
*/ + 4, /* sh_addralign */ + (Elf64_Xword)(sizeof(Elf64_Sym)) /* sh_entsize */ + }, + { /* .shstrtab */ + 9, /* sh_name */ + SHT_STRTAB, /* sh_type */ + 0, /* sh_flags */ + 0, /* sh_addr */ + (Elf64_Off)(sizeof(header64)+sizeof(sectionHeaders64)+2*sizeof(Elf64_Sym)), /* sh_offset */ + 40, /* sh_size */ + 0, /* sh_link */ + 0, /* sh_info */ + 1, /* sh_addralign */ + 0 /* sh_entsize */ + }, + { /* .strtab */ + 19, /* sh_name */ + SHT_STRTAB, /* sh_type */ + 0, /* sh_flags */ + 0, /* sh_addr */ + (Elf64_Off)(sizeof(header64)+sizeof(sectionHeaders64)+2*sizeof(Elf64_Sym)+40), /* sh_offset */ + (Elf64_Xword)sizeof(entry), /* sh_size */ + 0, /* sh_link */ + 0, /* sh_info */ + 1, /* sh_addralign */ + 0 /* sh_entsize */ + }, + { /* .rodata */ + 27, /* sh_name */ + SHT_PROGBITS, /* sh_type */ + SHF_ALLOC, /* sh_flags */ + 0, /* sh_addr */ + (Elf64_Off)(sizeof(header64)+sizeof(sectionHeaders64)+2*sizeof(Elf64_Sym)+40+sizeof(entry)), /* sh_offset */ + 0, /* sh_size: set to the input file size before writing */ + 0, /* sh_link */ + 0, /* sh_info */ + 16, /* sh_addralign */ + 0 /* sh_entsize */ + } + }; + + /* + * 64-bit symbol table + * careful: different order of items compared with Elf32_sym! 
+ */ + static Elf64_Sym symbols64[2]={ + { /* STN_UNDEF */ + 0, 0, 0, 0, 0, 0 + }, + { /* data entry point */ + 1, /* st_name */ + ELF64_ST_INFO(STB_GLOBAL, STT_OBJECT), /* st_info */ + 0, /* st_other */ + 4, /* st_shndx=index of related section table entry */ + 0, /* st_value */ + 0 /* st_size: set to the input file size before writing */ + } + }; + +#endif /* U_ELF64 */ + + /* entry[] has a leading NUL */ + entryOffset=1; + + /* in the common code, count entryLength from after the NUL */ + entryLengthOffset=1; + + newSuffix=".o"; + +#elif U_PLATFORM_HAS_WIN32_API + struct { + IMAGE_FILE_HEADER fileHeader; + IMAGE_SECTION_HEADER sections[2]; + char linkerOptions[100]; + } objHeader; + IMAGE_SYMBOL symbols[1]; + struct { + DWORD sizeofLongNames; + char longNames[100]; + } symbolNames; + + /* + * entry sometimes has a leading '_' + * which is overwritten if entryOffset==0, depending on the target platform; + * see the check for cpu below + */ + entry[0]='_'; + + newSuffix=".obj"; +#else +# error "Unknown platform for CAN_GENERATE_OBJECTS." +#endif + + /* deal with options, files and the entry point name */ + getArchitecture(&cpu, &bits, &makeBigEndian, optMatchArch); + if (optMatchArch) + { + printf("genccode: --match-arch cpu=%hu bits=%hu big-endian=%d\n", cpu, bits, makeBigEndian); + } + else + { + printf("genccode: using architecture cpu=%hu bits=%hu big-endian=%d\n", cpu, bits, makeBigEndian); + } +#if U_PLATFORM_HAS_WIN32_API + if(cpu==IMAGE_FILE_MACHINE_I386) { + entryOffset=1; /* keep the '_' stored in entry[0] in front of the name */ + } +#endif + + in=T_FileStream_open(filename, "rb"); + if(in==nullptr) { + fprintf(stderr, "genccode: unable to open input file %s\n", filename); + exit(U_FILE_ACCESS_ERROR); + } + size=T_FileStream_size(in); + + getOutFilename( + filename, + destdir, + buffer, + sizeof(buffer), + entry + entryOffset, + sizeof(entry) - entryOffset, + newSuffix, + optFilename); + + if (outFilePath != nullptr) { + if (uprv_strlen(buffer) >= outFilePathCapacity) { + fprintf(stderr, "genccode: filename too long\n"); + exit(U_ILLEGAL_ARGUMENT_ERROR); + } + 
uprv_strcpy(outFilePath, buffer); + } + + if(optEntryPoint != nullptr) { + /* optEntryPoint, "_dat" (4 chars) and the terminating NUL must fit behind entryOffset */ + if(uprv_strlen(optEntryPoint)+5 > sizeof(entry)-entryOffset) { + fprintf(stderr, "genccode: entry name too long (long filename?)\n"); + exit(U_ILLEGAL_ARGUMENT_ERROR); + } + uprv_strcpy(entry+entryOffset, optEntryPoint); + uprv_strcat(entry+entryOffset, "_dat"); + } + /* turn dashes in the entry name into underscores */ + entryLength=(int32_t)uprv_strlen(entry+entryLengthOffset); + for(i=0; i<entryLength; ++i) { + if(entry[entryLengthOffset+i]=='-') { + entry[entryLengthOffset+i]='_'; + } + } + + /* open the output file */ + out=T_FileStream_open(buffer, "wb"); + if(out==nullptr) { + fprintf(stderr, "genccode: unable to open output file %s\n", buffer); + exit(U_FILE_ACCESS_ERROR); + } + +#ifdef U_ELF + if(bits==32) { + header32.e_ident[EI_DATA]= makeBigEndian ? ELFDATA2MSB : ELFDATA2LSB; + header32.e_machine=cpu; + + /* + * 16-align .rodata in the .o file, just in case. + * sectionHeaders32[] is static, so the offset is reset to the unpadded + * layout on every call; adjusting sh_offset only in place would make a + * second call in the same process see an already aligned offset and + * skip the padding bytes that sh_offset still accounts for. + */ + sectionHeaders32[4].sh_offset=(Elf32_Off)(sizeof(header32)+sizeof(sectionHeaders32)+2*sizeof(Elf32_Sym)+40+sizeof(entry)); + paddingSize=sectionHeaders32[4].sh_offset & 0xf; + if(paddingSize!=0) { + paddingSize=0x10-paddingSize; + sectionHeaders32[4].sh_offset+=paddingSize; + } + + sectionHeaders32[4].sh_size=(Elf32_Word)size; + + symbols32[1].st_size=(Elf32_Word)size; + + /* write .o headers */ + T_FileStream_write(out, &header32, (int32_t)sizeof(header32)); + T_FileStream_write(out, sectionHeaders32, (int32_t)sizeof(sectionHeaders32)); + T_FileStream_write(out, symbols32, (int32_t)sizeof(symbols32)); + } else /* bits==64 */ { +#ifdef U_ELF64 + header64.e_ident[EI_DATA]= makeBigEndian ? 
ELFDATA2MSB : ELFDATA2LSB; + header64.e_machine=cpu; + + /* + * 16-align .rodata in the .o file, just in case. + * sectionHeaders64[] is static, so the offset is reset to the unpadded + * layout on every call; adjusting sh_offset only in place would make a + * second call in the same process see an already aligned offset and + * skip the padding bytes that sh_offset still accounts for. + */ + sectionHeaders64[4].sh_offset=(Elf64_Off)(sizeof(header64)+sizeof(sectionHeaders64)+2*sizeof(Elf64_Sym)+40+sizeof(entry)); + paddingSize=sectionHeaders64[4].sh_offset & 0xf; + if(paddingSize!=0) { + paddingSize=0x10-paddingSize; + sectionHeaders64[4].sh_offset+=paddingSize; + } + + sectionHeaders64[4].sh_size=(Elf64_Xword)size; + + symbols64[1].st_size=(Elf64_Xword)size; + + /* write .o headers */ + T_FileStream_write(out, &header64, (int32_t)sizeof(header64)); + T_FileStream_write(out, sectionHeaders64, (int32_t)sizeof(sectionHeaders64)); + T_FileStream_write(out, symbols64, (int32_t)sizeof(symbols64)); +#endif + } + + /* the section name strings, the symbol name table (entry[]) and the padding follow the headers */ + T_FileStream_write(out, sectionStrings, (int32_t)sizeof(sectionStrings)); + T_FileStream_write(out, entry, (int32_t)sizeof(entry)); + if(paddingSize!=0) { + T_FileStream_write(out, padding, paddingSize); + } +#elif U_PLATFORM_HAS_WIN32_API + /* populate the .obj headers */ + uprv_memset(&objHeader, 0, sizeof(objHeader)); + uprv_memset(&symbols, 0, sizeof(symbols)); + uprv_memset(&symbolNames, 0, sizeof(symbolNames)); + + /* write the linker export directive */ + if (optWinDllExport) { + /* "-export:" (8), the entry name, ",data " (6) and the NUL must fit into linkerOptions[] */ + if (8+entryLength+6 >= (int32_t)sizeof(objHeader.linkerOptions)) { + fprintf(stderr, "genccode: entry name too long (long filename?)\n"); + exit(U_ILLEGAL_ARGUMENT_ERROR); + } + uprv_strcpy(objHeader.linkerOptions, "-export:"); + length=8; + uprv_strcpy(objHeader.linkerOptions+length, entry); + length+=entryLength; + uprv_strcpy(objHeader.linkerOptions+length, ",data "); + length+=6; + } + else { + length=0; + } + + /* set the file header */ + objHeader.fileHeader.Machine=cpu; + objHeader.fileHeader.NumberOfSections=2; + objHeader.fileHeader.TimeDateStamp=(DWORD)time(nullptr); + objHeader.fileHeader.PointerToSymbolTable=IMAGE_SIZEOF_FILE_HEADER+2*IMAGE_SIZEOF_SECTION_HEADER+length+size; /* start of symbol table */ + objHeader.fileHeader.NumberOfSymbols=1; + + /* set the section for the linker options */ + uprv_strncpy((char *)objHeader.sections[0].Name, ".drectve", 8); + objHeader.sections[0].SizeOfRawData=length; + objHeader.sections[0].PointerToRawData=IMAGE_SIZEOF_FILE_HEADER+2*IMAGE_SIZEOF_SECTION_HEADER; + 
objHeader.sections[0].Characteristics=IMAGE_SCN_LNK_INFO|IMAGE_SCN_LNK_REMOVE|IMAGE_SCN_ALIGN_1BYTES; + + /* set the data section; its raw data starts right after the linker options */ + uprv_strncpy((char *)objHeader.sections[1].Name, ".rdata", 6); + objHeader.sections[1].SizeOfRawData=size; + objHeader.sections[1].PointerToRawData=IMAGE_SIZEOF_FILE_HEADER+2*IMAGE_SIZEOF_SECTION_HEADER+length; + objHeader.sections[1].Characteristics=IMAGE_SCN_CNT_INITIALIZED_DATA|IMAGE_SCN_ALIGN_16BYTES|IMAGE_SCN_MEM_READ; + + /* set the symbol table: names of up to 8 bytes are stored inline, longer ones go into the string table at offset 4, just past its 4-byte size field */ + if(entryLength<=8) { + uprv_strncpy((char *)symbols[0].N.ShortName, entry, entryLength); + symbolNames.sizeofLongNames=4; + } else { + symbols[0].N.Name.Short=0; + symbols[0].N.Name.Long=4; + symbolNames.sizeofLongNames=4+entryLength+1; + uprv_strcpy(symbolNames.longNames, entry); + } + symbols[0].SectionNumber=2; + symbols[0].StorageClass=IMAGE_SYM_CLASS_EXTERNAL; + + /* write the file header, both section headers and the linker options section in one go; this presumably relies on objHeader having no padding between its members (TODO confirm) */ + T_FileStream_write(out, &objHeader, objHeader.sections[1].PointerToRawData); +#else +# error "Unknown platform for CAN_GENERATE_OBJECTS." 
+#endif + + /* copy the data file into the data section (.rodata for ELF, section 2 of the Windows .obj) */ + for(;;) { + length=T_FileStream_read(in, buffer, sizeof(buffer)); + if(length==0) { + break; + } + T_FileStream_write(out, buffer, (int32_t)length); + } + +#if U_PLATFORM_HAS_WIN32_API + /* write the symbol table, followed by the string table (size field plus long names) */ + T_FileStream_write(out, symbols, IMAGE_SIZEOF_SYMBOL); + T_FileStream_write(out, &symbolNames, symbolNames.sizeofLongNames); +#endif + + if(T_FileStream_error(in)) { + fprintf(stderr, "genccode: file read error while generating from file %s\n", filename); + exit(U_FILE_ACCESS_ERROR); + } + + if(T_FileStream_error(out)) { + fprintf(stderr, "genccode: file write error while generating from file %s\n", filename); + exit(U_FILE_ACCESS_ERROR); + } + + T_FileStream_close(out); + T_FileStream_close(in); +} +#endif /* CAN_GENERATE_OBJECTS */ diff --git a/intl/icu/source/tools/toolutil/pkg_genc.h b/intl/icu/source/tools/toolutil/pkg_genc.h new file mode 100644 index 0000000000..2dd1b45cde --- /dev/null +++ b/intl/icu/source/tools/toolutil/pkg_genc.h @@ -0,0 +1,107 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/****************************************************************************** + * Copyright (C) 2008-2011, International Business Machines + * Corporation and others. All Rights Reserved. 
+ ******************************************************************************* + */ + +#ifndef __PKG_GENC_H__ +#define __PKG_GENC_H__ + +#include "unicode/utypes.h" +#include "toolutil.h" + +#include "unicode/putil.h" +#include "putilimp.h" + +/*** Platform #defines move here ***/ +#if U_PLATFORM_HAS_WIN32_API +#ifdef __GNUC__ +#define WINDOWS_WITH_GNUC +#else +#define WINDOWS_WITH_MSVC +#endif +#endif + + +#if !defined(WINDOWS_WITH_MSVC) +#define BUILD_DATA_WITHOUT_ASSEMBLY +#endif + +#ifndef U_DISABLE_OBJ_CODE /* testing */ +#if defined(WINDOWS_WITH_MSVC) || U_PLATFORM_IS_LINUX_BASED +#define CAN_WRITE_OBJ_CODE +#endif +#if U_PLATFORM_HAS_WIN32_API || defined(U_ELF) +#define CAN_GENERATE_OBJECTS +#endif +#endif + +#if U_PLATFORM == U_PF_CYGWIN || defined(CYGWINMSVC) +#define USING_CYGWIN +#endif + +/* + * When building the data library without assembly, + * some platforms use a single c code file for all of + * the data to generate the final data library. This can + * increase the performance of the pkdata tool. + */ +#if U_PLATFORM == U_PF_OS400 +#define USE_SINGLE_CCODE_FILE +#endif + +/* Need to fix the file separator character when using MinGW. 
*/ +#if defined(WINDOWS_WITH_GNUC) || defined(USING_CYGWIN) +#define PKGDATA_FILE_SEP_STRING "/" +#else +#define PKGDATA_FILE_SEP_STRING U_FILE_SEP_STRING +#endif + +#define LARGE_BUFFER_MAX_SIZE 2048 +#define SMALL_BUFFER_MAX_SIZE 512 +#define SMALL_BUFFER_FLAG_NAMES 32 +#define BUFFER_PADDING_SIZE 20 + +/** End platform defines **/ + + +/* Prints the list of supported assembler flavor names to stderr. */ +U_CAPI void U_EXPORT2 +printAssemblyHeadersToStdErr(void); +/* Selects the assembler flavor by name; returns false if the name is not supported. */ +U_CAPI UBool U_EXPORT2 +checkAssemblyHeaderName(const char* optAssembly); +/* Writes the data file as C source (".c"); prints a message and exits the process on any error. */ +U_CAPI void U_EXPORT2 +writeCCode( + const char *filename, + const char *destdir, + const char *optEntryPoint, + const char *optName, + const char *optFilename, + char *outFilePath, + size_t outFilePathCapacity); +/* Writes the data file as assembly source for the flavor selected with checkAssemblyHeaderName(); prints a message and exits the process on any error. */ +U_CAPI void U_EXPORT2 +writeAssemblyCode( + const char *filename, + const char *destdir, + const char *optEntryPoint, + const char *optFilename, + char *outFilePath, + size_t outFilePathCapacity); +/* Writes the data file as an object file (ELF ".o" or Windows ".obj"). The definition in pkg_genc.cpp is compiled only when CAN_GENERATE_OBJECTS is defined. */ +U_CAPI void U_EXPORT2 +writeObjectCode( + const char *filename, + const char *destdir, + const char *optEntryPoint, + const char *optMatchArch, + const char *optFilename, + char *outFilePath, + size_t outFilePathCapacity, + UBool optWinDllExport); + +#endif diff --git a/intl/icu/source/tools/toolutil/pkg_gencmn.cpp b/intl/icu/source/tools/toolutil/pkg_gencmn.cpp new file mode 100644 index 0000000000..a301c322eb --- /dev/null +++ b/intl/icu/source/tools/toolutil/pkg_gencmn.cpp @@ -0,0 +1,578 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/****************************************************************************** + * Copyright (C) 2008-2012, International Business Machines + * Corporation and others. All Rights Reserved. 
+ ******************************************************************************* + */ +#include "unicode/utypes.h" + +#include <stdio.h> +#include <stdlib.h> +#include "unicode/utypes.h" +#include "unicode/putil.h" +#include "cmemory.h" +#include "cstring.h" +#include "filestrm.h" +#include "toolutil.h" +#include "unicode/uclean.h" +#include "unewdata.h" +#include "putilimp.h" +#include "pkg_gencmn.h" + +#define STRING_STORE_SIZE 200000 + +#define COMMON_DATA_NAME U_ICUDATA_NAME +#define DATA_TYPE "dat" + +/* ICU package data file format (.dat files) ------------------------------- *** + +Description of the data format after the usual ICU data file header +(UDataInfo etc.). + +Format version 1 + +A .dat package file contains a simple Table of Contents of item names, +followed by the items themselves: + +1. ToC table + +uint32_t count; - number of items +UDataOffsetTOCEntry entry[count]; - pair of uint32_t values per item: + uint32_t nameOffset; - offset of the item name + uint32_t dataOffset; - offset of the item data +both are byte offsets from the beginning of the data + +2. item name strings + +All item names are stored as char * strings in one block between the ToC table +and the data items. + +3. data items + +The data items are stored following the item names block. +Each data item is 16-aligned. +The data items are stored in the sorted order of their names. + +Therefore, the top of the name strings block is the offset of the first item, +the length of the last item is the difference between its offset and +the .dat file length, and the length of all previous items is the difference +between its offset and the next one. + +----------------------------------------------------------------------------- */ + +/* UDataInfo cf. 
udata.h */ +static const UDataInfo dataInfo={ + sizeof(UDataInfo), + 0, + + U_IS_BIG_ENDIAN, + U_CHARSET_FAMILY, + sizeof(char16_t), + 0, + + {0x43, 0x6d, 0x6e, 0x44}, /* dataFormat="CmnD" */ + {1, 0, 0, 0}, /* formatVersion */ + {3, 0, 0, 0} /* dataVersion */ +}; + +static uint32_t maxSize; + +static char stringStore[STRING_STORE_SIZE]; +static uint32_t stringTop=0, basenameTotal=0; + +typedef struct { + char *pathname, *basename; + uint32_t basenameLength, basenameOffset, fileSize, fileOffset; +} File; + +#define CHUNK_FILE_COUNT 256 +static File *files = nullptr; +static uint32_t fileCount=0; +static uint32_t fileMax = 0; + + +static char *symPrefix = nullptr; + +#define LINE_BUFFER_SIZE 512 +/* prototypes --------------------------------------------------------------- */ + +static void +addFile(const char *filename, const char *name, const char *source, UBool sourceTOC, UBool verbose); + +static char * +allocString(uint32_t length); + +U_CDECL_BEGIN +static int +compareFiles(const void *file1, const void *file2); +U_CDECL_END + +static char * +pathToFullPath(const char *path, const char *source); + +/* map non-tree separator (such as '\') to tree separator ('/') inplace. 
*/ +static void +fixDirToTreePath(char *s); +/* -------------------------------------------------------------------------- */ + +U_CAPI void U_EXPORT2 +createCommonDataFile(const char *destDir, const char *name, const char *entrypointName, const char *type, const char *source, const char *copyRight, + const char *dataFile, uint32_t max_size, UBool sourceTOC, UBool verbose, char *gencmnFileName) { + static char buffer[4096]; + char *line; + char *linePtr; + char *s = nullptr; + UErrorCode errorCode=U_ZERO_ERROR; + uint32_t i, fileOffset, basenameOffset, length, nread; + FileStream *in, *file; + + line = (char *)uprv_malloc(sizeof(char) * LINE_BUFFER_SIZE); + if (line == nullptr) { + fprintf(stderr, "gencmn: unable to allocate memory for line buffer of size %d\n", LINE_BUFFER_SIZE); + exit(U_MEMORY_ALLOCATION_ERROR); + } + + linePtr = line; + + maxSize = max_size; + + if (destDir == nullptr) { + destDir = u_getDataDirectory(); + } + if (name == nullptr) { + name = COMMON_DATA_NAME; + } + if (type == nullptr) { + type = DATA_TYPE; + } + if (source == nullptr) { + source = "."; + } + + if (dataFile == nullptr) { + in = T_FileStream_stdin(); + } else { + in = T_FileStream_open(dataFile, "r"); + if(in == nullptr) { + fprintf(stderr, "gencmn: unable to open input file %s\n", dataFile); + exit(U_FILE_ACCESS_ERROR); + } + } + + if (verbose) { + if(sourceTOC) { + printf("generating %s_%s.c (table of contents source file)\n", name, type); + } else { + printf("generating %s.%s (common data file with table of contents)\n", name, type); + } + } + + /* read the list of files and get their lengths */ + while((s != nullptr && *s != 0) || (s=T_FileStream_readLine(in, (line=linePtr), + LINE_BUFFER_SIZE))!=nullptr) { + /* remove trailing newline characters and parse space separated items */ + if (s != nullptr && *s != 0) { + line=s; + } else { + s=line; + } + while(*s!=0) { + if(*s==' ') { + *s=0; + ++s; + break; + } else if(*s=='\r' || *s=='\n') { + *s=0; + break; + } + ++s; + } + + 
/* check for comment */ + + if (*line == '#') { + continue; + } + + /* add the file */ +#if (U_FILE_SEP_CHAR != U_FILE_ALT_SEP_CHAR) + { + char *t; + while((t = uprv_strchr(line,U_FILE_ALT_SEP_CHAR))) { + *t = U_FILE_SEP_CHAR; + } + } +#endif + addFile(getLongPathname(line), name, source, sourceTOC, verbose); + } + + uprv_free(linePtr); + + if(in!=T_FileStream_stdin()) { + T_FileStream_close(in); + } + + if(fileCount==0) { + fprintf(stderr, "gencmn: no files listed in %s\n", dataFile == nullptr ? "<stdin>" : dataFile); + return; + } + + /* sort the files by basename */ + qsort(files, fileCount, sizeof(File), compareFiles); + + if(!sourceTOC) { + UNewDataMemory *out; + + /* determine the offsets of all basenames and files in this common one */ + basenameOffset=4+8*fileCount; + fileOffset=(basenameOffset+(basenameTotal+15))&~0xf; + for(i=0; i<fileCount; ++i) { + files[i].fileOffset=fileOffset; + fileOffset+=(files[i].fileSize+15)&~0xf; + files[i].basenameOffset=basenameOffset; + basenameOffset+=files[i].basenameLength; + } + + /* create the output file */ + out=udata_create(destDir, type, name, + &dataInfo, + copyRight == nullptr ? 
U_COPYRIGHT_STRING : copyRight, + &errorCode); + if(U_FAILURE(errorCode)) { + fprintf(stderr, "gencmn: udata_create(-d %s -n %s -t %s) failed - %s\n", + destDir, name, type, + u_errorName(errorCode)); + exit(errorCode); + } + + /* write the table of contents */ + udata_write32(out, fileCount); + for(i=0; i<fileCount; ++i) { + udata_write32(out, files[i].basenameOffset); + udata_write32(out, files[i].fileOffset); + } + + /* write the basenames */ + for(i=0; i<fileCount; ++i) { + udata_writeString(out, files[i].basename, files[i].basenameLength); + } + length=4+8*fileCount+basenameTotal; + + /* copy the files */ + for(i=0; i<fileCount; ++i) { + /* pad to 16-align the next file */ + length&=0xf; + if(length!=0) { + udata_writePadding(out, 16-length); + } + + if (verbose) { + printf("adding %s (%ld byte%s)\n", files[i].pathname, (long)files[i].fileSize, files[i].fileSize == 1 ? "" : "s"); + } + + /* copy the next file */ + file=T_FileStream_open(files[i].pathname, "rb"); + if(file==nullptr) { + fprintf(stderr, "gencmn: unable to open listed file %s\n", files[i].pathname); + exit(U_FILE_ACCESS_ERROR); + } + for(nread = 0;;) { + length=T_FileStream_read(file, buffer, sizeof(buffer)); + if(length <= 0) { + break; + } + nread += length; + udata_writeBlock(out, buffer, length); + } + T_FileStream_close(file); + length=files[i].fileSize; + + if (nread != files[i].fileSize) { + fprintf(stderr, "gencmn: unable to read %s properly (got %ld/%ld byte%s)\n", files[i].pathname, (long)nread, (long)files[i].fileSize, files[i].fileSize == 1 ? 
"" : "s"); + exit(U_FILE_ACCESS_ERROR); + } + } + + /* pad to 16-align the last file (cleaner, avoids growing .dat files in icuswap) */ + length&=0xf; + if(length!=0) { + udata_writePadding(out, 16-length); + } + + /* finish */ + udata_finish(out, &errorCode); + if(U_FAILURE(errorCode)) { + fprintf(stderr, "gencmn: udata_finish() failed - %s\n", u_errorName(errorCode)); + exit(errorCode); + } + } else { + /* write a .c source file with the table of contents */ + char *filename; + FileStream *out; + + /* create the output filename */ + filename=s=buffer; + uprv_strcpy(filename, destDir); + s=filename+uprv_strlen(filename); + if(s>filename && *(s-1)!=U_FILE_SEP_CHAR) { + *s++=U_FILE_SEP_CHAR; + } + uprv_strcpy(s, name); + if(*(type)!=0) { + s+=uprv_strlen(s); + *s++='_'; + uprv_strcpy(s, type); + } + s+=uprv_strlen(s); + uprv_strcpy(s, ".c"); + + /* open the output file */ + out=T_FileStream_open(filename, "w"); + if (gencmnFileName != nullptr) { + uprv_strcpy(gencmnFileName, filename); + } + if(out==nullptr) { + fprintf(stderr, "gencmn: unable to open .c output file %s\n", filename); + exit(U_FILE_ACCESS_ERROR); + } + + /* write the source file */ + snprintf(buffer, sizeof(buffer), + "/*\n" + " * ICU common data table of contents for %s.%s\n" + " * Automatically generated by icu/source/tools/gencmn/gencmn .\n" + " */\n\n" + "#include \"unicode/utypes.h\"\n" + "#include \"unicode/udata.h\"\n" + "\n" + "/* external symbol declarations for data (%d files) */\n", + name, type, fileCount); + T_FileStream_writeLine(out, buffer); + + snprintf(buffer, sizeof(buffer), "extern const char\n %s%s[]", symPrefix?symPrefix:"", files[0].pathname); + T_FileStream_writeLine(out, buffer); + for(i=1; i<fileCount; ++i) { + snprintf(buffer, sizeof(buffer), ",\n %s%s[]", symPrefix?symPrefix:"", files[i].pathname); + T_FileStream_writeLine(out, buffer); + } + T_FileStream_writeLine(out, ";\n\n"); + + snprintf( + buffer, sizeof(buffer), + "U_EXPORT struct {\n" + " uint16_t headerSize;\n" + 
" uint8_t magic1, magic2;\n" + " UDataInfo info;\n" + " char padding[%lu];\n" + " uint32_t count, reserved;\n" + " struct {\n" + " const char *name;\n" + " const void *data;\n" + " } toc[%lu];\n" + "} U_EXPORT2 %s_dat = {\n" + " 32, 0xda, 0x27, {\n" + " %lu, 0,\n" + " %u, %u, %u, 0,\n" + " {0x54, 0x6f, 0x43, 0x50},\n" + " {1, 0, 0, 0},\n" + " {0, 0, 0, 0}\n" + " },\n" + " \"\", %lu, 0, {\n", + static_cast<unsigned long>(32-4-sizeof(UDataInfo)), + static_cast<unsigned long>(fileCount), + entrypointName, + static_cast<unsigned long>(sizeof(UDataInfo)), + U_IS_BIG_ENDIAN, + U_CHARSET_FAMILY, + U_SIZEOF_UCHAR, + static_cast<unsigned long>(fileCount) + ); + T_FileStream_writeLine(out, buffer); + + snprintf(buffer, sizeof(buffer), " { \"%s\", %s%s }", files[0].basename, symPrefix?symPrefix:"", files[0].pathname); + T_FileStream_writeLine(out, buffer); + for(i=1; i<fileCount; ++i) { + snprintf(buffer, sizeof(buffer), ",\n { \"%s\", %s%s }", files[i].basename, symPrefix?symPrefix:"", files[i].pathname); + T_FileStream_writeLine(out, buffer); + } + + T_FileStream_writeLine(out, "\n }\n};\n"); + T_FileStream_close(out); + + uprv_free(symPrefix); + } +} + +static void +addFile(const char *filename, const char *name, const char *source, UBool sourceTOC, UBool verbose) { + char *s; + uint32_t length; + char *fullPath = nullptr; + + if(fileCount==fileMax) { + fileMax += CHUNK_FILE_COUNT; + files = (File *)uprv_realloc(files, fileMax*sizeof(files[0])); /* note: never freed. */ + if(files==nullptr) { + fprintf(stderr, "pkgdata/gencmn: Could not allocate %u bytes for %d files\n", (unsigned int)(fileMax*sizeof(files[0])), fileCount); + exit(U_MEMORY_ALLOCATION_ERROR); + } + } + + if(!sourceTOC) { + FileStream *file; + + if(uprv_pathIsAbsolute(filename)) { + fprintf(stderr, "gencmn: Error: absolute path encountered. Old style paths are not supported. 
Use relative paths such as 'fur.res' or 'translit%cfur.res'.\n\tBad path: '%s'\n", U_FILE_SEP_CHAR, filename); + exit(U_ILLEGAL_ARGUMENT_ERROR); + } + fullPath = pathToFullPath(filename, source); + /* store the pathname */ + length = (uint32_t)(uprv_strlen(filename) + 1 + uprv_strlen(name) + 1); + s=allocString(length); + uprv_strcpy(s, name); + uprv_strcat(s, U_TREE_ENTRY_SEP_STRING); + uprv_strcat(s, filename); + + /* get the basename */ + fixDirToTreePath(s); + files[fileCount].basename=s; + files[fileCount].basenameLength=length; + + files[fileCount].pathname=fullPath; + + basenameTotal+=length; + + /* try to open the file */ + file=T_FileStream_open(fullPath, "rb"); + if(file==nullptr) { + fprintf(stderr, "gencmn: unable to open listed file %s\n", fullPath); + exit(U_FILE_ACCESS_ERROR); + } + + /* get the file length */ + length=T_FileStream_size(file); + if(T_FileStream_error(file) || length<=20) { + fprintf(stderr, "gencmn: unable to get length of listed file %s\n", fullPath); + exit(U_FILE_ACCESS_ERROR); + } + + T_FileStream_close(file); + + /* do not add files that are longer than maxSize */ + if(maxSize && length>maxSize) { + if (verbose) { + printf("%s ignored (size %ld > %ld)\n", fullPath, (long)length, (long)maxSize); + } + return; + } + files[fileCount].fileSize=length; + } else { + char *t; + /* get and store the basename */ + /* need to include the package name */ + length = (uint32_t)(uprv_strlen(filename) + 1 + uprv_strlen(name) + 1); + s=allocString(length); + uprv_strcpy(s, name); + uprv_strcat(s, U_TREE_ENTRY_SEP_STRING); + uprv_strcat(s, filename); + fixDirToTreePath(s); + files[fileCount].basename=s; + /* turn the basename into an entry point name and store in the pathname field */ + t=files[fileCount].pathname=allocString(length); + while(--length>0) { + if(*s=='.' 
|| *s=='-' || *s=='/') { + *t='_'; + } else { + *t=*s; + } + ++s; + ++t; + } + *t=0; + } + ++fileCount; +} + +static char * +allocString(uint32_t length) { + uint32_t top=stringTop+length; + char *p; + + if(top>STRING_STORE_SIZE) { + fprintf(stderr, "gencmn: out of memory\n"); + exit(U_MEMORY_ALLOCATION_ERROR); + } + p=stringStore+stringTop; + stringTop=top; + return p; +} + +static char * +pathToFullPath(const char *path, const char *source) { + int32_t length; + int32_t newLength; + char *fullPath; + int32_t n; + + length = (uint32_t)(uprv_strlen(path) + 1); + newLength = (length + 1 + (int32_t)uprv_strlen(source)); + fullPath = (char *)uprv_malloc(newLength); + if(source != nullptr) { + uprv_strcpy(fullPath, source); + uprv_strcat(fullPath, U_FILE_SEP_STRING); + } else { + fullPath[0] = 0; + } + n = (int32_t)uprv_strlen(fullPath); + fullPath[n] = 0; /* Suppress compiler warning for unused variable n */ + /* when conditional code below is not compiled. */ + uprv_strcat(fullPath, path); + +#if (U_FILE_ALT_SEP_CHAR != U_TREE_ENTRY_SEP_CHAR) +#if (U_FILE_ALT_SEP_CHAR != U_FILE_SEP_CHAR) + /* replace tree separator (such as '/') with file sep char (such as ':' or '\\') */ + for(;fullPath[n];n++) { + if(fullPath[n] == U_FILE_ALT_SEP_CHAR) { + fullPath[n] = U_FILE_SEP_CHAR; + } + } +#endif +#endif +#if (U_FILE_SEP_CHAR != U_TREE_ENTRY_SEP_CHAR) + /* replace tree separator (such as '/') with file sep char (such as ':' or '\\') */ + for(;fullPath[n];n++) { + if(fullPath[n] == U_TREE_ENTRY_SEP_CHAR) { + fullPath[n] = U_FILE_SEP_CHAR; + } + } +#endif + return fullPath; +} + +U_CDECL_BEGIN +static int +compareFiles(const void *file1, const void *file2) { + /* sort by basename */ + return uprv_strcmp(((File *)file1)->basename, ((File *)file2)->basename); +} +U_CDECL_END + +static void +fixDirToTreePath(char *s) +{ + (void)s; +#if (U_FILE_SEP_CHAR != U_TREE_ENTRY_SEP_CHAR) || ((U_FILE_ALT_SEP_CHAR != U_FILE_SEP_CHAR) && (U_FILE_ALT_SEP_CHAR != U_TREE_ENTRY_SEP_CHAR)) + char 
*t; +#endif +#if (U_FILE_SEP_CHAR != U_TREE_ENTRY_SEP_CHAR) + for(t=s;t=uprv_strchr(t,U_FILE_SEP_CHAR);) { + *t = U_TREE_ENTRY_SEP_CHAR; + } +#endif +#if (U_FILE_ALT_SEP_CHAR != U_FILE_SEP_CHAR) && (U_FILE_ALT_SEP_CHAR != U_TREE_ENTRY_SEP_CHAR) + for(t=s;t=uprv_strchr(t,U_FILE_ALT_SEP_CHAR);) { + *t = U_TREE_ENTRY_SEP_CHAR; + } +#endif +} diff --git a/intl/icu/source/tools/toolutil/pkg_gencmn.h b/intl/icu/source/tools/toolutil/pkg_gencmn.h new file mode 100644 index 0000000000..238239960a --- /dev/null +++ b/intl/icu/source/tools/toolutil/pkg_gencmn.h @@ -0,0 +1,18 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/****************************************************************************** + * Copyright (C) 2008, International Business Machines + * Corporation and others. All Rights Reserved. + ******************************************************************************* + */ + +#ifndef __PKG_GENCMN_H__ +#define __PKG_GENCMN_H__ + +#include "unicode/utypes.h" + +U_CAPI void U_EXPORT2 +createCommonDataFile(const char *destDir, const char *name, const char *entrypointName, const char *type, const char *source, const char *copyRight, + const char *dataFile, uint32_t max_size, UBool sourceTOC, UBool verbose, char *gencmnFileName); + +#endif diff --git a/intl/icu/source/tools/toolutil/pkg_icu.cpp b/intl/icu/source/tools/toolutil/pkg_icu.cpp new file mode 100644 index 0000000000..d9c6717ecd --- /dev/null +++ b/intl/icu/source/tools/toolutil/pkg_icu.cpp @@ -0,0 +1,176 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/****************************************************************************** + * Copyright (C) 2008-2015, International Business Machines + * Corporation and others. All Rights Reserved. 
+ ******************************************************************************* + */ +#include "unicode/utypes.h" +#include "unicode/localpointer.h" +#include "unicode/putil.h" +#include "cstring.h" +#include "toolutil.h" +#include "uoptions.h" +#include "uparse.h" +#include "package.h" +#include "pkg_icu.h" + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +// read a file list -------------------------------------------------------- *** + +U_NAMESPACE_USE + +static const struct { + const char *suffix; + int32_t length; +} listFileSuffixes[]={ + { ".txt", 4 }, + { ".lst", 4 }, + { ".tmp", 4 } +}; + +/* check for multiple text file suffixes to see if this list name is a text file name */ +static UBool +isListTextFile(const char *listname) { + const char *listNameEnd=strchr(listname, 0); + const char *suffix; + int32_t i, length; + for(i=0; i<UPRV_LENGTHOF(listFileSuffixes); ++i) { + suffix=listFileSuffixes[i].suffix; + length=listFileSuffixes[i].length; + if((listNameEnd-listname)>length && 0==memcmp(listNameEnd-length, suffix, length)) { + return true; + } + } + return false; +} + +/* + * Read a file list. + * If the listname ends with ".txt", then read the list file + * (in the system/ invariant charset). + * If the listname ends with ".dat", then read the ICU .dat package file. + * Otherwise, read the file itself as a single-item list. 
+ */ +U_CAPI Package * U_EXPORT2 +readList(const char *filesPath, const char *listname, UBool readContents, Package *listPkgIn) { + Package *listPkg = listPkgIn; + FILE *file; + const char *listNameEnd; + + if(listname==nullptr || listname[0]==0) { + fprintf(stderr, "missing list file\n"); + return nullptr; + } + + if (listPkg == nullptr) { + listPkg=new Package(); + if(listPkg==nullptr) { + fprintf(stderr, "icupkg: not enough memory\n"); + exit(U_MEMORY_ALLOCATION_ERROR); + } + } + + listNameEnd=strchr(listname, 0); + if(isListTextFile(listname)) { + // read the list file + char line[1024]; + char *end; + const char *start; + + file=fopen(listname, "r"); + if(file==nullptr) { + fprintf(stderr, "icupkg: unable to open list file \"%s\"\n", listname); + delete listPkg; + exit(U_FILE_ACCESS_ERROR); + } + + while(fgets(line, sizeof(line), file)) { + // remove comments + end=strchr(line, '#'); + if(end!=nullptr) { + *end=0; + } else { + // remove trailing CR LF + end=strchr(line, 0); + while(line<end && (*(end-1)=='\r' || *(end-1)=='\n')) { + *--end=0; + } + } + + // check first non-whitespace character and + // skip empty lines and + // skip lines starting with reserved characters + start=u_skipWhitespace(line); + if(*start==0 || nullptr!=strchr(U_PKG_RESERVED_CHARS, *start)) { + continue; + } + + // take whitespace-separated items from the line + for(;;) { + // find whitespace after the item or the end of the line + for(end=(char *)start; *end!=0 && *end!=' ' && *end!='\t'; ++end) {} + if(*end==0) { + // this item is the last one on the line + end=nullptr; + } else { + // the item is terminated by whitespace, terminate it with NUL + *end=0; + } + if(readContents) { + listPkg->addFile(filesPath, start); + } else { + listPkg->addItem(start); + } + + // find the start of the next item or exit the loop + if(end==nullptr || *(start=u_skipWhitespace(end+1))==0) { + break; + } + } + } + fclose(file); + } else if((listNameEnd-listname)>4 && 0==memcmp(listNameEnd-4, ".dat", 
4)) { + // read the ICU .dat package + // Accept a .dat file whose name differs from the ToC prefixes. + listPkg->setAutoPrefix(); + listPkg->readPackage(listname); + } else { + // list the single file itself + if(readContents) { + listPkg->addFile(filesPath, listname); + } else { + listPkg->addItem(listname); + } + } + + return listPkg; +} + +U_CAPI int U_EXPORT2 +writePackageDatFile(const char *outFilename, const char *outComment, const char *sourcePath, const char *addList, Package *pkg, char outType) { + LocalPointer<Package> ownedPkg; + LocalPointer<Package> addListPkg; + + if (pkg == nullptr) { + ownedPkg.adoptInstead(new Package); + if(ownedPkg.isNull()) { + fprintf(stderr, "icupkg: not enough memory\n"); + return U_MEMORY_ALLOCATION_ERROR; + } + pkg = ownedPkg.getAlias(); + + addListPkg.adoptInstead(readList(sourcePath, addList, true, nullptr)); + if(addListPkg.isValid()) { + pkg->addItems(*addListPkg); + } else { + return U_ILLEGAL_ARGUMENT_ERROR; + } + } + + pkg->writePackage(outFilename, outType, outComment); + return 0; +} diff --git a/intl/icu/source/tools/toolutil/pkg_icu.h b/intl/icu/source/tools/toolutil/pkg_icu.h new file mode 100644 index 0000000000..638056e60b --- /dev/null +++ b/intl/icu/source/tools/toolutil/pkg_icu.h @@ -0,0 +1,25 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/****************************************************************************** + * Copyright (C) 2008-2016, International Business Machines + * Corporation and others. All Rights Reserved. 
+ ******************************************************************************* + */ + +#ifndef __PKG_ICU_H__ +#define __PKG_ICU_H__ + +#include "unicode/utypes.h" +#include "package.h" + +#define U_PKG_RESERVED_CHARS "\"%&'()*+,-./:;<=>?_" + +U_CAPI int U_EXPORT2 +writePackageDatFile(const char *outFilename, const char *outComment, + const char *sourcePath, const char *addList, icu::Package *pkg, + char outType); + +U_CAPI icu::Package * U_EXPORT2 +readList(const char *filesPath, const char *listname, UBool readContents, icu::Package *listPkgIn); + +#endif diff --git a/intl/icu/source/tools/toolutil/pkg_imp.h b/intl/icu/source/tools/toolutil/pkg_imp.h new file mode 100644 index 0000000000..29abd8d83c --- /dev/null +++ b/intl/icu/source/tools/toolutil/pkg_imp.h @@ -0,0 +1,38 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2005-2016, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: pkg_imp.h +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2005sep18 +* created by: Markus W. Scherer +* +* Implementation definitions for data package functions in toolutil. +*/ + +#ifndef __PKG_IMP_H__ +#define __PKG_IMP_H__ + +#include "unicode/utypes.h" +#include "unicode/udata.h" + +/* + * Read an ICU data item with any platform type, + * return the pointer to the UDataInfo in its header, + * and set the lengths of the UDataInfo and of the whole header. + * All data remains in its platform type. 
+ */ +U_CFUNC const UDataInfo * +getDataInfo(const uint8_t *data, int32_t length, + int32_t &infoLength, int32_t &headerLength, + UErrorCode *pErrorCode); + +#endif diff --git a/intl/icu/source/tools/toolutil/pkgitems.cpp b/intl/icu/source/tools/toolutil/pkgitems.cpp new file mode 100644 index 0000000000..e49775d56d --- /dev/null +++ b/intl/icu/source/tools/toolutil/pkgitems.cpp @@ -0,0 +1,645 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2003-2015, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: pkgitems.cpp +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2005sep18 +* created by: Markus W. Scherer +* +* Companion file to package.cpp. Deals with details of ICU data item formats. +* Used for item dependencies. +* Contains adapted code from ucnv_bld.c (swapper code from 2003). 
+*/ + +#include "unicode/utypes.h" +#include "unicode/ures.h" +#include "unicode/putil.h" +#include "unicode/udata.h" +#include "cstring.h" +#include "uinvchar.h" +#include "ucmndata.h" +#include "udataswp.h" +#include "swapimpl.h" +#include "toolutil.h" +#include "package.h" +#include "pkg_imp.h" + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +/* item formats in common */ + +#include "uresdata.h" +#include "ucnv_bld.h" +#include "ucnv_io.h" + +// general definitions ----------------------------------------------------- *** + +U_CDECL_BEGIN + +static void U_CALLCONV +printError(void *context, const char *fmt, va_list args) { + vfprintf((FILE *)context, fmt, args); +} + +U_CDECL_END + +// a data item in native-platform form ------------------------------------- *** + +U_NAMESPACE_BEGIN + +class NativeItem { +public: + NativeItem() : pItem(nullptr), pInfo(nullptr), bytes(nullptr), swapped(nullptr), length(0) {} + NativeItem(const Item *item, UDataSwapFn *swap) : swapped(nullptr) { + setItem(item, swap); + } + ~NativeItem() { + delete [] swapped; + } + const UDataInfo *getDataInfo() const { + return pInfo; + } + const uint8_t *getBytes() const { + return bytes; + } + int32_t getLength() const { + return length; + } + + void setItem(const Item *item, UDataSwapFn *swap) { + pItem=item; + int32_t infoLength, itemHeaderLength; + UErrorCode errorCode=U_ZERO_ERROR; + pInfo=::getDataInfo(pItem->data, pItem->length, infoLength, itemHeaderLength, &errorCode); + if(U_FAILURE(errorCode)) { + exit(errorCode); // should succeed because readFile() checks headers + } + length=pItem->length-itemHeaderLength; + + if(pInfo->isBigEndian==U_IS_BIG_ENDIAN && pInfo->charsetFamily==U_CHARSET_FAMILY) { + bytes=pItem->data+itemHeaderLength; + } else { + UDataSwapper *ds=udata_openSwapper((UBool)pInfo->isBigEndian, pInfo->charsetFamily, U_IS_BIG_ENDIAN, U_CHARSET_FAMILY, &errorCode); + if(U_FAILURE(errorCode)) { + fprintf(stderr, "icupkg: udata_openSwapper(\"%s\") failed - 
%s\n", + pItem->name, u_errorName(errorCode)); + exit(errorCode); + } + + ds->printError=printError; + ds->printErrorContext=stderr; + + swapped=new uint8_t[pItem->length]; + if(swapped==nullptr) { + fprintf(stderr, "icupkg: unable to allocate memory for swapping \"%s\"\n", pItem->name); + exit(U_MEMORY_ALLOCATION_ERROR); + } + swap(ds, pItem->data, pItem->length, swapped, &errorCode); + pInfo=::getDataInfo(swapped, pItem->length, infoLength, itemHeaderLength, &errorCode); + bytes=swapped+itemHeaderLength; + udata_closeSwapper(ds); + } + } + +private: + const Item *pItem; + const UDataInfo *pInfo; + const uint8_t *bytes; + uint8_t *swapped; + int32_t length; +}; + +// check a dependency ------------------------------------------------------ *** + +/* + * assemble the target item name from the source item name, an ID + * and a suffix + */ +static void +makeTargetName(const char *itemName, const char *id, int32_t idLength, const char *suffix, + char *target, int32_t capacity, + UErrorCode *pErrorCode) { + const char *itemID; + int32_t treeLength, suffixLength, targetLength; + + // get the item basename + itemID=strrchr(itemName, '/'); + if(itemID!=nullptr) { + ++itemID; + } else { + itemID=itemName; + } + + // build the target string + treeLength=(int32_t)(itemID-itemName); + if(idLength<0) { + idLength=(int32_t)strlen(id); + } + suffixLength=(int32_t)strlen(suffix); + targetLength=treeLength+idLength+suffixLength; + if(targetLength>=capacity) { + fprintf(stderr, "icupkg/makeTargetName(%s) target item name length %ld too long\n", + itemName, (long)targetLength); + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; + return; + } + + memcpy(target, itemName, treeLength); + memcpy(target+treeLength, id, idLength); + memcpy(target+treeLength+idLength, suffix, suffixLength+1); // +1 includes the terminating NUL +} + +static void +checkIDSuffix(const char *itemName, const char *id, int32_t idLength, const char *suffix, + CheckDependency check, void *context, + UErrorCode *pErrorCode) { 
+ char target[200]; + makeTargetName(itemName, id, idLength, suffix, target, (int32_t)sizeof(target), pErrorCode); + if(U_SUCCESS(*pErrorCode)) { + check(context, itemName, target); + } +} + +/* assemble the target item name from the item's parent item name */ +static void +checkParent(const char *itemName, CheckDependency check, void *context, + UErrorCode *pErrorCode) { + const char *itemID, *parent, *parentLimit, *suffix; + int32_t parentLength; + + // get the item basename + itemID=strrchr(itemName, '/'); + if(itemID!=nullptr) { + ++itemID; + } else { + itemID=itemName; + } + + // get the item suffix + suffix=strrchr(itemID, '.'); + if(suffix==nullptr) { + // empty suffix, point to the end of the string + suffix=strrchr(itemID, 0); + } + + // get the position of the last '_' + for(parentLimit=suffix; parentLimit>itemID && *--parentLimit!='_';) {} + + if(parentLimit!=itemID) { + // get the parent item name by truncating the last part of this item's name */ + parent=itemID; + parentLength=(int32_t)(parentLimit-itemID); + } else { + // no '_' in the item name: the parent is the root bundle + parent="root"; + parentLength=4; + if((suffix-itemID)==parentLength && 0==memcmp(itemID, parent, parentLength)) { + // the item itself is "root", which does not depend on a parent + return; + } + } + checkIDSuffix(itemName, parent, parentLength, suffix, check, context, pErrorCode); +} + +// get dependencies from resource bundles ---------------------------------- *** + +static const char16_t SLASH=0x2f; + +/* + * Check for the alias from the string or alias resource res. 
+ */ +static void +checkAlias(const char *itemName, + Resource res, const char16_t *alias, int32_t length, UBool useResSuffix, + CheckDependency check, void *context, UErrorCode *pErrorCode) { + int32_t i; + + if(!uprv_isInvariantUString(alias, length)) { + fprintf(stderr, "icupkg/ures_enumDependencies(%s res=%08x) alias string contains non-invariant characters\n", + itemName, res); + *pErrorCode=U_INVALID_CHAR_FOUND; + return; + } + + // extract the locale ID from alias strings like + // locale_ID/key1/key2/key3 + // locale_ID + + // search for the first slash + for(i=0; i<length && alias[i]!=SLASH; ++i) {} + + if(res_getPublicType(res)==URES_ALIAS) { + // ignore aliases with an initial slash: + // /ICUDATA/... and /pkgname/... go to a different package + // /LOCALE/... are for dynamic sideways fallbacks and don't go to a fixed bundle + if(i==0) { + return; // initial slash ('/') + } + + // ignore the intra-bundle path starting from the first slash ('/') + length=i; + } else /* URES_STRING */ { + // the whole string should only consist of a locale ID + if(i!=length) { + fprintf(stderr, "icupkg/ures_enumDependencies(%s res=%08x) %%ALIAS contains a '/'\n", + itemName, res); + *pErrorCode=U_UNSUPPORTED_ERROR; + return; + } + } + + // convert the Unicode string to char * + char localeID[48]; + if(length>=(int32_t)sizeof(localeID)) { + fprintf(stderr, "icupkg/ures_enumDependencies(%s res=%08x) alias locale ID length %ld too long\n", + itemName, res, (long)length); + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; + return; + } + u_UCharsToChars(alias, localeID, length); + localeID[length]=0; + + checkIDSuffix(itemName, localeID, -1, (useResSuffix ? ".res" : ""), check, context, pErrorCode); +} + +/* + * Enumerate one resource item and its children and extract dependencies from + * aliases. 
+ */ +static UBool +ures_enumDependencies(const char *itemName, + const ResourceData *pResData, + Resource res, const char *inKey, const char *parentKey, int32_t depth, + CheckDependency check, void *context, + Package *pkg, + UErrorCode *pErrorCode) { + UBool doCheckParent = true; // always remains true if depth>1 + switch(res_getPublicType(res)) { + case URES_STRING: + if(depth==1 && inKey!=nullptr && + (0==strcmp(inKey, "%%ALIAS") || 0==strcmp(inKey, "%%Parent"))) { + // Top-level %%ALIAS string: + // The alias resource bundle will be used instead of this one. + // Top-level %%Parent string: + // We use this bundle as well as the explicit parent bundle. + // Either way, the truncation parent is ignored. + doCheckParent = false; + // No tracing: build tool + int32_t length; + const char16_t *alias=res_getStringNoTrace(pResData, res, &length); + checkAlias(itemName, res, alias, length, /*useResSuffix=*/ true, + check, context, pErrorCode); + // If there is a %%ALIAS, then there should be nothing else in this resource bundle. + } else if(depth==2 && parentKey!=nullptr && 0==strcmp(parentKey, "%%DEPENDENCY")) { + // Second-level %%DEPENDENCY string: + // Explicit declaration of a dependency of this item on that one. 
+ // No tracing: build tool + int32_t length; + const char16_t *alias=res_getStringNoTrace(pResData, res, &length); + checkAlias(itemName, res, alias, length, /*useResSuffix=*/ false, + check, context, pErrorCode); + } + // we ignore all other strings + break; + case URES_ALIAS: + { + int32_t length; + const char16_t *alias=res_getAlias(pResData, res, &length); + checkAlias(itemName, res, alias, length, true, check, context, pErrorCode); + } + break; + case URES_TABLE: + { + /* recurse */ + int32_t count=res_countArrayItems(pResData, res); + for(int32_t i=0; i<count; ++i) { + const char *itemKey; + Resource item=res_getTableItemByIndex(pResData, res, i, &itemKey); + // This doCheckParent return value is needed to + // propagate the possible false value from depth=1 to depth=0. + doCheckParent &= ures_enumDependencies( + itemName, pResData, + item, itemKey, + inKey, depth+1, + check, context, + pkg, + pErrorCode); + if(U_FAILURE(*pErrorCode)) { + fprintf(stderr, "icupkg/ures_enumDependencies(%s table res=%08x)[%d].recurse(%s: %08x) failed\n", + itemName, res, i, itemKey, item); + break; + } + } + } + break; + case URES_ARRAY: + { + /* recurse */ + int32_t count=res_countArrayItems(pResData, res); + for(int32_t i=0; i<count; ++i) { + Resource item=res_getArrayItem(pResData, res, i); + ures_enumDependencies( + itemName, pResData, + item, nullptr, + inKey, depth+1, + check, context, + pkg, + pErrorCode); + if(U_FAILURE(*pErrorCode)) { + fprintf(stderr, "icupkg/ures_enumDependencies(%s array res=%08x)[%d].recurse(%08x) failed\n", + itemName, res, i, item); + break; + } + } + } + break; + default: + break; + } + return doCheckParent; +} + +static void +ures_enumDependencies(const char *itemName, const UDataInfo *pInfo, + const uint8_t *inBytes, int32_t length, + CheckDependency check, void *context, + Package *pkg, + UErrorCode *pErrorCode) { + ResourceData resData; + + res_read(&resData, pInfo, inBytes, length, pErrorCode); + if(U_FAILURE(*pErrorCode)) { + 
fprintf(stderr, "icupkg: .res format version %02x.%02x not supported, or bundle malformed\n", + pInfo->formatVersion[0], pInfo->formatVersion[1]); + exit(U_UNSUPPORTED_ERROR); + } + + icu::NativeItem nativePool; + + if(resData.usesPoolBundle) { + char poolName[200]; + makeTargetName(itemName, "pool", 4, ".res", poolName, (int32_t)sizeof(poolName), pErrorCode); + if(U_FAILURE(*pErrorCode)) { + return; + } + check(context, itemName, poolName); + int32_t index=pkg->findItem(poolName); + if(index<0) { + // We cannot work with a bundle if its pool resource is missing. + // check() already printed a complaint. + return; + } + // TODO: Cache the native version in the Item itself. + nativePool.setItem(pkg->getItem(index), ures_swap); + const UDataInfo *poolInfo=nativePool.getDataInfo(); + if(poolInfo->formatVersion[0]<=1) { + fprintf(stderr, "icupkg: %s is not a pool bundle\n", poolName); + return; + } + const int32_t *poolRoot=(const int32_t *)nativePool.getBytes(); + const int32_t *poolIndexes=poolRoot+1; + int32_t poolIndexLength=poolIndexes[URES_INDEX_LENGTH]&0xff; + if(!(poolIndexLength>URES_INDEX_POOL_CHECKSUM && + (poolIndexes[URES_INDEX_ATTRIBUTES]&URES_ATT_IS_POOL_BUNDLE)) + ) { + fprintf(stderr, "icupkg: %s is not a pool bundle\n", poolName); + return; + } + if(resData.pRoot[1+URES_INDEX_POOL_CHECKSUM]==poolIndexes[URES_INDEX_POOL_CHECKSUM]) { + resData.poolBundleKeys=(const char *)(poolIndexes+poolIndexLength); + resData.poolBundleStrings=(const uint16_t *)(poolRoot+poolIndexes[URES_INDEX_KEYS_TOP]); + } else { + fprintf(stderr, "icupkg: %s has mismatched checksum for %s\n", poolName, itemName); + return; + } + } + + UBool doCheckParent = ures_enumDependencies( + itemName, &resData, + resData.rootRes, nullptr, nullptr, 0, + check, context, + pkg, + pErrorCode); + if(!doCheckParent) { + return; + } + + /* + * if the bundle attributes are present and the nofallback flag is not set, + * then add the parent bundle as a dependency + */ + if(pInfo->formatVersion[0]>1 
|| (pInfo->formatVersion[0]==1 && pInfo->formatVersion[1]>=1)) { + if(!resData.noFallback) { + /* this bundle participates in locale fallback */ + checkParent(itemName, check, context, pErrorCode); + } + } +} + +// get dependencies from conversion tables --------------------------------- *** + +#if !UCONFIG_NO_CONVERSION +/* code adapted from ucnv_swap() */ +static void +ucnv_enumDependencies(const UDataSwapper *ds, + const char *itemName, const UDataInfo *pInfo, + const uint8_t *inBytes, int32_t length, + CheckDependency check, void *context, + UErrorCode *pErrorCode) { + uint32_t staticDataSize; + + const UConverterStaticData *inStaticData; + + const _MBCSHeader *inMBCSHeader; + uint8_t outputType; + + /* check format version */ + if(!( + pInfo->formatVersion[0]==6 && + pInfo->formatVersion[1]>=2 + )) { + fprintf(stderr, "icupkg/ucnv_enumDependencies(): .cnv format version %02x.%02x not supported\n", + pInfo->formatVersion[0], pInfo->formatVersion[1]); + exit(U_UNSUPPORTED_ERROR); + } + + /* read the initial UConverterStaticData structure after the UDataInfo header */ + inStaticData=(const UConverterStaticData *)inBytes; + + if( length<(int32_t)sizeof(UConverterStaticData) || + (uint32_t)length<(staticDataSize=ds->readUInt32(inStaticData->structSize)) + ) { + udata_printError(ds, "icupkg/ucnv_enumDependencies(): too few bytes (%d after header) for an ICU .cnv conversion table\n", + length); + *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; + return; + } + + inBytes+=staticDataSize; + length-=(int32_t)staticDataSize; + + /* check for supported conversionType values */ + if(inStaticData->conversionType==UCNV_MBCS) { + /* MBCS data */ + uint32_t mbcsHeaderLength, mbcsHeaderFlags, mbcsHeaderOptions; + int32_t extOffset; + + inMBCSHeader=(const _MBCSHeader *)inBytes; + + if(length<(int32_t)sizeof(_MBCSHeader)) { + udata_printError(ds, "icupkg/ucnv_enumDependencies(): too few bytes (%d after headers) for an ICU MBCS .cnv conversion table\n", + length); + 
*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; + return; + } + if(inMBCSHeader->version[0]==4 && inMBCSHeader->version[1]>=1) { + mbcsHeaderLength=MBCS_HEADER_V4_LENGTH; + } else if(inMBCSHeader->version[0]==5 && inMBCSHeader->version[1]>=3 && + ((mbcsHeaderOptions=ds->readUInt32(inMBCSHeader->options))& + MBCS_OPT_UNKNOWN_INCOMPATIBLE_MASK)==0 + ) { + mbcsHeaderLength=mbcsHeaderOptions&MBCS_OPT_LENGTH_MASK; + } else { + udata_printError(ds, "icupkg/ucnv_enumDependencies(): unsupported _MBCSHeader.version %d.%d\n", + inMBCSHeader->version[0], inMBCSHeader->version[1]); + *pErrorCode=U_UNSUPPORTED_ERROR; + return; + } + + mbcsHeaderFlags=ds->readUInt32(inMBCSHeader->flags); + extOffset=(int32_t)(mbcsHeaderFlags>>8); + outputType=(uint8_t)mbcsHeaderFlags; + + if(outputType==MBCS_OUTPUT_EXT_ONLY) { + /* + * extension-only file, + * contains a base name instead of normal base table data + */ + char baseName[32]; + int32_t baseNameLength; + + /* there is extension data after the base data, see ucnv_ext.h */ + if(length<(extOffset+UCNV_EXT_INDEXES_MIN_LENGTH*4)) { + udata_printError(ds, "icupkg/ucnv_enumDependencies(): too few bytes (%d after headers) for an ICU MBCS .cnv conversion table with extension data\n", + length); + *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; + return; + } + + /* swap the base name, between the header and the extension data */ + const char *inBaseName=(const char *)inBytes+mbcsHeaderLength*4; + baseNameLength=(int32_t)strlen(inBaseName); + if(baseNameLength>=(int32_t)sizeof(baseName)) { + udata_printError(ds, "icupkg/ucnv_enumDependencies(%s): base name length %ld too long\n", + itemName, baseNameLength); + *pErrorCode=U_UNSUPPORTED_ERROR; + return; + } + ds->swapInvChars(ds, inBaseName, baseNameLength+1, baseName, pErrorCode); + + checkIDSuffix(itemName, baseName, -1, ".cnv", check, context, pErrorCode); + } + } +} + +// ICU data formats -------------------------------------------------------- *** + +static const struct { + uint8_t dataFormat[4]; +} 
dataFormats[]={ + { { 0x52, 0x65, 0x73, 0x42 } }, /* dataFormat="ResB" */ + { { 0x63, 0x6e, 0x76, 0x74 } }, /* dataFormat="cnvt" */ + { { 0x43, 0x76, 0x41, 0x6c } } /* dataFormat="CvAl" */ +}; + +enum { + FMT_RES, + FMT_CNV, + FMT_ALIAS, + FMT_COUNT +}; + +static int32_t +getDataFormat(const uint8_t dataFormat[4]) { + int32_t i; + + for(i=0; i<FMT_COUNT; ++i) { + if(0==memcmp(dataFormats[i].dataFormat, dataFormat, 4)) { + return i; + } + } + return -1; +} + +// enumerate dependencies of a package item -------------------------------- *** + +void +Package::enumDependencies(Item *pItem, void *context, CheckDependency check) { + int32_t infoLength, itemHeaderLength; + UErrorCode errorCode=U_ZERO_ERROR; + const UDataInfo *pInfo=getDataInfo(pItem->data, pItem->length, infoLength, itemHeaderLength, &errorCode); + if(U_FAILURE(errorCode)) { + return; // should not occur because readFile() checks headers + } + + // find the data format and call the corresponding function, if any + int32_t format=getDataFormat(pInfo->dataFormat); + if(format>=0) { + switch(format) { + case FMT_RES: + { + /* + * Swap the resource bundle (if necessary) so that we can use + * the normal runtime uresdata.c code to read it. + * We do not want to duplicate that code, especially not together with on-the-fly swapping. 
+ */ + NativeItem nrb(pItem, ures_swap); + ures_enumDependencies(pItem->name, nrb.getDataInfo(), nrb.getBytes(), nrb.getLength(), check, context, this, &errorCode); + break; + } + case FMT_CNV: + { + // TODO: share/cache swappers + UDataSwapper *ds=udata_openSwapper( + (UBool)pInfo->isBigEndian, pInfo->charsetFamily, + U_IS_BIG_ENDIAN, U_CHARSET_FAMILY, + &errorCode); + if(U_FAILURE(errorCode)) { + fprintf(stderr, "icupkg: udata_openSwapper(\"%s\") failed - %s\n", + pItem->name, u_errorName(errorCode)); + exit(errorCode); + } + + ds->printError=printError; + ds->printErrorContext=stderr; + + const uint8_t *inBytes=pItem->data+itemHeaderLength; + int32_t length=pItem->length-itemHeaderLength; + + ucnv_enumDependencies(ds, pItem->name, pInfo, inBytes, length, check, context, &errorCode); + udata_closeSwapper(ds); + break; + } + default: + break; + } + + if(U_FAILURE(errorCode)) { + exit(errorCode); + } + } +} +#endif /* UCONFIG_NO_CONVERSION */ + +U_NAMESPACE_END diff --git a/intl/icu/source/tools/toolutil/ppucd.cpp b/intl/icu/source/tools/toolutil/ppucd.cpp new file mode 100644 index 0000000000..0d59b28ce4 --- /dev/null +++ b/intl/icu/source/tools/toolutil/ppucd.cpp @@ -0,0 +1,622 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* Copyright (C) 2011-2014, International Business Machines +* Corporation and others. All Rights Reserved. +******************************************************************************* +* file name: ppucd.cpp +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2011dec11 +* created by: Markus W. 
Scherer +*/ + +#include "unicode/utypes.h" +#include "unicode/uchar.h" +#include "charstr.h" +#include "cstring.h" +#include "ppucd.h" +#include "uassert.h" +#include "uparse.h" + +#include <stdio.h> +#include <string.h> + +U_NAMESPACE_BEGIN + +PropertyNames::~PropertyNames() {} + +// TODO: Create a concrete subclass for the default PropertyNames implementation +// using the ICU library built-in property names API & data. +// Currently only the genprops tool uses PreparsedUCD, and provides its own +// PropertyNames implementation using its just-build property names data and its own code. +// At some point, we should use PreparsedUCD in tests, and then we will need the +// default implementation somewhere. +#if 0 +int32_t +PropertyNames::getPropertyEnum(const char *name) const { + return u_getPropertyEnum(name); +} + +int32_t +PropertyNames::getPropertyValueEnum(int32_t property, const char *name) const { + return u_getPropertyValueEnum((UProperty)property, name); +} +#endif + +UniProps::UniProps() + : start(U_SENTINEL), end(U_SENTINEL), + bmg(U_SENTINEL), bpb(U_SENTINEL), + scf(U_SENTINEL), slc(U_SENTINEL), stc(U_SENTINEL), suc(U_SENTINEL), + digitValue(-1), numericValue(nullptr), + name(nullptr), nameAlias(nullptr) { + memset(binProps, 0, sizeof(binProps)); + memset(intProps, 0, sizeof(intProps)); + memset(age, 0, 4); +} + +UniProps::~UniProps() {} + +const int32_t PreparsedUCD::kNumLineBuffers; + +PreparsedUCD::PreparsedUCD(const char *filename, UErrorCode &errorCode) + : pnames(nullptr), + file(nullptr), + defaultLineIndex(-1), blockLineIndex(-1), lineIndex(0), + lineNumber(0), + lineType(NO_LINE), + fieldLimit(nullptr), lineLimit(nullptr) { + if(U_FAILURE(errorCode)) { return; } + + if(filename==nullptr || *filename==0 || (*filename=='-' && filename[1]==0)) { + filename=nullptr; + file=stdin; + } else { + file=fopen(filename, "r"); + } + if(file==nullptr) { + perror("error opening preparsed UCD"); + fprintf(stderr, "error opening preparsed UCD file %s\n", 
filename ? filename : "\"no file name given\""); + errorCode=U_FILE_ACCESS_ERROR; + return; + } + + memset(ucdVersion, 0, 4); + lines[0][0]=0; +} + +PreparsedUCD::~PreparsedUCD() { + if(file!=stdin) { + fclose(file); + } +} + +// Same order as the LineType values. +static const char *lineTypeStrings[]={ + nullptr, + nullptr, + "ucd", + "property", + "binary", + "value", + "defaults", + "block", + "cp", + "unassigned", + "algnamesrange" +}; + +PreparsedUCD::LineType +PreparsedUCD::readLine(UErrorCode &errorCode) { + if(U_FAILURE(errorCode)) { return NO_LINE; } + // Select the next available line buffer. + while(!isLineBufferAvailable(lineIndex)) { + ++lineIndex; + if (lineIndex == kNumLineBuffers) { + lineIndex = 0; + } + } + char *line=lines[lineIndex]; + *line=0; + lineLimit=fieldLimit=line; + lineType=NO_LINE; + char *result=fgets(line, sizeof(lines[0]), file); + if(result==nullptr) { + if(ferror(file)) { + perror("error reading preparsed UCD"); + fprintf(stderr, "error reading preparsed UCD before line %ld\n", (long)lineNumber); + errorCode=U_FILE_ACCESS_ERROR; + } + return NO_LINE; + } + ++lineNumber; + if(*line=='#') { + fieldLimit=strchr(line, 0); + return lineType=EMPTY_LINE; + } + // Remove trailing /r/n. + char c; + char *limit=strchr(line, 0); + while(line<limit && ((c=*(limit-1))=='\n' || c=='\r')) { --limit; } + // Remove trailing white space. + while(line<limit && ((c=*(limit-1))==' ' || c=='\t')) { --limit; } + *limit=0; + lineLimit=limit; + if(line==limit) { + fieldLimit=limit; + return lineType=EMPTY_LINE; + } + // Split by ';'. + char *semi=line; + while((semi=strchr(semi, ';'))!=nullptr) { *semi++=0; } + fieldLimit=strchr(line, 0); + // Determine the line type. 
/*
 * Parses the property values of the current defaults/block/cp/unassigned line.
 *
 * newValues is cleared first and then receives the property codes that are
 * set on this line (see parseProperty()); for an unassigned line inside the
 * current block it additionally starts out with the block's values and is
 * pruned at the end.
 *
 * Returns a pointer to one of the member UniProps objects (defaultProps,
 * blockProps or cpProps), or nullptr with errorCode set:
 *   U_ILLEGAL_ARGUMENT_ERROR if the current line has no property values,
 *   U_PARSE_ERROR for a missing/invalid range or misplaced line,
 *   or whatever parseProperty() reported.
 */
const UniProps *
PreparsedUCD::getProps(UnicodeSet &newValues, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return nullptr; }
    newValues.clear();
    if(!lineHasPropertyValues()) {
        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return nullptr;
    }
    // Restart field iteration; the first field is the line type, the second the range.
    firstField();
    const char *field=nextField();
    if(field==nullptr) {
        // No range field after the type.
        fprintf(stderr,
                "error in preparsed UCD: missing default/block/cp range field "
                "(no second field) on line %ld\n",
                (long)lineNumber);
        errorCode=U_PARSE_ERROR;
        return nullptr;
    }
    UChar32 start, end;
    if(!parseCodePointRange(field, start, end, errorCode)) { return nullptr; }
    UniProps *props;
    UBool insideBlock=false;  // true if cp or unassigned range inside the block range.
    switch(lineType) {
    case DEFAULTS_LINE:
        // Should occur before any block/cp/unassigned line.
        if(blockLineIndex>=0) {
            fprintf(stderr,
                    "error in preparsed UCD: default line %ld after one or more block lines\n",
                    (long)lineNumber);
            errorCode=U_PARSE_ERROR;
            return nullptr;
        }
        if(defaultLineIndex>=0) {
            fprintf(stderr,
                    "error in preparsed UCD: second line with default properties on line %ld\n",
                    (long)lineNumber);
            errorCode=U_PARSE_ERROR;
            return nullptr;
        }
        if(start!=0 || end!=0x10ffff) {
            fprintf(stderr,
                    "error in preparsed UCD: default range must be 0..10FFFF, not '%s' on line %ld\n",
                    field, (long)lineNumber);
            errorCode=U_PARSE_ERROR;
            return nullptr;
        }
        props=&defaultProps;
        // Remember which line buffer holds the defaults so that readLine() does not reuse it.
        defaultLineIndex=lineIndex;
        break;
    case BLOCK_LINE:
        blockProps=defaultProps;  // Block inherits default properties.
        props=&blockProps;
        // Remember which line buffer holds the block line so that readLine() does not reuse it.
        blockLineIndex=lineIndex;
        break;
    case CP_LINE:
    case UNASSIGNED_LINE:
        if(blockProps.start<=start && end<=blockProps.end) {
            insideBlock=true;
            if(lineType==CP_LINE) {
                // Code point range fully inside the last block inherits the block properties.
                cpProps=blockProps;
            } else {
                // Unassigned line inside the block is based on default properties
                // which override block properties.
                cpProps=defaultProps;
                newValues=blockValues;
                // Except, it inherits the one blk=Block property.
                int32_t blkIndex=UCHAR_BLOCK-UCHAR_INT_START;
                cpProps.intProps[blkIndex]=blockProps.intProps[blkIndex];
                newValues.remove((UChar32)UCHAR_BLOCK);
            }
        } else if(start>blockProps.end || end<blockProps.start) {
            // Code point range fully outside the last block inherits the default properties.
            cpProps=defaultProps;
        } else {
            // Code point range partially overlapping with the last block is illegal.
            fprintf(stderr,
                    "error in preparsed UCD: cp range %s on line %ld only "
                    "partially overlaps with block range %04lX..%04lX\n",
                    field, (long)lineNumber, (long)blockProps.start, (long)blockProps.end);
            errorCode=U_PARSE_ERROR;
            return nullptr;
        }
        props=&cpProps;
        break;
    default:
        // Will not occur because of the range check above.
        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return nullptr;
    }
    props->start=start;
    props->end=end;
    // Every remaining field is one property assignment.
    while((field=nextField())!=nullptr) {
        if(!parseProperty(*props, field, newValues, errorCode)) { return nullptr; }
    }
    if(lineType==BLOCK_LINE) {
        // Keep the block's own values for later unassigned lines inside this block.
        blockValues=newValues;
    } else if(lineType==UNASSIGNED_LINE && insideBlock) {
        // Unset newValues for values that are the same as the block values.
        for(int32_t prop=0; prop<UCHAR_BINARY_LIMIT; ++prop) {
            if(newValues.contains(prop) && cpProps.binProps[prop]==blockProps.binProps[prop]) {
                newValues.remove(prop);
            }
        }
        for(int32_t prop=UCHAR_INT_START; prop<UCHAR_INT_LIMIT; ++prop) {
            int32_t index=prop-UCHAR_INT_START;
            if(newValues.contains(prop) && cpProps.intProps[index]==blockProps.intProps[index]) {
                newValues.remove(prop);
            }
        }
    }
    return props;
}
+UBool +PreparsedUCD::parseProperty(UniProps &props, const char *field, UnicodeSet &newValues, + UErrorCode &errorCode) { + CharString pBuffer; + const char *p=field; + const char *v=strchr(p, '='); + int binaryValue; + if(*p=='-') { + if(v!=nullptr) { + fprintf(stderr, + "error in preparsed UCD: mix of binary-property-no and " + "enum-property syntax '%s' on line %ld\n", + field, (long)lineNumber); + errorCode=U_PARSE_ERROR; + return false; + } + binaryValue=0; + ++p; + } else if(v==nullptr) { + binaryValue=1; + } else { + binaryValue=-1; + // Copy out the property name rather than modifying the field (writing a NUL). + pBuffer.append(p, (int32_t)(v-p), errorCode); + p=pBuffer.data(); + ++v; + } + int32_t prop=pnames->getPropertyEnum(p); + if(prop<0) { + for(int32_t i=0;; ++i) { + if(i==UPRV_LENGTHOF(ppucdProperties)) { + // Ignore unknown property names. + return true; + } + if(0==uprv_stricmp(p, ppucdProperties[i].name)) { + prop=ppucdProperties[i].prop; + U_ASSERT(prop>=0); + break; + } + } + } + if(prop<UCHAR_BINARY_LIMIT) { + if(binaryValue>=0) { + props.binProps[prop]=(UBool)binaryValue; + } else { + // No binary value for a binary property. + fprintf(stderr, + "error in preparsed UCD: enum-property syntax '%s' " + "for binary property on line %ld\n", + field, (long)lineNumber); + errorCode=U_PARSE_ERROR; + } + } else if(binaryValue>=0) { + // Binary value for a non-binary property. 
+ fprintf(stderr, + "error in preparsed UCD: binary-property syntax '%s' " + "for non-binary property on line %ld\n", + field, (long)lineNumber); + errorCode=U_PARSE_ERROR; + } else if (prop < UCHAR_INT_START) { + fprintf(stderr, + "error in preparsed UCD: prop value is invalid: '%d' for line %ld\n", + prop, (long)lineNumber); + errorCode=U_PARSE_ERROR; + } else if(prop<UCHAR_INT_LIMIT) { + int32_t value=pnames->getPropertyValueEnum(prop, v); + if(value==UCHAR_INVALID_CODE && prop==UCHAR_CANONICAL_COMBINING_CLASS) { + // TODO: Make getPropertyValueEnum(UCHAR_CANONICAL_COMBINING_CLASS, v) work. + char *end; + unsigned long ccc=uprv_strtoul(v, &end, 10); + if(v<end && *end==0 && ccc<=254) { + value=(int32_t)ccc; + } + } + if(value==UCHAR_INVALID_CODE) { + fprintf(stderr, + "error in preparsed UCD: '%s' is not a valid value on line %ld\n", + field, (long)lineNumber); + errorCode=U_PARSE_ERROR; + } else { + props.intProps[prop-UCHAR_INT_START]=value; + } + } else if(*v=='<') { + // Do not parse default values like <code point>, just set null values. 
+ switch(prop) { + case UCHAR_BIDI_MIRRORING_GLYPH: + props.bmg=U_SENTINEL; + break; + case UCHAR_BIDI_PAIRED_BRACKET: + props.bpb=U_SENTINEL; + break; + case UCHAR_SIMPLE_CASE_FOLDING: + props.scf=U_SENTINEL; + break; + case UCHAR_SIMPLE_LOWERCASE_MAPPING: + props.slc=U_SENTINEL; + break; + case UCHAR_SIMPLE_TITLECASE_MAPPING: + props.stc=U_SENTINEL; + break; + case UCHAR_SIMPLE_UPPERCASE_MAPPING: + props.suc=U_SENTINEL; + break; + case UCHAR_CASE_FOLDING: + props.cf.remove(); + break; + case UCHAR_LOWERCASE_MAPPING: + props.lc.remove(); + break; + case UCHAR_TITLECASE_MAPPING: + props.tc.remove(); + break; + case UCHAR_UPPERCASE_MAPPING: + props.uc.remove(); + break; + case UCHAR_SCRIPT_EXTENSIONS: + props.scx.clear(); + break; + default: + fprintf(stderr, + "error in preparsed UCD: '%s' is not a valid default value on line %ld\n", + field, (long)lineNumber); + errorCode=U_PARSE_ERROR; + } + } else { + char c; + switch(prop) { + case UCHAR_NUMERIC_VALUE: + props.numericValue=v; + c=*v; + if('0'<=c && c<='9' && v[1]==0) { + props.digitValue=c-'0'; + } else { + props.digitValue=-1; + } + break; + case UCHAR_NAME: + props.name=v; + break; + case UCHAR_AGE: + u_versionFromString(props.age, v); // Writes 0.0.0.0 if v is not numeric. 
+ break; + case UCHAR_BIDI_MIRRORING_GLYPH: + props.bmg=parseCodePoint(v, errorCode); + break; + case UCHAR_BIDI_PAIRED_BRACKET: + props.bpb=parseCodePoint(v, errorCode); + break; + case UCHAR_SIMPLE_CASE_FOLDING: + props.scf=parseCodePoint(v, errorCode); + break; + case UCHAR_SIMPLE_LOWERCASE_MAPPING: + props.slc=parseCodePoint(v, errorCode); + break; + case UCHAR_SIMPLE_TITLECASE_MAPPING: + props.stc=parseCodePoint(v, errorCode); + break; + case UCHAR_SIMPLE_UPPERCASE_MAPPING: + props.suc=parseCodePoint(v, errorCode); + break; + case UCHAR_CASE_FOLDING: + parseString(v, props.cf, errorCode); + break; + case UCHAR_LOWERCASE_MAPPING: + parseString(v, props.lc, errorCode); + break; + case UCHAR_TITLECASE_MAPPING: + parseString(v, props.tc, errorCode); + break; + case UCHAR_UPPERCASE_MAPPING: + parseString(v, props.uc, errorCode); + break; + case PPUCD_NAME_ALIAS: + props.nameAlias=v; + break; + case PPUCD_CONDITIONAL_CASE_MAPPINGS: + case PPUCD_TURKIC_CASE_FOLDING: + // No need to parse their values: They are hardcoded in the runtime library. + break; + case UCHAR_SCRIPT_EXTENSIONS: + parseScriptExtensions(v, props.scx, errorCode); + break; + default: + // Ignore unhandled properties. + return true; + } + } + if(U_SUCCESS(errorCode)) { + newValues.add((UChar32)prop); + return true; + } else { + return false; + } +} + +UBool +PreparsedUCD::getRangeForAlgNames(UChar32 &start, UChar32 &end, UErrorCode &errorCode) { + if(U_FAILURE(errorCode)) { return false; } + if(lineType!=ALG_NAMES_RANGE_LINE) { + errorCode=U_ILLEGAL_ARGUMENT_ERROR; + return false; + } + firstField(); + const char *field=nextField(); + if(field==nullptr) { + // No range field after the type. 
+ fprintf(stderr, + "error in preparsed UCD: missing algnamesrange range field " + "(no second field) on line %ld\n", + (long)lineNumber); + errorCode=U_PARSE_ERROR; + return false; + } + return parseCodePointRange(field, start, end, errorCode); +} + +UChar32 +PreparsedUCD::parseCodePoint(const char *s, UErrorCode &errorCode) { + char *end; + uint32_t value=(uint32_t)uprv_strtoul(s, &end, 16); + if(end<=s || *end!=0 || value>=0x110000) { + fprintf(stderr, + "error in preparsed UCD: '%s' is not a valid code point on line %ld\n", + s, (long)lineNumber); + errorCode=U_PARSE_ERROR; + return U_SENTINEL; + } + return (UChar32)value; +} + +UBool +PreparsedUCD::parseCodePointRange(const char *s, UChar32 &start, UChar32 &end, UErrorCode &errorCode) { + uint32_t st, e; + u_parseCodePointRange(s, &st, &e, &errorCode); + if(U_FAILURE(errorCode)) { + fprintf(stderr, + "error in preparsed UCD: '%s' is not a valid code point range on line %ld\n", + s, (long)lineNumber); + return false; + } + start=(UChar32)st; + end=(UChar32)e; + return true; +} + +void +PreparsedUCD::parseString(const char *s, UnicodeString &uni, UErrorCode &errorCode) { + char16_t *buffer=toUCharPtr(uni.getBuffer(-1)); + int32_t length=u_parseString(s, buffer, uni.getCapacity(), nullptr, &errorCode); + if(errorCode==U_BUFFER_OVERFLOW_ERROR) { + errorCode=U_ZERO_ERROR; + uni.releaseBuffer(0); + buffer=toUCharPtr(uni.getBuffer(length)); + length=u_parseString(s, buffer, uni.getCapacity(), nullptr, &errorCode); + } + uni.releaseBuffer(length); + if(U_FAILURE(errorCode)) { + fprintf(stderr, + "error in preparsed UCD: '%s' is not a valid Unicode string on line %ld\n", + s, (long)lineNumber); + } +} + +void +PreparsedUCD::parseScriptExtensions(const char *s, UnicodeSet &scx, UErrorCode &errorCode) { + if(U_FAILURE(errorCode)) { return; } + scx.clear(); + CharString scString; + for(;;) { + const char *scs; + const char *scLimit=strchr(s, ' '); + if(scLimit!=nullptr) { + scs=scString.clear().append(s, 
(int32_t)(scLimit-s), errorCode).data(); + if(U_FAILURE(errorCode)) { return; } + } else { + scs=s; + } + int32_t script=pnames->getPropertyValueEnum(UCHAR_SCRIPT, scs); + if(script==UCHAR_INVALID_CODE) { + fprintf(stderr, + "error in preparsed UCD: '%s' is not a valid script code on line %ld\n", + scs, (long)lineNumber); + errorCode=U_PARSE_ERROR; + return; + } else if(scx.contains(script)) { + fprintf(stderr, + "error in preparsed UCD: scx has duplicate '%s' codes on line %ld\n", + scs, (long)lineNumber); + errorCode=U_PARSE_ERROR; + return; + } else { + scx.add(script); + } + if(scLimit!=nullptr) { + s=scLimit+1; + } else { + break; + } + } + if(scx.isEmpty()) { + fprintf(stderr, "error in preparsed UCD: empty scx= on line %ld\n", (long)lineNumber); + errorCode=U_PARSE_ERROR; + } +} + +U_NAMESPACE_END diff --git a/intl/icu/source/tools/toolutil/ppucd.h b/intl/icu/source/tools/toolutil/ppucd.h new file mode 100644 index 0000000000..d5c63fab49 --- /dev/null +++ b/intl/icu/source/tools/toolutil/ppucd.h @@ -0,0 +1,180 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* Copyright (C) 2011-2013, International Business Machines +* Corporation and others. All Rights Reserved. +******************************************************************************* +* file name: ppucd.h +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2011dec11 +* created by: Markus W. Scherer +*/ + +#ifndef __PPUCD_H__ +#define __PPUCD_H__ + +#include "unicode/utypes.h" +#include "unicode/uniset.h" +#include "unicode/unistr.h" + +#include <stdio.h> + +/** Additions to the uchar.h enum UProperty. 
/**
 * Line-oriented reader for a preparsed UCD file.
 * readLine() loads and classifies one line at a time; the remaining methods
 * give access to the fields and property values of the line read last.
 */
class U_TOOLUTIL_API PreparsedUCD {
public:
    enum LineType {
        /** No line, end of file. */
        NO_LINE,
        /** Empty line. (Might contain a comment.) */
        EMPTY_LINE,

        /** ucd;6.1.0 */
        UNICODE_VERSION_LINE,

        /** property;Binary;Alpha;Alphabetic */
        PROPERTY_LINE,
        /** binary;N;No;F;False */
        BINARY_LINE,
        /** value;gc;Zs;Space_Separator */
        VALUE_LINE,

        /** defaults;0000..10FFFF;age=NA;bc=L;... */
        DEFAULTS_LINE,
        /** block;0000..007F;age=1.1;blk=ASCII;ea=Na;... */
        BLOCK_LINE,
        /** cp;0030;AHex;bc=EN;gc=Nd;na=DIGIT ZERO;... */
        CP_LINE,
        /** unassigned;E01F0..E0FFF;bc=BN;CWKCF;DI;GCB=CN;NFKC_CF= */
        UNASSIGNED_LINE,

        /** algnamesrange;4E00..9FCC;han;CJK UNIFIED IDEOGRAPH- */
        ALG_NAMES_RANGE_LINE,

        LINE_TYPE_COUNT
    };

    /**
     * Constructor.
     * Opens the preparsed UCD file for reading.
     * A null or empty filename, or "-", selects stdin.
     * Sets errorCode to U_FILE_ACCESS_ERROR if the file cannot be opened.
     */
    PreparsedUCD(const char *filename, UErrorCode &errorCode);

    /** Destructor. Closes the file unless it is stdin. */
    ~PreparsedUCD();

    /** Sets (aliases) a PropertyNames implementation. Caller retains ownership. */
    void setPropertyNames(const PropertyNames *pn) { pnames=pn; }

    /**
     * Reads a line from the preparsed UCD file.
     * Splits the line by replacing each ';' with a NUL.
     * Returns the type of the line, or NO_LINE at the end of the file or on error.
     */
    LineType readLine(UErrorCode &errorCode);

    /** Returns the number of the line read by readLine(). */
    int32_t getLineNumber() const { return lineNumber; }

    /** Returns the line's next field, or nullptr. */
    const char *nextField();

    /** Returns the Unicode version when or after the UNICODE_VERSION_LINE has been read. */
    const UVersionInfo &getUnicodeVersion() const { return ucdVersion; }

    /** Returns true if the current line has property values. */
    UBool lineHasPropertyValues() const {
        return DEFAULTS_LINE<=lineType && lineType<=UNASSIGNED_LINE;
    }

    /**
     * Parses properties from the current line.
     * Clears newValues and sets UProperty codes for property values mentioned
     * on the current line (as opposed to being inherited).
     * Returns a pointer to the filled-in UniProps, or nullptr if something went wrong.
     * The returned UniProps are usable until the next line of the same type is read.
     */
    const UniProps *getProps(UnicodeSet &newValues, UErrorCode &errorCode);

    /**
     * Returns the code point range for the current algnamesrange line.
     * Calls & parses nextField().
     * Further nextField() calls will yield the range's type & prefix string.
     * Returns U_SUCCESS(errorCode).
     */
    UBool getRangeForAlgNames(UChar32 &start, UChar32 &end, UErrorCode &errorCode);

private:
    /** Returns true if line buffer i holds neither the defaults line nor the current block line. */
    UBool isLineBufferAvailable(int32_t i) {
        return defaultLineIndex!=i && blockLineIndex!=i;
    }

    /** Resets the field iterator and returns the line's first field (the line type field). */
    const char *firstField();

    /** Parses one property field into props; returns true if parsing of further fields may continue. */
    UBool parseProperty(UniProps &props, const char *field, UnicodeSet &newValues,
                        UErrorCode &errorCode);
    /** Parses a hexadecimal code point; returns U_SENTINEL and sets errorCode on error. */
    UChar32 parseCodePoint(const char *s, UErrorCode &errorCode);
    /** Parses a code point range into start and end; returns false and reports on error. */
    UBool parseCodePointRange(const char *s, UChar32 &start, UChar32 &end, UErrorCode &errorCode);
    /** Parses a string value into uni. */
    void parseString(const char *s, UnicodeString &uni, UErrorCode &errorCode);
    /** Parses a space-separated list of script codes into scx. */
    void parseScriptExtensions(const char *s, UnicodeSet &scx, UErrorCode &errorCode);

    static const int32_t kNumLineBuffers=3;

    const PropertyNames *pnames;  // aliased
    FILE *file;
    // Indexes into lines[]: buffers holding the defaults line and the last block line
    // (-1 until such a line has been parsed), and the buffer of the current line.
    int32_t defaultLineIndex, blockLineIndex, lineIndex;
    int32_t lineNumber;
    LineType lineType;
    // End of the current field and end of the current line, within the current line buffer.
    char *fieldLimit;
    char *lineLimit;

    UVersionInfo ucdVersion;
    UniProps defaultProps, blockProps, cpProps;
    // Property codes that were set explicitly on the last block line.
    UnicodeSet blockValues;
    // Multiple lines so that default and block properties can maintain pointers
    // into their line buffers.
    char lines[kNumLineBuffers][4096];
};
+// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2005-2014, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: swapimpl.cpp +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2005may05 +* created by: Markus W. Scherer +* +* Data file swapping functions moved here from the common library +* because some data is hardcoded in ICU4C and need not be swapped any more. +* Moving the functions here simplifies testing (for code coverage) because +* we need not jump through hoops (like adding snapshots of these files +* to testdata). +* +* The declarations for these functions remain in the internal header files +* in icu/source/common/ +*/ + +#include "unicode/utypes.h" +#include "unicode/putil.h" +#include "unicode/udata.h" + +/* Explicit include statement for std_string.h is needed + * for compilation on certain platforms. (e.g.
AIX/VACPP) + */ +#include "unicode/std_string.h" + +#include "cmemory.h" +#include "cstring.h" +#include "uinvchar.h" +#include "uassert.h" +#include "uarrsort.h" +#include "ucmndata.h" +#include "udataswp.h" +#include "ulayout_props.h" + +/* swapping implementations in common */ + +#include "emojiprops.h" +#include "uresdata.h" +#include "ucnv_io.h" +#include "uprops.h" +#include "ucase.h" +#include "ubidi_props.h" +#include "ucol_swp.h" +#include "ucnv_bld.h" +#include "unormimp.h" +#include "normalizer2impl.h" +#include "sprpimpl.h" +#include "propname.h" +#include "rbbidata.h" +#include "utrie.h" +#include "utrie2.h" +#include "dictionarydata.h" + +/* swapping implementations in i18n */ + +#if !UCONFIG_NO_NORMALIZATION +#include "uspoof_impl.h" +#endif + +U_NAMESPACE_USE + +/* definitions */ + +/* Unicode property (value) aliases data swapping --------------------------- */ + +static int32_t U_CALLCONV +upname_swap(const UDataSwapper *ds, + const void *inData, int32_t length, void *outData, + UErrorCode *pErrorCode) { + /* udata_swapDataHeader checks the arguments */ + int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode); + if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) { + return 0; + } + + /* check data format and format version */ + const UDataInfo *pInfo= + reinterpret_cast<const UDataInfo *>( + static_cast<const char *>(inData)+4); + if(!( + pInfo->dataFormat[0]==0x70 && /* dataFormat="pnam" */ + pInfo->dataFormat[1]==0x6e && + pInfo->dataFormat[2]==0x61 && + pInfo->dataFormat[3]==0x6d && + pInfo->formatVersion[0]==2 + )) { + udata_printError(ds, "upname_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as pnames.icu\n", + pInfo->dataFormat[0], pInfo->dataFormat[1], + pInfo->dataFormat[2], pInfo->dataFormat[3], + pInfo->formatVersion[0]); + *pErrorCode=U_UNSUPPORTED_ERROR; + return 0; + } + + const uint8_t *inBytes=static_cast<const uint8_t *>(inData)+headerSize; + uint8_t 
*outBytes=static_cast<uint8_t *>(outData)+headerSize; + + if(length>=0) { + length-=headerSize; + // formatVersion 2 initially has indexes[8], 32 bytes. + if(length<32) { + udata_printError(ds, "upname_swap(): too few bytes (%d after header) for pnames.icu\n", + (int)length); + *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; + return 0; + } + } + + const int32_t *inIndexes=reinterpret_cast<const int32_t *>(inBytes); + int32_t totalSize=udata_readInt32(ds, inIndexes[PropNameData::IX_TOTAL_SIZE]); + if(length>=0) { + if(length<totalSize) { + udata_printError(ds, "upname_swap(): too few bytes (%d after header, should be %d) " + "for pnames.icu\n", + (int)length, (int)totalSize); + *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; + return 0; + } + + int32_t numBytesIndexesAndValueMaps= + udata_readInt32(ds, inIndexes[PropNameData::IX_BYTE_TRIES_OFFSET]); + + // Swap the indexes[] and the valueMaps[]. + ds->swapArray32(ds, inBytes, numBytesIndexesAndValueMaps, outBytes, pErrorCode); + + // Copy the rest of the data. + if(inBytes!=outBytes) { + uprv_memcpy(outBytes+numBytesIndexesAndValueMaps, + inBytes+numBytesIndexesAndValueMaps, + totalSize-numBytesIndexesAndValueMaps); + } + + // We need not swap anything else: + // + // The ByteTries are already byte-serialized, and are fixed on ASCII. + // (On an EBCDIC machine, the input string is converted to lowercase ASCII + // while matching.) + // + // The name groups are mostly invariant characters, but since we only + // generate, and keep in subversion, ASCII versions of pnames.icu, + // and since only ICU4J uses the pnames.icu data file + // (the data is hardcoded in ICU4C) and ICU4J uses ASCII data files, + // we just copy those bytes too. 
+ } + + return headerSize+totalSize; +} + +/* Unicode properties data swapping ----------------------------------------- */ + +static int32_t U_CALLCONV +uprops_swap(const UDataSwapper *ds, + const void *inData, int32_t length, void *outData, + UErrorCode *pErrorCode) { + const UDataInfo *pInfo; + int32_t headerSize, i; + + int32_t dataIndexes[UPROPS_INDEX_COUNT]; + const int32_t *inData32; + + /* udata_swapDataHeader checks the arguments */ + headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode); + if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) { + return 0; + } + + /* check data format and format version */ + pInfo=(const UDataInfo *)((const char *)inData+4); + if(!( + pInfo->dataFormat[0]==0x55 && /* dataFormat="UPro" */ + pInfo->dataFormat[1]==0x50 && + pInfo->dataFormat[2]==0x72 && + pInfo->dataFormat[3]==0x6f && + (3<=pInfo->formatVersion[0] && pInfo->formatVersion[0]<=7) && + (pInfo->formatVersion[0]>=7 || + (pInfo->formatVersion[2]==UTRIE_SHIFT && + pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT)) + )) { + udata_printError(ds, "uprops_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not a Unicode properties file\n", + pInfo->dataFormat[0], pInfo->dataFormat[1], + pInfo->dataFormat[2], pInfo->dataFormat[3], + pInfo->formatVersion[0]); + *pErrorCode=U_UNSUPPORTED_ERROR; + return 0; + } + + /* the properties file must contain at least the indexes array */ + if(length>=0 && (length-headerSize)<(int32_t)sizeof(dataIndexes)) { + udata_printError(ds, "uprops_swap(): too few bytes (%d after header) for a Unicode properties file\n", + length-headerSize); + *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; + return 0; + } + + /* read the indexes */ + inData32=(const int32_t *)((const char *)inData+headerSize); + for(i=0; i<UPROPS_INDEX_COUNT; ++i) { + dataIndexes[i]=udata_readInt32(ds, inData32[i]); + } + + /* + * comments are copied from the data format description in genprops/store.c + * indexes[] constants are in uprops.h + */ + int32_t 
dataTop; + if(length>=0) { + int32_t *outData32; + + /* + * In formatVersion 7, UPROPS_DATA_TOP_INDEX has the post-header data size. + * In earlier formatVersions, it is 0 and a lower dataIndexes entry + * has the top of the last item. + */ + for(i=UPROPS_DATA_TOP_INDEX; i>0 && (dataTop=dataIndexes[i])==0; --i) {} + + if((length-headerSize)<(4*dataTop)) { + udata_printError(ds, "uprops_swap(): too few bytes (%d after header) for a Unicode properties file\n", + length-headerSize); + *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; + return 0; + } + + outData32=(int32_t *)((char *)outData+headerSize); + + /* copy everything for inaccessible data (padding) */ + if(inData32!=outData32) { + uprv_memcpy(outData32, inData32, 4*(size_t)dataTop); + } + + /* swap the indexes[16] */ + ds->swapArray32(ds, inData32, 4*UPROPS_INDEX_COUNT, outData32, pErrorCode); + + /* + * swap the main properties UTrie + * PT serialized properties trie, see utrie.h (byte size: 4*(i0-16)) + */ + utrie_swapAnyVersion(ds, + inData32+UPROPS_INDEX_COUNT, + 4*(dataIndexes[UPROPS_PROPS32_INDEX]-UPROPS_INDEX_COUNT), + outData32+UPROPS_INDEX_COUNT, + pErrorCode); + + /* + * swap the properties and exceptions words + * P const uint32_t props32[i1-i0]; + * E const uint32_t exceptions[i2-i1]; + */ + ds->swapArray32(ds, + inData32+dataIndexes[UPROPS_PROPS32_INDEX], + 4*(dataIndexes[UPROPS_EXCEPTIONS_TOP_INDEX]-dataIndexes[UPROPS_PROPS32_INDEX]), + outData32+dataIndexes[UPROPS_PROPS32_INDEX], + pErrorCode); + + /* + * swap the UChars + * U const char16_t uchars[2*(i3-i2)]; + */ + ds->swapArray16(ds, + inData32+dataIndexes[UPROPS_EXCEPTIONS_TOP_INDEX], + 4*(dataIndexes[UPROPS_ADDITIONAL_TRIE_INDEX]-dataIndexes[UPROPS_EXCEPTIONS_TOP_INDEX]), + outData32+dataIndexes[UPROPS_EXCEPTIONS_TOP_INDEX], + pErrorCode); + + /* + * swap the additional UTrie + * i3 additionalTrieIndex; -- 32-bit unit index to the additional trie for more properties + */ + utrie_swapAnyVersion(ds, + inData32+dataIndexes[UPROPS_ADDITIONAL_TRIE_INDEX], 
+ 4*(dataIndexes[UPROPS_ADDITIONAL_VECTORS_INDEX]-dataIndexes[UPROPS_ADDITIONAL_TRIE_INDEX]), + outData32+dataIndexes[UPROPS_ADDITIONAL_TRIE_INDEX], + pErrorCode); + + /* + * swap the properties vectors + * PV const uint32_t propsVectors[(i6-i4)/i5][i5]==uint32_t propsVectors[i6-i4]; + */ + ds->swapArray32(ds, + inData32+dataIndexes[UPROPS_ADDITIONAL_VECTORS_INDEX], + 4*(dataIndexes[UPROPS_SCRIPT_EXTENSIONS_INDEX]-dataIndexes[UPROPS_ADDITIONAL_VECTORS_INDEX]), + outData32+dataIndexes[UPROPS_ADDITIONAL_VECTORS_INDEX], + pErrorCode); + + // swap the Script_Extensions data + // SCX const uint16_t scriptExtensions[2*(i7-i6)]; + ds->swapArray16(ds, + inData32+dataIndexes[UPROPS_SCRIPT_EXTENSIONS_INDEX], + 4*(dataIndexes[UPROPS_RESERVED_INDEX_7]-dataIndexes[UPROPS_SCRIPT_EXTENSIONS_INDEX]), + outData32+dataIndexes[UPROPS_SCRIPT_EXTENSIONS_INDEX], + pErrorCode); + } + + /* i7 reservedIndex7; -- 32-bit unit index to the top of the Script_Extensions data */ + return headerSize+4*dataIndexes[UPROPS_RESERVED_INDEX_7]; +} + +/* Unicode case mapping data swapping --------------------------------------- */ + +static int32_t U_CALLCONV +ucase_swap(const UDataSwapper *ds, + const void *inData, int32_t length, void *outData, + UErrorCode *pErrorCode) { + const UDataInfo *pInfo; + int32_t headerSize; + + const uint8_t *inBytes; + uint8_t *outBytes; + + const int32_t *inIndexes; + int32_t indexes[16]; + + int32_t i, offset, count, size; + + /* udata_swapDataHeader checks the arguments */ + headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode); + if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) { + return 0; + } + + /* check data format and format version */ + pInfo=(const UDataInfo *)((const char *)inData+4); + if(!( + pInfo->dataFormat[0]==UCASE_FMT_0 && /* dataFormat="cAsE" */ + pInfo->dataFormat[1]==UCASE_FMT_1 && + pInfo->dataFormat[2]==UCASE_FMT_2 && + pInfo->dataFormat[3]==UCASE_FMT_3 && + ((pInfo->formatVersion[0]==1 && + 
pInfo->formatVersion[2]==UTRIE_SHIFT && + pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT) || + (2<=pInfo->formatVersion[0] && pInfo->formatVersion[0]<=4)) + )) { + udata_printError(ds, "ucase_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as case mapping data\n", + pInfo->dataFormat[0], pInfo->dataFormat[1], + pInfo->dataFormat[2], pInfo->dataFormat[3], + pInfo->formatVersion[0]); + *pErrorCode=U_UNSUPPORTED_ERROR; + return 0; + } + + inBytes=(const uint8_t *)inData+headerSize; + outBytes=(uint8_t *)outData+headerSize; + + inIndexes=(const int32_t *)inBytes; + + if(length>=0) { + length-=headerSize; + if(length<16*4) { + udata_printError(ds, "ucase_swap(): too few bytes (%d after header) for case mapping data\n", + length); + *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; + return 0; + } + } + + /* read the first 16 indexes (ICU 3.2/format version 1: UCASE_IX_TOP==16, might grow) */ + for(i=0; i<16; ++i) { + indexes[i]=udata_readInt32(ds, inIndexes[i]); + } + + /* get the total length of the data */ + size=indexes[UCASE_IX_LENGTH]; + + if(length>=0) { + if(length<size) { + udata_printError(ds, "ucase_swap(): too few bytes (%d after header) for all of case mapping data\n", + length); + *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; + return 0; + } + + /* copy the data for inaccessible bytes */ + if(inBytes!=outBytes) { + uprv_memcpy(outBytes, inBytes, size); + } + + offset=0; + + /* swap the int32_t indexes[] */ + count=indexes[UCASE_IX_INDEX_TOP]*4; + ds->swapArray32(ds, inBytes, count, outBytes, pErrorCode); + offset+=count; + + /* swap the UTrie */ + count=indexes[UCASE_IX_TRIE_SIZE]; + utrie_swapAnyVersion(ds, inBytes+offset, count, outBytes+offset, pErrorCode); + offset+=count; + + /* swap the uint16_t exceptions[] and unfold[] */ + count=(indexes[UCASE_IX_EXC_LENGTH]+indexes[UCASE_IX_UNFOLD_LENGTH])*2; + ds->swapArray16(ds, inBytes+offset, count, outBytes+offset, pErrorCode); + offset+=count; + + U_ASSERT(offset==size); + } + + return 
headerSize+size; +} + +/* Unicode bidi/shaping data swapping --------------------------------------- */ + +static int32_t U_CALLCONV +ubidi_swap(const UDataSwapper *ds, + const void *inData, int32_t length, void *outData, + UErrorCode *pErrorCode) { + const UDataInfo *pInfo; + int32_t headerSize; + + const uint8_t *inBytes; + uint8_t *outBytes; + + const int32_t *inIndexes; + int32_t indexes[16]; + + int32_t i, offset, count, size; + + /* udata_swapDataHeader checks the arguments */ + headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode); + if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) { + return 0; + } + + /* check data format and format version */ + pInfo=(const UDataInfo *)((const char *)inData+4); + if(!( + pInfo->dataFormat[0]==UBIDI_FMT_0 && /* dataFormat="BiDi" */ + pInfo->dataFormat[1]==UBIDI_FMT_1 && + pInfo->dataFormat[2]==UBIDI_FMT_2 && + pInfo->dataFormat[3]==UBIDI_FMT_3 && + ((pInfo->formatVersion[0]==1 && + pInfo->formatVersion[2]==UTRIE_SHIFT && + pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT) || + pInfo->formatVersion[0]==2) + )) { + udata_printError(ds, "ubidi_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as bidi/shaping data\n", + pInfo->dataFormat[0], pInfo->dataFormat[1], + pInfo->dataFormat[2], pInfo->dataFormat[3], + pInfo->formatVersion[0]); + *pErrorCode=U_UNSUPPORTED_ERROR; + return 0; + } + + inBytes=(const uint8_t *)inData+headerSize; + outBytes=(uint8_t *)outData+headerSize; + + inIndexes=(const int32_t *)inBytes; + + if(length>=0) { + length-=headerSize; + if(length<16*4) { + udata_printError(ds, "ubidi_swap(): too few bytes (%d after header) for bidi/shaping data\n", + length); + *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; + return 0; + } + } + + /* read the first 16 indexes (ICU 3.4/format version 1: UBIDI_IX_TOP==16, might grow) */ + for(i=0; i<16; ++i) { + indexes[i]=udata_readInt32(ds, inIndexes[i]); + } + + /* get the total length of the data */ + size=indexes[UBIDI_IX_LENGTH]; + + 
if(length>=0) { + if(length<size) { + udata_printError(ds, "ubidi_swap(): too few bytes (%d after header) for all of bidi/shaping data\n", + length); + *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; + return 0; + } + + /* copy the data for inaccessible bytes */ + if(inBytes!=outBytes) { + uprv_memcpy(outBytes, inBytes, size); + } + + offset=0; + + /* swap the int32_t indexes[] */ + count=indexes[UBIDI_IX_INDEX_TOP]*4; + ds->swapArray32(ds, inBytes, count, outBytes, pErrorCode); + offset+=count; + + /* swap the UTrie */ + count=indexes[UBIDI_IX_TRIE_SIZE]; + utrie_swapAnyVersion(ds, inBytes+offset, count, outBytes+offset, pErrorCode); + offset+=count; + + /* swap the uint32_t mirrors[] */ + count=indexes[UBIDI_IX_MIRROR_LENGTH]*4; + ds->swapArray32(ds, inBytes+offset, count, outBytes+offset, pErrorCode); + offset+=count; + + /* just skip the uint8_t jgArray[] and jgArray2[] */ + count=indexes[UBIDI_IX_JG_LIMIT]-indexes[UBIDI_IX_JG_START]; + offset+=count; + count=indexes[UBIDI_IX_JG_LIMIT2]-indexes[UBIDI_IX_JG_START2]; + offset+=count; + + U_ASSERT(offset==size); + } + + return headerSize+size; +} + +/* Unicode normalization data swapping -------------------------------------- */ + +#if !UCONFIG_NO_NORMALIZATION + +static int32_t U_CALLCONV +unorm_swap(const UDataSwapper *ds, + const void *inData, int32_t length, void *outData, + UErrorCode *pErrorCode) { + const UDataInfo *pInfo; + int32_t headerSize; + + const uint8_t *inBytes; + uint8_t *outBytes; + + const int32_t *inIndexes; + int32_t indexes[32]; + + int32_t i, offset, count, size; + + /* udata_swapDataHeader checks the arguments */ + headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode); + if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) { + return 0; + } + + /* check data format and format version */ + pInfo=(const UDataInfo *)((const char *)inData+4); + if(!( + pInfo->dataFormat[0]==0x4e && /* dataFormat="Norm" */ + pInfo->dataFormat[1]==0x6f && + pInfo->dataFormat[2]==0x72 && + 
pInfo->dataFormat[3]==0x6d && + pInfo->formatVersion[0]==2 + )) { + udata_printError(ds, "unorm_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as unorm.icu\n", + pInfo->dataFormat[0], pInfo->dataFormat[1], + pInfo->dataFormat[2], pInfo->dataFormat[3], + pInfo->formatVersion[0]); + *pErrorCode=U_UNSUPPORTED_ERROR; + return 0; + } + + inBytes=(const uint8_t *)inData+headerSize; + outBytes=(uint8_t *)outData+headerSize; + + inIndexes=(const int32_t *)inBytes; + + if(length>=0) { + length-=headerSize; + if(length<32*4) { + udata_printError(ds, "unorm_swap(): too few bytes (%d after header) for unorm.icu\n", + length); + *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; + return 0; + } + } + + /* read the first 32 indexes (ICU 2.8/format version 2.2: _NORM_INDEX_TOP==32, might grow) */ + for(i=0; i<32; ++i) { + indexes[i]=udata_readInt32(ds, inIndexes[i]); + } + + /* calculate the total length of the data */ + size= + 32*4+ /* size of indexes[] */ + indexes[_NORM_INDEX_TRIE_SIZE]+ + indexes[_NORM_INDEX_UCHAR_COUNT]*2+ + indexes[_NORM_INDEX_COMBINE_DATA_COUNT]*2+ + indexes[_NORM_INDEX_FCD_TRIE_SIZE]+ + indexes[_NORM_INDEX_AUX_TRIE_SIZE]+ + indexes[_NORM_INDEX_CANON_SET_COUNT]*2; + + if(length>=0) { + if(length<size) { + udata_printError(ds, "unorm_swap(): too few bytes (%d after header) for all of unorm.icu\n", + length); + *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; + return 0; + } + + /* copy the data for inaccessible bytes */ + if(inBytes!=outBytes) { + uprv_memcpy(outBytes, inBytes, size); + } + + offset=0; + + /* swap the indexes[] */ + count=32*4; + ds->swapArray32(ds, inBytes, count, outBytes, pErrorCode); + offset+=count; + + /* swap the main UTrie */ + count=indexes[_NORM_INDEX_TRIE_SIZE]; + utrie_swap(ds, inBytes+offset, count, outBytes+offset, pErrorCode); + offset+=count; + + /* swap the uint16_t extraData[] and the uint16_t combiningTable[] */ + count=(indexes[_NORM_INDEX_UCHAR_COUNT]+indexes[_NORM_INDEX_COMBINE_DATA_COUNT])*2; + 
ds->swapArray16(ds, inBytes+offset, count, outBytes+offset, pErrorCode); + offset+=count; + + /* swap the FCD UTrie */ + count=indexes[_NORM_INDEX_FCD_TRIE_SIZE]; + if(count!=0) { + utrie_swap(ds, inBytes+offset, count, outBytes+offset, pErrorCode); + offset+=count; + } + + /* swap the aux UTrie */ + count=indexes[_NORM_INDEX_AUX_TRIE_SIZE]; + if(count!=0) { + utrie_swap(ds, inBytes+offset, count, outBytes+offset, pErrorCode); + offset+=count; + } + + /* swap the uint16_t canonical-set data (_NORM_INDEX_CANON_SET_COUNT units) */ + count=indexes[_NORM_INDEX_CANON_SET_COUNT]*2; + ds->swapArray16(ds, inBytes+offset, count, outBytes+offset, pErrorCode); + offset+=count; + } + + return headerSize+size; +} + +#endif + +// Unicode text layout properties data swapping -------------------------------- + +static int32_t U_CALLCONV +ulayout_swap(const UDataSwapper *ds, + const void *inData, int32_t length, void *outData, + UErrorCode *pErrorCode) { + // udata_swapDataHeader checks the arguments. + int32_t headerSize = udata_swapDataHeader(ds, inData, length, outData, pErrorCode); + if (pErrorCode == nullptr || U_FAILURE(*pErrorCode)) { + return 0; + } + + // Check data format and format version.
+ const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData + 4); + if (!( + pInfo->dataFormat[0] == ULAYOUT_FMT_0 && // dataFormat="Layo" + pInfo->dataFormat[1] == ULAYOUT_FMT_1 && + pInfo->dataFormat[2] == ULAYOUT_FMT_2 && + pInfo->dataFormat[3] == ULAYOUT_FMT_3 && + pInfo->formatVersion[0] == 1)) { + udata_printError(ds, + "ulayout_swap(): data format %02x.%02x.%02x.%02x (format version %02x) " + "is not recognized as text layout properties data\n", + pInfo->dataFormat[0], pInfo->dataFormat[1], + pInfo->dataFormat[2], pInfo->dataFormat[3], + pInfo->formatVersion[0]); + *pErrorCode = U_UNSUPPORTED_ERROR; + return 0; + } + + const uint8_t *inBytes = (const uint8_t *)inData + headerSize; + uint8_t *outBytes = (uint8_t *)outData + headerSize; + + const int32_t *inIndexes = (const int32_t *)inBytes; + + if (length >= 0) { + length -= headerSize; + if (length < 12 * 4) { + udata_printError(ds, + "ulayout_swap(): too few bytes (%d after header) for text layout properties data\n", + length); + *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR; + return 0; + } + } + + int32_t indexesLength = udata_readInt32(ds, inIndexes[ULAYOUT_IX_INDEXES_LENGTH]); + if (indexesLength < 12) { + udata_printError(ds, + "ulayout_swap(): too few indexes (%d) for text layout properties data\n", + indexesLength); + *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR; + return 0; + } + + // Read the data offsets before swapping anything. + int32_t indexes[ULAYOUT_IX_TRIES_TOP + 1]; + for (int32_t i = ULAYOUT_IX_INPC_TRIE_TOP; i <= ULAYOUT_IX_TRIES_TOP; ++i) { + indexes[i] = udata_readInt32(ds, inIndexes[i]); + } + int32_t size = indexes[ULAYOUT_IX_TRIES_TOP]; + + if (length >= 0) { + if (length < size) { + udata_printError(ds, + "ulayout_swap(): too few bytes (%d after header) " + "for all of text layout properties data\n", + length); + *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR; + return 0; + } + + // Copy the data for inaccessible bytes. 
+ if (inBytes != outBytes) { + uprv_memcpy(outBytes, inBytes, size); + } + + // Swap the int32_t indexes[]. + int32_t offset = 0; + int32_t count = indexesLength * 4; + ds->swapArray32(ds, inBytes, count, outBytes, pErrorCode); + offset += count; + + // Swap each trie. + for (int32_t i = ULAYOUT_IX_INPC_TRIE_TOP; i <= ULAYOUT_IX_TRIES_TOP; ++i) { + int32_t top = indexes[i]; + count = top - offset; + U_ASSERT(count >= 0); + if (count >= 16) { + utrie_swapAnyVersion(ds, inBytes + offset, count, outBytes + offset, pErrorCode); + } + offset = top; + } + + U_ASSERT(offset == size); + } + + return headerSize + size; +} + +// Unicode emoji properties data swapping -------------------------------------- + +static int32_t U_CALLCONV +uemoji_swap(const UDataSwapper *ds, + const void *inData, int32_t length, void *outData, + UErrorCode *pErrorCode) { + // udata_swapDataHeader checks the arguments. + int32_t headerSize = udata_swapDataHeader(ds, inData, length, outData, pErrorCode); + if (pErrorCode == nullptr || U_FAILURE(*pErrorCode)) { + return 0; + } + + // Check data format and format version. + const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData + 4); + if (!( + pInfo->dataFormat[0] == u'E' && + pInfo->dataFormat[1] == u'm' && + pInfo->dataFormat[2] == u'o' && + pInfo->dataFormat[3] == u'j' && + pInfo->formatVersion[0] == 1)) { + udata_printError(ds, + "uemoji_swap(): data format %02x.%02x.%02x.%02x (format version %02x) " + "is not recognized as emoji properties data\n", + pInfo->dataFormat[0], pInfo->dataFormat[1], + pInfo->dataFormat[2], pInfo->dataFormat[3], + pInfo->formatVersion[0]); + *pErrorCode = U_UNSUPPORTED_ERROR; + return 0; + } + + const uint8_t *inBytes = (const uint8_t *)inData + headerSize; + uint8_t *outBytes = (uint8_t *)outData + headerSize; + + const int32_t *inIndexes = (const int32_t *)inBytes; + + if (length >= 0) { + length -= headerSize; + // We expect to read at least EmojiProps::IX_TOTAL_SIZE. 
+ if (length < 14 * 4) { + udata_printError(ds, + "uemoji_swap(): too few bytes (%d after header) for emoji properties data\n", + length); + *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR; + return 0; + } + } + + // First offset after indexes[]. + int32_t cpTrieOffset = udata_readInt32(ds, inIndexes[EmojiProps::IX_CPTRIE_OFFSET]); + int32_t indexesLength = cpTrieOffset / 4; + if (indexesLength < 14) { + udata_printError(ds, + "uemoji_swap(): too few indexes (%d) for emoji properties data\n", + indexesLength); + *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR; + return 0; + } + + // Read the data offsets before swapping anything. + int32_t indexes[EmojiProps::IX_TOTAL_SIZE + 1]; + indexes[0] = cpTrieOffset; + for (int32_t i = 1; i <= EmojiProps::IX_TOTAL_SIZE; ++i) { + indexes[i] = udata_readInt32(ds, inIndexes[i]); + } + int32_t size = indexes[EmojiProps::IX_TOTAL_SIZE]; + + if (length >= 0) { + if (length < size) { + udata_printError(ds, + "uemoji_swap(): too few bytes (%d after header) " + "for all of emoji properties data\n", + length); + *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR; + return 0; + } + + // Copy the data for inaccessible bytes. + if (inBytes != outBytes) { + uprv_memcpy(outBytes, inBytes, size); + } + + // Swap the int32_t indexes[]. + int32_t offset = 0; + int32_t top = cpTrieOffset; + ds->swapArray32(ds, inBytes, top - offset, outBytes, pErrorCode); + offset = top; + + // Swap the code point trie. + top = indexes[EmojiProps::IX_CPTRIE_OFFSET + 1]; + int32_t count = top - offset; + U_ASSERT(count >= 0); + if (count >= 16) { + utrie_swapAnyVersion(ds, inBytes + offset, count, outBytes + offset, pErrorCode); + } + offset = top; + + // Swap all of the string tries. + // They are all serialized as arrays of 16-bit units. 
+ offset = indexes[EmojiProps::IX_BASIC_EMOJI_TRIE_OFFSET]; + top = indexes[EmojiProps::IX_RGI_EMOJI_ZWJ_SEQUENCE_TRIE_OFFSET + 1]; + ds->swapArray16(ds, inBytes + offset, top - offset, outBytes + offset, pErrorCode); + offset = top; + + U_ASSERT(offset == size); + } + + return headerSize + size; +} + +/* Swap 'Test' data from gentest */ +static int32_t U_CALLCONV +test_swap(const UDataSwapper *ds, + const void *inData, int32_t length, void *outData, + UErrorCode *pErrorCode) { + const UDataInfo *pInfo; + int32_t headerSize; + + const uint8_t *inBytes; + uint8_t *outBytes; + + int32_t offset; + + /* udata_swapDataHeader checks the arguments */ + headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode); + if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) { + udata_printError(ds, "test_swap(): data header swap failed %s\n", pErrorCode != nullptr ? u_errorName(*pErrorCode) : "pErrorCode is nullptr"); + return 0; + } + + /* check data format and format version */ + pInfo=(const UDataInfo *)((const char *)inData+4); + if(!( + pInfo->dataFormat[0]==0x54 && /* dataFormat="Test" */ + pInfo->dataFormat[1]==0x65 && + pInfo->dataFormat[2]==0x73 && + pInfo->dataFormat[3]==0x74 && + pInfo->formatVersion[0]==1 + )) { + udata_printError(ds, "test_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as testdata\n", + pInfo->dataFormat[0], pInfo->dataFormat[1], + pInfo->dataFormat[2], pInfo->dataFormat[3], + pInfo->formatVersion[0]); + *pErrorCode=U_UNSUPPORTED_ERROR; + return 0; + } + + inBytes=(const uint8_t *)inData+headerSize; + outBytes=(uint8_t *)outData+headerSize; + + int32_t size16 = 2; // 16bit plus padding + int32_t sizeStr = 5; // 4 char inv-str plus null + int32_t size = size16 + sizeStr; + + if(length>=0) { + if((length-headerSize)<size) { /* length still includes the header; compare the payload only */ + udata_printError(ds, "test_swap(): too few bytes (%d after header, wanted %d) for all of testdata\n", + length-headerSize, size); + *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; + return 0; + } + + offset =0; + /* swap
a 1 entry array */ + ds->swapArray16(ds, inBytes+offset, size16, outBytes+offset, pErrorCode); + offset+=size16; + ds->swapInvChars(ds, inBytes+offset, sizeStr, outBytes+offset, pErrorCode); + } + + return headerSize+size; +} + +/* swap any data (except a .dat package) ------------------------------------ */ + +static const struct { + uint8_t dataFormat[4]; + UDataSwapFn *swapFn; +} swapFns[]={ + { { 0x52, 0x65, 0x73, 0x42 }, ures_swap }, /* dataFormat="ResB" */ +#if !UCONFIG_NO_LEGACY_CONVERSION + { { 0x63, 0x6e, 0x76, 0x74 }, ucnv_swap }, /* dataFormat="cnvt" */ +#endif +#if !UCONFIG_NO_CONVERSION + { { 0x43, 0x76, 0x41, 0x6c }, ucnv_swapAliases }, /* dataFormat="CvAl" */ +#endif +#if !UCONFIG_NO_IDNA + { { 0x53, 0x50, 0x52, 0x50 }, usprep_swap }, /* dataFormat="SPRP" */ +#endif + /* insert data formats here, descending by expected frequency of occurrence */ + { { 0x55, 0x50, 0x72, 0x6f }, uprops_swap }, /* dataFormat="UPro" */ + + { { UCASE_FMT_0, UCASE_FMT_1, UCASE_FMT_2, UCASE_FMT_3 }, + ucase_swap }, /* dataFormat="cAsE" */ + + { { UBIDI_FMT_0, UBIDI_FMT_1, UBIDI_FMT_2, UBIDI_FMT_3 }, + ubidi_swap }, /* dataFormat="BiDi" */ + +#if !UCONFIG_NO_NORMALIZATION + { { 0x4e, 0x6f, 0x72, 0x6d }, unorm_swap }, /* dataFormat="Norm" */ + { { 0x4e, 0x72, 0x6d, 0x32 }, unorm2_swap }, /* dataFormat="Nrm2" */ +#endif + + { { ULAYOUT_FMT_0, ULAYOUT_FMT_1, ULAYOUT_FMT_2, ULAYOUT_FMT_3 }, + ulayout_swap }, // dataFormat="Layo" + + { { u'E', u'm', u'o', u'j' }, uemoji_swap }, + +#if !UCONFIG_NO_COLLATION + { { 0x55, 0x43, 0x6f, 0x6c }, ucol_swap }, /* dataFormat="UCol" */ + { { 0x49, 0x6e, 0x76, 0x43 }, ucol_swapInverseUCA },/* dataFormat="InvC" */ +#endif +#if !UCONFIG_NO_BREAK_ITERATION + { { 0x42, 0x72, 0x6b, 0x20 }, ubrk_swap }, /* dataFormat="Brk " */ + { { 0x44, 0x69, 0x63, 0x74 }, udict_swap }, /* dataFormat="Dict" */ +#endif + { { 0x70, 0x6e, 0x61, 0x6d }, upname_swap }, /* dataFormat="pnam" */ + { { 0x75, 0x6e, 0x61, 0x6d }, uchar_swapNames }, /* dataFormat="unam" */ 
+#if !UCONFIG_NO_NORMALIZATION + { { 0x43, 0x66, 0x75, 0x20 }, uspoof_swap }, /* dataFormat="Cfu " */ +#endif + { { 0x54, 0x65, 0x73, 0x74 }, test_swap } /* dataFormat="Test" */ +}; + +U_CAPI int32_t U_EXPORT2 +udata_swap(const UDataSwapper *ds, + const void *inData, int32_t length, void *outData, + UErrorCode *pErrorCode) { + char dataFormatChars[4]; + const UDataInfo *pInfo; + int32_t i, swappedLength; + + if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) { + return 0; + } + + /* + * Preflight the header first; checks for illegal arguments, too. + * Do not swap the header right away because the format-specific swapper + * will swap it, get the headerSize again, and also use the header + * information. Otherwise we would have to pass some of the information + * and not be able to use the UDataSwapFn signature. + */ + udata_swapDataHeader(ds, inData, -1, nullptr, pErrorCode); + + /* + * If we wanted udata_swap() to also handle non-loadable data like a UTrie, + * then we could check here for further known magic values and structures. 
+ */ + if(U_FAILURE(*pErrorCode)) { + return 0; /* the data format was not recognized */ + } + + pInfo=(const UDataInfo *)((const char *)inData+4); + + { + /* convert the data format from ASCII to Unicode to the system charset */ + char16_t u[4]={ + pInfo->dataFormat[0], pInfo->dataFormat[1], + pInfo->dataFormat[2], pInfo->dataFormat[3] + }; + + if(uprv_isInvariantUString(u, 4)) { + u_UCharsToChars(u, dataFormatChars, 4); + } else { + dataFormatChars[0]=dataFormatChars[1]=dataFormatChars[2]=dataFormatChars[3]='?'; + } + } + + /* dispatch to the swap function for the dataFormat */ + for(i=0; i<UPRV_LENGTHOF(swapFns); ++i) { + if(0==memcmp(swapFns[i].dataFormat, pInfo->dataFormat, 4)) { + swappedLength=swapFns[i].swapFn(ds, inData, length, outData, pErrorCode); + + if(U_FAILURE(*pErrorCode)) { + udata_printError(ds, "udata_swap(): failure swapping data format %02x.%02x.%02x.%02x (\"%c%c%c%c\") - %s\n", + pInfo->dataFormat[0], pInfo->dataFormat[1], + pInfo->dataFormat[2], pInfo->dataFormat[3], + dataFormatChars[0], dataFormatChars[1], + dataFormatChars[2], dataFormatChars[3], + u_errorName(*pErrorCode)); + } else if(swappedLength<(length-15)) { + /* swapped less than expected */ + udata_printError(ds, "udata_swap() warning: swapped only %d out of %d bytes - data format %02x.%02x.%02x.%02x (\"%c%c%c%c\")\n", + swappedLength, length, + pInfo->dataFormat[0], pInfo->dataFormat[1], + pInfo->dataFormat[2], pInfo->dataFormat[3], + dataFormatChars[0], dataFormatChars[1], + dataFormatChars[2], dataFormatChars[3], + u_errorName(*pErrorCode)); + } + + return swappedLength; + } + } + + /* the dataFormat was not recognized */ + udata_printError(ds, "udata_swap(): unknown data format %02x.%02x.%02x.%02x (\"%c%c%c%c\")\n", + pInfo->dataFormat[0], pInfo->dataFormat[1], + pInfo->dataFormat[2], pInfo->dataFormat[3], + dataFormatChars[0], dataFormatChars[1], + dataFormatChars[2], dataFormatChars[3]); + + *pErrorCode=U_UNSUPPORTED_ERROR; + return 0; +} diff --git 
a/intl/icu/source/tools/toolutil/swapimpl.h b/intl/icu/source/tools/toolutil/swapimpl.h new file mode 100644 index 0000000000..8c6474f662 --- /dev/null +++ b/intl/icu/source/tools/toolutil/swapimpl.h @@ -0,0 +1,45 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2005, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: swapimpl.h +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2005jul29 +* created by: Markus W. Scherer +* +* Declarations for data file swapping functions not declared in internal +* library headers. +*/ + +#ifndef __SWAPIMPL_H__ +#define __SWAPIMPL_H__ + +#include "unicode/utypes.h" +#include "udataswp.h" + +/** + * Identifies and then transforms the ICU data piece in-place, or determines + * its length. See UDataSwapFn. + * This function handles single data pieces (but not .dat data packages) + * and internally dispatches to per-type swap functions. + * Sets a U_UNSUPPORTED_ERROR if the data format is not recognized. + * + * @see UDataSwapFn + * @see udata_openSwapper + * @see udata_openSwapperForInputData + * @internal ICU 2.8 + */ +U_CAPI int32_t U_EXPORT2 +udata_swap(const UDataSwapper *ds, + const void *inData, int32_t length, void *outData, + UErrorCode *pErrorCode); + +#endif diff --git a/intl/icu/source/tools/toolutil/toolutil.cpp b/intl/icu/source/tools/toolutil/toolutil.cpp new file mode 100644 index 0000000000..7e7bdc78a1 --- /dev/null +++ b/intl/icu/source/tools/toolutil/toolutil.cpp @@ -0,0 +1,381 @@ +// © 2016 and later: Unicode, Inc. and others. 
+// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 1999-2014, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: toolutil.c +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 1999nov19 +* created by: Markus W. Scherer +* +* 6/25/08 - Added Cygwin specific code in uprv_mkdir - Brian Rower +* +* This file contains utility functions for ICU tools like genccode. +*/ + +#include "unicode/platform.h" +#if U_PLATFORM == U_PF_MINGW +// *cough* - for struct stat +#ifdef __STRICT_ANSI__ +#undef __STRICT_ANSI__ +#endif +#endif + +#include <stdio.h> +#include <sys/stat.h> +#include <fstream> +#include <time.h> +#include "unicode/utypes.h" + +#ifndef U_TOOLUTIL_IMPLEMENTATION +#error U_TOOLUTIL_IMPLEMENTATION not set - must be set for all ICU source files in common/ - see https://unicode-org.github.io/icu/userguide/howtouseicu +#endif + +#if U_PLATFORM_USES_ONLY_WIN32_API +# define VC_EXTRALEAN +# define WIN32_LEAN_AND_MEAN +# define NOUSER +# define NOSERVICE +# define NOIME +# define NOMCX +# if U_PLATFORM == U_PF_MINGW +# define __NO_MINGW_LFS /* gets around missing 'off64_t' */ +# endif +# include <windows.h> +# include <direct.h> +#else +# include <sys/stat.h> +# include <sys/types.h> +#endif + +/* In MinGW environment, io.h needs to be included for _mkdir() */ +#if U_PLATFORM == U_PF_MINGW +#include <io.h> +#endif + +#include <errno.h> + +#include <cstddef> + +#include "unicode/errorcode.h" +#include "unicode/putil.h" +#include "cmemory.h" +#include "cstring.h" +#include "toolutil.h" + +U_NAMESPACE_BEGIN + +IcuToolErrorCode::~IcuToolErrorCode() { + // Safe because our handleFailure() does not throw exceptions. 
+ if(isFailure()) { handleFailure(); } +} + +void IcuToolErrorCode::handleFailure() const { + fprintf(stderr, "error at %s: %s\n", location, errorName()); + exit(errorCode); +} + +U_NAMESPACE_END + +static int32_t currentYear = -1; + +U_CAPI int32_t U_EXPORT2 getCurrentYear() { + if(currentYear == -1) { + time_t now = time(nullptr); + tm *fields = gmtime(&now); + currentYear = 1900 + fields->tm_year; + } + return currentYear; +} + + +U_CAPI const char * U_EXPORT2 +getLongPathname(const char *pathname) { +#if U_PLATFORM_USES_ONLY_WIN32_API + /* anticipate problems with "short" pathnames */ + static WIN32_FIND_DATAA info; + HANDLE file=FindFirstFileA(pathname, &info); + if(file!=INVALID_HANDLE_VALUE) { + if(info.cAlternateFileName[0]!=0) { + /* this file has a short name, get and use the long one */ + const char *basename=findBasename(pathname); + if(basename!=pathname) { + /* prepend the long filename with the original path */ + uprv_memmove(info.cFileName+(basename-pathname), info.cFileName, uprv_strlen(info.cFileName)+1); + uprv_memcpy(info.cFileName, pathname, basename-pathname); + } + pathname=info.cFileName; + } + FindClose(file); + } +#endif + return pathname; +} + +U_CAPI const char * U_EXPORT2 +findDirname(const char *path, char *buffer, int32_t bufLen, UErrorCode* status) { + if(U_FAILURE(*status)) return nullptr; + const char *resultPtr = nullptr; + int32_t resultLen = 0; + + const char *basename=uprv_strrchr(path, U_FILE_SEP_CHAR); +#if U_FILE_ALT_SEP_CHAR!=U_FILE_SEP_CHAR + const char *basenameAlt=uprv_strrchr(path, U_FILE_ALT_SEP_CHAR); + if(basenameAlt && (!basename || basename<basenameAlt)) { + basename = basenameAlt; + } +#endif + if(!basename) { + /* no basename - return ''. 
*/ + resultPtr = ""; + resultLen = 0; + } else { + resultPtr = path; + resultLen = static_cast<int32_t>(basename - path); + if(resultLen<1) { + resultLen = 1; /* '/' or '/a' -> '/' */ + } + } + + if((resultLen+1) <= bufLen) { + uprv_strncpy(buffer, resultPtr, resultLen); + buffer[resultLen]=0; + return buffer; + } else { + *status = U_BUFFER_OVERFLOW_ERROR; + return nullptr; + } +} + +U_CAPI const char * U_EXPORT2 +findBasename(const char *filename) { + const char *basename=uprv_strrchr(filename, U_FILE_SEP_CHAR); + +#if U_FILE_ALT_SEP_CHAR!=U_FILE_SEP_CHAR + //be lenient about pathname separators on Windows, like official implementation of C++17 std::filesystem in MSVC + //would be convenient to merge this loop with the one above, but alas, there is no such solution in the standard library + const char *alt_basename=uprv_strrchr(filename, U_FILE_ALT_SEP_CHAR); + if(alt_basename>basename) { + basename=alt_basename; + } +#endif + + if(basename!=nullptr) { + return basename+1; + } else { + return filename; + } +} + +U_CAPI void U_EXPORT2 +uprv_mkdir(const char *pathname, UErrorCode *status) { + + int retVal = 0; +#if U_PLATFORM_USES_ONLY_WIN32_API + retVal = _mkdir(pathname); +#else + retVal = mkdir(pathname, S_IRWXU | (S_IROTH | S_IXOTH) | (S_IROTH | S_IXOTH)); +#endif + if (retVal && errno != EEXIST) { +#if U_PF_MINGW <= U_PLATFORM && U_PLATFORM <= U_PF_CYGWIN + /*if using Cygwin and the mkdir says it failed...check if the directory already exists..*/ + /* if it does...don't give the error, if it does not...give the error - Brian Rower - 6/25/08 */ + struct stat st; + + if(stat(pathname,&st) != 0) + { + *status = U_FILE_ACCESS_ERROR; + } +#else + *status = U_FILE_ACCESS_ERROR; +#endif + } +} + +#if !UCONFIG_NO_FILE_IO +U_CAPI UBool U_EXPORT2 +uprv_fileExists(const char *file) { + struct stat stat_buf; + if (stat(file, &stat_buf) == 0) { + return true; + } else { + return false; + } +} +#endif + +U_CAPI int32_t U_EXPORT2 +uprv_compareGoldenFiles( + const char* 
buffer, int32_t bufferLen, + const char* goldenFilePath, + bool overwrite) { + + if (overwrite) { + std::ofstream ofs; + ofs.open(goldenFilePath); + ofs.write(buffer, bufferLen); + ofs.close(); + return -1; + } + + std::ifstream ifs(goldenFilePath, std::ifstream::in); + int32_t pos = 0; + char c; + while (ifs.get(c) && pos < bufferLen) { + if (c != buffer[pos]) { + // Files differ at this position + break; + } + pos++; + } + if (pos == bufferLen && ifs.eof()) { + // Files are same lengths + pos = -1; + } + ifs.close(); + return pos; +} + +/*U_CAPI UDate U_EXPORT2 +uprv_getModificationDate(const char *pathname, UErrorCode *status) +{ + if(U_FAILURE(*status)) { + return; + } + // TODO: handle case where stat is not available + struct stat st; + + if(stat(pathname,&st) != 0) + { + *status = U_FILE_ACCESS_ERROR; + } else { + return st.st_mtime; + } +} +*/ + +/* tool memory helper ------------------------------------------------------- */ + +struct UToolMemory { + char name[64]; + int32_t capacity, maxCapacity, size, idx; + void *array; + alignas(std::max_align_t) char staticArray[1]; +}; + +U_CAPI UToolMemory * U_EXPORT2 +utm_open(const char *name, int32_t initialCapacity, int32_t maxCapacity, int32_t size) { + UToolMemory *mem; + + if(maxCapacity<initialCapacity) { + maxCapacity=initialCapacity; + } + + mem=(UToolMemory *)uprv_malloc(sizeof(UToolMemory)+initialCapacity*size); + if(mem==nullptr) { + fprintf(stderr, "error: %s - out of memory\n", name); + exit(U_MEMORY_ALLOCATION_ERROR); + } + mem->array=mem->staticArray; + + uprv_strcpy(mem->name, name); + mem->capacity=initialCapacity; + mem->maxCapacity=maxCapacity; + mem->size=size; + mem->idx=0; + return mem; +} + +U_CAPI void U_EXPORT2 +utm_close(UToolMemory *mem) { + if(mem!=nullptr) { + if(mem->array!=mem->staticArray) { + uprv_free(mem->array); + } + uprv_free(mem); + } +} + + +U_CAPI void * U_EXPORT2 +utm_getStart(UToolMemory *mem) { + return (char *)mem->array; +} + +U_CAPI int32_t U_EXPORT2 
+utm_countItems(UToolMemory *mem) { + return mem->idx; +} + + +static UBool +utm_hasCapacity(UToolMemory *mem, int32_t capacity) { + if(mem->capacity<capacity) { + int32_t newCapacity; + + if(mem->maxCapacity<capacity) { + fprintf(stderr, "error: %s - trying to use more than maxCapacity=%ld units\n", + mem->name, (long)mem->maxCapacity); + exit(U_MEMORY_ALLOCATION_ERROR); + } + + /* try to allocate a larger array */ + if(capacity>=2*mem->capacity) { + newCapacity=capacity; + } else if(mem->capacity<=mem->maxCapacity/3) { + newCapacity=2*mem->capacity; + } else { + newCapacity=mem->maxCapacity; + } + + if(mem->array==mem->staticArray) { + mem->array=uprv_malloc(newCapacity*mem->size); + if(mem->array!=nullptr) { + uprv_memcpy(mem->array, mem->staticArray, (size_t)mem->idx*mem->size); + } + } else { + mem->array=uprv_realloc(mem->array, newCapacity*mem->size); + } + + if(mem->array==nullptr) { + fprintf(stderr, "error: %s - out of memory\n", mem->name); + exit(U_MEMORY_ALLOCATION_ERROR); + } + mem->capacity=newCapacity; + } + + return true; +} + +U_CAPI void * U_EXPORT2 +utm_alloc(UToolMemory *mem) { + char *p=nullptr; + int32_t oldIndex=mem->idx; + int32_t newIndex=oldIndex+1; + if(utm_hasCapacity(mem, newIndex)) { + p=(char *)mem->array+oldIndex*mem->size; + mem->idx=newIndex; + uprv_memset(p, 0, mem->size); + } + return p; +} + +U_CAPI void * U_EXPORT2 +utm_allocN(UToolMemory *mem, int32_t n) { + char *p=nullptr; + int32_t oldIndex=mem->idx; + int32_t newIndex=oldIndex+n; + if(utm_hasCapacity(mem, newIndex)) { + p=(char *)mem->array+oldIndex*mem->size; + mem->idx=newIndex; + uprv_memset(p, 0, n*mem->size); + } + return p; +} diff --git a/intl/icu/source/tools/toolutil/toolutil.h b/intl/icu/source/tools/toolutil/toolutil.h new file mode 100644 index 0000000000..b32a0b8762 --- /dev/null +++ b/intl/icu/source/tools/toolutil/toolutil.h @@ -0,0 +1,201 @@ +// © 2016 and later: Unicode, Inc. and others. 
+// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 1999-2013, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: toolutil.h +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 1999nov19 +* created by: Markus W. Scherer +* +* This file defines utility functions for ICU tools like genccode. +*/ + +#ifndef __TOOLUTIL_H__ +#define __TOOLUTIL_H__ + +#include "unicode/utypes.h" + +#ifdef __cplusplus + +#include "unicode/errorcode.h" + +U_NAMESPACE_BEGIN + +/** + * ErrorCode subclass for use in ICU command-line tools. + * The destructor calls handleFailure() which calls exit(errorCode) when isFailure(). + */ +class U_TOOLUTIL_API IcuToolErrorCode : public ErrorCode { +public: + /** + * @param loc A short string describing where the IcuToolErrorCode is used. + */ + IcuToolErrorCode(const char *loc) : location(loc) {} + virtual ~IcuToolErrorCode(); +protected: + virtual void handleFailure() const override; +private: + const char *location; +}; + +U_NAMESPACE_END + +#endif + +/* + * For Windows, a path/filename may be the short (8.3) version + * of the "real", long one. In this case, the short one + * is abbreviated and contains a tilde etc. + * This function returns a pointer to the original pathname + * if it is the "real" one itself, and a pointer to a static + * buffer (not thread-safe) containing the long version + * if the pathname is indeed abbreviated. + * + * On platforms other than Windows, this function always returns + * the input pathname pointer. + * + * This function is especially useful in tools that are called + * by a batch file for loop, which yields short pathnames on Win9x. 
+ */ +U_CAPI const char * U_EXPORT2 +getLongPathname(const char *pathname); + +/** + * Find the basename at the end of a pathname, i.e., the part + * after the last file separator, and return a pointer + * to this part of the pathname. + * If the pathname only contains a basename and no file separator, + * then the pathname pointer itself is returned. + **/ +U_CAPI const char * U_EXPORT2 +findBasename(const char *filename); + +/** + * Find the directory name of a pathname, that is, everything + * up to but not including the last file separator. + * + * If successful, copies the directory name into the output buffer along with + * a terminating NULL. + * + * If there isn't a directory name in the path, it returns an empty string. + * @param path the full pathname to inspect. + * @param buffer the output buffer + * @param bufLen the output buffer length + * @param status error code- may return U_BUFFER_OVERFLOW_ERROR if bufLen is too small. + * @return If successful, a pointer to the output buffer. If failure or bufLen is too small, NULL. + **/ +U_CAPI const char * U_EXPORT2 +findDirname(const char *path, char *buffer, int32_t bufLen, UErrorCode* status); + +/* + * Return the current year in the Gregorian calendar. Used for copyright generation. + */ +U_CAPI int32_t U_EXPORT2 +getCurrentYear(); + +/* + * Creates a directory with pathname. + * + * @param status Set to an error code when mkdir failed. + */ +U_CAPI void U_EXPORT2 +uprv_mkdir(const char *pathname, UErrorCode *status); + +#if !UCONFIG_NO_FILE_IO +/** + * Return true if the named item exists + * @param file filename + * @return true if named item (file, dir, etc) exists, false otherwise + */ +U_CAPI UBool U_EXPORT2 +uprv_fileExists(const char *file); +#endif + +/** + * Performs a golden data test. Asserts that the contents of the buffer is equal + * to the data in goldenFilePath. + * + * Pass the value of the -G flag to "overwrite"; if true, new goldens will be + * written to the filesystem. 
+ * + * @return The first index at which the files differ, or -1 if they are the same. + */ +U_CAPI int32_t U_EXPORT2 +uprv_compareGoldenFiles( + const char* buffer, int32_t bufferLen, + const char* goldenFilePath, + bool overwrite); + +/** + * Return the modification date for the specified file or directory. + * Return value is undefined if there was an error. + */ +/*U_CAPI UDate U_EXPORT2 +uprv_getModificationDate(const char *pathname, UErrorCode *status); +*/ +/* + * Returns the modification + * + * @param status Set to an error code when mkdir failed. + */ + +/* + * UToolMemory is used for generic, custom memory management. + * It is allocated with enough space for count*size bytes starting + * at array. + * The array is declared with a union of large data types so + * that its base address is aligned for any types. + * If size is a multiple of a data type size, then such items + * can be safely allocated inside the array, at offsets that + * are themselves multiples of size. + */ +struct UToolMemory; +typedef struct UToolMemory UToolMemory; + +/** + * Open a UToolMemory object for allocation of initialCapacity to maxCapacity + * items with size bytes each. + */ +U_CAPI UToolMemory * U_EXPORT2 +utm_open(const char *name, int32_t initialCapacity, int32_t maxCapacity, int32_t size); + +/** + * Close a UToolMemory object. + */ +U_CAPI void U_EXPORT2 +utm_close(UToolMemory *mem); + +/** + * Get the pointer to the beginning of the array of items. + * The pointer becomes invalid after allocation of new items. + */ +U_CAPI void * U_EXPORT2 +utm_getStart(UToolMemory *mem); + +/** + * Get the current number of items. + */ +U_CAPI int32_t U_EXPORT2 +utm_countItems(UToolMemory *mem); + +/** + * Allocate one more item and return the pointer to its start in the array. + */ +U_CAPI void * U_EXPORT2 +utm_alloc(UToolMemory *mem); + +/** + * Allocate n items and return the pointer to the start of the first one in the array. 
+ */ +U_CAPI void * U_EXPORT2 +utm_allocN(UToolMemory *mem, int32_t n); + +#endif diff --git a/intl/icu/source/tools/toolutil/toolutil.vcxproj b/intl/icu/source/tools/toolutil/toolutil.vcxproj new file mode 100644 index 0000000000..0995ef06f7 --- /dev/null +++ b/intl/icu/source/tools/toolutil/toolutil.vcxproj @@ -0,0 +1,272 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <PropertyGroup Label="Globals"> + <ProjectGuid>{6B231032-3CB5-4EED-9210-810D666A23A0}</ProjectGuid> + </PropertyGroup> + <PropertyGroup Label="Configuration"> + <ConfigurationType>DynamicLibrary</ConfigurationType> + <UseOfMfc>false</UseOfMfc> + <CharacterSet>MultiByte</CharacterSet> + </PropertyGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> + <!-- The following import will include the 'default' configuration options for VS projects. --> + <Import Project="..\..\allinone\Build.Windows.ProjectConfiguration.props" /> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> + <ImportGroup Label="ExtensionSettings"> + </ImportGroup> + <PropertyGroup Label="UserMacros" /> + <PropertyGroup> + <_ProjectFileVersion>10.0.30319.1</_ProjectFileVersion> + <OutDir>.\$(Platform)\$(Configuration)\</OutDir> + <IntDir>.\$(Platform)\$(Configuration)\</IntDir> + <!-- The ICU projects use "Win32" to mean "x86", so we need to special case it. 
--> + <OutDir Condition="'$(Platform)'=='Win32'">.\x86\$(Configuration)\</OutDir> + <IntDir Condition="'$(Platform)'=='Win32'">.\x86\$(Configuration)\</IntDir> + <!-- Disable Incremental Linking for Release builds as it prevents Link-time Code Generation --> + <LinkIncremental Condition="'$(Configuration)'=='Debug'">true</LinkIncremental> + <LinkIncremental Condition="'$(Configuration)'=='Release'">false</LinkIncremental> + </PropertyGroup> + <!-- Options that are common to *all* project configurations --> + <ItemDefinitionGroup> + <ClCompile> + <AdditionalIncludeDirectories>..\..\..\include;..\..\common;..\..\i18n;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> + <PreprocessorDefinitions>U_TOOLUTIL_IMPLEMENTATION;%(PreprocessorDefinitions)</PreprocessorDefinitions> + <DisableLanguageExtensions>false</DisableLanguageExtensions> + <WarningLevel>Level3</WarningLevel> + <CompileAs>Default</CompileAs> + </ClCompile> + </ItemDefinitionGroup> + <!-- Options that are common to all 'Debug' project configurations --> + <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'"> + <ClCompile> + <BrowseInformation>true</BrowseInformation> + <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary> + <DebugInformationFormat>EditAndContinue</DebugInformationFormat> + </ClCompile> + <Link> + <AdditionalDependencies>icuucd.lib;icuind.lib;%(AdditionalDependencies)</AdditionalDependencies> + </Link> + </ItemDefinitionGroup> + <!-- Options that are common to all 'Release' project configurations --> + <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'"> + <ClCompile> + <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary> + <FunctionLevelLinking>true</FunctionLevelLinking> + </ClCompile> + <Link> + <AdditionalDependencies>icuuc.lib;icuin.lib;%(AdditionalDependencies)</AdditionalDependencies> + </Link> + </ItemDefinitionGroup> + <!-- Options that are common to all 'Win32' project configurations --> + <ItemDefinitionGroup Condition="'$(Platform)'=='Win32'"> + 
<ClCompile> + <PrecompiledHeaderOutputFile>.\x86\$(Configuration)/toolutil.pch</PrecompiledHeaderOutputFile> + <AssemblerListingLocation>.\x86\$(Configuration)/</AssemblerListingLocation> + <ObjectFileName>.\x86\$(Configuration)/</ObjectFileName> + <ProgramDataBaseFileName>.\x86\$(Configuration)/</ProgramDataBaseFileName> + </ClCompile> + <Link> + <AdditionalLibraryDirectories>..\..\..\lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories> + </Link> + </ItemDefinitionGroup> + <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'"> + <Midl> + <TypeLibraryName>.\..\..\..\lib\icutu.tlb</TypeLibraryName> + </Midl> + <Link> + <OutputFile>..\..\..\bin\icutu$(IcuMajorVersion).dll</OutputFile> + <ProgramDatabaseFile>.\..\..\..\lib\icutu.pdb</ProgramDatabaseFile> + <DataExecutionPrevention> + </DataExecutionPrevention> + <ImportLibrary>..\..\..\lib\icutu.lib</ImportLibrary> + </Link> + </ItemDefinitionGroup> + <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> + <Midl> + <TypeLibraryName>.\..\..\..\lib\icutud.tlb</TypeLibraryName> + </Midl> + <Link> + <OutputFile>..\..\..\bin\icutu$(IcuMajorVersion)d.dll</OutputFile> + <ProgramDatabaseFile>.\..\..\..\lib\icutud.pdb</ProgramDatabaseFile> + <DataExecutionPrevention> + </DataExecutionPrevention> + <ImportLibrary>..\..\..\lib\icutud.lib</ImportLibrary> + </Link> + </ItemDefinitionGroup> + <!-- Options that are common to all 'x64' project configurations --> + <ItemDefinitionGroup Condition="'$(Platform)'=='x64'"> + <ClCompile> + <PrecompiledHeaderOutputFile>.\x64\$(Configuration)/toolutil.pch</PrecompiledHeaderOutputFile> + <AssemblerListingLocation>.\x64\$(Configuration)/</AssemblerListingLocation> + <ObjectFileName>.\x64\$(Configuration)/</ObjectFileName> + <ProgramDataBaseFileName>.\x64\$(Configuration)/</ProgramDataBaseFileName> + </ClCompile> + <Link> + 
<AdditionalLibraryDirectories>..\..\..\lib64;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories> + </Link> + </ItemDefinitionGroup> + <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> + <Midl> + <TypeLibraryName>.\..\..\..\lib64\icutu.tlb</TypeLibraryName> + </Midl> + <ClCompile> + <WholeProgramOptimization>true</WholeProgramOptimization> + </ClCompile> + <Link> + <OutputFile>..\..\..\bin64\icutu$(IcuMajorVersion).dll</OutputFile> + <ProgramDatabaseFile>.\..\..\..\lib64\icutu.pdb</ProgramDatabaseFile> + <ImportLibrary>..\..\..\lib64\icutu.lib</ImportLibrary> + </Link> + </ItemDefinitionGroup> + <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> + <Midl> + <TypeLibraryName>.\..\..\..\lib64\icutud.tlb</TypeLibraryName> + </Midl> + <Link> + <OutputFile>..\..\..\bin64\icutu$(IcuMajorVersion)d.dll</OutputFile> + <ProgramDatabaseFile>.\..\..\..\lib64\icutud.pdb</ProgramDatabaseFile> + <ImportLibrary>..\..\..\lib64\icutud.lib</ImportLibrary> + </Link> + </ItemDefinitionGroup> + <ItemDefinitionGroup Condition="'$(Platform)'=='ARM'"> + <ClCompile> + <PrecompiledHeaderOutputFile>.\ARM\$(Configuration)/toolutil.pch</PrecompiledHeaderOutputFile> + <AssemblerListingLocation>.\ARM\$(Configuration)/</AssemblerListingLocation> + <ObjectFileName>.\ARM\$(Configuration)/</ObjectFileName> + <ProgramDataBaseFileName>.\ARM\$(Configuration)/</ProgramDataBaseFileName> + </ClCompile> + <Link> + <AdditionalLibraryDirectories>.\..\..\..\libARM;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories> + </Link> + </ItemDefinitionGroup> + <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|ARM'"> + <Midl> + <TypeLibraryName>..\..\..\libARM\icutu.tlb</TypeLibraryName> + </Midl> + <Link> + <OutputFile>..\..\..\binARM\icutu$(IcuMajorVersion).dll</OutputFile> + <ProgramDatabaseFile>.\..\..\..\libARM\icutu.pdb</ProgramDatabaseFile> + <ImportLibrary>..\..\..\libARM\icutu.lib</ImportLibrary> + </Link> + 
</ItemDefinitionGroup> + <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|ARM'"> + <Midl> + <TypeLibraryName>.\..\..\..\libARM\icutud.tlb</TypeLibraryName> + </Midl> + <ClCompile> + <DebugInformationFormat>ProgramDatabase</DebugInformationFormat> + </ClCompile> + <Link> + <OutputFile>..\..\..\binARM\icutu$(IcuMajorVersion)d.dll</OutputFile> + <ProgramDatabaseFile>.\..\..\..\libARM\icutud.pdb</ProgramDatabaseFile> + <ImportLibrary>..\..\..\libARM\icutud.lib</ImportLibrary> + </Link> + </ItemDefinitionGroup> + <ItemDefinitionGroup Condition="'$(Platform)'=='ARM64'"> + <ClCompile> + <PrecompiledHeaderOutputFile>.\ARM64\$(Configuration)/toolutil.pch</PrecompiledHeaderOutputFile> + <AssemblerListingLocation>.\ARM64\$(Configuration)/</AssemblerListingLocation> + <ObjectFileName>.\ARM64\$(Configuration)/</ObjectFileName> + <ProgramDataBaseFileName>.\ARM64\$(Configuration)/</ProgramDataBaseFileName> + </ClCompile> + <Link> + <AdditionalLibraryDirectories>.\..\..\..\libARM64;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories> + </Link> + </ItemDefinitionGroup> + <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|ARM64'"> + <Midl> + <TypeLibraryName>.\..\..\..\libARM64\icutu.tlb</TypeLibraryName> + </Midl> + <Link> + <OutputFile>..\..\..\binARM64\icutu$(IcuMajorVersion).dll</OutputFile> + <ProgramDatabaseFile>.\..\..\..\libARM64\icutu.pdb</ProgramDatabaseFile> + <ImportLibrary>..\..\..\libARM64\icutu.lib</ImportLibrary> + </Link> + </ItemDefinitionGroup> + <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|ARM64'"> + <Midl> + <TypeLibraryName>.\..\..\..\libARM64\icutud.tlb</TypeLibraryName> + </Midl> + <ClCompile> + <DebugInformationFormat>ProgramDatabase</DebugInformationFormat> + </ClCompile> + <Link> + <OutputFile>..\..\..\binARM64\icutu$(IcuMajorVersion)d.dll</OutputFile> + <ProgramDatabaseFile>.\..\..\..\libARM64\icutud.pdb</ProgramDatabaseFile> + 
<ImportLibrary>..\..\..\libARM64\icutud.lib</ImportLibrary> + </Link> + </ItemDefinitionGroup> + <ItemGroup> + <ClCompile Include="collationinfo.cpp"> + <DisableLanguageExtensions>false</DisableLanguageExtensions> + </ClCompile> + <ClCompile Include="denseranges.cpp" /> + <ClCompile Include="filestrm.cpp" /> + <ClCompile Include="filetools.cpp" /> + <ClCompile Include="flagparser.cpp" /> + <ClCompile Include="package.cpp" /> + <ClCompile Include="pkg_genc.cpp"> + <DisableLanguageExtensions>false</DisableLanguageExtensions> + </ClCompile> + <ClCompile Include="pkg_gencmn.cpp"> + <DisableLanguageExtensions>false</DisableLanguageExtensions> + </ClCompile> + <ClCompile Include="pkg_icu.cpp" /> + <ClCompile Include="pkgitems.cpp" /> + <ClCompile Include="ppucd.cpp"> + <DisableLanguageExtensions>false</DisableLanguageExtensions> + </ClCompile> + <ClCompile Include="swapimpl.cpp"> + <DisableLanguageExtensions>false</DisableLanguageExtensions> + </ClCompile> + <ClCompile Include="toolutil.cpp"> + <DisableLanguageExtensions>false</DisableLanguageExtensions> + </ClCompile> + <ClCompile Include="ucbuf.cpp" /> + <ClCompile Include="ucm.cpp" /> + <ClCompile Include="ucmstate.cpp" /> + <ClCompile Include="unewdata.cpp" /> + <ClCompile Include="uoptions.cpp" /> + <ClCompile Include="uparse.cpp" /> + <ClCompile Include="writesrc.cpp" /> + <ClCompile Include="xmlparser.cpp"> + <DisableLanguageExtensions>false</DisableLanguageExtensions> + </ClCompile> + <ClCompile Include="dbgutil.cpp"> + <DisableLanguageExtensions>false</DisableLanguageExtensions> + </ClCompile> + <ClCompile Include="udbgutil.cpp"> + <DisableLanguageExtensions>false</DisableLanguageExtensions> + </ClCompile> + <ClCompile Include="ucln_tu.cpp"> + <DisableLanguageExtensions>false</DisableLanguageExtensions> + </ClCompile> + </ItemGroup> + <ItemGroup> + <ClInclude Include="collationinfo.h" /> + <ClInclude Include="denseranges.h" /> + <ClInclude Include="filestrm.h" /> + <ClInclude Include="filetools.h" /> + 
<ClInclude Include="flagparser.h" /> + <ClInclude Include="package.h" /> + <ClInclude Include="pkg_genc.h" /> + <ClInclude Include="pkg_gencmn.h" /> + <ClInclude Include="pkg_icu.h" /> + <ClInclude Include="pkg_imp.h" /> + <ClInclude Include="ppucd.h" /> + <ClInclude Include="swapimpl.h" /> + <ClInclude Include="toolutil.h" /> + <ClInclude Include="ucbuf.h" /> + <ClInclude Include="ucm.h" /> + <ClInclude Include="unewdata.h" /> + <ClInclude Include="uoptions.h" /> + <ClInclude Include="uparse.h" /> + <ClInclude Include="writesrc.h" /> + <ClInclude Include="xmlparser.h" /> + <ClInclude Include="dbgutil.h" /> + <ClInclude Include="udbgutil.h" /> + </ItemGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> + <ImportGroup Label="ExtensionTargets"> + </ImportGroup> +</Project> diff --git a/intl/icu/source/tools/toolutil/ucbuf.cpp b/intl/icu/source/tools/toolutil/ucbuf.cpp new file mode 100644 index 0000000000..1eb54e260e --- /dev/null +++ b/intl/icu/source/tools/toolutil/ucbuf.cpp @@ -0,0 +1,788 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 1998-2016, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* +* File ucbuf.cpp +* +* Modification History: +* +* Date Name Description +* 05/10/01 Ram Creation. 
+******************************************************************************* +*/ + +#include "unicode/utypes.h" +#include "unicode/putil.h" +#include "unicode/uchar.h" +#include "unicode/ucnv.h" +#include "unicode/ucnv_err.h" +#include "unicode/ustring.h" +#include "unicode/utf16.h" +#include "filestrm.h" +#include "cstring.h" +#include "cmemory.h" +#include "ustrfmt.h" +#include "ucbuf.h" +#include <stdio.h> + +#if !UCONFIG_NO_CONVERSION + + +#define MAX_IN_BUF 1000 +#define MAX_U_BUF 1500 +#define CONTEXT_LEN 20 + +struct UCHARBUF { + char16_t* buffer; + char16_t* currentPos; + char16_t* bufLimit; + int32_t bufCapacity; + int32_t remaining; + int32_t signatureLength; + FileStream* in; + UConverter* conv; + UBool showWarning; /* makes this API not produce any errors */ + UBool isBuffered; +}; + +U_CAPI UBool U_EXPORT2 +ucbuf_autodetect_fs(FileStream* in, const char** cp, UConverter** conv, int32_t* signatureLength, UErrorCode* error){ + char start[8]; + int32_t numRead; + + char16_t target[1]={ 0 }; + char16_t* pTarget; + const char* pStart; + + /* read a few bytes */ + numRead=T_FileStream_read(in, start, sizeof(start)); + + *cp = ucnv_detectUnicodeSignature(start, numRead, signatureLength, error); + + /* unread the bytes beyond what was consumed for U+FEFF */ + T_FileStream_rewind(in); + if (*signatureLength > 0) { + T_FileStream_read(in, start, *signatureLength); + } + + if(*cp==nullptr){ + *conv =nullptr; + return false; + } + + /* open the converter for the detected Unicode charset */ + *conv = ucnv_open(*cp,error); + + /* convert and ignore initial U+FEFF, and the buffer overflow */ + pTarget = target; + pStart = start; + ucnv_toUnicode(*conv, &pTarget, target+1, &pStart, start+*signatureLength, nullptr, false, error); + *signatureLength = (int32_t)(pStart - start); + if(*error==U_BUFFER_OVERFLOW_ERROR) { + *error=U_ZERO_ERROR; + } + + /* verify that we successfully read exactly U+FEFF */ + if(U_SUCCESS(*error) && (pTarget!=(target+1) || 
target[0]!=0xfeff)) { + *error=U_INTERNAL_PROGRAM_ERROR; + } + + + return true; +} +static UBool ucbuf_isCPKnown(const char* cp){ + if(ucnv_compareNames("UTF-8",cp)==0){ + return true; + } + if(ucnv_compareNames("UTF-16BE",cp)==0){ + return true; + } + if(ucnv_compareNames("UTF-16LE",cp)==0){ + return true; + } + if(ucnv_compareNames("UTF-16",cp)==0){ + return true; + } + if(ucnv_compareNames("UTF-32",cp)==0){ + return true; + } + if(ucnv_compareNames("UTF-32BE",cp)==0){ + return true; + } + if(ucnv_compareNames("UTF-32LE",cp)==0){ + return true; + } + if(ucnv_compareNames("SCSU",cp)==0){ + return true; + } + if(ucnv_compareNames("BOCU-1",cp)==0){ + return true; + } + if(ucnv_compareNames("UTF-7",cp)==0){ + return true; + } + return false; +} + +U_CAPI FileStream * U_EXPORT2 +ucbuf_autodetect(const char* fileName, const char** cp,UConverter** conv, int32_t* signatureLength,UErrorCode* error){ + FileStream* in=nullptr; + if(error==nullptr || U_FAILURE(*error)){ + return nullptr; + } + if(conv==nullptr || cp==nullptr || fileName==nullptr){ + *error = U_ILLEGAL_ARGUMENT_ERROR; + return nullptr; + } + /* open the file */ + in= T_FileStream_open(fileName,"rb"); + + if(in == nullptr){ + *error=U_FILE_ACCESS_ERROR; + return nullptr; + } + + if(ucbuf_autodetect_fs(in,cp,conv,signatureLength,error)) { + return in; + } else { + ucnv_close(*conv); + *conv=nullptr; + T_FileStream_close(in); + return nullptr; + } +} + +/* fill the uchar buffer */ +static UCHARBUF* +ucbuf_fillucbuf( UCHARBUF* buf,UErrorCode* error){ + char16_t* pTarget=nullptr; + char16_t* target=nullptr; + const char* source=nullptr; + char carr[MAX_IN_BUF] = {'\0'}; + char* cbuf = carr; + int32_t inputRead=0; + int32_t outputWritten=0; + int32_t offset=0; + const char* sourceLimit =nullptr; + int32_t cbufSize=0; + pTarget = buf->buffer; + /* check if we arrived here without exhausting the buffer*/ + if(buf->currentPos<buf->bufLimit){ + offset = (int32_t)(buf->bufLimit-buf->currentPos); + 
memmove(buf->buffer,buf->currentPos,offset* sizeof(char16_t)); + } + +#ifdef UCBUF_DEBUG + memset(pTarget+offset,0xff,sizeof(char16_t)*(MAX_IN_BUF-offset)); +#endif + if(buf->isBuffered){ + cbufSize = MAX_IN_BUF; + /* read the file */ + inputRead=T_FileStream_read(buf->in,cbuf,cbufSize-offset); + buf->remaining-=inputRead; + + }else{ + cbufSize = T_FileStream_size(buf->in); + cbuf = (char*)uprv_malloc(cbufSize); + if (cbuf == nullptr) { + *error = U_MEMORY_ALLOCATION_ERROR; + return nullptr; + } + inputRead= T_FileStream_read(buf->in,cbuf,cbufSize); + buf->remaining-=inputRead; + } + + /* just to be sure...*/ + if ( 0 == inputRead ) + buf->remaining = 0; + + target=pTarget; + /* convert the bytes */ + if(buf->conv){ + /* set the callback to stop */ + UConverterToUCallback toUOldAction ; + void* toUOldContext; + void* toUNewContext=nullptr; + ucnv_setToUCallBack(buf->conv, + UCNV_TO_U_CALLBACK_STOP, + toUNewContext, + &toUOldAction, + (const void**)&toUOldContext, + error); + /* since state is saved in the converter we add offset to source*/ + target = pTarget+offset; + source = cbuf; + sourceLimit = source + inputRead; + ucnv_toUnicode(buf->conv,&target,target+(buf->bufCapacity-offset), + &source,sourceLimit,nullptr, + (UBool)(buf->remaining==0),error); + + if(U_FAILURE(*error)){ + char context[CONTEXT_LEN+1]; + char preContext[CONTEXT_LEN+1]; + char postContext[CONTEXT_LEN+1]; + int8_t len = CONTEXT_LEN; + int32_t start=0; + int32_t stop =0; + int32_t pos =0; + /* use erro1 to preserve the error code */ + UErrorCode error1 =U_ZERO_ERROR; + + if( buf->showWarning==true){ + fprintf(stderr,"\n###WARNING: Encountered abnormal bytes while" + " converting input stream to target encoding: %s\n", + u_errorName(*error)); + } + + + /* now get the context chars */ + ucnv_getInvalidChars(buf->conv,context,&len,&error1); + context[len]= 0 ; /* null terminate the buffer */ + + pos = (int32_t)(source - cbuf - len); + + /* for pre-context */ + start = (pos <=CONTEXT_LEN)? 
0 : (pos - (CONTEXT_LEN-1)); + stop = pos-len; + + memcpy(preContext,cbuf+start,stop-start); + /* null terminate the buffer */ + preContext[stop-start] = 0; + + /* for post-context */ + start = pos+len; + stop = (int32_t)(((pos+CONTEXT_LEN)<= (sourceLimit-cbuf) )? (pos+(CONTEXT_LEN-1)) : (sourceLimit-cbuf)); + + memcpy(postContext,source,stop-start); + /* null terminate the buffer */ + postContext[stop-start] = 0; + + if(buf->showWarning ==true){ + /* print out the context */ + fprintf(stderr,"\tPre-context: %s\n",preContext); + fprintf(stderr,"\tContext: %s\n",context); + fprintf(stderr,"\tPost-context: %s\n", postContext); + } + + /* reset the converter */ + ucnv_reset(buf->conv); + + /* set the call back to substitute + * and restart conversion + */ + ucnv_setToUCallBack(buf->conv, + UCNV_TO_U_CALLBACK_SUBSTITUTE, + toUNewContext, + &toUOldAction, + (const void**)&toUOldContext, + &error1); + + /* reset source and target start positions */ + target = pTarget+offset; + source = cbuf; + + /* re convert */ + ucnv_toUnicode(buf->conv,&target,target+(buf->bufCapacity-offset), + &source,sourceLimit,nullptr, + (UBool)(buf->remaining==0),&error1); + + } + outputWritten = (int32_t)(target - pTarget); + +#ifdef UCBUF_DEBUG + { + int i; + target = pTarget; + for(i=0;i<numRead;i++){ + /* printf("%c", (char)(*target++));*/ + } + } +#endif + + }else{ + u_charsToUChars(cbuf,target+offset,inputRead); + outputWritten=((buf->remaining>cbufSize)? 
cbufSize:inputRead+offset); + } + buf->currentPos = pTarget; + buf->bufLimit=pTarget+outputWritten; + *buf->bufLimit=0; /*NUL terminate*/ + if(cbuf!=carr){ + uprv_free(cbuf); + } + return buf; +} + + + +/* get a char16_t from the stream*/ +U_CAPI int32_t U_EXPORT2 +ucbuf_getc(UCHARBUF* buf,UErrorCode* error){ + if(error==nullptr || U_FAILURE(*error)){ + return false; + } + if(buf->currentPos>=buf->bufLimit){ + if(buf->remaining==0){ + return U_EOF; + } + buf=ucbuf_fillucbuf(buf,error); + if(U_FAILURE(*error)){ + return U_EOF; + } + } + + return *(buf->currentPos++); +} + +/* get a UChar32 from the stream*/ +U_CAPI int32_t U_EXPORT2 +ucbuf_getc32(UCHARBUF* buf,UErrorCode* error){ + int32_t retVal = (int32_t)U_EOF; + if(error==nullptr || U_FAILURE(*error)){ + return false; + } + if(buf->currentPos+1>=buf->bufLimit){ + if(buf->remaining==0){ + return U_EOF; + } + buf=ucbuf_fillucbuf(buf,error); + if(U_FAILURE(*error)){ + return U_EOF; + } + } + if(U16_IS_LEAD(*(buf->currentPos))){ + retVal=U16_GET_SUPPLEMENTARY(buf->currentPos[0],buf->currentPos[1]); + buf->currentPos+=2; + }else{ + retVal = *(buf->currentPos++); + } + return retVal; +} + +/* u_unescapeAt() callback to return a char16_t*/ +static char16_t U_CALLCONV +_charAt(int32_t offset, void *context) { + return ((UCHARBUF*) context)->currentPos[offset]; +} + +/* getc and escape it */ +U_CAPI int32_t U_EXPORT2 +ucbuf_getcx32(UCHARBUF* buf,UErrorCode* error) { + int32_t length; + int32_t offset; + UChar32 c32,c1,c2; + if(error==nullptr || U_FAILURE(*error)){ + return false; + } + /* Fill the buffer if it is empty */ + if (buf->currentPos >=buf->bufLimit-2) { + ucbuf_fillucbuf(buf,error); + } + + /* Get the next character in the buffer */ + if (buf->currentPos < buf->bufLimit) { + c1 = *(buf->currentPos)++; + } else { + c1 = U_EOF; + } + + c2 = *(buf->currentPos); + + /* If it isn't a backslash, return it */ + if (c1 != 0x005C) { + return c1; + } + + /* Determine the amount of data in the buffer */ + length = 
(int32_t)(buf->bufLimit - buf->currentPos); + + /* The longest escape sequence is \Uhhhhhhhh; make sure + we have at least that many characters */ + if (length < 10) { + + /* fill the buffer */ + ucbuf_fillucbuf(buf,error); + length = (int32_t)(buf->bufLimit - buf->buffer); + } + + /* Process the escape */ + offset = 0; + c32 = u_unescapeAt(_charAt, &offset, length, (void*)buf); + + /* check if u_unescapeAt unescaped and converted + * to c32 or not + */ + if(c32==(UChar32)0xFFFFFFFF){ + if(buf->showWarning) { + char context[CONTEXT_LEN+1]; + int32_t len = CONTEXT_LEN; + if(length < len) { + len = length; + } + context[len]= 0 ; /* null terminate the buffer */ + u_UCharsToChars( buf->currentPos, context, len); + fprintf(stderr,"Bad escape: [%c%s]...\n", (int)c1, context); + } + *error= U_ILLEGAL_ESCAPE_SEQUENCE; + return c1; + }else if(c32!=c2 || (c32==0x0075 && c2==0x0075 && c1==0x005C) /* for \u0075 c2=0x0075 and c32==0x0075*/){ + /* Update the current buffer position */ + buf->currentPos += offset; + }else{ + /* unescaping failed so we just return + * c1 and not consume the buffer + * this is useful for rules with escapes + * in resource bundles + * eg: \' \\ \" + */ + return c1; + } + + return c32; +} + +U_CAPI UCHARBUF* U_EXPORT2 +ucbuf_open(const char* fileName,const char** cp,UBool showWarning, UBool buffered, UErrorCode* error){ + + FileStream* in = nullptr; + int32_t fileSize=0; + const char* knownCp; + if(error==nullptr || U_FAILURE(*error)){ + return nullptr; + } + if(cp==nullptr || fileName==nullptr){ + *error = U_ILLEGAL_ARGUMENT_ERROR; + return nullptr; + } + if (!uprv_strcmp(fileName, "-")) { + in = T_FileStream_stdin(); + }else{ + in = T_FileStream_open(fileName, "rb"); + } + + if(in!=nullptr){ + UCHARBUF* buf =(UCHARBUF*) uprv_malloc(sizeof(UCHARBUF)); + fileSize = T_FileStream_size(in); + if(buf == nullptr){ + *error = U_MEMORY_ALLOCATION_ERROR; + T_FileStream_close(in); + return nullptr; + } + buf->in=in; + buf->conv=nullptr; + buf->showWarning = 
showWarning; + buf->isBuffered = buffered; + buf->signatureLength=0; + if(*cp==nullptr || **cp=='\0'){ + /* don't have code page name... try to autodetect */ + ucbuf_autodetect_fs(in,cp,&buf->conv,&buf->signatureLength,error); + }else if(ucbuf_isCPKnown(*cp)){ + /* discard BOM */ + ucbuf_autodetect_fs(in,&knownCp,&buf->conv,&buf->signatureLength,error); + } + if(U_SUCCESS(*error) && buf->conv==nullptr) { + buf->conv=ucnv_open(*cp,error); + } + if(U_FAILURE(*error)){ + ucnv_close(buf->conv); + uprv_free(buf); + T_FileStream_close(in); + return nullptr; + } + + if((buf->conv==nullptr) && (buf->showWarning==true)){ + fprintf(stderr,"###WARNING: No converter defined. Using codepage of system.\n"); + } + buf->remaining=fileSize-buf->signatureLength; + if(buf->isBuffered){ + buf->bufCapacity=MAX_U_BUF; + }else{ + buf->bufCapacity=buf->remaining+buf->signatureLength+1/*for terminating nul*/; + } + buf->buffer=(char16_t*) uprv_malloc(U_SIZEOF_UCHAR * buf->bufCapacity ); + if (buf->buffer == nullptr) { + *error = U_MEMORY_ALLOCATION_ERROR; + ucbuf_close(buf); + return nullptr; + } + buf->currentPos=buf->buffer; + buf->bufLimit=buf->buffer; + if(U_FAILURE(*error)){ + fprintf(stderr, "Could not open codepage [%s]: %s\n", *cp, u_errorName(*error)); + ucbuf_close(buf); + return nullptr; + } + ucbuf_fillucbuf(buf,error); + if(U_FAILURE(*error)){ + ucbuf_close(buf); + return nullptr; + } + return buf; + } + *error =U_FILE_ACCESS_ERROR; + return nullptr; +} + + + +/* TODO: this method will fail if at the + * beginning of buffer and the uchar to unget + * is from the previous buffer. Need to implement + * system to take care of that situation. + */ +U_CAPI void U_EXPORT2 +ucbuf_ungetc(int32_t c,UCHARBUF* buf){ + /* decrement currentPos pointer + * if not at the beginning of buffer + */ + if(buf->currentPos!=buf->buffer){ + if(*(buf->currentPos-1)==c){ + buf->currentPos--; + } else { + /* ungetc failed - did not match. */ + } + } else { + /* ungetc failed - beginning of buffer. 
*/ + } +} + +/* frees the resources of char16_t* buffer */ +static void +ucbuf_closebuf(UCHARBUF* buf){ + uprv_free(buf->buffer); + buf->buffer = nullptr; +} + +/* close the buf and release resources*/ +U_CAPI void U_EXPORT2 +ucbuf_close(UCHARBUF* buf){ + if(buf!=nullptr){ + if(buf->conv){ + ucnv_close(buf->conv); + } + T_FileStream_close(buf->in); + ucbuf_closebuf(buf); + uprv_free(buf); + } +} + +/* rewind the buf and file stream */ +U_CAPI void U_EXPORT2 +ucbuf_rewind(UCHARBUF* buf,UErrorCode* error){ + if(error==nullptr || U_FAILURE(*error)){ + return; + } + if(buf){ + buf->currentPos=buf->buffer; + buf->bufLimit=buf->buffer; + T_FileStream_rewind(buf->in); + buf->remaining=T_FileStream_size(buf->in)-buf->signatureLength; + + ucnv_resetToUnicode(buf->conv); + if(buf->signatureLength>0) { + char16_t target[1]={ 0 }; + char16_t* pTarget; + char start[8]; + const char* pStart; + int32_t numRead; + + /* read the signature bytes */ + numRead=T_FileStream_read(buf->in, start, buf->signatureLength); + + /* convert and ignore initial U+FEFF, and the buffer overflow */ + pTarget = target; + pStart = start; + ucnv_toUnicode(buf->conv, &pTarget, target+1, &pStart, start+numRead, nullptr, false, error); + if(*error==U_BUFFER_OVERFLOW_ERROR) { + *error=U_ZERO_ERROR; + } + + /* verify that we successfully read exactly U+FEFF */ + if(U_SUCCESS(*error) && (numRead!=buf->signatureLength || pTarget!=(target+1) || target[0]!=0xfeff)) { + *error=U_INTERNAL_PROGRAM_ERROR; + } + } + } +} + + +U_CAPI int32_t U_EXPORT2 +ucbuf_size(UCHARBUF* buf){ + if(buf){ + if(buf->isBuffered){ + return (T_FileStream_size(buf->in)-buf->signatureLength)/ucnv_getMinCharSize(buf->conv); + }else{ + return (int32_t)(buf->bufLimit - buf->buffer); + } + } + return 0; +} + +U_CAPI const char16_t* U_EXPORT2 +ucbuf_getBuffer(UCHARBUF* buf,int32_t* len,UErrorCode* error){ + if(error==nullptr || U_FAILURE(*error)){ + return nullptr; + } + if(buf==nullptr || len==nullptr){ + *error = U_ILLEGAL_ARGUMENT_ERROR; + 
return nullptr;
+    }
+    *len = (int32_t)(buf->bufLimit - buf->buffer);
+    return buf->buffer;
+}
+
+/*
+ * Builds the path of fileName relative to inputDir in target.
+ *
+ * inputDir  directory to prepend; may be empty, in which case the result
+ *           is fileName alone
+ * fileName  file name to append
+ * target    output buffer; may be nullptr only if *len is 0
+ * len       in: capacity of target in chars; out: the required capacity
+ *           when U_BUFFER_OVERFLOW_ERROR is set
+ * status    error code; U_ILLEGAL_ARGUMENT_ERROR for null arguments,
+ *           U_BUFFER_OVERFLOW_ERROR if target is null or too small
+ *
+ * Returns target on success, nullptr on any error.
+ */
+U_CAPI const char* U_EXPORT2
+ucbuf_resolveFileName(const char* inputDir, const char* fileName, char* target, int32_t* len, UErrorCode* status){
+    int32_t requiredLen = 0;
+    int32_t dirlen  =  0;
+    int32_t filelen = 0;
+    UBool dirEndsWithSep = false;
+    if(status==nullptr || U_FAILURE(*status)){
+        return nullptr;
+    }
+
+    if(inputDir == nullptr || fileName == nullptr || len==nullptr || (target==nullptr && *len>0)){
+        *status = U_ILLEGAL_ARGUMENT_ERROR;
+        return nullptr;
+    }
+
+
+    dirlen  = (int32_t)uprv_strlen(inputDir);
+    filelen = (int32_t)uprv_strlen(fileName);
+    /* an empty inputDir has no last character; never index inputDir[-1] */
+    dirEndsWithSep = (UBool)(dirlen > 0 && inputDir[dirlen-1] == U_FILE_SEP_CHAR);
+    if(!dirEndsWithSep) {
+        requiredLen = dirlen + filelen + 2;
+        if((*len < requiredLen) || target==nullptr){
+            *len = requiredLen;
+            *status = U_BUFFER_OVERFLOW_ERROR;
+            return nullptr;
+        }
+
+        target[0] = '\0';
+        /*
+         * append the input dir to openFileName if the first char in
+         * filename is not file separation char and the last char input directory is not '.'.
+         * This is to support :
+         * genrb -s. /home/icu/data
+         * genrb -s. icu/data
+         * The user cannot mix notations like
+         * genrb -s. /icu/data --- the absolute path specified. -s redundant
+         * user should use
+         * genrb -s. icu/data  --- start from CWD and look in icu/data dir
+         *
+         * An empty input dir is not prepended, so the result is fileName alone.
+         */
+        if( (dirlen > 0) && (fileName[0] != U_FILE_SEP_CHAR) && (inputDir[dirlen-1] !='.')){
+            uprv_strcpy(target, inputDir);
+            target[dirlen]     = U_FILE_SEP_CHAR;
+            target[dirlen + 1] = '\0';
+        }
+    } else {
+        requiredLen = dirlen + filelen + 1;
+        if((*len < requiredLen) || target==nullptr){
+            *len = requiredLen;
+            *status = U_BUFFER_OVERFLOW_ERROR;
+            return nullptr;
+        }
+
+        uprv_strcpy(target, inputDir);
+    }
+
+    uprv_strcat(target, fileName);
+    return target;
+}
+/*
+ * Unicode TR 13 says any of the below chars is
+ * a new line char in a readline function in addition
+ * to CR+LF combination which needs to be
+ * handled separately
+ */
+static UBool ucbuf_isCharNewLine(char16_t c){
+    switch(c){
+    case 0x000A: /* LF  */
+    case 0x000D: /* CR  */
+    case 0x000C: /* FF  */
+    case 0x0085: /* NEL */
+    case 0x2028: /* LS  */
+    case 0x2029: /* PS  */
+        return true;
+    default:
+        return false;
+    }
+}
+
+U_CAPI const char16_t* U_EXPORT2
+ucbuf_readline(UCHARBUF* buf,int32_t* len,UErrorCode* err){
+    char16_t* temp = buf->currentPos;
+    char16_t* savePos =nullptr;
+    char16_t c=0x0000;
+    if(buf->isBuffered){
+        /* The input is buffered we have to do more
+        * for returning a pointer U_TRUNCATED_CHAR_FOUND
+        */
+        for(;;){
+            c = *temp++;
+            if(buf->remaining==0){
+                return nullptr; /* end of file is reached return nullptr */
+            }
+            if(temp>=buf->bufLimit && buf->currentPos == buf->buffer){
+                *err= U_TRUNCATED_CHAR_FOUND;
+                return nullptr;
+            }else{
+                ucbuf_fillucbuf(buf,err);
+                if(U_FAILURE(*err)){
+                    return nullptr;
+                }
+            }
+            /*
+             * According to TR 13 readLine functions must interpret
+             * CR, CR+LF, LF, NEL, PS, LS or FF as line separators
+             */
+            /* Windows CR LF */
+            if(c ==0x0d && temp <= buf->bufLimit && *temp == 0x0a ){
+                *len = (int32_t)(temp++ - buf->currentPos);
+                savePos = buf->currentPos;
+                buf->currentPos = temp;
+                return savePos;
+            }
+            /* else */
+
+            if (temp>=buf->bufLimit|| ucbuf_isCharNewLine(c)){  /* Unipad inserts 2028 line separators!
*/ + *len = (int32_t)(temp - buf->currentPos); + savePos = buf->currentPos; + buf->currentPos = temp; + return savePos; + } + } + }else{ + /* we know that all input is read into the internal + * buffer so we can safely return pointers + */ + for(;;){ + c = *temp++; + + if(buf->currentPos==buf->bufLimit){ + return nullptr; /* end of file is reached return nullptr */ + } + /* Windows CR LF */ + if(c ==0x0d && temp <= buf->bufLimit && *temp == 0x0a ){ + *len = (int32_t)(temp++ - buf->currentPos); + savePos = buf->currentPos; + buf->currentPos = temp; + return savePos; + } + /* else */ + if (temp>=buf->bufLimit|| ucbuf_isCharNewLine(c)) { /* Unipad inserts 2028 line separators! */ + *len = (int32_t)(temp - buf->currentPos); + savePos = buf->currentPos; + buf->currentPos = temp; + return savePos; + } + } + } + /* not reached */ + /* A compiler warning will appear if all paths don't contain a return statement. */ +/* return nullptr;*/ +} +#endif diff --git a/intl/icu/source/tools/toolutil/ucbuf.h b/intl/icu/source/tools/toolutil/ucbuf.h new file mode 100644 index 0000000000..117920b794 --- /dev/null +++ b/intl/icu/source/tools/toolutil/ucbuf.h @@ -0,0 +1,218 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 1998-2016, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* +* File ucbuf.h +* +* Modification History: +* +* Date Name Description +* 05/10/01 Ram Creation. 
+* +* This API reads in files and returns UChars +******************************************************************************* +*/ + +#include "unicode/localpointer.h" +#include "unicode/ucnv.h" +#include "filestrm.h" + +#if !UCONFIG_NO_CONVERSION + +#ifndef UCBUF_H +#define UCBUF_H 1 + +typedef struct UCHARBUF UCHARBUF; +/** + * End of file value + */ +#define U_EOF ((int32_t)0xFFFFFFFF) +/** + * Error value if a sequence cannot be unescaped + */ +#define U_ERR ((int32_t)0xFFFFFFFE) + +typedef struct ULine ULine; + +struct ULine { + UChar *name; + int32_t len; +}; + +/** + * Opens the UCHARBUF with the given file stream and code page for conversion + * @param fileName Name of the file to open. + * @param codepage The encoding of the file stream to convert to Unicode. + * If *codepage is NULL on input the API will try to autodetect + * popular Unicode encodings + * @param showWarning Flag to print out warnings to STDOUT + * @param buffered If true performs a buffered read of the input file. If false reads + * the whole file into memory and converts it. + * @param err is a pointer to a valid <code>UErrorCode</code> value. If this value + * indicates a failure on entry, the function will immediately return. + * On exit the value will indicate the success of the operation. + * @return pointer to the newly opened UCHARBUF + */ +U_CAPI UCHARBUF* U_EXPORT2 +ucbuf_open(const char* fileName,const char** codepage,UBool showWarning, UBool buffered, UErrorCode* err); + +/** + * Gets a UTF-16 code unit at the current position from the converted buffer + * and increments the current position + * @param buf Pointer to UCHARBUF structure + * @param err is a pointer to a valid <code>UErrorCode</code> value. If this value + * indicates a failure on entry, the function will immediately return. + * On exit the value will indicate the success of the operation. 
+ */ +U_CAPI int32_t U_EXPORT2 +ucbuf_getc(UCHARBUF* buf,UErrorCode* err); + +/** + * Gets a UTF-32 code point at the current position from the converted buffer + * and increments the current position + * @param buf Pointer to UCHARBUF structure + * @param err is a pointer to a valid <code>UErrorCode</code> value. If this value + * indicates a failure on entry, the function will immediately return. + * On exit the value will indicate the success of the operation. + */ +U_CAPI int32_t U_EXPORT2 +ucbuf_getc32(UCHARBUF* buf,UErrorCode* err); + +/** + * Gets a UTF-16 code unit at the current position from the converted buffer after + * unescaping and increments the current position. If the escape sequence is for UTF-32 + * code point (\\Uxxxxxxxx) then a UTF-32 codepoint is returned + * @param buf Pointer to UCHARBUF structure + * @param err is a pointer to a valid <code>UErrorCode</code> value. If this value + * indicates a failure on entry, the function will immediately return. + * On exit the value will indicate the success of the operation. + */ +U_CAPI int32_t U_EXPORT2 +ucbuf_getcx32(UCHARBUF* buf,UErrorCode* err); + +/** + * Gets a pointer to the current position in the internal buffer and length of the line. + * It imperative to make a copy of the returned buffer before performing operations on it. + * @param buf Pointer to UCHARBUF structure + * @param len Output param to receive the len of the buffer returned till end of the line + * @param err is a pointer to a valid <code>UErrorCode</code> value. If this value + * indicates a failure on entry, the function will immediately return. + * On exit the value will indicate the success of the operation. + * Error: U_TRUNCATED_CHAR_FOUND + * @return Pointer to the internal buffer, NULL if EOF + */ +U_CAPI const UChar* U_EXPORT2 +ucbuf_readline(UCHARBUF* buf,int32_t* len, UErrorCode* err); + + +/** + * Resets the buffers and the underlying file stream. 
+ * @param buf Pointer to UCHARBUF structure + * @param err is a pointer to a valid <code>UErrorCode</code> value. If this value + * indicates a failure on entry, the function will immediately return. + * On exit the value will indicate the success of the operation. + */ +U_CAPI void U_EXPORT2 +ucbuf_rewind(UCHARBUF* buf,UErrorCode* err); + +/** + * Returns a pointer to the internal converted buffer + * @param buf Pointer to UCHARBUF structure + * @param len Pointer to int32_t to receive the length of buffer + * @param err is a pointer to a valid <code>UErrorCode</code> value. If this value + * indicates a failure on entry, the function will immediately return. + * On exit the value will indicate the success of the operation. + * @return Pointer to internal UChar buffer + */ +U_CAPI const UChar* U_EXPORT2 +ucbuf_getBuffer(UCHARBUF* buf,int32_t* len,UErrorCode* err); + +/** + * Closes the UCHARBUF structure members and cleans up the malloc'ed memory + * @param buf Pointer to UCHARBUF structure + */ +U_CAPI void U_EXPORT2 +ucbuf_close(UCHARBUF* buf); + +#if U_SHOW_CPLUSPLUS_API + +U_NAMESPACE_BEGIN + +/** + * \class LocalUCHARBUFPointer + * "Smart pointer" class, closes a UCHARBUF via ucbuf_close(). + * For most methods see the LocalPointerBase base class. + * + * @see LocalPointerBase + * @see LocalPointer + */ +U_DEFINE_LOCAL_OPEN_POINTER(LocalUCHARBUFPointer, UCHARBUF, ucbuf_close); + +U_NAMESPACE_END + +#endif + +/** + * Rewinds the buffer by one codepoint. Does not rewind over escaped characters. + */ +U_CAPI void U_EXPORT2 +ucbuf_ungetc(int32_t ungetChar,UCHARBUF* buf); + + +/** + * Autodetects the encoding of the file stream. Only Unicode charsets are autodectected. + * Some Unicode charsets are stateful and need byte identifiers to be converted also to bring + * the converter to correct state for converting the rest of the stream. So the UConverter parameter + * is necessary. 
+ * If the charset was autodetected, the caller must close both the input FileStream + * and the converter. + * + * @param fileName The file name to be opened and encoding autodected + * @param conv Output param to receive the opened converter if autodetected; NULL otherwise. + * @param cp Output param to receive the detected encoding + * @param err is a pointer to a valid <code>UErrorCode</code> value. If this value + * indicates a failure on entry, the function will immediately return. + * On exit the value will indicate the success of the operation. + * @return The input FileStream if its charset was autodetected; NULL otherwise. + */ +U_CAPI FileStream * U_EXPORT2 +ucbuf_autodetect(const char* fileName, const char** cp,UConverter** conv, +int32_t* signatureLength, UErrorCode* status); + +/** + * Autodetects the encoding of the file stream. Only Unicode charsets are autodectected. + * Some Unicode charsets are stateful and need byte identifiers to be converted also to bring + * the converter to correct state for converting the rest of the stream. So the UConverter parameter + * is necessary. + * If the charset was autodetected, the caller must close the converter. + * + * @param fileStream The file stream whose encoding is to be detected + * @param conv Output param to receive the opened converter if autodetected; NULL otherwise. + * @param cp Output param to receive the detected encoding + * @param err is a pointer to a valid <code>UErrorCode</code> value. If this value + * indicates a failure on entry, the function will immediately return. + * On exit the value will indicate the success of the operation. + * @return Boolean whether the Unicode charset was autodetected. 
+ */ + +U_CAPI UBool U_EXPORT2 +ucbuf_autodetect_fs(FileStream* in, const char** cp, UConverter** conv, int32_t* signatureLength, UErrorCode* status); + +/** + * Returns the approximate size in UChars required for converting the file to UChars + */ +U_CAPI int32_t U_EXPORT2 +ucbuf_size(UCHARBUF* buf); + +U_CAPI const char* U_EXPORT2 +ucbuf_resolveFileName(const char* inputDir, const char* fileName, char* target, int32_t* len, UErrorCode* status); + +#endif +#endif + diff --git a/intl/icu/source/tools/toolutil/ucln_tu.cpp b/intl/icu/source/tools/toolutil/ucln_tu.cpp new file mode 100644 index 0000000000..4727227ebf --- /dev/null +++ b/intl/icu/source/tools/toolutil/ucln_tu.cpp @@ -0,0 +1,19 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/******************************************************************** + * COPYRIGHT: + * Copyright (c) 2007-2014, International Business Machines Corporation and + * others. All Rights Reserved. + ********************************************************************/ + + +/** Auto-client **/ +#define UCLN_TYPE UCLN_TOOLUTIL +#include "ucln_imp.h" + +int uprv_dummyFunction_TU(); +int uprv_dummyFunction_TU() +{ + /* this is here to prevent the compiler from complaining about an empty file */ + return 0; +} diff --git a/intl/icu/source/tools/toolutil/ucm.cpp b/intl/icu/source/tools/toolutil/ucm.cpp new file mode 100644 index 0000000000..272570e72f --- /dev/null +++ b/intl/icu/source/tools/toolutil/ucm.cpp @@ -0,0 +1,1195 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2003-2013, International Business Machines +* Corporation and others. All Rights Reserved. 
+* +******************************************************************************* +* file name: ucm.c +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2003jun20 +* created by: Markus W. Scherer +* +* This file reads a .ucm file, stores its mappings and sorts them. +* It implements handling of Unicode conversion mappings from .ucm files +* for makeconv, canonucm, rptp2ucm, etc. +* +* Unicode code point sequences with a length of more than 1, +* as well as byte sequences with more than 4 bytes or more than one complete +* character sequence are handled to support m:n mappings. +*/ + +#include "unicode/utypes.h" +#include "unicode/ustring.h" +#include "cstring.h" +#include "cmemory.h" +#include "filestrm.h" +#include "uarrsort.h" +#include "ucnvmbcs.h" +#include "ucnv_bld.h" +#include "ucnv_ext.h" +#include "uparse.h" +#include "ucm.h" +#include <stdio.h> + +#if !UCONFIG_NO_CONVERSION + +/* -------------------------------------------------------------------------- */ + +static void +printMapping(UCMapping *m, UChar32 *codePoints, uint8_t *bytes, FILE *f) { + int32_t j; + + for(j=0; j<m->uLen; ++j) { + fprintf(f, "<U%04lX>", (long)codePoints[j]); + } + + fputc(' ', f); + + for(j=0; j<m->bLen; ++j) { + fprintf(f, "\\x%02X", bytes[j]); + } + + if(m->f>=0) { + fprintf(f, " |%u\n", m->f); + } else { + fputs("\n", f); + } +} + +U_CAPI void U_EXPORT2 +ucm_printMapping(UCMTable *table, UCMapping *m, FILE *f) { + printMapping(m, UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m), f); +} + +U_CAPI void U_EXPORT2 +ucm_printTable(UCMTable *table, FILE *f, UBool byUnicode) { + UCMapping *m; + int32_t i, length; + + m=table->mappings; + length=table->mappingsLength; + if(byUnicode) { + for(i=0; i<length; ++m, ++i) { + ucm_printMapping(table, m, f); + } + } else { + const int32_t *map=table->reverseMap; + for(i=0; i<length; ++i) { + ucm_printMapping(table, m+map[i], f); + } + } +} + +/* mapping comparisons 
------------------------------------------------------ */ + +static int32_t +compareUnicode(UCMTable *lTable, const UCMapping *l, + UCMTable *rTable, const UCMapping *r) { + const UChar32 *lu, *ru; + int32_t result, i, length; + + if(l->uLen==1 && r->uLen==1) { + /* compare two single code points */ + return l->u-r->u; + } + + /* get pointers to the code point sequences */ + lu=UCM_GET_CODE_POINTS(lTable, l); + ru=UCM_GET_CODE_POINTS(rTable, r); + + /* get the minimum length */ + if(l->uLen<=r->uLen) { + length=l->uLen; + } else { + length=r->uLen; + } + + /* compare the code points */ + for(i=0; i<length; ++i) { + result=lu[i]-ru[i]; + if(result!=0) { + return result; + } + } + + /* compare the lengths */ + return l->uLen-r->uLen; +} + +static int32_t +compareBytes(UCMTable *lTable, const UCMapping *l, + UCMTable *rTable, const UCMapping *r, + UBool lexical) { + const uint8_t *lb, *rb; + int32_t result, i, length; + + /* + * A lexical comparison is used for sorting in the builder, to allow + * an efficient search for a byte sequence that could be a prefix + * of a previously entered byte sequence. + * + * Comparing by lengths first is for compatibility with old .ucm tools + * like canonucm and rptp2ucm. 
+ */ + if(lexical) { + /* get the minimum length and continue */ + if(l->bLen<=r->bLen) { + length=l->bLen; + } else { + length=r->bLen; + } + } else { + /* compare lengths first */ + result=l->bLen-r->bLen; + if(result!=0) { + return result; + } else { + length=l->bLen; + } + } + + /* get pointers to the byte sequences */ + lb=UCM_GET_BYTES(lTable, l); + rb=UCM_GET_BYTES(rTable, r); + + /* compare the bytes */ + for(i=0; i<length; ++i) { + result=lb[i]-rb[i]; + if(result!=0) { + return result; + } + } + + /* compare the lengths */ + return l->bLen-r->bLen; +} + +/* compare UCMappings for sorting */ +static int32_t +compareMappings(UCMTable *lTable, const UCMapping *l, + UCMTable *rTable, const UCMapping *r, + UBool uFirst) { + int32_t result; + + /* choose which side to compare first */ + if(uFirst) { + /* Unicode then bytes */ + result=compareUnicode(lTable, l, rTable, r); + if(result==0) { + result=compareBytes(lTable, l, rTable, r, false); /* not lexically, like canonucm */ + } + } else { + /* bytes then Unicode */ + result=compareBytes(lTable, l, rTable, r, true); /* lexically, for builder */ + if(result==0) { + result=compareUnicode(lTable, l, rTable, r); + } + } + + if(result!=0) { + return result; + } + + /* compare the flags */ + return l->f-r->f; +} +U_CDECL_BEGIN +/* sorting by Unicode first sorts mappings directly */ +static int32_t U_CALLCONV +compareMappingsUnicodeFirst(const void *context, const void *left, const void *right) { + return compareMappings( + (UCMTable *)context, (const UCMapping *)left, + (UCMTable *)context, (const UCMapping *)right, true); +} + +/* sorting by bytes first sorts the reverseMap; use indirection to mappings */ +static int32_t U_CALLCONV +compareMappingsBytesFirst(const void *context, const void *left, const void *right) { + UCMTable *table=(UCMTable *)context; + int32_t l=*(const int32_t *)left, r=*(const int32_t *)right; + return compareMappings( + table, table->mappings+l, + table, table->mappings+r, false); +} 
+U_CDECL_END + +U_CAPI void U_EXPORT2 +ucm_sortTable(UCMTable *t) { + UErrorCode errorCode; + int32_t i; + + if(t->isSorted) { + return; + } + + errorCode=U_ZERO_ERROR; + + /* 1. sort by Unicode first */ + uprv_sortArray(t->mappings, t->mappingsLength, sizeof(UCMapping), + compareMappingsUnicodeFirst, t, + false, &errorCode); + + /* build the reverseMap */ + if(t->reverseMap==nullptr) { + /* + * allocate mappingsCapacity instead of mappingsLength so that + * if mappings are added, the reverseMap need not be + * reallocated each time + * (see ucm_moveMappings() and ucm_addMapping()) + */ + t->reverseMap=(int32_t *)uprv_malloc(t->mappingsCapacity*sizeof(int32_t)); + if(t->reverseMap==nullptr) { + fprintf(stderr, "ucm error: unable to allocate reverseMap\n"); + exit(U_MEMORY_ALLOCATION_ERROR); + } + } + for(i=0; i<t->mappingsLength; ++i) { + t->reverseMap[i]=i; + } + + /* 2. sort reverseMap by mappings bytes first */ + uprv_sortArray(t->reverseMap, t->mappingsLength, sizeof(int32_t), + compareMappingsBytesFirst, t, + false, &errorCode); + + if(U_FAILURE(errorCode)) { + fprintf(stderr, "ucm error: sortTable()/uprv_sortArray() fails - %s\n", + u_errorName(errorCode)); + exit(errorCode); + } + + t->isSorted=true; +} + +/* + * remove mappings with their move flag set from the base table + * and move some of them (with UCM_MOVE_TO_EXT) to the extension table + */ +U_CAPI void U_EXPORT2 +ucm_moveMappings(UCMTable *base, UCMTable *ext) { + UCMapping *mb, *mbLimit; + int8_t flag; + + mb=base->mappings; + mbLimit=mb+base->mappingsLength; + + while(mb<mbLimit) { + flag=mb->moveFlag; + if(flag!=0) { + /* reset the move flag */ + mb->moveFlag=0; + + if(ext!=nullptr && (flag&UCM_MOVE_TO_EXT)) { + /* add the mapping to the extension table */ + ucm_addMapping(ext, mb, UCM_GET_CODE_POINTS(base, mb), UCM_GET_BYTES(base, mb)); + } + + /* remove this mapping: move the last base mapping down and overwrite the current one */ + if(mb<(mbLimit-1)) { + uprv_memcpy(mb, mbLimit-1, 
sizeof(UCMapping)); + } + --mbLimit; + --base->mappingsLength; + base->isSorted=false; + } else { + ++mb; + } + } +} + +enum { + NEEDS_MOVE=1, + HAS_ERRORS=2 +}; + +static uint8_t +checkBaseExtUnicode(UCMStates *baseStates, UCMTable *base, UCMTable *ext, + UBool moveToExt, UBool intersectBase) { + (void)baseStates; + + UCMapping *mb, *me, *mbLimit, *meLimit; + int32_t cmp; + uint8_t result; + + mb=base->mappings; + mbLimit=mb+base->mappingsLength; + + me=ext->mappings; + meLimit=me+ext->mappingsLength; + + result=0; + + for(;;) { + /* skip irrelevant mappings on both sides */ + for(;;) { + if(mb==mbLimit) { + return result; + } + + if((0<=mb->f && mb->f<=2) || mb->f==4) { + break; + } + + ++mb; + } + + for(;;) { + if(me==meLimit) { + return result; + } + + if((0<=me->f && me->f<=2) || me->f==4) { + break; + } + + ++me; + } + + /* compare the base and extension mappings */ + cmp=compareUnicode(base, mb, ext, me); + if(cmp<0) { + if(intersectBase && (intersectBase!=2 || mb->bLen>1)) { + /* + * mapping in base but not in ext, move it + * + * if ext is DBCS, move DBCS mappings here + * and check SBCS ones for Unicode prefix below + */ + mb->moveFlag|=UCM_MOVE_TO_EXT; + result|=NEEDS_MOVE; + + /* does mb map from an input sequence that is a prefix of me's? 
*/ + } else if( mb->uLen<me->uLen && + 0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen) + ) { + if(moveToExt) { + /* mark this mapping to be moved to the extension table */ + mb->moveFlag|=UCM_MOVE_TO_EXT; + result|=NEEDS_MOVE; + } else { + fprintf(stderr, + "ucm error: the base table contains a mapping whose input sequence\n" + " is a prefix of the input sequence of an extension mapping\n"); + ucm_printMapping(base, mb, stderr); + ucm_printMapping(ext, me, stderr); + result|=HAS_ERRORS; + } + } + + ++mb; + } else if(cmp==0) { + /* + * same output: remove the extension mapping, + * otherwise treat as an error + */ + if( mb->f==me->f && mb->bLen==me->bLen && + 0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen) + ) { + me->moveFlag|=UCM_REMOVE_MAPPING; + result|=NEEDS_MOVE; + } else if(intersectBase) { + /* mapping in base but not in ext, move it */ + mb->moveFlag|=UCM_MOVE_TO_EXT; + result|=NEEDS_MOVE; + } else { + fprintf(stderr, + "ucm error: the base table contains a mapping whose input sequence\n" + " is the same as the input sequence of an extension mapping\n" + " but it maps differently\n"); + ucm_printMapping(base, mb, stderr); + ucm_printMapping(ext, me, stderr); + result|=HAS_ERRORS; + } + + ++mb; + } else /* cmp>0 */ { + ++me; + } + } +} + +static uint8_t +checkBaseExtBytes(UCMStates *baseStates, UCMTable *base, UCMTable *ext, + UBool moveToExt, UBool intersectBase) { + UCMapping *mb, *me; + int32_t *baseMap, *extMap; + int32_t b, e, bLimit, eLimit, cmp; + uint8_t result; + UBool isSISO; + + baseMap=base->reverseMap; + extMap=ext->reverseMap; + + b=e=0; + bLimit=base->mappingsLength; + eLimit=ext->mappingsLength; + + result=0; + + isSISO=(UBool)(baseStates->outputType==MBCS_OUTPUT_2_SISO); + + for(;;) { + /* skip irrelevant mappings on both sides */ + for(;; ++b) { + if(b==bLimit) { + return result; + } + mb=base->mappings+baseMap[b]; + + if(intersectBase==2 && mb->bLen==1) { + /* + * comparing 
a base against a DBCS extension: + * leave SBCS base mappings alone + */ + continue; + } + + if(mb->f==0 || mb->f==3) { + break; + } + } + + for(;;) { + if(e==eLimit) { + return result; + } + me=ext->mappings+extMap[e]; + + if(me->f==0 || me->f==3) { + break; + } + + ++e; + } + + /* compare the base and extension mappings */ + cmp=compareBytes(base, mb, ext, me, true); + if(cmp<0) { + if(intersectBase) { + /* mapping in base but not in ext, move it */ + mb->moveFlag|=UCM_MOVE_TO_EXT; + result|=NEEDS_MOVE; + + /* + * does mb map from an input sequence that is a prefix of me's? + * for SI/SO tables, a single byte is never a prefix because it + * occurs in a separate single-byte state + */ + } else if( mb->bLen<me->bLen && + (!isSISO || mb->bLen>1) && + 0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen) + ) { + if(moveToExt) { + /* mark this mapping to be moved to the extension table */ + mb->moveFlag|=UCM_MOVE_TO_EXT; + result|=NEEDS_MOVE; + } else { + fprintf(stderr, + "ucm error: the base table contains a mapping whose input sequence\n" + " is a prefix of the input sequence of an extension mapping\n"); + ucm_printMapping(base, mb, stderr); + ucm_printMapping(ext, me, stderr); + result|=HAS_ERRORS; + } + } + + ++b; + } else if(cmp==0) { + /* + * same output: remove the extension mapping, + * otherwise treat as an error + */ + if( mb->f==me->f && mb->uLen==me->uLen && + 0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen) + ) { + me->moveFlag|=UCM_REMOVE_MAPPING; + result|=NEEDS_MOVE; + } else if(intersectBase) { + /* mapping in base but not in ext, move it */ + mb->moveFlag|=UCM_MOVE_TO_EXT; + result|=NEEDS_MOVE; + } else { + fprintf(stderr, + "ucm error: the base table contains a mapping whose input sequence\n" + " is the same as the input sequence of an extension mapping\n" + " but it maps differently\n"); + ucm_printMapping(base, mb, stderr); + ucm_printMapping(ext, me, stderr); + result|=HAS_ERRORS; + } + 
+ ++b; + } else /* cmp>0 */ { + ++e; + } + } +} + +U_CAPI UBool U_EXPORT2 +ucm_checkValidity(UCMTable *table, UCMStates *baseStates) { + UCMapping *m, *mLimit; + int32_t count; + UBool isOK; + + m=table->mappings; + mLimit=m+table->mappingsLength; + isOK=true; + + while(m<mLimit) { + count=ucm_countChars(baseStates, UCM_GET_BYTES(table, m), m->bLen); + if(count<1) { + ucm_printMapping(table, m, stderr); + isOK=false; + } + ++m; + } + + return isOK; +} + +U_CAPI UBool U_EXPORT2 +ucm_checkBaseExt(UCMStates *baseStates, + UCMTable *base, UCMTable *ext, UCMTable *moveTarget, + UBool intersectBase) { + uint8_t result; + + /* if we have an extension table, we must always use precision flags */ + if(base->flagsType&UCM_FLAGS_IMPLICIT) { + fprintf(stderr, "ucm error: the base table contains mappings without precision flags\n"); + return false; + } + if(ext->flagsType&UCM_FLAGS_IMPLICIT) { + fprintf(stderr, "ucm error: extension table contains mappings without precision flags\n"); + return false; + } + + /* checking requires both tables to be sorted */ + ucm_sortTable(base); + ucm_sortTable(ext); + + /* check */ + result= + checkBaseExtUnicode(baseStates, base, ext, (UBool)(moveTarget!=nullptr), intersectBase)| + checkBaseExtBytes(baseStates, base, ext, (UBool)(moveTarget!=nullptr), intersectBase); + + if(result&HAS_ERRORS) { + return false; + } + + if(result&NEEDS_MOVE) { + ucm_moveMappings(ext, nullptr); + ucm_moveMappings(base, moveTarget); + ucm_sortTable(base); + ucm_sortTable(ext); + if(moveTarget!=nullptr) { + ucm_sortTable(moveTarget); + } + } + + return true; +} + +/* merge tables for rptp2ucm ------------------------------------------------ */ + +U_CAPI void U_EXPORT2 +ucm_mergeTables(UCMTable *fromUTable, UCMTable *toUTable, + const uint8_t *subchar, int32_t subcharLength, + uint8_t subchar1) { + UCMapping *fromUMapping, *toUMapping; + int32_t fromUIndex, toUIndex, fromUTop, toUTop, cmp; + + ucm_sortTable(fromUTable); + ucm_sortTable(toUTable); + + 
fromUMapping=fromUTable->mappings; + toUMapping=toUTable->mappings; + + fromUTop=fromUTable->mappingsLength; + toUTop=toUTable->mappingsLength; + + fromUIndex=toUIndex=0; + + while(fromUIndex<fromUTop && toUIndex<toUTop) { + cmp=compareMappings(fromUTable, fromUMapping, toUTable, toUMapping, true); + if(cmp==0) { + /* equal: roundtrip, nothing to do (flags are initially 0) */ + ++fromUMapping; + ++toUMapping; + + ++fromUIndex; + ++toUIndex; + } else if(cmp<0) { + /* + * the fromU mapping does not have a toU counterpart: + * fallback Unicode->codepage + */ + if( (fromUMapping->bLen==subcharLength && + 0==uprv_memcmp(UCM_GET_BYTES(fromUTable, fromUMapping), subchar, subcharLength)) || + (subchar1!=0 && fromUMapping->bLen==1 && fromUMapping->b.bytes[0]==subchar1) + ) { + fromUMapping->f=2; /* SUB mapping */ + } else { + fromUMapping->f=1; /* normal fallback */ + } + + ++fromUMapping; + ++fromUIndex; + } else { + /* + * the toU mapping does not have a fromU counterpart: + * (reverse) fallback codepage->Unicode, copy it to the fromU table + */ + + /* ignore reverse fallbacks to Unicode SUB */ + if(!(toUMapping->uLen==1 && (toUMapping->u==0xfffd || toUMapping->u==0x1a))) { + toUMapping->f=3; /* reverse fallback */ + ucm_addMapping(fromUTable, toUMapping, UCM_GET_CODE_POINTS(toUTable, toUMapping), UCM_GET_BYTES(toUTable, toUMapping)); + + /* the table may have been reallocated */ + fromUMapping=fromUTable->mappings+fromUIndex; + } + + ++toUMapping; + ++toUIndex; + } + } + + /* either one or both tables are exhausted */ + while(fromUIndex<fromUTop) { + /* leftover fromU mappings are fallbacks */ + if( (fromUMapping->bLen==subcharLength && + 0==uprv_memcmp(UCM_GET_BYTES(fromUTable, fromUMapping), subchar, subcharLength)) || + (subchar1!=0 && fromUMapping->bLen==1 && fromUMapping->b.bytes[0]==subchar1) + ) { + fromUMapping->f=2; /* SUB mapping */ + } else { + fromUMapping->f=1; /* normal fallback */ + } + + ++fromUMapping; + ++fromUIndex; + } + + while(toUIndex<toUTop) { + 
/* leftover toU mappings are reverse fallbacks */ + + /* ignore reverse fallbacks to Unicode SUB */ + if(!(toUMapping->uLen==1 && (toUMapping->u==0xfffd || toUMapping->u==0x1a))) { + toUMapping->f=3; /* reverse fallback */ + ucm_addMapping(fromUTable, toUMapping, UCM_GET_CODE_POINTS(toUTable, toUMapping), UCM_GET_BYTES(toUTable, toUMapping)); + } + + ++toUMapping; + ++toUIndex; + } + + fromUTable->isSorted=false; +} + +/* separate extension mappings out of base table for rptp2ucm --------------- */ + +U_CAPI UBool U_EXPORT2 +ucm_separateMappings(UCMFile *ucm, UBool isSISO) { + UCMTable *table; + UCMapping *m, *mLimit; + int32_t type; + UBool needsMove, isOK; + + table=ucm->base; + m=table->mappings; + mLimit=m+table->mappingsLength; + + needsMove=false; + isOK=true; + + for(; m<mLimit; ++m) { + if(isSISO && m->bLen==1 && (m->b.bytes[0]==0xe || m->b.bytes[0]==0xf)) { + fprintf(stderr, "warning: removing illegal mapping from an SI/SO-stateful table\n"); + ucm_printMapping(table, m, stderr); + m->moveFlag|=UCM_REMOVE_MAPPING; + needsMove=true; + continue; + } + + type=ucm_mappingType( + &ucm->states, m, + UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m)); + if(type<0) { + /* illegal byte sequence */ + printMapping(m, UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m), stderr); + isOK=false; + } else if(type>0) { + m->moveFlag|=UCM_MOVE_TO_EXT; + needsMove=true; + } + } + + if(!isOK) { + return false; + } + if(needsMove) { + ucm_moveMappings(ucm->base, ucm->ext); + return ucm_checkBaseExt(&ucm->states, ucm->base, ucm->ext, ucm->ext, false); + } else { + ucm_sortTable(ucm->base); + return true; + } +} + +/* ucm parser --------------------------------------------------------------- */ + +U_CAPI int8_t U_EXPORT2 +ucm_parseBytes(uint8_t bytes[UCNV_EXT_MAX_BYTES], const char *line, const char **ps) { + const char *s=*ps; + char *end; + uint8_t byte; + int8_t bLen; + + bLen=0; + for(;;) { + /* skip an optional plus sign */ + if(bLen>0 && *s=='+') { + ++s; + } + 
if(*s!='\\') { + break; + } + + if( s[1]!='x' || + (byte=(uint8_t)uprv_strtoul(s+2, &end, 16), end)!=s+4 + ) { + fprintf(stderr, "ucm error: byte must be formatted as \\xXX (2 hex digits) - \"%s\"\n", line); + return -1; + } + + if(bLen==UCNV_EXT_MAX_BYTES) { + fprintf(stderr, "ucm error: too many bytes on \"%s\"\n", line); + return -1; + } + bytes[bLen++]=byte; + s=end; + } + + *ps=s; + return bLen; +} + +/* parse a mapping line; must not be empty */ +U_CAPI UBool U_EXPORT2 +ucm_parseMappingLine(UCMapping *m, + UChar32 codePoints[UCNV_EXT_MAX_UCHARS], + uint8_t bytes[UCNV_EXT_MAX_BYTES], + const char *line) { + const char *s; + char *end; + UChar32 cp; + int32_t u16Length; + int8_t uLen, bLen, f; + + s=line; + uLen=bLen=0; + + /* parse code points */ + for(;;) { + /* skip an optional plus sign */ + if(uLen>0 && *s=='+') { + ++s; + } + if(*s!='<') { + break; + } + + if( s[1]!='U' || + (cp=(UChar32)uprv_strtoul(s+2, &end, 16), end)==s+2 || + *end!='>' + ) { + fprintf(stderr, "ucm error: Unicode code point must be formatted as <UXXXX> (1..6 hex digits) - \"%s\"\n", line); + return false; + } + if((uint32_t)cp>0x10ffff || U_IS_SURROGATE(cp)) { + fprintf(stderr, "ucm error: Unicode code point must be 0..d7ff or e000..10ffff - \"%s\"\n", line); + return false; + } + + if(uLen==UCNV_EXT_MAX_UCHARS) { + fprintf(stderr, "ucm error: too many code points on \"%s\"\n", line); + return false; + } + codePoints[uLen++]=cp; + s=end+1; + } + + if(uLen==0) { + fprintf(stderr, "ucm error: no Unicode code points on \"%s\"\n", line); + return false; + } else if(uLen==1) { + m->u=codePoints[0]; + } else { + UErrorCode errorCode=U_ZERO_ERROR; + u_strFromUTF32(nullptr, 0, &u16Length, codePoints, uLen, &errorCode); + if( (U_FAILURE(errorCode) && errorCode!=U_BUFFER_OVERFLOW_ERROR) || + u16Length>UCNV_EXT_MAX_UCHARS + ) { + fprintf(stderr, "ucm error: too many UChars on \"%s\"\n", line); + return false; + } + } + + s=u_skipWhitespace(s); + + /* parse bytes */ + bLen=ucm_parseBytes(bytes, 
line, &s); + + if(bLen<0) { + return false; + } else if(bLen==0) { + fprintf(stderr, "ucm error: no bytes on \"%s\"\n", line); + return false; + } else if(bLen<=4) { + uprv_memcpy(m->b.bytes, bytes, bLen); + } + + /* skip everything until the fallback indicator, even the start of a comment */ + for(;;) { + if(*s==0) { + f=-1; /* no fallback indicator */ + break; + } else if(*s=='|') { + f=(int8_t)(s[1]-'0'); + if((uint8_t)f>4) { + fprintf(stderr, "ucm error: fallback indicator must be |0..|4 - \"%s\"\n", line); + return false; + } + break; + } + ++s; + } + + m->uLen=uLen; + m->bLen=bLen; + m->f=f; + return true; +} + +/* general APIs ------------------------------------------------------------- */ + +U_CAPI UCMTable * U_EXPORT2 +ucm_openTable() { + UCMTable *table=(UCMTable *)uprv_malloc(sizeof(UCMTable)); + if(table==nullptr) { + fprintf(stderr, "ucm error: unable to allocate a UCMTable\n"); + exit(U_MEMORY_ALLOCATION_ERROR); + } + + memset(table, 0, sizeof(UCMTable)); + return table; +} + +U_CAPI void U_EXPORT2 +ucm_closeTable(UCMTable *table) { + if(table!=nullptr) { + uprv_free(table->mappings); + uprv_free(table->codePoints); + uprv_free(table->bytes); + uprv_free(table->reverseMap); + uprv_free(table); + } +} + +U_CAPI void U_EXPORT2 +ucm_resetTable(UCMTable *table) { + if(table!=nullptr) { + table->mappingsLength=0; + table->flagsType=0; + table->unicodeMask=0; + table->bytesLength=table->codePointsLength=0; + table->isSorted=false; + } +} + +U_CAPI void U_EXPORT2 +ucm_addMapping(UCMTable *table, + UCMapping *m, + UChar32 codePoints[UCNV_EXT_MAX_UCHARS], + uint8_t bytes[UCNV_EXT_MAX_BYTES]) { + UCMapping *tm; + UChar32 c; + int32_t idx; + + if(table->mappingsLength>=table->mappingsCapacity) { + /* make the mappings array larger */ + if(table->mappingsCapacity==0) { + table->mappingsCapacity=1000; + } else { + table->mappingsCapacity*=10; + } + table->mappings=(UCMapping *)uprv_realloc(table->mappings, + table->mappingsCapacity*sizeof(UCMapping)); + 
if(table->mappings==nullptr) { + fprintf(stderr, "ucm error: unable to allocate %d UCMappings\n", + (int)table->mappingsCapacity); + exit(U_MEMORY_ALLOCATION_ERROR); + } + + if(table->reverseMap!=nullptr) { + /* the reverseMap must be reallocated in a new sort */ + uprv_free(table->reverseMap); + table->reverseMap=nullptr; + } + } + + if(m->uLen>1 && table->codePointsCapacity==0) { + table->codePointsCapacity=10000; + table->codePoints=(UChar32 *)uprv_malloc(table->codePointsCapacity*4); + if(table->codePoints==nullptr) { + fprintf(stderr, "ucm error: unable to allocate %d UChar32s\n", + (int)table->codePointsCapacity); + exit(U_MEMORY_ALLOCATION_ERROR); + } + } + + if(m->bLen>4 && table->bytesCapacity==0) { + table->bytesCapacity=10000; + table->bytes=(uint8_t *)uprv_malloc(table->bytesCapacity); + if(table->bytes==nullptr) { + fprintf(stderr, "ucm error: unable to allocate %d bytes\n", + (int)table->bytesCapacity); + exit(U_MEMORY_ALLOCATION_ERROR); + } + } + + if(m->uLen>1) { + idx=table->codePointsLength; + table->codePointsLength+=m->uLen; + if(table->codePointsLength>table->codePointsCapacity) { + fprintf(stderr, "ucm error: too many code points in multiple-code point mappings\n"); + exit(U_MEMORY_ALLOCATION_ERROR); + } + + uprv_memcpy(table->codePoints+idx, codePoints, (size_t)m->uLen*4); + m->u=idx; + } + + if(m->bLen>4) { + idx=table->bytesLength; + table->bytesLength+=m->bLen; + if(table->bytesLength>table->bytesCapacity) { + fprintf(stderr, "ucm error: too many bytes in mappings with >4 charset bytes\n"); + exit(U_MEMORY_ALLOCATION_ERROR); + } + + uprv_memcpy(table->bytes+idx, bytes, m->bLen); + m->b.idx=idx; + } + + /* set unicodeMask */ + for(idx=0; idx<m->uLen; ++idx) { + c=codePoints[idx]; + if(c>=0x10000) { + table->unicodeMask|=UCNV_HAS_SUPPLEMENTARY; /* there are supplementary code points */ + } else if(U_IS_SURROGATE(c)) { + table->unicodeMask|=UCNV_HAS_SURROGATES; /* there are surrogate code points */ + } + } + + /* set flagsType */ + if(m->f<0) 
{ + table->flagsType|=UCM_FLAGS_IMPLICIT; + } else { + table->flagsType|=UCM_FLAGS_EXPLICIT; + } + + tm=table->mappings+table->mappingsLength++; + uprv_memcpy(tm, m, sizeof(UCMapping)); + + table->isSorted=false; +} + +U_CAPI UCMFile * U_EXPORT2 +ucm_open() { + UCMFile *ucm=(UCMFile *)uprv_malloc(sizeof(UCMFile)); + if(ucm==nullptr) { + fprintf(stderr, "ucm error: unable to allocate a UCMFile\n"); + exit(U_MEMORY_ALLOCATION_ERROR); + } + + memset(ucm, 0, sizeof(UCMFile)); + + ucm->base=ucm_openTable(); + ucm->ext=ucm_openTable(); + + ucm->states.stateFlags[0]=MBCS_STATE_FLAG_DIRECT; + ucm->states.conversionType=UCNV_UNSUPPORTED_CONVERTER; + ucm->states.outputType=-1; + ucm->states.minCharLength=ucm->states.maxCharLength=1; + + return ucm; +} + +U_CAPI void U_EXPORT2 +ucm_close(UCMFile *ucm) { + if(ucm!=nullptr) { + ucm_closeTable(ucm->base); + ucm_closeTable(ucm->ext); + uprv_free(ucm); + } +} + +U_CAPI int32_t U_EXPORT2 +ucm_mappingType(UCMStates *baseStates, + UCMapping *m, + UChar32 codePoints[UCNV_EXT_MAX_UCHARS], + uint8_t bytes[UCNV_EXT_MAX_BYTES]) { + (void)codePoints; + /* check validity of the bytes and count the characters in them */ + int32_t count=ucm_countChars(baseStates, bytes, m->bLen); + if(count<1) { + /* illegal byte sequence */ + return -1; + } + + /* + * Suitable for an ICU conversion base table means: + * - a 1:1 mapping (1 Unicode code point : 1 byte sequence) + * - precision flag 0..3 + * - SBCS: any 1:1 mapping + * (the table stores additional bits to distinguish mapping types) + * - MBCS: not a |2 SUB mapping for <subchar1> + * - MBCS: not a |1 fallback to 0x00 + * - MBCS: not a multi-byte mapping with leading 0x00 bytes + * + * Further restrictions for fromUnicode tables + * are enforced in makeconv (MBCSOkForBaseFromUnicode()). 
+ * + * All of the MBCS fromUnicode specific tests could be removed from here, + * but the ones above are for unusual mappings, and removing the tests + * from here would change canonucm output which seems gratuitous. + * (Markus Scherer 2006-nov-28) + * + * Exception: All implicit mappings (f<0) that need to be moved + * because of fromUnicode restrictions _must_ be moved here because + * makeconv uses a hack for moving mappings only for the fromUnicode table + * that only works with non-negative values of f. + */ + if( m->uLen==1 && count==1 && m->f<=3 && + (baseStates->maxCharLength==1 || + !((m->f==2 && m->bLen==1) || + (m->f==1 && bytes[0]==0) || + (m->f<=1 && m->bLen>1 && bytes[0]==0))) + ) { + return 0; /* suitable for a base table */ + } else { + return 1; /* needs to go into an extension table */ + } +} + +U_CAPI UBool U_EXPORT2 +ucm_addMappingAuto(UCMFile *ucm, UBool forBase, UCMStates *baseStates, + UCMapping *m, + UChar32 codePoints[UCNV_EXT_MAX_UCHARS], + uint8_t bytes[UCNV_EXT_MAX_BYTES]) { + int32_t type; + + if(m->f==2 && m->uLen>1) { + fprintf(stderr, "ucm error: illegal <subchar1> |2 mapping from multiple code points\n"); + printMapping(m, codePoints, bytes, stderr); + return false; + } + + if(baseStates!=nullptr) { + /* check validity of the bytes and count the characters in them */ + type=ucm_mappingType(baseStates, m, codePoints, bytes); + if(type<0) { + /* illegal byte sequence */ + printMapping(m, codePoints, bytes, stderr); + return false; + } + } else { + /* not used - adding a mapping for an extension-only table before its base table is read */ + type=1; + } + + /* + * Add the mapping to the base table if this is requested and suitable. + * Otherwise, add it to the extension table. 
+ */ + if(forBase && type==0) { + ucm_addMapping(ucm->base, m, codePoints, bytes); + } else { + ucm_addMapping(ucm->ext, m, codePoints, bytes); + } + + return true; +} + +U_CAPI UBool U_EXPORT2 +ucm_addMappingFromLine(UCMFile *ucm, const char *line, UBool forBase, UCMStates *baseStates) { + UCMapping m={ 0, {0}, 0, 0, 0, 0 }; + UChar32 codePoints[UCNV_EXT_MAX_UCHARS]; + uint8_t bytes[UCNV_EXT_MAX_BYTES]; + + const char *s; + + /* ignore empty and comment lines */ + if(line[0]=='#' || *(s=u_skipWhitespace(line))==0 || *s=='\n' || *s=='\r') { + return true; + } + + return + ucm_parseMappingLine(&m, codePoints, bytes, line) && + ucm_addMappingAuto(ucm, forBase, baseStates, &m, codePoints, bytes); +} + +U_CAPI void U_EXPORT2 +ucm_readTable(UCMFile *ucm, FileStream* convFile, + UBool forBase, UCMStates *baseStates, + UErrorCode *pErrorCode) { + char line[500]; + char *end; + UBool isOK; + + if(U_FAILURE(*pErrorCode)) { + return; + } + + isOK=true; + + for(;;) { + /* read the next line */ + if(!T_FileStream_readLine(convFile, line, sizeof(line))) { + fprintf(stderr, "incomplete charmap section\n"); + isOK=false; + break; + } + + /* remove CR LF */ + end=uprv_strchr(line, 0); + while(line<end && (*(end-1)=='\r' || *(end-1)=='\n')) { + --end; + } + *end=0; + + /* ignore empty and comment lines */ + if(line[0]==0 || line[0]=='#') { + continue; + } + + /* stop at the end of the mapping table */ + if(0==uprv_strcmp(line, "END CHARMAP")) { + break; + } + + isOK&=ucm_addMappingFromLine(ucm, line, forBase, baseStates); + } + + if(!isOK) { + *pErrorCode=U_INVALID_TABLE_FORMAT; + } +} +#endif diff --git a/intl/icu/source/tools/toolutil/ucm.h b/intl/icu/source/tools/toolutil/ucm.h new file mode 100644 index 0000000000..8ea90604d4 --- /dev/null +++ b/intl/icu/source/tools/toolutil/ucm.h @@ -0,0 +1,302 @@ +// © 2016 and later: Unicode, Inc. and others. 
+// License & terms of use: http://www.unicode.org/copyright.html +/* + ******************************************************************************* + * Copyright (C) 2003-2013, International Business Machines + * Corporation and others. All Rights Reserved. + ******************************************************************************* + * file name: ucm.h + * encoding: UTF-8 + * tab size: 8 (not used) + * indentation:4 + * + * created on: 2003jun20 + * created by: Markus W. Scherer + * + * Definitions for the .ucm file parser and handler module ucm.c. + */ + +#ifndef __UCM_H__ +#define __UCM_H__ + +#include "unicode/utypes.h" +#include "ucnvmbcs.h" +#include "ucnv_ext.h" +#include "filestrm.h" +#include <stdio.h> + +#if !UCONFIG_NO_CONVERSION + +U_CDECL_BEGIN + +/* constants for UCMapping.moveFlag */ +enum { + UCM_MOVE_TO_EXT=1, + UCM_REMOVE_MAPPING=2 +}; + +/* + * Per-mapping data structure + * + * u if uLen==1: Unicode code point + * else index to uLen code points + * b if bLen<=4: up to 4 bytes + * else index to bLen bytes + * uLen number of code points + * bLen number of bytes + * moveFlag 0 if the mapping stays in its table, otherwise a bit set of + * UCM_MOVE_TO_EXT and UCM_REMOVE_MAPPING (see ucm_moveMappings()) + * f flag for roundtrip (0), fallback (1), sub mapping (2), reverse fallback (3) + * or "good one-way" mapping (4). 
+ * Same values as in the source file after | + */ +typedef struct UCMapping { + UChar32 u; + union { + uint32_t idx; + uint8_t bytes[4]; + } b; + int8_t uLen, bLen, f, moveFlag; +} UCMapping; + +/* constants for UCMTable.flagsType */ +enum { + UCM_FLAGS_INITIAL, /* no mappings parsed yet */ + UCM_FLAGS_EXPLICIT, /* .ucm file has mappings with | fallback indicators */ + UCM_FLAGS_IMPLICIT, /* .ucm file has mappings without | fallback indicators, later wins */ + UCM_FLAGS_MIXED /* both implicit and explicit */ +}; + +typedef struct UCMTable { + UCMapping *mappings; + int32_t mappingsCapacity, mappingsLength; + + UChar32 *codePoints; + int32_t codePointsCapacity, codePointsLength; + + uint8_t *bytes; + int32_t bytesCapacity, bytesLength; + + /* index map for mapping by bytes first */ + int32_t *reverseMap; + + uint8_t unicodeMask; + int8_t flagsType; /* UCM_FLAGS_INITIAL etc. */ + UBool isSorted; +} UCMTable; + +enum { + MBCS_STATE_FLAG_DIRECT=1, + MBCS_STATE_FLAG_SURROGATES, + + MBCS_STATE_FLAG_READY=16 +}; + +typedef struct UCMStates { + int32_t stateTable[MBCS_MAX_STATE_COUNT][256]; + uint32_t stateFlags[MBCS_MAX_STATE_COUNT], + stateOffsetSum[MBCS_MAX_STATE_COUNT]; + + int32_t countStates, minCharLength, maxCharLength, countToUCodeUnits; + int8_t conversionType, outputType; +} UCMStates; + +typedef struct UCMFile { + UCMTable *base, *ext; + UCMStates states; + + char baseName[UCNV_MAX_CONVERTER_NAME_LENGTH]; +} UCMFile; + +/* simple accesses ---------------------------------------------------------- */ + +#define UCM_GET_CODE_POINTS(t, m) \ + (((m)->uLen==1) ? &(m)->u : (t)->codePoints+(m)->u) + +#define UCM_GET_BYTES(t, m) \ + (((m)->bLen<=4) ? 
(m)->b.bytes : (t)->bytes+(m)->b.idx) + +/* APIs --------------------------------------------------------------------- */ + +U_CAPI UCMFile * U_EXPORT2 +ucm_open(void); + +U_CAPI void U_EXPORT2 +ucm_close(UCMFile *ucm); + +U_CAPI UBool U_EXPORT2 +ucm_parseHeaderLine(UCMFile *ucm, + char *line, char **pKey, char **pValue); + +/* @return -1 illegal bytes 0 suitable for base table 1 needs to go into extension table */ +U_CAPI int32_t U_EXPORT2 +ucm_mappingType(UCMStates *baseStates, + UCMapping *m, + UChar32 codePoints[UCNV_EXT_MAX_UCHARS], + uint8_t bytes[UCNV_EXT_MAX_BYTES]); + +/* add a mapping to the base or extension table as appropriate */ +U_CAPI UBool U_EXPORT2 +ucm_addMappingAuto(UCMFile *ucm, UBool forBase, UCMStates *baseStates, + UCMapping *m, + UChar32 codePoints[UCNV_EXT_MAX_UCHARS], + uint8_t bytes[UCNV_EXT_MAX_BYTES]); + +U_CAPI UBool U_EXPORT2 +ucm_addMappingFromLine(UCMFile *ucm, const char *line, UBool forBase, UCMStates *baseStates); + + +U_CAPI UCMTable * U_EXPORT2 +ucm_openTable(void); + +U_CAPI void U_EXPORT2 +ucm_closeTable(UCMTable *table); + +U_CAPI void U_EXPORT2 +ucm_resetTable(UCMTable *table); + +U_CAPI void U_EXPORT2 +ucm_sortTable(UCMTable *t); + +/* + * Remove mappings with their move flag set from the base table + * and move some of them (with UCM_MOVE_TO_EXT) to the extension table. + */ +U_CAPI void U_EXPORT2 +ucm_moveMappings(UCMTable *base, UCMTable *ext); + +/** + * Read a table from a .ucm file, from after the CHARMAP line to + * including the END CHARMAP line. + */ +U_CAPI void U_EXPORT2 +ucm_readTable(UCMFile *ucm, FileStream* convFile, + UBool forBase, UCMStates *baseStates, + UErrorCode *pErrorCode); + +/** + * Check the validity of mappings against a base table's states; + * necessary for extension-only tables that were read before their base tables. + */ +U_CAPI UBool U_EXPORT2 +ucm_checkValidity(UCMTable *ext, UCMStates *baseStates); + +/** + * Check a base table against an extension table. 
+ * Set the moveTarget!=NULL if it is possible to move mappings from the base. + * This is the case where base and extension tables are parsed from a single file + * (moveTarget==ext) + * or when delta file mappings are subtracted from a base table. + * + * When a base table cannot be modified because a delta file is parsed in makeconv, + * then set moveTarget=NULL. + * + * if(intersectBase) then mappings that exist in the base table but not in + * the extension table are moved to moveTarget instead of showing an error. + * + * Special mode: + * If intersectBase==2 for a DBCS extension table, then SBCS mappings are + * not moved out of the base unless their Unicode input requires it. + * This helps ucmkbase generate base tables for DBCS-only extension .cnv files. + * + * For both tables in the same file, the extension table is automatically + * built. + * For separate files, the extension file can use a complete mapping table (.ucm file), + * so that common mappings need not be stripped out manually. + * + * + * Sort both tables, and then for each mapping direction: + * + * If intersectBase is true and the base table contains a mapping + * that does not exist in the extension table, then this mapping is moved + * to moveTarget. 
+ * + * - otherwise - + * + * If the base table contains a mapping for which the input sequence is + * the same as the extension input, then + * - if the output is the same: remove the extension mapping + * - else: error + * + * If the base table contains a mapping for which the input sequence is + * a prefix of the extension input, then + * - if moveTarget!=NULL: move the base mapping to the moveTarget table + * - else: error + * + * @return false in case of an irreparable error + */ +U_CAPI UBool U_EXPORT2 +ucm_checkBaseExt(UCMStates *baseStates, UCMTable *base, UCMTable *ext, + UCMTable *moveTarget, UBool intersectBase); + +U_CAPI void U_EXPORT2 +ucm_printTable(UCMTable *table, FILE *f, UBool byUnicode); + +U_CAPI void U_EXPORT2 +ucm_printMapping(UCMTable *table, UCMapping *m, FILE *f); + + +U_CAPI void U_EXPORT2 +ucm_addState(UCMStates *states, const char *s); + +U_CAPI void U_EXPORT2 +ucm_processStates(UCMStates *states, UBool ignoreSISOCheck); + +U_CAPI int32_t U_EXPORT2 +ucm_countChars(UCMStates *states, + const uint8_t *bytes, int32_t length); + + +U_CAPI int8_t U_EXPORT2 +ucm_parseBytes(uint8_t bytes[UCNV_EXT_MAX_BYTES], const char *line, const char **ps); + +U_CAPI UBool U_EXPORT2 +ucm_parseMappingLine(UCMapping *m, + UChar32 codePoints[UCNV_EXT_MAX_UCHARS], + uint8_t bytes[UCNV_EXT_MAX_BYTES], + const char *line); + +U_CAPI void U_EXPORT2 +ucm_addMapping(UCMTable *table, + UCMapping *m, + UChar32 codePoints[UCNV_EXT_MAX_UCHARS], + uint8_t bytes[UCNV_EXT_MAX_BYTES]); + +/* very makeconv-specific functions ----------------------------------------- */ + +/* finalize and optimize states after the toUnicode mappings are processed */ +U_CAPI void U_EXPORT2 +ucm_optimizeStates(UCMStates *states, + uint16_t **pUnicodeCodeUnits, + _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks, + UBool verbose); + +/* moved here because it is used inside ucmstate.c */ +U_CAPI int32_t U_EXPORT2 +ucm_findFallback(_MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks, 
+ uint32_t offset); + +/* very rptp2ucm-specific functions ----------------------------------------- */ + +/* + * Input: Separate tables with mappings from/to Unicode, + * subchar and subchar1 (0 if none). + * All mappings must have flag 0. + * + * Output: fromUTable will contain the union of mappings with the correct + * precision flags, and be sorted. + */ +U_CAPI void U_EXPORT2 +ucm_mergeTables(UCMTable *fromUTable, UCMTable *toUTable, + const uint8_t *subchar, int32_t subcharLength, + uint8_t subchar1); + +U_CAPI UBool U_EXPORT2 +ucm_separateMappings(UCMFile *ucm, UBool isSISO); + +U_CDECL_END + +#endif + +#endif + diff --git a/intl/icu/source/tools/toolutil/ucmstate.cpp b/intl/icu/source/tools/toolutil/ucmstate.cpp new file mode 100644 index 0000000000..08782f68d1 --- /dev/null +++ b/intl/icu/source/tools/toolutil/ucmstate.cpp @@ -0,0 +1,1053 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2003-2012, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: ucmstate.c +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2003oct09 +* created by: Markus W. Scherer +* +* This file handles ICU .ucm file state information as part of the ucm module. +* Most of this code used to be in makeconv.c. 
+*/ + +#include "unicode/utypes.h" +#include "cstring.h" +#include "cmemory.h" +#include "uarrsort.h" +#include "ucnvmbcs.h" +#include "ucnv_ext.h" +#include "uparse.h" +#include "ucm.h" +#include <stdio.h> + +#if !UCONFIG_NO_CONVERSION + +/* MBCS state handling ------------------------------------------------------ */ + +/* + * state table row grammar (ebnf-style): + * (whitespace is allowed between all tokens) + * + * row=[[firstentry ','] entry (',' entry)*] + * firstentry="initial" | "surrogates" + * (initial state (default for state 0), output is all surrogate pairs) + * entry=range [':' nextstate] ['.' action] + * range=number ['-' number] + * nextstate=number + * (0..7f) + * action='u' | 's' | 'p' | 'i' + * (unassigned, state change only, surrogate pair, illegal) + * number=(1- or 2-digit hexadecimal number) + */ +static const char * +parseState(const char *s, int32_t state[256], uint32_t *pFlags) { + const char *t; + uint32_t start, end, i; + int32_t entry; + + /* initialize the state: all illegal with U+ffff */ + for(i=0; i<256; ++i) { + state[i]=MBCS_ENTRY_FINAL(0, MBCS_STATE_ILLEGAL, 0xffff); + } + + /* skip leading white space */ + s=u_skipWhitespace(s); + + /* is there an "initial" or "surrogates" directive? 
*/ + if(uprv_strncmp("initial", s, 7)==0) { + *pFlags=MBCS_STATE_FLAG_DIRECT; + s=u_skipWhitespace(s+7); + if(*s++!=',') { + return s-1; + } + } else if(*pFlags==0 && uprv_strncmp("surrogates", s, 10)==0) { + *pFlags=MBCS_STATE_FLAG_SURROGATES; + s=u_skipWhitespace(s+10); + if(*s++!=',') { + return s-1; + } + } else if(*s==0) { + /* empty state row: all-illegal */ + return nullptr; + } + + for(;;) { + /* read an entry, the start of the range first */ + s=u_skipWhitespace(s); + start=uprv_strtoul(s, (char **)&t, 16); + if(s==t || 0xff<start) { + return s; + } + s=u_skipWhitespace(t); + + /* read the end of the range if there is one */ + if(*s=='-') { + s=u_skipWhitespace(s+1); + end=uprv_strtoul(s, (char **)&t, 16); + if(s==t || end<start || 0xff<end) { + return s; + } + s=u_skipWhitespace(t); + } else { + end=start; + } + + /* determine the state entry for this range */ + if(*s!=':' && *s!='.') { + /* the default is: final state with valid entries */ + entry=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_16, 0); + } else { + entry=MBCS_ENTRY_TRANSITION(0, 0); + if(*s==':') { + /* get the next state, default to 0 */ + s=u_skipWhitespace(s+1); + i=uprv_strtoul(s, (char **)&t, 16); + if(s!=t) { + if(0x7f<i) { + return s; + } + s=u_skipWhitespace(t); + entry=MBCS_ENTRY_SET_STATE(entry, i); + } + } + + /* get the state action, default to valid */ + if(*s=='.') { + /* this is a final state */ + entry=MBCS_ENTRY_SET_FINAL(entry); + + s=u_skipWhitespace(s+1); + if(*s=='u') { + /* unassigned set U+fffe */ + entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_UNASSIGNED, 0xfffe); + s=u_skipWhitespace(s+1); + } else if(*s=='p') { + if(*pFlags!=MBCS_STATE_FLAG_DIRECT) { + entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_VALID_16_PAIR); + } else { + entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_VALID_16); + } + s=u_skipWhitespace(s+1); + } else if(*s=='s') { + entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_CHANGE_ONLY); + s=u_skipWhitespace(s+1); + } else if(*s=='i') 
{ + /* illegal set U+ffff */ + entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_ILLEGAL, 0xffff); + s=u_skipWhitespace(s+1); + } else { + /* default to valid */ + entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_VALID_16); + } + } else { + /* this is an intermediate state, nothing to do */ + } + } + + /* adjust "final valid" states according to the state flags */ + if(MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16) { + switch(*pFlags) { + case 0: + /* no adjustment */ + break; + case MBCS_STATE_FLAG_DIRECT: + /* set the valid-direct code point to "unassigned"==0xfffe */ + entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_VALID_DIRECT_16, 0xfffe); + break; + case MBCS_STATE_FLAG_SURROGATES: + entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_VALID_16_PAIR, 0); + break; + default: + break; + } + } + + /* set this entry for the range */ + for(i=start; i<=end; ++i) { + state[i]=entry; + } + + if(*s==',') { + ++s; + } else { + return *s==0 ? nullptr : s; + } + } +} + +U_CAPI void U_EXPORT2 +ucm_addState(UCMStates *states, const char *s) { + const char *error; + + if(states->countStates==MBCS_MAX_STATE_COUNT) { + fprintf(stderr, "ucm error: too many states (maximum %u)\n", MBCS_MAX_STATE_COUNT); + exit(U_INVALID_TABLE_FORMAT); + } + + error=parseState(s, states->stateTable[states->countStates], + &states->stateFlags[states->countStates]); + if(error!=nullptr) { + fprintf(stderr, "ucm error: parse error in state definition at '%s'\n", error); + exit(U_INVALID_TABLE_FORMAT); + } + + ++states->countStates; +} + +U_CAPI UBool U_EXPORT2 +ucm_parseHeaderLine(UCMFile *ucm, + char *line, char **pKey, char **pValue) { + UCMStates *states; + char *s, *end; + char c; + + states=&ucm->states; + + /* remove comments and trailing CR and LF and remove whitespace from the end */ + for(end=line; (c=*end)!=0; ++end) { + if(c=='#' || c=='\r' || c=='\n') { + break; + } + } + while(end>line && (*(end-1)==' ' || *(end-1)=='\t')) { + --end; + } + *end=0; + + /* 
skip leading white space and ignore empty lines */ + s=(char *)u_skipWhitespace(line); + if(*s==0) { + return true; + } + + /* stop at the beginning of the mapping section */ + if(uprv_memcmp(s, "CHARMAP", 7)==0) { + return false; + } + + /* get the key name, bracketed in <> */ + if(*s!='<') { + fprintf(stderr, "ucm error: no header field <key> in line \"%s\"\n", line); + exit(U_INVALID_TABLE_FORMAT); + } + *pKey=++s; + while(*s!='>') { + if(*s==0) { + fprintf(stderr, "ucm error: incomplete header field <key> in line \"%s\"\n", line); + exit(U_INVALID_TABLE_FORMAT); + } + ++s; + } + *s=0; + + /* get the value string, possibly quoted */ + s=(char *)u_skipWhitespace(s+1); + if(*s!='"') { + *pValue=s; + } else { + /* remove the quotes */ + *pValue=s+1; + if(end>*pValue && *(end-1)=='"') { + *--end=0; + } + } + + /* collect the information from the header field, ignore unknown keys */ + if(uprv_strcmp(*pKey, "uconv_class")==0) { + if(uprv_strcmp(*pValue, "DBCS")==0) { + states->conversionType=UCNV_DBCS; + } else if(uprv_strcmp(*pValue, "SBCS")==0) { + states->conversionType = UCNV_SBCS; + } else if(uprv_strcmp(*pValue, "MBCS")==0) { + states->conversionType = UCNV_MBCS; + } else if(uprv_strcmp(*pValue, "EBCDIC_STATEFUL")==0) { + states->conversionType = UCNV_EBCDIC_STATEFUL; + } else { + fprintf(stderr, "ucm error: unknown <uconv_class> %s\n", *pValue); + exit(U_INVALID_TABLE_FORMAT); + } + return true; + } else if(uprv_strcmp(*pKey, "mb_cur_max")==0) { + c=**pValue; + if('1'<=c && c<='4' && (*pValue)[1]==0) { + states->maxCharLength=(int8_t)(c-'0'); + states->outputType=(int8_t)(states->maxCharLength-1); + } else { + fprintf(stderr, "ucm error: illegal <mb_cur_max> %s\n", *pValue); + exit(U_INVALID_TABLE_FORMAT); + } + return true; + } else if(uprv_strcmp(*pKey, "mb_cur_min")==0) { + c=**pValue; + if('1'<=c && c<='4' && (*pValue)[1]==0) { + states->minCharLength=(int8_t)(c-'0'); + } else { + fprintf(stderr, "ucm error: illegal <mb_cur_min> %s\n", *pValue); + 
exit(U_INVALID_TABLE_FORMAT); + } + return true; + } else if(uprv_strcmp(*pKey, "icu:state")==0) { + /* if an SBCS/DBCS/EBCDIC_STATEFUL converter has icu:state, then turn it into MBCS */ + switch(states->conversionType) { + case UCNV_SBCS: + case UCNV_DBCS: + case UCNV_EBCDIC_STATEFUL: + states->conversionType=UCNV_MBCS; + break; + case UCNV_MBCS: + break; + default: + fprintf(stderr, "ucm error: <icu:state> entry for non-MBCS table or before the <uconv_class> line\n"); + exit(U_INVALID_TABLE_FORMAT); + } + + if(states->maxCharLength==0) { + fprintf(stderr, "ucm error: <icu:state> before the <mb_cur_max> line\n"); + exit(U_INVALID_TABLE_FORMAT); + } + ucm_addState(states, *pValue); + return true; + } else if(uprv_strcmp(*pKey, "icu:base")==0) { + if(**pValue==0) { + fprintf(stderr, "ucm error: <icu:base> without a base table name\n"); + exit(U_INVALID_TABLE_FORMAT); + } + uprv_strcpy(ucm->baseName, *pValue); + return true; + } + + return false; +} + +/* post-processing ---------------------------------------------------------- */ + +static int32_t +sumUpStates(UCMStates *states) { + int32_t entry, sum, state, cell, count; + UBool allStatesReady; + + /* + * Sum up the offsets for all states. + * In each final state (where there are only final entries), + * the offsets add up directly. + * In all other state table rows, for each transition entry to another state, + * the offsets sum of that state needs to be added. + * This is achieved in at most countStates iterations. 
+ */ + allStatesReady=false; + for(count=states->countStates; !allStatesReady && count>=0; --count) { + allStatesReady=true; + for(state=states->countStates-1; state>=0; --state) { + if(!(states->stateFlags[state]&MBCS_STATE_FLAG_READY)) { + allStatesReady=false; + sum=0; + + /* at first, add up only the final delta offsets to keep them <512 */ + for(cell=0; cell<256; ++cell) { + entry=states->stateTable[state][cell]; + if(MBCS_ENTRY_IS_FINAL(entry)) { + switch(MBCS_ENTRY_FINAL_ACTION(entry)) { + case MBCS_STATE_VALID_16: + states->stateTable[state][cell]=MBCS_ENTRY_FINAL_SET_VALUE(entry, sum); + sum+=1; + break; + case MBCS_STATE_VALID_16_PAIR: + states->stateTable[state][cell]=MBCS_ENTRY_FINAL_SET_VALUE(entry, sum); + sum+=2; + break; + default: + /* no addition */ + break; + } + } + } + + /* now, add up the delta offsets for the transitional entries */ + for(cell=0; cell<256; ++cell) { + entry=states->stateTable[state][cell]; + if(MBCS_ENTRY_IS_TRANSITION(entry)) { + if(states->stateFlags[MBCS_ENTRY_TRANSITION_STATE(entry)]&MBCS_STATE_FLAG_READY) { + states->stateTable[state][cell]=MBCS_ENTRY_TRANSITION_SET_OFFSET(entry, sum); + sum+=states->stateOffsetSum[MBCS_ENTRY_TRANSITION_STATE(entry)]; + } else { + /* that next state does not have a sum yet, we cannot finish the one for this state */ + sum=-1; + break; + } + } + } + + if(sum!=-1) { + states->stateOffsetSum[state]=sum; + states->stateFlags[state]|=MBCS_STATE_FLAG_READY; + } + } + } + } + + if(!allStatesReady) { + fprintf(stderr, "ucm error: the state table contains loops\n"); + exit(U_INVALID_TABLE_FORMAT); + } + + /* + * For all "direct" (i.e., initial) states>0, + * the offsets need to be increased by the sum of + * the previous initial states. 
+ */ + sum=states->stateOffsetSum[0]; + for(state=1; state<states->countStates; ++state) { + if((states->stateFlags[state]&0xf)==MBCS_STATE_FLAG_DIRECT) { + int32_t sum2=sum; + sum+=states->stateOffsetSum[state]; + for(cell=0; cell<256; ++cell) { + entry=states->stateTable[state][cell]; + if(MBCS_ENTRY_IS_TRANSITION(entry)) { + states->stateTable[state][cell]=MBCS_ENTRY_TRANSITION_ADD_OFFSET(entry, sum2); + } + } + } + } + + /* round up to the next even number to have the following data 32-bit-aligned */ + return states->countToUCodeUnits=(sum+1)&~1; +} + +U_CAPI void U_EXPORT2 +ucm_processStates(UCMStates *states, UBool ignoreSISOCheck) { + int32_t entry, state, cell, count; + + if(states->conversionType==UCNV_UNSUPPORTED_CONVERTER) { + fprintf(stderr, "ucm error: missing conversion type (<uconv_class>)\n"); + exit(U_INVALID_TABLE_FORMAT); + } + + if(states->countStates==0) { + switch(states->conversionType) { + case UCNV_SBCS: + /* SBCS: use MBCS data structure with a default state table */ + if(states->maxCharLength!=1) { + fprintf(stderr, "error: SBCS codepage with max B/char!=1\n"); + exit(U_INVALID_TABLE_FORMAT); + } + states->conversionType=UCNV_MBCS; + ucm_addState(states, "0-ff"); + break; + case UCNV_MBCS: + fprintf(stderr, "ucm error: missing state table information (<icu:state>) for MBCS\n"); + exit(U_INVALID_TABLE_FORMAT); + break; + case UCNV_EBCDIC_STATEFUL: + /* EBCDIC_STATEFUL: use MBCS data structure with a default state table */ + if(states->minCharLength!=1 || states->maxCharLength!=2) { + fprintf(stderr, "error: DBCS codepage with min B/char!=1 or max B/char!=2\n"); + exit(U_INVALID_TABLE_FORMAT); + } + states->conversionType=UCNV_MBCS; + ucm_addState(states, "0-ff, e:1.s, f:0.s"); + ucm_addState(states, "initial, 0-3f:4, e:1.s, f:0.s, 40:3, 41-fe:2, ff:4"); + ucm_addState(states, "0-40:1.i, 41-fe:1., ff:1.i"); + ucm_addState(states, "0-ff:1.i, 40:1."); + ucm_addState(states, "0-ff:1.i"); + break; + case UCNV_DBCS: + /* DBCS: use MBCS data 
structure with a default state table */ + if(states->minCharLength!=2 || states->maxCharLength!=2) { + fprintf(stderr, "error: DBCS codepage with min or max B/char!=2\n"); + exit(U_INVALID_TABLE_FORMAT); + } + states->conversionType = UCNV_MBCS; + ucm_addState(states, "0-3f:3, 40:2, 41-fe:1, ff:3"); + ucm_addState(states, "41-fe"); + ucm_addState(states, "40"); + ucm_addState(states, ""); + break; + default: + fprintf(stderr, "ucm error: unknown charset structure\n"); + exit(U_INVALID_TABLE_FORMAT); + break; + } + } + + /* + * check that the min/max character lengths are reasonable; + * to do this right, all paths through the state table would have to be + * recursively walked while keeping track of the sequence lengths, + * but these simple checks cover most state tables in practice + */ + if(states->maxCharLength<states->minCharLength) { + fprintf(stderr, "ucm error: max B/char < min B/char\n"); + exit(U_INVALID_TABLE_FORMAT); + } + + /* count non-direct states and compare with max B/char */ + count=0; + for(state=0; state<states->countStates; ++state) { + if((states->stateFlags[state]&0xf)!=MBCS_STATE_FLAG_DIRECT) { + ++count; + } + } + if(states->maxCharLength>count+1) { + fprintf(stderr, "ucm error: max B/char too large\n"); + exit(U_INVALID_TABLE_FORMAT); + } + + if(states->minCharLength==1) { + int32_t action; + + /* + * if there are single-byte characters, + * then the initial state must have direct result states + */ + for(cell=0; cell<256; ++cell) { + entry=states->stateTable[0][cell]; + if( MBCS_ENTRY_IS_FINAL(entry) && + ((action=MBCS_ENTRY_FINAL_ACTION(entry))==MBCS_STATE_VALID_DIRECT_16 || + action==MBCS_STATE_UNASSIGNED) + ) { + break; + } + } + + if(cell==256) { + fprintf(stderr, "ucm warning: min B/char too small\n"); + } + } + + /* + * make sure that all "next state" values are within limits + * and that all next states after final ones have the "direct" + * flag of initial states + */ + for(state=states->countStates-1; state>=0; --state) { + 
for(cell=0; cell<256; ++cell) { + entry=states->stateTable[state][cell]; + if((uint8_t)MBCS_ENTRY_STATE(entry)>=states->countStates) { + fprintf(stderr, "ucm error: state table entry [%x][%x] has a next state of %x that is too high\n", + (int)state, (int)cell, (int)MBCS_ENTRY_STATE(entry)); + exit(U_INVALID_TABLE_FORMAT); + } + if(MBCS_ENTRY_IS_FINAL(entry) && (states->stateFlags[MBCS_ENTRY_STATE(entry)]&0xf)!=MBCS_STATE_FLAG_DIRECT) { + fprintf(stderr, "ucm error: state table entry [%x][%x] is final but has a non-initial next state of %x\n", + (int)state, (int)cell, (int)MBCS_ENTRY_STATE(entry)); + exit(U_INVALID_TABLE_FORMAT); + } else if(MBCS_ENTRY_IS_TRANSITION(entry) && (states->stateFlags[MBCS_ENTRY_STATE(entry)]&0xf)==MBCS_STATE_FLAG_DIRECT) { + fprintf(stderr, "ucm error: state table entry [%x][%x] is not final but has an initial next state of %x\n", + (int)state, (int)cell, (int)MBCS_ENTRY_STATE(entry)); + exit(U_INVALID_TABLE_FORMAT); + } + } + } + + /* is this an SI/SO (like EBCDIC-stateful) state table? */ + if(states->countStates>=2 && (states->stateFlags[1]&0xf)==MBCS_STATE_FLAG_DIRECT) { + if(states->maxCharLength!=2) { + fprintf(stderr, "ucm error: SI/SO codepages must have max 2 bytes/char (not %x)\n", (int)states->maxCharLength); + exit(U_INVALID_TABLE_FORMAT); + } + if(states->countStates<3) { + fprintf(stderr, "ucm error: SI/SO codepages must have at least 3 states (not %x)\n", (int)states->countStates); + exit(U_INVALID_TABLE_FORMAT); + } + /* are the SI/SO all in the right places? 
*/ + if( ignoreSISOCheck || + (states->stateTable[0][0xe]==MBCS_ENTRY_FINAL(1, MBCS_STATE_CHANGE_ONLY, 0) && + states->stateTable[0][0xf]==MBCS_ENTRY_FINAL(0, MBCS_STATE_CHANGE_ONLY, 0) && + states->stateTable[1][0xe]==MBCS_ENTRY_FINAL(1, MBCS_STATE_CHANGE_ONLY, 0) && + states->stateTable[1][0xf]==MBCS_ENTRY_FINAL(0, MBCS_STATE_CHANGE_ONLY, 0)) + ) { + states->outputType=MBCS_OUTPUT_2_SISO; + } else { + fprintf(stderr, "ucm error: SI/SO codepages must have in states 0 and 1 transitions e:1.s, f:0.s\n"); + exit(U_INVALID_TABLE_FORMAT); + } + state=2; + } else { + state=1; + } + + /* check that no unexpected state is a "direct" one */ + while(state<states->countStates) { + if((states->stateFlags[state]&0xf)==MBCS_STATE_FLAG_DIRECT) { + fprintf(stderr, "ucm error: state %d is 'initial' - not supported except for SI/SO codepages\n", (int)state); + exit(U_INVALID_TABLE_FORMAT); + } + ++state; + } + + sumUpStates(states); +} + +/* find a fallback for this offset; return the index or -1 if not found */ +U_CAPI int32_t U_EXPORT2 +ucm_findFallback(_MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks, + uint32_t offset) { + int32_t i; + + if(countToUFallbacks==0) { + /* shortcut: most codepages do not have fallbacks from codepage to Unicode */ + return -1; + } + + /* do a linear search for the fallback mapping (the table is not yet sorted) */ + for(i=0; i<countToUFallbacks; ++i) { + if(offset==toUFallbacks[i].offset) { + return i; + } + } + return -1; +} + +/* + * This function tries to compact toUnicode tables for 2-byte codepages + * by finding lead bytes with all-unassigned trail bytes and adding another state + * for them. 
+ */ +static void +compactToUnicode2(UCMStates *states, + uint16_t **pUnicodeCodeUnits, + _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks, + UBool verbose) { + int32_t (*oldStateTable)[256]; + uint16_t count[256]; + uint16_t *oldUnicodeCodeUnits; + int32_t entry, offset, oldOffset, trailOffset, oldTrailOffset, savings, sum; + int32_t i, j, leadState, trailState, newState, fallback; + uint16_t unit; + + /* find the lead state */ + if(states->outputType==MBCS_OUTPUT_2_SISO) { + /* use the DBCS lead state for SI/SO codepages */ + leadState=1; + } else { + leadState=0; + } + + /* find the main trail state: the most used target state */ + uprv_memset(count, 0, sizeof(count)); + for(i=0; i<256; ++i) { + entry=states->stateTable[leadState][i]; + if(MBCS_ENTRY_IS_TRANSITION(entry)) { + ++count[MBCS_ENTRY_TRANSITION_STATE(entry)]; + } + } + trailState=0; + for(i=1; i<states->countStates; ++i) { + if(count[i]>count[trailState]) { + trailState=i; + } + } + + /* count possible savings from lead bytes with all-unassigned results in all trail bytes */ + uprv_memset(count, 0, sizeof(count)); + savings=0; + /* for each lead byte */ + for(i=0; i<256; ++i) { + entry=states->stateTable[leadState][i]; + if(MBCS_ENTRY_IS_TRANSITION(entry) && + (MBCS_ENTRY_TRANSITION_STATE(entry))==static_cast<uint32_t>(trailState)) { + /* the offset is different for each lead byte */ + offset=MBCS_ENTRY_TRANSITION_OFFSET(entry); + /* for each trail byte for this lead byte */ + for(j=0; j<256; ++j) { + entry=states->stateTable[trailState][j]; + switch(MBCS_ENTRY_FINAL_ACTION(entry)) { + case MBCS_STATE_VALID_16: + entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry); + if((*pUnicodeCodeUnits)[entry]==0xfffe && ucm_findFallback(toUFallbacks, countToUFallbacks, entry)<0) { + ++count[i]; + } else { + j=999; /* do not count for this lead byte because there are assignments */ + } + break; + case MBCS_STATE_VALID_16_PAIR: + entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry); + 
if((*pUnicodeCodeUnits)[entry]==0xfffe) { + count[i]+=2; + } else { + j=999; /* do not count for this lead byte because there are assignments */ + } + break; + default: + break; + } + } + if(j==256) { + /* all trail bytes for this lead byte are unassigned */ + savings+=count[i]; + } else { + count[i]=0; + } + } + } + /* subtract from the possible savings the cost of an additional state */ + savings=savings*2-1024; /* count bytes, not 16-bit words */ + if(savings<=0) { + return; + } + if(verbose) { + printf("compacting toUnicode data saves %ld bytes\n", (long)savings); + } + if(states->countStates>=MBCS_MAX_STATE_COUNT) { + fprintf(stderr, "cannot compact toUnicode because the maximum number of states is reached\n"); + return; + } + + /* make a copy of the state table */ + oldStateTable=(int32_t (*)[256])uprv_malloc(states->countStates*1024); + if(oldStateTable==nullptr) { + fprintf(stderr, "cannot compact toUnicode: out of memory\n"); + return; + } + uprv_memcpy(oldStateTable, states->stateTable, states->countStates*1024); + + /* add the new state */ + /* + * this function does not catch the degenerate case where all lead bytes + * have all-unassigned trail bytes and the lead state could be removed + */ + newState=states->countStates++; + states->stateFlags[newState]=0; + /* copy the old trail state, turning all assigned states into unassigned ones */ + for(i=0; i<256; ++i) { + entry=states->stateTable[trailState][i]; + switch(MBCS_ENTRY_FINAL_ACTION(entry)) { + case MBCS_STATE_VALID_16: + case MBCS_STATE_VALID_16_PAIR: + states->stateTable[newState][i]=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_UNASSIGNED, 0xfffe); + break; + default: + states->stateTable[newState][i]=entry; + break; + } + } + + /* in the lead state, redirect all lead bytes with all-unassigned trail bytes to the new state */ + for(i=0; i<256; ++i) { + if(count[i]>0) { + states->stateTable[leadState][i]=MBCS_ENTRY_SET_STATE(states->stateTable[leadState][i], newState); + } + } + + /* sum up 
the new state table */ + for(i=0; i<states->countStates; ++i) { + states->stateFlags[i]&=~MBCS_STATE_FLAG_READY; + } + /* sumUpStates() overwrites states->countToUCodeUnits; remember the old value for the revert path below */ + int32_t oldCountToUCodeUnits=states->countToUCodeUnits; + sum=sumUpStates(states); + + /* allocate a new, smaller code units array */ + oldUnicodeCodeUnits=*pUnicodeCodeUnits; + if(sum==0) { + *pUnicodeCodeUnits=nullptr; + if(oldUnicodeCodeUnits!=nullptr) { + uprv_free(oldUnicodeCodeUnits); + } + uprv_free(oldStateTable); + return; + } + *pUnicodeCodeUnits=(uint16_t *)uprv_malloc(sum*sizeof(uint16_t)); + if(*pUnicodeCodeUnits==nullptr) { + fprintf(stderr, "cannot compact toUnicode: out of memory allocating %ld 16-bit code units\n", + (long)sum); + /* revert to the old state table and to the code unit count that matches the old array */ + *pUnicodeCodeUnits=oldUnicodeCodeUnits; + --states->countStates; + uprv_memcpy(states->stateTable, oldStateTable, states->countStates*1024); + states->countToUCodeUnits=oldCountToUCodeUnits; + uprv_free(oldStateTable); + return; + } + for(i=0; i<sum; ++i) { + (*pUnicodeCodeUnits)[i]=0xfffe; + } + + /* copy the code units for all assigned characters */ + /* + * The old state table has the same lead _and_ trail states for assigned characters! + * The differences are in the offsets, and in the trail states for some unassigned characters. + * For each character with an assigned state in the new table, it was assigned in the old one. + * Only still-assigned characters are copied. + * Note that fallback mappings need to get their offset values adjusted. 
+ */ + + /* for each initial state */ + for(leadState=0; leadState<states->countStates; ++leadState) { + if((states->stateFlags[leadState]&0xf)==MBCS_STATE_FLAG_DIRECT) { + /* for each lead byte from there */ + for(i=0; i<256; ++i) { + entry=states->stateTable[leadState][i]; + if(MBCS_ENTRY_IS_TRANSITION(entry)) { + trailState=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry); + /* the new state does not have assigned states */ + if(trailState!=newState) { + trailOffset=MBCS_ENTRY_TRANSITION_OFFSET(entry); + oldTrailOffset=MBCS_ENTRY_TRANSITION_OFFSET(oldStateTable[leadState][i]); + /* for each trail byte */ + for(j=0; j<256; ++j) { + entry=states->stateTable[trailState][j]; + /* copy assigned-character code units and adjust fallback offsets */ + switch(MBCS_ENTRY_FINAL_ACTION(entry)) { + case MBCS_STATE_VALID_16: + offset=trailOffset+MBCS_ENTRY_FINAL_VALUE_16(entry); + /* find the old offset according to the old state table */ + oldOffset=oldTrailOffset+MBCS_ENTRY_FINAL_VALUE_16(oldStateTable[trailState][j]); + unit=(*pUnicodeCodeUnits)[offset]=oldUnicodeCodeUnits[oldOffset]; + if(unit==0xfffe && (fallback=ucm_findFallback(toUFallbacks, countToUFallbacks, oldOffset))>=0) { + toUFallbacks[fallback].offset=0x80000000|offset; + } + break; + case MBCS_STATE_VALID_16_PAIR: + offset=trailOffset+MBCS_ENTRY_FINAL_VALUE_16(entry); + /* find the old offset according to the old state table */ + oldOffset=oldTrailOffset+MBCS_ENTRY_FINAL_VALUE_16(oldStateTable[trailState][j]); + (*pUnicodeCodeUnits)[offset++]=oldUnicodeCodeUnits[oldOffset++]; + (*pUnicodeCodeUnits)[offset]=oldUnicodeCodeUnits[oldOffset]; + break; + default: + break; + } + } + } + } + } + } + } + + /* remove temporary flags from fallback offsets that protected them from being modified twice */ + for(i=0; i<countToUFallbacks; ++i) { + toUFallbacks[i].offset&=0x7fffffff; + } + + /* free temporary memory */ + uprv_free(oldUnicodeCodeUnits); + uprv_free(oldStateTable); +} + +/* + * recursive sub-function of 
compactToUnicodeHelper() + * returns: + * >0 number of bytes that are used in unicodeCodeUnits[] that could be saved + * because all sequences from this state are unassigned + * <0 there are assignments in unicodeCodeUnits[] + * 0 no use of unicodeCodeUnits[] + */ +static int32_t +findUnassigned(UCMStates *states, + uint16_t *unicodeCodeUnits, + _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks, + int32_t state, int32_t offset, uint32_t b) { + int32_t i, entry, savings, localSavings, belowSavings; + UBool haveAssigned; + + localSavings=belowSavings=0; + haveAssigned=false; + for(i=0; i<256; ++i) { + entry=states->stateTable[state][i]; + if(MBCS_ENTRY_IS_TRANSITION(entry)) { + savings=findUnassigned(states, + unicodeCodeUnits, + toUFallbacks, countToUFallbacks, + MBCS_ENTRY_TRANSITION_STATE(entry), + offset+MBCS_ENTRY_TRANSITION_OFFSET(entry), + (b<<8)|(uint32_t)i); + if(savings<0) { + haveAssigned=true; + } else if(savings>0) { + printf(" all-unassigned sequences from prefix 0x%02lx state %ld use %ld bytes\n", + (unsigned long)((b<<8)|i), (long)state, (long)savings); + belowSavings+=savings; + } + } else if(!haveAssigned) { + switch(MBCS_ENTRY_FINAL_ACTION(entry)) { + case MBCS_STATE_VALID_16: + entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry); + if(unicodeCodeUnits[entry]==0xfffe && ucm_findFallback(toUFallbacks, countToUFallbacks, entry)<0) { + localSavings+=2; + } else { + haveAssigned=true; + } + break; + case MBCS_STATE_VALID_16_PAIR: + entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry); + if(unicodeCodeUnits[entry]==0xfffe) { + localSavings+=4; + } else { + haveAssigned=true; + } + break; + default: + break; + } + } + } + if(haveAssigned) { + return -1; + } else { + return localSavings+belowSavings; + } +} + +/* helper function for finding compaction opportunities */ +static void +compactToUnicodeHelper(UCMStates *states, + uint16_t *unicodeCodeUnits, + _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks) { + int32_t state, savings; + + /* for 
each initial state */ + for(state=0; state<states->countStates; ++state) { + if((states->stateFlags[state]&0xf)==MBCS_STATE_FLAG_DIRECT) { + savings=findUnassigned(states, + unicodeCodeUnits, + toUFallbacks, countToUFallbacks, + state, 0, 0); + if(savings>0) { + printf(" all-unassigned sequences from initial state %ld use %ld bytes\n", + (long)state, (long)savings); + } + } + } +} + +U_CDECL_BEGIN +static int32_t U_CALLCONV +compareFallbacks(const void *context, const void *fb1, const void *fb2) { + (void)context; + return ((const _MBCSToUFallback *)fb1)->offset-((const _MBCSToUFallback *)fb2)->offset; +} +U_CDECL_END + +U_CAPI void U_EXPORT2 +ucm_optimizeStates(UCMStates *states, + uint16_t **pUnicodeCodeUnits, + _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks, + UBool verbose) { + UErrorCode errorCode; + int32_t state, cell, entry; + + /* test each state table entry */ + for(state=0; state<states->countStates; ++state) { + for(cell=0; cell<256; ++cell) { + entry=states->stateTable[state][cell]; + /* + * if the entry is a final one with an MBCS_STATE_VALID_DIRECT_16 action code + * and the code point is "unassigned" (0xfffe), then change it to + * the "unassigned" action code with bits 26..23 set to zero and U+fffe. 
+ */ + if(MBCS_ENTRY_SET_STATE(entry, 0)==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, 0xfffe)) { + states->stateTable[state][cell]=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_UNASSIGNED); + } + } + } + + /* try to compact the toUnicode tables */ + if(states->maxCharLength==2) { + compactToUnicode2(states, pUnicodeCodeUnits, toUFallbacks, countToUFallbacks, verbose); + } else if(states->maxCharLength>2) { + if(verbose) { + compactToUnicodeHelper(states, *pUnicodeCodeUnits, toUFallbacks, countToUFallbacks); + } + } + + /* sort toUFallbacks */ + /* + * It should be safe to sort them before compactToUnicode2() is called, + * because it should not change the relative order of the offset values + * that it adjusts, but they need to be sorted at some point, and + * it is safest here. + */ + if(countToUFallbacks>0) { + errorCode=U_ZERO_ERROR; /* nothing bad will happen... */ + uprv_sortArray(toUFallbacks, countToUFallbacks, + sizeof(_MBCSToUFallback), + compareFallbacks, nullptr, false, &errorCode); + } +} + +/* use a complete state table ----------------------------------------------- */ + +U_CAPI int32_t U_EXPORT2 +ucm_countChars(UCMStates *states, + const uint8_t *bytes, int32_t length) { + uint32_t offset; + int32_t i, entry, count; + uint8_t state; + + offset=0; + count=0; + state=0; + + if(states->countStates==0) { + fprintf(stderr, "ucm error: there is no state information!\n"); + return -1; + } + + /* for SI/SO (like EBCDIC-stateful), double-byte sequences start in state 1 */ + if(length==2 && states->outputType==MBCS_OUTPUT_2_SISO) { + state=1; + } + + /* + * Walk down the state table like in conversion, + * much like getNextUChar(). + * We assume that c<=0x10ffff. 
+ */ + for(i=0; i<length; ++i) { + entry=states->stateTable[state][bytes[i]]; + if(MBCS_ENTRY_IS_TRANSITION(entry)) { + state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry); + offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry); + } else { + switch(MBCS_ENTRY_FINAL_ACTION(entry)) { + case MBCS_STATE_ILLEGAL: + fprintf(stderr, "ucm error: byte sequence ends in illegal state\n"); + return -1; + case MBCS_STATE_CHANGE_ONLY: + fprintf(stderr, "ucm error: byte sequence ends in state-change-only\n"); + return -1; + case MBCS_STATE_UNASSIGNED: + case MBCS_STATE_FALLBACK_DIRECT_16: + case MBCS_STATE_VALID_DIRECT_16: + case MBCS_STATE_FALLBACK_DIRECT_20: + case MBCS_STATE_VALID_DIRECT_20: + case MBCS_STATE_VALID_16: + case MBCS_STATE_VALID_16_PAIR: + /* count a complete character and prepare for a new one */ + ++count; + state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); + offset=0; + break; + default: + /* reserved, must never occur */ + fprintf(stderr, "ucm error: byte sequence reached reserved action code, entry: 0x%02lx\n", (unsigned long)entry); + return -1; + } + } + } + + if(offset!=0) { + fprintf(stderr, "ucm error: byte sequence too short, ends in non-final state %u\n", state); + return -1; + } + + /* + * for SI/SO (like EBCDIC-stateful), multiple-character results + * must consist of only double-byte sequences + */ + if(count>1 && states->outputType==MBCS_OUTPUT_2_SISO && length!=2*count) { + fprintf(stderr, "ucm error: SI/SO (like EBCDIC-stateful) result with %d characters does not contain all DBCS\n", (int)count); + return -1; + } + + return count; +} +#endif + diff --git a/intl/icu/source/tools/toolutil/udbgutil.cpp b/intl/icu/source/tools/toolutil/udbgutil.cpp new file mode 100644 index 0000000000..3f4bf3718e --- /dev/null +++ b/intl/icu/source/tools/toolutil/udbgutil.cpp @@ -0,0 +1,769 @@ +// © 2016 and later: Unicode, Inc. and others. 
+// License & terms of use: http://www.unicode.org/copyright.html +/******************************************************************** + * COPYRIGHT: + * Copyright (c) 2007-2016, International Business Machines Corporation and + * others. All Rights Reserved. + ********************************************************************/ + +#include "udbgutil.h" +#include <string.h> +#include "ustr_imp.h" +#include "cmemory.h" +#include "cstring.h" +#include "putilimp.h" +#include "unicode/ulocdata.h" +#include "unicode/ucnv.h" +#include "unicode/unistr.h" +#include "cstr.h" + +/* +To add a new enum type + (For example: UShoeSize with values USHOE_WIDE=0, USHOE_REGULAR, USHOE_NARROW, USHOE_COUNT) + + 0. Make sure that all lines you add are protected with appropriate uconfig guards, + such as '#if !UCONFIG_NO_SHOES'. + 1. udbgutil.h: add UDBG_UShoeSize to the UDebugEnumType enum before UDBG_ENUM_COUNT + ( The subsequent steps involve this file, udbgutil.cpp ) + 2. Find the marker "Add new enum types above this line" + 3. Before that marker, add a #include of any header file you need. + 4. Each enum type has three things in this section: a #define, a count_, and an array of Fields. + It may help to copy and paste a previous definition. + 5. In the case of the USHOE_... strings above, "USHOE_" is common to all values- six characters + " #define LEN_USHOE 6 " + 6 characters will strip off "USHOE_" leaving enum values of WIDE, REGULAR, and NARROW. + 6. Define the 'count_' variable, with the number of enum values. If the enum has a _MAX or _COUNT value, + that can be helpful for automatically defining the count. Otherwise define it manually. + " static const int32_t count_UShoeSize = USHOE_COUNT; " + 7. Define the field names, in order. 
+ " static const Field names_UShoeSize[] = { + " FIELD_NAME_STR( LEN_USHOE, USHOE_WIDE ), + " FIELD_NAME_STR( LEN_USHOE, USHOE_REGULAR ), + " FIELD_NAME_STR( LEN_USHOE, USHOE_NARROW ), + " }; + ( The following command was usedfor converting ucol.h into partially correct entities ) + grep "^[ ]*UCOL" < unicode/ucol.h | + sed -e 's%^[ ]*\([A-Z]*\)_\([A-Z_]*\).*% FIELD_NAME_STR( LEN_\1, \1_\2 ),%g' + 8. Now, a bit farther down, add the name of the enum itself to the end of names_UDebugEnumType + ( UDebugEnumType is an enum, too!) + names_UDebugEnumType[] { ... + " FIELD_NAME_STR( LEN_UDBG, UDBG_UShoeSize ), " + 9. Find the function _udbg_enumCount and add the count macro: + " COUNT_CASE(UShoeSize) + 10. Find the function _udbg_enumFields and add the field macro: + " FIELD_CASE(UShoeSize) + 11. verify that your test code, and Java data generation, works properly. +*/ + +/** + * Structure representing an enum value + */ +struct Field { + int32_t prefix; /**< how many characters to remove in the prefix - i.e. UCHAR_ = 5 */ + const char *str; /**< The actual string value */ + int32_t num; /**< The numeric value */ +}; + +/** + * Define another field name. Used in an array of Field s + * @param y the common prefix length (i.e. 6 for "USHOE_" ) + * @param x the actual enum value - it will be copied in both string and symbolic form. + * @see Field + */ +#define FIELD_NAME_STR(y,x) { y, #x, x } + + +// TODO: Currently, this whole functionality goes away with UCONFIG_NO_FORMATTING. Should be split up. 
+#if !UCONFIG_NO_FORMATTING + +// Calendar +#include "unicode/ucal.h" + +// 'UCAL_' = 5 +#define LEN_UCAL 5 /* UCAL_ */ +static const int32_t count_UCalendarDateFields = UCAL_FIELD_COUNT; +static const Field names_UCalendarDateFields[] = +{ + FIELD_NAME_STR( LEN_UCAL, UCAL_ERA ), + FIELD_NAME_STR( LEN_UCAL, UCAL_YEAR ), + FIELD_NAME_STR( LEN_UCAL, UCAL_MONTH ), + FIELD_NAME_STR( LEN_UCAL, UCAL_WEEK_OF_YEAR ), + FIELD_NAME_STR( LEN_UCAL, UCAL_WEEK_OF_MONTH ), + FIELD_NAME_STR( LEN_UCAL, UCAL_DATE ), + FIELD_NAME_STR( LEN_UCAL, UCAL_DAY_OF_YEAR ), + FIELD_NAME_STR( LEN_UCAL, UCAL_DAY_OF_WEEK ), + FIELD_NAME_STR( LEN_UCAL, UCAL_DAY_OF_WEEK_IN_MONTH ), + FIELD_NAME_STR( LEN_UCAL, UCAL_AM_PM ), + FIELD_NAME_STR( LEN_UCAL, UCAL_HOUR ), + FIELD_NAME_STR( LEN_UCAL, UCAL_HOUR_OF_DAY ), + FIELD_NAME_STR( LEN_UCAL, UCAL_MINUTE ), + FIELD_NAME_STR( LEN_UCAL, UCAL_SECOND ), + FIELD_NAME_STR( LEN_UCAL, UCAL_MILLISECOND ), + FIELD_NAME_STR( LEN_UCAL, UCAL_ZONE_OFFSET ), + FIELD_NAME_STR( LEN_UCAL, UCAL_DST_OFFSET ), + FIELD_NAME_STR( LEN_UCAL, UCAL_YEAR_WOY ), + FIELD_NAME_STR( LEN_UCAL, UCAL_DOW_LOCAL ), + FIELD_NAME_STR( LEN_UCAL, UCAL_EXTENDED_YEAR ), + FIELD_NAME_STR( LEN_UCAL, UCAL_JULIAN_DAY ), + FIELD_NAME_STR( LEN_UCAL, UCAL_MILLISECONDS_IN_DAY ), + FIELD_NAME_STR( LEN_UCAL, UCAL_IS_LEAP_MONTH ), +#ifndef U_HIDE_DRAFT_API + FIELD_NAME_STR( LEN_UCAL, UCAL_ORDINAL_MONTH ), +#endif // U_HIDE_DRAFT_API +}; + + +static const int32_t count_UCalendarMonths = UCAL_UNDECIMBER+1; +static const Field names_UCalendarMonths[] = +{ + FIELD_NAME_STR( LEN_UCAL, UCAL_JANUARY ), + FIELD_NAME_STR( LEN_UCAL, UCAL_FEBRUARY ), + FIELD_NAME_STR( LEN_UCAL, UCAL_MARCH ), + FIELD_NAME_STR( LEN_UCAL, UCAL_APRIL ), + FIELD_NAME_STR( LEN_UCAL, UCAL_MAY ), + FIELD_NAME_STR( LEN_UCAL, UCAL_JUNE ), + FIELD_NAME_STR( LEN_UCAL, UCAL_JULY ), + FIELD_NAME_STR( LEN_UCAL, UCAL_AUGUST ), + FIELD_NAME_STR( LEN_UCAL, UCAL_SEPTEMBER ), + FIELD_NAME_STR( LEN_UCAL, UCAL_OCTOBER ), + FIELD_NAME_STR( LEN_UCAL, 
UCAL_NOVEMBER ), + FIELD_NAME_STR( LEN_UCAL, UCAL_DECEMBER ), + FIELD_NAME_STR( LEN_UCAL, UCAL_UNDECIMBER) +}; + +#include "unicode/udat.h" + +#define LEN_UDAT 5 /* "UDAT_" */ +static const int32_t count_UDateFormatStyle = UDAT_SHORT+1; +static const Field names_UDateFormatStyle[] = +{ + FIELD_NAME_STR( LEN_UDAT, UDAT_FULL ), + FIELD_NAME_STR( LEN_UDAT, UDAT_LONG ), + FIELD_NAME_STR( LEN_UDAT, UDAT_MEDIUM ), + FIELD_NAME_STR( LEN_UDAT, UDAT_SHORT ), + /* end regular */ + /* + * negative enums.. leave out for now. + FIELD_NAME_STR( LEN_UDAT, UDAT_NONE ), + FIELD_NAME_STR( LEN_UDAT, UDAT_PATTERN ), + */ +}; + +#endif + +#include "unicode/uloc.h" + +#define LEN_UAR 12 /* "ULOC_ACCEPT_" */ +static const int32_t count_UAcceptResult = 3; +static const Field names_UAcceptResult[] = +{ + FIELD_NAME_STR( LEN_UAR, ULOC_ACCEPT_FAILED ), + FIELD_NAME_STR( LEN_UAR, ULOC_ACCEPT_VALID ), + FIELD_NAME_STR( LEN_UAR, ULOC_ACCEPT_FALLBACK ), +}; + +#if !UCONFIG_NO_COLLATION +#include "unicode/ucol.h" +#define LEN_UCOL 5 /* UCOL_ */ +static const int32_t count_UColAttributeValue = UCOL_ATTRIBUTE_VALUE_COUNT; +static const Field names_UColAttributeValue[] = { + FIELD_NAME_STR( LEN_UCOL, UCOL_PRIMARY ), + FIELD_NAME_STR( LEN_UCOL, UCOL_SECONDARY ), + FIELD_NAME_STR( LEN_UCOL, UCOL_TERTIARY ), +// FIELD_NAME_STR( LEN_UCOL, UCOL_CE_STRENGTH_LIMIT ), + FIELD_NAME_STR( LEN_UCOL, UCOL_QUATERNARY ), + // gap + FIELD_NAME_STR( LEN_UCOL, UCOL_IDENTICAL ), +// FIELD_NAME_STR( LEN_UCOL, UCOL_STRENGTH_LIMIT ), + FIELD_NAME_STR( LEN_UCOL, UCOL_OFF ), + FIELD_NAME_STR( LEN_UCOL, UCOL_ON ), + // gap + FIELD_NAME_STR( LEN_UCOL, UCOL_SHIFTED ), + FIELD_NAME_STR( LEN_UCOL, UCOL_NON_IGNORABLE ), + // gap + FIELD_NAME_STR( LEN_UCOL, UCOL_LOWER_FIRST ), + FIELD_NAME_STR( LEN_UCOL, UCOL_UPPER_FIRST ), +}; + +#endif + + +#if UCONFIG_ENABLE_PLUGINS +#include "unicode/icuplug.h" + +#define LEN_UPLUG_REASON 13 /* UPLUG_REASON_ */ +static const int32_t count_UPlugReason = UPLUG_REASON_COUNT; +static const Field 
names_UPlugReason[] = { + FIELD_NAME_STR( LEN_UPLUG_REASON, UPLUG_REASON_QUERY ), + FIELD_NAME_STR( LEN_UPLUG_REASON, UPLUG_REASON_LOAD ), + FIELD_NAME_STR( LEN_UPLUG_REASON, UPLUG_REASON_UNLOAD ), +}; + +#define LEN_UPLUG_LEVEL 12 /* UPLUG_LEVEL_ */ +static const int32_t count_UPlugLevel = UPLUG_LEVEL_COUNT; +static const Field names_UPlugLevel[] = { + FIELD_NAME_STR( LEN_UPLUG_LEVEL, UPLUG_LEVEL_INVALID ), + FIELD_NAME_STR( LEN_UPLUG_LEVEL, UPLUG_LEVEL_UNKNOWN ), + FIELD_NAME_STR( LEN_UPLUG_LEVEL, UPLUG_LEVEL_LOW ), + FIELD_NAME_STR( LEN_UPLUG_LEVEL, UPLUG_LEVEL_HIGH ), +}; +#endif + +#define LEN_UDBG 5 /* "UDBG_" */ +static const int32_t count_UDebugEnumType = UDBG_ENUM_COUNT; +static const Field names_UDebugEnumType[] = +{ + FIELD_NAME_STR( LEN_UDBG, UDBG_UDebugEnumType ), +#if !UCONFIG_NO_FORMATTING + FIELD_NAME_STR( LEN_UDBG, UDBG_UCalendarDateFields ), + FIELD_NAME_STR( LEN_UDBG, UDBG_UCalendarMonths ), + FIELD_NAME_STR( LEN_UDBG, UDBG_UDateFormatStyle ), +#endif +#if UCONFIG_ENABLE_PLUGINS + FIELD_NAME_STR( LEN_UDBG, UDBG_UPlugReason ), + FIELD_NAME_STR( LEN_UDBG, UDBG_UPlugLevel ), +#endif + FIELD_NAME_STR( LEN_UDBG, UDBG_UAcceptResult ), +#if !UCONFIG_NO_COLLATION + FIELD_NAME_STR( LEN_UDBG, UDBG_UColAttributeValue ), +#endif +}; + + +// --- Add new enum types above this line --- + +#define COUNT_CASE(x) case UDBG_##x: return (actual?count_##x:UPRV_LENGTHOF(names_##x)); +#define COUNT_FAIL_CASE(x) case UDBG_##x: return -1; + +#define FIELD_CASE(x) case UDBG_##x: return names_##x; +#define FIELD_FAIL_CASE(x) case UDBG_##x: return nullptr; + +// low level + +/** + * @param type type of item + * @param actual true: for the actual enum's type (UCAL_FIELD_COUNT, etc), or false for the string count + */ +static int32_t _udbg_enumCount(UDebugEnumType type, UBool actual) { + switch(type) { + COUNT_CASE(UDebugEnumType) +#if !UCONFIG_NO_FORMATTING + COUNT_CASE(UCalendarDateFields) + COUNT_CASE(UCalendarMonths) + COUNT_CASE(UDateFormatStyle) +#endif +#if 
UCONFIG_ENABLE_PLUGINS + COUNT_CASE(UPlugReason) + COUNT_CASE(UPlugLevel) +#endif + COUNT_CASE(UAcceptResult) +#if !UCONFIG_NO_COLLATION + COUNT_CASE(UColAttributeValue) +#endif + // COUNT_FAIL_CASE(UNonExistentEnum) + default: + return -1; + } +} + +static const Field* _udbg_enumFields(UDebugEnumType type) { + switch(type) { + FIELD_CASE(UDebugEnumType) +#if !UCONFIG_NO_FORMATTING + FIELD_CASE(UCalendarDateFields) + FIELD_CASE(UCalendarMonths) + FIELD_CASE(UDateFormatStyle) +#endif +#if UCONFIG_ENABLE_PLUGINS + FIELD_CASE(UPlugReason) + FIELD_CASE(UPlugLevel) +#endif + FIELD_CASE(UAcceptResult) + // FIELD_FAIL_CASE(UNonExistentEnum) +#if !UCONFIG_NO_COLLATION + FIELD_CASE(UColAttributeValue) +#endif + default: + return nullptr; + } +} + +// implementation + +int32_t udbg_enumCount(UDebugEnumType type) { + return _udbg_enumCount(type, false); +} + +int32_t udbg_enumExpectedCount(UDebugEnumType type) { + return _udbg_enumCount(type, true); +} + +const char * udbg_enumName(UDebugEnumType type, int32_t field) { + if(field<0 || + field>=_udbg_enumCount(type,false)) { // also will catch unsupported items + return nullptr; + } else { + const Field *fields = _udbg_enumFields(type); + if(fields == nullptr) { + return nullptr; + } else { + return fields[field].str + fields[field].prefix; + } + } +} + +int32_t udbg_enumArrayValue(UDebugEnumType type, int32_t field) { + if(field<0 || + field>=_udbg_enumCount(type,false)) { // also will catch unsupported items + return -1; + } else { + const Field *fields = _udbg_enumFields(type); + if(fields == nullptr) { + return -1; + } else { + return fields[field].num; + } + } +} + +int32_t udbg_enumByName(UDebugEnumType type, const char *value) { + if(type<0||type>=_udbg_enumCount(UDBG_UDebugEnumType, true)) { + return -1; // type out of range + } + const Field *fields = _udbg_enumFields(type); + if (fields != nullptr) { + for(int32_t field = 0;field<_udbg_enumCount(type, false);field++) { + if(!strcmp(value, fields[field].str + 
fields[field].prefix)) { + return fields[field].num; + } + } + // try with the prefix + for(int32_t field = 0;field<_udbg_enumCount(type, false);field++) { + if(!strcmp(value, fields[field].str)) { + return fields[field].num; + } + } + } + // fail + return -1; +} + +/* platform info */ +/** + * Print the current platform + */ +U_CAPI const char *udbg_getPlatform() +{ +#if U_PLATFORM_USES_ONLY_WIN32_API + return "Windows"; +#elif U_PLATFORM == U_PF_CYGWIN + return "Cygwin"; +#elif U_PLATFORM == U_PF_UNKNOWN + return "unknown"; +#elif U_PLATFORM == U_PF_DARWIN + return "Darwin"; +#elif U_PLATFORM == U_PF_BSD + return "BSD"; +#elif U_PLATFORM == U_PF_QNX + return "QNX"; +#elif U_PLATFORM == U_PF_LINUX + return "Linux"; +#elif U_PLATFORM == U_PF_ANDROID + return "Android"; +#elif U_PLATFORM == U_PF_CLASSIC_MACOS + return "MacOS (Classic)"; +#elif U_PLATFORM == U_PF_OS390 + return "IBM z"; +#elif U_PLATFORM == U_PF_OS400 + return "IBM i"; +#else + return "Other (POSIX-like)"; +#endif +} + +struct USystemParams; + +typedef int32_t U_CALLCONV USystemParameterCallback(const USystemParams *param, char *target, int32_t targetCapacity, UErrorCode *status); + +struct USystemParams { + const char *paramName; + USystemParameterCallback *paramFunction; + const char *paramStr; + int32_t paramInt; +}; + +/* parameter types */ +U_CAPI int32_t +paramEmpty(const USystemParams * /* param */, char *target, int32_t targetCapacity, UErrorCode *status) { + if(U_FAILURE(*status))return 0; + return u_terminateChars(target, targetCapacity, 0, status); +} + +U_CAPI int32_t +paramStatic(const USystemParams *param, char *target, int32_t targetCapacity, UErrorCode *status) { + if(param->paramStr==nullptr) return paramEmpty(param,target,targetCapacity,status); + if(U_FAILURE(*status))return 0; + int32_t len = static_cast<int32_t>(uprv_strlen(param->paramStr)); + if(target!=nullptr) { + uprv_strncpy(target,param->paramStr,uprv_min(len,targetCapacity)); + } + return u_terminateChars(target, 
targetCapacity, len, status); +} + +static const char *nullString = "(null)"; + +static int32_t stringToStringBuffer(char *target, int32_t targetCapacity, const char *str, UErrorCode *status) { + if(str==nullptr) str=nullString; + + int32_t len = static_cast<int32_t>(uprv_strlen(str)); + if (U_SUCCESS(*status)) { + if(target!=nullptr) { + uprv_strncpy(target,str,uprv_min(len,targetCapacity)); + } + } else { + const char *s = u_errorName(*status); + len = static_cast<int32_t>(uprv_strlen(s)); + if(target!=nullptr) { + uprv_strncpy(target,s,uprv_min(len,targetCapacity)); + } + } + return u_terminateChars(target, targetCapacity, len, status); +} + +static int32_t integerToStringBuffer(char *target, int32_t targetCapacity, int32_t n, int32_t radix, UErrorCode *status) { + if(U_FAILURE(*status)) return 0; + char str[300]; + T_CString_integerToString(str,n,radix); + return stringToStringBuffer(target,targetCapacity,str,status); +} + +U_CAPI int32_t +paramInteger(const USystemParams *param, char *target, int32_t targetCapacity, UErrorCode *status) { + if(U_FAILURE(*status))return 0; + if(param->paramStr==nullptr || param->paramStr[0]=='d') { + return integerToStringBuffer(target,targetCapacity,param->paramInt, 10,status); + } else if(param->paramStr[0]=='x') { + return integerToStringBuffer(target,targetCapacity,param->paramInt, 16,status); + } else if(param->paramStr[0]=='o') { + return integerToStringBuffer(target,targetCapacity,param->paramInt, 8,status); + } else if(param->paramStr[0]=='b') { + return integerToStringBuffer(target,targetCapacity,param->paramInt, 2,status); + } else { + *status = U_INTERNAL_PROGRAM_ERROR; + return 0; + } +} + + +U_CAPI int32_t +paramCldrVersion(const USystemParams * /* param */, char *target, int32_t targetCapacity, UErrorCode *status) { + if(U_FAILURE(*status))return 0; + char str[200]=""; + UVersionInfo icu; + + ulocdata_getCLDRVersion(icu, status); + if(U_SUCCESS(*status)) { + u_versionToString(icu, str); + return 
stringToStringBuffer(target,targetCapacity,str,status); + } else { + return 0; + } +} + + +#if !UCONFIG_NO_FORMATTING +U_CAPI int32_t +paramTimezoneDefault(const USystemParams * /* param */, char *target, int32_t targetCapacity, UErrorCode *status) { + if(U_FAILURE(*status))return 0; + char16_t buf[100]; + char buf2[100]; + int32_t len; + + len = ucal_getDefaultTimeZone(buf, 100, status); + if(U_SUCCESS(*status)&&len>0) { + u_UCharsToChars(buf, buf2, len+1); + return stringToStringBuffer(target,targetCapacity, buf2,status); + } else { + return 0; + } +} +#endif + +U_CAPI int32_t +paramLocaleDefaultBcp47(const USystemParams * /* param */, char *target, int32_t targetCapacity, UErrorCode *status) { + if(U_FAILURE(*status))return 0; + const char *def = uloc_getDefault(); + return uloc_toLanguageTag(def,target,targetCapacity,false,status); +} + + +/* simple 1-liner param functions */ +#define STRING_PARAM(func, str) U_CAPI int32_t \ + func(const USystemParams *, char *target, int32_t targetCapacity, UErrorCode *status) \ + { return stringToStringBuffer(target,targetCapacity,(str),status); } + +STRING_PARAM(paramIcudataPath, u_getDataDirectory()) +STRING_PARAM(paramPlatform, udbg_getPlatform()) +STRING_PARAM(paramLocaleDefault, uloc_getDefault()) +#if !UCONFIG_NO_CONVERSION +STRING_PARAM(paramConverterDefault, ucnv_getDefaultName()) +#endif + +#if !UCONFIG_NO_FORMATTING +STRING_PARAM(paramTimezoneVersion, ucal_getTZDataVersion(status)) +#endif + +static const USystemParams systemParams[] = { + { "copyright", paramStatic, U_COPYRIGHT_STRING,0 }, + { "product", paramStatic, "icu4c",0 }, + { "product.full", paramStatic, "International Components for Unicode for C/C++",0 }, + { "version", paramStatic, U_ICU_VERSION,0 }, + { "version.unicode", paramStatic, U_UNICODE_VERSION,0 }, + { "platform.number", paramInteger, "d",U_PLATFORM}, + { "platform.type", paramPlatform, nullptr ,0}, + { "locale.default", paramLocaleDefault, nullptr, 0}, + { "locale.default.bcp47", 
paramLocaleDefaultBcp47, nullptr, 0}, +#if !UCONFIG_NO_CONVERSION + { "converter.default", paramConverterDefault, nullptr, 0}, +#endif + { "icudata.name", paramStatic, U_ICUDATA_NAME, 0}, + { "icudata.path", paramIcudataPath, nullptr, 0}, + + { "cldr.version", paramCldrVersion, nullptr, 0}, + +#if !UCONFIG_NO_FORMATTING + { "tz.version", paramTimezoneVersion, nullptr, 0}, + { "tz.default", paramTimezoneDefault, nullptr, 0}, +#endif + + { "cpu.bits", paramInteger, "d", (sizeof(void*))*8}, + { "cpu.big_endian", paramInteger, "b", U_IS_BIG_ENDIAN}, + { "os.wchar_width", paramInteger, "d", U_SIZEOF_WCHAR_T}, + { "os.charset_family", paramInteger, "d", U_CHARSET_FAMILY}, +#if defined (U_HOST) + { "os.host", paramStatic, U_HOST, 0}, +#endif +#if defined (U_BUILD) + { "build.build", paramStatic, U_BUILD, 0}, +#endif +#if defined (U_CC) + { "build.cc", paramStatic, U_CC, 0}, +#endif +#if defined (U_CXX) + { "build.cxx", paramStatic, U_CXX, 0}, +#endif +#if defined (CYGWINMSVC) + { "build.cygwinmsvc", paramInteger, "b", 1}, +#endif + { "uconfig.internal_digitlist", paramInteger, "b", 1}, /* always 1 */ + { "uconfig.have_parseallinput", paramInteger, "b", UCONFIG_HAVE_PARSEALLINPUT}, + + +}; + +#define U_SYSPARAM_COUNT UPRV_LENGTHOF(systemParams) + +U_CAPI const char *udbg_getSystemParameterNameByIndex(int32_t i) { + if(i>=0 && i < (int32_t)U_SYSPARAM_COUNT) { + return systemParams[i].paramName; + } else { + return nullptr; + } +} + + +U_CAPI int32_t udbg_getSystemParameterValueByIndex(int32_t i, char *buffer, int32_t bufferCapacity, UErrorCode *status) { + if(i>=0 && i< (int32_t)U_SYSPARAM_COUNT) { + return systemParams[i].paramFunction(&(systemParams[i]),buffer,bufferCapacity,status); + } else { + return 0; + } +} + +U_CAPI void udbg_writeIcuInfo(FILE *out) { + char str[2000]; + /* todo: API for writing DTD? 
*/ + fprintf(out, " <icuSystemParams type=\"icu4c\">\n"); + const char *paramName; + for(int32_t i=0;(paramName=udbg_getSystemParameterNameByIndex(i))!=nullptr;i++) { + UErrorCode status2 = U_ZERO_ERROR; + udbg_getSystemParameterValueByIndex(i, str,2000,&status2); + if(U_SUCCESS(status2)) { + fprintf(out," <param name=\"%s\">%s</param>\n", paramName,str); + } else { + fprintf(out," <!-- n=\"%s\" ERROR: %s -->\n", paramName, u_errorName(status2)); + } + } + fprintf(out, " </icuSystemParams>\n"); +} + +#define UNICODE_BUG_URL "https://unicode-org.atlassian.net/browse/" +#define OLD_CLDR_PREFIX "cldrbug:" +#define CLDR_BUG_PREFIX "CLDR-" +#define ICU_BUG_PREFIX "ICU-" + + + +#include <set> +#include <map> +#include <string> +#include <ostream> +#include <iostream> + +class KnownIssues { +public: + KnownIssues(); + ~KnownIssues(); + void add(const char *ticket, const char *where, const char16_t *msg, UBool *firstForTicket, UBool *firstForWhere); + void add(const char *ticket, const char *where, const char *msg, UBool *firstForTicket, UBool *firstForWhere); + UBool print(); +private: + std::map< std::string, + std::map < std::string, std::set < std::string > > > fTable; +}; + +KnownIssues::KnownIssues() + : fTable() +{ +} + +KnownIssues::~KnownIssues() +{ +} + +/** + * Map cldr:1234 to CLDR-1234 + * Map 1234 to ICU-1234 + */ +static std::string mapTicketId(const char *ticketStr) { + std::string ticket(ticketStr); + // TODO: Can remove this function once all logKnownIssue calls are switched over + // to the ICU-1234 and CLDR-1234 format. 
+ if(ticket.rfind(OLD_CLDR_PREFIX) == 0) { + // map cldrbug:1234 to CLDR-1234 + ticket.replace(0, uprv_strlen(OLD_CLDR_PREFIX), CLDR_BUG_PREFIX); + } else if(::isdigit(ticket[0])) { + // map 1234 to ICU-1234 + ticket.insert(0, ICU_BUG_PREFIX); + } + return ticket; +} + +void KnownIssues::add(const char *ticketStr, const char *where, const char16_t *msg, UBool *firstForTicket, UBool *firstForWhere) +{ + const std::string ticket = mapTicketId(ticketStr); + if(fTable.find(ticket) == fTable.end()) { + if(firstForTicket!=nullptr) *firstForTicket = true; + fTable[ticket] = std::map < std::string, std::set < std::string > >(); + } else { + if(firstForTicket!=nullptr) *firstForTicket = false; + } + if(where==nullptr) return; + + if(fTable[ticket].find(where) == fTable[ticket].end()) { + if(firstForWhere!=nullptr) *firstForWhere = true; + fTable[ticket][where] = std::set < std::string >(); + } else { + if(firstForWhere!=nullptr) *firstForWhere = false; + } + if(msg==nullptr || !*msg) return; + + const icu::UnicodeString ustr(msg); + + fTable[ticket][where].insert(std::string(icu::CStr(ustr)())); +} + +void KnownIssues::add(const char *ticketStr, const char *where, const char *msg, UBool *firstForTicket, UBool *firstForWhere) +{ + const std::string ticket = mapTicketId(ticketStr); + if(fTable.find(ticket) == fTable.end()) { + if(firstForTicket!=nullptr) *firstForTicket = true; + fTable[ticket] = std::map < std::string, std::set < std::string > >(); + } else { + if(firstForTicket!=nullptr) *firstForTicket = false; + } + if(where==nullptr) return; + + if(fTable[ticket].find(where) == fTable[ticket].end()) { + if(firstForWhere!=nullptr) *firstForWhere = true; + fTable[ticket][where] = std::set < std::string >(); + } else { + if(firstForWhere!=nullptr) *firstForWhere = false; + } + if(msg==nullptr || !*msg) return; + + std::string str(msg); + fTable[ticket][where].insert(str); +} + +UBool KnownIssues::print() +{ + if(fTable.empty()) { + return false; + } + + std::cout << "KNOWN 
ISSUES" << std::endl; + for( std::map< std::string, + std::map < std::string, std::set < std::string > > >::iterator i = fTable.begin(); + i != fTable.end(); + i++ ) { + const std::string ticketid = (*i).first; + std::cout << "[" << ticketid << "] "; + if(ticketid.rfind(ICU_BUG_PREFIX, 0) == 0 || ticketid.rfind(CLDR_BUG_PREFIX, 0) == 0) { + // If it's a unicode.org bug (id starts with ICU- or CLDR-; pos=0 makes rfind a starts-with test). + std::cout << UNICODE_BUG_URL << ticketid; + } // Else: some other kind of bug. Allow this, but without a URL. + std::cout << std::endl; + + for( std::map< std::string, std::set < std::string > >::iterator ii = (*i).second.begin(); + ii != (*i).second.end(); + ii++ ) { + std::cout << " " << (*ii).first << std::endl; + for ( std::set < std::string >::iterator iii = (*ii).second.begin(); + iii != (*ii).second.end(); + iii++ ) { + std::cout << " " << '"' << (*iii) << '"' << std::endl; + } + } + } + return true; +} + +U_CAPI void *udbg_knownIssue_openU(void *ptr, const char *ticket, char *where, const char16_t *msg, UBool *firstForTicket, + UBool *firstForWhere) { + KnownIssues *t = static_cast<KnownIssues*>(ptr); + if(t==nullptr) { + t = new KnownIssues(); + } + + t->add(ticket, where, msg, firstForTicket, firstForWhere); + + return static_cast<void*>(t); +} + +U_CAPI void *udbg_knownIssue_open(void *ptr, const char *ticket, char *where, const char *msg, UBool *firstForTicket, + UBool *firstForWhere) { + KnownIssues *t = static_cast<KnownIssues*>(ptr); + if(t==nullptr) { + t = new KnownIssues(); + } + + t->add(ticket, where, msg, firstForTicket, firstForWhere); + + return static_cast<void*>(t); +} + +U_CAPI UBool udbg_knownIssue_print(void *ptr) { + KnownIssues *t = static_cast<KnownIssues*>(ptr); + if(t==nullptr) { + return false; + } else { + const UBool anyIssues = t->print(); /* print() reports false when the table is empty */ + return anyIssues; + } +} + +U_CAPI void udbg_knownIssue_close(void *ptr) { + KnownIssues *t = static_cast<KnownIssues*>(ptr); + delete t; +} diff --git a/intl/icu/source/tools/toolutil/udbgutil.h b/intl/icu/source/tools/toolutil/udbgutil.h new file mode 
100644 index 0000000000..e3ed513839 --- /dev/null +++ b/intl/icu/source/tools/toolutil/udbgutil.h @@ -0,0 +1,147 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +************************************************************************ +* Copyright (c) 2008-2015, International Business Machines +* Corporation and others. All Rights Reserved. +************************************************************************ +*/ + +/** C Utilities to aid in debugging **/ + +#ifndef _UDBGUTIL_H +#define _UDBGUTIL_H + +#include "unicode/utypes.h" +#include <stdio.h> + +enum UDebugEnumType { + UDBG_UDebugEnumType = 0, /* Self-referential, strings for UDebugEnumType. Count=UDBG_ENUM_COUNT. */ +#if !UCONFIG_NO_FORMATTING + UDBG_UCalendarDateFields, /* UCalendarDateFields. Count=UCAL_FIELD_COUNT. Unsupported if UCONFIG_NO_FORMATTING. */ + UDBG_UCalendarMonths, /* UCalendarMonths. Count= (UCAL_UNDECIMBER+1) */ + UDBG_UDateFormatStyle, /* Count = UDAT_SHORT+1 */ +#endif +#if UCONFIG_ENABLE_PLUGINS + UDBG_UPlugReason, /* Count = UPLUG_REASON_COUNT */ + UDBG_UPlugLevel, /* Count = UPLUG_LEVEL_COUNT */ +#endif + UDBG_UAcceptResult, /* Count = ULOC_ACCEPT_FALLBACK+1=3 */ + + /* All following enums may be discontiguous. */ + +#if !UCONFIG_NO_COLLATION + UDBG_UColAttributeValue, /* Count = UCOL_ATTRIBUTE_VALUE_COUNT */ +#endif + UDBG_ENUM_COUNT, + UDBG_HIGHEST_CONTIGUOUS_ENUM = UDBG_UAcceptResult, /**< last enum in this list with contiguous (testable) values. */ + UDBG_INVALID_ENUM = -1 /** Invalid enum value **/ +}; + +typedef enum UDebugEnumType UDebugEnumType; + +/** + * @param type the type of enum + * Return how many enum value names are defined for this type. + * Should be equal to the appropriate _COUNT constant or there is an error. Return -1 if unsupported. 
+ */ +U_CAPI int32_t U_EXPORT2 udbg_enumCount(UDebugEnumType type); + +/** + * Convert an enum to a string + * @param type type of enum + * @param field field number + * @return string of the format "ERA", "YEAR", etc, or NULL if out of range or unsupported + */ +U_CAPI const char * U_EXPORT2 udbg_enumName(UDebugEnumType type, int32_t field); + +/** + * for consistency checking + * @param type the type of enum + * Print how many enums should be contained for this type. + * This is equal to the appropriate _COUNT constant or there is an error. Returns -1 if unsupported. + */ +U_CAPI int32_t U_EXPORT2 udbg_enumExpectedCount(UDebugEnumType type); + +/** + * For consistency checking, returns the expected enum ordinal value for the given index value. + * @param type which type + * @param field field number + * @return should be equal to 'field' or -1 if out of range. + */ +U_CAPI int32_t U_EXPORT2 udbg_enumArrayValue(UDebugEnumType type, int32_t field); + +/** + * Locate the specified field value by name. + * @param type which type + * @param name name of string (case sensitive) + * @return should be a field value or -1 if not found. 
+ */ +U_CAPI int32_t U_EXPORT2 udbg_enumByName(UDebugEnumType type, const char *name); + + +/** + * Return the Platform (U_PLATFORM) as a string + */ +U_CAPI const char *udbg_getPlatform(void); + +/** + * Get the nth system parameter's name + * @param i index of name, starting from zero + * @return name, or NULL if off the end + * @see udbg_getSystemParameterValueByIndex + */ +U_CAPI const char *udbg_getSystemParameterNameByIndex(int32_t i); + +/** + * Get the nth system parameter's value, in a user supplied buffer (buffer, of capacity bufferCapacity) + * @param i index of value, starting from zero + * @param status error status + * @return length written (standard termination rules), or 0 if i is out of range + * @see udbg_getSystemParameterNameByIndex + */ +U_CAPI int32_t udbg_getSystemParameterValueByIndex(int32_t i, char *buffer, int32_t bufferCapacity, UErrorCode *status); + +/** + * Write ICU info as XML + */ +U_CAPI void udbg_writeIcuInfo(FILE *f); + +/** + * \def UDBG_KNOWNISSUE_LEN + * Length of output buffer for udbg_knownIssueURLFrom + */ +#define UDBG_KNOWNISSUE_LEN 255 + +/** + * Open (or reopen) a 'known issue' table and record one issue (ticket, where, UTF-16 msg) in it. + * @param ptr pointer to 'table'. Opaque. Pass NULL to create a new table. + * @return new or existing ptr + */ +U_CAPI void *udbg_knownIssue_openU(void *ptr, const char *ticket, char *where, const UChar *msg, UBool *firstForTicket, + UBool *firstForWhere); + + +/** + * Open (or reopen) a 'known issue' table and record one issue (ticket, where, char* msg) in it. + * @param ptr pointer to 'table'. Opaque. Pass NULL to create a new table. + * @return new or existing ptr + */ +U_CAPI void *udbg_knownIssue_open(void *ptr, const char *ticket, char *where, const char *msg, UBool *firstForTicket, + UBool *firstForWhere); + +/** + * Print 'known issue' table, to std::cout. + * @param ptr pointer from udbg_knownIssue_open or udbg_knownIssue_openU + * @return true if there were any issues. + */ +U_CAPI UBool udbg_knownIssue_print(void *ptr); + +/** + * Close 'known issue' table. 
+ * @param ptr + */ +U_CAPI void udbg_knownIssue_close(void *ptr); + + +#endif diff --git a/intl/icu/source/tools/toolutil/unewdata.cpp b/intl/icu/source/tools/toolutil/unewdata.cpp new file mode 100644 index 0000000000..27414d2eba --- /dev/null +++ b/intl/icu/source/tools/toolutil/unewdata.cpp @@ -0,0 +1,286 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 1999-2010, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: unewdata.c +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 1999oct25 +* created by: Markus W. Scherer +*/ + +#include <stdio.h> +#include "unicode/utypes.h" +#include "unicode/putil.h" +#include "unicode/ustring.h" +#include "cmemory.h" +#include "cstring.h" +#include "filestrm.h" +#include "unicode/udata.h" +#include "unewdata.h" + +struct UNewDataMemory { + FileStream *file; + uint16_t headerSize; + uint8_t magic1, magic2; +}; + +U_CAPI UNewDataMemory * U_EXPORT2 +udata_create(const char *dir, const char *type, const char *name, + const UDataInfo *pInfo, + const char *comment, + UErrorCode *pErrorCode) { + UNewDataMemory *pData; + uint16_t headerSize, commentLength; + char filename[512]; + uint8_t bytes[16]; + int32_t length; + + if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) { + return nullptr; + } else if(name==nullptr || *name==0 || pInfo==nullptr) { + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; + return nullptr; + } + + /* allocate the data structure */ + pData=(UNewDataMemory *)uprv_malloc(sizeof(UNewDataMemory)); + if(pData==nullptr) { + *pErrorCode=U_MEMORY_ALLOCATION_ERROR; + return nullptr; + } + + char dirSepChar = U_FILE_SEP_CHAR; +#if (U_FILE_SEP_CHAR != U_FILE_ALT_SEP_CHAR) + // We may need to append a different 
directory separator when building for Cygwin or MSYS2. + if(dir && *dir) { + if(!uprv_strchr(dir, U_FILE_SEP_CHAR) && uprv_strchr(dir, U_FILE_ALT_SEP_CHAR)) { + dirSepChar = U_FILE_ALT_SEP_CHAR; + } + } +#endif + + /* Check that the full path won't be too long */ + length = 0; /* Start with nothing */ + if(dir != nullptr && *dir !=0) /* Add directory length if one was given */ + { + length += static_cast<int32_t>(strlen(dir)); + + /* Add 1 if dir doesn't end with path sep */ + if (dir[strlen(dir) - 1]!= dirSepChar) { + length++; + } + } + length += static_cast<int32_t>(strlen(name)); /* Add the filename length */ + + if(type != nullptr && *type !=0) { /* Add directory length if given */ + length += static_cast<int32_t>(strlen(type)); + } + + + /* LDH buffer Length error check */ + if(length > ((int32_t)sizeof(filename) - 1)) + { + *pErrorCode = U_BUFFER_OVERFLOW_ERROR; + uprv_free(pData); + return nullptr; + } + + /* open the output file */ + if(dir!=nullptr && *dir!=0) { /* if dir has a value, we prepend it to the filename */ + char *p=filename+strlen(dir); + uprv_strcpy(filename, dir); + if (*(p-1)!=dirSepChar) { + *p++=dirSepChar; + *p=0; + } + } else { /* otherwise, we'll output to the current dir */ + filename[0]=0; + } + uprv_strcat(filename, name); + if(type!=nullptr && *type!=0) { + uprv_strcat(filename, "."); + uprv_strcat(filename, type); + } + pData->file=T_FileStream_open(filename, "wb"); + if(pData->file==nullptr) { + uprv_free(pData); + *pErrorCode=U_FILE_ACCESS_ERROR; + return nullptr; + } + + /* write the header information */ + headerSize=(uint16_t)(pInfo->size+4); + if(comment!=nullptr && *comment!=0) { + commentLength=(uint16_t)(uprv_strlen(comment)+1); + headerSize+=commentLength; + } else { + commentLength=0; + } + + /* write the size of the header, take padding into account */ + pData->headerSize=(uint16_t)((headerSize+15)&~0xf); + pData->magic1=0xda; + pData->magic2=0x27; + T_FileStream_write(pData->file, &pData->headerSize, 4); + + /* write 
the information data */ + T_FileStream_write(pData->file, pInfo, pInfo->size); + + /* write the comment */ + if(commentLength>0) { + T_FileStream_write(pData->file, comment, commentLength); + } + + /* write padding bytes to align the data section to 16 bytes */ + headerSize&=0xf; + if(headerSize!=0) { + headerSize=(uint16_t)(16-headerSize); + uprv_memset(bytes, 0, headerSize); + T_FileStream_write(pData->file, bytes, headerSize); + } + + return pData; +} + +U_CAPI uint32_t U_EXPORT2 +udata_finish(UNewDataMemory *pData, UErrorCode *pErrorCode) { + uint32_t fileLength=0; + + if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) { + return 0; + } + + if(pData!=nullptr) { + if(pData->file!=nullptr) { + /* fflush(pData->file);*/ + fileLength=T_FileStream_size(pData->file); + if(T_FileStream_error(pData->file)) { + *pErrorCode=U_FILE_ACCESS_ERROR; + } else { + fileLength-=pData->headerSize; + } + T_FileStream_close(pData->file); + } + uprv_free(pData); + } + + return fileLength; +} + +/* dummy UDataInfo cf. 
udata.h */ +static const UDataInfo dummyDataInfo = { + sizeof(UDataInfo), + 0, + + U_IS_BIG_ENDIAN, + U_CHARSET_FAMILY, + U_SIZEOF_UCHAR, + 0, + + { 0, 0, 0, 0 }, /* dummy dataFormat */ + { 0, 0, 0, 0 }, /* dummy formatVersion */ + { 0, 0, 0, 0 } /* dummy dataVersion */ +}; + +U_CAPI void U_EXPORT2 +udata_createDummy(const char *dir, const char *type, const char *name, UErrorCode *pErrorCode) { + if(U_SUCCESS(*pErrorCode)) { + udata_finish(udata_create(dir, type, name, &dummyDataInfo, nullptr, pErrorCode), pErrorCode); + if(U_FAILURE(*pErrorCode)) { + fprintf(stderr, "error %s writing dummy data file %s" U_FILE_SEP_STRING "%s.%s\n", + u_errorName(*pErrorCode), dir, name, type); + exit(*pErrorCode); + } + } +} + +U_CAPI void U_EXPORT2 +udata_write8(UNewDataMemory *pData, uint8_t byte) { + if(pData!=nullptr && pData->file!=nullptr) { + T_FileStream_write(pData->file, &byte, 1); + } +} + +U_CAPI void U_EXPORT2 +udata_write16(UNewDataMemory *pData, uint16_t word) { + if(pData!=nullptr && pData->file!=nullptr) { + T_FileStream_write(pData->file, &word, 2); + } +} + +U_CAPI void U_EXPORT2 +udata_write32(UNewDataMemory *pData, uint32_t wyde) { + if(pData!=nullptr && pData->file!=nullptr) { + T_FileStream_write(pData->file, &wyde, 4); + } +} + +U_CAPI void U_EXPORT2 +udata_writeBlock(UNewDataMemory *pData, const void *s, int32_t length) { + if(pData!=nullptr && pData->file!=nullptr) { + if(length>0) { + T_FileStream_write(pData->file, s, length); + } + } +} + +U_CAPI void U_EXPORT2 +udata_writePadding(UNewDataMemory *pData, int32_t length) { + static const uint8_t padding[16]={ + 0xaa, 0xaa, 0xaa, 0xaa, + 0xaa, 0xaa, 0xaa, 0xaa, + 0xaa, 0xaa, 0xaa, 0xaa, + 0xaa, 0xaa, 0xaa, 0xaa + }; + if(pData!=nullptr && pData->file!=nullptr) { + while(length>=16) { + T_FileStream_write(pData->file, padding, 16); + length-=16; + } + if(length>0) { + T_FileStream_write(pData->file, padding, length); + } + } +} + +U_CAPI void U_EXPORT2 +udata_writeString(UNewDataMemory *pData, const char 
*s, int32_t length) { + if(pData!=nullptr && pData->file!=nullptr) { + if(length==-1) { + length=(int32_t)uprv_strlen(s); + } + if(length>0) { + T_FileStream_write(pData->file, s, length); + } + } +} + +U_CAPI void U_EXPORT2 +udata_writeUString(UNewDataMemory *pData, const char16_t *s, int32_t length) { + if(pData!=nullptr && pData->file!=nullptr) { + if(length==-1) { + length=u_strlen(s); + } + if(length>0) { + T_FileStream_write(pData->file, s, length*sizeof(char16_t)); + } + } +} + +/* + * Hey, Emacs, please set the following: + * + * Local Variables: + * indent-tabs-mode: nil + * End: + * + */ + diff --git a/intl/icu/source/tools/toolutil/unewdata.h b/intl/icu/source/tools/toolutil/unewdata.h new file mode 100644 index 0000000000..137fb49584 --- /dev/null +++ b/intl/icu/source/tools/toolutil/unewdata.h @@ -0,0 +1,113 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 1999-2010, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: unewdata.h +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 1999oct25 +* created by: Markus W. Scherer +*/ + +#ifndef __UNEWDATA_H__ +#define __UNEWDATA_H__ + +#include "unicode/utypes.h" +#include "unicode/udata.h" + +/* API for writing data -----------------------------------------------------*/ + +/** @memo Forward declaration of the data memory creation type. */ +typedef struct UNewDataMemory UNewDataMemory; + +/** + * Create a new binary data file. + * The file-writing <code>udata_</code> functions facilitate writing + * binary data files that can be read by ICU's <code>udata</code> API. + * This function opens a new file with a filename determined from its + * parameters - of the form "name.type". 
+ * It then writes a short header, followed by the <code>UDataInfo</code> + * structure and, optionally, by the comment string. + * It then writes padding bytes to round up to a multiple of 16 bytes. + * Subsequent write operations will thus start at an offset in the file + * that is a multiple of 16. <code>udata_getMemory()</code> will return + * a pointer to this same starting offset. + * + * See udata.h . + * + * @param dir A string that specifies the directory where the data will be + * written. If <code>NULL</code>, then + * <code>u_getDataDirectory</code> is used. + * @param type A string that specifies the type of data to be written. + * For example, resource bundles are written with type "res", + * conversion tables with type "cnv". + * This may be <code>NULL</code> or empty. + * @param name A string that specifies the name of the data. + * @param pInfo A pointer to a correctly filled <code>UDataInfo</code> + * structure that will be copied into the file. + * @param comment A string (e.g., a copyright statement) that will be + * copied into the file if it is not <code>NULL</code> + * or empty. This string serves only as a comment in the binary + * file. It will not be accessible by any API. + * @param pErrorCode An ICU UErrorCode parameter. It must not be <code>NULL</code>. + */ +U_CAPI UNewDataMemory * U_EXPORT2 +udata_create(const char *dir, const char *type, const char *name, + const UDataInfo *pInfo, + const char *comment, + UErrorCode *pErrorCode); + +/** @memo Close a newly written binary file. */ +U_CAPI uint32_t U_EXPORT2 +udata_finish(UNewDataMemory *pData, UErrorCode *pErrorCode); + +/** @memo Write a dummy data file. */ +U_CAPI void U_EXPORT2 +udata_createDummy(const char *dir, const char *type, const char *name, UErrorCode *pErrorCode); + +/** @memo Write an 8-bit byte to the file. */ +U_CAPI void U_EXPORT2 +udata_write8(UNewDataMemory *pData, uint8_t byte); + +/** @memo Write a 16-bit word to the file. 
*/ +U_CAPI void U_EXPORT2 +udata_write16(UNewDataMemory *pData, uint16_t word); + +/** @memo Write a 32-bit word to the file. */ +U_CAPI void U_EXPORT2 +udata_write32(UNewDataMemory *pData, uint32_t wyde); + +/** @memo Write a block of bytes to the file. */ +U_CAPI void U_EXPORT2 +udata_writeBlock(UNewDataMemory *pData, const void *s, int32_t length); + +/** @memo Write a block of arbitrary padding bytes to the file. */ +U_CAPI void U_EXPORT2 +udata_writePadding(UNewDataMemory *pData, int32_t length); + +/** @memo Write a <code>char*</code> string of platform "invariant characters" to the file. */ +U_CAPI void U_EXPORT2 +udata_writeString(UNewDataMemory *pData, const char *s, int32_t length); + +/** @memo Write a <code>UChar*</code> string of Unicode character code units to the file. */ +U_CAPI void U_EXPORT2 +udata_writeUString(UNewDataMemory *pData, const UChar *s, int32_t length); + + +/* + * Hey, Emacs, please set the following: + * + * Local Variables: + * indent-tabs-mode: nil + * End: + * + */ + +#endif diff --git a/intl/icu/source/tools/toolutil/uoptions.cpp b/intl/icu/source/tools/toolutil/uoptions.cpp new file mode 100644 index 0000000000..808164ae4d --- /dev/null +++ b/intl/icu/source/tools/toolutil/uoptions.cpp @@ -0,0 +1,133 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2000-2015, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: uoptions.c +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2000apr17 +* created by: Markus W. Scherer +* +* This file provides a command line argument parser. 
+*/ + +#include "unicode/utypes.h" +#include "cstring.h" +#include "uoptions.h" + +U_CAPI int U_EXPORT2 +u_parseArgs(int argc, char* argv[], + int optionCount, UOption options[]) { + char *arg; + int i=1, remaining=1; + char c, stopOptions=0; + + while(i<argc) { + arg=argv[i]; + if(!stopOptions && *arg=='-' && (c=arg[1])!=0) { + /* process an option */ + UOption *option=nullptr; + arg+=2; + if(c=='-') { + /* process a long option */ + if(*arg==0) { + /* stop processing options after "--" */ + stopOptions=1; + } else { + /* search for the option string */ + int j; + for(j=0; j<optionCount; ++j) { + if(options[j].longName && uprv_strcmp(arg, options[j].longName)==0) { + option=options+j; + break; + } + } + if(option==nullptr) { + /* no option matches */ + return -i; + } + option->doesOccur=1; + + if(option->hasArg!=UOPT_NO_ARG) { + /* parse the argument for the option, if any */ + if(i+1<argc && !(argv[i+1][0]=='-' && argv[i+1][1]!=0)) { + /* argument in the next argv[], and there is not an option in there */ + option->value=argv[++i]; + } else if(option->hasArg==UOPT_REQUIRES_ARG) { + /* there is no argument, but one is required: return with error */ + option->doesOccur=0; + return -i; + } + } + + if(option->optionFn!=nullptr && option->optionFn(option->context, option)<0) { + /* the option function was called and returned an error */ + option->doesOccur=0; + return -i; + } + } + } else { + /* process one or more short options */ + do { + /* search for the option letter */ + int j; + for(j=0; j<optionCount; ++j) { + if(c==options[j].shortName) { + option=options+j; + break; + } + } + if(option==nullptr) { + /* no option matches */ + return -i; + } + option->doesOccur=1; + + if(option->hasArg!=UOPT_NO_ARG) { + /* parse the argument for the option, if any */ + if(*arg!=0) { + /* argument following in the same argv[] */ + option->value=arg; + /* do not process the rest of this arg as option letters */ + break; + } else if(i+1<argc && !(argv[i+1][0]=='-' && 
argv[i+1][1]!=0)) { + /* argument in the next argv[], and there is not an option in there */ + option->value=argv[++i]; + /* this break is redundant because we know that *arg==0 */ + break; + } else if(option->hasArg==UOPT_REQUIRES_ARG) { + /* there is no argument, but one is required: return with error */ + option->doesOccur=0; + return -i; + } + } + + if(option->optionFn!=nullptr && option->optionFn(option->context, option)<0) { + /* the option function was called and returned an error */ + option->doesOccur=0; + return -i; + } + + /* get the next option letter */ + option=nullptr; + c=*arg++; + } while(c!=0); + } + + /* go to next argv[] */ + ++i; + } else { + /* move a non-option up in argv[] */ + argv[remaining++]=arg; + ++i; + } + } + return remaining; +} diff --git a/intl/icu/source/tools/toolutil/uoptions.h b/intl/icu/source/tools/toolutil/uoptions.h new file mode 100644 index 0000000000..d00e3da924 --- /dev/null +++ b/intl/icu/source/tools/toolutil/uoptions.h @@ -0,0 +1,143 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2000-2011, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: uoptions.h +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2000apr17 +* created by: Markus W. Scherer +* +* This file provides a command line argument parser. 
+*/ + +#ifndef __UOPTIONS_H__ +#define __UOPTIONS_H__ + +#include "unicode/utypes.h" + +/* This should usually be called before calling u_parseArgs */ +/*#if U_PLATFORM == U_PF_OS390 && (U_CHARSET_FAMILY == U_ASCII_FAMILY)*/ + /* translate args from EBCDIC to ASCII */ +/*# define U_MAIN_INIT_ARGS(argc, argv) __argvtoascii_a(argc, argv)*/ +/*#elif defined(XP_MAC_CONSOLE)*/ +#if defined(XP_MAC_CONSOLE) +# include <console.h> + /* Get the arguments from the GUI, since old Macs don't have a console Window. */ +# define U_MAIN_INIT_ARGS(argc, argv) argc = ccommand((char***)&argv) +#else + /* Normally we do nothing. */ +# define U_MAIN_INIT_ARGS(argc, argv) +#endif + + + +/* forward declarations for the function declaration */ +struct UOption; +typedef struct UOption UOption; + +/* function to be called for a command line option */ +typedef int UOptionFn(void *context, UOption *option); + +/* values of UOption.hasArg */ +enum { UOPT_NO_ARG, UOPT_REQUIRES_ARG, UOPT_OPTIONAL_ARG }; + +/* structure describing a command line option */ +struct UOption { + const char *longName; /* "foo" for --foo */ + const char *value; /* output placeholder, will point to the argument string, if any */ + UOptionFn *optionFn; /* function to be called when this option occurs */ + void *context; /* parameter for the function */ + char shortName; /* 'f' for -f */ + char hasArg; /* enum value: option takes no/requires/may have argument */ + char doesOccur; /* boolean for "this one occurred" */ +}; + +/* macro for an entry in a declaration of UOption[] */ +#define UOPTION_DEF(longName, shortName, hasArg) \ + { longName, NULL, NULL, NULL, shortName, hasArg, 0 } + +/* ICU Tools option definitions */ +#define UOPTION_HELP_H UOPTION_DEF("help", 'h', UOPT_NO_ARG) +#define UOPTION_HELP_QUESTION_MARK UOPTION_DEF("help", '?', UOPT_NO_ARG) +#define UOPTION_VERBOSE UOPTION_DEF("verbose", 'v', UOPT_NO_ARG) +#define UOPTION_QUIET UOPTION_DEF("quiet", 'q', UOPT_NO_ARG) +#define UOPTION_VERSION 
UOPTION_DEF("version", 'V', UOPT_NO_ARG) +#define UOPTION_COPYRIGHT UOPTION_DEF("copyright", 'c', UOPT_NO_ARG) + +#define UOPTION_DESTDIR UOPTION_DEF("destdir", 'd', UOPT_REQUIRES_ARG) +#define UOPTION_SOURCEDIR UOPTION_DEF("sourcedir", 's', UOPT_REQUIRES_ARG) +#define UOPTION_ENCODING UOPTION_DEF("encoding", 'e', UOPT_REQUIRES_ARG) +#define UOPTION_ICUDATADIR UOPTION_DEF("icudatadir", 'i', UOPT_REQUIRES_ARG) +#define UOPTION_WRITE_JAVA UOPTION_DEF("write-java", 'j', UOPT_OPTIONAL_ARG) +#define UOPTION_PACKAGE_NAME UOPTION_DEF("package-name", 'p', UOPT_REQUIRES_ARG) +#define UOPTION_BUNDLE_NAME UOPTION_DEF("bundle-name", 'b', UOPT_REQUIRES_ARG) + +/** + * C Command line argument parser. + * + * This function takes the argv[argc] command line and a description of + * the program's options in form of an array of UOption structures. + * Each UOption defines a long and a short name (a string and a character) + * for options like "--foo" and "-f". + * + * Each option is marked with whether it does not take an argument, + * requires one, or optionally takes one. The argument may follow in + * the same argv[] entry for short options, or it may always follow + * in the next argv[] entry. + * + * An argument is in the next argv[] entry for both long and short name + * options, except it is taken from directly behind the short name in + * its own argv[] entry if there are characters following the option letter. + * An argument in its own argv[] entry must not begin with a '-' + * unless it is only the '-' itself. There is no restriction of the + * argument format if it is part of the short name options's argv[] entry. + * + * The argument is stored in the value field of the corresponding + * UOption entry, and the doesOccur field is set to 1 if the option + * is found at all. + * + * Short name options without arguments can be collapsed into a single + * argv[] entry. After an option letter takes an argument, following + * letters will be taken as its argument. 
+ * + * If the same option is found several times, then the last + * argument value will be stored in the value field. + * + * For each option, a function can be called. This could be used + * for options that occur multiple times and all arguments are to + * be collected. + * + * All options are removed from the argv[] array itself. If the parser + * is successful, then it returns the number of remaining non-option + * strings (including argv[0]). + * argv[0], the program name, is never read or modified. + * + * An option "--" ends option processing; everything after this + * remains in the argv[] array. + * + * An option string "-" alone is treated as a non-option. + * + * If an option is not recognized or an argument missing, then + * the parser returns with the negative index of the argv[] entry + * where the error was detected. + * + * The OS/400 compiler requires that argv either be "char* argv[]", + * or "const char* const argv[]", and it will not accept, + * "const char* argv[]" as a definition for main(). + * + * @param argv This parameter is modified + * @param options This parameter is modified + */ +U_CAPI int U_EXPORT2 +u_parseArgs(int argc, char* argv[], + int optionCount, UOption options[]); + +#endif diff --git a/intl/icu/source/tools/toolutil/uparse.cpp b/intl/icu/source/tools/toolutil/uparse.cpp new file mode 100644 index 0000000000..5aee48b5a4 --- /dev/null +++ b/intl/icu/source/tools/toolutil/uparse.cpp @@ -0,0 +1,383 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2000-2012, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: uparse.c +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2000apr18 +* created by: Markus W. 
Scherer +* +* This file provides a parser for files that are delimited by one single +* character like ';' or TAB. Example: the Unicode Character Properties files +* like UnicodeData.txt are semicolon-delimited. +*/ + +#include "unicode/utypes.h" +#include "unicode/uchar.h" +#include "unicode/ustring.h" +#include "unicode/utf16.h" +#include "cstring.h" +#include "filestrm.h" +#include "uparse.h" +#include "ustr_imp.h" + +#include <stdio.h> + +U_CAPI const char * U_EXPORT2 +u_skipWhitespace(const char *s) { + while(U_IS_INV_WHITESPACE(*s)) { + ++s; + } + return s; +} + +U_CAPI char * U_EXPORT2 +u_rtrim(char *s) { + char *end=uprv_strchr(s, 0); + while(s<end && U_IS_INV_WHITESPACE(*(end-1))) { + *--end = 0; + } + return end; +} + +/* + * If the string starts with # @missing: then return the pointer to the + * following non-whitespace character. + * Otherwise return the original pointer. + * Unicode 5.0 adds such lines in some data files to document + * default property values. + * Poor man's regex for variable amounts of white space. 
+ */ +static const char * +getMissingLimit(const char *s) { + const char *s0=s; + if( + *(s=u_skipWhitespace(s))=='#' && + *(s=u_skipWhitespace(s+1))=='@' && + 0==strncmp((s=u_skipWhitespace(s+1)), "missing", 7) && + *(s=u_skipWhitespace(s+7))==':' + ) { + return u_skipWhitespace(s+1); + } else { + return s0; + } +} + +U_CAPI void U_EXPORT2 +u_parseDelimitedFile(const char *filename, char delimiter, + char *fields[][2], int32_t fieldCount, + UParseLineFn *lineFn, void *context, + UErrorCode *pErrorCode) { + FileStream *file; + char line[10000]; + char *start, *limit; + int32_t i, length; + + if(U_FAILURE(*pErrorCode)) { + return; + } + + if(fields==nullptr || lineFn==nullptr || fieldCount<=0) { + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; + return; + } + + if(filename==nullptr || *filename==0 || (*filename=='-' && filename[1]==0)) { + filename=nullptr; + file=T_FileStream_stdin(); + } else { + file=T_FileStream_open(filename, "r"); + } + if(file==nullptr) { + *pErrorCode=U_FILE_ACCESS_ERROR; + return; + } + + while(T_FileStream_readLine(file, line, sizeof(line))!=nullptr) { + /* remove trailing newline characters */ + length=(int32_t)(u_rtrim(line)-line); + + /* + * detect a line with # @missing: + * start parsing after that, or else from the beginning of the line + * set the default warning for @missing lines + */ + start=(char *)getMissingLimit(line); + if(start==line) { + *pErrorCode=U_ZERO_ERROR; + } else { + *pErrorCode=U_USING_DEFAULT_WARNING; + } + + /* skip this line if it is empty or a comment */ + if(*start==0 || *start=='#') { + continue; + } + + /* remove in-line comments */ + limit=uprv_strchr(start, '#'); + if(limit!=nullptr) { + /* get white space before the pound sign */ + while(limit>start && U_IS_INV_WHITESPACE(*(limit-1))) { + --limit; + } + + /* truncate the line */ + *limit=0; + } + + /* skip lines with only whitespace */ + if(u_skipWhitespace(start)[0]==0) { + continue; + } + + /* for each field, call the corresponding field function */ + for(i=0; 
i<fieldCount; ++i) { + /* set the limit pointer of this field */ + limit=start; + while(*limit!=delimiter && *limit!=0) { + ++limit; + } + + /* set the field start and limit in the fields array */ + fields[i][0]=start; + fields[i][1]=limit; + + /* set start to the beginning of the next field, if any */ + start=limit; + if(*start!=0) { + ++start; + } else if(i+1<fieldCount) { + *pErrorCode=U_PARSE_ERROR; + limit=line+length; + i=fieldCount; + break; + } + } + + /* too few fields? */ + if(U_FAILURE(*pErrorCode)) { + break; + } + + /* call the field function */ + lineFn(context, fields, fieldCount, pErrorCode); + if(U_FAILURE(*pErrorCode)) { + break; + } + } + + if(filename!=nullptr) { + T_FileStream_close(file); + } +} + +/* + * parse a list of code points + * store them as a UTF-32 string in dest[destCapacity] + * return the number of code points + */ +U_CAPI int32_t U_EXPORT2 +u_parseCodePoints(const char *s, + uint32_t *dest, int32_t destCapacity, + UErrorCode *pErrorCode) { + char *end; + uint32_t value; + int32_t count; + + if(U_FAILURE(*pErrorCode)) { + return 0; + } + if(s==nullptr || destCapacity<0 || (destCapacity>0 && dest==nullptr)) { + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + + count=0; + for(;;) { + s=u_skipWhitespace(s); + if(*s==';' || *s==0) { + return count; + } + + /* read one code point */ + value=(uint32_t)uprv_strtoul(s, &end, 16); + if(end<=s || (!U_IS_INV_WHITESPACE(*end) && *end!=';' && *end!=0) || value>=0x110000) { + *pErrorCode=U_PARSE_ERROR; + return 0; + } + + /* append it to the destination array */ + if(count<destCapacity) { + dest[count++]=value; + } else { + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; + } + + /* go to the following characters */ + s=end; + } +} + +/* + * parse a list of code points + * store them as a string in dest[destCapacity] + * set the first code point in *pFirst + * @return The length of the string in numbers of UChars. 
+ */ +U_CAPI int32_t U_EXPORT2 +u_parseString(const char *s, + char16_t *dest, int32_t destCapacity, + uint32_t *pFirst, + UErrorCode *pErrorCode) { + char *end; + uint32_t value; + int32_t destLength; + + if(U_FAILURE(*pErrorCode)) { + return 0; + } + if(s==nullptr || destCapacity<0 || (destCapacity>0 && dest==nullptr)) { + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + + if(pFirst!=nullptr) { + *pFirst=0xffffffff; + } + + destLength=0; + for(;;) { + s=u_skipWhitespace(s); + if(*s==';' || *s==0) { + if(destLength<destCapacity) { + dest[destLength]=0; + } else if(destLength==destCapacity) { + *pErrorCode=U_STRING_NOT_TERMINATED_WARNING; + } else { + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; + } + return destLength; + } + + /* read one code point */ + value=(uint32_t)uprv_strtoul(s, &end, 16); + if(end<=s || (!U_IS_INV_WHITESPACE(*end) && *end!=';' && *end!=0) || value>=0x110000) { + *pErrorCode=U_PARSE_ERROR; + return 0; + } + + /* store the first code point */ + if(pFirst!=nullptr) { + *pFirst=value; + pFirst=nullptr; + } + + /* append it to the destination array */ + if((destLength+U16_LENGTH(value))<=destCapacity) { + U16_APPEND_UNSAFE(dest, destLength, value); + } else { + destLength+=U16_LENGTH(value); + } + + /* go to the following characters */ + s=end; + } +} + +/* read a range like start or start..end */ +U_CAPI int32_t U_EXPORT2 +u_parseCodePointRangeAnyTerminator(const char *s, + uint32_t *pStart, uint32_t *pEnd, + const char **terminator, + UErrorCode *pErrorCode) { + char *end; + uint32_t value; + + if(U_FAILURE(*pErrorCode)) { + return 0; + } + if(s==nullptr || pStart==nullptr || pEnd==nullptr) { + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + + /* read the start code point */ + s=u_skipWhitespace(s); + value=(uint32_t)uprv_strtoul(s, &end, 16); + if(end<=s || value>=0x110000) { + *pErrorCode=U_PARSE_ERROR; + return 0; + } + *pStart=*pEnd=value; + + /* is there a "..end"? */ + s=u_skipWhitespace(end); + if(*s!='.' 
|| s[1]!='.') { + *terminator=end; + return 1; + } + s=u_skipWhitespace(s+2); + + /* read the end code point */ + value=(uint32_t)uprv_strtoul(s, &end, 16); + if(end<=s || value>=0x110000) { + *pErrorCode=U_PARSE_ERROR; + return 0; + } + *pEnd=value; + + /* is this a valid range? */ + if(value<*pStart) { + *pErrorCode=U_PARSE_ERROR; + return 0; + } + + *terminator=end; + return value-*pStart+1; +} + +U_CAPI int32_t U_EXPORT2 +u_parseCodePointRange(const char *s, + uint32_t *pStart, uint32_t *pEnd, + UErrorCode *pErrorCode) { + const char *terminator; + int32_t rangeLength= + u_parseCodePointRangeAnyTerminator(s, pStart, pEnd, &terminator, pErrorCode); + if(U_SUCCESS(*pErrorCode)) { + terminator=u_skipWhitespace(terminator); + if(*terminator!=';' && *terminator!=0) { + *pErrorCode=U_PARSE_ERROR; + return 0; + } + } + return rangeLength; +} + +U_CAPI int32_t U_EXPORT2 +u_parseUTF8(const char *source, int32_t sLen, char *dest, int32_t destCapacity, UErrorCode *status) { + const char *read = source; + int32_t i = 0; + unsigned int value = 0; + if(sLen == -1) { + sLen = (int32_t)strlen(source); + } + + while(read < source+sLen) { + sscanf(read, "%2x", &value); + if(i < destCapacity) { + dest[i] = (char)value; + } + i++; + read += 2; + } + return u_terminateChars(dest, destCapacity, i, status); +} diff --git a/intl/icu/source/tools/toolutil/uparse.h b/intl/icu/source/tools/toolutil/uparse.h new file mode 100644 index 0000000000..df0e79a21f --- /dev/null +++ b/intl/icu/source/tools/toolutil/uparse.h @@ -0,0 +1,153 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2000-2010, International Business Machines +* Corporation and others. All Rights Reserved. 
+* +******************************************************************************* +* file name: uparse.h +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2000apr18 +* created by: Markus W. Scherer +* +* This file provides a parser for files that are delimited by one single +* character like ';' or TAB. Example: the Unicode Character Properties files +* like UnicodeData.txt are semicolon-delimited. +*/ + +#ifndef __UPARSE_H__ +#define __UPARSE_H__ + +#include "unicode/utypes.h" + +/** + * Is c an invariant-character whitespace? + * @param c invariant character + */ +#define U_IS_INV_WHITESPACE(c) ((c)==' ' || (c)=='\t' || (c)=='\r' || (c)=='\n') + +U_CDECL_BEGIN + +/** + * Skip space ' ' and TAB '\t' characters. + * + * @param s Pointer to characters. + * @return Pointer to first character at or after s that is not a space or TAB. + */ +U_CAPI const char * U_EXPORT2 +u_skipWhitespace(const char *s); + +/** + * Trim whitespace (including line endings) from the end of the string. + * + * @param s Pointer to the string. + * @return Pointer to the new end of the string. + */ +U_CAPI char * U_EXPORT2 +u_rtrim(char *s); + +/** Function type for u_parseDelimitedFile(). */ +typedef void U_CALLCONV +UParseLineFn(void *context, + char *fields[][2], + int32_t fieldCount, + UErrorCode *pErrorCode); + +/** + * Parser for files that are similar to UnicodeData.txt: + * This function opens the file and reads it line by line. It skips empty lines + * and comment lines that start with a '#'. + * All other lines are separated into fields with one delimiter character + * (semicolon for Unicode Properties files) between two fields. The last field in + * a line does not need to be terminated with a delimiter. + * + * For each line, after segmenting it, a line function is called. + * It gets passed the array of field start and limit pointers that is + * passed into this parser and filled by it for each line. 
+ * For each field i of the line, the start pointer in fields[i][0] + * points to the beginning of the field, while the limit pointer in fields[i][1] + * points behind the field, i.e., to the delimiter or the line end. + * + * The context parameter of the line function is + * the same as the one for the parse function. + * + * The line function may modify the contents of the fields including the + * limit characters. + * + * If the file cannot be opened, or there is a parsing error or a field function + * sets *pErrorCode, then the parser returns with *pErrorCode set to an error code. + */ +U_CAPI void U_EXPORT2 +u_parseDelimitedFile(const char *filename, char delimiter, + char *fields[][2], int32_t fieldCount, + UParseLineFn *lineFn, void *context, + UErrorCode *pErrorCode); + +/** + * Parse a string of code points like 0061 0308 0300. + * s must end with either ';' or NUL. + * + * @return Number of code points. + */ +U_CAPI int32_t U_EXPORT2 +u_parseCodePoints(const char *s, + uint32_t *dest, int32_t destCapacity, + UErrorCode *pErrorCode); + +/** + * Parse a list of code points like 0061 0308 0300 + * into a UChar * string. + * s must end with either ';' or NUL. + * + * Set the first code point in *pFirst. + * + * @param s Input char * string. + * @param dest Output string buffer. + * @param destCapacity Capacity of dest in numbers of UChars. + * @param pFirst If pFirst!=NULL the *pFirst will be set to the first + * code point in the string. + * @param pErrorCode ICU error code. + * @return The length of the string in numbers of UChars. + */ +U_CAPI int32_t U_EXPORT2 +u_parseString(const char *s, + UChar *dest, int32_t destCapacity, + uint32_t *pFirst, + UErrorCode *pErrorCode); + +/** + * Parse a code point range like + * 0085 or + * 4E00..9FA5. + * + * s must contain such a range and end with either ';' or NUL. 
+ * + * @return Length of code point range, end-start+1 + */ +U_CAPI int32_t U_EXPORT2 +u_parseCodePointRange(const char *s, + uint32_t *pStart, uint32_t *pEnd, + UErrorCode *pErrorCode); + +/** + * Same as u_parseCodePointRange() but the range may be terminated by + * any character. The position of the terminating character is returned via + * the *terminator output parameter. + */ +U_CAPI int32_t U_EXPORT2 +u_parseCodePointRangeAnyTerminator(const char *s, + uint32_t *pStart, uint32_t *pEnd, + const char **terminator, + UErrorCode *pErrorCode); + +U_CAPI int32_t U_EXPORT2 +u_parseUTF8(const char *source, int32_t sLen, char *dest, int32_t destCapacity, UErrorCode *status); + +U_CDECL_END + +#endif diff --git a/intl/icu/source/tools/toolutil/writesrc.cpp b/intl/icu/source/tools/toolutil/writesrc.cpp new file mode 100644 index 0000000000..55c2f277b3 --- /dev/null +++ b/intl/icu/source/tools/toolutil/writesrc.cpp @@ -0,0 +1,515 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2005-2012, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: writesrc.c +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2005apr23 +* created by: Markus W. Scherer +* +* Helper functions for writing source code for data. +*/ + +#include <stdio.h> +#include <time.h> + +// The C99 standard suggested that C++ implementations not define PRId64 etc. constants +// unless this macro is defined. +// See the Notes at https://en.cppreference.com/w/cpp/types/integer . +// Similar to defining __STDC_LIMIT_MACROS in unicode/ptypes.h . 
+#ifndef __STDC_FORMAT_MACROS +# define __STDC_FORMAT_MACROS +#endif +#include <cinttypes> + +#include "unicode/utypes.h" +#include "unicode/putil.h" +#include "unicode/ucptrie.h" +#include "unicode/errorcode.h" +#include "unicode/uniset.h" +#include "unicode/usetiter.h" +#include "unicode/utf16.h" +#include "utrie2.h" +#include "cstring.h" +#include "writesrc.h" +#include "util.h" + +U_NAMESPACE_BEGIN + +ValueNameGetter::~ValueNameGetter() {} + +U_NAMESPACE_END + +U_NAMESPACE_USE + +static FILE * +usrc_createWithoutHeader(const char *path, const char *filename) { + char buffer[1024]; + const char *p; + char *q; + FILE *f; + char c; + + if(path==nullptr) { + p=filename; + } else { + /* concatenate path and filename, with U_FILE_SEP_CHAR in between if necessary */ + uprv_strcpy(buffer, path); + q=buffer+uprv_strlen(buffer); + if(q>buffer && (c=*(q-1))!=U_FILE_SEP_CHAR && c!=U_FILE_ALT_SEP_CHAR) { + *q++=U_FILE_SEP_CHAR; + } + uprv_strcpy(q, filename); + p=buffer; + } + + f=fopen(p, "w"); + if (f==nullptr) { + fprintf( + stderr, + "usrc_create(%s, %s): unable to create file\n", + path!=nullptr ? 
path : "", filename); + } + return f; +} + +U_CAPI FILE * U_EXPORT2 +usrc_create(const char *path, const char *filename, int32_t copyrightYear, const char *generator) { + FILE *f = usrc_createWithoutHeader(path, filename); + if (f == nullptr) { + return f; + } + usrc_writeCopyrightHeader(f, "//", copyrightYear); + usrc_writeFileNameGeneratedBy(f, "//", filename, generator); + return f; +} + +U_CAPI FILE * U_EXPORT2 +usrc_createTextData(const char *path, const char *filename, int32_t copyrightYear, const char *generator) { + FILE *f = usrc_createWithoutHeader(path, filename); + if (f == nullptr) { + return f; + } + usrc_writeCopyrightHeader(f, "#", copyrightYear); + usrc_writeFileNameGeneratedBy(f, "#", filename, generator); + return f; +} + +U_CAPI void U_EXPORT2 +usrc_writeCopyrightHeader(FILE *f, const char *prefix, int32_t copyrightYear) { + fprintf(f, + "%s Copyright (C) %d and later: Unicode, Inc. and others.\n" + "%s License & terms of use: http://www.unicode.org/copyright.html\n", + prefix, copyrightYear, prefix); + if (copyrightYear <= 2016) { + fprintf(f, + "%s Copyright (C) 1999-2016, International Business Machines\n" + "%s Corporation and others. 
All Rights Reserved.\n", + prefix, prefix); + } +} + +U_CAPI void U_EXPORT2 +usrc_writeFileNameGeneratedBy( + FILE *f, + const char *prefix, + const char *filename, + const char *generator) { + char buffer[1024]; + const struct tm *lt; + time_t t; + + const char *pattern = + "%s\n" + "%s file name: %s\n" + "%s\n" + "%s machine-generated by: %s\n" + "\n"; + + time(&t); + lt=localtime(&t); + if(generator==nullptr) { + strftime(buffer, sizeof(buffer), "%Y-%m-%d", lt); + fprintf(f, pattern, prefix, prefix, filename, prefix, prefix, buffer); + } else { + fprintf(f, pattern, prefix, prefix, filename, prefix, prefix, generator); + } +} + +U_CAPI void U_EXPORT2 +usrc_writeArray(FILE *f, + const char *prefix, + const void *p, int32_t width, int32_t length, + const char *indent, + const char *postfix) { + const uint8_t *p8; + const uint16_t *p16; + const uint32_t *p32; + const int64_t *p64; // Signed due to TOML! + int64_t value; // Signed due to TOML! + int32_t i, col; + + p8=nullptr; + p16=nullptr; + p32=nullptr; + p64=nullptr; + switch(width) { + case 8: + p8=(const uint8_t *)p; + break; + case 16: + p16=(const uint16_t *)p; + break; + case 32: + p32=(const uint32_t *)p; + break; + case 64: + p64=(const int64_t *)p; + break; + default: + fprintf(stderr, "usrc_writeArray(width=%ld) unrecognized width\n", (long)width); + return; + } + if(prefix!=nullptr) { + fprintf(f, prefix, (long)length); + } + for(i=col=0; i<length; ++i, ++col) { + if(i>0) { + if(col<16) { + fputc(',', f); + } else { + fputs(",\n", f); + fputs(indent, f); + col=0; + } + } + switch(width) { + case 8: + value=p8[i]; + break; + case 16: + value=p16[i]; + break; + case 32: + value=p32[i]; + break; + case 64: + value=p64[i]; + break; + default: + value=0; /* unreachable */ + break; + } + fprintf(f, value<=9 ? 
"%" PRId64 : "0x%" PRIx64, value); + } + if(postfix!=nullptr) { + fputs(postfix, f); + } +} + +U_CAPI void U_EXPORT2 +usrc_writeUTrie2Arrays(FILE *f, + const char *indexPrefix, const char *data32Prefix, + const UTrie2 *pTrie, + const char *postfix) { + if(pTrie->data32==nullptr) { + /* 16-bit trie */ + usrc_writeArray(f, indexPrefix, pTrie->index, 16, pTrie->indexLength+pTrie->dataLength, "", postfix); + } else { + /* 32-bit trie */ + usrc_writeArray(f, indexPrefix, pTrie->index, 16, pTrie->indexLength, "", postfix); + usrc_writeArray(f, data32Prefix, pTrie->data32, 32, pTrie->dataLength, "", postfix); + } +} + +U_CAPI void U_EXPORT2 +usrc_writeUTrie2Struct(FILE *f, + const char *prefix, + const UTrie2 *pTrie, + const char *indexName, const char *data32Name, + const char *postfix) { + if(prefix!=nullptr) { + fputs(prefix, f); + } + if(pTrie->data32==nullptr) { + /* 16-bit trie */ + fprintf( + f, + " %s,\n" /* index */ + " %s+%ld,\n" /* data16 */ + " nullptr,\n", /* data32 */ + indexName, + indexName, + (long)pTrie->indexLength); + } else { + /* 32-bit trie */ + fprintf( + f, + " %s,\n" /* index */ + " nullptr,\n" /* data16 */ + " %s,\n", /* data32 */ + indexName, + data32Name); + } + fprintf( + f, + " %ld,\n" /* indexLength */ + " %ld,\n" /* dataLength */ + " 0x%hx,\n" /* index2NullOffset */ + " 0x%hx,\n" /* dataNullOffset */ + " 0x%lx,\n" /* initialValue */ + " 0x%lx,\n" /* errorValue */ + " 0x%lx,\n" /* highStart */ + " 0x%lx,\n" /* highValueIndex */ + " nullptr, 0, false, false, 0, nullptr\n", + (long)pTrie->indexLength, (long)pTrie->dataLength, + (short)pTrie->index2NullOffset, (short)pTrie->dataNullOffset, + (long)pTrie->initialValue, (long)pTrie->errorValue, + (long)pTrie->highStart, (long)pTrie->highValueIndex); + if(postfix!=nullptr) { + fputs(postfix, f); + } +} + +U_CAPI void U_EXPORT2 +usrc_writeUCPTrieArrays(FILE *f, + const char *indexPrefix, const char *dataPrefix, + const UCPTrie *pTrie, + const char *postfix, + UTargetSyntax syntax) { + const char* 
indent = (syntax == UPRV_TARGET_SYNTAX_TOML) ? " " : ""; + usrc_writeArray(f, indexPrefix, pTrie->index, 16, pTrie->indexLength, indent, postfix); + int32_t width= + pTrie->valueWidth==UCPTRIE_VALUE_BITS_16 ? 16 : + pTrie->valueWidth==UCPTRIE_VALUE_BITS_32 ? 32 : + pTrie->valueWidth==UCPTRIE_VALUE_BITS_8 ? 8 : 0; + usrc_writeArray(f, dataPrefix, pTrie->data.ptr0, width, pTrie->dataLength, indent, postfix); +} + +U_CAPI void U_EXPORT2 +usrc_writeUCPTrieStruct(FILE *f, + const char *prefix, + const UCPTrie *pTrie, + const char *indexName, const char *dataName, + const char *postfix, + UTargetSyntax syntax) { + if(prefix!=nullptr) { + fputs(prefix, f); + } + if (syntax == UPRV_TARGET_SYNTAX_CCODE) { + fprintf( + f, + " %s,\n" // index + " { %s },\n", // data (union) + indexName, + dataName); + } + const char* pattern = + (syntax == UPRV_TARGET_SYNTAX_CCODE) ? + " %ld, %ld,\n" // indexLength, dataLength + " 0x%lx, 0x%x,\n" // highStart, shifted12HighStart + " %d, %d,\n" // type, valueWidth + " 0, 0,\n" // reserved32, reserved16 + " 0x%x, 0x%lx,\n" // index3NullOffset, dataNullOffset + " 0x%lx,\n" // nullValue + : + "indexLength = %ld\n" + "dataLength = %ld\n" + "highStart = 0x%lx\n" + "shifted12HighStart = 0x%x\n" + "type = %d\n" + "valueWidth = %d\n" + "index3NullOffset = 0x%x\n" + "dataNullOffset = 0x%lx\n" + "nullValue = 0x%lx\n" + ; + fprintf( + f, + pattern, + (long)pTrie->indexLength, (long)pTrie->dataLength, + (long)pTrie->highStart, pTrie->shifted12HighStart, + pTrie->type, pTrie->valueWidth, + pTrie->index3NullOffset, (long)pTrie->dataNullOffset, + (long)pTrie->nullValue); + if(postfix!=nullptr) { + fputs(postfix, f); + } +} + +U_CAPI void U_EXPORT2 +usrc_writeUCPTrie(FILE *f, const char *name, const UCPTrie *pTrie, UTargetSyntax syntax) { + int32_t width= + pTrie->valueWidth==UCPTRIE_VALUE_BITS_16 ? 16 : + pTrie->valueWidth==UCPTRIE_VALUE_BITS_32 ? 32 : + pTrie->valueWidth==UCPTRIE_VALUE_BITS_8 ? 
8 : 0; + char line[100], line2[100], line3[100], line4[100]; + + switch (syntax) { + case UPRV_TARGET_SYNTAX_CCODE: + snprintf(line, sizeof(line), "static const uint16_t %s_trieIndex[%%ld]={\n", name); + snprintf(line2, sizeof(line2), "static const uint%d_t %s_trieData[%%ld]={\n", (int)width, name); + snprintf(line3, sizeof(line3), "\n};\n\n"); + break; + case UPRV_TARGET_SYNTAX_TOML: + snprintf(line, sizeof(line), "index = [\n "); + snprintf(line2, sizeof(line2), "data_%d = [\n ", (int)width); + snprintf(line3, sizeof(line3), "\n]\n"); + break; + default: + UPRV_UNREACHABLE_EXIT; + } + usrc_writeUCPTrieArrays(f, line, line2, pTrie, line3, syntax); + + switch (syntax) { + case UPRV_TARGET_SYNTAX_CCODE: + snprintf(line, sizeof(line), "static const UCPTrie %s_trie={\n", name); + snprintf(line2, sizeof(line2), "%s_trieIndex", name); + snprintf(line3, sizeof(line3), "%s_trieData", name); + snprintf(line4, sizeof(line4), "};\n\n"); + break; + case UPRV_TARGET_SYNTAX_TOML: + line[0] = 0; + line2[0] = 0; + line3[0] = 0; + line4[0] = 0; + break; + default: + UPRV_UNREACHABLE_EXIT; + } + usrc_writeUCPTrieStruct(f, line, pTrie, line2, line3, line4, syntax); +} + +U_CAPI void U_EXPORT2 +usrc_writeUnicodeSet( + FILE *f, + const USet *pSet, + UTargetSyntax syntax) { + // ccode is not yet supported + U_ASSERT(syntax == UPRV_TARGET_SYNTAX_TOML); + + // Write out a list of ranges + const UnicodeSet* set = UnicodeSet::fromUSet(pSet); + UnicodeSetIterator it(*set); + fprintf(f, "# Inclusive ranges of the code points in the set.\n"); + fprintf(f, "ranges = [\n"); + bool seenFirstString = false; + while (it.nextRange()) { + if (it.isString()) { + if (!seenFirstString) { + seenFirstString = true; + fprintf(f, "]\nstrings = [\n"); + } + const UnicodeString& str = it.getString(); + fprintf(f, " "); + usrc_writeStringAsASCII(f, str.getBuffer(), str.length(), syntax); + fprintf(f, ",\n"); + } else { + U_ASSERT(!seenFirstString); + UChar32 start = it.getCodepoint(); + UChar32 end = 
it.getCodepointEnd(); + fprintf(f, " [0x%x, 0x%x],\n", start, end); + } + } + fprintf(f, "]\n"); +} + +U_CAPI void U_EXPORT2 +usrc_writeUCPMap( + FILE *f, + const UCPMap *pMap, + icu::ValueNameGetter *valueNameGetter, + UTargetSyntax syntax) { + // ccode is not yet supported + U_ASSERT(syntax == UPRV_TARGET_SYNTAX_TOML); + (void) syntax; // silence unused variable errors + + // Print out list of ranges + UChar32 start = 0, end; + uint32_t value; + fprintf(f, "# Code points `a` through `b` have value `v`, corresponding to `name`.\n"); + fprintf(f, "ranges = [\n"); + while ((end = ucpmap_getRange(pMap, start, UCPMAP_RANGE_NORMAL, 0, nullptr, nullptr, &value)) >= 0) { + if (valueNameGetter != nullptr) { + const char *name = valueNameGetter->getName(value); + fprintf(f, " {a=0x%x, b=0x%x, v=%u, name=\"%s\"},\n", start, end, value, name); + } else { + fprintf(f, " {a=0x%x, b=0x%x, v=%u},\n", start, end, value); + } + start = end + 1; + } + fprintf(f, "]\n"); +} + +U_CAPI void U_EXPORT2 +usrc_writeArrayOfMostlyInvChars(FILE *f, + const char *prefix, + const char *p, int32_t length, + const char *postfix) { + int32_t i, col; + int prev2, prev, c; + + if(prefix!=nullptr) { + fprintf(f, prefix, (long)length); + } + prev2=prev=-1; + for(i=col=0; i<length; ++i, ++col) { + c=(uint8_t)p[i]; + if(i>0) { + /* Break long lines. Try to break at interesting places, to minimize revision diffs. */ + if( + /* Very long line. */ + col>=32 || + /* Long line, break after terminating NUL. */ + (col>=24 && prev2>=0x20 && prev==0) || + /* Medium-long line, break before non-NUL, non-character byte. */ + (col>=16 && (prev==0 || prev>=0x20) && 0<c && c<0x20) + ) { + fputs(",\n", f); + col=0; + } else { + fputc(',', f); + } + } + fprintf(f, c<0x20 ? 
"%u" : "'%c'", c); + prev2=prev; + prev=c; + } + if(postfix!=nullptr) { + fputs(postfix, f); + } +} + +U_CAPI void U_EXPORT2 +usrc_writeStringAsASCII(FILE *f, + const char16_t* ptr, int32_t length, + UTargetSyntax) { + // For now, assume all UTargetSyntax values are valid here. + fprintf(f, "\""); + int32_t i = 0; + UChar32 cp; + while (i < length) { + U16_NEXT(ptr, i, length, cp); + if (cp == u'"') { + fprintf(f, "\\\""); + } else if (ICU_Utility::isUnprintable(cp)) { + UnicodeString u16result; + ICU_Utility::escapeUnprintable(u16result, cp); + std::string u8result; + u16result.toUTF8String(u8result); + fprintf(f, "%s", u8result.data()); + } else { + U_ASSERT(cp < 0x80); + char s[2] = {static_cast<char>(cp), 0}; + fprintf(f, "%s", s); + } + } + fprintf(f, "\""); +} diff --git a/intl/icu/source/tools/toolutil/writesrc.h b/intl/icu/source/tools/toolutil/writesrc.h new file mode 100644 index 0000000000..9c0be5a100 --- /dev/null +++ b/intl/icu/source/tools/toolutil/writesrc.h @@ -0,0 +1,198 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2005-2012, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: writesrc.h +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2005apr23 +* created by: Markus W. Scherer +* +* Helper functions for writing source code for data. +*/ + +#ifndef __WRITESRC_H__ +#define __WRITESRC_H__ + +#include <stdio.h> +#include "unicode/utypes.h" +#include "unicode/ucpmap.h" +#include "unicode/ucptrie.h" +#include "unicode/umutablecptrie.h" +#include "unicode/uset.h" +#include "utrie2.h" + +/** + * An input to some of the functions in this file specifying whether to write data + * as C/C++ code initializers or as TOML. 
+ */ +typedef enum UTargetSyntax { + UPRV_TARGET_SYNTAX_CCODE = 0, + UPRV_TARGET_SYNTAX_TOML = 1, +} UTargetSyntax; + +/** + * Creates a source text file and writes a header comment with the ICU copyright. + * Writes a C/Java-style comment with the generator name. + */ +U_CAPI FILE * U_EXPORT2 +usrc_create(const char *path, const char *filename, int32_t copyrightYear, const char *generator); + +/** + * Creates a source text file and writes a header comment with the ICU copyright. + * Writes the comment with # lines, as used in scripts and text data. + */ +U_CAPI FILE * U_EXPORT2 +usrc_createTextData(const char *path, const char *filename, int32_t copyrightYear, const char *generator); + +/** + * Writes the ICU copyright to a file stream, with configurable year and comment style. + */ +U_CAPI void U_EXPORT2 +usrc_writeCopyrightHeader(FILE *f, const char *prefix, int32_t copyrightYear); + +/** + * Writes information about the file being machine-generated. + */ +U_CAPI void U_EXPORT2 +usrc_writeFileNameGeneratedBy( + FILE *f, + const char *prefix, + const char *filename, + const char *generator); + +/** + * Writes the contents of an array of 8/16/32/64-bit words. + * The prefix and postfix are optional (can be NULL) and are written first/last. + * The prefix may contain a %ld or similar field for the array length. + * The {} and declaration etc. need to be included in prefix/postfix or + * printed before and after the array contents. + */ +U_CAPI void U_EXPORT2 +usrc_writeArray(FILE *f, + const char *prefix, + const void *p, int32_t width, int32_t length, + const char *indent, + const char *postfix); + +/** + * Calls usrc_writeArray() for the index and data arrays of a frozen UTrie2. + * Only the index array is written for a 16-bit UTrie2. In this case, dataPrefix + * is ignored and can be NULL. 
+ */ +U_CAPI void U_EXPORT2 +usrc_writeUTrie2Arrays(FILE *f, + const char *indexPrefix, const char *dataPrefix, + const UTrie2 *pTrie, + const char *postfix); + +/** + * Writes the UTrie2 struct values. + * The {} and declaration etc. need to be included in prefix/postfix or + * printed before and after the array contents. + */ +U_CAPI void U_EXPORT2 +usrc_writeUTrie2Struct(FILE *f, + const char *prefix, + const UTrie2 *pTrie, + const char *indexName, const char *dataName, + const char *postfix); + +/** + * Calls usrc_writeArray() for the index and data arrays of a UCPTrie. + */ +U_CAPI void U_EXPORT2 +usrc_writeUCPTrieArrays(FILE *f, + const char *indexPrefix, const char *dataPrefix, + const UCPTrie *pTrie, + const char *postfix, + UTargetSyntax syntax); + +/** + * Writes the UCPTrie struct values. + * The {} and declaration etc. need to be included in prefix/postfix or + * printed before and after the array contents. + */ +U_CAPI void U_EXPORT2 +usrc_writeUCPTrieStruct(FILE *f, + const char *prefix, + const UCPTrie *pTrie, + const char *indexName, const char *dataName, + const char *postfix, + UTargetSyntax syntax); + +/** + * Writes the UCPTrie arrays and struct values. + */ +U_CAPI void U_EXPORT2 +usrc_writeUCPTrie(FILE *f, const char *name, const UCPTrie *pTrie, UTargetSyntax syntax); + +/** + * Writes the UnicodeSet range and string lists. + */ +U_CAPI void U_EXPORT2 +usrc_writeUnicodeSet( + FILE *f, + const USet *pSet, + UTargetSyntax syntax); + +#ifdef __cplusplus + +U_NAMESPACE_BEGIN + +class U_TOOLUTIL_API ValueNameGetter { +public: + virtual ~ValueNameGetter(); + virtual const char *getName(uint32_t value) = 0; +}; + +U_NAMESPACE_END + +/** + * Writes the UCPMap ranges list. + * + * The "valueNameGetter" argument is optional; ignored if nullptr. + * If present, it will be used to look up value name strings. 
+ */ +U_CAPI void U_EXPORT2 +usrc_writeUCPMap( + FILE *f, + const UCPMap *pMap, + icu::ValueNameGetter *valueNameGetter, + UTargetSyntax syntax); + +#endif // __cplusplus + +/** + * Writes the contents of an array of mostly invariant characters. + * Characters 0..0x1f are printed as numbers, + * others as characters with single quotes: '%c'. + * + * The prefix and postfix are optional (can be NULL) and are written first/last. + * The prefix may contain a %ld or similar field for the array length. + * The {} and declaration etc. need to be included in prefix/postfix or + * printed before and after the array contents. + */ +U_CAPI void U_EXPORT2 +usrc_writeArrayOfMostlyInvChars(FILE *f, + const char *prefix, + const char *p, int32_t length, + const char *postfix); + +/** + * Writes a syntactically valid Unicode string in all ASCII, escaping quotes + * and non-ASCII characters. + */ +U_CAPI void U_EXPORT2 +usrc_writeStringAsASCII(FILE *f, + const UChar* ptr, int32_t length, + UTargetSyntax syntax); + +#endif diff --git a/intl/icu/source/tools/toolutil/xmlparser.cpp b/intl/icu/source/tools/toolutil/xmlparser.cpp new file mode 100644 index 0000000000..edb85bdab0 --- /dev/null +++ b/intl/icu/source/tools/toolutil/xmlparser.cpp @@ -0,0 +1,827 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2004-2010, International Business Machines +* Corporation and others. All Rights Reserved. 
+* +******************************************************************************* +* file name: xmlparser.cpp +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2004jul21 +* created by: Andy Heninger +*/ + +#include <stdio.h> +#include "unicode/uchar.h" +#include "unicode/ucnv.h" +#include "unicode/regex.h" +#include "filestrm.h" +#include "xmlparser.h" + +#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION + +// character constants +enum { + x_QUOT=0x22, + x_AMP=0x26, + x_APOS=0x27, + x_LT=0x3c, + x_GT=0x3e, + x_l=0x6c +}; + +#define XML_SPACES "[ \\u0009\\u000d\\u000a]" + +// XML #4 +#define XML_NAMESTARTCHAR "[[A-Z]:_[a-z][\\u00c0-\\u00d6][\\u00d8-\\u00f6]" \ + "[\\u00f8-\\u02ff][\\u0370-\\u037d][\\u037F-\\u1FFF][\\u200C-\\u200D]" \ + "[\\u2070-\\u218F][\\u2C00-\\u2FEF][\\u3001-\\uD7FF][\\uF900-\\uFDCF]" \ + "[\\uFDF0-\\uFFFD][\\U00010000-\\U000EFFFF]]" + +// XML #5 +#define XML_NAMECHAR "[" XML_NAMESTARTCHAR "\\-.[0-9]\\u00b7[\\u0300-\\u036f][\\u203f-\\u2040]]" + +// XML #6 +#define XML_NAME XML_NAMESTARTCHAR "(?:" XML_NAMECHAR ")*" + +U_NAMESPACE_BEGIN + +UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLParser) +UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLElement) + +// +// UXMLParser constructor. Mostly just initializes the ICU regexes that are +// used for parsing. +// +UXMLParser::UXMLParser(UErrorCode &status) : + // XML Declaration. XML Production #23. + // example: "<?xml version=1.0 encoding="utf-16" ?> + // This is a sloppy implementation - just look for the leading <?xml and the closing ?> + // allow for a possible leading BOM. 
+ mXMLDecl(UnicodeString("(?s)\\uFEFF?<\\?xml.+?\\?>", -1, US_INV), 0, status), + + // XML Comment production #15 + // example: "<!-- whatever --> + // note, does not detect an illegal "--" within comments + mXMLComment(UnicodeString("(?s)<!--.+?-->", -1, US_INV), 0, status), + + // XML Spaces + // production [3] + mXMLSP(UnicodeString(XML_SPACES "+", -1, US_INV), 0, status), + + // XML Doctype decl production #28 + // example "<!DOCTYPE foo SYSTEM "somewhere" > + // or "<!DOCTYPE foo [internal dtd]> + // TODO: we don't actually parse the DOCTYPE or internal subsets. + // Some internal dtd subsets could confuse this simple-minded + // attempt at skipping over them, specifically, occurrences + // of closing square brackets. These could appear in comments, + // or in parameter entity declarations, for example. + mXMLDoctype(UnicodeString( + "(?s)<!DOCTYPE.*?(>|\\[.*?\\].*?>)", -1, US_INV + ), 0, status), + + // XML PI production #16 + // example "<?target stuff?> + mXMLPI(UnicodeString("(?s)<\\?.+?\\?>", -1, US_INV), 0, status), + + // XML Element Start Productions #40, #41 + // example <foo att1='abc' att2="d e f" > + // capture #1: the tag name + // + mXMLElemStart (UnicodeString("(?s)<(" XML_NAME ")" // match "<tag_name" + "(?:" + XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*" // match "ATTR_NAME = " + "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))" // match '"attribute value"' + ")*" // * for zero or more attributes. 
+ XML_SPACES "*?>", -1, US_INV), 0, status), // match " >" + + // XML Element End production #42 + // example </foo> + mXMLElemEnd (UnicodeString("</(" XML_NAME ")" XML_SPACES "*>", -1, US_INV), 0, status), + + // XML Element Empty production #44 + // example <foo att1="abc" att2="d e f" /> + mXMLElemEmpty (UnicodeString("(?s)<(" XML_NAME ")" // match "<tag_name" + "(?:" + XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*" // match "ATTR_NAME = " + "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))" // match '"attribute value"' + ")*" // * for zero or more attributes. + XML_SPACES "*?/>", -1, US_INV), 0, status), // match " />" + + + // XMLCharData. Everything but '<'. Note that & will be dealt with later. + mXMLCharData(UnicodeString("(?s)[^<]*", -1, US_INV), 0, status), + + // Attribute name = "value". XML Productions 10, 40/41 + // Capture group 1 is name, + // 2 is the attribute value, including the quotes. + // + // Note that attributes are scanned twice. The first time is with + // the regex for an entire element start. There, the attributes + // are checked syntactically, but not separated out one by one. + // Here, we match a single attribute, and make its name and + // attribute value available to the parser code. + mAttrValue(UnicodeString(XML_SPACES "+(" XML_NAME ")" XML_SPACES "*=" XML_SPACES "*" + "((?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))", -1, US_INV), 0, status), + + + mAttrNormalizer(UnicodeString(XML_SPACES, -1, US_INV), 0, status), + + // Match any of the new-line sequences in content. + // All are changed to \u000a. + mNewLineNormalizer(UnicodeString("\\u000d\\u000a|\\u000d\\u0085|\\u000a|\\u000d|\\u0085|\\u2028", -1, US_INV), 0, status), + + // & char references + // We will figure out what we've got based on which capture group has content. + // The last one is a catchall for unrecognized entity references.. 
+ // 1 2 3 4 5 6 7 8 + mAmps(UnicodeString("&(?:(amp;)|(lt;)|(gt;)|(apos;)|(quot;)|#x([0-9A-Fa-f]{1,8});|#([0-9]{1,8});|(.))"), + 0, status), + + fNames(status), + fElementStack(status), + fOneLF((char16_t)0x0a) // Plain new-line string, used in new line normalization. + { + } + +UXMLParser * +UXMLParser::createParser(UErrorCode &errorCode) { + if (U_FAILURE(errorCode)) { + return nullptr; + } else { + return new UXMLParser(errorCode); + } +} + +UXMLParser::~UXMLParser() {} + +UXMLElement * +UXMLParser::parseFile(const char *filename, UErrorCode &errorCode) { + char bytes[4096], charsetBuffer[100]; + FileStream *f; + const char *charset, *pb; + UnicodeString src; + UConverter *cnv; + char16_t *buffer, *pu; + int32_t fileLength, bytesLength, length, capacity; + UBool flush; + + if(U_FAILURE(errorCode)) { + return nullptr; + } + + f=T_FileStream_open(filename, "rb"); + if(f==nullptr) { + errorCode=U_FILE_ACCESS_ERROR; + return nullptr; + } + + bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes)); + if(bytesLength<(int32_t)sizeof(bytes)) { + // we have already read the entire file + fileLength=bytesLength; + } else { + // get the file length + fileLength=T_FileStream_size(f); + } + + /* + * get the charset: + * 1. Unicode signature + * 2. treat as ISO-8859-1 and read XML encoding="charset" + * 3. 
default to UTF-8 + */ + charset=ucnv_detectUnicodeSignature(bytes, bytesLength, nullptr, &errorCode); + if(U_SUCCESS(errorCode) && charset!=nullptr) { + // open converter according to Unicode signature + cnv=ucnv_open(charset, &errorCode); + } else { + // read as Latin-1 and parse the XML declaration and encoding + cnv=ucnv_open("ISO-8859-1", &errorCode); + if(U_FAILURE(errorCode)) { + // unexpected error opening Latin-1 converter + goto exit; + } + + buffer=toUCharPtr(src.getBuffer(bytesLength)); + if(buffer==nullptr) { + // unexpected failure to reserve some string capacity + errorCode=U_MEMORY_ALLOCATION_ERROR; + goto exit; + } + pb=bytes; + pu=buffer; + ucnv_toUnicode( + cnv, + &pu, buffer+src.getCapacity(), + &pb, bytes+bytesLength, + nullptr, true, &errorCode); + src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0); + ucnv_close(cnv); + cnv=nullptr; + if(U_FAILURE(errorCode)) { + // unexpected error in conversion from Latin-1 + src.remove(); + goto exit; + } + + // parse XML declaration + if(mXMLDecl.reset(src).lookingAt(0, errorCode)) { + int32_t declEnd=mXMLDecl.end(errorCode); + // go beyond <?xml + int32_t pos=src.indexOf((char16_t)x_l)+1; + + mAttrValue.reset(src); + while(pos<declEnd && mAttrValue.lookingAt(pos, errorCode)) { // loop runs once per attribute on this element. + UnicodeString attName = mAttrValue.group(1, errorCode); + UnicodeString attValue = mAttrValue.group(2, errorCode); + + // Trim the quotes from the att value. These are left over from the original regex + // that parsed the attribute, which couldn't conveniently strip them. + attValue.remove(0,1); // one char from the beginning + attValue.truncate(attValue.length()-1); // and one from the end. 
+ + if(attName==UNICODE_STRING("encoding", 8)) { + length=attValue.extract(0, 0x7fffffff, charsetBuffer, (int32_t)sizeof(charsetBuffer)); + charset=charsetBuffer; + break; + } + pos = mAttrValue.end(2, errorCode); + } + + if(charset==nullptr) { + // default to UTF-8 + charset="UTF-8"; + } + cnv=ucnv_open(charset, &errorCode); + } + } + + if(U_FAILURE(errorCode)) { + // unable to open the converter + goto exit; + } + + // convert the file contents + capacity=fileLength; // estimated capacity + src.getBuffer(capacity); + src.releaseBuffer(0); // zero length + flush=false; + for(;;) { + // convert contents of bytes[bytesLength] + pb=bytes; + for(;;) { + length=src.length(); + buffer=toUCharPtr(src.getBuffer(capacity)); + if(buffer==nullptr) { + // unexpected failure to reserve some string capacity + errorCode=U_MEMORY_ALLOCATION_ERROR; + goto exit; + } + + pu=buffer+length; + ucnv_toUnicode( + cnv, &pu, buffer+src.getCapacity(), + &pb, bytes+bytesLength, + nullptr, false, &errorCode); + src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0); + if(errorCode==U_BUFFER_OVERFLOW_ERROR) { + errorCode=U_ZERO_ERROR; + capacity=(3*src.getCapacity())/2; // increase capacity by 50% + } else { + break; + } + } + + if(U_FAILURE(errorCode)) { + break; // conversion error + } + + if(flush) { + break; // completely converted the file + } + + // read next block + bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes)); + if(bytesLength==0) { + // reached end of file, convert once more to flush the converter + flush=true; + } + } + +exit: + ucnv_close(cnv); + T_FileStream_close(f); + + if(U_SUCCESS(errorCode)) { + return parse(src, errorCode); + } else { + return nullptr; + } +} + +UXMLElement * +UXMLParser::parse(const UnicodeString &src, UErrorCode &status) { + if(U_FAILURE(status)) { + return nullptr; + } + + UXMLElement *root = nullptr; + fPos = 0; // TODO use just a local pos variable and pass it into functions + // where necessary? 
+ + // set all matchers to work on the input string + mXMLDecl.reset(src); + mXMLComment.reset(src); + mXMLSP.reset(src); + mXMLDoctype.reset(src); + mXMLPI.reset(src); + mXMLElemStart.reset(src); + mXMLElemEnd.reset(src); + mXMLElemEmpty.reset(src); + mXMLCharData.reset(src); + mAttrValue.reset(src); + mAttrNormalizer.reset(src); + mNewLineNormalizer.reset(src); + mAmps.reset(src); + + // Consume the XML Declaration, if present. + if (mXMLDecl.lookingAt(fPos, status)) { + fPos = mXMLDecl.end(status); + } + + // Consume "misc" [XML production 27] appearing before DocType + parseMisc(status); + + // Consume a DocType declaration, if present. + if (mXMLDoctype.lookingAt(fPos, status)) { + fPos = mXMLDoctype.end(status); + } + + // Consume additional "misc" [XML production 27] appearing after the DocType + parseMisc(status); + + // Get the root element + if (mXMLElemEmpty.lookingAt(fPos, status)) { + // Root is an empty element (no nested elements or content) + root = createElement(mXMLElemEmpty, status); + fPos = mXMLElemEmpty.end(status); + } else { + if (mXMLElemStart.lookingAt(fPos, status) == false) { + error("Root Element expected", status); + goto errorExit; + } + root = createElement(mXMLElemStart, status); + UXMLElement *el = root; + + // + // This is the loop that consumes the root element of the document, + // including all nested content. Nested elements are handled by + // explicit pushes/pops of the element stack; there is no recursion + // in the control flow of this code. + // "el" always refers to the current element, the one to which content + // is being added. It is above the top of the element stack. + for (;;) { + // Nested Element Start + if (mXMLElemStart.lookingAt(fPos, status)) { + UXMLElement *t = createElement(mXMLElemStart, status); + el->fChildren.addElement(t, status); + t->fParent = el; + fElementStack.push(el, status); + el = t; + continue; + } + + // Text Content. 
String is concatenated onto the current node's content, + // but only if it contains something other than spaces. + UnicodeString s = scanContent(status); + if (s.length() > 0) { + mXMLSP.reset(s); + if (mXMLSP.matches(status) == false) { + // This chunk of text contains something other than just + // white space. Make a child node for it. + replaceCharRefs(s, status); + el->fChildren.addElement(s.clone(), status); + } + mXMLSP.reset(src); // The matchers need to stay set to the main input string. + continue; + } + + // Comments. Discard. + if (mXMLComment.lookingAt(fPos, status)) { + fPos = mXMLComment.end(status); + continue; + } + + // PIs. Discard. + if (mXMLPI.lookingAt(fPos, status)) { + fPos = mXMLPI.end(status); + continue; + } + + // Element End + if (mXMLElemEnd.lookingAt(fPos, status)) { + fPos = mXMLElemEnd.end(0, status); + const UnicodeString name = mXMLElemEnd.group(1, status); + if (name != *el->fName) { + error("Element start / end tag mismatch", status); + goto errorExit; + } + if (fElementStack.empty()) { + // Close of the root element. We're done with the doc. + el = nullptr; + break; + } + el = (UXMLElement *)fElementStack.pop(); + continue; + } + + // Empty Element. Stored as a child of the current element, but not stacked. + if (mXMLElemEmpty.lookingAt(fPos, status)) { + UXMLElement *t = createElement(mXMLElemEmpty, status); + el->fChildren.addElement(t, status); + continue; + } + + // Hit something within the document that doesn't match anything. + // It's an error. + error("Unrecognized markup", status); + break; + } + + if (el != nullptr || !fElementStack.empty()) { + // We bailed out early, for some reason. + error("Root element not closed.", status); + goto errorExit; + } + } + + // Root Element parse is complete. + // Consume the annoying xml "Misc" that can appear at the end of the doc. 
+ parseMisc(status); + + // We should have reached the end of the input + if (fPos != src.length()) { + error("Extra content at the end of the document", status); + goto errorExit; + } + + // Success! + return root; + +errorExit: + delete root; + return nullptr; +} + +// +// createElement +// We've just matched an element start tag. Create and fill in a UXMLElement object +// for it. +// +UXMLElement * +UXMLParser::createElement(RegexMatcher &mEl, UErrorCode &status) { + // First capture group is the element's name. + UXMLElement *el = new UXMLElement(this, intern(mEl.group(1, status), status), status); + + // Scan for attributes. + int32_t pos = mEl.end(1, status); // The position after the end of the tag name + + while (mAttrValue.lookingAt(pos, status)) { // loop runs once per attribute on this element. + UnicodeString attName = mAttrValue.group(1, status); + UnicodeString attValue = mAttrValue.group(2, status); + + // Trim the quotes from the att value. These are left over from the original regex + // that parsed the attribute, which couldn't conveniently strip them. + attValue.remove(0,1); // one char from the beginning + attValue.truncate(attValue.length()-1); // and one from the end. + + // XML Attribute value normalization. + // This is one of the really screwy parts of the XML spec. + // See http://www.w3.org/TR/2004/REC-xml11-20040204/#AVNormalize + // Note that non-validating parsers must treat all entities as type CDATA + // which simplifies things some. + + // Att normalization step 1: normalize any newlines in the attribute value + mNewLineNormalizer.reset(attValue); + attValue = mNewLineNormalizer.replaceAll(fOneLF, status); + + // Next change all xml white space chars to plain \u0020 spaces. + mAttrNormalizer.reset(attValue); + UnicodeString oneSpace((char16_t)0x0020); + attValue = mAttrNormalizer.replaceAll(oneSpace, status); + + // Replace character entities. 
+ replaceCharRefs(attValue, status); + + // Save the attribute name and value in our document structure. + el->fAttNames.addElement((void *)intern(attName, status), status); + el->fAttValues.addElement(attValue.clone(), status); + pos = mAttrValue.end(2, status); + } + fPos = mEl.end(0, status); + return el; +} + +// +// parseMisc +// Consume XML "Misc" [production #27] +// which is any combination of space, PI and comments +// Need to watch end-of-input because xml MISC stuff is allowed after +// the document element, so we WILL scan off the end in this function +// +void +UXMLParser::parseMisc(UErrorCode &status) { + for (;;) { + if (fPos >= mXMLPI.input().length()) { + break; + } + if (mXMLPI.lookingAt(fPos, status)) { + fPos = mXMLPI.end(status); + continue; + } + if (mXMLSP.lookingAt(fPos, status)) { + fPos = mXMLSP.end(status); + continue; + } + if (mXMLComment.lookingAt(fPos, status)) { + fPos = mXMLComment.end(status); + continue; + } + break; + } +} + +// +// Scan for document content: returns the character data matched at fPos with new-lines normalized and advances fPos past it; returns an empty string (fPos unchanged) when nothing matches. +// +UnicodeString +UXMLParser::scanContent(UErrorCode &status) { + UnicodeString result; + if (mXMLCharData.lookingAt(fPos, status)) { + result = mXMLCharData.group((int32_t)0, status); + // Normalize the new-lines. (Before char ref substitution) + mNewLineNormalizer.reset(result); + result = mNewLineNormalizer.replaceAll(fOneLF, status); + + // TODO: handle CDATA + fPos = mXMLCharData.end(0, status); + } + + return result; +} + +// +// replaceCharRefs +// +// replace the char entities &lt; &amp; &#123; &#x12ab; etc. in a string +// with the corresponding actual character. +// +void +UXMLParser::replaceCharRefs(UnicodeString &s, UErrorCode &status) { + UnicodeString result; + UnicodeString replacement; + int i; + + mAmps.reset(s); + // See the initialization for the regex matcher mAmps. + // Which entity we've matched is determined by which capture group has content, + // which is flagged by start() of that group not being -1. 
+ while (mAmps.find()) { + if (mAmps.start(1, status) != -1) { + replacement.setTo((char16_t)x_AMP); + } else if (mAmps.start(2, status) != -1) { + replacement.setTo((char16_t)x_LT); + } else if (mAmps.start(3, status) != -1) { + replacement.setTo((char16_t)x_GT); + } else if (mAmps.start(4, status) != -1) { + replacement.setTo((char16_t)x_APOS); + } else if (mAmps.start(5, status) != -1) { + replacement.setTo((char16_t)x_QUOT); + } else if (mAmps.start(6, status) != -1) { + UnicodeString hexString = mAmps.group(6, status); + UChar32 val = 0; + for (i=0; i<hexString.length(); i++) { + val = (val << 4) + u_digit(hexString.charAt(i), 16); + } + // TODO: some verification that the character is valid. NOTE(review): val is accumulated over the whole group with no range check - confirm that the mAmps pattern bounds the number of hex digits. + replacement.setTo(val); + } else if (mAmps.start(7, status) != -1) { + UnicodeString decimalString = mAmps.group(7, status); + UChar32 val = 0; + for (i=0; i<decimalString.length(); i++) { + val = val*10 + u_digit(decimalString.charAt(i), 10); + } + // TODO: some verification that the character is valid. NOTE(review): val is accumulated over the whole group with no range check - confirm that the mAmps pattern bounds the number of decimal digits. + replacement.setTo(val); + } else { + // An unrecognized &entity; Leave it alone. + // TODO: check that it really looks like an entity, and is not some + // random & in the text. + replacement = mAmps.group((int32_t)0, status); + } + mAmps.appendReplacement(result, replacement, status); + } + mAmps.appendTail(result); + s = result; +} + +void +UXMLParser::error(const char *message, UErrorCode &status) { + // TODO: something better here... For now: count line feeds up to fPos, print the message with that line number to stderr, and set status to U_PARSE_ERROR unless it already holds a failure. 
+ const UnicodeString &src=mXMLDecl.input(); + int line = 0; + int ci = 0; + while (ci < fPos && ci>=0) { + ci = src.indexOf((char16_t)0x0a, ci+1); + line++; + } + fprintf(stderr, "Error: %s at line %d\n", message, line); + if (U_SUCCESS(status)) { + status = U_PARSE_ERROR; + } +} + +// intern strings like in Java: return the pointer to the key stored in fNames for s, adding s to fNames first if it is not there yet, so that equal names share one pointer + +const UnicodeString * +UXMLParser::intern(const UnicodeString &s, UErrorCode &errorCode) { + const UHashElement *he=fNames.find(s); + if(he!=nullptr) { + // already a known name, return its hashed key pointer + return (const UnicodeString *)he->key.pointer; + } else { + // add this new name and return its hashed key pointer. NOTE(review): he is dereferenced below without a null check - confirm that find() cannot return nullptr here when puti() fails. + fNames.puti(s, 1, errorCode); + he=fNames.find(s); + return (const UnicodeString *)he->key.pointer; + } +} + +const UnicodeString * +UXMLParser::findName(const UnicodeString &s) const { + const UHashElement *he=fNames.find(s); + if(he!=nullptr) { + // a known name, return its hashed key pointer + return (const UnicodeString *)he->key.pointer; + } else { + // unknown name + return nullptr; + } +} + +// UXMLElement ------------------------------------------------------------- *** + +UXMLElement::UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode) : + fParser(parser), + fName(name), + fAttNames(errorCode), + fAttValues(errorCode), + fChildren(errorCode), + fParent(nullptr) +{ +} + +UXMLElement::~UXMLElement() { + int i; + // attribute names are owned by the UXMLParser, don't delete them here; the attribute values and child nodes are deleted by the loops below + for (i=fAttValues.size()-1; i>=0; i--) { + delete (UObject *)fAttValues.elementAt(i); + } + for (i=fChildren.size()-1; i>=0; i--) { + delete (UObject *)fChildren.elementAt(i); + } +} + +const UnicodeString & +UXMLElement::getTagName() const { + return *fName; +} + +UnicodeString +UXMLElement::getText(UBool recurse) const { + UnicodeString text; + appendText(text, recurse); + return text; +} + +void +UXMLElement::appendText(UnicodeString &text, UBool recurse) const { + const UObject *node; + int32_t i, 
count=fChildren.size(); + for(i=0; i<count; ++i) { + node=(const UObject *)fChildren.elementAt(i); + const UnicodeString *s=dynamic_cast<const UnicodeString *>(node); + if(s!=nullptr) { + text.append(*s); + } else if(recurse) /* must be a UXMLElement */ { + ((const UXMLElement *)node)->appendText(text, recurse); + } + } +} + +int32_t +UXMLElement::countAttributes() const { + return fAttNames.size(); +} + +const UnicodeString * +UXMLElement::getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const { + if(0<=i && i<fAttNames.size()) { + name.setTo(*(const UnicodeString *)fAttNames.elementAt(i)); + value.setTo(*(const UnicodeString *)fAttValues.elementAt(i)); + return &value; // points at the caller's copy; could instead return the internal (UnicodeString *)fAttValues.elementAt(i); + } else { + return nullptr; + } +} + +const UnicodeString * +UXMLElement::getAttribute(const UnicodeString &name) const { + // search for the attribute name by comparing the interned pointer, + // not the string contents (fAttNames holds the pointers handed out by the parser for each name) + const UnicodeString *p=fParser->findName(name); + if(p==nullptr) { + return nullptr; // no such attribute seen by the parser at all + } + + int32_t i, count=fAttNames.size(); + for(i=0; i<count; ++i) { + if(p==(const UnicodeString *)fAttNames.elementAt(i)) { + return (const UnicodeString *)fAttValues.elementAt(i); + } + } + return nullptr; +} + +int32_t +UXMLElement::countChildren() const { + return fChildren.size(); +} + +const UObject * +UXMLElement::getChild(int32_t i, UXMLNodeType &type) const { + if(0<=i && i<fChildren.size()) { + const UObject *node=(const UObject *)fChildren.elementAt(i); + if(dynamic_cast<const UXMLElement *>(node)!=nullptr) { + type=UXML_NODE_TYPE_ELEMENT; + } else { + type=UXML_NODE_TYPE_STRING; + } + return node; + } else { + return nullptr; + } +} + +const UXMLElement * +UXMLElement::nextChildElement(int32_t &i) const { + if(i<0) { + return nullptr; + } + + const UObject *node; + int32_t count=fChildren.size(); + while(i<count) { + node=(const UObject *)fChildren.elementAt(i++); 
+ const UXMLElement *elem=dynamic_cast<const UXMLElement *>(node); + if(elem!=nullptr) { + return elem; + } + } + return nullptr; +} + +const UXMLElement * +UXMLElement::getChildElement(const UnicodeString &name) const { + // search for the element name by comparing the interned pointer, + // not the string contents + const UnicodeString *p=fParser->findName(name); + if(p==nullptr) { + return nullptr; // no such element seen by the parser at all + } + + const UObject *node; + int32_t i, count=fChildren.size(); + for(i=0; i<count; ++i) { + node=(const UObject *)fChildren.elementAt(i); + const UXMLElement *elem=dynamic_cast<const UXMLElement *>(node); + if(elem!=nullptr) { + if(p==elem->fName) { + return elem; + } + } + } + return nullptr; +} + +U_NAMESPACE_END + +#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */ + diff --git a/intl/icu/source/tools/toolutil/xmlparser.h b/intl/icu/source/tools/toolutil/xmlparser.h new file mode 100644 index 0000000000..d0dcd9a48a --- /dev/null +++ b/intl/icu/source/tools/toolutil/xmlparser.h @@ -0,0 +1,247 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2004-2005, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: xmlparser.h +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2004jul21 +* created by: Andy Heninger +* +* Tiny XML parser using ICU and intended for use in ICU tests and in build tools. +* Not suitable for production use. Not supported. +* Not conformant. Not efficient. +* But very small. 
+*/ + +#ifndef __XMLPARSER_H__ +#define __XMLPARSER_H__ + +#include "unicode/uobject.h" +#include "unicode/unistr.h" +#include "unicode/regex.h" +#include "uvector.h" +#include "hash.h" + +#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION + +enum UXMLNodeType { + /** Node type string (text contents), stored as a UnicodeString. */ + UXML_NODE_TYPE_STRING, + /** Node type element, stored as a UXMLElement. */ + UXML_NODE_TYPE_ELEMENT, + UXML_NODE_TYPE_COUNT +}; + +U_NAMESPACE_BEGIN + +class UXMLParser; + +/** + * This class represents an element node in a parsed XML tree. + */ +class U_TOOLUTIL_API UXMLElement : public UObject { +public: + /** + * Destructor. + */ + virtual ~UXMLElement(); + + /** + * Get the tag name of this element. + */ + const UnicodeString &getTagName() const; + /** + * Get the text contents of the element. + * Append the contents of all text child nodes. + * @param recurse If true, also recursively appends the contents of all + * text child nodes of element children. + * @return The text contents. + */ + UnicodeString getText(UBool recurse) const; + /** + * Get the number of attributes. + */ + int32_t countAttributes() const; + /** + * Get the i-th attribute. + * @param i Index of the attribute. + * @param name Output parameter, receives the attribute name. + * @param value Output parameter, receives the attribute value. + * @return A pointer to the attribute value (may be &value or a pointer to an + * internal string object), or nullptr if i is out of bounds. + */ + const UnicodeString *getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const; + /** + * Get the value of the attribute with the given name. + * @param name Attribute name to be looked up. + * @return A pointer to the attribute value, or nullptr if this element + * does not have this attribute. + */ + const UnicodeString *getAttribute(const UnicodeString &name) const; + /** + * Get the number of child nodes. 
+ */ + int32_t countChildren() const; + /** + * Get the i-th child node. + * @param i Index of the child node. + * @param type Output parameter, receives the child node type; left unchanged if i is out of bounds. + * @return A pointer to the child node object, or nullptr if i is out of bounds. + */ + const UObject *getChild(int32_t i, UXMLNodeType &type) const; + /** + * Get the next child element node, skipping non-element child nodes. + * @param i Enumeration index; initialize to 0 before getting the first child element. Updated so that the next call continues after the returned child. + * @return A pointer to the next child element, or nullptr if there is none. + */ + const UXMLElement *nextChildElement(int32_t &i) const; + /** + * Get the immediate child element with the given name. + * If there are multiple child elements with this name, then return + * the first one. + * @param name Element name to be looked up. + * @return A pointer to the element node, or nullptr if this element + * does not have this immediate child element. + */ + const UXMLElement *getChildElement(const UnicodeString &name) const; + + /** + * ICU "poor man's RTTI", returns a UClassID for the actual class. + */ + virtual UClassID getDynamicClassID() const override; + + /** + * ICU "poor man's RTTI", returns a UClassID for this class. + */ + static UClassID U_EXPORT2 getStaticClassID(); + +private: + // prevent default construction etc. + UXMLElement(); + UXMLElement(const UXMLElement &other); + UXMLElement &operator=(const UXMLElement &other); + + void appendText(UnicodeString &text, UBool recurse) const; + + friend class UXMLParser; + + UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode); + + const UXMLParser *fParser; + const UnicodeString *fName; // The tag name of this element (owned by the UXMLParser) + UnicodeString fContent; // The text content of this node. 
All element content is + // concatenated even when there are intervening nested elements + // (which doesn't happen with most xml files we care about) + // Sections of content containing only white space are dropped, + // which gets rid of the bogus white space content from + // elements which are primarily containers for nested elements. + UVector fAttNames; // A vector containing the names of this element's attributes + // The names are UnicodeString objects, owned by the UXMLParser. + UVector fAttValues; // A vector containing the attribute values for + // this element's attributes. The order is the same + // as that of the attribute name vector. The values are deleted by this element's destructor. + + UVector fChildren; // The child nodes of this element (a Vector); text nodes are UnicodeString objects, the others are UXMLElement objects, all deleted by this element's destructor. + + UXMLElement *fParent; // A pointer to the parent element of this element. +}; + +/** + * A simple XML parser; it is neither efficient nor conformant and only useful for + * restricted types of XML documents. + * + * The parse methods parse whole documents and return the parse trees via their + * root elements. + */ +class U_TOOLUTIL_API UXMLParser : public UObject { +public: + /** + * Create an XML parser. + */ + static UXMLParser *createParser(UErrorCode &errorCode); + /** + * Destructor. + */ + virtual ~UXMLParser(); + + /** + * Parse an XML document, create the entire document tree, and + * return a pointer to the root element of the parsed tree. + * The caller must delete the element. Returns nullptr if the document cannot be parsed. + */ + UXMLElement *parse(const UnicodeString &src, UErrorCode &errorCode); + /** + * Parse an XML file, create the entire document tree, and + * return a pointer to the root element of the parsed tree. + * The caller must delete the element. + */ + UXMLElement *parseFile(const char *filename, UErrorCode &errorCode); + + /** + * ICU "poor man's RTTI", returns a UClassID for the actual class. + */ + virtual UClassID getDynamicClassID() const override; + + /** + * ICU "poor man's RTTI", returns a UClassID for this class. 
+ */ + static UClassID U_EXPORT2 getStaticClassID(); + +private: + // prevent default construction etc. + UXMLParser(); + UXMLParser(const UXMLParser &other); + UXMLParser &operator=(const UXMLParser &other); + + // constructor + UXMLParser(UErrorCode &status); + + void parseMisc(UErrorCode &status); + UXMLElement *createElement(RegexMatcher &mEl, UErrorCode &status); + void error(const char *message, UErrorCode &status); + UnicodeString scanContent(UErrorCode &status); + void replaceCharRefs(UnicodeString &s, UErrorCode &status); + + const UnicodeString *intern(const UnicodeString &s, UErrorCode &errorCode); +public: + // public for UXMLElement only + const UnicodeString *findName(const UnicodeString &s) const; +private: + + // There is one ICU regex matcher for each of the major XML syntax items + // that are recognized. + RegexMatcher mXMLDecl; + RegexMatcher mXMLComment; + RegexMatcher mXMLSP; + RegexMatcher mXMLDoctype; + RegexMatcher mXMLPI; + RegexMatcher mXMLElemStart; + RegexMatcher mXMLElemEnd; + RegexMatcher mXMLElemEmpty; + RegexMatcher mXMLCharData; + RegexMatcher mAttrValue; + RegexMatcher mAttrNormalizer; + RegexMatcher mNewLineNormalizer; + RegexMatcher mAmps; + + Hashtable fNames; // interned element/attribute name strings + UStack fElementStack; // Stack holds the parent elements when nested + // elements are being parsed. All items on this + // stack are of type UXMLElement. + int32_t fPos; // String index of the current scan position in + // xml source (the string being parsed; this class has no fSrc member). + UnicodeString fOneLF; // Passed as the replacement text to mNewLineNormalizer.replaceAll(). +}; + +U_NAMESPACE_END +#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION */ + +#endif |