summaryrefslogtreecommitdiffstats
path: root/intl/icu/source/tools/toolutil
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--intl/icu/source/tools/toolutil/BUILD.bazel126
-rw-r--r--intl/icu/source/tools/toolutil/Makefile.in155
-rw-r--r--intl/icu/source/tools/toolutil/collationinfo.cpp152
-rw-r--r--intl/icu/source/tools/toolutil/collationinfo.h42
-rw-r--r--intl/icu/source/tools/toolutil/dbgutil.cpp160
-rw-r--r--intl/icu/source/tools/toolutil/dbgutil.h45
-rw-r--r--intl/icu/source/tools/toolutil/denseranges.cpp160
-rw-r--r--intl/icu/source/tools/toolutil/denseranges.h41
-rw-r--r--intl/icu/source/tools/toolutil/filestrm.cpp227
-rw-r--r--intl/icu/source/tools/toolutil/filestrm.h106
-rw-r--r--intl/icu/source/tools/toolutil/filetools.cpp140
-rw-r--r--intl/icu/source/tools/toolutil/filetools.h34
-rw-r--r--intl/icu/source/tools/toolutil/flagparser.cpp180
-rw-r--r--intl/icu/source/tools/toolutil/flagparser.h32
-rw-r--r--intl/icu/source/tools/toolutil/package.cpp1311
-rw-r--r--intl/icu/source/tools/toolutil/package.h203
-rw-r--r--intl/icu/source/tools/toolutil/pkg_genc.cpp1396
-rw-r--r--intl/icu/source/tools/toolutil/pkg_genc.h107
-rw-r--r--intl/icu/source/tools/toolutil/pkg_gencmn.cpp578
-rw-r--r--intl/icu/source/tools/toolutil/pkg_gencmn.h18
-rw-r--r--intl/icu/source/tools/toolutil/pkg_icu.cpp176
-rw-r--r--intl/icu/source/tools/toolutil/pkg_icu.h25
-rw-r--r--intl/icu/source/tools/toolutil/pkg_imp.h38
-rw-r--r--intl/icu/source/tools/toolutil/pkgitems.cpp645
-rw-r--r--intl/icu/source/tools/toolutil/ppucd.cpp622
-rw-r--r--intl/icu/source/tools/toolutil/ppucd.h180
-rw-r--r--intl/icu/source/tools/toolutil/sources.txt24
-rw-r--r--intl/icu/source/tools/toolutil/swapimpl.cpp1048
-rw-r--r--intl/icu/source/tools/toolutil/swapimpl.h45
-rw-r--r--intl/icu/source/tools/toolutil/toolutil.cpp381
-rw-r--r--intl/icu/source/tools/toolutil/toolutil.h201
-rw-r--r--intl/icu/source/tools/toolutil/toolutil.vcxproj272
-rw-r--r--intl/icu/source/tools/toolutil/ucbuf.cpp788
-rw-r--r--intl/icu/source/tools/toolutil/ucbuf.h218
-rw-r--r--intl/icu/source/tools/toolutil/ucln_tu.cpp19
-rw-r--r--intl/icu/source/tools/toolutil/ucm.cpp1195
-rw-r--r--intl/icu/source/tools/toolutil/ucm.h302
-rw-r--r--intl/icu/source/tools/toolutil/ucmstate.cpp1053
-rw-r--r--intl/icu/source/tools/toolutil/udbgutil.cpp769
-rw-r--r--intl/icu/source/tools/toolutil/udbgutil.h147
-rw-r--r--intl/icu/source/tools/toolutil/unewdata.cpp286
-rw-r--r--intl/icu/source/tools/toolutil/unewdata.h113
-rw-r--r--intl/icu/source/tools/toolutil/uoptions.cpp133
-rw-r--r--intl/icu/source/tools/toolutil/uoptions.h143
-rw-r--r--intl/icu/source/tools/toolutil/uparse.cpp383
-rw-r--r--intl/icu/source/tools/toolutil/uparse.h153
-rw-r--r--intl/icu/source/tools/toolutil/writesrc.cpp515
-rw-r--r--intl/icu/source/tools/toolutil/writesrc.h198
-rw-r--r--intl/icu/source/tools/toolutil/xmlparser.cpp827
-rw-r--r--intl/icu/source/tools/toolutil/xmlparser.h247
50 files changed, 16359 insertions, 0 deletions
diff --git a/intl/icu/source/tools/toolutil/BUILD.bazel b/intl/icu/source/tools/toolutil/BUILD.bazel
new file mode 100644
index 0000000000..276c857f12
--- /dev/null
+++ b/intl/icu/source/tools/toolutil/BUILD.bazel
@@ -0,0 +1,126 @@
+# © 2021 and later: Unicode, Inc. and others.
+# License & terms of use: http://www.unicode.org/copyright.html
+
+# This Bazel build file defines targets that are dependencies for building
+# the gennorm2 and genprops binaries.
+
+load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library")
+
+package(
+ default_visibility = ["//visibility:public"],
+)
+
+cc_library(
+ name = "toolutil",
+ includes = ["."],
+ hdrs = ["toolutil.h"],
+ srcs = ["toolutil.cpp"],
+ local_defines = [
+ "U_TOOLUTIL_IMPLEMENTATION",
+ ],
+ deps = ["//icu4c/source/common:platform"],
+)
+
+cc_library(
+ name = "unewdata",
+ includes = ["."],
+ hdrs = ["unewdata.h"],
+ srcs = ["unewdata.cpp"],
+ local_defines = [
+ "U_TOOLUTIL_IMPLEMENTATION",
+ ],
+ deps = [
+ ":filestrm",
+ "//icu4c/source/common:platform",
+ ],
+)
+
+cc_library(
+ name = "uoptions",
+ includes = ["."],
+ hdrs = ["uoptions.h"],
+ srcs = ["uoptions.cpp"],
+ local_defines = [
+ "U_TOOLUTIL_IMPLEMENTATION",
+ ],
+ deps = ["//icu4c/source/common:platform"],
+)
+
+cc_library(
+ name = "writesrc",
+ includes = ["."],
+ hdrs = ["writesrc.h"],
+ srcs = ["writesrc.cpp"],
+ local_defines = [
+ "U_TOOLUTIL_IMPLEMENTATION",
+ ],
+ deps = [
+ "//icu4c/source/common:bytestream",
+ "//icu4c/source/common:platform",
+ "//icu4c/source/common:uniset_core",
+ ],
+)
+
+cc_library(
+ name = "uparse",
+ includes = ["."],
+ hdrs = ["uparse.h"],
+ srcs = ["uparse.cpp"],
+ local_defines = [
+ "U_TOOLUTIL_IMPLEMENTATION",
+ ],
+ deps = [
+ ":filestrm",
+ "//icu4c/source/common:platform",
+ ],
+)
+
+cc_library(
+ name = "filestrm",
+ includes = ["."],
+ hdrs = ["filestrm.h"],
+ srcs = ["filestrm.cpp"],
+ local_defines = [
+ "U_TOOLUTIL_IMPLEMENTATION",
+ ],
+ deps = ["//icu4c/source/common:platform"],
+)
+
+cc_library(
+ name = "ppucd",
+ includes = ["."],
+ hdrs = ["ppucd.h"],
+ srcs = ["ppucd.cpp"],
+ local_defines = [
+ "U_TOOLUTIL_IMPLEMENTATION",
+ ],
+ deps = [
+ ":uparse",
+ "//icu4c/source/common:platform",
+ ],
+)
+
+cc_library(
+ name = "denseranges",
+ includes = ["."],
+ hdrs = ["denseranges.h"],
+ srcs = ["denseranges.cpp"],
+ local_defines = [
+ "U_TOOLUTIL_IMPLEMENTATION",
+ ],
+ deps = ["//icu4c/source/common:platform"],
+)
+
+cc_library(
+ name = "collationinfo",
+ includes = ["."],
+ hdrs = ["collationinfo.h"],
+ srcs = ["collationinfo.cpp"],
+ local_defines = [
+ "U_TOOLUTIL_IMPLEMENTATION",
+ ],
+ deps = [
+ "//icu4c/source/common:platform",
+ "//icu4c/source/i18n:headers",
+ ],
+)
diff --git a/intl/icu/source/tools/toolutil/Makefile.in b/intl/icu/source/tools/toolutil/Makefile.in
new file mode 100644
index 0000000000..c9fd89b0f0
--- /dev/null
+++ b/intl/icu/source/tools/toolutil/Makefile.in
@@ -0,0 +1,155 @@
+# Copyright (C) 2016 and later: Unicode, Inc. and others.
+# License & terms of use: http://www.unicode.org/copyright.html
+#******************************************************************************
+#
+# Copyright (C) 1999-2014, International Business Machines
+# Corporation and others. All Rights Reserved.
+#
+#******************************************************************************
+## Makefile.in for ICU - tools/toolutil
+## Steven R. Loomis
+
+## Source directory information
+srcdir = @srcdir@
+top_srcdir = @top_srcdir@
+
+top_builddir = ../..
+
+## All the flags and other definitions are included here.
+include $(top_builddir)/icudefs.mk
+
+## Build directory information
+subdir = tools/toolutil
+
+## Extra files to remove for 'make clean'
+CLEANFILES = *~ $(DEPS) $(IMPORT_LIB) $(MIDDLE_IMPORT_LIB) $(FINAL_IMPORT_LIB)
+
+## Target information
+
+TARGET_STUBNAME=$(TOOLUTIL_STUBNAME)
+
+ifneq ($(ENABLE_STATIC),)
+TARGET = $(LIBDIR)/$(LIBSICU)$(TARGET_STUBNAME)$(ICULIBSUFFIX).$(A)
+endif
+
+ifneq ($(ENABLE_SHARED),)
+SO_TARGET = $(LIBDIR)/$(LIBICU)$(TARGET_STUBNAME)$(ICULIBSUFFIX).$(SO)
+ALL_SO_TARGETS = $(SO_TARGET) $(MIDDLE_SO_TARGET) $(FINAL_SO_TARGET) $(SHARED_OBJECT)
+endif
+
+ALL_TARGETS = $(TARGET) $(ALL_SO_TARGETS)
+
+DYNAMICCPPFLAGS = $(SHAREDLIBCPPFLAGS)
+DYNAMICCFLAGS = $(SHAREDLIBCFLAGS)
+DYNAMICCXXFLAGS = $(SHAREDLIBCXXFLAGS)
+CFLAGS += $(LIBCFLAGS)
+CXXFLAGS += $(LIBCXXFLAGS)
+
+CPPFLAGS += -I$(srcdir) -I$(top_srcdir)/common -I$(top_srcdir)/i18n $(LIBCPPFLAGS)
+
+# from icuinfo
+CPPFLAGS+= "-DU_BUILD=\"@build@\"" "-DU_HOST=\"@host@\"" "-DU_CC=\"@CC@\"" "-DU_CXX=\"@CXX@\""
+CPPFLAGS += -DUNISTR_FROM_CHAR_EXPLICIT=explicit -DUNISTR_FROM_STRING_EXPLICIT=explicit
+
+DEFS += -DU_TOOLUTIL_IMPLEMENTATION
+LDFLAGS += $(LDFLAGSICUTOOLUTIL)
+LIBS = $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS)
+
+SOURCES = $(shell cat $(srcdir)/sources.txt)
+OBJECTS = $(SOURCES:.cpp=.o)
+
+STATIC_OBJECTS = $(OBJECTS:.o=.$(STATIC_O))
+
+DEPS = $(OBJECTS:.o=.d)
+
+-include Makefile.local
+
+## List of phony targets
+.PHONY : all all-local install install-local clean clean-local \
+distclean distclean-local install-library dist \
+dist-local check check-local
+
+## Clear suffix list
+.SUFFIXES :
+
+## List of standard targets
+all: all-local
+install: install-local
+clean: clean-local
+distclean : distclean-local
+dist: dist-local
+check: all check-local
+
+all-local: $(ALL_TARGETS)
+
+install-local: install-library
+
+install-library: all-local
+ $(MKINSTALLDIRS) $(DESTDIR)$(libdir)
+ifneq ($(ENABLE_STATIC),)
+ $(INSTALL-L) $(TARGET) $(DESTDIR)$(libdir)
+endif
+ifneq ($(ENABLE_SHARED),)
+# For MinGW, do we want the DLL to go in the bin location?
+ifeq ($(MINGW_MOVEDLLSTOBINDIR),YES)
+ $(MKINSTALLDIRS) $(DESTDIR)$(bindir)
+ $(INSTALL-L) $(FINAL_SO_TARGET) $(DESTDIR)$(bindir)
+else
+ $(INSTALL-L) $(FINAL_SO_TARGET) $(DESTDIR)$(libdir)
+ifneq ($(FINAL_SO_TARGET),$(SO_TARGET))
+ cd $(DESTDIR)$(libdir) && $(RM) $(notdir $(SO_TARGET)) && ln -s $(notdir $(FINAL_SO_TARGET)) $(notdir $(SO_TARGET))
+ifneq ($(FINAL_SO_TARGET),$(MIDDLE_SO_TARGET))
+ cd $(DESTDIR)$(libdir) && $(RM) $(notdir $(MIDDLE_SO_TARGET)) && ln -s $(notdir $(FINAL_SO_TARGET)) $(notdir $(MIDDLE_SO_TARGET))
+endif
+endif
+endif
+ifneq ($(IMPORT_LIB_EXT),)
+ $(INSTALL-L) $(FINAL_IMPORT_LIB) $(DESTDIR)$(libdir)
+ifneq ($(IMPORT_LIB),$(FINAL_IMPORT_LIB))
+ cd $(DESTDIR)$(libdir) && $(RM) $(notdir $(IMPORT_LIB)) && ln -s $(notdir $(FINAL_IMPORT_LIB)) $(notdir $(IMPORT_LIB))
+endif
+ifneq ($(MIDDLE_IMPORT_LIB),$(FINAL_IMPORT_LIB))
+ cd $(DESTDIR)$(libdir) && $(RM) $(notdir $(MIDDLE_IMPORT_LIB)) && ln -s $(notdir $(FINAL_IMPORT_LIB)) $(notdir $(MIDDLE_IMPORT_LIB))
+endif
+endif
+endif
+
+dist-local:
+
+clean-local:
+ test -z "$(CLEANFILES)" || $(RMV) $(CLEANFILES)
+ $(RMV) $(OBJECTS) $(STATIC_OBJECTS) $(ALL_TARGETS)
+
+distclean-local: clean-local
+ $(RMV) Makefile
+
+check-local: all-local
+
+Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
+ cd $(top_builddir) \
+ && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status
+
+ifneq ($(ENABLE_STATIC),)
+$(TARGET): $(STATIC_OBJECTS)
+ $(AR) $(ARFLAGS) $(AR_OUTOPT)$@ $^
+ $(RANLIB) $@
+endif
+
+ifneq ($(ENABLE_SHARED),)
+$(SHARED_OBJECT): $(OBJECTS)
+ $(SHLIB.cc) $(LD_SONAME) $(OUTOPT)$@ $^ $(LIBS)
+ifeq ($(ENABLE_RPATH),YES)
+ifneq ($(wildcard $(libdir)/$(MIDDLE_SO_TARGET)),)
+ $(warning RPATH warning: --enable-rpath means test programs may use existing $(libdir)/$(MIDDLE_SO_TARGET))
+endif
+endif
+endif
+
+ifeq (,$(MAKECMDGOALS))
+-include $(DEPS)
+else
+ifneq ($(patsubst %clean,,$(MAKECMDGOALS)),)
+-include $(DEPS)
+endif
+endif
+
diff --git a/intl/icu/source/tools/toolutil/collationinfo.cpp b/intl/icu/source/tools/toolutil/collationinfo.cpp
new file mode 100644
index 0000000000..6bad90e133
--- /dev/null
+++ b/intl/icu/source/tools/toolutil/collationinfo.cpp
@@ -0,0 +1,152 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/*
+*******************************************************************************
+* Copyright (C) 2013-2015, International Business Machines
+* Corporation and others. All Rights Reserved.
+*******************************************************************************
+* collationinfo.cpp
+*
+* created on: 2013aug05
+* created by: Markus W. Scherer
+*/
+
+#include <stdio.h>
+#include <string.h>
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_COLLATION
+
+#include "collationdata.h"
+#include "collationdatareader.h"
+#include "collationinfo.h"
+#include "uassert.h"
+#include "uvectr32.h"
+
+U_NAMESPACE_BEGIN
+
+/**
+ * Prints a size breakdown of a collation binary to stdout.
+ *
+ * The header size is printed only when sizeWithHeader exceeds the total size
+ * recorded in indexes[]. The indexes line and the final total are always
+ * printed; every other section is printed only when its size is non-zero.
+ *
+ * @param sizeWithHeader size of the whole binary, including any header
+ * @param indexes        the collation data indexes array, addressed with the
+ *                       CollationDataReader::IX_* constants
+ */
+void
+CollationInfo::printSizes(int32_t sizeWithHeader, const int32_t indexes[]) {
+    // Size of one section: the distance between two consecutive offsets in indexes[].
+    auto sectionSize = [indexes](int32_t offsetIndex) -> long {
+        return (long)getDataLength(indexes, offsetIndex);
+    };
+
+    const int32_t bodySize = indexes[CollationDataReader::IX_TOTAL_SIZE];
+    if(bodySize < sizeWithHeader) {
+        printf(" header size: %6ld\n", (long)(sizeWithHeader - bodySize));
+    }
+
+    const long indexCount = (long)indexes[CollationDataReader::IX_INDEXES_LENGTH];
+    printf(" indexes: %6ld *4 = %6ld\n", indexCount, indexCount * 4);
+
+    const long reorderCodes = sectionSize(CollationDataReader::IX_REORDER_CODES_OFFSET);
+    if(reorderCodes != 0) {
+        printf(" reorder codes: %6ld *4 = %6ld\n", reorderCodes / 4, reorderCodes);
+    }
+
+    const long reorderTable = sectionSize(CollationDataReader::IX_REORDER_TABLE_OFFSET);
+    if(reorderTable != 0) {
+        U_ASSERT(reorderTable >= 256);
+        printf(" reorder table: %6ld\n", reorderTable);
+    }
+
+    const long trie = sectionSize(CollationDataReader::IX_TRIE_OFFSET);
+    if(trie != 0) {
+        printf(" trie size: %6ld\n", trie);
+    }
+
+    const long reserved8 = sectionSize(CollationDataReader::IX_RESERVED8_OFFSET);
+    if(reserved8 != 0) {
+        printf(" reserved (offset 8): %6ld\n", reserved8);
+    }
+
+    const long ces = sectionSize(CollationDataReader::IX_CES_OFFSET);
+    if(ces != 0) {
+        printf(" CEs: %6ld *8 = %6ld\n", ces / 8, ces);
+    }
+
+    const long reserved10 = sectionSize(CollationDataReader::IX_RESERVED10_OFFSET);
+    if(reserved10 != 0) {
+        printf(" reserved (offset 10): %6ld\n", reserved10);
+    }
+
+    const long ce32s = sectionSize(CollationDataReader::IX_CE32S_OFFSET);
+    if(ce32s != 0) {
+        printf(" CE32s: %6ld *4 = %6ld\n", ce32s / 4, ce32s);
+    }
+
+    const long rootElements = sectionSize(CollationDataReader::IX_ROOT_ELEMENTS_OFFSET);
+    if(rootElements != 0) {
+        printf(" rootElements: %6ld *4 = %6ld\n", rootElements / 4, rootElements);
+    }
+
+    const long contexts = sectionSize(CollationDataReader::IX_CONTEXTS_OFFSET);
+    if(contexts != 0) {
+        printf(" contexts: %6ld *2 = %6ld\n", contexts / 2, contexts);
+    }
+
+    const long unsafeBwdSet = sectionSize(CollationDataReader::IX_UNSAFE_BWD_OFFSET);
+    if(unsafeBwdSet != 0) {
+        printf(" unsafeBwdSet: %6ld *2 = %6ld\n", unsafeBwdSet / 2, unsafeBwdSet);
+    }
+
+    const long fastLatin = sectionSize(CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET);
+    if(fastLatin != 0) {
+        printf(" fastLatin table: %6ld *2 = %6ld\n", fastLatin / 2, fastLatin);
+    }
+
+    const long scripts = sectionSize(CollationDataReader::IX_SCRIPTS_OFFSET);
+    if(scripts != 0) {
+        printf(" scripts data: %6ld *2 = %6ld\n", scripts / 2, scripts);
+    }
+
+    const long compressibleBytes = sectionSize(CollationDataReader::IX_COMPRESSIBLE_BYTES_OFFSET);
+    if(compressibleBytes != 0) {
+        U_ASSERT(compressibleBytes >= 256);
+        printf(" compressibleBytes: %6ld\n", compressibleBytes);
+    }
+
+    const long reserved18 = sectionSize(CollationDataReader::IX_RESERVED18_OFFSET);
+    if(reserved18 != 0) {
+        printf(" reserved (offset 18): %6ld\n", reserved18);
+    }
+
+    printf(" collator binary total size: %6ld\n", (long)sizeWithHeader);
+}
+
+/**
+ * Returns the difference between two consecutive entries of indexes[].
+ *
+ * @param indexes    array of offsets
+ * @param startIndex index of the entry holding the start offset; the entry
+ *                   right after it is used as the limit
+ * @return indexes[startIndex + 1] - indexes[startIndex]
+ */
+int32_t
+CollationInfo::getDataLength(const int32_t indexes[], int32_t startIndex) {
+    const int32_t sectionStart = indexes[startIndex];
+    const int32_t sectionLimit = indexes[startIndex + 1];
+    return sectionLimit - sectionStart;
+}
+
+/**
+ * Prints, to stdout, the reorder ranges that CollationData::makeReorderRanges()
+ * builds for the given reorder codes.
+ *
+ * Each element of the resulting vector is read as a packed pair: the upper
+ * 16 bits are the exclusive limit of a range and the lower 16 bits are a
+ * signed offset. A range starts where the previous one ended (the first one
+ * starts at 0). Ranges with a non-zero offset are printed together with their
+ * target range, which is the source range moved by offset * 0x100.
+ *
+ * If building the ranges fails, an error line is printed instead.
+ *
+ * @param data   collation data used to build the ranges
+ * @param codes  reorder codes, passed through to makeReorderRanges()
+ * @param length number of entries in codes
+ */
+void
+CollationInfo::printReorderRanges(const CollationData &data, const int32_t *codes, int32_t length) {
+    UErrorCode errorCode = U_ZERO_ERROR;
+    UVector32 ranges(errorCode);
+    data.makeReorderRanges(codes, length, ranges, errorCode);
+    if(U_FAILURE(errorCode)) {
+        printf(" error building reorder ranges: %s\n", u_errorName(errorCode));
+        return;
+    }
+
+    int32_t start = 0;
+    for(int32_t i = 0; i < ranges.size(); ++i) {
+        int32_t pair = ranges.elementAti(i);
+        int32_t limit = (pair >> 16) & 0xffff;
+        int16_t offset = (int16_t)pair;
+        // Multiply rather than shift: offset may be negative, and left-shifting
+        // a negative value is undefined behavior before C++20.
+        int32_t delta = offset * 0x100;
+        if(offset == 0) {
+            // [inclusive-start, exclusive-limit[
+            printf(" [%04x, %04x[\n", start, limit);
+        } else if(offset > 0) {
+            printf(" reorder [%04x, %04x[ by offset %02x to [%04x, %04x[\n",
+                    start, limit, offset,
+                    start + delta, limit + delta);
+        } else /* offset < 0 */ {
+            printf(" reorder [%04x, %04x[ by offset -%02x to [%04x, %04x[\n",
+                    start, limit, -offset,
+                    start + delta, limit + delta);
+        }
+        start = limit;
+    }
+}
+
+U_NAMESPACE_END
+
+#endif // !UCONFIG_NO_COLLATION
diff --git a/intl/icu/source/tools/toolutil/collationinfo.h b/intl/icu/source/tools/toolutil/collationinfo.h
new file mode 100644
index 0000000000..815b89d40d
--- /dev/null
+++ b/intl/icu/source/tools/toolutil/collationinfo.h
@@ -0,0 +1,42 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/*
+*******************************************************************************
+* Copyright (C) 2013-2015, International Business Machines
+* Corporation and others. All Rights Reserved.
+*******************************************************************************
+* collationinfo.h
+*
+* created on: 2013aug05
+* created by: Markus W. Scherer
+*/
+
+#ifndef __COLLATIONINFO_H__
+#define __COLLATIONINFO_H__
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_COLLATION
+
+U_NAMESPACE_BEGIN
+
+struct CollationData;
+
+/**
+ * Collation-related code for tools & demos.
+ *
+ * All members are static; the class only groups the functions and is not
+ * meant to be instantiated.
+ */
+class U_TOOLUTIL_API CollationInfo /* all static */ {
+public:
+ /**
+  * Prints a size breakdown of a collation binary to stdout.
+  * @param sizeWithHeader size of the whole binary; if it is larger than the
+  *                       total size stored in indexes[], the difference is
+  *                       reported as the header size
+  * @param indexes        collation data indexes array (CollationDataReader::IX_* slots)
+  */
+ static void printSizes(int32_t sizeWithHeader, const int32_t indexes[]);
+ /**
+  * Prints to stdout the reorder ranges that data.makeReorderRanges() builds
+  * for the given reorder codes, or an error line if that call fails.
+  * @param data   collation data used to build the ranges
+  * @param codes  reorder codes
+  * @param length number of entries in codes
+  */
+ static void printReorderRanges(const CollationData &data, const int32_t *codes, int32_t length);
+
+private:
+ // Private and not defined in collationinfo.cpp: instances cannot be created.
+ CollationInfo(); // no constructor
+
+ /** Returns indexes[startIndex + 1] - indexes[startIndex]. */
+ static int32_t getDataLength(const int32_t indexes[], int32_t startIndex);
+};
+
+U_NAMESPACE_END
+
+#endif // !UCONFIG_NO_COLLATION
+#endif // __COLLATIONINFO_H__
diff --git a/intl/icu/source/tools/toolutil/dbgutil.cpp b/intl/icu/source/tools/toolutil/dbgutil.cpp
new file mode 100644
index 0000000000..d42b267f73
--- /dev/null
+++ b/intl/icu/source/tools/toolutil/dbgutil.cpp
@@ -0,0 +1,160 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/********************************************************************
+ * COPYRIGHT:
+ * Copyright (c) 2007-2012, International Business Machines Corporation and
+ * others. All Rights Reserved.
+ ********************************************************************/
+
+#include "udbgutil.h"
+#include "dbgutil.h"
+
+#if !UCONFIG_NO_FORMATTING
+
+#include "unicode/unistr.h"
+#include "unicode/ustring.h"
+#include "util.h"
+#include "ucln.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+U_NAMESPACE_USE
+
+static UnicodeString **strs = nullptr;
+
+/**
+ * Stores the name of one enum field in fillin.
+ *
+ * @param type   enum type passed to udbg_enumName()
+ * @param field  field index passed to udbg_enumName()
+ * @param fillin receives the name; emptied when udbg_enumName() returns nullptr
+ * @return fillin
+ */
+static const UnicodeString& _fieldString(UDebugEnumType type, int32_t field, UnicodeString& fillin) {
+    const char *name = udbg_enumName(type, field);
+    if(name != nullptr) {
+        return fillin = UnicodeString(name, -1, US_INV);
+    }
+    // No name for this field: hand back an empty string.
+    return fillin.remove();
+}
+
+U_CDECL_BEGIN
+/**
+ * Frees the cached enum-name strings, if they were ever built, and resets
+ * the cache pointer so that a later lookup rebuilds it.
+ */
+static void udbg_cleanup() {
+    if(strs == nullptr) {
+        return;
+    }
+    // Rows 0..UDBG_ENUM_COUNT inclusive were allocated: one per enum type
+    // plus the trailing row that holds the empty string.
+    for(int row=0; row<=UDBG_ENUM_COUNT; ++row) {
+        delete [] strs[row];
+    }
+    delete[] strs;
+    strs = nullptr;
+}
+
+/**
+ * Cleanup callback registered with ucln_registerCleanup().
+ * @return always true
+ */
+static UBool tu_cleanup()
+{
+    udbg_cleanup();
+    return true;
+}
+
+/** Registers tu_cleanup() as the UCLN_TOOLUTIL cleanup function. */
+static void udbg_register_cleanup() {
+    ucln_registerCleanup(UCLN_TOOLUTIL, tu_cleanup);
+}
+U_CDECL_END
+
+/**
+ * Builds the cache of enum-name strings on first use; does nothing when the
+ * cache already exists. Also registers the cleanup function that frees it.
+ */
+static void udbg_setup() {
+    if(strs != nullptr) {
+        return;
+    }
+    udbg_register_cleanup();
+
+    // One row per enum type, plus a trailing row that holds only an empty string.
+    UnicodeString **table = new UnicodeString*[UDBG_ENUM_COUNT+1];
+    for(int type=0; type<UDBG_ENUM_COUNT; ++type) {
+        const UDebugEnumType enumType = (UDebugEnumType)type;
+        const int32_t fieldCount = udbg_enumCount(enumType);
+        // fieldCount+1 entries: index fieldCount is filled as well and serves
+        // as the fallback entry for out-of-range fields.
+        UnicodeString *row = new UnicodeString[fieldCount+1];
+        for(int field=0; field<=fieldCount; ++field) {
+            _fieldString(enumType, field, row[field]);
+        }
+        table[type] = row;
+    }
+    table[UDBG_ENUM_COUNT] = new UnicodeString[1]; // empty string
+
+    // Publish only after the table is completely filled.
+    strs = table;
+}
+
+
+
+/**
+ * Returns the cached name of an enum field.
+ *
+ * @param type  enum type; when it is outside 0..UDBG_ENUM_COUNT-1 the shared
+ *              empty string is returned
+ * @param field field index; when it is negative or greater than
+ *              udbg_enumCount(type), the entry at index udbg_enumCount(type)
+ *              is returned instead
+ * @return reference to a string held in the cache
+ */
+U_TOOLUTIL_API const UnicodeString& U_EXPORT2 udbg_enumString(UDebugEnumType type, int32_t field) {
+    // Builds the cache on first use; a no-op afterwards.
+    udbg_setup();
+
+    if(type<0 || type>=UDBG_ENUM_COUNT) {
+        // use UDBG_ENUM_COUNT,0 to mean an empty string
+        return strs[UDBG_ENUM_COUNT][0];
+    }
+
+    const int32_t fieldCount = udbg_enumCount(type);
+    const bool fieldInRange = (0<=field && field<=fieldCount);
+    return strs[type][fieldInRange ? field : fieldCount];
+}
+
+/**
+ * Finds the field of the given enum type whose cached name equals string.
+ *
+ * @param type   enum type to search
+ * @param string name to look for (exact match)
+ * @return index of the first matching field, or -1 if type is out of range
+ *         or no field name matches
+ */
+U_CAPI int32_t U_EXPORT2 udbg_enumByString(UDebugEnumType type, const UnicodeString& string) {
+    if(type<0||type>=UDBG_ENUM_COUNT) {
+        return -1;
+    }
+    // Make sure the string cache exists before strs[] is read directly.
+    udbg_enumString(type,0);
+    // The field count does not change during the search, so look it up once
+    // instead of on every loop iteration.
+    const int32_t count = udbg_enumCount(type);
+    for(int32_t i=0;i<count;i++) {
+        if(string == (strs[type][i])) {
+            return i;
+        }
+    }
+    return -1;
+}
+
+// from DataMap::utoi
+/**
+ * Converts a UnicodeString holding ASCII digits to an integer with atoi().
+ *
+ * The string is converted through a fixed 256-char buffer; input longer than
+ * 255 code units is truncated to its first 255 code units before parsing.
+ *
+ * @param s string to parse
+ * @return the value atoi() reports for the converted text
+ */
+U_CAPI int32_t U_EXPORT2
+udbg_stoi(const UnicodeString &s)
+{
+    char ch[256];
+    const char16_t *u = toUCharPtr(s.getBuffer());
+    int32_t len = s.length();
+    // Clamp so that both the conversion and the terminator stay inside ch[];
+    // an unbounded len would overflow the stack buffer.
+    const int32_t maxLen = (int32_t)sizeof(ch) - 1;
+    if(len > maxLen) {
+        len = maxLen;
+    }
+    u_UCharsToChars(u, ch, len);
+    ch[len] = 0; /* include terminating \0 */
+    return atoi(ch);
+}
+
+
+/**
+ * Converts a UnicodeString holding an ASCII number to a double with atof().
+ *
+ * The string is converted through a fixed 256-char buffer; input longer than
+ * 255 code units is truncated to its first 255 code units before parsing.
+ *
+ * @param s string to parse
+ * @return the value atof() reports for the converted text
+ */
+U_CAPI double U_EXPORT2
+udbg_stod(const UnicodeString &s)
+{
+    char ch[256];
+    const char16_t *u = toUCharPtr(s.getBuffer());
+    int32_t len = s.length();
+    // Clamp so that both the conversion and the terminator stay inside ch[];
+    // an unbounded len would overflow the stack buffer.
+    const int32_t maxLen = (int32_t)sizeof(ch) - 1;
+    if(len > maxLen) {
+        len = maxLen;
+    }
+    u_UCharsToChars(u, ch, len);
+    ch[len] = 0; /* include terminating \0 */
+    return atof(ch);
+}
+
+/**
+ * Writes an escaped copy of src into *dst.
+ *
+ * dst is emptied first. Each code unit of src is then appended as-is, except
+ * that code units for which ICU_Utility::isUnprintable() is true are written
+ * as "[" + ICU_Utility::escapeUnprintable() output + "]".
+ *
+ * @param src text to escape
+ * @param dst receives the result; must not be null
+ * @return dst
+ */
+U_CAPI UnicodeString *
+udbg_escape(const UnicodeString &src, UnicodeString *dst)
+{
+    dst->remove();
+    for (int32_t pos = 0; pos < src.length(); ++pos) {
+        const char16_t unit = src[pos];
+        if(!ICU_Utility::isUnprintable(unit)) {
+            *dst += unit;
+            continue;
+        }
+        // Bracket the escape sequence so it stands out in the output.
+        *dst += UnicodeString("[");
+        ICU_Utility::escapeUnprintable(*dst, unit);
+        *dst += UnicodeString("]");
+    }
+
+    return dst;
+}
+
+
+
+#endif
diff --git a/intl/icu/source/tools/toolutil/dbgutil.h b/intl/icu/source/tools/toolutil/dbgutil.h
new file mode 100644
index 0000000000..43fe2171b4
--- /dev/null
+++ b/intl/icu/source/tools/toolutil/dbgutil.h
@@ -0,0 +1,45 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+/*
+************************************************************************
+* Copyright (c) 2007-2012, International Business Machines
+* Corporation and others. All Rights Reserved.
+************************************************************************
+*/
+
+/** C++ Utilities to aid in debugging **/
+
+#ifndef _DBGUTIL_H
+#define _DBGUTIL_H
+
+#include "unicode/utypes.h"
+#include "udbgutil.h"
+#include "unicode/unistr.h"
+
+#if !UCONFIG_NO_FORMATTING
+
+/**
+ * Returns the name of an enum field as a UnicodeString, taken from a cache
+ * that is built on first use from the names reported by udbg_enumName().
+ * @param type  enum type; if it is outside 0..UDBG_ENUM_COUNT-1, an empty
+ *              string is returned
+ * @param field field index; if it is negative or greater than
+ *              udbg_enumCount(type), the entry at index udbg_enumCount(type)
+ *              is returned instead
+ * @return reference to a string held in the cache
+ */
+U_TOOLUTIL_API const icu::UnicodeString& U_EXPORT2
+udbg_enumString(UDebugEnumType type, int32_t field);
+
+/**
+ * Looks up a field of the given enum type by exact comparison with its name.
+ * @param type   enum type to search
+ * @param string field name to look for
+ * @return enum offset, or UDBG_INVALID_ENUM on error
+ *         NOTE(review): the implementation returns the literal -1; presumably
+ *         UDBG_INVALID_ENUM is -1 - confirm against udbgutil.h.
+ */
+U_CAPI int32_t U_EXPORT2
+udbg_enumByString(UDebugEnumType type, const icu::UnicodeString& string);
+
+/**
+ * Convert a UnicodeString (with ascii digits) into a number.
+ * The implementation converts through a fixed 256-char buffer and parses
+ * with atoi(), so this is intended for short strings.
+ * @param s string
+ * @return numerical value, or 0 on error
+ */
+U_CAPI int32_t U_EXPORT2 udbg_stoi(const icu::UnicodeString &s);
+
+/**
+ * Convert a UnicodeString (with an ascii number) into a double.
+ * The implementation converts through a fixed 256-char buffer and parses
+ * with atof(), so this is intended for short strings.
+ * @param s string
+ * @return the value reported by atof()
+ */
+U_CAPI double U_EXPORT2 udbg_stod(const icu::UnicodeString &s);
+
+/**
+ * Writes an escaped copy of s into *dst: dst is emptied, then each code unit
+ * of s is appended, with unprintable ones (per ICU_Utility::isUnprintable())
+ * replaced by their escape sequence enclosed in square brackets.
+ * @param s   text to escape
+ * @param dst receives the result; must not be null
+ * @return dst
+ */
+U_CAPI icu::UnicodeString * U_EXPORT2
+udbg_escape(const icu::UnicodeString &s, icu::UnicodeString *dst);
+
+#endif
+
+#endif
diff --git a/intl/icu/source/tools/toolutil/denseranges.cpp b/intl/icu/source/tools/toolutil/denseranges.cpp
new file mode 100644
index 0000000000..f5e52b1bbb
--- /dev/null
+++ b/intl/icu/source/tools/toolutil/denseranges.cpp
@@ -0,0 +1,160 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/*
+*******************************************************************************
+* Copyright (C) 2010, International Business Machines
+* Corporation and others. All Rights Reserved.
+*******************************************************************************
+* file name: denseranges.cpp
+* encoding: UTF-8
+* tab size: 8 (not used)
+* indentation:4
+*
+* created on: 2010sep25
+* created by: Markus W. Scherer
+*
+* Helper code for finding a small number of dense ranges.
+*/
+
+#include "unicode/utypes.h"
+#include "denseranges.h"
+
+// Definitions in the anonymous namespace are invisible outside this file.
+namespace {
+
+/**
+ * Collect up to 15 range gaps and keep them sorted by descending gap size
+ * (the largest gap is at index 0).
+ */
+class LargestGaps {
+public:
+ /**
+  * @param max Maximum number of gaps to keep; values above kCapacity are
+  *            reduced to kCapacity.
+  */
+ LargestGaps(int32_t max) : maxLength(max<=kCapacity ? max : kCapacity), length(0) {}
+
+ /**
+  * Offers a gap to the collection. It is stored only if it ranks among the
+  * maxLength largest gaps seen so far; when the collection is already full,
+  * the smallest stored gap is dropped to make room.
+  * @param gapStart  First value of the gap.
+  * @param gapLength Number of values in the gap.
+  */
+ void add(int32_t gapStart, int64_t gapLength) {
+ // Find the insertion point: skip back over all stored gaps that are
+ // strictly smaller than the new one.
+ int32_t i=length;
+ while(i>0 && gapLength>gapLengths[i-1]) {
+ --i;
+ }
+ if(i<maxLength) {
+ // The new gap is now one of the maxLength largest.
+ // Insert the new gap, moving up smaller ones of the previous
+ // length largest.
+ // If there is still room, grow by one and start shifting from the new
+ // last slot; otherwise start at the last slot, overwriting the smallest gap.
+ int32_t j= length<maxLength ? length++ : maxLength-1;
+ while(j>i) {
+ gapStarts[j]=gapStarts[j-1];
+ gapLengths[j]=gapLengths[j-1];
+ --j;
+ }
+ gapStarts[i]=gapStart;
+ gapLengths[i]=gapLength;
+ }
+ }
+
+ /** Keeps only the newLength largest gaps; does nothing if fewer are stored. */
+ void truncate(int32_t newLength) {
+ if(newLength<length) {
+ length=newLength;
+ }
+ }
+
+ /** Number of gaps currently stored. */
+ int32_t count() const { return length; }
+ /** Start value of the i-th largest gap; i is not range-checked. */
+ int32_t gapStart(int32_t i) const { return gapStarts[i]; }
+ /** Length of the i-th largest gap; i is not range-checked. */
+ int64_t gapLength(int32_t i) const { return gapLengths[i]; }
+
+ /**
+  * Finds the stored gap with the smallest start value that is greater than value.
+  * @return Index of that gap, or -1 if no stored gap starts after value.
+  */
+ int32_t firstAfter(int32_t value) const {
+ if(length==0) {
+ return -1;
+ }
+ int32_t minValue=0;
+ int32_t minIndex=-1;
+ for(int32_t i=0; i<length; ++i) {
+ // minValue is meaningful only once minIndex>=0.
+ if(value<gapStarts[i] && (minIndex<0 || gapStarts[i]<minValue)) {
+ minValue=gapStarts[i];
+ minIndex=i;
+ }
+ }
+ return minIndex;
+ }
+
+private:
+ // Fixed size of the storage arrays below.
+ static const int32_t kCapacity=15;
+
+ int32_t maxLength;  // maximum number of gaps to keep, at most kCapacity
+ int32_t length;  // number of gaps currently stored
+ int32_t gapStarts[kCapacity];  // start values, parallel to gapLengths[]
+ int64_t gapLengths[kCapacity];  // gap lengths, largest first
+};
+
+} // namespace
+
+/**
+ * Does it make sense to write 1..capacity ranges?
+ * Returns 0 if not, otherwise the number of ranges.
+ * @param values Sorted array of signed-integer values.
+ * @param length Number of values.
+ * @param density Minimum average range density, in 256th. (0x100=100%=perfectly dense.)
+ *                Should be 0x80..0x100, must be 1..0x100.
+ * @param ranges Output ranges array. ranges[i][0] is the first value and
+ *               ranges[i][1] the last value (inclusive) of the i-th range.
+ * @param capacity Maximum number of ranges.
+ * @return Minimum number of ranges (at most capacity) that have the desired density,
+ *         or 0 if that density cannot be achieved. Also 0 if there are at most
+ *         2 values or if capacity is less than 1.
+ */
+U_CAPI int32_t U_EXPORT2
+uprv_makeDenseRanges(const int32_t values[], int32_t length,
+                     int32_t density,
+                     int32_t ranges[][2], int32_t capacity) {
+    // Too few values to be worth a range, or no room in ranges[] to report
+    // even a single one (writing ranges[0] would be out of bounds).
+    if(length<=2 || capacity<1) {
+        return 0;
+    }
+    int32_t minValue=values[0];
+    int32_t maxValue=values[length-1];  // Assume minValue<=maxValue.
+    // Use int64_t variables for intermediate-value precision and to avoid
+    // signed-int32_t overflow of maxValue-minValue.
+    int64_t maxLength=(int64_t)maxValue-(int64_t)minValue+1;
+    if(length>=(density*maxLength)/0x100) {
+        // Use one range.
+        ranges[0][0]=minValue;
+        ranges[0][1]=maxValue;
+        return 1;
+    }
+    if(length<=4) {
+        return 0;
+    }
+    // See if we can split [minValue, maxValue] into 2..capacity ranges,
+    // divided by the 1..(capacity-1) largest gaps.
+    LargestGaps gaps(capacity-1);
+    int32_t i;
+    int32_t expectedValue=minValue;
+    for(i=1; i<length; ++i) {
+        ++expectedValue;
+        int32_t actualValue=values[i];
+        if(expectedValue!=actualValue) {
+            // Values expectedValue..actualValue-1 are missing: record the gap.
+            gaps.add(expectedValue, (int64_t)actualValue-(int64_t)expectedValue);
+            expectedValue=actualValue;
+        }
+    }
+    // We know gaps.count()>=1 because we have fewer values (length) than
+    // the length of the [minValue..maxValue] range (maxLength).
+    // (Otherwise we would have returned with the one range above.)
+    // With capacity==1 no gaps are kept, and the loop below returns 0.
+    int32_t num;
+    for(i=0, num=2;; ++i, ++num) {
+        if(i>=gaps.count()) {
+            // The values are too sparse for capacity or fewer ranges
+            // of the requested density.
+            return 0;
+        }
+        // Cutting out the next-largest gap shrinks the total covered length.
+        maxLength-=gaps.gapLength(i);
+        if(length>num*2 && length>=(density*maxLength)/0x100) {
+            break;
+        }
+    }
+    // Use the num ranges with the num-1 largest gaps.
+    gaps.truncate(num-1);
+    ranges[0][0]=minValue;
+    for(i=0; i<=num-2; ++i) {
+        // Gaps are stored by size, so pick them in ascending start order here.
+        int32_t gapIndex=gaps.firstAfter(minValue);
+        int32_t gapStart=gaps.gapStart(gapIndex);
+        ranges[i][1]=gapStart-1;
+        ranges[i+1][0]=minValue=(int32_t)(gapStart+gaps.gapLength(gapIndex));
+    }
+    ranges[num-1][1]=maxValue;
+    return num;
+}
diff --git a/intl/icu/source/tools/toolutil/denseranges.h b/intl/icu/source/tools/toolutil/denseranges.h
new file mode 100644
index 0000000000..c489ca47d8
--- /dev/null
+++ b/intl/icu/source/tools/toolutil/denseranges.h
@@ -0,0 +1,41 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/*
+*******************************************************************************
+* Copyright (C) 2010, International Business Machines
+* Corporation and others. All Rights Reserved.
+*******************************************************************************
+* file name: denseranges.h
+* encoding: UTF-8
+* tab size: 8 (not used)
+* indentation:4
+*
+* created on: 2010sep25
+* created by: Markus W. Scherer
+*
+* Helper code for finding a small number of dense ranges.
+*/
+
+#ifndef __DENSERANGES_H__
+#define __DENSERANGES_H__
+
+#include "unicode/utypes.h"
+
+/**
+ * Does it make sense to write 1..capacity ranges?
+ * Returns 0 if not, otherwise the number of ranges.
+ * A single range covering all values is preferred when it is dense enough;
+ * otherwise the values are split at their largest gaps.
+ * @param values Sorted array of signed-integer values.
+ * @param length Number of values. With 2 or fewer values the result is always 0;
+ *               splitting into several ranges is only attempted with more than 4.
+ * @param density Minimum average range density, in 256th. (0x100=100%=perfectly dense.)
+ *                Should be 0x80..0x100, must be 1..0x100.
+ * @param ranges Output ranges array. ranges[i][0] is the first value and
+ *               ranges[i][1] the last value (inclusive) of the i-th range.
+ * @param capacity Maximum number of ranges.
+ * @return Minimum number of ranges (at most capacity) that have the desired density,
+ *         or 0 if that density cannot be achieved.
+ */
+U_CAPI int32_t U_EXPORT2
+uprv_makeDenseRanges(const int32_t values[], int32_t length,
+ int32_t density,
+ int32_t ranges[][2], int32_t capacity);
+
+#endif // __DENSERANGES_H__
diff --git a/intl/icu/source/tools/toolutil/filestrm.cpp b/intl/icu/source/tools/toolutil/filestrm.cpp
new file mode 100644
index 0000000000..9a2695197a
--- /dev/null
+++ b/intl/icu/source/tools/toolutil/filestrm.cpp
@@ -0,0 +1,227 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/*
+******************************************************************************
+*
+* Copyright (C) 1997-2011, International Business Machines
+* Corporation and others. All Rights Reserved.
+*
+******************************************************************************
+*
+* File FILESTRM.C
+*
+* @author Glenn Marcy
+*
+* Modification History:
+*
+* Date Name Description
+* 5/8/98 gm Created
+* 03/02/99 stephen Reordered params in ungetc to match stdio
+* Added wopen
+* 3/29/99 helena Merged Stephen and Bertrand's changes.
+*
+******************************************************************************
+*/
+
+#include "filestrm.h"
+
+#include "cmemory.h"
+
+#include <stdio.h>
+
+/**
+ * Opens a file with fopen() and returns it as an opaque FileStream.
+ * Returns nullptr without calling fopen() when filename or mode is
+ * nullptr or empty; otherwise returns whatever fopen() returned.
+ */
+U_CAPI FileStream* U_EXPORT2
+T_FileStream_open(const char* filename, const char* mode)
+{
+    if(filename == nullptr || *filename == 0) {
+        return nullptr;
+    }
+    if(mode == nullptr || *mode == 0) {
+        return nullptr;
+    }
+    return reinterpret_cast<FileStream*>(fopen(filename, mode));
+}
+
+/*
+U_CAPI FileStream* U_EXPORT2
+T_FileStream_wopen(const wchar_t* filename, const wchar_t* mode)
+{
+ // TBD: _wfopen is believed to be MS-specific?
+#if U_PLATFORM_USES_ONLY_WIN32_API
+ FILE* result = _wfopen(filename, mode);
+ return (FileStream*)result;
+#else
+ size_t fnMbsSize, mdMbsSize;
+ char *fn, *md;
+ FILE *result;
+
+ // convert from wchar_t to char
+ fnMbsSize = wcstombs(nullptr, filename, ((size_t)-1) >> 1);
+ fn = (char*)uprv_malloc(fnMbsSize+2);
+ wcstombs(fn, filename, fnMbsSize);
+ fn[fnMbsSize] = 0;
+
+ mdMbsSize = wcstombs(nullptr, mode, ((size_t)-1) >> 1);
+ md = (char*)uprv_malloc(mdMbsSize+2);
+ wcstombs(md, mode, mdMbsSize);
+ md[mdMbsSize] = 0;
+
+ result = fopen(fn, md);
+ uprv_free(fn);
+ uprv_free(md);
+ return (FileStream*)result;
+#endif
+}
+*/
+/** Closes the stream with fclose(); a null stream is ignored. */
+U_CAPI void U_EXPORT2
+T_FileStream_close(FileStream* fileStream)
+{
+    if (fileStream == nullptr) {
+        return;
+    }
+    fclose(reinterpret_cast<FILE*>(fileStream));
+}
+
+/**
+ * Reports whether the named file can be opened for reading.
+ * The file is opened with fopen(filename, "r") and closed again immediately.
+ * @param filename Path to probe; nullptr or an empty string yields false.
+ * @return true if the file could be opened, false otherwise.
+ */
+U_CAPI UBool U_EXPORT2
+T_FileStream_file_exists(const char* filename)
+{
+    /* Same argument guard as T_FileStream_open(): never hand nullptr to fopen(). */
+    if (filename == nullptr || *filename == 0) {
+        return false;
+    }
+
+    FILE* temp = fopen(filename, "r");
+    if (temp == nullptr) {
+        return false;
+    }
+    fclose(temp);
+    return true;
+}
+
+/*static const int32_t kEOF;
+const int32_t FileStream::kEOF = EOF;*/
+
+/*
+U_CAPI FileStream*
+T_FileStream_tmpfile()
+{
+ FILE* file = tmpfile();
+ return (FileStream*)file;
+}
+*/
+
+/**
+ * Reads up to len bytes from the stream into addr using fread().
+ * @param fileStream Stream to read from.
+ * @param addr       Destination buffer of at least len bytes.
+ * @param len        Maximum number of bytes to read.
+ * @return Number of bytes actually read; 0 when fileStream or addr is
+ *         nullptr or len is not positive.
+ */
+U_CAPI int32_t U_EXPORT2
+T_FileStream_read(FileStream* fileStream, void* addr, int32_t len)
+{
+    /* A negative len would turn into a huge size_t count inside fread(). */
+    if (fileStream == nullptr || addr == nullptr || len <= 0) {
+        return 0;
+    }
+    return static_cast<int32_t>(fread(addr, 1, static_cast<size_t>(len), (FILE*)fileStream));
+}
+
+/**
+ * Writes len bytes from addr to the stream using fwrite().
+ * @param fileStream Stream to write to.
+ * @param addr       Source buffer holding at least len bytes.
+ * @param len        Number of bytes to write.
+ * @return Number of bytes actually written; 0 when fileStream or addr is
+ *         nullptr or len is not positive.
+ */
+U_CAPI int32_t U_EXPORT2
+T_FileStream_write(FileStream* fileStream, const void* addr, int32_t len)
+{
+    /* A negative len would turn into a huge size_t count inside fwrite(). */
+    if (fileStream == nullptr || addr == nullptr || len <= 0) {
+        return 0;
+    }
+    return static_cast<int32_t>(fwrite(addr, 1, static_cast<size_t>(len), (FILE*)fileStream));
+}
+
+/** Calls rewind() on the underlying FILE. */
+U_CAPI void U_EXPORT2
+T_FileStream_rewind(FileStream* fileStream)
+{
+    FILE* stream = reinterpret_cast<FILE*>(fileStream);
+    rewind(stream);
+}
+
+/** Writes one character with fputc() and returns fputc()'s result. */
+U_CAPI int32_t U_EXPORT2
+T_FileStream_putc(FileStream* fileStream, int32_t ch)
+{
+    FILE* stream = reinterpret_cast<FILE*>(fileStream);
+    return fputc(ch, stream);
+}
+
+/** Reads one character with fgetc() and returns fgetc()'s result. */
+U_CAPI int U_EXPORT2
+T_FileStream_getc(FileStream* fileStream)
+{
+    FILE* stream = reinterpret_cast<FILE*>(fileStream);
+    return fgetc(stream);
+}
+
+/**
+ * Pushes ch back onto the stream with ungetc() and returns ungetc()'s result.
+ * The parameter order (character first) matches stdio.
+ */
+U_CAPI int32_t U_EXPORT2
+T_FileStream_ungetc(int32_t ch, FileStream* fileStream)
+{
+    FILE* stream = reinterpret_cast<FILE*>(fileStream);
+    return ungetc(ch, stream);
+}
+
+/**
+ * Looks at the next character without consuming it: reads it with fgetc()
+ * and immediately pushes it back with ungetc().
+ * Returns the result of that ungetc() call.
+ */
+U_CAPI int32_t U_EXPORT2
+T_FileStream_peek(FileStream* fileStream)
+{
+    FILE* stream = reinterpret_cast<FILE*>(fileStream);
+    const int32_t next = fgetc(stream);
+    return ungetc(next, stream);
+}
+
+/**
+ * Reads one line into buffer (capacity length) with fgets().
+ * Returns fgets()'s result.
+ */
+U_CAPI char* U_EXPORT2
+T_FileStream_readLine(FileStream* fileStream, char* buffer, int32_t length)
+{
+    FILE* stream = reinterpret_cast<FILE*>(fileStream);
+    return fgets(buffer, length, stream);
+}
+
+/** Writes the NUL-terminated buffer with fputs() and returns fputs()'s result. */
+U_CAPI int32_t U_EXPORT2
+T_FileStream_writeLine(FileStream* fileStream, const char* buffer)
+{
+    FILE* stream = reinterpret_cast<FILE*>(fileStream);
+    return fputs(buffer, stream);
+}
+
+/**
+ * Returns the size of the stream's file as reported by ftell() after
+ * seeking to the end. The previous stream position is restored afterwards.
+ * @return The end-of-file offset converted to int32_t (whatever ftell()
+ *         reported, including -1 on failure).
+ */
+U_CAPI int32_t U_EXPORT2
+T_FileStream_size(FileStream* fileStream)
+{
+    FILE* stream = (FILE*)fileStream;
+
+    /* Keep the saved position in ftell()'s own type so it is not truncated. */
+    long savedPos = ftell(stream);
+    int32_t size = 0;
+
+    /*Changes by Bertrand A. D. doesn't affect the current position
+    goes to the end of the file before ftell*/
+    fseek(stream, 0, SEEK_END);
+    size = (int32_t)ftell(stream);
+
+    /* Only seek back if the original position was actually obtained. */
+    if (savedPos >= 0) {
+        fseek(stream, savedPos, SEEK_SET);
+    }
+    return size;
+}
+
+/** Returns feof() for the underlying FILE. */
+U_CAPI int U_EXPORT2
+T_FileStream_eof(FileStream* fileStream)
+{
+    FILE* stream = reinterpret_cast<FILE*>(fileStream);
+    return feof(stream);
+}
+
+/*
+ Returns 1 if the stream is null or its error indicator is set, 0 otherwise.
+
+ Warning
+ This function may not work consistently on all platforms
+ (e.g. HP-UX, FreeBSD and MacOSX don't return an error when
+ putc is used on a file opened as readonly)
+*/
+U_CAPI int U_EXPORT2
+T_FileStream_error(FileStream* fileStream)
+{
+    if (fileStream == nullptr) {
+        return 1;
+    }
+    return ferror(reinterpret_cast<FILE*>(fileStream)) != 0;
+}
+
+/* This function doesn't work. */
+/* force the stream to set its error flag*/
+/*U_CAPI void U_EXPORT2
+T_FileStream_setError(FileStream* fileStream)
+{
+ fseek((FILE*)fileStream, 99999, SEEK_SET);
+}
+*/
+
+/** Returns the process's stdin as a FileStream. */
+U_CAPI FileStream* U_EXPORT2
+T_FileStream_stdin()
+{
+    FILE* const in = stdin;
+    return reinterpret_cast<FileStream*>(in);
+}
+
+/** Returns the process's stdout as a FileStream. */
+U_CAPI FileStream* U_EXPORT2
+T_FileStream_stdout()
+{
+    FILE* const out = stdout;
+    return reinterpret_cast<FileStream*>(out);
+}
+
+
+/** Returns the process's stderr as a FileStream. */
+U_CAPI FileStream* U_EXPORT2
+T_FileStream_stderr()
+{
+    FILE* const err = stderr;
+    return reinterpret_cast<FileStream*>(err);
+}
+
+/** Deletes the named file with remove(); returns true when remove() reports 0. */
+U_CAPI UBool U_EXPORT2
+T_FileStream_remove(const char* fileName){
+    const int rc = remove(fileName);
+    return rc == 0;
+}
diff --git a/intl/icu/source/tools/toolutil/filestrm.h b/intl/icu/source/tools/toolutil/filestrm.h
new file mode 100644
index 0000000000..86fac3063f
--- /dev/null
+++ b/intl/icu/source/tools/toolutil/filestrm.h
@@ -0,0 +1,106 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/*
+******************************************************************************
+*
+* Copyright (C) 1997-2005, International Business Machines
+* Corporation and others. All Rights Reserved.
+*
+******************************************************************************
+*
+* File FILESTRM.H
+*
+* Contains FileStream interface
+*
+* @author Glenn Marcy
+*
+* Modification History:
+*
+* Date Name Description
+* 5/8/98 gm Created.
+* 03/02/99 stephen Reordered params in ungetc to match stdio
+* Added wopen
+*
+******************************************************************************
+*/
+
+#ifndef FILESTRM_H
+#define FILESTRM_H
+
+#include "unicode/utypes.h"
+
+/** Opaque stream handle; filestrm.cpp casts it to and from a stdio FILE*. */
+typedef struct _FileStream FileStream;
+
+/**
+ * Opens filename with fopen(filename, mode).
+ * @return The stream, or nullptr if either argument is null/empty or fopen() fails.
+ */
+U_CAPI FileStream* U_EXPORT2
+T_FileStream_open(const char* filename, const char* mode);
+
+/*
+U_CAPI FileStream* U_EXPORT2
+T_FileStream_wopen(const wchar_t* filename, const wchar_t* mode);
+*/
+/** Closes the stream with fclose(); a null stream is ignored. */
+U_CAPI void U_EXPORT2
+T_FileStream_close(FileStream* fileStream);
+
+/** Returns true if filename can be opened for reading ("r"). */
+U_CAPI UBool U_EXPORT2
+T_FileStream_file_exists(const char* filename);
+
+/*
+U_CAPI FileStream* U_EXPORT2
+T_FileStream_tmpfile(void);
+*/
+
+/**
+ * Reads up to len bytes into addr via fread().
+ * @return Number of bytes actually read.
+ */
+U_CAPI int32_t U_EXPORT2
+T_FileStream_read(FileStream* fileStream, void* addr, int32_t len);
+
+/**
+ * Writes len bytes from addr via fwrite().
+ * @return Number of bytes actually written.
+ */
+U_CAPI int32_t U_EXPORT2
+T_FileStream_write(FileStream* fileStream, const void* addr, int32_t len);
+
+/** Calls rewind() on the stream. */
+U_CAPI void U_EXPORT2
+T_FileStream_rewind(FileStream* fileStream);
+
+/*Added by Bertrand A. D. */
+/** Reads one line into buffer (capacity length) via fgets(); returns fgets()'s result. */
+U_CAPI char * U_EXPORT2
+T_FileStream_readLine(FileStream* fileStream, char* buffer, int32_t length);
+
+/** Writes the NUL-terminated buffer via fputs(); returns fputs()'s result. */
+U_CAPI int32_t U_EXPORT2
+T_FileStream_writeLine(FileStream* fileStream, const char* buffer);
+
+/** Writes one character via fputc(); returns fputc()'s result. */
+U_CAPI int32_t U_EXPORT2
+T_FileStream_putc(FileStream* fileStream, int32_t ch);
+
+/** Reads one character via fgetc(); returns fgetc()'s result. */
+U_CAPI int U_EXPORT2
+T_FileStream_getc(FileStream* fileStream);
+
+/** Pushes ch back via ungetc(); the parameter order matches stdio. */
+U_CAPI int32_t U_EXPORT2
+T_FileStream_ungetc(int32_t ch, FileStream *fileStream);
+
+/** Reads the next character via fgetc() and pushes it straight back via ungetc(). */
+U_CAPI int32_t U_EXPORT2
+T_FileStream_peek(FileStream* fileStream);
+
+/**
+ * Returns the end-of-file offset reported by ftell() after seeking to the end,
+ * then seeks back to the previous position.
+ */
+U_CAPI int32_t U_EXPORT2
+T_FileStream_size(FileStream* fileStream);
+
+/** Returns feof() for the stream. */
+U_CAPI int U_EXPORT2
+T_FileStream_eof(FileStream* fileStream);
+
+/** Returns nonzero if the stream is null or ferror() reports an error. */
+U_CAPI int U_EXPORT2
+T_FileStream_error(FileStream* fileStream);
+
+/*
+U_CAPI void U_EXPORT2
+T_FileStream_setError(FileStream* fileStream);
+*/
+
+/** Returns stdin as a FileStream. */
+U_CAPI FileStream* U_EXPORT2
+T_FileStream_stdin(void);
+
+/** Returns stdout as a FileStream. */
+U_CAPI FileStream* U_EXPORT2
+T_FileStream_stdout(void);
+
+/** Returns stderr as a FileStream. */
+U_CAPI FileStream* U_EXPORT2
+T_FileStream_stderr(void);
+
+/** Deletes fileName via remove(); returns true on success. */
+U_CAPI UBool U_EXPORT2
+T_FileStream_remove(const char* fileName);
+
+#endif /* FILESTRM_H */
diff --git a/intl/icu/source/tools/toolutil/filetools.cpp b/intl/icu/source/tools/toolutil/filetools.cpp
new file mode 100644
index 0000000000..994d8e31f0
--- /dev/null
+++ b/intl/icu/source/tools/toolutil/filetools.cpp
@@ -0,0 +1,140 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/******************************************************************************
+ * Copyright (C) 2009-2013, International Business Machines
+ * Corporation and others. All Rights Reserved.
+ *******************************************************************************
+ */
+
+#include "unicode/platform.h"
+#if U_PLATFORM == U_PF_MINGW
+// *cough* - for struct stat
+#ifdef __STRICT_ANSI__
+#undef __STRICT_ANSI__
+#endif
+#endif
+
+#include "filetools.h"
+#include "filestrm.h"
+#include "charstr.h"
+#include "cstring.h"
+#include "unicode/putil.h"
+#include "putilimp.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <time.h>
+#include <string.h>
+
+#if U_HAVE_DIRENT_H
+#include <dirent.h>
+typedef struct dirent DIRENT;
+
+#define SKIP1 "."
+#define SKIP2 ".."
+#endif
+
+static int32_t whichFileModTimeIsLater(const char *file1, const char *file2);
+
+/*
+ * Goes through the given directory recursively to compare each file's modification time with that of the file given.
+ * Also can be given just one file to check against. Default value for isDir is false.
+ *
+ * @param filePath     File whose modification time is being tested.
+ * @param checkAgainst File (isDir false) or directory (isDir true) to compare against.
+ * @param isDir        Whether checkAgainst is a directory to be walked recursively.
+ * @return true if no compared file is newer than filePath; false if one is newer,
+ *         if an argument is nullptr, if checkAgainst cannot be opened or does not
+ *         exist, or if a modification time cannot be read.
+ */
+U_CAPI UBool U_EXPORT2
+isFileModTimeLater(const char *filePath, const char *checkAgainst, UBool isDir) {
+    UBool isLatest = true;
+
+    if (filePath == nullptr || checkAgainst == nullptr) {
+        return false;
+    }
+
+    if (isDir == true) {
+#if U_HAVE_DIRENT_H
+        DIR *pDir = nullptr;
+        if ((pDir= opendir(checkAgainst)) != nullptr) {
+            DIR *subDirp = nullptr;
+            DIRENT *dirEntry = nullptr;
+
+            while ((dirEntry = readdir(pDir)) != nullptr) {
+                if (uprv_strcmp(dirEntry->d_name, SKIP1) != 0 && uprv_strcmp(dirEntry->d_name, SKIP2) != 0) {
+                    UErrorCode status = U_ZERO_ERROR;
+                    icu::CharString newpath(checkAgainst, -1, status);
+                    newpath.append(U_FILE_SEP_STRING, -1, status);
+                    newpath.append(dirEntry->d_name, -1, status);
+                    if (U_FAILURE(status)) {
+                        fprintf(stderr, "%s:%d: %s\n", __FILE__, __LINE__, u_errorName(status));
+                        /* Leave through the common exit so that pDir is closed. */
+                        isLatest = false;
+                        break;
+                    }
+
+                    if ((subDirp = opendir(newpath.data())) != nullptr) {
+                        /* If this new path is a directory, make a recursive call with the newpath. */
+                        closedir(subDirp);
+                        isLatest = isFileModTimeLater(filePath, newpath.data(), isDir);
+                        if (!isLatest) {
+                            break;
+                        }
+                    } else {
+                        /* -1 means the times could not be read, 2 means newpath is newer. */
+                        int32_t latest = whichFileModTimeIsLater(filePath, newpath.data());
+                        if (latest < 0 || latest == 2) {
+                            isLatest = false;
+                            break;
+                        }
+                    }
+
+                }
+            }
+            closedir(pDir);
+        } else {
+            fprintf(stderr, "Unable to open directory: %s\n", checkAgainst);
+            return false;
+        }
+#endif
+    } else {
+        if (T_FileStream_file_exists(checkAgainst)) {
+            int32_t latest = whichFileModTimeIsLater(filePath, checkAgainst);
+            if (latest < 0 || latest == 2) {
+                isLatest = false;
+            }
+        } else {
+            isLatest = false;
+        }
+    }
+
+    return isLatest;
+}
+
+/*
+ * Compares the modification times of both files.
+ * Returns 1 if file1 is later, 2 if file2 is later, 0 if they are equal,
+ * and -1 if an error occurs while reading either file's status.
+ */
+static int32_t whichFileModTimeIsLater(const char *file1, const char *file2) {
+    struct stat info1, info2;
+
+    if (stat(file1, &info1) != 0 || stat(file2, &info2) != 0) {
+        fprintf(stderr, "Unable to get stats from file: %s or %s\n", file1, file2);
+        return -1;
+    }
+
+    const double delta = difftime(info1.st_mtime, info2.st_mtime);
+    if (delta > 0.0) {
+        return 1;
+    }
+    if (delta < 0.0) {
+        return 2;
+    }
+    return 0;
+}
+
+/* Replaces every occurrence of the old file separator character in the file path with the new one, in place. */
+U_CAPI void U_EXPORT2
+swapFileSepChar(char *filePath, const char oldFileSepChar, const char newFileSepChar) {
+    const int32_t length = static_cast<int32_t>(uprv_strlen(filePath));
+    char *const end = filePath + length;
+
+    for (char *p = filePath; p < end; ++p) {
+        if (*p == oldFileSepChar) {
+            *p = newFileSepChar;
+        }
+    }
+}
diff --git a/intl/icu/source/tools/toolutil/filetools.h b/intl/icu/source/tools/toolutil/filetools.h
new file mode 100644
index 0000000000..40a606a7d4
--- /dev/null
+++ b/intl/icu/source/tools/toolutil/filetools.h
@@ -0,0 +1,34 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/*
+*******************************************************************************
+*
+* Copyright (C) 2009, International Business Machines
+* Corporation and others. All Rights Reserved.
+*
+*******************************************************************************
+* file name: filetools.h
+* encoding: UTF-8
+* tab size: 8 (not used)
+* indentation:4
+*
+* created on: 2009jan09
+* created by: Michael Ow
+*
+* Contains various functions to handle files.
+* Not suitable for production use. Not supported.
+* Not conformant. Not efficient.
+*/
+
+#ifndef __FILETOOLS_H__
+#define __FILETOOLS_H__
+
+#include "unicode/utypes.h"
+
+/**
+ * Checks that nothing in checkAgainst is newer than filePath.
+ * @param filePath     File whose modification time is being tested.
+ * @param checkAgainst File to compare with, or (isDir true) a directory walked recursively.
+ * @param isDir        Whether checkAgainst is a directory.
+ * @return true if no compared file has a later modification time than filePath;
+ *         false otherwise, including on errors and nullptr arguments.
+ */
+U_CAPI UBool U_EXPORT2
+isFileModTimeLater(const char *filePath, const char *checkAgainst, UBool isDir=false);
+
+/** Replaces, in place, every oldFileSepChar in filePath with newFileSepChar. */
+U_CAPI void U_EXPORT2
+swapFileSepChar(char *filePath, const char oldFileSepChar, const char newFileSepChar);
+
+#endif
diff --git a/intl/icu/source/tools/toolutil/flagparser.cpp b/intl/icu/source/tools/toolutil/flagparser.cpp
new file mode 100644
index 0000000000..8bbceb4f73
--- /dev/null
+++ b/intl/icu/source/tools/toolutil/flagparser.cpp
@@ -0,0 +1,180 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/******************************************************************************
+ * Copyright (C) 2009-2015, International Business Machines
+ * Corporation and others. All Rights Reserved.
+ *******************************************************************************
+ */
+
+#include "flagparser.h"
+#include "filestrm.h"
+#include "cstring.h"
+#include "cmemory.h"
+
+#define DEFAULT_BUFFER_SIZE 512
+
+static int32_t currentBufferSize = DEFAULT_BUFFER_SIZE;
+
+static int32_t extractFlag(char* buffer, int32_t bufferSize, char* flag, int32_t flagSize, const char ** flagNames, int32_t numOfFlags, UErrorCode *status);
+static int32_t getFlagOffset(const char *buffer, int32_t bufferSize);
+
+/*
+ * Opens the given fileName and reads in the information storing the data in flagBuffer.
+ *
+ * Lines whose first character is '#' are skipped. For every other line extractFlag()
+ * pulls out the text after the '='. When flagNames is non-null the value is copied to
+ * flagBuffer[idx], idx being the flag-name index returned by extractFlag(), and lines
+ * without a matching name are ignored. When flagNames is null the values are stored
+ * sequentially in flagBuffer[0], flagBuffer[1], ... up to numOfFlags entries.
+ *
+ * Returns 0 on success, -1 on failure, or the current line-buffer size when
+ * extractFlag() reports U_BUFFER_OVERFLOW_ERROR (value longer than flagBufferSize).
+ */
+U_CAPI int32_t U_EXPORT2
+parseFlagsFile(const char *fileName, char **flagBuffer, int32_t flagBufferSize, const char ** flagNames, int32_t numOfFlags, UErrorCode *status) {
+ char* buffer = nullptr;
+ char* tmpFlagBuffer = nullptr;
+ UBool allocateMoreSpace = false;
+ int32_t idx, i;
+ int32_t result = 0;
+
+ FileStream *f = T_FileStream_open(fileName, "r");
+ if (f == nullptr) {
+ *status = U_FILE_ACCESS_ERROR;
+ goto parseFlagsFile_cleanup;
+ }
+
+ /* buffer holds one input line; tmpFlagBuffer holds one extracted value. */
+ buffer = (char *)uprv_malloc(sizeof(char) * currentBufferSize);
+ tmpFlagBuffer = (char *)uprv_malloc(sizeof(char) * flagBufferSize);
+
+ if (buffer == nullptr || tmpFlagBuffer == nullptr) {
+ *status = U_MEMORY_ALLOCATION_ERROR;
+ goto parseFlagsFile_cleanup;
+ }
+
+ /* Each pass parses the file from the start; a pass is repeated with a doubled line buffer if a line did not fit. */
+ do {
+ if (allocateMoreSpace) {
+ allocateMoreSpace = false;
+ currentBufferSize *= 2;
+ uprv_free(buffer);
+ buffer = (char *)uprv_malloc(sizeof(char) * currentBufferSize);
+ if (buffer == nullptr) {
+ *status = U_MEMORY_ALLOCATION_ERROR;
+ goto parseFlagsFile_cleanup;
+ }
+ }
+ /* i only advances when flagNames is null; otherwise the loop runs until end of file or an error. */
+ for (i = 0; i < numOfFlags;) {
+ if (T_FileStream_readLine(f, buffer, currentBufferSize) == nullptr) {
+ /* End of file reached. */
+ break;
+ }
+ if (buffer[0] == '#') {
+ continue;
+ }
+
+ /* A completely filled buffer that does not end in '\n' means the line was cut short. */
+ if ((int32_t)uprv_strlen(buffer) == (currentBufferSize - 1) && buffer[currentBufferSize-2] != '\n') {
+ /* Allocate more space for buffer if it did not read the entire line */
+ allocateMoreSpace = true;
+ T_FileStream_rewind(f);
+ break;
+ } else {
+ idx = extractFlag(buffer, currentBufferSize, tmpFlagBuffer, flagBufferSize, flagNames, numOfFlags, status);
+ if (U_FAILURE(*status)) {
+ if (*status == U_BUFFER_OVERFLOW_ERROR) {
+ result = currentBufferSize;
+ } else {
+ result = -1;
+ }
+ break;
+ } else {
+ if (flagNames != nullptr) {
+ if (idx >= 0) {
+ uprv_strcpy(flagBuffer[idx], tmpFlagBuffer);
+ } else {
+ /* No match found. Skip it. */
+ continue;
+ }
+ } else {
+ uprv_strcpy(flagBuffer[i++], tmpFlagBuffer);
+ }
+ }
+ }
+ }
+ } while (allocateMoreSpace && U_SUCCESS(*status));
+
+parseFlagsFile_cleanup:
+ /* Reached on every path; the pointers and f may still be null here. */
+ uprv_free(tmpFlagBuffer);
+ uprv_free(buffer);
+
+ T_FileStream_close(f);
+
+ if (U_FAILURE(*status) && *status != U_BUFFER_OVERFLOW_ERROR) {
+ return -1;
+ }
+
+ /* The grown line-buffer size is kept in the file-level static and only reset after a fully successful parse. */
+ if (U_SUCCESS(*status) && result == 0) {
+ currentBufferSize = DEFAULT_BUFFER_SIZE;
+ }
+
+ return result;
+}
+
+
+/*
+ * Extract the setting after the '=' and store it in flag excluding the newline character.
+ *
+ * buffer/bufferSize: the NUL-terminated input line and the capacity of its buffer.
+ * flag/flagSize:     output buffer for the setting and its capacity.
+ * flagNames:         optional array of numOfFlags names to match the text before '=' against.
+ * status:            set to U_BUFFER_OVERFLOW_ERROR if the setting does not fit into flag.
+ *
+ * Returns the index of the first entry in flagNames whose leading characters equal the
+ * text before the '=', or -1 if flagNames is null, nothing matches, or on overflow.
+ * The last character of the line is always dropped (it is expected to be the newline).
+ */
+static int32_t extractFlag(char* buffer, int32_t bufferSize, char* flag, int32_t flagSize, const char **flagNames, int32_t numOfFlags, UErrorCode *status) {
+    int32_t i, idx = -1;
+    char *pBuffer;
+    int32_t offset=0;
+    UBool bufferWritten = false;
+
+    if (buffer[0] != 0) {
+        /* Get the offset (i.e. position after the '=') */
+        offset = getFlagOffset(buffer, bufferSize);
+        pBuffer = buffer+offset;
+        for(i = 0;;i++) {
+            if (i >= flagSize) {
+                *status = U_BUFFER_OVERFLOW_ERROR;
+                return -1;
+            }
+            /*
+             * Stop on the last character of the line (normally the newline).
+             * Also stop if nothing at all follows the '=': looking at pBuffer[i+1]
+             * in that case would read past the string terminator.
+             */
+            if (pBuffer[i] == 0 || pBuffer[i+1] == 0) {
+                flag[i] = 0;
+                break;
+            }
+
+            flag[i] = pBuffer[i];
+            if (i == 0) {
+                bufferWritten = true;
+            }
+        }
+    }
+
+    if (!bufferWritten) {
+        flag[0] = 0;
+    }
+
+    if (flagNames != nullptr && offset>0) {
+        offset--; /* Move offset back 1 because of '='*/
+        /* NOTE(review): only the first offset characters are compared, so a line name that is a prefix of a flag name also matches. */
+        for (i = 0; i < numOfFlags; i++) {
+            if (uprv_strncmp(buffer, flagNames[i], offset) == 0) {
+                idx = i;
+                break;
+            }
+        }
+    }
+
+    return idx;
+}
+
+/*
+ * Get the position after the '=' character.
+ *
+ * Only the NUL-terminated string at the start of buffer is searched (at most
+ * bufferSize bytes), so bytes left behind the terminator by an earlier, longer
+ * line are never looked at.
+ * Returns the index just past the first '=', or 0 if there is no '=' or if the
+ * '=' is the very last byte of the buffer.
+ */
+static int32_t getFlagOffset(const char *buffer, int32_t bufferSize) {
+    for (int32_t offset = 0; offset < bufferSize && buffer[offset] != 0; offset++) {
+        if (buffer[offset] == '=') {
+            /* Nothing can follow an '=' that sits in the final byte of the buffer. */
+            return (offset + 1 < bufferSize) ? offset + 1 : 0;
+        }
+    }
+
+    return 0;
+}
diff --git a/intl/icu/source/tools/toolutil/flagparser.h b/intl/icu/source/tools/toolutil/flagparser.h
new file mode 100644
index 0000000000..aa42547164
--- /dev/null
+++ b/intl/icu/source/tools/toolutil/flagparser.h
@@ -0,0 +1,32 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/*
+*******************************************************************************
+*
+* Copyright (C) 2009-2011, International Business Machines
+* Corporation and others. All Rights Reserved.
+*
+*******************************************************************************
+* file name: flagparser.h
+* encoding: UTF-8
+* tab size: 8 (not used)
+* indentation:4
+*
+* created on: 2009jan08
+* created by: Michael Ow
+*
+* Tiny flag file parser using ICU and intended for use in ICU tests and in build tools.
+* Not suitable for production use. Not supported.
+* Not conformant. Not efficient.
+* But very small.
+*/
+
+#ifndef __FLAGPARSER_H__
+#define __FLAGPARSER_H__
+
+#include "unicode/utypes.h"
+
+/**
+ * Reads "NAME=value" lines from fileName and copies the values into flagBuffer.
+ * Lines starting with '#' are skipped.
+ * @param fileName       File to read.
+ * @param flagBuffer     Array of output strings, each with capacity flagBufferSize.
+ * @param flagBufferSize Capacity of each output string.
+ * @param flagNames      If non-null, names selecting the flagBuffer slot for each line;
+ *                       if null, values are stored in file order.
+ * @param numOfFlags     Number of entries in flagNames / flagBuffer.
+ * @param status         Receives the error code on failure.
+ * @return 0 on success, -1 on failure, or a positive size when status is
+ *         U_BUFFER_OVERFLOW_ERROR.
+ */
+U_CAPI int32_t U_EXPORT2
+parseFlagsFile(const char *fileName, char **flagBuffer, int32_t flagBufferSize, const char ** flagNames, int32_t numOfFlags, UErrorCode *status);
+
+#endif
diff --git a/intl/icu/source/tools/toolutil/package.cpp b/intl/icu/source/tools/toolutil/package.cpp
new file mode 100644
index 0000000000..3098f5d57d
--- /dev/null
+++ b/intl/icu/source/tools/toolutil/package.cpp
@@ -0,0 +1,1311 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/*
+*******************************************************************************
+*
+* Copyright (C) 1999-2015, International Business Machines
+* Corporation and others. All Rights Reserved.
+*
+*******************************************************************************
+* file name: package.cpp
+* encoding: UTF-8
+* tab size: 8 (not used)
+* indentation:4
+*
+* created on: 2005aug25
+* created by: Markus W. Scherer
+*
+* Read, modify, and write ICU .dat data package files.
+* This is an integral part of the icupkg tool, moved to the toolutil library
+* because parts of tool implementations tend to be later shared by
+* other tools.
+* Subsumes functionality and implementation code from
+* gencmn, decmn, and icuswap tools.
+*/
+
+#include "unicode/utypes.h"
+#include "unicode/putil.h"
+#include "unicode/udata.h"
+#include "cstring.h"
+#include "uarrsort.h"
+#include "ucmndata.h"
+#include "udataswp.h"
+#include "swapimpl.h"
+#include "toolutil.h"
+#include "package.h"
+#include "cmemory.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+
+static const int32_t kItemsChunk = 256; /* How much to increase the filesarray by each time */
+
+// general definitions ----------------------------------------------------- ***
+
+/* UDataInfo cf. udata.h */
+/*
+ * Template for the header of a .dat package in this platform's endianness and charset.
+ * Package::Package() copies it into the header of an empty package, and readPackage()
+ * checks input files for the same dataFormat bytes and formatVersion[0]==1.
+ */
+static const UDataInfo dataInfo={
+ (uint16_t)sizeof(UDataInfo),
+ 0,
+
+ U_IS_BIG_ENDIAN,
+ U_CHARSET_FAMILY,
+ (uint8_t)sizeof(char16_t),
+ 0,
+
+ {0x43, 0x6d, 0x6e, 0x44}, /* dataFormat="CmnD" */
+ {1, 0, 0, 0}, /* formatVersion */
+ {3, 0, 0, 0} /* dataVersion */
+};
+
+U_CDECL_BEGIN
+/* Error-printing callback installed on the UDataSwapper: formats the message onto the FILE passed as context. */
+static void U_CALLCONV
+printPackageError(void *context, const char *fmt, va_list args) {
+    FILE *out = static_cast<FILE *>(context);
+    vfprintf(out, fmt, args);
+}
+U_CDECL_END
+
+/* Returns x with its two bytes exchanged. */
+static uint16_t
+readSwapUInt16(uint16_t x) {
+    const uint16_t high = (uint16_t)(x<<8);
+    const uint16_t low = (uint16_t)(x>>8);
+    return (uint16_t)(high|low);
+}
+
+// platform types ---------------------------------------------------------- ***
+
+/*
+ * Type letters, indexed by the type enum below.
+ * makeTypeEnum(charset, isBigEndian) computes that index as 2*charset+isBigEndian.
+ * The '?' fills the TYPE_LE slot, for which makeTypeEnum(char) accepts no letter.
+ */
+static const char *types="lb?e";
+
+enum { TYPE_L, TYPE_B, TYPE_LE, TYPE_E, TYPE_COUNT };
+
+/* Combines charset family and endianness into a type enum: 2*charset+isBigEndian. */
+static inline int32_t
+makeTypeEnum(uint8_t charset, UBool isBigEndian) {
+    const int32_t charsetPart=2*(int32_t)charset;
+    return charsetPart+isBigEndian;
+}
+
+/* Maps a type letter ('l', 'b' or 'e') to its type enum; any other letter yields -1. */
+static inline int32_t
+makeTypeEnum(char type) {
+    switch(type) {
+    case 'l':
+        return TYPE_L;
+    case 'b':
+        return TYPE_B;
+    case 'e':
+        return TYPE_E;
+    default:
+        return -1;
+    }
+}
+
+/* Returns the type letter for the given charset family and endianness. */
+static inline char
+makeTypeLetter(uint8_t charset, UBool isBigEndian) {
+    const int32_t typeEnum=makeTypeEnum(charset, isBigEndian);
+    return types[typeEnum];
+}
+
+/* Returns the type letter stored in types[] for the given type enum (no range check). */
+static inline char
+makeTypeLetter(int32_t typeEnum) {
+    const char letter=types[typeEnum];
+    return letter;
+}
+
+/* Splits a type letter into its charset family (enum>>1) and endianness (enum&1). */
+static void
+makeTypeProps(char type, uint8_t &charset, UBool &isBigEndian) {
+    const int32_t typeEnum=makeTypeEnum(type);
+    isBigEndian=(UBool)(typeEnum&1);
+    charset=(uint8_t)(typeEnum>>1);
+}
+
+/*
+ * Validates the data header at the start of data and returns a pointer to its UDataInfo.
+ * A negative length means that the length is unknown and is not checked.
+ * infoLength and headerLength receive the sizes stored in the header, byte-swapped
+ * if the data's endianness differs from the platform's.
+ * Returns nullptr and sets *pErrorCode to U_ILLEGAL_ARGUMENT_ERROR for bad arguments,
+ * or to U_UNSUPPORTED_ERROR if the header is not recognized or inconsistent.
+ */
+U_CFUNC const UDataInfo *
+getDataInfo(const uint8_t *data, int32_t length,
+            int32_t &infoLength, int32_t &headerLength,
+            UErrorCode *pErrorCode) {
+    if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) {
+        return nullptr;
+    }
+
+    const int32_t minHeaderSize=(int32_t)sizeof(DataHeader);
+    if(data==nullptr || (length>=0 && length<minHeaderSize)) {
+        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
+        return nullptr;
+    }
+
+    const DataHeader *pHeader=(const DataHeader *)data;
+    const UDataInfo *pInfo=&pHeader->info;
+
+    // the length was already checked above; verify magic bytes and char16_t size
+    if( pHeader->dataHeader.magic1!=0xda ||
+        pHeader->dataHeader.magic2!=0x27 ||
+        pInfo->sizeofUChar!=2
+    ) {
+        *pErrorCode=U_UNSUPPORTED_ERROR;
+        return nullptr;
+    }
+
+    if(pInfo->isBigEndian!=U_IS_BIG_ENDIAN) {
+        // opposite endianness: swap the 16-bit size fields
+        headerLength=readSwapUInt16(pHeader->dataHeader.headerSize);
+        infoLength=readSwapUInt16(pInfo->size);
+    } else {
+        headerLength=pHeader->dataHeader.headerSize;
+        infoLength=pInfo->size;
+    }
+
+    if( headerLength<minHeaderSize ||
+        infoLength<(int32_t)sizeof(UDataInfo) ||
+        headerLength<(int32_t)(sizeof(pHeader->dataHeader)+infoLength) ||
+        (length>=0 && length<headerLength)
+    ) {
+        *pErrorCode=U_UNSUPPORTED_ERROR;
+        return nullptr;
+    }
+
+    return pInfo;
+}
+
+/*
+ * Returns the type enum for the charset family and endianness recorded in the
+ * data's header, or -1 if getDataInfo() rejects the data.
+ */
+static int32_t
+getTypeEnumForInputData(const uint8_t *data, int32_t length,
+                        UErrorCode *pErrorCode) {
+    int32_t infoLength, headerLength;
+
+    /* getDataInfo() checks for illegal arguments */
+    const UDataInfo *pInfo=getDataInfo(data, length, infoLength, headerLength, pErrorCode);
+    return pInfo==nullptr ?
+        -1 :
+        makeTypeEnum(pInfo->charsetFamily, (UBool)pInfo->isBigEndian);
+}
+
+// file handling ----------------------------------------------------------- ***
+
+/*
+ * Copies the basename of filename, without its ".dat" suffix, into pkg.
+ * Prints a message and exits if the basename does not end with ".dat",
+ * has nothing before it, or does not fit into capacity bytes.
+ */
+static void
+extractPackageName(const char *filename,
+                   char pkg[], int32_t capacity) {
+    const char *basename=findBasename(filename);
+    const int32_t len=(int32_t)strlen(basename)-4; /* -4: subtract the length of ".dat" */
+
+    const bool endsWithDat=(len>0 && 0==strcmp(basename+len, ".dat"));
+    if(!endsWithDat) {
+        fprintf(stderr, "icupkg: \"%s\" is not recognized as a package filename (must end with .dat)\n",
+                basename);
+        exit(U_ILLEGAL_ARGUMENT_ERROR);
+    }
+
+    if(len>=capacity) {
+        fprintf(stderr, "icupkg: the package name \"%s\" is too long (>=%ld)\n",
+                basename, (long)capacity);
+        exit(U_ILLEGAL_ARGUMENT_ERROR);
+    }
+
+    memcpy(pkg, basename, len);
+    pkg[len]=0;
+}
+
+/*
+ * Returns the offset ftell() reports at the end of the file, converted to int32_t,
+ * and leaves the stream positioned at the start of the file.
+ */
+static int32_t
+getFileLength(FILE *f) {
+    fseek(f, 0, SEEK_END);
+    const long endPos=ftell(f);
+    fseek(f, 0, SEEK_SET);
+    return (int32_t)endPos;
+}
+
+/*
+ * Turn tree separators and alternate file separators into normal file separators.
+ * The string is modified in place; this is a no-op macro when all three
+ * separator characters are the same.
+ */
+#if U_TREE_ENTRY_SEP_CHAR==U_FILE_SEP_CHAR && U_FILE_ALT_SEP_CHAR==U_FILE_SEP_CHAR
+#define treeToPath(s)
+#else
+static void
+treeToPath(char *s) {
+    while(*s!=0) {
+        const char c=*s;
+        if(c==U_TREE_ENTRY_SEP_CHAR || c==U_FILE_ALT_SEP_CHAR) {
+            *s=U_FILE_SEP_CHAR;
+        }
+        ++s;
+    }
+}
+#endif
+
+/*
+ * Turn file separators into tree separators.
+ * The string is modified in place; this is a no-op macro when all three
+ * separator characters are the same.
+ */
+#if U_TREE_ENTRY_SEP_CHAR==U_FILE_SEP_CHAR && U_FILE_ALT_SEP_CHAR==U_FILE_SEP_CHAR
+#define pathToTree(s)
+#else
+static void
+pathToTree(char *s) {
+    while(*s!=0) {
+        const char c=*s;
+        if(c==U_FILE_SEP_CHAR || c==U_FILE_ALT_SEP_CHAR) {
+            *s=U_TREE_ENTRY_SEP_CHAR;
+        }
+        ++s;
+    }
+}
+#endif
+
+/*
+ * Prepend the path (if any) to the name and run the name through treeToPath().
+ *
+ * path:     optional directory; nullptr or empty means that name is used alone.
+ * name:     item name, possibly containing tree separators.
+ * filename: output buffer that receives path + file separator (if missing) + name.
+ * capacity: size of the output buffer in bytes.
+ * Prints a message and exits if the result does not fit.
+ */
+static void
+makeFullFilename(const char *path, const char *name,
+                 char *filename, int32_t capacity) {
+    char *s;
+
+    // prepend the path unless nullptr or empty
+    if(path!=nullptr && path[0]!=0) {
+        if((int32_t)(strlen(path)+1)>=capacity) {
+            fprintf(stderr, "pathname too long: \"%s\"\n", path);
+            exit(U_BUFFER_OVERFLOW_ERROR);
+        }
+        strcpy(filename, path);
+
+        // make sure the path ends with a file separator
+        s=strchr(filename, 0);
+        if(*(s-1)!=U_FILE_SEP_CHAR && *(s-1)!=U_FILE_ALT_SEP_CHAR) {
+            *s++=U_FILE_SEP_CHAR;
+        }
+    } else {
+        s=filename;
+    }
+
+    // turn the name into a filename, turn tree separators into file separators
+    if((int32_t)((s-filename)+strlen(name))>=capacity) {
+        // filename is not NUL-terminated at this point (the separator may have
+        // replaced the terminator, or nothing was written at all), so print
+        // exactly the s-filename characters that are valid
+        fprintf(stderr, "path/filename too long: \"%.*s%s\"\n",
+                (int)(s-filename), filename, name);
+        exit(U_BUFFER_OVERFLOW_ERROR);
+    }
+    strcpy(s, name);
+    treeToPath(s);
+}
+
+/*
+ * Builds the full filename like makeFullFilename() and then creates every
+ * directory named by the name part of the result.
+ * Prints a message and exits if a directory cannot be created.
+ */
+static void
+makeFullFilenameAndDirs(const char *path, const char *name,
+                        char *filename, int32_t capacity) {
+    makeFullFilename(path, name, filename, capacity);
+
+    // make tree directories, scanning only the name part of the filename
+    UErrorCode errorCode=U_ZERO_ERROR;
+    char *cursor=filename+strlen(filename)-strlen(name);
+    for(;;) {
+        char *sep=strchr(cursor, U_FILE_SEP_CHAR);
+        if(sep==nullptr) {
+            break;
+        }
+        if(sep!=filename) {
+            *sep=0; // truncate temporarily
+            uprv_mkdir(filename, &errorCode);
+            if(U_FAILURE(errorCode)) {
+                fprintf(stderr, "icupkg: unable to create tree directory \"%s\"\n", filename);
+                exit(U_FILE_ACCESS_ERROR);
+            }
+        }
+        *sep=U_FILE_SEP_CHAR; // restore file separator character
+        cursor=sep+1;
+    }
+}
+
+/*
+ * Reads the file path/name (combined by makeFullFilename()) into a newly allocated buffer.
+ * length receives the file size rounded up to a multiple of 16; the padding bytes are 0xaa.
+ * type receives the type letter derived from the file's data header.
+ * Every failure prints a message to stderr and calls exit().
+ * Returns the buffer allocated with uprv_malloc(); it is released from the LocalMemory
+ * with orphan(), so the caller is responsible for freeing it.
+ */
+static uint8_t *
+readFile(const char *path, const char *name, int32_t &length, char &type) {
+ char filename[1024];
+ FILE *file;
+ UErrorCode errorCode;
+ int32_t fileLength, typeEnum;
+
+ makeFullFilename(path, name, filename, (int32_t)sizeof(filename));
+
+ /* open the input file, get its length, allocate memory for it, read the file */
+ file=fopen(filename, "rb");
+ if(file==nullptr) {
+ fprintf(stderr, "icupkg: unable to open input file \"%s\"\n", filename);
+ exit(U_FILE_ACCESS_ERROR);
+ }
+
+ /* get the file length */
+ fileLength=getFileLength(file);
+ if(ferror(file) || fileLength<=0) {
+ fprintf(stderr, "icupkg: empty input file \"%s\"\n", filename);
+ fclose(file);
+ exit(U_FILE_ACCESS_ERROR);
+ }
+
+ /* allocate the buffer, pad to multiple of 16 */
+ length=(fileLength+0xf)&~0xf;
+ icu::LocalMemory<uint8_t> data((uint8_t *)uprv_malloc(length));
+ if(data.isNull()) {
+ fclose(file);
+ fprintf(stderr, "icupkg: malloc error allocating %d bytes.\n", (int)length);
+ exit(U_MEMORY_ALLOCATION_ERROR);
+ }
+
+ /* read the file */
+ if(fileLength!=(int32_t)fread(data.getAlias(), 1, fileLength, file)) {
+ fprintf(stderr, "icupkg: error reading \"%s\"\n", filename);
+ fclose(file);
+ exit(U_FILE_ACCESS_ERROR);
+ }
+
+ /* pad the file to a multiple of 16 using the usual padding byte */
+ if(fileLength<length) {
+ memset(data.getAlias()+fileLength, 0xaa, length-fileLength);
+ }
+
+ fclose(file);
+
+ // minimum check for ICU-format data
+ errorCode=U_ZERO_ERROR;
+ typeEnum=getTypeEnumForInputData(data.getAlias(), length, &errorCode);
+ if(typeEnum<0 || U_FAILURE(errorCode)) {
+ fprintf(stderr, "icupkg: not an ICU data file: \"%s\"\n", filename);
+#if !UCONFIG_NO_LEGACY_CONVERSION
+ exit(U_INVALID_FORMAT_ERROR);
+#else
+ // with UCONFIG_NO_LEGACY_CONVERSION this is reported but the process exits with status 0
+ fprintf(stderr, "U_INVALID_FORMAT_ERROR occurred but UCONFIG_NO_LEGACY_CONVERSION is on so this is expected.\n");
+ exit(0);
+#endif
+ }
+ type=makeTypeLetter(typeEnum);
+
+ return data.orphan();
+}
+
+// .dat package file representation ---------------------------------------- ***
+
+U_CDECL_BEGIN
+
+/* Comparison callback: orders two Items by strcmp() of their names; the context is unused. */
+static int32_t U_CALLCONV
+compareItems(const void * /*context*/, const void *left, const void *right) {
+    U_NAMESPACE_USE
+
+    const Item *lhs=(const Item *)left;
+    const Item *rhs=(const Item *)right;
+    return (int32_t)strcmp(lhs->name, rhs->name);
+}
+
+U_CDECL_END
+
+U_NAMESPACE_BEGIN
+
+/*
+ * Constructs an empty package: no input data, no items, platform charset and
+ * endianness, and a data header built from the file-level dataInfo template.
+ */
+Package::Package()
+ : doAutoPrefix(false), prefixEndsWithType(false) {
+ inPkgName[0]=0;
+ pkgPrefix[0]=0;
+ inData=nullptr;
+ inLength=0;
+ inCharset=U_CHARSET_FAMILY;
+ inIsBigEndian=U_IS_BIG_ENDIAN;
+
+ itemCount=0;
+ itemMax=0;
+ items=nullptr;
+
+ inStringTop=outStringTop=0;
+
+ matchMode=0;
+ findPrefix=findSuffix=nullptr;
+ findPrefixLength=findSuffixLength=0;
+ findNextIndex=-1;
+
+ // create a header for an empty package
+ DataHeader *pHeader;
+ pHeader=(DataHeader *)header;
+ pHeader->dataHeader.magic1=0xda;
+ pHeader->dataHeader.magic2=0x27;
+ memcpy(&pHeader->info, &dataInfo, sizeof(dataInfo));
+ // NOTE(review): the 4 presumably is the size of the dataHeader part (headerSize plus two magic bytes) - confirm against DataHeader
+ headerLength=(int32_t)(4+sizeof(dataInfo));
+ if(headerLength&0xf) {
+ /* NUL-pad the header to a multiple of 16 */
+ int32_t length=(headerLength+0xf)&~0xf;
+ memset(header+headerLength, 0, length-headerLength);
+ headerLength=length;
+ }
+ // store the final, padded header length in the header itself
+ pHeader->dataHeader.headerSize=(uint16_t)headerLength;
+}
+
+/* Frees the input data, the data of every item that owns its data, and the item array. */
+Package::~Package() {
+    uprv_free(inData);
+
+    for(int32_t idx=0; idx<itemCount; ++idx) {
+        if(!items[idx].isDataOwned) {
+            continue;
+        }
+        uprv_free(items[idx].data);
+    }
+
+    uprv_free((void*)items);
+}
+
+/* Stores p as the package prefix; prints a message and exits if it does not fit into pkgPrefix. */
+void
+Package::setPrefix(const char *p) {
+    const size_t needed=strlen(p)+1; // including the terminating NUL
+    if(needed>sizeof(pkgPrefix)) {
+        fprintf(stderr, "icupkg: --toc_prefix %s too long\n", p);
+        exit(U_ILLEGAL_ARGUMENT_ERROR);
+    }
+    memcpy(pkgPrefix, p, needed);
+}
+
+/*
+ * Read an existing .dat package file into this object.
+ *
+ * The data header and the item name strings are swapped to the local
+ * platform properties; the items themselves are left unswapped and point
+ * into the file buffer (inData), so they are not owned by their Item entries.
+ * Each item gets its platform type letter from getTypeEnumForInputData().
+ *
+ * Every failure (unreadable or malformed input) prints a message to stderr
+ * and exits the tool.
+ *
+ * The item count and the data offset of the last item are validated before
+ * they are used. NOTE(review): the name and data offsets of the individual
+ * ToC entries are still used without range checks.
+ *
+ * @param filename path of the .dat package file
+ */
+void
+Package::readPackage(const char *filename) {
+    UDataSwapper *ds;
+    const UDataInfo *pInfo;
+    UErrorCode errorCode;
+
+    const uint8_t *inBytes;
+
+    int32_t length, offset, i;
+    int32_t itemLength, typeEnum;
+    char type;
+
+    const UDataOffsetTOCEntry *inEntries;
+
+    extractPackageName(filename, inPkgName, (int32_t)sizeof(inPkgName));
+
+    /* read the file */
+    inData=readFile(nullptr, filename, inLength, type);
+    length=inLength;
+
+    /*
+     * swap the header - even if the swapping itself is a no-op
+     * because it tells us the header length
+     */
+    errorCode=U_ZERO_ERROR;
+    makeTypeProps(type, inCharset, inIsBigEndian);
+    ds=udata_openSwapper(inIsBigEndian, inCharset, U_IS_BIG_ENDIAN, U_CHARSET_FAMILY, &errorCode);
+    if(U_FAILURE(errorCode)) {
+        fprintf(stderr, "icupkg: udata_openSwapper(\"%s\") failed - %s\n",
+                filename, u_errorName(errorCode));
+        exit(errorCode);
+    }
+
+    ds->printError=printPackageError;
+    ds->printErrorContext=stderr;
+
+    // never hand more bytes to the header swapper than the file or header[] holds
+    headerLength=sizeof(header);
+    if(length<headerLength) {
+        headerLength=length;
+    }
+    headerLength=udata_swapDataHeader(ds, inData, headerLength, header, &errorCode);
+    if(U_FAILURE(errorCode)) {
+        exit(errorCode);
+    }
+
+    /* check data format and format version */
+    pInfo=(const UDataInfo *)((const char *)inData+4);
+    if(!(
+        pInfo->dataFormat[0]==0x43 &&   /* dataFormat="CmnD" */
+        pInfo->dataFormat[1]==0x6d &&
+        pInfo->dataFormat[2]==0x6e &&
+        pInfo->dataFormat[3]==0x44 &&
+        pInfo->formatVersion[0]==1
+    )) {
+        fprintf(stderr, "icupkg: data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as an ICU .dat package\n",
+                pInfo->dataFormat[0], pInfo->dataFormat[1],
+                pInfo->dataFormat[2], pInfo->dataFormat[3],
+                pInfo->formatVersion[0]);
+        exit(U_UNSUPPORTED_ERROR);
+    }
+    inIsBigEndian=(UBool)pInfo->isBigEndian;
+    inCharset=pInfo->charsetFamily;
+
+    // the package body follows the header: int32_t itemCount, then the ToC entries
+    inBytes=(const uint8_t *)inData+headerLength;
+    inEntries=(const UDataOffsetTOCEntry *)(inBytes+4);
+
+    /* check that the itemCount fits, then the ToC table, then at least the header of the last item */
+    length-=headerLength;
+    if(length<4) {
+        /* itemCount does not fit */
+        offset=0x7fffffff;
+    } else {
+        itemCount=udata_readInt32(ds, *(const int32_t *)inBytes);
+        if(itemCount<0 || itemCount>(length-4)/8) {
+            /*
+             * The count comes straight from the file.
+             * Reject a negative value, and compare by division so that
+             * 4+8*itemCount cannot overflow: the ToC table does not fit.
+             */
+            offset=0x7fffffff;
+        } else if(itemCount==0) {
+            offset=4;
+        } else {
+            setItemCapacity(itemCount); /* resize so there's space */
+
+            /* offset of the last item plus at least 20 bytes for its header */
+            uint32_t lastDataOffset=ds->readUInt32(inEntries[itemCount-1].dataOffset);
+            if(lastDataOffset>(uint32_t)(0x7fffffff-20)) {
+                /* 20+offset does not fit into an int32_t, so it cannot be inside the file */
+                offset=0x7fffffff;
+            } else {
+                offset=20+(int32_t)lastDataOffset;
+            }
+        }
+    }
+    if(length<offset) {
+        fprintf(stderr, "icupkg: too few bytes (%ld after header) for a .dat package\n",
+                        (long)length);
+        exit(U_INDEX_OUTOFBOUNDS_ERROR);
+    }
+    /* do not modify the package length variable until the last item's length is set */
+
+    if(itemCount<=0) {
+        if(doAutoPrefix) {
+            fprintf(stderr, "icupkg: --auto_toc_prefix[_with_type] but the input package is empty\n");
+            exit(U_INVALID_FORMAT_ERROR);
+        }
+    } else {
+        char prefix[MAX_PKG_NAME_LENGTH+4];
+        char *s, *inItemStrings;
+
+        if(itemCount>itemMax) {
+            fprintf(stderr, "icupkg: too many items, maximum is %d\n", itemMax);
+            exit(U_BUFFER_OVERFLOW_ERROR);
+        }
+
+        /* swap the item name strings */
+        // the strings start right after the ToC table and end where the first item begins
+        int32_t stringsOffset=4+8*itemCount;
+        itemLength=(int32_t)(ds->readUInt32(inEntries[0].dataOffset))-stringsOffset;
+
+        // don't include padding bytes at the end of the item names
+        while(itemLength>0 && inBytes[stringsOffset+itemLength-1]!=0) {
+            --itemLength;
+        }
+
+        if((inStringTop+itemLength)>STRING_STORE_SIZE) {
+            fprintf(stderr, "icupkg: total length of item name strings too long\n");
+            exit(U_BUFFER_OVERFLOW_ERROR);
+        }
+
+        inItemStrings=inStrings+inStringTop;
+        ds->swapInvChars(ds, inBytes+stringsOffset, itemLength, inItemStrings, &errorCode);
+        if(U_FAILURE(errorCode)) {
+            fprintf(stderr, "icupkg failed to swap the input .dat package item name strings\n");
+            exit(U_INVALID_FORMAT_ERROR);
+        }
+        inStringTop+=itemLength;
+
+        // reset the Item entries
+        memset(items, 0, itemCount*sizeof(Item));
+
+        /*
+         * Get the common prefix of the items.
+         * New-style ICU .dat packages use tree separators ('/') between package names,
+         * tree names, and item names,
+         * while old-style ICU .dat packages (before multi-tree support)
+         * use an underscore ('_') between package and item names.
+         */
+        offset=(int32_t)ds->readUInt32(inEntries[0].nameOffset)-stringsOffset;
+        s=inItemStrings+offset;  // name of the first entry
+        int32_t prefixLength;
+        if(doAutoPrefix) {
+            // Use the first entry's prefix. Must be a new-style package.
+            const char *prefixLimit=strchr(s, U_TREE_ENTRY_SEP_CHAR);
+            if(prefixLimit==nullptr) {
+                fprintf(stderr,
+                        "icupkg: --auto_toc_prefix[_with_type] but "
+                        "the first entry \"%s\" does not contain a '%c'\n",
+                        s, U_TREE_ENTRY_SEP_CHAR);
+                exit(U_INVALID_FORMAT_ERROR);
+            }
+            prefixLength=(int32_t)(prefixLimit-s);
+            if(prefixLength==0 || prefixLength>=UPRV_LENGTHOF(pkgPrefix)) {
+                fprintf(stderr,
+                        "icupkg: --auto_toc_prefix[_with_type] but "
+                        "the prefix of the first entry \"%s\" is empty or too long\n",
+                        s);
+                exit(U_INVALID_FORMAT_ERROR);
+            }
+            if(prefixEndsWithType && s[prefixLength-1]!=type) {
+                fprintf(stderr,
+                        "icupkg: --auto_toc_prefix_with_type but "
+                        "the prefix of the first entry \"%s\" does not end with '%c'\n",
+                        s, type);
+                exit(U_INVALID_FORMAT_ERROR);
+            }
+            memcpy(pkgPrefix, s, prefixLength);
+            pkgPrefix[prefixLength]=0;
+            memcpy(prefix, s, ++prefixLength);  // include the /
+        } else {
+            // Use the package basename as prefix.
+            int32_t inPkgNameLength= static_cast<int32_t>(strlen(inPkgName));
+            memcpy(prefix, inPkgName, inPkgNameLength);
+            prefixLength=inPkgNameLength;
+
+            if( (int32_t)strlen(s)>=(inPkgNameLength+2) &&
+                0==memcmp(s, inPkgName, inPkgNameLength) &&
+                s[inPkgNameLength]=='_'
+            ) {
+                // old-style .dat package
+                prefix[prefixLength++]='_';
+            } else {
+                // new-style .dat package
+                prefix[prefixLength++]=U_TREE_ENTRY_SEP_CHAR;
+                // if it turns out to not contain U_TREE_ENTRY_SEP_CHAR
+                // then the test in the loop below will fail
+            }
+        }
+        prefix[prefixLength]=0;
+
+        /* read the ToC table */
+        for(i=0; i<itemCount; ++i) {
+            // skip the package part of the item name, error if it does not match the actual package name
+            // or if nothing follows the package name
+            offset=(int32_t)ds->readUInt32(inEntries[i].nameOffset)-stringsOffset;
+            s=inItemStrings+offset;
+            if(0!=strncmp(s, prefix, prefixLength) || s[prefixLength]==0) {
+                fprintf(stderr, "icupkg: input .dat item name \"%s\" does not start with \"%s\"\n",
+                        s, prefix);
+                exit(U_INVALID_FORMAT_ERROR);
+            }
+            items[i].name=s+prefixLength;
+
+            // set the item's data
+            items[i].data=(uint8_t *)inBytes+ds->readUInt32(inEntries[i].dataOffset);
+            if(i>0) {
+                // an item ends where the next one starts
+                items[i-1].length=(int32_t)(items[i].data-items[i-1].data);
+
+                // set the previous item's platform type
+                typeEnum=getTypeEnumForInputData(items[i-1].data, items[i-1].length, &errorCode);
+                if(typeEnum<0 || U_FAILURE(errorCode)) {
+                    fprintf(stderr, "icupkg: not an ICU data file: item \"%s\" in \"%s\"\n", items[i-1].name, filename);
+                    exit(U_INVALID_FORMAT_ERROR);
+                }
+                items[i-1].type=makeTypeLetter(typeEnum);
+            }
+            // the data points into inData, which the destructor frees as a whole
+            items[i].isDataOwned=false;
+        }
+        // set the last item's length
+        items[itemCount-1].length=length-ds->readUInt32(inEntries[itemCount-1].dataOffset);
+
+        // set the last item's platform type
+        typeEnum=getTypeEnumForInputData(items[itemCount-1].data, items[itemCount-1].length, &errorCode);
+        if(typeEnum<0 || U_FAILURE(errorCode)) {
+            fprintf(stderr, "icupkg: not an ICU data file: item \"%s\" in \"%s\"\n", items[itemCount-1].name, filename);
+            exit(U_INVALID_FORMAT_ERROR);
+        }
+        items[itemCount-1].type=makeTypeLetter(typeEnum);
+
+        if(type!=U_ICUDATA_TYPE_LETTER[0]) {
+            // sort the item names for the local charset
+            sortItems();
+        }
+    }
+
+    udata_closeSwapper(ds);
+}
+
+/*
+ * Return the type letter that makeTypeLetter() derives from the
+ * charset family and endianness recorded for the input package.
+ */
+char
+Package::getInType() {
+    const char typeLetter=makeTypeLetter(inCharset, inIsBigEndian);
+    return typeLetter;
+}
+
+/*
+ * Write a .dat package file with the items in this object,
+ * swapping the header, the ToC, the item names and the items
+ * to the platform properties selected by outType.
+ *
+ * The item names are swapped and re-sorted in the output charset and the
+ * items are swapped in place, so this object's contents are modified.
+ *
+ * Every failure prints a message to stderr and exits the tool; this includes
+ * a failure of fclose(), which is when buffered output is finally written.
+ *
+ * @param filename path of the output file; its package name is used as the
+ *                 item name prefix unless pkgPrefix is set
+ * @param outType  type letter for the output platform properties
+ * @param comment  if not nullptr, replaces the comment in the data header
+ */
+void
+Package::writePackage(const char *filename, char outType, const char *comment) {
+    char prefix[MAX_PKG_NAME_LENGTH+4];
+    UDataOffsetTOCEntry entry;
+    UDataSwapper *dsLocalToOut, *ds[TYPE_COUNT];
+    FILE *file;
+    Item *pItem;
+    char *name;
+    UErrorCode errorCode;
+    int32_t i, length, prefixLength, maxItemLength, basenameOffset, offset, outInt32;
+    uint8_t outCharset;
+    UBool outIsBigEndian;
+
+    extractPackageName(filename, prefix, MAX_PKG_NAME_LENGTH);
+
+    // if there is an explicit comment, then use it, else use what's in the current header
+    if(comment!=nullptr) {
+        /* get the header size minus the current comment */
+        DataHeader *pHeader;
+        int32_t commentLength;
+
+        pHeader=(DataHeader *)header;
+        headerLength=4+pHeader->info.size;
+        commentLength=(int32_t)strlen(comment);
+        if((int32_t)(headerLength+commentLength)>=(int32_t)sizeof(header)) {
+            fprintf(stderr, "icupkg: comment too long\n");
+            exit(U_BUFFER_OVERFLOW_ERROR);
+        }
+        memcpy(header+headerLength, comment, commentLength+1);
+        headerLength+=commentLength;
+        if(headerLength&0xf) {
+            /* NUL-pad the header to a multiple of 16 */
+            int32_t paddedLength=(headerLength+0xf)&~0xf;
+            memset(header+headerLength, 0, paddedLength-headerLength);
+            headerLength=paddedLength;
+        }
+        pHeader->dataHeader.headerSize=(uint16_t)headerLength;
+    }
+
+    makeTypeProps(outType, outCharset, outIsBigEndian);
+
+    // open (TYPE_COUNT-2) swappers
+    // one is a no-op for local type==outType
+    // one type (TYPE_LE) is bogus
+    errorCode=U_ZERO_ERROR;
+    i=makeTypeEnum(outType);
+    ds[TYPE_B]= i==TYPE_B ? nullptr : udata_openSwapper(true, U_ASCII_FAMILY, outIsBigEndian, outCharset, &errorCode);
+    ds[TYPE_L]= i==TYPE_L ? nullptr : udata_openSwapper(false, U_ASCII_FAMILY, outIsBigEndian, outCharset, &errorCode);
+    ds[TYPE_LE]=nullptr;
+    ds[TYPE_E]= i==TYPE_E ? nullptr : udata_openSwapper(true, U_EBCDIC_FAMILY, outIsBigEndian, outCharset, &errorCode);
+    if(U_FAILURE(errorCode)) {
+        fprintf(stderr, "icupkg: udata_openSwapper() failed - %s\n", u_errorName(errorCode));
+        exit(errorCode);
+    }
+    for(i=0; i<TYPE_COUNT; ++i) {
+        if(ds[i]!=nullptr) {
+            ds[i]->printError=printPackageError;
+            ds[i]->printErrorContext=stderr;
+        }
+    }
+
+    // swapper from the local platform properties to the output ones,
+    // nullptr if they are the same and no swapping is needed
+    dsLocalToOut=ds[makeTypeEnum(U_CHARSET_FAMILY, U_IS_BIG_ENDIAN)];
+
+    // create the file and write its contents
+    file=fopen(filename, "wb");
+    if(file==nullptr) {
+        fprintf(stderr, "icupkg: unable to create file \"%s\"\n", filename);
+        exit(U_FILE_ACCESS_ERROR);
+    }
+
+    // swap and write the header
+    if(dsLocalToOut!=nullptr) {
+        udata_swapDataHeader(dsLocalToOut, header, headerLength, header, &errorCode);
+        if(U_FAILURE(errorCode)) {
+            fprintf(stderr, "icupkg: udata_swapDataHeader(local to out) failed - %s\n", u_errorName(errorCode));
+            exit(errorCode);
+        }
+    }
+    length=(int32_t)fwrite(header, 1, headerLength, file);
+    if(length!=headerLength) {
+        fprintf(stderr, "icupkg: unable to write complete header to file \"%s\"\n", filename);
+        exit(U_FILE_ACCESS_ERROR);
+    }
+
+    // prepare and swap the package name with a tree separator
+    // for prepending to item names
+    if(pkgPrefix[0]==0) {
+        prefixLength=(int32_t)strlen(prefix);
+    } else {
+        prefixLength=(int32_t)strlen(pkgPrefix);
+        memcpy(prefix, pkgPrefix, prefixLength);
+        if(prefixEndsWithType) {
+            prefix[prefixLength-1]=outType;
+        }
+    }
+    prefix[prefixLength++]=U_TREE_ENTRY_SEP_CHAR;
+    prefix[prefixLength]=0;
+    if(dsLocalToOut!=nullptr) {
+        dsLocalToOut->swapInvChars(dsLocalToOut, prefix, prefixLength, prefix, &errorCode);
+        if(U_FAILURE(errorCode)) {
+            fprintf(stderr, "icupkg: swapInvChars(output package name) failed - %s\n", u_errorName(errorCode));
+            exit(errorCode);
+        }
+
+        // swap and sort the item names (sorting needs to be done in the output charset)
+        dsLocalToOut->swapInvChars(dsLocalToOut, inStrings, inStringTop, inStrings, &errorCode);
+        if(U_FAILURE(errorCode)) {
+            fprintf(stderr, "icupkg: swapInvChars(item names) failed - %s\n", u_errorName(errorCode));
+            exit(errorCode);
+        }
+        sortItems();
+    }
+
+    // create the output item names in sorted order, with the package name prepended to each
+    for(i=0; i<itemCount; ++i) {
+        length=(int32_t)strlen(items[i].name);
+        name=allocString(false, length+prefixLength);
+        memcpy(name, prefix, prefixLength);
+        memcpy(name+prefixLength, items[i].name, length+1);
+        items[i].name=name;
+    }
+
+    // calculate offsets for item names and items, pad to 16-align items
+    // align only the first item; each item's length is a multiple of 16
+    basenameOffset=4+8*itemCount;
+    offset=basenameOffset+outStringTop;
+    if((length=(offset&15))!=0) {
+        length=16-length;
+        // allocString() reserves its length argument plus 1 bytes, that is, exactly length bytes here
+        memset(allocString(false, length-1), 0xaa, length);
+        offset+=length;
+    }
+
+    // write the table of contents
+    // first the itemCount
+    outInt32=itemCount;
+    if(dsLocalToOut!=nullptr) {
+        dsLocalToOut->swapArray32(dsLocalToOut, &outInt32, 4, &outInt32, &errorCode);
+        if(U_FAILURE(errorCode)) {
+            fprintf(stderr, "icupkg: swapArray32(item count) failed - %s\n", u_errorName(errorCode));
+            exit(errorCode);
+        }
+    }
+    length=(int32_t)fwrite(&outInt32, 1, 4, file);
+    if(length!=4) {
+        fprintf(stderr, "icupkg: unable to write complete item count to file \"%s\"\n", filename);
+        exit(U_FILE_ACCESS_ERROR);
+    }
+
+    // then write the item entries (and collect the maxItemLength)
+    maxItemLength=0;
+    for(i=0; i<itemCount; ++i) {
+        entry.nameOffset=(uint32_t)(basenameOffset+(items[i].name-outStrings));
+        entry.dataOffset=(uint32_t)offset;
+        if(dsLocalToOut!=nullptr) {
+            dsLocalToOut->swapArray32(dsLocalToOut, &entry, 8, &entry, &errorCode);
+            if(U_FAILURE(errorCode)) {
+                fprintf(stderr, "icupkg: swapArray32(item entry %ld) failed - %s\n", (long)i, u_errorName(errorCode));
+                exit(errorCode);
+            }
+        }
+        length=(int32_t)fwrite(&entry, 1, 8, file);
+        if(length!=8) {
+            fprintf(stderr, "icupkg: unable to write complete item entry %ld to file \"%s\"\n", (long)i, filename);
+            exit(U_FILE_ACCESS_ERROR);
+        }
+
+        length=items[i].length;
+        if(length>maxItemLength) {
+            maxItemLength=length;
+        }
+        offset+=length;
+    }
+
+    // write the item names
+    length=(int32_t)fwrite(outStrings, 1, outStringTop, file);
+    if(length!=outStringTop) {
+        fprintf(stderr, "icupkg: unable to write complete item names to file \"%s\"\n", filename);
+        exit(U_FILE_ACCESS_ERROR);
+    }
+
+    // write the items
+    for(pItem=items, i=0; i<itemCount; ++pItem, ++i) {
+        int32_t type=makeTypeEnum(pItem->type);
+        if(ds[type]!=nullptr) {
+            // swap each item from its platform properties to the desired ones
+            udata_swap(
+                ds[type],
+                pItem->data, pItem->length, pItem->data,
+                &errorCode);
+            if(U_FAILURE(errorCode)) {
+                fprintf(stderr, "icupkg: udata_swap(item %ld) failed - %s\n", (long)i, u_errorName(errorCode));
+                exit(errorCode);
+            }
+        }
+        length=(int32_t)fwrite(pItem->data, 1, pItem->length, file);
+        if(length!=pItem->length) {
+            fprintf(stderr, "icupkg: unable to write complete item %ld to file \"%s\"\n", (long)i, filename);
+            exit(U_FILE_ACCESS_ERROR);
+        }
+    }
+
+    // fclose() writes out whatever is still buffered:
+    // if it fails then the file is incomplete as well
+    if(ferror(file) || fclose(file)!=0) {
+        fprintf(stderr, "icupkg: unable to write complete file \"%s\"\n", filename);
+        exit(U_FILE_ACCESS_ERROR);
+    }
+
+    for(i=0; i<TYPE_COUNT; ++i) {
+        udata_closeSwapper(ds[i]);
+    }
+}
+
+/*
+ * Binary search for an item name in the sorted items[].
+ *
+ * @param name   the name to look for
+ * @param length if >=0, compare only the first length characters (prefix
+ *               search) and return the first item with that prefix;
+ *               if negative, compare whole names
+ * @return the non-negative index if found,
+ *         else the binary-not of the insertion point
+ */
+int32_t
+Package::findItem(const char *name, int32_t length) const {
+    const UBool prefixOnly= length>=0;
+    int32_t low=0, high=itemCount;
+
+    while(low<high) {
+        int32_t mid=(low+high)/2;
+        const char *candidate=items[mid].name;
+        int order= prefixOnly ? strncmp(name, candidate, length) : strcmp(name, candidate);
+
+        if(order<0) {
+            high=mid;
+        } else if(order>0) {
+            low=mid+1;
+        } else {
+            if(prefixOnly) {
+                // several items may share the prefix: back up to the first of them
+                while(mid>0 && 0==strncmp(name, items[mid-1].name, length)) {
+                    --mid;
+                }
+            }
+            return mid;
+        }
+    }
+
+    // not found: low is where the name would have to be inserted
+    return ~low;
+}
+
+/*
+ * Set up the state for following findNextItem() calls.
+ * The pattern may contain at most one '*' wildcard, which splits it into a
+ * prefix and a suffix; more than one '*' is a syntax error that exits the tool.
+ * A nullptr or empty pattern matches nothing.
+ * The pattern string is not copied: findPrefix/findSuffix point into it.
+ */
+void
+Package::findItems(const char *pattern) {
+    if(pattern==nullptr || *pattern==0) {
+        findNextIndex=-1;
+        return;
+    }
+
+    const char *star=strchr(pattern, '*');
+    findPrefix=pattern;
+    if(star!=nullptr) {
+        // one wildcard
+        const char *rest=star+1;
+        if(strchr(rest, '*')!=nullptr) {
+            // two or more wildcards
+            fprintf(stderr, "icupkg: syntax error (more than one '*') in item pattern \"%s\"\n", pattern);
+            exit(U_PARSE_ERROR);
+        }
+        findPrefixLength=(int32_t)(star-pattern);
+        findSuffix=rest;
+        findSuffixLength=(int32_t)strlen(rest);
+    } else {
+        // no wildcard: the whole pattern is the prefix
+        findPrefixLength=(int32_t)strlen(pattern);
+        findSuffix=nullptr;
+        findSuffixLength=0;
+    }
+
+    // without a prefix start at the first item, else at the first item with the prefix
+    // (a negative findItem() result means that there is none)
+    findNextIndex= findPrefixLength==0 ? 0 : findItem(findPrefix, findPrefixLength);
+}
+
+/*
+ * Continue the search that findItems() set up.
+ *
+ * @return the index of the next item whose name matches the pattern,
+ *         or -1 when there are no more matches
+ */
+int32_t
+Package::findNextItem() {
+    if(findNextIndex<0) {
+        return -1;
+    }
+
+    while(findNextIndex<itemCount) {
+        const int32_t idx=findNextIndex++;
+        const char *name=items[idx].name;
+        const int32_t nameLength=(int32_t)strlen(name);
+        // number of characters that the * wildcard would have to cover
+        const int32_t middleLength=nameLength-findPrefixLength-findSuffixLength;
+
+        if(middleLength<0) {
+            // item name too short for prefix & suffix
+            continue;
+        }
+        if(findPrefixLength>0 && 0!=memcmp(findPrefix, name, findPrefixLength)) {
+            // left the range of names with this prefix
+            break;
+        }
+        if(findSuffixLength>0 &&
+           0!=memcmp(findSuffix, name+(nameLength-findSuffixLength), findSuffixLength)) {
+            // suffix does not match
+            continue;
+        }
+        // prefix & suffix match
+
+        if((matchMode&MATCH_NOSLASH) &&
+           nullptr!=memchr(name+findPrefixLength, U_TREE_ENTRY_SEP_CHAR, middleLength)) {
+            // the middle (matching the * wildcard) contains a tree separator /
+            continue;
+        }
+
+        // found a matching item
+        return idx;
+    }
+
+    // no more items
+    findNextIndex=-1;
+    return -1;
+}
+
+/*
+ * Store the match mode that findNextItem() tests (MATCH_NOSLASH bit).
+ */
+void
+Package::setMatchMode(uint32_t mode) {
+    this->matchMode=mode;
+}
+
+/*
+ * Add a name-only item: no data, length 0, not owned,
+ * with the type letter U_ICUDATA_TYPE_LETTER[0].
+ */
+void
+Package::addItem(const char *name) {
+    uint8_t *const noData=nullptr;
+    addItem(name, noData, 0, false, U_ICUDATA_TYPE_LETTER[0]);
+}
+
+/*
+ * Add an item, or replace the data of an existing item with the same name.
+ *
+ * A new item is inserted at its sorted position; its name is copied into the
+ * input string store and passed through pathToTree().
+ * When an existing item is replaced, its old data is freed if it was owned.
+ *
+ * @param name        item name
+ * @param data        item data, may be nullptr
+ * @param length      number of bytes in data
+ * @param isDataOwned if true, this object frees data with uprv_free()
+ * @param type        platform type letter of the data
+ */
+void
+Package::addItem(const char *name, uint8_t *data, int32_t length, UBool isDataOwned, char type) {
+    int32_t idx=findItem(name);
+
+    if(idx>=0) {
+        // same-name item found, replace it:
+        // drop the old data but keep the item's name since it is the same
+        if(items[idx].isDataOwned) {
+            uprv_free(items[idx].data);
+        }
+    } else {
+        // new item, make space at the insertion point
+        idx=~idx;
+        ensureItemCapacity();
+
+        // items may have been reallocated, so take the slot address only now
+        Item *slot=items+idx;
+        const int32_t tailCount=itemCount-idx;
+        if(tailCount>0) {
+            // move the following items down
+            memmove(slot+1, slot, tailCount*sizeof(Item));
+        }
+        ++itemCount;
+
+        // reset this Item entry
+        memset(slot, 0, sizeof(Item));
+
+        // copy the item's name
+        const int32_t nameLength=static_cast<int32_t>(strlen(name));
+        slot->name=allocString(true, nameLength);
+        strcpy(slot->name, name);
+        pathToTree(slot->name);
+    }
+
+    // set the item's data
+    Item &item=items[idx];
+    item.data=data;
+    item.length=length;
+    item.isDataOwned=isDataOwned;
+    item.type=type;
+}
+
+/*
+ * Read the file name below filesPath and add its contents as an item
+ * that this object owns.
+ */
+void
+Package::addFile(const char *filesPath, const char *name) {
+    int32_t length;
+    char type;
+
+    // readFile() exits the tool if it fails
+    uint8_t *data=readFile(filesPath, name, length, type);
+    addItem(name, data, length, true, type);
+}
+
+/*
+ * Add every item of listPkg to this package.
+ * The item data is shared with listPkg, not copied and not owned.
+ */
+void
+Package::addItems(const Package &listPkg) {
+    for(int32_t i=0; i<listPkg.itemCount; ++i) {
+        const Item &source=listPkg.items[i];
+        addItem(source.name, source.data, source.length, false, source.type);
+    }
+}
+
+/*
+ * Remove the item at index idx and free its data if it is owned.
+ * An index outside 0..itemCount-1 is ignored, the same way that
+ * getItem() and extractItem() treat an out-of-range index.
+ * If a findItems()/findNextItem() iteration is in progress,
+ * its position is adjusted for the removed entry.
+ */
+void
+Package::removeItem(int32_t idx) {
+    if(idx<0 || idx>=itemCount) {
+        // no such item; do not touch items[] or itemCount
+        return;
+    }
+
+    // remove the item
+    if(items[idx].isDataOwned) {
+        uprv_free(items[idx].data);
+    }
+
+    // move the following items up
+    if((idx+1)<itemCount) {
+        memmove(items+idx, items+idx+1, (itemCount-(idx+1))*sizeof(Item));
+    }
+    --itemCount;
+
+    // the items at and after idx moved up by one: keep the iteration on the same item
+    if(idx<=findNextIndex) {
+        --findNextIndex;
+    }
+}
+
+/*
+ * Remove all items whose names match the pattern (see findItems()).
+ */
+void
+Package::removeItems(const char *pattern) {
+    findItems(pattern);
+    for(int32_t idx=findNextItem(); idx>=0; idx=findNextItem()) {
+        removeItem(idx);
+    }
+}
+
+/*
+ * Use each item name of listPkg as a pattern and
+ * remove the items of this package that match it.
+ */
+void
+Package::removeItems(const Package &listPkg) {
+    for(int32_t i=0; i<listPkg.itemCount; ++i) {
+        removeItems(listPkg.items[i].name);
+    }
+}
+
+/*
+ * Write the item at index idx to the file outName below filesPath.
+ *
+ * If outType is not 0 and differs from the item's type, then the item is
+ * first swapped in place to outType and its type letter is updated.
+ * An out-of-range idx is silently ignored.
+ * Every failure prints a message to stderr and exits the tool; this includes
+ * a failure of fclose(), which is when buffered output is finally written.
+ *
+ * @param filesPath base path for the output file
+ * @param outName   name of the output file
+ * @param idx       index of the item
+ * @param outType   desired type letter, or 0 for "don't swap the item"
+ */
+void
+Package::extractItem(const char *filesPath, const char *outName, int32_t idx, char outType) {
+    char filename[1024];
+    UDataSwapper *ds;
+    FILE *file;
+    Item *pItem;
+    int32_t fileLength;
+    uint8_t itemCharset, outCharset;
+    UBool itemIsBigEndian, outIsBigEndian;
+
+    if(idx<0 || itemCount<=idx) {
+        return;
+    }
+    pItem=items+idx;
+
+    // swap the data to the outType
+    // outType==0: don't swap
+    if(outType!=0 && pItem->type!=outType) {
+        // open the swapper
+        UErrorCode errorCode=U_ZERO_ERROR;
+        makeTypeProps(pItem->type, itemCharset, itemIsBigEndian);
+        makeTypeProps(outType, outCharset, outIsBigEndian);
+        ds=udata_openSwapper(itemIsBigEndian, itemCharset, outIsBigEndian, outCharset, &errorCode);
+        if(U_FAILURE(errorCode)) {
+            fprintf(stderr, "icupkg: udata_openSwapper(item %ld) failed - %s\n",
+                    (long)idx, u_errorName(errorCode));
+            exit(errorCode);
+        }
+
+        ds->printError=printPackageError;
+        ds->printErrorContext=stderr;
+
+        // swap the item from its platform properties to the desired ones
+        udata_swap(ds, pItem->data, pItem->length, pItem->data, &errorCode);
+        if(U_FAILURE(errorCode)) {
+            fprintf(stderr, "icupkg: udata_swap(item %ld) failed - %s\n", (long)idx, u_errorName(errorCode));
+            exit(errorCode);
+        }
+        udata_closeSwapper(ds);
+        // the data was swapped in place, so the item now has the new type
+        pItem->type=outType;
+    }
+
+    // create the file and write its contents
+    makeFullFilenameAndDirs(filesPath, outName, filename, (int32_t)sizeof(filename));
+    file=fopen(filename, "wb");
+    if(file==nullptr) {
+        fprintf(stderr, "icupkg: unable to create file \"%s\"\n", filename);
+        exit(U_FILE_ACCESS_ERROR);
+    }
+    fileLength=(int32_t)fwrite(pItem->data, 1, pItem->length, file);
+
+    // fclose() writes out whatever is still buffered:
+    // if it fails then the file is incomplete as well
+    if(ferror(file) || fileLength!=pItem->length || fclose(file)!=0) {
+        fprintf(stderr, "icupkg: unable to write complete file \"%s\"\n", filename);
+        exit(U_FILE_ACCESS_ERROR);
+    }
+}
+
+/*
+ * Write the item at index idx to a file named like the item, below filesPath.
+ * An out-of-range idx is silently ignored; it is checked here because the
+ * item name is read from items[idx] before the other extractItem() overload
+ * gets a chance to check the index.
+ *
+ * @param filesPath base path for the output file
+ * @param idx       index of the item
+ * @param outType   desired type letter, or 0 for "don't swap the item"
+ */
+void
+Package::extractItem(const char *filesPath, int32_t idx, char outType) {
+    if(idx<0 || itemCount<=idx) {
+        return;
+    }
+    extractItem(filesPath, items[idx].name, idx, outType);
+}
+
+/*
+ * Extract all items whose names match the pattern (see findItems())
+ * to files below filesPath.
+ */
+void
+Package::extractItems(const char *filesPath, const char *pattern, char outType) {
+    findItems(pattern);
+    for(int32_t idx=findNextItem(); idx>=0; idx=findNextItem()) {
+        extractItem(filesPath, idx, outType);
+    }
+}
+
+/*
+ * Use each item name of listPkg as a pattern and
+ * extract the items of this package that match it.
+ */
+void
+Package::extractItems(const char *filesPath, const Package &listPkg, char outType) {
+    for(int32_t i=0; i<listPkg.itemCount; ++i) {
+        extractItems(filesPath, listPkg.items[i].name, outType);
+    }
+}
+
+/* Return the number of items currently in the package. */
+int32_t
+Package::getItemCount() const {
+    return this->itemCount;
+}
+
+/*
+ * Return a pointer to the item at index idx,
+ * or nullptr if idx is not in the range 0..itemCount-1.
+ */
+const Item *
+Package::getItem(int32_t idx) const {
+    if(idx<0 || idx>=itemCount) {
+        return nullptr;
+    }
+    return items+idx;
+}
+
+/*
+ * CheckDependency callback used by checkDependencies().
+ * context is the Package being checked: if targetName is not one of its
+ * items, then the package is flagged as missing items and the dependency
+ * is reported on stderr.
+ */
+void
+Package::checkDependency(void *context, const char *itemName, const char *targetName) {
+    Package *me=static_cast<Package *>(context);
+    if(me->findItem(targetName)>=0) {
+        // the target item is in the package
+        return;
+    }
+    me->isMissingItems=true;
+    fprintf(stderr, "Item %s depends on missing item %s\n", itemName, targetName);
+}
+
+/*
+ * Enumerate the dependencies of all items with checkDependency().
+ * @return true if no dependency target is missing from this package
+ */
+UBool
+Package::checkDependencies() {
+    isMissingItems=false;
+    enumDependencies(this, checkDependency);
+    if(isMissingItems) {
+        return false;
+    }
+    return true;
+}
+
+/*
+ * Run the per-item enumDependencies() on every item of the package,
+ * passing context and check through unchanged.
+ */
+void
+Package::enumDependencies(void *context, CheckDependency check) {
+    int32_t i=0;
+    while(i<itemCount) {
+        enumDependencies(&items[i], context, check);
+        ++i;
+    }
+}
+
+/*
+ * Reserve space for a string in inStrings (in is true) or outStrings.
+ * length does not include the terminating NUL, for which one more byte
+ * is reserved. Exits the tool if the string store would overflow.
+ *
+ * @return pointer to the reserved space
+ */
+char *
+Package::allocString(UBool in, int32_t length) {
+    int32_t &top= in ? inStringTop : outStringTop;
+    char *store= in ? inStrings : outStrings;
+
+    const int32_t newTop=top+(length+1);
+    if(newTop>STRING_STORE_SIZE) {
+        fprintf(stderr, "icupkg: string storage overflow\n");
+        exit(U_BUFFER_OVERFLOW_ERROR);
+    }
+
+    // the string starts at the old top of the store
+    char *p=store+top;
+    top=newTop;
+    return p;
+}
+
+/*
+ * Sort items[] with uprv_sortArray() and the compareItems comparator.
+ * Exits the tool if sorting fails.
+ */
+void
+Package::sortItems() {
+    UErrorCode errorCode=U_ZERO_ERROR;
+    uprv_sortArray(items, itemCount, (int32_t)sizeof(Item), compareItems, nullptr, false, &errorCode);
+    if(U_SUCCESS(errorCode)) {
+        return;
+    }
+    fprintf(stderr, "icupkg: sorting item names failed - %s\n", u_errorName(errorCode));
+    exit(errorCode);
+}
+
+/*
+ * Grow items[] so that it has room for max entries.
+ * Does nothing if the capacity is already at least max.
+ * The existing entries are copied into the new array and the old array is
+ * freed. Exits the tool if the new array cannot be allocated.
+ */
+void Package::setItemCapacity(int32_t max)
+{
+    if(max<=itemMax) {
+        return;
+    }
+
+    const size_t newSize=max*sizeof(items[0]);
+    Item *grown=(Item *)uprv_malloc(newSize);
+    if(grown==nullptr) {
+        fprintf(stderr, "icupkg: Out of memory trying to allocate %lu bytes for %d items\n",
+            (unsigned long)newSize, max);
+        exit(U_MEMORY_ALLOCATION_ERROR);
+    }
+
+    if(items && itemCount>0) {
+        uprv_memcpy(grown, items, (size_t)itemCount*sizeof(items[0]));
+    }
+    uprv_free(items);
+    items=grown;
+    itemMax=max;
+}
+
+/*
+ * Make sure that items[] has room for at least one more entry,
+ * growing it by kItemsChunk entries beyond the current count if it is full.
+ */
+void Package::ensureItemCapacity()
+{
+    if(itemCount<itemMax) {
+        // there is still a free slot
+        return;
+    }
+    setItemCapacity(itemCount+kItemsChunk);
+}
+
+U_NAMESPACE_END
diff --git a/intl/icu/source/tools/toolutil/package.h b/intl/icu/source/tools/toolutil/package.h
new file mode 100644
index 0000000000..ea60c13a74
--- /dev/null
+++ b/intl/icu/source/tools/toolutil/package.h
@@ -0,0 +1,203 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/*
+*******************************************************************************
+*
+* Copyright (C) 2005-2014, International Business Machines
+* Corporation and others. All Rights Reserved.
+*
+*******************************************************************************
+* file name: package.h
+* encoding: UTF-8
+* tab size: 8 (not used)
+* indentation:4
+*
+* created on: 2005aug25
+* created by: Markus W. Scherer
+*
+* Read, modify, and write ICU .dat data package files.
+*/
+
+#ifndef __PACKAGE_H__
+#define __PACKAGE_H__
+
+#include "unicode/utypes.h"
+
+#include <stdio.h>
+
+// .dat package file representation ---------------------------------------- ***
+
+#define STRING_STORE_SIZE 100000
+#define MAX_PKG_NAME_LENGTH 64
+
+typedef void CheckDependency(void *context, const char *itemName, const char *targetName);
+
+U_NAMESPACE_BEGIN
+
+// One entry of a .dat package: an item name plus the item's data bytes.
+struct Item {
+ // NUL-terminated item name; Package points this into its inStrings or outStrings storage
+ char *name;
+ // the item's bytes; may be nullptr for a name-only item
+ uint8_t *data;
+ // number of bytes in data
+ int32_t length;
+ // if true, Package frees data with uprv_free(); otherwise the data belongs to someone else
+ UBool isDataOwned;
+ // platform type letter of the data
+ char type;
+};
+
+/*
+ * In-memory representation of an ICU .dat data package:
+ * a data header plus a sorted array of named items.
+ * Most operations report errors on stderr and exit the tool
+ * rather than returning an error code.
+ */
+class U_TOOLUTIL_API Package {
+public:
+ /*
+ * Constructor.
+ * Prepare this object for a new, empty package.
+ */
+ Package();
+
+ /*
+ * Destructor.
+ * Frees the input file buffer, the data of items owned by this object,
+ * and the item array.
+ */
+ ~Package();
+
+ /**
+ * Uses the prefix of the first entry of the package in readPackage(),
+ * rather than the package basename.
+ */
+ void setAutoPrefix() { doAutoPrefix=true; }
+ /**
+ * Same as setAutoPrefix(), plus the prefix must end with the platform type letter.
+ */
+ void setAutoPrefixWithType() {
+ doAutoPrefix=true;
+ prefixEndsWithType=true;
+ }
+ /*
+ * Set the ToC prefix explicitly.
+ * Exits the tool if the prefix is too long for the internal buffer.
+ */
+ void setPrefix(const char *p);
+
+ /*
+ * Read an existing .dat package file.
+ * The header and item name strings are swapped into this object,
+ * but the items are left unswapped.
+ */
+ void readPackage(const char *filename);
+ /*
+ * Write a .dat package file with the items in this object.
+ * Swap all pieces to the desired output platform properties.
+ * The package becomes unusable:
+ * The item names are swapped and sorted in the outCharset rather than the local one.
+ * Also, the items themselves are swapped in-place.
+ * If comment is not nullptr, then it replaces the comment in the data header.
+ */
+ void writePackage(const char *filename, char outType, const char *comment);
+
+ /*
+ * Return the input data type letter (l, b, or e).
+ */
+ char getInType();
+
+ // find the item in items[], return the non-negative index if found, else the binary-not of the insertion point
+ // if length>=0 then only the first length characters of name are compared,
+ // and the index of the first item with that prefix is returned
+ int32_t findItem(const char *name, int32_t length=-1) const;
+
+ /*
+ * Set internal state for following calls to findNextItem() which will return
+ * indexes for items whose names match the pattern.
+ * The pattern may contain at most one '*' wildcard.
+ * The pattern string is not copied and must stay valid while iterating.
+ */
+ void findItems(const char *pattern);
+ /*
+ * Return the index of the next item that matches the findItems() pattern,
+ * or -1 if there are no more matching items.
+ */
+ int32_t findNextItem();
+ /*
+ * Set the match mode for findItems() & findNextItem().
+ * @param mode 0=default
+ * MATCH_NOSLASH * does not match a '/'
+ */
+ void setMatchMode(uint32_t mode);
+
+ enum {
+ MATCH_NOSLASH=1
+ };
+
+ /* Add a name-only item without data. */
+ void addItem(const char *name);
+ /*
+ * Add an item at its sorted position, or replace the data of an existing item with the same name.
+ * If isDataOwned is true, then this object frees the data with uprv_free().
+ */
+ void addItem(const char *name, uint8_t *data, int32_t length, UBool isDataOwned, char type);
+ /* Read a file and add its contents as an item that is owned by this object. */
+ void addFile(const char *filesPath, const char *name);
+ /* Add all items of listPkg; their data is shared with listPkg and not owned by this object. */
+ void addItems(const Package &listPkg);
+
+ /* Remove the item at itemIndex; a negative index is ignored. */
+ void removeItem(int32_t itemIndex);
+ /* Remove all items whose names match the pattern, see findItems(). */
+ void removeItems(const char *pattern);
+ /* Remove all items that match any of the item names of listPkg, each used as a pattern. */
+ void removeItems(const Package &listPkg);
+
+ /* The extractItem() functions accept outputType=0 to mean "don't swap the item". */
+ void extractItem(const char *filesPath, int32_t itemIndex, char outType);
+ void extractItems(const char *filesPath, const char *pattern, char outType);
+ void extractItems(const char *filesPath, const Package &listPkg, char outType);
+
+ /* This variant extracts an item to a specific filename. */
+ void extractItem(const char *filesPath, const char *outName, int32_t itemIndex, char outType);
+
+ /* Return the number of items in the package. */
+ int32_t getItemCount() const;
+ /* Return the item at index idx, or nullptr if idx is out of range. */
+ const Item *getItem(int32_t idx) const;
+
+ /*
+ * Check dependencies and return true if all dependencies are fulfilled.
+ */
+ UBool checkDependencies();
+
+ /*
+ * Enumerate all the dependencies and give the results to context and call CheckDependency callback
+ * @param context user context (will be passed to check function)
+ * @param check will be called with context and any missing items
+ */
+ void enumDependencies(void *context, CheckDependency check);
+
+private:
+ // enumerate the dependencies of a single item; defined outside of package.cpp's visible part
+ void enumDependencies(Item *pItem, void *context, CheckDependency check);
+
+ /**
+ * Default CheckDependency function used by checkDependencies()
+ */
+ static void checkDependency(void *context, const char *itemName, const char *targetName);
+
+ /*
+ * Allocate a string in inStrings or outStrings.
+ * The length does not include the terminating NUL.
+ */
+ char *allocString(UBool in, int32_t length);
+
+ // sort items[] with the compareItems comparator; exits the tool on failure
+ void sortItems();
+
+ // data fields
+ // package name that readPackage() extracts from the input filename
+ char inPkgName[MAX_PKG_NAME_LENGTH];
+ // ToC prefix from setPrefix() or the auto-prefix; if empty, writePackage() uses the output package name
+ char pkgPrefix[MAX_PKG_NAME_LENGTH];
+
+ // contents of the input file as returned by readFile(), freed by the destructor
+ uint8_t *inData;
+ // data header in local platform properties; writePackage() swaps it in place for output
+ uint8_t header[1024];
+ // length of inData, and number of bytes used in header[]
+ int32_t inLength, headerLength;
+ // platform properties of the input package
+ uint8_t inCharset;
+ UBool inIsBigEndian;
+ // set by setAutoPrefix() and setAutoPrefixWithType()
+ UBool doAutoPrefix;
+ // set by setAutoPrefixWithType()
+ UBool prefixEndsWithType;
+
+ // number of items in items[]
+ int32_t itemCount;
+ // capacity of items[]
+ int32_t itemMax;
+ // item array, sorted by name
+ Item *items;
+
+ // number of bytes used in inStrings and outStrings, respectively
+ int32_t inStringTop, outStringTop;
+ // string stores: inStrings holds the item names, outStrings the names built by writePackage()
+ char inStrings[STRING_STORE_SIZE], outStrings[STRING_STORE_SIZE];
+
+ // match mode for findItems(pattern) and findNextItem()
+ uint32_t matchMode;
+
+ // state for findItems(pattern) and findNextItem()
+ // findPrefix and findSuffix point into the caller's pattern string
+ const char *findPrefix, *findSuffix;
+ int32_t findPrefixLength, findSuffixLength;
+ // index of the next item to look at, or negative if there is nothing (more) to find
+ int32_t findNextIndex;
+
+ // state for checkDependencies()
+ UBool isMissingItems;
+
+ /**
+ * Grow itemMax to new value
+ */
+ void setItemCapacity(int32_t max);
+
+ /**
+ * Grow itemMax to at least itemCount+1
+ */
+ void ensureItemCapacity();
+};
+
+U_NAMESPACE_END
+
+#endif
+
+
diff --git a/intl/icu/source/tools/toolutil/pkg_genc.cpp b/intl/icu/source/tools/toolutil/pkg_genc.cpp
new file mode 100644
index 0000000000..741a8a5228
--- /dev/null
+++ b/intl/icu/source/tools/toolutil/pkg_genc.cpp
@@ -0,0 +1,1396 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/******************************************************************************
+ * Copyright (C) 2009-2016, International Business Machines
+ * Corporation and others. All Rights Reserved.
+ *******************************************************************************
+ */
+#include "unicode/utypes.h"
+
+#if U_PLATFORM_HAS_WIN32_API
+# define VC_EXTRALEAN
+# define WIN32_LEAN_AND_MEAN
+# define NOUSER
+# define NOSERVICE
+# define NOIME
+# define NOMCX
+#include <windows.h>
+#include <time.h>
+# ifdef __GNUC__
+# define WINDOWS_WITH_GNUC
+# endif
+#endif
+
+#if U_PLATFORM_IS_LINUX_BASED && U_HAVE_ELF_H
+# define U_ELF
+#endif
+
+#ifdef U_ELF
+# include <elf.h>
+# if defined(ELFCLASS64)
+# define U_ELF64
+# endif
+ /* Old elf.h headers may not have EM_X86_64, or have EM_X8664 instead. */
+# ifndef EM_X86_64
+# define EM_X86_64 62
+# endif
+# define ICU_ENTRY_OFFSET 0
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "unicode/putil.h"
+#include "cmemory.h"
+#include "cstring.h"
+#include "filestrm.h"
+#include "toolutil.h"
+#include "unicode/uclean.h"
+#include "uoptions.h"
+#include "pkg_genc.h"
+#include "filetools.h"
+#include "charstr.h"
+#include "unicode/errorcode.h"
+
+#define MAX_COLUMN ((uint32_t)(0xFFFFFFFFU))
+
+#define HEX_0X 0 /* 0x1234 */
+#define HEX_0H 1 /* 01234h */
+
+/* prototypes --------------------------------------------------------------- */
+/*
+ * Builds the output file path and the data entry-point name from the input
+ * file name, the optional destination directory and the optional output
+ * base name. Exits the process if either result does not fit its buffer.
+ */
+static void
+getOutFilename(
+    const char *inFilename,
+    const char *destdir,
+    char *outFilename,
+    int32_t outFilenameCapacity,
+    char *entryName,
+    int32_t entryNameCapacity,
+    const char *newSuffix,
+    const char *optFilename);
+
+/* Writes one byte as a decimal literal and returns the updated column. */
+static uint32_t
+write8(FileStream *out, uint8_t byte, uint32_t column);
+
+/*
+ * Writes one 32-bit word as an assembler literal and returns the updated
+ * column. The parameter is named bitField to match the definition.
+ */
+static uint32_t
+write32(FileStream *out, uint32_t bitField, uint32_t column);
+
+#if U_PLATFORM == U_PF_OS400
+/* Writes one byte as a string-literal escape and returns the updated column. */
+static uint32_t
+write8str(FileStream *out, uint8_t byte, uint32_t column);
+#endif
+/* -------------------------------------------------------------------------- */
+
+/*
+Creating Template Files for New Platforms
+
+Let the cc compiler help you get started.
+Compile this program
+ const unsigned int x[5] = {1, 2, 0xdeadbeef, 0xffffffff, 16};
+with the -S option to produce assembly output.
+
+For example, this will generate array.s:
+gcc -S array.c
+
+This will produce a .s file that may look like this:
+
+ .file "array.c"
+ .version "01.01"
+gcc2_compiled.:
+ .globl x
+ .section .rodata
+ .align 4
+ .type x,@object
+ .size x,20
+x:
+ .long 1
+ .long 2
+ .long -559038737
+ .long -1
+ .long 16
+ .ident "GCC: (GNU) 2.96 20000731 (Red Hat Linux 7.1 2.96-85)"
+
+which gives a starting point that will compile, and can be transformed
+to become the template, generally with some consulting of as docs and
+some experimentation.
+
+If you want ICU to automatically use this assembly, you should
+specify "GENCCODE_ASSEMBLY=-a name" in the specific config/mh-* file,
+where the name is the compiler or platform that you used in this
+assemblyHeader data structure.
+*/
+/*
+ * One entry per supported assembler syntax.
+ * writeAssemblyCode() formats `header` and `footer` with snprintf(), passing
+ * the entry-point name for every %s, and write32() emits `beginLine` at the
+ * start of each line of data words.
+ */
+static const struct AssemblyType {
+ /* Name compared against the requested assembly type in checkAssemblyHeaderName(). */
+ const char *name;
+ /* printf-style template written before the data; every %s is the entry-point name. */
+ const char *header;
+ /* Directive that starts each line of 32-bit data words. */
+ const char *beginLine;
+ /* printf-style template written after the data; may be empty. */
+ const char *footer;
+ int8_t hexType; /* HEX_0X or HEX_0H */
+} assemblyHeader[] = {
+ /* For gcc assemblers, the meaning of .align changes depending on the */
+ /* hardware, so we use .balign 16 which always means 16 bytes. */
+ /* https://sourceware.org/binutils/docs/as/Pseudo-Ops.html */
+ {"gcc",
+ ".globl %s\n"
+ "\t.section .note.GNU-stack,\"\",%%progbits\n"
+ "#ifdef __CET__\n"
+ "# include <cet.h>\n"
+ "#endif\n"
+ "\t.section .rodata\n"
+ "\t.balign 16\n"
+ "#ifdef U_HIDE_DATA_SYMBOL\n"
+ "\t.hidden %s\n"
+ "#endif\n"
+ "\t.type %s,%%object\n"
+ "%s:\n\n",
+
+ ".long ",".size %s, .-%s\n",HEX_0X
+ },
+ {"gcc-darwin",
+ /*"\t.section __TEXT,__text,regular,pure_instructions\n"
+ "\t.section __TEXT,__picsymbolstub1,symbol_stubs,pure_instructions,32\n"*/
+ ".globl _%s\n"
+ "#ifdef U_HIDE_DATA_SYMBOL\n"
+ "\t.private_extern _%s\n"
+ "#endif\n"
+ "\t.data\n"
+ "\t.const\n"
+ "\t.balign 16\n"
+ "_%s:\n\n",
+
+ ".long ","",HEX_0X
+ },
+ /* macOS PPC should use `.p2align 4` instead of `.balign 16` because
+ * `.balign` is an unknown pseudo-op on such legacy systems. */
+ {"gcc-darwin-ppc",
+ /*"\t.section __TEXT,__text,regular,pure_instructions\n"
+ "\t.section __TEXT,__picsymbolstub1,symbol_stubs,pure_instructions,32\n"*/
+ ".globl _%s\n"
+ "#ifdef U_HIDE_DATA_SYMBOL\n"
+ "\t.private_extern _%s\n"
+ "#endif\n"
+ "\t.data\n"
+ "\t.const\n"
+ "\t.p2align 4\n"
+ "_%s:\n\n",
+
+ ".long ","",HEX_0X
+ },
+ {"gcc-cygwin",
+ ".globl _%s\n"
+ "\t.section .rodata\n"
+ "\t.balign 16\n"
+ "_%s:\n\n",
+
+ ".long ","",HEX_0X
+ },
+ {"gcc-mingw64",
+ ".globl %s\n"
+ "\t.section .rodata\n"
+ "\t.balign 16\n"
+ "%s:\n\n",
+
+ ".long ","",HEX_0X
+ },
+/* 16 bytes alignment. */
+/* http://docs.oracle.com/cd/E19641-01/802-1947/802-1947.pdf */
+ {"sun",
+ "\t.section \".rodata\"\n"
+ "\t.align 16\n"
+ ".globl %s\n"
+ "%s:\n",
+
+ ".word ","",HEX_0X
+ },
+/* 16 bytes alignment for sun-x86. */
+/* http://docs.oracle.com/cd/E19963-01/html/821-1608/eoiyg.html */
+ {"sun-x86",
+ "Drodata.rodata:\n"
+ "\t.type Drodata.rodata,@object\n"
+ "\t.size Drodata.rodata,0\n"
+ "\t.globl %s\n"
+ "\t.align 16\n"
+ "%s:\n",
+
+ ".4byte ","",HEX_0X
+ },
+/* 1<<4 bit alignment for aix. */
+/* http://pic.dhe.ibm.com/infocenter/aix/v6r1/index.jsp?topic=%2Fcom.ibm.aix.aixassem%2Fdoc%2Falangref%2Fidalangref_csect_pseudoop.htm */
+ {"xlc",
+ ".globl %s{RO}\n"
+ "\t.toc\n"
+ "%s:\n"
+ "\t.csect %s{RO}, 4\n",
+
+ ".long ","",HEX_0X
+ },
+ {"aCC-ia64",
+ "\t.file \"%s.s\"\n"
+ "\t.type %s,@object\n"
+ "\t.global %s\n"
+ "\t.secalias .abe$0.rodata, \".rodata\"\n"
+ "\t.section .abe$0.rodata = \"a\", \"progbits\"\n"
+ "\t.align 16\n"
+ "%s::\t",
+
+ "data4 ","",HEX_0X
+ },
+ {"aCC-parisc",
+ "\t.SPACE $TEXT$\n"
+ "\t.SUBSPA $LIT$\n"
+ "%s\n"
+ "\t.EXPORT %s\n"
+ "\t.ALIGN 16\n",
+
+ ".WORD ","",HEX_0X
+ },
+/* align 16 bytes */
+/* http://msdn.microsoft.com/en-us/library/dwa9fwef.aspx */
+ {"nasm",
+ "global %s\n"
+#if defined(_WIN32)
+ "section .rdata align=16\n"
+#else
+ "section .rodata align=16\n"
+#endif
+ "%s:\n",
+ " dd ","",HEX_0X
+ },
+ { "masm",
+ "\tTITLE %s\n"
+ "; generated by genccode\n"
+ ".386\n"
+ ".model flat\n"
+ "\tPUBLIC _%s\n"
+ "ICUDATA_%s\tSEGMENT READONLY PARA PUBLIC FLAT 'DATA'\n"
+ "\tALIGN 16\n"
+ "_%s\tLABEL DWORD\n",
+ "\tDWORD ","\nICUDATA_%s\tENDS\n\tEND\n",HEX_0H
+ },
+ { "masm64",
+ "\tTITLE %s\n"
+ "; generated by genccode\n"
+ "\tPUBLIC _%s\n"
+ "ICUDATA_%s\tSEGMENT READONLY 'DATA'\n"
+ "\tALIGN 16\n"
+ "_%s\tLABEL DWORD\n",
+ "\tDWORD ","\nICUDATA_%s\tENDS\n\tEND\n",HEX_0H
+ }
+};
+
+/* Index into assemblyHeader[] set by checkAssemblyHeaderName(); -1 while nothing is selected. */
+static int32_t assemblyHeaderIndex = -1;
+/* Hex literal style (HEX_0X or HEX_0H) read by write32(); set by checkAssemblyHeaderName(). */
+static int32_t hexType = HEX_0X;
+
+/*
+ * Selects the assembly syntax whose table name equals optAssembly.
+ * On a match the table index and that entry's hex style are recorded for
+ * the writers in this file; without a match the selection stays reset to -1.
+ * Returns true if a matching assemblyHeader[] entry was found.
+ */
+U_CAPI UBool U_EXPORT2
+checkAssemblyHeaderName(const char* optAssembly) {
+    /* Start from "nothing selected" so a failed lookup leaves no stale index. */
+    assemblyHeaderIndex = -1;
+
+    for (int32_t pos = 0; pos < UPRV_LENGTHOF(assemblyHeader); ++pos) {
+        const struct AssemblyType *candidate = &assemblyHeader[pos];
+        if (uprv_strcmp(optAssembly, candidate->name) != 0) {
+            continue;
+        }
+        assemblyHeaderIndex = pos;
+        hexType = candidate->hexType; /* remember the hex literal style */
+        return true;
+    }
+
+    return false;
+}
+
+
+/*
+ * Prints the names of all supported assembly types to stderr as a
+ * comma-separated list, followed by a closing parenthesis and a newline.
+ */
+U_CAPI void U_EXPORT2
+printAssemblyHeadersToStdErr() {
+    for (int32_t pos = 0; pos < UPRV_LENGTHOF(assemblyHeader); ++pos) {
+        if (pos == 0) {
+            /* the first name has no separator in front of it */
+            fprintf(stderr, "%s", assemblyHeader[pos].name);
+        } else {
+            fprintf(stderr, ", %s", assemblyHeader[pos].name);
+        }
+    }
+    fprintf(stderr,
+        ")\n");
+}
+
+/*
+ * Converts the binary file `filename` into an assembly source file using the
+ * syntax previously selected with checkAssemblyHeaderName().
+ *
+ * @param filename            input data file, read in binary mode
+ * @param destdir             output directory; if nullptr or empty, the
+ *                            directory part of `filename` is used
+ * @param optEntryPoint       if not nullptr, the symbol name becomes
+ *                            optEntryPoint + "_dat" instead of the name
+ *                            derived from `filename`
+ * @param optFilename         if not nullptr, base name of the output file
+ * @param outFilePath         if not nullptr, receives the output file path
+ * @param outFilePathCapacity capacity of outFilePath in bytes
+ *
+ * Any failure prints a message to stderr and exits the process.
+ * If the input length is not a multiple of 4, the final word is padded with
+ * zero bytes so that no input bytes are dropped.
+ */
+U_CAPI void U_EXPORT2
+writeAssemblyCode(
+    const char *filename,
+    const char *destdir,
+    const char *optEntryPoint,
+    const char *optFilename,
+    char *outFilePath,
+    size_t outFilePathCapacity) {
+    uint32_t column = MAX_COLUMN;
+    char entry[96];
+    union {
+        uint32_t uint32s[1024];
+        char chars[4096];
+    } buffer;
+    FileStream *in, *out;
+    size_t i, length, count;
+
+    /* Indexing assemblyHeader[] with the initial -1 would read out of bounds. */
+    if (assemblyHeaderIndex < 0 || assemblyHeaderIndex >= UPRV_LENGTHOF(assemblyHeader)) {
+        fprintf(stderr, "genccode: no assembly type selected\n");
+        exit(U_ILLEGAL_ARGUMENT_ERROR);
+    }
+
+    in=T_FileStream_open(filename, "rb");
+    if(in==nullptr) {
+        fprintf(stderr, "genccode: unable to open input file %s\n", filename);
+        exit(U_FILE_ACCESS_ERROR);
+    }
+
+    const char* newSuffix = nullptr;
+
+    if (uprv_strcmp(assemblyHeader[assemblyHeaderIndex].name, "masm") == 0) {
+        newSuffix = ".masm";
+    }
+    else if (uprv_strcmp(assemblyHeader[assemblyHeaderIndex].name, "nasm") == 0) {
+        newSuffix = ".asm";
+    } else {
+        newSuffix = ".S";
+    }
+
+    getOutFilename(
+        filename,
+        destdir,
+        buffer.chars,
+        sizeof(buffer.chars),
+        entry,
+        sizeof(entry),
+        newSuffix,
+        optFilename);
+    out=T_FileStream_open(buffer.chars, "w");
+    if(out==nullptr) {
+        fprintf(stderr, "genccode: unable to open output file %s\n", buffer.chars);
+        exit(U_FILE_ACCESS_ERROR);
+    }
+
+    if (outFilePath != nullptr) {
+        if (uprv_strlen(buffer.chars) >= outFilePathCapacity) {
+            fprintf(stderr, "genccode: filename too long\n");
+            exit(U_ILLEGAL_ARGUMENT_ERROR);
+        }
+        uprv_strcpy(outFilePath, buffer.chars);
+#if defined (WINDOWS_WITH_GNUC) && U_PLATFORM != U_PF_CYGWIN
+        /* Need to fix the file separator character when using MinGW. */
+        swapFileSepChar(outFilePath, U_FILE_SEP_CHAR, '/');
+#endif
+    }
+
+    if(optEntryPoint != nullptr) {
+        /* sizeof("_dat") counts the suffix and the terminating NUL */
+        if (uprv_strlen(optEntryPoint) + sizeof("_dat") > sizeof(entry)) {
+            fprintf(stderr, "genccode: entry name too long (long filename?)\n");
+            exit(U_ILLEGAL_ARGUMENT_ERROR);
+        }
+        uprv_strcpy(entry, optEntryPoint);
+        uprv_strcat(entry, "_dat");
+    }
+
+    /* turn dashes or dots in the entry name into underscores */
+    length=uprv_strlen(entry);
+    for(i=0; i<length; ++i) {
+        if(entry[i]=='-' || entry[i]=='.') {
+            entry[i]='_';
+        }
+    }
+
+    /* The templates use at most eight %s; surplus arguments are ignored. */
+    count = snprintf(
+        buffer.chars, sizeof(buffer.chars),
+        assemblyHeader[assemblyHeaderIndex].header,
+        entry, entry, entry, entry,
+        entry, entry, entry, entry);
+    if (count >= sizeof(buffer.chars)) {
+        fprintf(stderr, "genccode: entry name too long (long filename?)\n");
+        exit(U_ILLEGAL_ARGUMENT_ERROR);
+    }
+    T_FileStream_writeLine(out, buffer.chars);
+    T_FileStream_writeLine(out, assemblyHeader[assemblyHeaderIndex].beginLine);
+
+    for(;;) {
+        /* Zero the buffer so that a short final read is padded with zeros. */
+        memset(buffer.uint32s, 0, sizeof(buffer.uint32s));
+        length=T_FileStream_read(in, buffer.uint32s, sizeof(buffer.uint32s));
+        if(length==0) {
+            break;
+        }
+        /* Round up so that 1..3 trailing bytes still produce a (padded) word. */
+        const size_t wordCount =
+            (length + sizeof(buffer.uint32s[0]) - 1) / sizeof(buffer.uint32s[0]);
+        for(i=0; i<wordCount; i++) {
+            column = write32(out, buffer.uint32s[i], column);
+        }
+    }
+
+    T_FileStream_writeLine(out, "\n");
+
+    count = snprintf(
+        buffer.chars, sizeof(buffer.chars),
+        assemblyHeader[assemblyHeaderIndex].footer,
+        entry, entry, entry, entry,
+        entry, entry, entry, entry);
+    if (count >= sizeof(buffer.chars)) {
+        fprintf(stderr, "genccode: entry name too long (long filename?)\n");
+        exit(U_ILLEGAL_ARGUMENT_ERROR);
+    }
+    T_FileStream_writeLine(out, buffer.chars);
+
+    if(T_FileStream_error(in)) {
+        fprintf(stderr, "genccode: file read error while generating from file %s\n", filename);
+        exit(U_FILE_ACCESS_ERROR);
+    }
+
+    if(T_FileStream_error(out)) {
+        fprintf(stderr, "genccode: file write error while generating from file %s\n", filename);
+        exit(U_FILE_ACCESS_ERROR);
+    }
+
+    T_FileStream_close(out);
+    T_FileStream_close(in);
+}
+
+/*
+ * Converts the binary file `filename` into a C source file that defines the
+ * data as an initialized constant struct.
+ *
+ * @param filename            input data file, read in binary mode
+ * @param destdir             output directory; if nullptr or empty, the
+ *                            directory part of `filename` is used
+ * @param optEntryPoint       if not nullptr, the symbol name becomes
+ *                            optEntryPoint + "_dat"
+ * @param optName             if not nullptr, optName + "_" is prepended to
+ *                            the symbol name derived from `filename`
+ * @param optFilename         if not nullptr, base name of the output file
+ * @param outFilePath         if not nullptr, receives the output file path
+ * @param outFilePathCapacity capacity of outFilePath in bytes
+ *
+ * Any failure prints a message to stderr and exits the process.
+ */
+U_CAPI void U_EXPORT2
+writeCCode(
+    const char *filename,
+    const char *destdir,
+    const char *optEntryPoint,
+    const char *optName,
+    const char *optFilename,
+    char *outFilePath,
+    size_t outFilePathCapacity) {
+    uint32_t column = MAX_COLUMN;
+    char buffer[4096], entry[96];
+    FileStream *in, *out;
+    size_t i, length, count;
+
+    in=T_FileStream_open(filename, "rb");
+    if(in==nullptr) {
+        fprintf(stderr, "genccode: unable to open input file %s\n", filename);
+        exit(U_FILE_ACCESS_ERROR);
+    }
+
+    if(optName != nullptr) { /* prepend 'icudt28_' */
+        // +2 includes the _ and the NUL
+        if (uprv_strlen(optName) + 2 > sizeof(entry)) {
+            fprintf(stderr, "genccode: entry name too long (long filename?)\n");
+            exit(U_ILLEGAL_ARGUMENT_ERROR);
+        }
+        strcpy(entry, optName);
+        strcat(entry, "_");
+    } else {
+        entry[0] = 0;
+    }
+
+    /* The derived entry name is appended after the optional optName prefix. */
+    getOutFilename(
+        filename,
+        destdir,
+        buffer,
+        static_cast<int32_t>(sizeof(buffer)),
+        entry + uprv_strlen(entry),
+        static_cast<int32_t>(sizeof(entry) - uprv_strlen(entry)),
+        ".c",
+        optFilename);
+
+    if (outFilePath != nullptr) {
+        if (uprv_strlen(buffer) >= outFilePathCapacity) {
+            fprintf(stderr, "genccode: filename too long\n");
+            exit(U_ILLEGAL_ARGUMENT_ERROR);
+        }
+        uprv_strcpy(outFilePath, buffer);
+#if defined (WINDOWS_WITH_GNUC) && U_PLATFORM != U_PF_CYGWIN
+        /* Need to fix the file separator character when using MinGW. */
+        swapFileSepChar(outFilePath, U_FILE_SEP_CHAR, '/');
+#endif
+    }
+
+    out=T_FileStream_open(buffer, "w");
+    if(out==nullptr) {
+        fprintf(stderr, "genccode: unable to open output file %s\n", buffer);
+        exit(U_FILE_ACCESS_ERROR);
+    }
+
+    if(optEntryPoint != nullptr) {
+        /* sizeof("_dat") counts the suffix and the terminating NUL */
+        if (uprv_strlen(optEntryPoint) + sizeof("_dat") > sizeof(entry)) {
+            fprintf(stderr, "genccode: entry name too long (long filename?)\n");
+            exit(U_ILLEGAL_ARGUMENT_ERROR);
+        }
+        uprv_strcpy(entry, optEntryPoint);
+        uprv_strcat(entry, "_dat");
+    }
+
+    /* turn dashes or dots in the entry name into underscores */
+    length=uprv_strlen(entry);
+    for(i=0; i<length; ++i) {
+        if(entry[i]=='-' || entry[i]=='.') {
+            entry[i]='_';
+        }
+    }
+
+#if U_PLATFORM == U_PF_OS400
+    /*
+    TODO: Fix this once the compiler implements this feature. Keep in sync with udatamem.c
+
+    This is here because this platform can't currently put
+    const data into the read-only pages of an object or
+    shared library (service program). Only strings are allowed in read-only
+    pages, so we use char * strings to store the data.
+
+    In order to prevent the beginning of the data from ever matching the
+    magic numbers we must still use the initial double.
+    [grhoten 4/24/2003]
+    */
+    count = snprintf(buffer, sizeof(buffer),
+        "#ifndef IN_GENERATED_CCODE\n"
+        "#define IN_GENERATED_CCODE\n"
+        "#define U_DISABLE_RENAMING 1\n"
+        "#include \"unicode/umachine.h\"\n"
+        "#endif\n"
+        "U_CDECL_BEGIN\n"
+        "const struct {\n"
+        " double bogus;\n"
+        " const char *bytes; \n"
+        "} %s={ 0.0, \n",
+        entry);
+    if (count >= sizeof(buffer)) {
+        fprintf(stderr, "genccode: entry name too long (long filename?)\n");
+        exit(U_ILLEGAL_ARGUMENT_ERROR);
+    }
+    T_FileStream_writeLine(out, buffer);
+
+    for(;;) {
+        length=T_FileStream_read(in, buffer, sizeof(buffer));
+        if(length==0) {
+            break;
+        }
+        for(i=0; i<length; ++i) {
+            column = write8str(out, (uint8_t)buffer[i], column);
+        }
+    }
+
+    T_FileStream_writeLine(out, "\"\n};\nU_CDECL_END\n");
+#else
+    /* Function renaming shouldn't be done in data */
+    count = snprintf(buffer, sizeof(buffer),
+        "#ifndef IN_GENERATED_CCODE\n"
+        "#define IN_GENERATED_CCODE\n"
+        "#define U_DISABLE_RENAMING 1\n"
+        "#include \"unicode/umachine.h\"\n"
+        "#endif\n"
+        "U_CDECL_BEGIN\n"
+        "const struct {\n"
+        " double bogus;\n"
+        " uint8_t bytes[%ld]; \n"
+        "} %s={ 0.0, {\n",
+        (long)T_FileStream_size(in), entry);
+    if (count >= sizeof(buffer)) {
+        fprintf(stderr, "genccode: entry name too long (long filename?)\n");
+        exit(U_ILLEGAL_ARGUMENT_ERROR);
+    }
+    T_FileStream_writeLine(out, buffer);
+
+    for(;;) {
+        length=T_FileStream_read(in, buffer, sizeof(buffer));
+        if(length==0) {
+            break;
+        }
+        for(i=0; i<length; ++i) {
+            column = write8(out, (uint8_t)buffer[i], column);
+        }
+    }
+
+    T_FileStream_writeLine(out, "\n}\n};\nU_CDECL_END\n");
+#endif
+
+    if(T_FileStream_error(in)) {
+        fprintf(stderr, "genccode: file read error while generating from file %s\n", filename);
+        exit(U_FILE_ACCESS_ERROR);
+    }
+
+    if(T_FileStream_error(out)) {
+        fprintf(stderr, "genccode: file write error while generating from file %s\n", filename);
+        exit(U_FILE_ACCESS_ERROR);
+    }
+
+    T_FileStream_close(out);
+    T_FileStream_close(in);
+}
+
+/*
+ * Writes one 32-bit word as an assembler literal.
+ * A comma precedes every word except the first on a line; after 32 words a
+ * new line is started with the selected type's beginLine directive.
+ * Values below 10 are written as a single digit. Larger values are written
+ * in upper-case hex, most significant byte first, with leading zero bytes
+ * omitted, using the prefix/suffix selected by hexType.
+ * Returns the updated column count.
+ */
+static uint32_t
+write32(FileStream *out, uint32_t bitField, uint32_t column) {
+    static const char hexDigits[] = "0123456789ABCDEF";
+    char text[64]; /* far more room than one separator plus one literal needs */
+    char *cursor = text;
+
+    /* separator: nothing, a comma, or a line break plus the data directive */
+    if (column == MAX_COLUMN) {
+        /* very first word */
+        column = 1;
+    } else if (column >= 32) {
+        *cursor++ = '\n';
+        uprv_strcpy(cursor, assemblyHeader[assemblyHeaderIndex].beginLine);
+        cursor += uprv_strlen(cursor);
+        column = 1;
+    } else {
+        *cursor++ = ',';
+        ++column;
+    }
+
+    if (bitField < 10) {
+        /* A single digit reads the same in decimal and hex; skip the prefix. */
+        *cursor++ = hexDigits[bitField];
+    } else {
+        if (hexType == HEX_0X) {
+            *cursor++ = '0';
+            *cursor++ = 'x';
+        } else if (hexType == HEX_0H) {
+            *cursor++ = '0';
+        }
+
+        /* Walk the four bytes from most to least significant. */
+        int started = 0; /* becomes 1 once a non-zero byte has been written */
+        for (int32_t shift = 24; shift >= 0; shift -= 8) {
+            const uint8_t octet = (uint8_t)(bitField >> shift);
+            if (octet != 0 || started) {
+                *cursor++ = hexDigits[octet >> 4];
+                *cursor++ = hexDigits[octet & 0xF];
+                started = 1;
+            }
+        }
+
+        if (hexType == HEX_0H) {
+            *cursor++ = 'h';
+        }
+    }
+
+    *cursor = 0;
+    T_FileStream_writeLine(out, text);
+    return column;
+}
+
+/*
+ * Writes one byte as a decimal literal without leading zeros.
+ * A comma precedes every value except the first; after 16 values the comma
+ * is followed by a line break. Returns the updated column count.
+ */
+static uint32_t
+write8(FileStream *out, uint8_t byte, uint32_t column) {
+    /* Fill the digits from the end: up to three digits plus the NUL. */
+    char digits[4];
+    char *first = digits + 3;
+    *first = 0;
+    do {
+        *--first = (char)('0' + byte % 10);
+        byte /= 10;
+    } while (byte != 0);
+
+    /* separator before the value */
+    if (column == MAX_COLUMN) {
+        /* very first byte */
+        column = 1;
+    } else if (column >= 16) {
+        T_FileStream_writeLine(out, ",\n");
+        column = 1;
+    } else {
+        T_FileStream_writeLine(out, ",");
+        ++column;
+    }
+
+    T_FileStream_writeLine(out, first);
+    return column;
+}
+
+#if U_PLATFORM == U_PF_OS400
+/*
+ * Writes one byte as an escape sequence inside a C string literal.
+ * The first byte opens the literal with a double quote; after 24 bytes the
+ * literal is closed and a new one is opened on the next line.
+ * Returns the updated column count.
+ */
+static uint32_t
+write8str(FileStream *out, uint8_t byte, uint32_t column) {
+    char escape[8];
+
+    /* values up to 7 use the short "\N" form, everything else "\xNN" */
+    if (byte <= 7) {
+        snprintf(escape, sizeof(escape), "\\%X", byte);
+    } else {
+        snprintf(escape, sizeof(escape), "\\x%X", byte);
+    }
+
+    if (column == MAX_COLUMN) {
+        /* very first byte: open the string literal */
+        T_FileStream_writeLine(out, "\"");
+        column = 1;
+    } else if (column >= 24) {
+        /* close this literal and continue in a new one on the next line */
+        T_FileStream_writeLine(out, "\"\n\"");
+        column = 1;
+    } else {
+        ++column;
+    }
+
+    T_FileStream_writeLine(out, escape);
+    return column;
+}
+#endif
+
+/*
+ * Derives the output file path and the entry-point name from inFilename.
+ *
+ * Output path: destdir (with a trailing file separator) or, if destdir is
+ * nullptr/empty, the directory part of inFilename; then optFilename if given,
+ * otherwise the transformed base name; then newSuffix.
+ * Entry name: the base name of inFilename; if it has a '.', dashes before the
+ * last '.' become underscores and that '.' itself becomes '_'.
+ *
+ * Exits the process if building the strings fails or a result does not fit
+ * into the caller's buffer (including its terminating NUL).
+ */
+static void
+getOutFilename(
+    const char *inFilename,
+    const char *destdir,
+    char *outFilename,
+    int32_t outFilenameCapacity,
+    char *entryName,
+    int32_t entryNameCapacity,
+    const char *newSuffix,
+    const char *optFilename) {
+    const char *leaf = findBasename(inFilename);
+    const char *lastDot = uprv_strrchr(leaf, '.');
+
+    icu::CharString outPath;
+    icu::CharString entry;
+    icu::ErrorCode status;
+
+    /* directory part */
+    if (destdir == nullptr || *destdir == 0) {
+        outPath.append(inFilename, static_cast<int32_t>(leaf - inFilename), status);
+    } else {
+        outPath.append(destdir, status);
+        outPath.ensureEndsWithFileSeparator(status);
+    }
+
+    const char *cursor = leaf;
+    if (lastDot != nullptr) {
+        const int32_t dirLength = outPath.length();
+
+        /* copy the name up to the last '.' */
+        for (; cursor < lastDot; ++cursor) {
+            // iSeries cannot have '-' in the .o objects.
+            const char c = (*cursor == '-') ? '_' : *cursor;
+            outPath.append(c, status);
+            entry.append(c, status);
+        }
+
+        /* the '.' itself becomes '_' */
+        outPath.append('_', status);
+        entry.append('_', status);
+        ++cursor;
+
+        /* copy what followed the '.' */
+        outPath.append(cursor, status);
+        entry.append(cursor, status);
+
+        if (optFilename != nullptr) {
+            /* drop the derived file name again and use the requested one */
+            outPath.truncate(dirLength);
+            outPath.append(optFilename, status);
+        }
+    } else {
+        /* no '.' in the name: it is used unchanged */
+        entry.append(cursor, status);
+        outPath.append(optFilename != nullptr ? optFilename : cursor, status);
+    }
+    /* new file extension, e.g. ".c" */
+    outPath.append(newSuffix, status);
+
+    if (status.isFailure()) {
+        fprintf(stderr, "genccode: error building filename or entrypoint\n");
+        exit(status.get());
+    }
+
+    if (outPath.length() >= outFilenameCapacity) {
+        fprintf(stderr, "genccode: output filename too long\n");
+        exit(U_ILLEGAL_ARGUMENT_ERROR);
+    }
+
+    if (entry.length() >= entryNameCapacity) {
+        fprintf(stderr, "genccode: entry name too long (long filename?)\n");
+        exit(U_ILLEGAL_ARGUMENT_ERROR);
+    }
+
+    outPath.extract(outFilename, outFilenameCapacity, status);
+    entry.extract(entryName, entryNameCapacity, status);
+}
+
+#ifdef CAN_GENERATE_OBJECTS
+/*
+ * Determines the target architecture for generated object files.
+ *
+ * @param pCPU          receives the machine/CPU identifier
+ * @param pBits         receives 32 or 64
+ * @param pIsBigEndian  receives true for big-endian output, false otherwise
+ * @param optMatchArch  path of an existing object file whose header is read
+ *                      to determine the target; if nullptr, build-time
+ *                      defaults are used
+ *
+ * An unreadable, too short or unsupported match-arch file prints a message
+ * to stderr and exits the process.
+ */
+static void
+getArchitecture(uint16_t *pCPU, uint16_t *pBits, UBool *pIsBigEndian, const char *optMatchArch) {
+    union {
+        char bytes[2048];
+#ifdef U_ELF
+        Elf32_Ehdr header32;
+        /* Elf32_Ehdr and ELF64_Ehdr are identical for the necessary fields. */
+#elif U_PLATFORM_HAS_WIN32_API
+        IMAGE_FILE_HEADER header;
+#endif
+    } buffer;
+
+    const char *filename;
+    FileStream *in;
+    int32_t length;
+
+#ifdef U_ELF
+
+#elif U_PLATFORM_HAS_WIN32_API
+    const IMAGE_FILE_HEADER *pHeader;
+#else
+#   error "Unknown platform for CAN_GENERATE_OBJECTS."
+#endif
+
+    if(optMatchArch != nullptr) {
+        filename=optMatchArch;
+    } else {
+        /* set defaults */
+#ifdef U_ELF
+        /* set EM_386 because elf.h does not provide better defaults */
+        *pCPU=EM_386;
+        *pBits=32;
+        /*
+         * Store a real boolean. ELFDATA2MSB and ELFDATA2LSB are both
+         * non-zero, so storing either one would always read back as true.
+         */
+        *pIsBigEndian=(UBool)(U_IS_BIG_ENDIAN ? true : false);
+#elif U_PLATFORM_HAS_WIN32_API
+        // Windows always runs in little-endian mode.
+        *pIsBigEndian = false;
+
+        // Note: The various _M_<arch> macros are predefined by the MSVC compiler based
+        // on the target compilation architecture.
+        // https://docs.microsoft.com/cpp/preprocessor/predefined-macros
+
+        // link.exe will link an IMAGE_FILE_MACHINE_UNKNOWN data-only .obj file
+        // no matter what architecture it is targeting (though other values are
+        // required to match). Unfortunately, the variable name decoration/mangling
+        // is slightly different on x86, which means we can't use the UNKNOWN type
+        // for all architectures though.
+#   if defined(_M_IX86)
+        *pCPU = IMAGE_FILE_MACHINE_I386;
+#   else
+        *pCPU = IMAGE_FILE_MACHINE_UNKNOWN;
+#   endif
+#   if defined(_M_IA64) || defined(_M_AMD64) || defined (_M_ARM64)
+        *pBits = 64; // Doesn't seem to be used for anything interesting though?
+#   elif defined(_M_IX86) || defined(_M_ARM)
+        *pBits = 32;
+#   else
+#      error "Unknown platform for CAN_GENERATE_OBJECTS."
+#   endif
+#else
+#   error "Unknown platform for CAN_GENERATE_OBJECTS."
+#endif
+        return;
+    }
+
+    in=T_FileStream_open(filename, "rb");
+    if(in==nullptr) {
+        fprintf(stderr, "genccode: unable to open match-arch file %s\n", filename);
+        exit(U_FILE_ACCESS_ERROR);
+    }
+    length=T_FileStream_read(in, buffer.bytes, sizeof(buffer.bytes));
+
+#ifdef U_ELF
+    if(length<(int32_t)sizeof(Elf32_Ehdr)) {
+        fprintf(stderr, "genccode: match-arch file %s is too short\n", filename);
+        exit(U_UNSUPPORTED_ERROR);
+    }
+    if(
+        buffer.header32.e_ident[0]!=ELFMAG0 ||
+        buffer.header32.e_ident[1]!=ELFMAG1 ||
+        buffer.header32.e_ident[2]!=ELFMAG2 ||
+        buffer.header32.e_ident[3]!=ELFMAG3 ||
+        buffer.header32.e_ident[EI_CLASS]<ELFCLASS32 || buffer.header32.e_ident[EI_CLASS]>ELFCLASS64
+    ) {
+        fprintf(stderr, "genccode: match-arch file %s is not an ELF object file, or not supported\n", filename);
+        exit(U_UNSUPPORTED_ERROR);
+    }
+
+    *pBits= buffer.header32.e_ident[EI_CLASS]==ELFCLASS32 ? 32 : 64; /* only 32 or 64: see check above */
+#ifdef U_ELF64
+    if(*pBits!=32 && *pBits!=64) {
+        fprintf(stderr, "genccode: currently only supports 32-bit and 64-bit ELF format\n");
+        exit(U_UNSUPPORTED_ERROR);
+    }
+#else
+    if(*pBits!=32) {
+        fprintf(stderr, "genccode: built with elf.h missing 64-bit definitions\n");
+        exit(U_UNSUPPORTED_ERROR);
+    }
+#endif
+
+    *pIsBigEndian=(UBool)(buffer.header32.e_ident[EI_DATA]==ELFDATA2MSB);
+    if(*pIsBigEndian!=U_IS_BIG_ENDIAN) {
+        fprintf(stderr, "genccode: currently only same-endianness ELF formats are supported\n");
+        exit(U_UNSUPPORTED_ERROR);
+    }
+    /* TODO: Support byte swapping */
+
+    *pCPU=buffer.header32.e_machine;
+#elif U_PLATFORM_HAS_WIN32_API
+    /* signed comparison, consistent with the ELF branch above */
+    if(length<(int32_t)sizeof(IMAGE_FILE_HEADER)) {
+        fprintf(stderr, "genccode: match-arch file %s is too short\n", filename);
+        exit(U_UNSUPPORTED_ERROR);
+    }
+    /* TODO: Use buffer.header. Keep aliasing legal. */
+    pHeader=(const IMAGE_FILE_HEADER *)buffer.bytes;
+    *pCPU=pHeader->Machine;
+    /*
+     * The number of bits is implicit with the Machine value.
+     * *pBits is ignored in the calling code, so this need not be precise.
+     */
+    *pBits= *pCPU==IMAGE_FILE_MACHINE_I386 ? 32 : 64;
+    /* Windows always runs on little-endian CPUs. */
+    *pIsBigEndian=false;
+#else
+#   error "Unknown platform for CAN_GENERATE_OBJECTS."
+#endif
+
+    T_FileStream_close(in);
+}
+
+U_CAPI void U_EXPORT2
+writeObjectCode(
+ const char *filename,
+ const char *destdir,
+ const char *optEntryPoint,
+ const char *optMatchArch,
+ const char *optFilename,
+ char *outFilePath,
+ size_t outFilePathCapacity,
+ UBool optWinDllExport) {
+ /* common variables */
+ char buffer[4096], entry[96]={ 0 };
+ FileStream *in, *out;
+ const char *newSuffix;
+ int32_t i, entryLength, length, size, entryOffset=0, entryLengthOffset=0;
+
+ uint16_t cpu, bits;
+ UBool makeBigEndian;
+
+ (void)optWinDllExport; /* unused except Windows */
+
+ /* platform-specific variables and initialization code */
+#ifdef U_ELF
+ /* 32-bit Elf file header */
+ static Elf32_Ehdr header32={
+ {
+ /* e_ident[] */
+ ELFMAG0, ELFMAG1, ELFMAG2, ELFMAG3,
+ ELFCLASS32,
+ U_IS_BIG_ENDIAN ? ELFDATA2MSB : ELFDATA2LSB,
+ EV_CURRENT /* EI_VERSION */
+ },
+ ET_REL,
+ EM_386,
+ EV_CURRENT, /* e_version */
+ 0, /* e_entry */
+ 0, /* e_phoff */
+ (Elf32_Off)sizeof(Elf32_Ehdr), /* e_shoff */
+ 0, /* e_flags */
+ (Elf32_Half)sizeof(Elf32_Ehdr), /* eh_size */
+ 0, /* e_phentsize */
+ 0, /* e_phnum */
+ (Elf32_Half)sizeof(Elf32_Shdr), /* e_shentsize */
+ 5, /* e_shnum */
+ 2 /* e_shstrndx */
+ };
+
+ /* 32-bit Elf section header table */
+ static Elf32_Shdr sectionHeaders32[5]={
+ { /* SHN_UNDEF */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ },
+ { /* .symtab */
+ 1, /* sh_name */
+ SHT_SYMTAB,
+ 0, /* sh_flags */
+ 0, /* sh_addr */
+ (Elf32_Off)(sizeof(header32)+sizeof(sectionHeaders32)), /* sh_offset */
+ (Elf32_Word)(2*sizeof(Elf32_Sym)), /* sh_size */
+ 3, /* sh_link=sect hdr index of .strtab */
+ 1, /* sh_info=One greater than the symbol table index of the last
+ * local symbol (with STB_LOCAL). */
+ 4, /* sh_addralign */
+ (Elf32_Word)(sizeof(Elf32_Sym)) /* sh_entsize */
+ },
+ { /* .shstrtab */
+ 9, /* sh_name */
+ SHT_STRTAB,
+ 0, /* sh_flags */
+ 0, /* sh_addr */
+ (Elf32_Off)(sizeof(header32)+sizeof(sectionHeaders32)+2*sizeof(Elf32_Sym)), /* sh_offset */
+ 40, /* sh_size */
+ 0, /* sh_link */
+ 0, /* sh_info */
+ 1, /* sh_addralign */
+ 0 /* sh_entsize */
+ },
+ { /* .strtab */
+ 19, /* sh_name */
+ SHT_STRTAB,
+ 0, /* sh_flags */
+ 0, /* sh_addr */
+ (Elf32_Off)(sizeof(header32)+sizeof(sectionHeaders32)+2*sizeof(Elf32_Sym)+40), /* sh_offset */
+ (Elf32_Word)sizeof(entry), /* sh_size */
+ 0, /* sh_link */
+ 0, /* sh_info */
+ 1, /* sh_addralign */
+ 0 /* sh_entsize */
+ },
+ { /* .rodata */
+ 27, /* sh_name */
+ SHT_PROGBITS,
+ SHF_ALLOC, /* sh_flags */
+ 0, /* sh_addr */
+ (Elf32_Off)(sizeof(header32)+sizeof(sectionHeaders32)+2*sizeof(Elf32_Sym)+40+sizeof(entry)), /* sh_offset */
+ 0, /* sh_size */
+ 0, /* sh_link */
+ 0, /* sh_info */
+ 16, /* sh_addralign */
+ 0 /* sh_entsize */
+ }
+ };
+
+ /* symbol table */
+ static Elf32_Sym symbols32[2]={
+ { /* STN_UNDEF */
+ 0, 0, 0, 0, 0, 0
+ },
+ { /* data entry point */
+ 1, /* st_name */
+ 0, /* st_value */
+ 0, /* st_size */
+ ELF64_ST_INFO(STB_GLOBAL, STT_OBJECT),
+ 0, /* st_other */
+ 4 /* st_shndx=index of related section table entry */
+ }
+ };
+
+ /* section header string table, with decimal string offsets */
+ static const char sectionStrings[40]=
+ /* 0 */ "\0"
+ /* 1 */ ".symtab\0"
+ /* 9 */ ".shstrtab\0"
+ /* 19 */ ".strtab\0"
+ /* 27 */ ".rodata\0"
+ /* 35 */ "\0\0\0\0"; /* contains terminating NUL */
+ /* 40: padded to multiple of 8 bytes */
+
+ /*
+ * Use entry[] for the string table which will contain only the
+ * entry point name.
+ * entry[0] must be 0 (NUL)
+ * The entry point name can be up to 38 characters long (sizeof(entry)-2).
+ */
+
+ /* 16-align .rodata in the .o file, just in case */
+ static const char padding[16]={ 0 };
+ int32_t paddingSize;
+
+#ifdef U_ELF64
+ /* 64-bit Elf file header */
+ static Elf64_Ehdr header64={
+ {
+ /* e_ident[] */
+ ELFMAG0, ELFMAG1, ELFMAG2, ELFMAG3,
+ ELFCLASS64,
+ U_IS_BIG_ENDIAN ? ELFDATA2MSB : ELFDATA2LSB,
+ EV_CURRENT /* EI_VERSION */
+ },
+ ET_REL,
+ EM_X86_64,
+ EV_CURRENT, /* e_version */
+ 0, /* e_entry */
+ 0, /* e_phoff */
+ (Elf64_Off)sizeof(Elf64_Ehdr), /* e_shoff */
+ 0, /* e_flags */
+ (Elf64_Half)sizeof(Elf64_Ehdr), /* eh_size */
+ 0, /* e_phentsize */
+ 0, /* e_phnum */
+ (Elf64_Half)sizeof(Elf64_Shdr), /* e_shentsize */
+ 5, /* e_shnum */
+ 2 /* e_shstrndx */
+ };
+
+ /* 64-bit Elf section header table */
+ static Elf64_Shdr sectionHeaders64[5]={
+ { /* SHN_UNDEF */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ },
+ { /* .symtab */
+ 1, /* sh_name */
+ SHT_SYMTAB,
+ 0, /* sh_flags */
+ 0, /* sh_addr */
+ (Elf64_Off)(sizeof(header64)+sizeof(sectionHeaders64)), /* sh_offset */
+ (Elf64_Xword)(2*sizeof(Elf64_Sym)), /* sh_size */
+ 3, /* sh_link=sect hdr index of .strtab */
+ 1, /* sh_info=One greater than the symbol table index of the last
+ * local symbol (with STB_LOCAL). */
+ 4, /* sh_addralign */
+ (Elf64_Xword)(sizeof(Elf64_Sym)) /* sh_entsize */
+ },
+ { /* .shstrtab */
+ 9, /* sh_name */
+ SHT_STRTAB,
+ 0, /* sh_flags */
+ 0, /* sh_addr */
+ (Elf64_Off)(sizeof(header64)+sizeof(sectionHeaders64)+2*sizeof(Elf64_Sym)), /* sh_offset */
+ 40, /* sh_size */
+ 0, /* sh_link */
+ 0, /* sh_info */
+ 1, /* sh_addralign */
+ 0 /* sh_entsize */
+ },
+ { /* .strtab */
+ 19, /* sh_name */
+ SHT_STRTAB,
+ 0, /* sh_flags */
+ 0, /* sh_addr */
+ (Elf64_Off)(sizeof(header64)+sizeof(sectionHeaders64)+2*sizeof(Elf64_Sym)+40), /* sh_offset */
+ (Elf64_Xword)sizeof(entry), /* sh_size */
+ 0, /* sh_link */
+ 0, /* sh_info */
+ 1, /* sh_addralign */
+ 0 /* sh_entsize */
+ },
+ { /* .rodata */
+ 27, /* sh_name */
+ SHT_PROGBITS,
+ SHF_ALLOC, /* sh_flags */
+ 0, /* sh_addr */
+ (Elf64_Off)(sizeof(header64)+sizeof(sectionHeaders64)+2*sizeof(Elf64_Sym)+40+sizeof(entry)), /* sh_offset */
+ 0, /* sh_size */
+ 0, /* sh_link */
+ 0, /* sh_info */
+ 16, /* sh_addralign */
+ 0 /* sh_entsize */
+ }
+ };
+
+ /*
+ * 64-bit symbol table
+ * careful: different order of items compared with Elf32_sym!
+ */
+ static Elf64_Sym symbols64[2]={
+ { /* STN_UNDEF */
+ 0, 0, 0, 0, 0, 0
+ },
+ { /* data entry point */
+ 1, /* st_name */
+ ELF64_ST_INFO(STB_GLOBAL, STT_OBJECT),
+ 0, /* st_other */
+ 4, /* st_shndx=index of related section table entry */
+ 0, /* st_value */
+ 0 /* st_size */
+ }
+ };
+
+#endif /* U_ELF64 */
+
+ /* entry[] have a leading NUL */
+ entryOffset=1;
+
+ /* in the common code, count entryLength from after the NUL */
+ entryLengthOffset=1;
+
+ newSuffix=".o";
+
+#elif U_PLATFORM_HAS_WIN32_API
+ struct {
+ IMAGE_FILE_HEADER fileHeader;
+ IMAGE_SECTION_HEADER sections[2];
+ char linkerOptions[100];
+ } objHeader;
+ IMAGE_SYMBOL symbols[1];
+ struct {
+ DWORD sizeofLongNames;
+ char longNames[100];
+ } symbolNames;
+
+ /*
+ * entry sometimes have a leading '_'
+ * overwritten if entryOffset==0 depending on the target platform
+ * see check for cpu below
+ */
+ entry[0]='_';
+
+ newSuffix=".obj";
+#else
+# error "Unknown platform for CAN_GENERATE_OBJECTS."
+#endif
+
+ /* deal with options, files and the entry point name */
+ getArchitecture(&cpu, &bits, &makeBigEndian, optMatchArch);
+ if (optMatchArch)
+ {
+ printf("genccode: --match-arch cpu=%hu bits=%hu big-endian=%d\n", cpu, bits, makeBigEndian);
+ }
+ else
+ {
+ printf("genccode: using architecture cpu=%hu bits=%hu big-endian=%d\n", cpu, bits, makeBigEndian);
+ }
+#if U_PLATFORM_HAS_WIN32_API
+ if(cpu==IMAGE_FILE_MACHINE_I386) {
+ entryOffset=1;
+ }
+#endif
+
+ in=T_FileStream_open(filename, "rb");
+ if(in==nullptr) {
+ fprintf(stderr, "genccode: unable to open input file %s\n", filename);
+ exit(U_FILE_ACCESS_ERROR);
+ }
+ size=T_FileStream_size(in);
+
+ getOutFilename(
+ filename,
+ destdir,
+ buffer,
+ sizeof(buffer),
+ entry + entryOffset,
+ sizeof(entry) - entryOffset,
+ newSuffix,
+ optFilename);
+
+ if (outFilePath != nullptr) {
+ if (uprv_strlen(buffer) >= outFilePathCapacity) {
+ fprintf(stderr, "genccode: filename too long\n");
+ exit(U_ILLEGAL_ARGUMENT_ERROR);
+ }
+ uprv_strcpy(outFilePath, buffer);
+ }
+
+ if(optEntryPoint != nullptr) {
+ uprv_strcpy(entry+entryOffset, optEntryPoint);
+ uprv_strcat(entry+entryOffset, "_dat");
+ }
+ /* turn dashes in the entry name into underscores */
+ entryLength=(int32_t)uprv_strlen(entry+entryLengthOffset);
+ for(i=0; i<entryLength; ++i) {
+ if(entry[entryLengthOffset+i]=='-') {
+ entry[entryLengthOffset+i]='_';
+ }
+ }
+
+ /* open the output file */
+ out=T_FileStream_open(buffer, "wb");
+ if(out==nullptr) {
+ fprintf(stderr, "genccode: unable to open output file %s\n", buffer);
+ exit(U_FILE_ACCESS_ERROR);
+ }
+
+#ifdef U_ELF
+ if(bits==32) {
+ header32.e_ident[EI_DATA]= makeBigEndian ? ELFDATA2MSB : ELFDATA2LSB;
+ header32.e_machine=cpu;
+
+ /* 16-align .rodata in the .o file, just in case */
+ paddingSize=sectionHeaders32[4].sh_offset & 0xf;
+ if(paddingSize!=0) {
+ paddingSize=0x10-paddingSize;
+ sectionHeaders32[4].sh_offset+=paddingSize;
+ }
+
+ sectionHeaders32[4].sh_size=(Elf32_Word)size;
+
+ symbols32[1].st_size=(Elf32_Word)size;
+
+ /* write .o headers */
+ T_FileStream_write(out, &header32, (int32_t)sizeof(header32));
+ T_FileStream_write(out, sectionHeaders32, (int32_t)sizeof(sectionHeaders32));
+ T_FileStream_write(out, symbols32, (int32_t)sizeof(symbols32));
+ } else /* bits==64 */ {
+#ifdef U_ELF64
+ header64.e_ident[EI_DATA]= makeBigEndian ? ELFDATA2MSB : ELFDATA2LSB;
+ header64.e_machine=cpu;
+
+ /* 16-align .rodata in the .o file, just in case */
+ paddingSize=sectionHeaders64[4].sh_offset & 0xf;
+ if(paddingSize!=0) {
+ paddingSize=0x10-paddingSize;
+ sectionHeaders64[4].sh_offset+=paddingSize;
+ }
+
+ sectionHeaders64[4].sh_size=(Elf64_Xword)size;
+
+ symbols64[1].st_size=(Elf64_Xword)size;
+
+ /* write .o headers */
+ T_FileStream_write(out, &header64, (int32_t)sizeof(header64));
+ T_FileStream_write(out, sectionHeaders64, (int32_t)sizeof(sectionHeaders64));
+ T_FileStream_write(out, symbols64, (int32_t)sizeof(symbols64));
+#endif
+ }
+
+ T_FileStream_write(out, sectionStrings, (int32_t)sizeof(sectionStrings));
+ T_FileStream_write(out, entry, (int32_t)sizeof(entry));
+ if(paddingSize!=0) {
+ T_FileStream_write(out, padding, paddingSize);
+ }
+#elif U_PLATFORM_HAS_WIN32_API
+ /* populate the .obj headers */
+ uprv_memset(&objHeader, 0, sizeof(objHeader));
+ uprv_memset(&symbols, 0, sizeof(symbols));
+ uprv_memset(&symbolNames, 0, sizeof(symbolNames));
+
+ /* write the linker export directive */
+ if (optWinDllExport) {
+ uprv_strcpy(objHeader.linkerOptions, "-export:");
+ length=8;
+ uprv_strcpy(objHeader.linkerOptions+length, entry);
+ length+=entryLength;
+ uprv_strcpy(objHeader.linkerOptions+length, ",data ");
+ length+=6;
+ }
+ else {
+ length=0;
+ }
+
+ /* set the file header */
+ objHeader.fileHeader.Machine=cpu;
+ objHeader.fileHeader.NumberOfSections=2;
+ objHeader.fileHeader.TimeDateStamp=(DWORD)time(nullptr);
+ objHeader.fileHeader.PointerToSymbolTable=IMAGE_SIZEOF_FILE_HEADER+2*IMAGE_SIZEOF_SECTION_HEADER+length+size; /* start of symbol table */
+ objHeader.fileHeader.NumberOfSymbols=1;
+
+ /* set the section for the linker options */
+ uprv_strncpy((char *)objHeader.sections[0].Name, ".drectve", 8);
+ objHeader.sections[0].SizeOfRawData=length;
+ objHeader.sections[0].PointerToRawData=IMAGE_SIZEOF_FILE_HEADER+2*IMAGE_SIZEOF_SECTION_HEADER;
+ objHeader.sections[0].Characteristics=IMAGE_SCN_LNK_INFO|IMAGE_SCN_LNK_REMOVE|IMAGE_SCN_ALIGN_1BYTES;
+
+ /* set the data section */
+ uprv_strncpy((char *)objHeader.sections[1].Name, ".rdata", 6);
+ objHeader.sections[1].SizeOfRawData=size;
+ objHeader.sections[1].PointerToRawData=IMAGE_SIZEOF_FILE_HEADER+2*IMAGE_SIZEOF_SECTION_HEADER+length;
+ objHeader.sections[1].Characteristics=IMAGE_SCN_CNT_INITIALIZED_DATA|IMAGE_SCN_ALIGN_16BYTES|IMAGE_SCN_MEM_READ;
+
+ /* set the symbol table */
+ if(entryLength<=8) {
+ uprv_strncpy((char *)symbols[0].N.ShortName, entry, entryLength);
+ symbolNames.sizeofLongNames=4;
+ } else {
+ symbols[0].N.Name.Short=0;
+ symbols[0].N.Name.Long=4;
+ symbolNames.sizeofLongNames=4+entryLength+1;
+ uprv_strcpy(symbolNames.longNames, entry);
+ }
+ symbols[0].SectionNumber=2;
+ symbols[0].StorageClass=IMAGE_SYM_CLASS_EXTERNAL;
+
+ /* write the file header and the linker options section */
+ T_FileStream_write(out, &objHeader, objHeader.sections[1].PointerToRawData);
+#else
+# error "Unknown platform for CAN_GENERATE_OBJECTS."
+#endif
+
+ /* copy the data file into section 2 */
+ for(;;) {
+ length=T_FileStream_read(in, buffer, sizeof(buffer));
+ if(length==0) {
+ break;
+ }
+ T_FileStream_write(out, buffer, (int32_t)length);
+ }
+
+#if U_PLATFORM_HAS_WIN32_API
+ /* write the symbol table */
+ T_FileStream_write(out, symbols, IMAGE_SIZEOF_SYMBOL);
+ T_FileStream_write(out, &symbolNames, symbolNames.sizeofLongNames);
+#endif
+
+ if(T_FileStream_error(in)) {
+ fprintf(stderr, "genccode: file read error while generating from file %s\n", filename);
+ exit(U_FILE_ACCESS_ERROR);
+ }
+
+ if(T_FileStream_error(out)) {
+ fprintf(stderr, "genccode: file write error while generating from file %s\n", filename);
+ exit(U_FILE_ACCESS_ERROR);
+ }
+
+ T_FileStream_close(out);
+ T_FileStream_close(in);
+}
+#endif
diff --git a/intl/icu/source/tools/toolutil/pkg_genc.h b/intl/icu/source/tools/toolutil/pkg_genc.h
new file mode 100644
index 0000000000..2dd1b45cde
--- /dev/null
+++ b/intl/icu/source/tools/toolutil/pkg_genc.h
@@ -0,0 +1,107 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/******************************************************************************
+ * Copyright (C) 2008-2011, International Business Machines
+ * Corporation and others. All Rights Reserved.
+ *******************************************************************************
+ */
+
+#ifndef __PKG_GENC_H__
+#define __PKG_GENC_H__
+
+#include "unicode/utypes.h"
+#include "toolutil.h"
+
+#include "unicode/putil.h"
+#include "putilimp.h"
+
+/*** Platform #defines move here ***/
+#if U_PLATFORM_HAS_WIN32_API
+#ifdef __GNUC__
+#define WINDOWS_WITH_GNUC
+#else
+#define WINDOWS_WITH_MSVC
+#endif
+#endif
+
+
+#if !defined(WINDOWS_WITH_MSVC)
+#define BUILD_DATA_WITHOUT_ASSEMBLY
+#endif
+
+#ifndef U_DISABLE_OBJ_CODE /* testing */
+#if defined(WINDOWS_WITH_MSVC) || U_PLATFORM_IS_LINUX_BASED
+#define CAN_WRITE_OBJ_CODE
+#endif
+#if U_PLATFORM_HAS_WIN32_API || defined(U_ELF)
+#define CAN_GENERATE_OBJECTS
+#endif
+#endif
+
+#if U_PLATFORM == U_PF_CYGWIN || defined(CYGWINMSVC)
+#define USING_CYGWIN
+#endif
+
+/*
+ * When building the data library without assembly,
+ * some platforms use a single c code file for all of
+ * the data to generate the final data library. This can
+ * increase the performance of the pkdata tool.
+ */
+#if U_PLATFORM == U_PF_OS400
+#define USE_SINGLE_CCODE_FILE
+#endif
+
+/* Need to fix the file separator character when using MinGW. */
+#if defined(WINDOWS_WITH_GNUC) || defined(USING_CYGWIN)
+#define PKGDATA_FILE_SEP_STRING "/"
+#else
+#define PKGDATA_FILE_SEP_STRING U_FILE_SEP_STRING
+#endif
+
+#define LARGE_BUFFER_MAX_SIZE 2048
+#define SMALL_BUFFER_MAX_SIZE 512
+#define SMALL_BUFFER_FLAG_NAMES 32
+#define BUFFER_PADDING_SIZE 20
+
+/** End platform defines **/
+
+
+
+U_CAPI void U_EXPORT2
+printAssemblyHeadersToStdErr(void);
+
+U_CAPI UBool U_EXPORT2
+checkAssemblyHeaderName(const char* optAssembly);
+
+U_CAPI void U_EXPORT2
+writeCCode(
+ const char *filename,
+ const char *destdir,
+ const char *optEntryPoint,
+ const char *optName,
+ const char *optFilename,
+ char *outFilePath,
+ size_t outFilePathCapacity);
+
+U_CAPI void U_EXPORT2
+writeAssemblyCode(
+ const char *filename,
+ const char *destdir,
+ const char *optEntryPoint,
+ const char *optFilename,
+ char *outFilePath,
+ size_t outFilePathCapacity);
+/**
+ * Copy the bytes of a data file into a linkable object file (suffix ".o" for ELF,
+ * ".obj" with the Win32 API) that exports them under a single data symbol.
+ * NOTE(review): the definition appears to be compiled only when CAN_GENERATE_OBJECTS
+ * is defined - confirm against pkg_genc.cpp.
+ *
+ * Errors (input/output file cannot be opened, read/write error, output path too long)
+ * are reported on stderr and end the process with exit().
+ *
+ * @param filename            data file to read (opened in binary mode)
+ * @param destdir             passed to getOutFilename() to build the output path
+ * @param optEntryPoint       if not nullptr, the symbol name becomes optEntryPoint + "_dat";
+ *                            dashes in the symbol name are turned into underscores
+ * @param optMatchArch        passed to getArchitecture() to select cpu, bits and endianness
+ * @param optFilename         passed to getOutFilename() to build the output path
+ * @param outFilePath         if not nullptr, receives the path of the generated file
+ * @param outFilePathCapacity capacity of outFilePath in bytes, including the terminating NUL
+ * @param optWinDllExport     with the Win32 API, adds an "-export:<symbol>,data" linker directive
+ */
+U_CAPI void U_EXPORT2
+writeObjectCode(
+ const char *filename,
+ const char *destdir,
+ const char *optEntryPoint,
+ const char *optMatchArch,
+ const char *optFilename,
+ char *outFilePath,
+ size_t outFilePathCapacity,
+ UBool optWinDllExport);
+
+#endif
diff --git a/intl/icu/source/tools/toolutil/pkg_gencmn.cpp b/intl/icu/source/tools/toolutil/pkg_gencmn.cpp
new file mode 100644
index 0000000000..a301c322eb
--- /dev/null
+++ b/intl/icu/source/tools/toolutil/pkg_gencmn.cpp
@@ -0,0 +1,578 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/******************************************************************************
+ * Copyright (C) 2008-2012, International Business Machines
+ * Corporation and others. All Rights Reserved.
+ *******************************************************************************
+ */
+#include "unicode/utypes.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "unicode/utypes.h"
+#include "unicode/putil.h"
+#include "cmemory.h"
+#include "cstring.h"
+#include "filestrm.h"
+#include "toolutil.h"
+#include "unicode/uclean.h"
+#include "unewdata.h"
+#include "putilimp.h"
+#include "pkg_gencmn.h"
+
+#define STRING_STORE_SIZE 200000
+
+#define COMMON_DATA_NAME U_ICUDATA_NAME
+#define DATA_TYPE "dat"
+
+/* ICU package data file format (.dat files) ------------------------------- ***
+
+Description of the data format after the usual ICU data file header
+(UDataInfo etc.).
+
+Format version 1
+
+A .dat package file contains a simple Table of Contents of item names,
+followed by the items themselves:
+
+1. ToC table
+
+uint32_t count; - number of items
+UDataOffsetTOCEntry entry[count]; - pair of uint32_t values per item:
+ uint32_t nameOffset; - offset of the item name
+ uint32_t dataOffset; - offset of the item data
+both are byte offsets from the beginning of the data
+
+2. item name strings
+
+All item names are stored as char * strings in one block between the ToC table
+and the data items.
+
+3. data items
+
+The data items are stored following the item names block.
+Each data item is 16-aligned.
+The data items are stored in the sorted order of their names.
+
+Therefore, the top of the name strings block is the offset of the first item,
+the length of the last item is the difference between its offset and
+the .dat file length, and the length of all previous items is the difference
+between its offset and the next one.
+
+----------------------------------------------------------------------------- */
+
+/*
+ * UDataInfo cf. udata.h
+ * Header information handed to udata_create() when the binary package is written.
+ * The dataFormat bytes are the ASCII letters "CmnD"; formatVersion 1 is the
+ * "Format version 1" described in the comment above.
+ */
+static const UDataInfo dataInfo={
+ sizeof(UDataInfo),
+ 0,
+
+ U_IS_BIG_ENDIAN,
+ U_CHARSET_FAMILY,
+ sizeof(char16_t),
+ 0,
+
+ {0x43, 0x6d, 0x6e, 0x44}, /* dataFormat="CmnD" */
+ {1, 0, 0, 0}, /* formatVersion */
+ {3, 0, 0, 0} /* dataVersion */
+};
+
+static uint32_t maxSize; /* copied from max_size by createCommonDataFile(); 0 turns off the size check in addFile() */
+
+static char stringStore[STRING_STORE_SIZE]; /* fixed arena from which allocString() hands out pieces */
+static uint32_t stringTop=0, basenameTotal=0; /* stringTop: bytes of stringStore in use; basenameTotal: sum of the basenameLength values added by addFile() */
+
+typedef struct { /* one listed item */
+ char *pathname, *basename; /* pathname: path of the file to copy, or in sourceTOC mode the symbol name derived from basename; basename: item name for the table of contents */
+ uint32_t basenameLength, basenameOffset, fileSize, fileOffset; /* basenameLength counts the terminating NUL; both offsets are computed by createCommonDataFile() */
+} File;
+
+#define CHUNK_FILE_COUNT 256 /* addFile() grows files[] by this many entries at a time */
+static File *files = nullptr; /* growable array of the listed items */
+static uint32_t fileCount=0; /* entries of files[] in use */
+static uint32_t fileMax = 0; /* entries allocated for files[] */
+
+
+static char *symPrefix = nullptr; /* prefix for symbol names in the .c output when not nullptr; nothing in this file assigns it */
+
+#define LINE_BUFFER_SIZE 512 /* size of the buffer that holds one line of the item list */
+/* prototypes --------------------------------------------------------------- */
+
+static void
+addFile(const char *filename, const char *name, const char *source, UBool sourceTOC, UBool verbose);
+
+static char *
+allocString(uint32_t length);
+
+U_CDECL_BEGIN
+static int
+compareFiles(const void *file1, const void *file2);
+U_CDECL_END
+
+static char *
+pathToFullPath(const char *path, const char *source);
+
+/* map non-tree separator (such as '\') to tree separator ('/') inplace. */
+static void
+fixDirToTreePath(char *s);
+/* -------------------------------------------------------------------------- */
+
+/**
+ * Create an ICU common data package from a list of item files.
+ *
+ * Item names are read from dataFile (stdin if nullptr), registered with addFile(),
+ * sorted by basename and then written either as
+ *  - a binary package <name>.<type> via udata_create() (sourceTOC is false), or
+ *  - a C source file <destDir>/<name>_<type>.c that only contains a table of contents
+ *    referring to external symbols (sourceTOC is true).
+ * Failures are reported on stderr and terminate the process with exit(); an empty
+ * list is reported on stderr and the function returns without writing anything.
+ *
+ * @param destDir        output directory; nullptr selects u_getDataDirectory()
+ * @param name           package name; nullptr selects COMMON_DATA_NAME
+ * @param entrypointName prefix of the "<entrypointName>_dat" symbol defined in the .c output
+ * @param type           package type; nullptr selects DATA_TYPE
+ * @param source         directory that the listed items are relative to; nullptr selects "."
+ * @param copyRight      comment for the binary package; nullptr selects U_COPYRIGHT_STRING
+ * @param dataFile       file containing the item list; nullptr reads the list from stdin
+ * @param max_size       if non-zero, larger items are left out of the binary package
+ * @param sourceTOC      true: write the .c table of contents; false: write the binary package
+ * @param verbose        true: print progress messages to stdout
+ * @param gencmnFileName if not nullptr, receives the name of the generated .c file (only
+ *                       written when sourceTOC is true). No capacity is passed in, so the
+ *                       caller's buffer has to be able to hold up to 4096 bytes.
+ */
+U_CAPI void U_EXPORT2
+createCommonDataFile(const char *destDir, const char *name, const char *entrypointName, const char *type, const char *source, const char *copyRight,
+                     const char *dataFile, uint32_t max_size, UBool sourceTOC, UBool verbose, char *gencmnFileName) {
+    static char buffer[4096];
+    char *line;
+    char *linePtr;
+    char *s = nullptr;
+    UErrorCode errorCode=U_ZERO_ERROR;
+    uint32_t i, fileOffset, basenameOffset, length, nread;
+    FileStream *in, *file;
+
+    line = (char *)uprv_malloc(sizeof(char) * LINE_BUFFER_SIZE);
+    if (line == nullptr) {
+        fprintf(stderr, "gencmn: unable to allocate memory for line buffer of size %d\n", LINE_BUFFER_SIZE);
+        exit(U_MEMORY_ALLOCATION_ERROR);
+    }
+
+    /* line is moved along the buffer while parsing; linePtr keeps the start for reuse and free */
+    linePtr = line;
+
+    maxSize = max_size;
+
+    if (destDir == nullptr) {
+        destDir = u_getDataDirectory();
+    }
+    if (name == nullptr) {
+        name = COMMON_DATA_NAME;
+    }
+    if (type == nullptr) {
+        type = DATA_TYPE;
+    }
+    if (source == nullptr) {
+        source = ".";
+    }
+
+    if (dataFile == nullptr) {
+        in = T_FileStream_stdin();
+    } else {
+        in = T_FileStream_open(dataFile, "r");
+        if(in == nullptr) {
+            fprintf(stderr, "gencmn: unable to open input file %s\n", dataFile);
+            exit(U_FILE_ACCESS_ERROR);
+        }
+    }
+
+    if (verbose) {
+        if(sourceTOC) {
+            printf("generating %s_%s.c (table of contents source file)\n", name, type);
+        } else {
+            printf("generating %s.%s (common data file with table of contents)\n", name, type);
+        }
+    }
+
+    /*
+     * read the list of files and get their lengths
+     * Each iteration handles one space-separated item: while s still points at unparsed
+     * text of the current line that text is used, otherwise the next line is read.
+     */
+    while((s != nullptr && *s != 0) || (s=T_FileStream_readLine(in, (line=linePtr),
+                                                                 LINE_BUFFER_SIZE))!=nullptr) {
+        /* remove trailing newline characters and parse space separated items */
+        if (s != nullptr && *s != 0) {
+            line=s;
+        } else {
+            s=line;
+        }
+        while(*s!=0) {
+            if(*s==' ') {
+                *s=0;
+                ++s;
+                break;
+            } else if(*s=='\r' || *s=='\n') {
+                *s=0;
+                break;
+            }
+            ++s;
+        }
+
+        /*
+         * check for comment
+         * NOTE(review): only the item that starts with '#' is skipped; items that follow it
+         * on the same line are still processed by the next iterations.
+         */
+
+        if (*line == '#') {
+            continue;
+        }
+
+        /* add the file */
+#if (U_FILE_SEP_CHAR != U_FILE_ALT_SEP_CHAR)
+        {
+            char *t;
+            while((t = uprv_strchr(line,U_FILE_ALT_SEP_CHAR))) {
+                *t = U_FILE_SEP_CHAR;
+            }
+        }
+#endif
+        addFile(getLongPathname(line), name, source, sourceTOC, verbose);
+    }
+
+    uprv_free(linePtr);
+
+    if(in!=T_FileStream_stdin()) {
+        T_FileStream_close(in);
+    }
+
+    if(fileCount==0) {
+        fprintf(stderr, "gencmn: no files listed in %s\n", dataFile == nullptr ? "<stdin>" : dataFile);
+        return;
+    }
+
+    /* sort the files by basename */
+    qsort(files, fileCount, sizeof(File), compareFiles);
+
+    if(!sourceTOC) {
+        UNewDataMemory *out;
+
+        /* determine the offsets of all basenames and files in this common one */
+        basenameOffset=4+8*fileCount;
+        fileOffset=(basenameOffset+(basenameTotal+15))&~0xf;
+        for(i=0; i<fileCount; ++i) {
+            files[i].fileOffset=fileOffset;
+            fileOffset+=(files[i].fileSize+15)&~0xf;
+            files[i].basenameOffset=basenameOffset;
+            basenameOffset+=files[i].basenameLength;
+        }
+
+        /* create the output file */
+        out=udata_create(destDir, type, name,
+                         &dataInfo,
+                         copyRight == nullptr ? U_COPYRIGHT_STRING : copyRight,
+                         &errorCode);
+        if(U_FAILURE(errorCode)) {
+            fprintf(stderr, "gencmn: udata_create(-d %s -n %s -t %s) failed - %s\n",
+                    destDir, name, type,
+                    u_errorName(errorCode));
+            exit(errorCode);
+        }
+
+        /* write the table of contents */
+        udata_write32(out, fileCount);
+        for(i=0; i<fileCount; ++i) {
+            udata_write32(out, files[i].basenameOffset);
+            udata_write32(out, files[i].fileOffset);
+        }
+
+        /* write the basenames */
+        for(i=0; i<fileCount; ++i) {
+            udata_writeString(out, files[i].basename, files[i].basenameLength);
+        }
+        /* number of bytes written so far; only its low 4 bits matter for the padding below */
+        length=4+8*fileCount+basenameTotal;
+
+        /* copy the files */
+        for(i=0; i<fileCount; ++i) {
+            /* pad to 16-align the next file */
+            length&=0xf;
+            if(length!=0) {
+                udata_writePadding(out, 16-length);
+            }
+
+            if (verbose) {
+                printf("adding %s (%ld byte%s)\n", files[i].pathname, (long)files[i].fileSize, files[i].fileSize == 1 ? "" : "s");
+            }
+
+            /* copy the next file */
+            file=T_FileStream_open(files[i].pathname, "rb");
+            if(file==nullptr) {
+                fprintf(stderr, "gencmn: unable to open listed file %s\n", files[i].pathname);
+                exit(U_FILE_ACCESS_ERROR);
+            }
+            for(nread = 0;;) {
+                length=T_FileStream_read(file, buffer, sizeof(buffer));
+                if(length <= 0) {
+                    break;
+                }
+                nread += length;
+                udata_writeBlock(out, buffer, length);
+            }
+            T_FileStream_close(file);
+            length=files[i].fileSize;
+
+            /* the size was measured in addFile(); a different byte count means the file changed or a read failed */
+            if (nread != files[i].fileSize) {
+                fprintf(stderr, "gencmn: unable to read %s properly (got %ld/%ld byte%s)\n", files[i].pathname, (long)nread, (long)files[i].fileSize, files[i].fileSize == 1 ? "" : "s");
+                exit(U_FILE_ACCESS_ERROR);
+            }
+        }
+
+        /* pad to 16-align the last file (cleaner, avoids growing .dat files in icuswap) */
+        length&=0xf;
+        if(length!=0) {
+            udata_writePadding(out, 16-length);
+        }
+
+        /* finish */
+        udata_finish(out, &errorCode);
+        if(U_FAILURE(errorCode)) {
+            fprintf(stderr, "gencmn: udata_finish() failed - %s\n", u_errorName(errorCode));
+            exit(errorCode);
+        }
+    } else {
+        /* write a .c source file with the table of contents */
+        char *filename=buffer;
+        FileStream *out;
+        size_t destDirLength=uprv_strlen(destDir);
+        char dirSep[2]={ 0, 0 };
+        int filenameLength;
+
+        /*
+         * create the output filename: destDir [separator] name ["_" type] ".c"
+         * The separator is added only if destDir is not empty and does not end with one.
+         * snprintf() bounds the result; a name that does not fit is an error
+         * instead of an overflow of the static buffer.
+         */
+        if(destDirLength>0 && destDir[destDirLength-1]!=U_FILE_SEP_CHAR) {
+            dirSep[0]=U_FILE_SEP_CHAR;
+        }
+        filenameLength=snprintf(buffer, sizeof(buffer), "%s%s%s%s%s.c",
+                                destDir, dirSep, name, *type!=0 ? "_" : "", type);
+        if(filenameLength<0 || (size_t)filenameLength>=sizeof(buffer)) {
+            fprintf(stderr, "gencmn: name of the .c output file is too long\n");
+            exit(U_ILLEGAL_ARGUMENT_ERROR);
+        }
+
+        /* open the output file */
+        out=T_FileStream_open(filename, "w");
+        if (gencmnFileName != nullptr) {
+            uprv_strcpy(gencmnFileName, filename);
+        }
+        if(out==nullptr) {
+            fprintf(stderr, "gencmn: unable to open .c output file %s\n", filename);
+            exit(U_FILE_ACCESS_ERROR);
+        }
+
+        /* write the source file; from here on buffer (and thus filename) is reused for output text */
+        snprintf(buffer, sizeof(buffer),
+            "/*\n"
+            " * ICU common data table of contents for %s.%s\n"
+            " * Automatically generated by icu/source/tools/gencmn/gencmn .\n"
+            " */\n\n"
+            "#include \"unicode/utypes.h\"\n"
+            "#include \"unicode/udata.h\"\n"
+            "\n"
+            "/* external symbol declarations for data (%lu files) */\n",
+            name, type, static_cast<unsigned long>(fileCount));
+        T_FileStream_writeLine(out, buffer);
+
+        snprintf(buffer, sizeof(buffer), "extern const char\n %s%s[]", symPrefix?symPrefix:"", files[0].pathname);
+        T_FileStream_writeLine(out, buffer);
+        for(i=1; i<fileCount; ++i) {
+            snprintf(buffer, sizeof(buffer), ",\n %s%s[]", symPrefix?symPrefix:"", files[i].pathname);
+            T_FileStream_writeLine(out, buffer);
+        }
+        T_FileStream_writeLine(out, ";\n\n");
+
+        snprintf(
+            buffer, sizeof(buffer),
+            "U_EXPORT struct {\n"
+            " uint16_t headerSize;\n"
+            " uint8_t magic1, magic2;\n"
+            " UDataInfo info;\n"
+            " char padding[%lu];\n"
+            " uint32_t count, reserved;\n"
+            " struct {\n"
+            " const char *name;\n"
+            " const void *data;\n"
+            " } toc[%lu];\n"
+            "} U_EXPORT2 %s_dat = {\n"
+            " 32, 0xda, 0x27, {\n"
+            " %lu, 0,\n"
+            " %u, %u, %u, 0,\n"
+            " {0x54, 0x6f, 0x43, 0x50},\n"
+            " {1, 0, 0, 0},\n"
+            " {0, 0, 0, 0}\n"
+            " },\n"
+            " \"\", %lu, 0, {\n",
+            static_cast<unsigned long>(32-4-sizeof(UDataInfo)),
+            static_cast<unsigned long>(fileCount),
+            entrypointName,
+            static_cast<unsigned long>(sizeof(UDataInfo)),
+            U_IS_BIG_ENDIAN,
+            U_CHARSET_FAMILY,
+            U_SIZEOF_UCHAR,
+            static_cast<unsigned long>(fileCount)
+        );
+        T_FileStream_writeLine(out, buffer);
+
+        snprintf(buffer, sizeof(buffer), " { \"%s\", %s%s }", files[0].basename, symPrefix?symPrefix:"", files[0].pathname);
+        T_FileStream_writeLine(out, buffer);
+        for(i=1; i<fileCount; ++i) {
+            snprintf(buffer, sizeof(buffer), ",\n { \"%s\", %s%s }", files[i].basename, symPrefix?symPrefix:"", files[i].pathname);
+            T_FileStream_writeLine(out, buffer);
+        }
+
+        T_FileStream_writeLine(out, "\n }\n};\n");
+        T_FileStream_close(out);
+
+        /* symPrefix is a file-level static: reset it so that a later call cannot use the freed pointer */
+        uprv_free(symPrefix);
+        symPrefix = nullptr;
+    }
+}
+
+/**
+ * Append one listed item to files[], growing the array in CHUNK_FILE_COUNT steps.
+ *
+ * Binary package mode (sourceTOC is false): filename must be relative; the file is
+ * opened under source to measure its size. Files of 20 bytes or less, and files that
+ * cannot be opened, end the process. A file larger than a non-zero maxSize is skipped
+ * without leaving any trace in files[], basenameTotal or the string store, so that the
+ * offsets computed later by createCommonDataFile() match what is actually written.
+ *
+ * Table-of-contents mode (sourceTOC is true): nothing is opened; the pathname field
+ * receives the basename with '.', '-' and '/' replaced by '_' for use as a symbol name.
+ *
+ * @param filename  item path as listed
+ * @param name      package name, prepended to the item name with U_TREE_ENTRY_SEP_STRING
+ * @param source    directory that filename is relative to (binary package mode only)
+ * @param sourceTOC selects the mode, see above
+ * @param verbose   true: report skipped files on stdout
+ */
+static void
+addFile(const char *filename, const char *name, const char *source, UBool sourceTOC, UBool verbose) {
+    char *s;
+    uint32_t length;
+    char *fullPath = nullptr;
+
+    if(fileCount==fileMax) {
+        fileMax += CHUNK_FILE_COUNT;
+        files = (File *)uprv_realloc(files, fileMax*sizeof(files[0])); /* note: never freed. */
+        if(files==nullptr) {
+            fprintf(stderr, "pkgdata/gencmn: Could not allocate %u bytes for %u files\n", (unsigned int)(fileMax*sizeof(files[0])), (unsigned int)fileMax);
+            exit(U_MEMORY_ALLOCATION_ERROR);
+        }
+    }
+
+    if(!sourceTOC) {
+        FileStream *file;
+        uint32_t fileSize;
+
+        if(uprv_pathIsAbsolute(filename)) {
+            fprintf(stderr, "gencmn: Error: absolute path encountered. Old style paths are not supported. Use relative paths such as 'fur.res' or 'translit%cfur.res'.\n\tBad path: '%s'\n", U_FILE_SEP_CHAR, filename);
+            exit(U_ILLEGAL_ARGUMENT_ERROR);
+        }
+        fullPath = pathToFullPath(filename, source);
+
+        /* try to open the file */
+        file=T_FileStream_open(fullPath, "rb");
+        if(file==nullptr) {
+            fprintf(stderr, "gencmn: unable to open listed file %s\n", fullPath);
+            exit(U_FILE_ACCESS_ERROR);
+        }
+
+        /* get the file length */
+        fileSize=T_FileStream_size(file);
+        if(T_FileStream_error(file) || fileSize<=20) {
+            fprintf(stderr, "gencmn: unable to get length of listed file %s\n", fullPath);
+            exit(U_FILE_ACCESS_ERROR);
+        }
+
+        T_FileStream_close(file);
+
+        /*
+         * do not add files that are longer than maxSize
+         * This is decided before anything is recorded for the item: counting a skipped
+         * basename in basenameTotal would shift every offset in the table of contents.
+         */
+        if(maxSize && fileSize>maxSize) {
+            if (verbose) {
+                printf("%s ignored (size %ld > %ld)\n", fullPath, (long)fileSize, (long)maxSize);
+            }
+            uprv_free(fullPath);
+            return;
+        }
+
+        /* store the basename: name, tree separator, filename, NUL */
+        length = (uint32_t)(uprv_strlen(filename) + 1 + uprv_strlen(name) + 1);
+        s=allocString(length);
+        uprv_strcpy(s, name);
+        uprv_strcat(s, U_TREE_ENTRY_SEP_STRING);
+        uprv_strcat(s, filename);
+        fixDirToTreePath(s);
+
+        files[fileCount].basename=s;
+        files[fileCount].basenameLength=length;
+        files[fileCount].pathname=fullPath;
+        files[fileCount].fileSize=fileSize;
+
+        basenameTotal+=length;
+    } else {
+        char *t;
+        /* get and store the basename */
+        /* need to include the package name */
+        length = (uint32_t)(uprv_strlen(filename) + 1 + uprv_strlen(name) + 1);
+        s=allocString(length);
+        uprv_strcpy(s, name);
+        uprv_strcat(s, U_TREE_ENTRY_SEP_STRING);
+        uprv_strcat(s, filename);
+        fixDirToTreePath(s);
+        files[fileCount].basename=s;
+        /* turn the basename into an entry point name and store in the pathname field */
+        t=files[fileCount].pathname=allocString(length);
+        /* length counts the NUL, so this copies length-1 characters */
+        while(--length>0) {
+            if(*s=='.' || *s=='-' || *s=='/') {
+                *t='_';
+            } else {
+                *t=*s;
+            }
+            ++s;
+            ++t;
+        }
+        *t=0;
+    }
+    ++fileCount;
+}
+
+/**
+ * Reserve length bytes from the static stringStore arena.
+ * The space is never given back. If the arena cannot hold the request,
+ * an error is printed to stderr and the process ends.
+ *
+ * @param length number of bytes to reserve, including room for a terminating NUL
+ * @return pointer to the start of the reserved bytes
+ */
+static char *
+allocString(uint32_t length) {
+    char *p;
+
+    /*
+     * Compare against the space that is left instead of computing stringTop+length,
+     * which could wrap around for a huge length and then pass the check.
+     * stringTop never exceeds STRING_STORE_SIZE, so the subtraction cannot wrap.
+     */
+    if(length>STRING_STORE_SIZE-stringTop) {
+        fprintf(stderr, "gencmn: out of memory\n");
+        exit(U_MEMORY_ALLOCATION_ERROR);
+    }
+    p=stringStore+stringTop;
+    stringTop+=length;
+    return p;
+}
+
+/**
+ * Build "<source><file separator><path>" in newly allocated memory and convert
+ * separators in the path part to U_FILE_SEP_CHAR where the platform macros require it.
+ * If source is nullptr the result is just a converted copy of path.
+ * Ends the process if the memory cannot be allocated.
+ *
+ * @param path   relative item path
+ * @param source directory prefix, may be nullptr
+ * @return string allocated with uprv_malloc(); the caller owns it
+ */
+static char *
+pathToFullPath(const char *path, const char *source) {
+    /* measure source only if there is one: the nullptr case is handled below */
+    size_t sourceLength = (source != nullptr) ? uprv_strlen(source) : 0;
+    /* source + one separator + path + NUL */
+    size_t newLength = sourceLength + 1 + uprv_strlen(path) + 1;
+    char *fullPath;
+    int32_t n;
+
+    fullPath = (char *)uprv_malloc(newLength);
+    if(fullPath == nullptr) {
+        fprintf(stderr, "gencmn: unable to allocate %lu bytes for a file path\n", (unsigned long)newLength);
+        exit(U_MEMORY_ALLOCATION_ERROR);
+    }
+    if(source != nullptr) {
+        uprv_strcpy(fullPath, source);
+        uprv_strcat(fullPath, U_FILE_SEP_STRING);
+    } else {
+        fullPath[0] = 0;
+    }
+    /* n is the index where the path part starts; the source prefix is left untouched */
+    n = (int32_t)uprv_strlen(fullPath);
+    fullPath[n] = 0;      /* Suppress compiler warning for unused variable n */
+                          /* when conditional code below is not compiled.   */
+    uprv_strcat(fullPath, path);
+
+    /*
+     * NOTE(review): where both loops below are compiled, the first one leaves n at the
+     * terminating NUL, so the second one has nothing left to scan.
+     */
+#if (U_FILE_ALT_SEP_CHAR != U_TREE_ENTRY_SEP_CHAR)
+#if (U_FILE_ALT_SEP_CHAR != U_FILE_SEP_CHAR)
+    /* replace the alternate file separator with file sep char (such as ':' or '\\') */
+    for(;fullPath[n];n++) {
+        if(fullPath[n] == U_FILE_ALT_SEP_CHAR) {
+            fullPath[n] = U_FILE_SEP_CHAR;
+        }
+    }
+#endif
+#endif
+#if (U_FILE_SEP_CHAR != U_TREE_ENTRY_SEP_CHAR)
+    /* replace tree separator (such as '/') with file sep char (such as ':' or '\\') */
+    for(;fullPath[n];n++) {
+        if(fullPath[n] == U_TREE_ENTRY_SEP_CHAR) {
+            fullPath[n] = U_FILE_SEP_CHAR;
+        }
+    }
+#endif
+    return fullPath;
+}
+
+U_CDECL_BEGIN
+/* qsort() callback: orders two File entries by comparing their basename strings. */
+static int
+compareFiles(const void *file1, const void *file2) {
+    const File *left=static_cast<const File *>(file1);
+    const File *right=static_cast<const File *>(file2);
+
+    return uprv_strcmp(left->basename, right->basename);
+}
+U_CDECL_END
+
+/*
+ * Rewrite s in place so that every file separator (and alternate file separator)
+ * becomes U_TREE_ENTRY_SEP_CHAR. Compiles to nothing where the separators are equal.
+ */
+static void
+fixDirToTreePath(char *s)
+{
+    (void)s;
+#if (U_FILE_SEP_CHAR != U_TREE_ENTRY_SEP_CHAR) || ((U_FILE_ALT_SEP_CHAR != U_FILE_SEP_CHAR) && (U_FILE_ALT_SEP_CHAR != U_TREE_ENTRY_SEP_CHAR))
+    char *cursor;
+#endif
+#if (U_FILE_SEP_CHAR != U_TREE_ENTRY_SEP_CHAR)
+    cursor=uprv_strchr(s, U_FILE_SEP_CHAR);
+    while(cursor!=nullptr) {
+        *cursor = U_TREE_ENTRY_SEP_CHAR;
+        /* the character just replaced no longer matches, so the search moves on */
+        cursor=uprv_strchr(cursor, U_FILE_SEP_CHAR);
+    }
+#endif
+#if (U_FILE_ALT_SEP_CHAR != U_FILE_SEP_CHAR) && (U_FILE_ALT_SEP_CHAR != U_TREE_ENTRY_SEP_CHAR)
+    cursor=uprv_strchr(s, U_FILE_ALT_SEP_CHAR);
+    while(cursor!=nullptr) {
+        *cursor = U_TREE_ENTRY_SEP_CHAR;
+        cursor=uprv_strchr(cursor, U_FILE_ALT_SEP_CHAR);
+    }
+#endif
+}
diff --git a/intl/icu/source/tools/toolutil/pkg_gencmn.h b/intl/icu/source/tools/toolutil/pkg_gencmn.h
new file mode 100644
index 0000000000..238239960a
--- /dev/null
+++ b/intl/icu/source/tools/toolutil/pkg_gencmn.h
@@ -0,0 +1,18 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/******************************************************************************
+ * Copyright (C) 2008, International Business Machines
+ * Corporation and others. All Rights Reserved.
+ *******************************************************************************
+ */
+
+#ifndef __PKG_GENCMN_H__
+#define __PKG_GENCMN_H__
+
+#include "unicode/utypes.h"
+/**
+ * Create an ICU common data package from a list of item files: either a binary
+ * package written with udata_create(), or (sourceTOC true) a .c file in destDir
+ * that contains only a table of contents. Failures are reported on stderr and
+ * end the process with exit().
+ *
+ * destDir, name, type, source and copyRight may be nullptr to select defaults;
+ * dataFile may be nullptr to read the list from stdin. A non-zero max_size leaves
+ * larger items out of the binary package. gencmnFileName, if not nullptr, receives
+ * the name of the generated .c file; no capacity is passed along with it.
+ */
+U_CAPI void U_EXPORT2
+createCommonDataFile(const char *destDir, const char *name, const char *entrypointName, const char *type, const char *source, const char *copyRight,
+ const char *dataFile, uint32_t max_size, UBool sourceTOC, UBool verbose, char *gencmnFileName);
+
+#endif
diff --git a/intl/icu/source/tools/toolutil/pkg_icu.cpp b/intl/icu/source/tools/toolutil/pkg_icu.cpp
new file mode 100644
index 0000000000..d9c6717ecd
--- /dev/null
+++ b/intl/icu/source/tools/toolutil/pkg_icu.cpp
@@ -0,0 +1,176 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/******************************************************************************
+ * Copyright (C) 2008-2015, International Business Machines
+ * Corporation and others. All Rights Reserved.
+ *******************************************************************************
+ */
+#include "unicode/utypes.h"
+#include "unicode/localpointer.h"
+#include "unicode/putil.h"
+#include "cstring.h"
+#include "toolutil.h"
+#include "uoptions.h"
+#include "uparse.h"
+#include "package.h"
+#include "pkg_icu.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+// read a file list -------------------------------------------------------- ***
+
+U_NAMESPACE_USE
+
+// File name suffixes that mark a list name as a text file with one or more items per line.
+static const struct {
+    const char *suffix;
+    int32_t length;     // number of characters in suffix
+} listFileSuffixes[]={
+    { ".txt", 4 },
+    { ".lst", 4 },
+    { ".tmp", 4 }
+};
+
+// Returns true if listname is longer than, and ends with, one of the listFileSuffixes.
+static UBool
+isListTextFile(const char *listname) {
+    const size_t nameLength=strlen(listname);
+    for(const auto &entry : listFileSuffixes) {
+        const size_t suffixLength=(size_t)entry.length;
+        if(nameLength<=suffixLength) {
+            // the name has to be strictly longer than the suffix
+            continue;
+        }
+        if(memcmp(listname+nameLength-suffixLength, entry.suffix, suffixLength)==0) {
+            return true;
+        }
+    }
+    return false;
+}
+
+/*
+ * Read a file list into a Package.
+ * - If isListTextFile() accepts the name (".txt", ".lst" or ".tmp"), the file is read as
+ *   text: anything from '#' to the end of a line is dropped, lines that are empty or
+ *   start with one of U_PKG_RESERVED_CHARS are skipped, and every item separated by
+ *   spaces or tabs on the remaining lines is added.
+ * - If the name ends with ".dat", the ICU .dat package file is read.
+ * - Otherwise the name itself is added as a single item.
+ * Items are added with their file contents (looked up via filesPath) if readContents
+ * is set, otherwise by name only.
+ *
+ * Returns listPkgIn if that is not nullptr, otherwise a newly allocated Package that
+ * the caller has to delete. Returns nullptr if listname is nullptr or empty.
+ * Ends the process if memory runs out or the list file cannot be opened.
+ */
+U_CAPI Package * U_EXPORT2
+readList(const char *filesPath, const char *listname, UBool readContents, Package *listPkgIn) {
+    if(listname==nullptr || *listname==0) {
+        fprintf(stderr, "missing list file\n");
+        return nullptr;
+    }
+
+    Package *listPkg = listPkgIn;
+    if (listPkg == nullptr) {
+        listPkg=new Package();
+        if(listPkg==nullptr) {
+            fprintf(stderr, "icupkg: not enough memory\n");
+            exit(U_MEMORY_ALLOCATION_ERROR);
+        }
+    }
+
+    // adds one item in the way selected by readContents
+    auto addEntry = [&](const char *item) {
+        if(readContents) {
+            listPkg->addFile(filesPath, item);
+        } else {
+            listPkg->addItem(item);
+        }
+    };
+
+    const size_t nameLength=strlen(listname);
+    if(isListTextFile(listname)) {
+        // read the list file
+        FILE *file=fopen(listname, "r");
+        if(file==nullptr) {
+            fprintf(stderr, "icupkg: unable to open list file \"%s\"\n", listname);
+            delete listPkg;
+            exit(U_FILE_ACCESS_ERROR);
+        }
+
+        char line[1024];
+        while(fgets(line, sizeof(line), file)!=nullptr) {
+            char *cut=strchr(line, '#');
+            if(cut!=nullptr) {
+                // remove comments
+                *cut=0;
+            } else {
+                // remove trailing CR LF
+                cut=strchr(line, 0);
+                while(cut>line && (cut[-1]=='\r' || cut[-1]=='\n')) {
+                    *--cut=0;
+                }
+            }
+
+            // skip empty lines and lines whose first non-whitespace character is reserved
+            const char *item=u_skipWhitespace(line);
+            if(*item==0 || strchr(U_PKG_RESERVED_CHARS, *item)!=nullptr) {
+                continue;
+            }
+
+            // take whitespace-separated items from the line
+            for(;;) {
+                // find whitespace after the item or the end of the line
+                char *stop=const_cast<char *>(item);
+                while(*stop!=0 && *stop!=' ' && *stop!='\t') {
+                    ++stop;
+                }
+                const bool lastOnLine=(*stop==0);
+                if(!lastOnLine) {
+                    // the item is terminated by whitespace, terminate it with NUL
+                    *stop=0;
+                }
+                addEntry(item);
+
+                if(lastOnLine) {
+                    break;
+                }
+                // find the start of the next item or exit the loop
+                item=u_skipWhitespace(stop+1);
+                if(*item==0) {
+                    break;
+                }
+            }
+        }
+        fclose(file);
+    } else if(nameLength>4 && 0==memcmp(listname+nameLength-4, ".dat", 4)) {
+        // read the ICU .dat package
+        // Accept a .dat file whose name differs from the ToC prefixes.
+        listPkg->setAutoPrefix();
+        listPkg->readPackage(listname);
+    } else {
+        // list the single file itself
+        addEntry(listname);
+    }
+
+    return listPkg;
+}
+
+/*
+ * Write a package to outFilename with the given comment and output type.
+ * If pkg is not nullptr it is written as it is; sourcePath and addList are not used.
+ * Otherwise a temporary package is filled with the items of addList (read with their
+ * contents from sourcePath via readList()) and written.
+ * Returns 0 after writing, U_MEMORY_ALLOCATION_ERROR if the temporary package cannot
+ * be allocated, or U_ILLEGAL_ARGUMENT_ERROR if readList() returns nullptr.
+ */
+U_CAPI int U_EXPORT2
+writePackageDatFile(const char *outFilename, const char *outComment, const char *sourcePath, const char *addList, Package *pkg, char outType) {
+    if (pkg != nullptr) {
+        // the caller supplied a complete package
+        pkg->writePackage(outFilename, outType, outComment);
+        return 0;
+    }
+
+    // Both stay alive until after writePackage().
+    // NOTE(review): presumably items added by addItems() may refer to storage of
+    // the list package - confirm before narrowing the scope of addListPkg.
+    LocalPointer<Package> ownedPkg;
+    LocalPointer<Package> addListPkg;
+
+    ownedPkg.adoptInstead(new Package);
+    if(ownedPkg.isNull()) {
+        fprintf(stderr, "icupkg: not enough memory\n");
+        return U_MEMORY_ALLOCATION_ERROR;
+    }
+
+    addListPkg.adoptInstead(readList(sourcePath, addList, true, nullptr));
+    if(addListPkg.isNull()) {
+        return U_ILLEGAL_ARGUMENT_ERROR;
+    }
+    ownedPkg->addItems(*addListPkg);
+
+    ownedPkg->writePackage(outFilename, outType, outComment);
+    return 0;
+}
diff --git a/intl/icu/source/tools/toolutil/pkg_icu.h b/intl/icu/source/tools/toolutil/pkg_icu.h
new file mode 100644
index 0000000000..638056e60b
--- /dev/null
+++ b/intl/icu/source/tools/toolutil/pkg_icu.h
@@ -0,0 +1,25 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/******************************************************************************
+ * Copyright (C) 2008-2016, International Business Machines
+ * Corporation and others. All Rights Reserved.
+ *******************************************************************************
+ */
+
+#ifndef __PKG_ICU_H__
+#define __PKG_ICU_H__
+
+#include "unicode/utypes.h"
+#include "package.h"
+/*
+ * readList() skips a line of a text list file if its first non-whitespace
+ * character is one of these.
+ */
+#define U_PKG_RESERVED_CHARS "\"%&'()*+,-./:;<=>?_"
+/**
+ * Write pkg to outFilename with comment outComment and output type outType.
+ * If pkg is nullptr, a temporary package is built from the items of addList
+ * (read with their contents from sourcePath) and written instead.
+ * Returns 0 after writing, or a UErrorCode value if the temporary package
+ * cannot be allocated or the list cannot be read.
+ */
+U_CAPI int U_EXPORT2
+writePackageDatFile(const char *outFilename, const char *outComment,
+ const char *sourcePath, const char *addList, icu::Package *pkg,
+ char outType);
+/**
+ * Add the items named by listname to a package: a text list (".txt", ".lst", ".tmp"),
+ * an ICU ".dat" package, or else the single file listname itself.
+ * File contents are loaded from filesPath only if readContents is set.
+ * Returns listPkgIn if it is not nullptr, otherwise a new Package owned by the caller;
+ * returns nullptr if listname is nullptr or empty.
+ */
+U_CAPI icu::Package * U_EXPORT2
+readList(const char *filesPath, const char *listname, UBool readContents, icu::Package *listPkgIn);
+
+#endif
diff --git a/intl/icu/source/tools/toolutil/pkg_imp.h b/intl/icu/source/tools/toolutil/pkg_imp.h
new file mode 100644
index 0000000000..29abd8d83c
--- /dev/null
+++ b/intl/icu/source/tools/toolutil/pkg_imp.h
@@ -0,0 +1,38 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/*
+*******************************************************************************
+*
+* Copyright (C) 2005-2016, International Business Machines
+* Corporation and others. All Rights Reserved.
+*
+*******************************************************************************
+* file name: pkg_imp.h
+* encoding: UTF-8
+* tab size: 8 (not used)
+* indentation:4
+*
+* created on: 2005sep18
+* created by: Markus W. Scherer
+*
+* Implementation definitions for data package functions in toolutil.
+*/
+
+#ifndef __PKG_IMP_H__
+#define __PKG_IMP_H__
+
+#include "unicode/utypes.h"
+#include "unicode/udata.h"
+
+/*
+ * Read an ICU data item with any platform type,
+ * return the pointer to the UDataInfo in its header,
+ * and set the lengths of the UDataInfo and of the whole header.
+ * All data remains in its platform type.
+ */
+U_CFUNC const UDataInfo *
+getDataInfo(const uint8_t *data, int32_t length,
+ int32_t &infoLength, int32_t &headerLength,
+ UErrorCode *pErrorCode);
+
+#endif
diff --git a/intl/icu/source/tools/toolutil/pkgitems.cpp b/intl/icu/source/tools/toolutil/pkgitems.cpp
new file mode 100644
index 0000000000..e49775d56d
--- /dev/null
+++ b/intl/icu/source/tools/toolutil/pkgitems.cpp
@@ -0,0 +1,645 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/*
+*******************************************************************************
+*
+* Copyright (C) 2003-2015, International Business Machines
+* Corporation and others. All Rights Reserved.
+*
+*******************************************************************************
+* file name: pkgitems.cpp
+* encoding: UTF-8
+* tab size: 8 (not used)
+* indentation:4
+*
+* created on: 2005sep18
+* created by: Markus W. Scherer
+*
+* Companion file to package.cpp. Deals with details of ICU data item formats.
+* Used for item dependencies.
+* Contains adapted code from ucnv_bld.c (swapper code from 2003).
+*/
+
+#include "unicode/utypes.h"
+#include "unicode/ures.h"
+#include "unicode/putil.h"
+#include "unicode/udata.h"
+#include "cstring.h"
+#include "uinvchar.h"
+#include "ucmndata.h"
+#include "udataswp.h"
+#include "swapimpl.h"
+#include "toolutil.h"
+#include "package.h"
+#include "pkg_imp.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+/* item formats in common */
+
+#include "uresdata.h"
+#include "ucnv_bld.h"
+#include "ucnv_io.h"
+
+// general definitions ----------------------------------------------------- ***
+
+U_CDECL_BEGIN
+
+static void U_CALLCONV
+printError(void *context, const char *fmt, va_list args) {
+ vfprintf((FILE *)context, fmt, args);
+}
+
+U_CDECL_END
+
+// a data item in native-platform form ------------------------------------- ***
+
+U_NAMESPACE_BEGIN
+
+/*
+ * View of a package Item in the native platform type (endianness and charset family).
+ * If the item is already native, the accessors point into the item's own data.
+ * Otherwise the item is swapped into a buffer owned by this object,
+ * which is released in the destructor or when another item is set.
+ * All failures are fatal: an error message is printed and the process exits.
+ */
+class NativeItem {
+public:
+    NativeItem() : pItem(nullptr), pInfo(nullptr), bytes(nullptr), swapped(nullptr), length(0) {}
+    NativeItem(const Item *item, UDataSwapFn *swap)
+            : pItem(nullptr), pInfo(nullptr), bytes(nullptr), swapped(nullptr), length(0) {
+        setItem(item, swap);
+    }
+    ~NativeItem() {
+        delete [] swapped;
+    }
+
+    // This object owns the swapped buffer; copying would free it twice.
+    NativeItem(const NativeItem &other) = delete;
+    NativeItem &operator=(const NativeItem &other) = delete;
+
+    /* Returns the UDataInfo of the native-form item header. */
+    const UDataInfo *getDataInfo() const {
+        return pInfo;
+    }
+    /* Returns the native-form item data after the data header. */
+    const uint8_t *getBytes() const {
+        return bytes;
+    }
+    /* Returns the number of bytes after the data header. */
+    int32_t getLength() const {
+        return length;
+    }
+
+    /*
+     * Point this object at item, swapping it with the format-specific
+     * swap function if it is not in the native platform type.
+     */
+    void setItem(const Item *item, UDataSwapFn *swap) {
+        // Drop the buffer from a previous item so that reuse does not leak it.
+        delete [] swapped;
+        swapped=nullptr;
+
+        pItem=item;
+        int32_t infoLength, itemHeaderLength;
+        UErrorCode errorCode=U_ZERO_ERROR;
+        pInfo=::getDataInfo(pItem->data, pItem->length, infoLength, itemHeaderLength, &errorCode);
+        if(U_FAILURE(errorCode)) {
+            exit(errorCode); // should succeed because readFile() checks headers
+        }
+        length=pItem->length-itemHeaderLength;
+
+        if(pInfo->isBigEndian==U_IS_BIG_ENDIAN && pInfo->charsetFamily==U_CHARSET_FAMILY) {
+            // already native: use the item's data directly
+            bytes=pItem->data+itemHeaderLength;
+        } else {
+            UDataSwapper *ds=udata_openSwapper((UBool)pInfo->isBigEndian, pInfo->charsetFamily, U_IS_BIG_ENDIAN, U_CHARSET_FAMILY, &errorCode);
+            if(U_FAILURE(errorCode)) {
+                fprintf(stderr, "icupkg: udata_openSwapper(\"%s\") failed - %s\n",
+                        pItem->name, u_errorName(errorCode));
+                exit(errorCode);
+            }
+
+            ds->printError=printError;
+            ds->printErrorContext=stderr;
+
+            swapped=new uint8_t[pItem->length];
+            if(swapped==nullptr) {
+                fprintf(stderr, "icupkg: unable to allocate memory for swapping \"%s\"\n", pItem->name);
+                exit(U_MEMORY_ALLOCATION_ERROR);
+            }
+            swap(ds, pItem->data, pItem->length, swapped, &errorCode);
+            udata_closeSwapper(ds);
+            if(U_SUCCESS(errorCode)) {
+                // re-read the header from the swapped copy
+                pInfo=::getDataInfo(swapped, pItem->length, infoLength, itemHeaderLength, &errorCode);
+            }
+            if(U_FAILURE(errorCode)) {
+                // Do not hand out pointers into a partially swapped buffer.
+                fprintf(stderr, "icupkg: swapping \"%s\" to the native platform type failed - %s\n",
+                        pItem->name, u_errorName(errorCode));
+                exit(errorCode);
+            }
+            bytes=swapped+itemHeaderLength;
+        }
+    }
+
+private:
+    const Item *pItem;       // the item last passed to setItem(); not owned
+    const UDataInfo *pInfo;  // header info, inside pItem->data or inside swapped
+    const uint8_t *bytes;    // data after the header, inside pItem->data or inside swapped
+    uint8_t *swapped;        // owned native-form copy, or nullptr if no swapping was needed
+    int32_t length;          // number of bytes at bytes
+};
+
+// check a dependency ------------------------------------------------------ ***
+
+/*
+ * Build "<tree prefix of itemName><id><suffix>" in target.
+ * The tree prefix is itemName up to and including its last '/' (empty if none).
+ * A negative idLength means that id is NUL-terminated.
+ * Sets U_BUFFER_OVERFLOW_ERROR and leaves target untouched if the result,
+ * including its terminating NUL, does not fit into capacity.
+ */
+static void
+makeTargetName(const char *itemName, const char *id, int32_t idLength, const char *suffix,
+               char *target, int32_t capacity,
+               UErrorCode *pErrorCode) {
+    // locate the basename: it starts after the last '/', or at the start of the name
+    const char *lastSlash=strrchr(itemName, '/');
+    const char *baseName= lastSlash!=nullptr ? lastSlash+1 : itemName;
+
+    // lengths of the three pieces
+    int32_t prefixLength=(int32_t)(baseName-itemName);
+    if(idLength<0) {
+        idLength=(int32_t)strlen(id);
+    }
+    int32_t suffixLength=(int32_t)strlen(suffix);
+
+    int32_t totalLength=prefixLength+idLength+suffixLength;
+    if(totalLength>=capacity) {
+        fprintf(stderr, "icupkg/makeTargetName(%s) target item name length %ld too long\n",
+                itemName, (long)totalLength);
+        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
+        return;
+    }
+
+    // concatenate prefix, id and suffix
+    char *out=target;
+    memcpy(out, itemName, prefixLength);
+    out+=prefixLength;
+    memcpy(out, id, idLength);
+    out+=idLength;
+    memcpy(out, suffix, suffixLength+1); // the +1 copies the terminating NUL as well
+}
+
+/*
+ * Build the target item name from itemName's tree, id and suffix,
+ * and report it to the check callback as a dependency of itemName.
+ * Nothing is reported if the name could not be built.
+ */
+static void
+checkIDSuffix(const char *itemName, const char *id, int32_t idLength, const char *suffix,
+              CheckDependency check, void *context,
+              UErrorCode *pErrorCode) {
+    char targetName[200];
+    makeTargetName(itemName, id, idLength, suffix,
+                   targetName, (int32_t)sizeof(targetName), pErrorCode);
+    if(U_FAILURE(*pErrorCode)) {
+        return;
+    }
+    check(context, itemName, targetName);
+}
+
+/*
+ * Report the truncation parent of itemName as a dependency.
+ * The parent name is the item's basename cut off before its last '_';
+ * without such a '_' the parent is "root", and "root" itself has no parent.
+ * The item's suffix (from the last '.' of the basename) is kept.
+ */
+static void
+checkParent(const char *itemName, CheckDependency check, void *context,
+            UErrorCode *pErrorCode) {
+    // the basename starts after the last '/', or at the start of the name
+    const char *slash=strrchr(itemName, '/');
+    const char *itemID= slash!=nullptr ? slash+1 : itemName;
+
+    // the suffix starts at the last '.'; if there is none, it is the empty string at the end
+    const char *suffix=strrchr(itemID, '.');
+    if(suffix==nullptr) {
+        suffix=itemID+strlen(itemID);
+    }
+
+    // walk backwards from the suffix to the last '_', stopping at the first character
+    const char *parentLimit=suffix;
+    while(parentLimit>itemID) {
+        --parentLimit;
+        if(*parentLimit=='_') {
+            break;
+        }
+    }
+
+    const char *parent;
+    int32_t parentLength;
+    if(parentLimit==itemID) {
+        // no usable '_' in the item name: the parent is the root bundle
+        parent="root";
+        parentLength=4;
+        if((suffix-itemID)==parentLength && 0==memcmp(itemID, parent, parentLength)) {
+            // the item itself is "root", which does not depend on a parent
+            return;
+        }
+    } else {
+        // the parent name is this item's name with its last part truncated
+        parent=itemID;
+        parentLength=(int32_t)(parentLimit-itemID);
+    }
+    checkIDSuffix(itemName, parent, parentLength, suffix, check, context, pErrorCode);
+}
+
+// get dependencies from resource bundles ---------------------------------- ***
+
+static const char16_t SLASH=0x2f;
+
+/*
+ * Report the bundle that the alias string of resource res points to.
+ * For an alias resource, only the locale ID before the first '/' is used, and
+ * aliases with an initial '/' are ignored. For a string resource, the whole
+ * string must be a locale ID without any '/'.
+ * useResSuffix selects whether ".res" is appended to the locale ID.
+ */
+static void
+checkAlias(const char *itemName,
+           Resource res, const char16_t *alias, int32_t length, UBool useResSuffix,
+           CheckDependency check, void *context, UErrorCode *pErrorCode) {
+    if(!uprv_isInvariantUString(alias, length)) {
+        fprintf(stderr, "icupkg/ures_enumDependencies(%s res=%08x) alias string contains non-invariant characters\n",
+                itemName, res);
+        *pErrorCode=U_INVALID_CHAR_FOUND;
+        return;
+    }
+
+    // Alias strings look like
+    //   locale_ID/key1/key2/key3
+    //   locale_ID
+    // so find the index of the first slash, or the length if there is none.
+    int32_t slashIndex=0;
+    while(slashIndex<length && alias[slashIndex]!=SLASH) {
+        ++slashIndex;
+    }
+
+    if(res_getPublicType(res)==URES_ALIAS) {
+        // Skip aliases with an initial slash:
+        //   /ICUDATA/... and /pkgname/... go to a different package
+        //   /LOCALE/... are for dynamic sideways fallbacks and don't go to a fixed bundle
+        if(slashIndex==0) {
+            return; // initial slash ('/')
+        }
+
+        // drop the intra-bundle path that starts at the first slash ('/')
+        length=slashIndex;
+    } else if(slashIndex!=length) /* URES_STRING */ {
+        // the whole string should only consist of a locale ID
+        fprintf(stderr, "icupkg/ures_enumDependencies(%s res=%08x) %%ALIAS contains a '/'\n",
+                itemName, res);
+        *pErrorCode=U_UNSUPPORTED_ERROR;
+        return;
+    }
+
+    // convert the Unicode string to a NUL-terminated char * string
+    char localeID[48];
+    if(length>=(int32_t)sizeof(localeID)) {
+        fprintf(stderr, "icupkg/ures_enumDependencies(%s res=%08x) alias locale ID length %ld too long\n",
+                itemName, res, (long)length);
+        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
+        return;
+    }
+    u_UCharsToChars(alias, localeID, length);
+    localeID[length]=0;
+
+    const char *suffix= useResSuffix ? ".res" : "";
+    checkIDSuffix(itemName, localeID, -1, suffix, check, context, pErrorCode);
+}
+
+/*
+ * Enumerate one resource item and its children and extract dependencies from
+ * aliases.
+ *
+ * res is the resource to look at; inKey is its key in the containing table
+ * (nullptr for the root resource and for array elements), parentKey is the key
+ * that the caller passed down for the containing resource, and depth is 0 for
+ * the root resource and grows by 1 per level of recursion.
+ * pkg is only passed along to the recursive calls here.
+ *
+ * Returns false if a top-level %%ALIAS or %%Parent string was found, which tells
+ * the caller not to add the truncation parent as a dependency; otherwise true.
+ */
+static UBool
+ures_enumDependencies(const char *itemName,
+ const ResourceData *pResData,
+ Resource res, const char *inKey, const char *parentKey, int32_t depth,
+ CheckDependency check, void *context,
+ Package *pkg,
+ UErrorCode *pErrorCode) {
+ UBool doCheckParent = true; // always remains true if depth>1
+ switch(res_getPublicType(res)) {
+ case URES_STRING:
+ if(depth==1 && inKey!=nullptr &&
+ (0==strcmp(inKey, "%%ALIAS") || 0==strcmp(inKey, "%%Parent"))) {
+ // Top-level %%ALIAS string:
+ // The alias resource bundle will be used instead of this one.
+ // Top-level %%Parent string:
+ // We use this bundle as well as the explicit parent bundle.
+ // Either way, the truncation parent is ignored.
+ doCheckParent = false;
+ // No tracing: build tool
+ int32_t length;
+ const char16_t *alias=res_getStringNoTrace(pResData, res, &length);
+ checkAlias(itemName, res, alias, length, /*useResSuffix=*/ true,
+ check, context, pErrorCode);
+ // If there is a %%ALIAS, then there should be nothing else in this resource bundle.
+ } else if(depth==2 && parentKey!=nullptr && 0==strcmp(parentKey, "%%DEPENDENCY")) {
+ // Second-level %%DEPENDENCY string:
+ // Explicit declaration of a dependency of this item on that one.
+ // No tracing: build tool
+ int32_t length;
+ const char16_t *alias=res_getStringNoTrace(pResData, res, &length);
+ checkAlias(itemName, res, alias, length, /*useResSuffix=*/ false,
+ check, context, pErrorCode);
+ }
+ // we ignore all other strings
+ break;
+ case URES_ALIAS:
+ {
+ // An alias resource at any depth: the target bundle is a dependency.
+ int32_t length;
+ const char16_t *alias=res_getAlias(pResData, res, &length);
+ checkAlias(itemName, res, alias, length, true, check, context, pErrorCode);
+ }
+ break;
+ case URES_TABLE:
+ {
+ /* recurse */
+ int32_t count=res_countArrayItems(pResData, res);
+ for(int32_t i=0; i<count; ++i) {
+ const char *itemKey;
+ Resource item=res_getTableItemByIndex(pResData, res, i, &itemKey);
+ // This doCheckParent return value is needed to
+ // propagate the possible false value from depth=1 to depth=0.
+ doCheckParent &= ures_enumDependencies(
+ itemName, pResData,
+ item, itemKey,
+ inKey, depth+1,
+ check, context,
+ pkg,
+ pErrorCode);
+ if(U_FAILURE(*pErrorCode)) {
+ // Stop at the first failing child; each level of the recursion adds its own message.
+ fprintf(stderr, "icupkg/ures_enumDependencies(%s table res=%08x)[%d].recurse(%s: %08x) failed\n",
+ itemName, res, i, itemKey, item);
+ break;
+ }
+ }
+ }
+ break;
+ case URES_ARRAY:
+ {
+ /* recurse */
+ int32_t count=res_countArrayItems(pResData, res);
+ for(int32_t i=0; i<count; ++i) {
+ Resource item=res_getArrayItem(pResData, res, i);
+ // Array elements have no key of their own; this array's key is passed as parentKey.
+ // The return value is ignored here, so array contents never change doCheckParent.
+ ures_enumDependencies(
+ itemName, pResData,
+ item, nullptr,
+ inKey, depth+1,
+ check, context,
+ pkg,
+ pErrorCode);
+ if(U_FAILURE(*pErrorCode)) {
+ fprintf(stderr, "icupkg/ures_enumDependencies(%s array res=%08x)[%d].recurse(%08x) failed\n",
+ itemName, res, i, item);
+ break;
+ }
+ }
+ }
+ break;
+ default:
+ // all other resource types neither contain children nor name other bundles
+ break;
+ }
+ return doCheckParent;
+}
+
+/*
+ * Report all dependencies of one resource bundle item:
+ * its pool bundle (if it uses one), the bundles named by its aliases,
+ * and its truncation parent if the bundle participates in locale fallback.
+ *
+ * pInfo/inBytes/length describe the bundle in the native platform type.
+ * pkg is searched for the pool bundle, which is needed for reading the bundle.
+ * Exits the process if the bundle cannot be read; returns early, after printing
+ * a message, if the pool bundle is missing or does not match.
+ */
+static void
+ures_enumDependencies(const char *itemName, const UDataInfo *pInfo,
+ const uint8_t *inBytes, int32_t length,
+ CheckDependency check, void *context,
+ Package *pkg,
+ UErrorCode *pErrorCode) {
+ ResourceData resData;
+
+ res_read(&resData, pInfo, inBytes, length, pErrorCode);
+ if(U_FAILURE(*pErrorCode)) {
+ fprintf(stderr, "icupkg: .res format version %02x.%02x not supported, or bundle malformed\n",
+ pInfo->formatVersion[0], pInfo->formatVersion[1]);
+ exit(U_UNSUPPORTED_ERROR);
+ }
+
+ // Declared outside the if() so that the pool data stays alive while resData points into it.
+ icu::NativeItem nativePool;
+
+ if(resData.usesPoolBundle) {
+ // The pool bundle is "pool.res" in the same tree as this item.
+ char poolName[200];
+ makeTargetName(itemName, "pool", 4, ".res", poolName, (int32_t)sizeof(poolName), pErrorCode);
+ if(U_FAILURE(*pErrorCode)) {
+ return;
+ }
+ check(context, itemName, poolName);
+ int32_t index=pkg->findItem(poolName);
+ if(index<0) {
+ // We cannot work with a bundle if its pool resource is missing.
+ // check() already printed a complaint.
+ return;
+ }
+ // TODO: Cache the native version in the Item itself.
+ nativePool.setItem(pkg->getItem(index), ures_swap);
+ const UDataInfo *poolInfo=nativePool.getDataInfo();
+ if(poolInfo->formatVersion[0]<=1) {
+ fprintf(stderr, "icupkg: %s is not a pool bundle\n", poolName);
+ return;
+ }
+ // The indexes array follows the root resource word at the start of the bundle data.
+ // NOTE(review): the indexes are read without comparing against nativePool.getLength() - confirm
+ // that the item has been validated before it gets here.
+ const int32_t *poolRoot=(const int32_t *)nativePool.getBytes();
+ const int32_t *poolIndexes=poolRoot+1;
+ int32_t poolIndexLength=poolIndexes[URES_INDEX_LENGTH]&0xff;
+ if(!(poolIndexLength>URES_INDEX_POOL_CHECKSUM &&
+ (poolIndexes[URES_INDEX_ATTRIBUTES]&URES_ATT_IS_POOL_BUNDLE))
+ ) {
+ fprintf(stderr, "icupkg: %s is not a pool bundle\n", poolName);
+ return;
+ }
+ // Use the pool only if this bundle was built against exactly this pool bundle.
+ if(resData.pRoot[1+URES_INDEX_POOL_CHECKSUM]==poolIndexes[URES_INDEX_POOL_CHECKSUM]) {
+ resData.poolBundleKeys=(const char *)(poolIndexes+poolIndexLength);
+ resData.poolBundleStrings=(const uint16_t *)(poolRoot+poolIndexes[URES_INDEX_KEYS_TOP]);
+ } else {
+ fprintf(stderr, "icupkg: %s has mismatched checksum for %s\n", poolName, itemName);
+ return;
+ }
+ }
+
+ // Walk the whole resource tree, starting at the root resource with depth 0.
+ UBool doCheckParent = ures_enumDependencies(
+ itemName, &resData,
+ resData.rootRes, nullptr, nullptr, 0,
+ check, context,
+ pkg,
+ pErrorCode);
+ if(!doCheckParent) {
+ // a top-level %%ALIAS or %%Parent replaces the truncation parent
+ return;
+ }
+
+ /*
+ * if the bundle attributes are present and the nofallback flag is not set,
+ * then add the parent bundle as a dependency
+ */
+ if(pInfo->formatVersion[0]>1 || (pInfo->formatVersion[0]==1 && pInfo->formatVersion[1]>=1)) {
+ if(!resData.noFallback) {
+ /* this bundle participates in locale fallback */
+ checkParent(itemName, check, context, pErrorCode);
+ }
+ }
+}
+
+// get dependencies from conversion tables --------------------------------- ***
+
+#if !UCONFIG_NO_CONVERSION
+/*
+ * Report the dependency of a .cnv conversion table item, if it has one.
+ * Code adapted from ucnv_swap().
+ *
+ * Only extension-only MBCS tables (output type MBCS_OUTPUT_EXT_ONLY) depend on
+ * another item: they store the name of their base table, which is reported
+ * via check() as "<base name>.cnv" in the same tree as itemName.
+ *
+ * ds is used to read multi-byte header fields and to convert the base name
+ * from the item's platform type; inBytes/length describe the item data after
+ * its data header, still in the item's own platform type.
+ * Sets *pErrorCode for malformed or unsupported table data, and exits the
+ * process for an unsupported .cnv format version.
+ */
+static void
+ucnv_enumDependencies(const UDataSwapper *ds,
+                      const char *itemName, const UDataInfo *pInfo,
+                      const uint8_t *inBytes, int32_t length,
+                      CheckDependency check, void *context,
+                      UErrorCode *pErrorCode) {
+    uint32_t staticDataSize;
+
+    const UConverterStaticData *inStaticData;
+
+    const _MBCSHeader *inMBCSHeader;
+    uint8_t outputType;
+
+    /* check format version */
+    if(!(
+        pInfo->formatVersion[0]==6 &&
+        pInfo->formatVersion[1]>=2
+    )) {
+        fprintf(stderr, "icupkg/ucnv_enumDependencies(): .cnv format version %02x.%02x not supported\n",
+                pInfo->formatVersion[0], pInfo->formatVersion[1]);
+        exit(U_UNSUPPORTED_ERROR);
+    }
+
+    /* read the initial UConverterStaticData structure after the UDataInfo header */
+    inStaticData=(const UConverterStaticData *)inBytes;
+
+    if( length<(int32_t)sizeof(UConverterStaticData) ||
+        (uint32_t)length<(staticDataSize=ds->readUInt32(inStaticData->structSize))
+    ) {
+        udata_printError(ds, "icupkg/ucnv_enumDependencies(): too few bytes (%d after header) for an ICU .cnv conversion table\n",
+                         length);
+        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+        return;
+    }
+
+    inBytes+=staticDataSize;
+    length-=(int32_t)staticDataSize;
+
+    /* check for supported conversionType values */
+    if(inStaticData->conversionType==UCNV_MBCS) {
+        /* MBCS data */
+        uint32_t mbcsHeaderLength, mbcsHeaderFlags, mbcsHeaderOptions;
+        int32_t extOffset;
+
+        inMBCSHeader=(const _MBCSHeader *)inBytes;
+
+        if(length<(int32_t)sizeof(_MBCSHeader)) {
+            udata_printError(ds, "icupkg/ucnv_enumDependencies(): too few bytes (%d after headers) for an ICU MBCS .cnv conversion table\n",
+                             length);
+            *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+            return;
+        }
+        /* the header length (in 32-bit words) is fixed for version 4 and stored in the options for version 5 */
+        if(inMBCSHeader->version[0]==4 && inMBCSHeader->version[1]>=1) {
+            mbcsHeaderLength=MBCS_HEADER_V4_LENGTH;
+        } else if(inMBCSHeader->version[0]==5 && inMBCSHeader->version[1]>=3 &&
+                  ((mbcsHeaderOptions=ds->readUInt32(inMBCSHeader->options))&
+                   MBCS_OPT_UNKNOWN_INCOMPATIBLE_MASK)==0
+        ) {
+            mbcsHeaderLength=mbcsHeaderOptions&MBCS_OPT_LENGTH_MASK;
+        } else {
+            udata_printError(ds, "icupkg/ucnv_enumDependencies(): unsupported _MBCSHeader.version %d.%d\n",
+                             inMBCSHeader->version[0], inMBCSHeader->version[1]);
+            *pErrorCode=U_UNSUPPORTED_ERROR;
+            return;
+        }
+
+        /* the flags word holds the extension data offset in its upper 24 bits and the output type in its low byte */
+        mbcsHeaderFlags=ds->readUInt32(inMBCSHeader->flags);
+        extOffset=(int32_t)(mbcsHeaderFlags>>8);
+        outputType=(uint8_t)mbcsHeaderFlags;
+
+        if(outputType==MBCS_OUTPUT_EXT_ONLY) {
+            /*
+             * extension-only file,
+             * contains a base name instead of normal base table data
+             */
+            char baseName[32];
+            int32_t baseNameLength;
+
+            /* there is extension data after the base data, see ucnv_ext.h */
+            if(length<(extOffset+UCNV_EXT_INDEXES_MIN_LENGTH*4)) {
+                udata_printError(ds, "icupkg/ucnv_enumDependencies(): too few bytes (%d after headers) for an ICU MBCS .cnv conversion table with extension data\n",
+                                 length);
+                *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+                return;
+            }
+
+            /* the base name is stored after the header; it must start inside the item data */
+            uint32_t baseNameOffset=mbcsHeaderLength*4;
+            if((uint32_t)length<=baseNameOffset) {
+                udata_printError(ds, "icupkg/ucnv_enumDependencies(%s): too few bytes (%d after headers) for the base name of an extension-only .cnv conversion table\n",
+                                 itemName, length);
+                *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+                return;
+            }
+            const char *inBaseName=(const char *)inBytes+baseNameOffset;
+
+            /* bounded search for the terminating NUL: never read past the end of a malformed item */
+            const char *baseNameLimit=(const char *)memchr(inBaseName, 0, (size_t)length-baseNameOffset);
+            if(baseNameLimit==nullptr) {
+                udata_printError(ds, "icupkg/ucnv_enumDependencies(%s): base name is not NUL-terminated\n",
+                                 itemName);
+                *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+                return;
+            }
+            baseNameLength=(int32_t)(baseNameLimit-inBaseName);
+            if(baseNameLength>=(int32_t)sizeof(baseName)) {
+                /* cast: the argument type must match the %ld conversion */
+                udata_printError(ds, "icupkg/ucnv_enumDependencies(%s): base name length %ld too long\n",
+                                 itemName, (long)baseNameLength);
+                *pErrorCode=U_UNSUPPORTED_ERROR;
+                return;
+            }
+
+            /* convert the base name, including its NUL, to the native charset family */
+            ds->swapInvChars(ds, inBaseName, baseNameLength+1, baseName, pErrorCode);
+            if(U_FAILURE(*pErrorCode)) {
+                /* baseName may not have been written; do not treat it as a string */
+                return;
+            }
+
+            checkIDSuffix(itemName, baseName, -1, ".cnv", check, context, pErrorCode);
+        }
+    }
+}
+
+// ICU data formats -------------------------------------------------------- ***
+
+/* The data formats whose items can have dependencies, in the order of the FMT_ constants below. */
+static const struct {
+    uint8_t dataFormat[4];
+} dataFormats[]={
+    { { 0x52, 0x65, 0x73, 0x42 } },     /* FMT_RES: dataFormat="ResB" */
+    { { 0x63, 0x6e, 0x76, 0x74 } },     /* FMT_CNV: dataFormat="cnvt" */
+    { { 0x43, 0x76, 0x41, 0x6c } }      /* FMT_ALIAS: dataFormat="CvAl" */
+};
+
+/* Indexes into dataFormats[]. */
+enum {
+    FMT_RES,
+    FMT_CNV,
+    FMT_ALIAS,
+    FMT_COUNT
+};
+
+/* Returns the FMT_ constant for the 4-byte dataFormat, or -1 if it is not in dataFormats[]. */
+static int32_t
+getDataFormat(const uint8_t dataFormat[4]) {
+    int32_t found=-1;
+    for(int32_t fmt=0; fmt<FMT_COUNT && found<0; ++fmt) {
+        if(memcmp(dataFormats[fmt].dataFormat, dataFormat, 4)==0) {
+            found=fmt;
+        }
+    }
+    return found;
+}
+
+// enumerate dependencies of a package item -------------------------------- ***
+
+/*
+ * Report the dependencies of one package item through the check callback.
+ * Resource bundles and conversion tables are inspected;
+ * items in any other data format are not looked at.
+ * Exits the process if opening a swapper or inspecting the item fails.
+ */
+void
+Package::enumDependencies(Item *pItem, void *context, CheckDependency check) {
+    UErrorCode errorCode=U_ZERO_ERROR;
+    int32_t infoLength, itemHeaderLength;
+    const UDataInfo *pInfo=getDataInfo(pItem->data, pItem->length, infoLength, itemHeaderLength, &errorCode);
+    if(U_FAILURE(errorCode)) {
+        return; // should not occur because readFile() checks headers
+    }
+
+    // dispatch on the data format; formats without a handler have nothing to report
+    int32_t format=getDataFormat(pInfo->dataFormat);
+    if(format==FMT_RES) {
+        /*
+         * Swap the resource bundle (if necessary) so that we can use
+         * the normal runtime uresdata.c code to read it.
+         * We do not want to duplicate that code, especially not together with on-the-fly swapping.
+         */
+        NativeItem nativeBundle(pItem, ures_swap);
+        ures_enumDependencies(pItem->name,
+                              nativeBundle.getDataInfo(), nativeBundle.getBytes(), nativeBundle.getLength(),
+                              check, context, this, &errorCode);
+    } else if(format==FMT_CNV) {
+        // TODO: share/cache swappers
+        UDataSwapper *ds=udata_openSwapper(
+                            (UBool)pInfo->isBigEndian, pInfo->charsetFamily,
+                            U_IS_BIG_ENDIAN, U_CHARSET_FAMILY,
+                            &errorCode);
+        if(U_FAILURE(errorCode)) {
+            fprintf(stderr, "icupkg: udata_openSwapper(\"%s\") failed - %s\n",
+                    pItem->name, u_errorName(errorCode));
+            exit(errorCode);
+        }
+
+        ds->printError=printError;
+        ds->printErrorContext=stderr;
+
+        // the conversion table is read in place, after the item's data header
+        const uint8_t *tableBytes=pItem->data+itemHeaderLength;
+        int32_t tableLength=pItem->length-itemHeaderLength;
+
+        ucnv_enumDependencies(ds, pItem->name, pInfo, tableBytes, tableLength, check, context, &errorCode);
+        udata_closeSwapper(ds);
+    }
+
+    if(U_FAILURE(errorCode)) {
+        exit(errorCode);
+    }
+}
+#endif /* UCONFIG_NO_CONVERSION */
+
+U_NAMESPACE_END
diff --git a/intl/icu/source/tools/toolutil/ppucd.cpp b/intl/icu/source/tools/toolutil/ppucd.cpp
new file mode 100644
index 0000000000..0d59b28ce4
--- /dev/null
+++ b/intl/icu/source/tools/toolutil/ppucd.cpp
@@ -0,0 +1,622 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/*
+*******************************************************************************
+* Copyright (C) 2011-2014, International Business Machines
+* Corporation and others. All Rights Reserved.
+*******************************************************************************
+* file name: ppucd.cpp
+* encoding: UTF-8
+* tab size: 8 (not used)
+* indentation:4
+*
+* created on: 2011dec11
+* created by: Markus W. Scherer
+*/
+
+#include "unicode/utypes.h"
+#include "unicode/uchar.h"
+#include "charstr.h"
+#include "cstring.h"
+#include "ppucd.h"
+#include "uassert.h"
+#include "uparse.h"
+
+#include <stdio.h>
+#include <string.h>
+
+U_NAMESPACE_BEGIN
+
+PropertyNames::~PropertyNames() {}
+
+// TODO: Create a concrete subclass for the default PropertyNames implementation
+// using the ICU library built-in property names API & data.
+// Currently only the genprops tool uses PreparsedUCD, and provides its own
+// PropertyNames implementation using its just-built property names data and its own code.
+// At some point, we should use PreparsedUCD in tests, and then we will need the
+// default implementation somewhere.
+#if 0
+int32_t
+PropertyNames::getPropertyEnum(const char *name) const {
+ return u_getPropertyEnum(name);
+}
+
+int32_t
+PropertyNames::getPropertyValueEnum(int32_t property, const char *name) const {
+ return u_getPropertyValueEnum((UProperty)property, name);
+}
+#endif
+
+// Start out with "no value" everywhere: sentinel code points, no digit value,
+// no name strings, and all-zero binary properties, integer properties and age.
+UniProps::UniProps()
+        : start(U_SENTINEL), end(U_SENTINEL),
+          bmg(U_SENTINEL), bpb(U_SENTINEL),
+          scf(U_SENTINEL), slc(U_SENTINEL), stc(U_SENTINEL), suc(U_SENTINEL),
+          digitValue(-1), numericValue(nullptr),
+          name(nullptr), nameAlias(nullptr) {
+    memset(age, 0, 4);
+    memset(intProps, 0, sizeof(intProps));
+    memset(binProps, 0, sizeof(binProps));
+}
+
+UniProps::~UniProps() {}
+
+const int32_t PreparsedUCD::kNumLineBuffers;
+
+/*
+ * Open a preparsed UCD file for reading.
+ * A nullptr, empty or "-" filename selects stdin.
+ * Sets errorCode to U_FILE_ACCESS_ERROR if the file cannot be opened;
+ * does nothing but initialize the object if errorCode is already a failure.
+ */
+PreparsedUCD::PreparsedUCD(const char *filename, UErrorCode &errorCode)
+        : pnames(nullptr),
+          file(nullptr),
+          defaultLineIndex(-1), blockLineIndex(-1), lineIndex(0),
+          lineNumber(0),
+          lineType(NO_LINE),
+          fieldLimit(nullptr), lineLimit(nullptr) {
+    // Initialize these before any early return,
+    // so that the object is in a defined state even when construction fails.
+    memset(ucdVersion, 0, 4);
+    lines[0][0]=0;
+
+    if(U_FAILURE(errorCode)) { return; }
+
+    if(filename==nullptr || *filename==0 || (*filename=='-' && filename[1]==0)) {
+        filename=nullptr;
+        file=stdin;
+    } else {
+        file=fopen(filename, "r");
+    }
+    if(file==nullptr) {
+        // perror() first, while errno still describes the fopen() failure
+        perror("error opening preparsed UCD");
+        fprintf(stderr, "error opening preparsed UCD file %s\n", filename ? filename : "\"no file name given\"");
+        errorCode=U_FILE_ACCESS_ERROR;
+        return;
+    }
+}
+
+// Close the input file, unless it is stdin or was never opened.
+PreparsedUCD::~PreparsedUCD() {
+    // file is still nullptr if the constructor returned early or fopen() failed,
+    // and passing a null stream to fclose() is undefined behavior.
+    if(file!=nullptr && file!=stdin) {
+        fclose(file);
+    }
+}
+
+// Same order as the LineType values.
+// The first two entries are nullptr: they are never compared because
+// readLine() starts matching at EMPTY_LINE+1.
+static const char *lineTypeStrings[]={
+ nullptr,
+ nullptr,
+ "ucd",
+ "property",
+ "binary",
+ "value",
+ "defaults",
+ "block",
+ "cp",
+ "unassigned",
+ "algnamesrange"
+};
+
+// Reads the next line into a line buffer, splits it in place into
+// NUL-terminated fields at each ';', and determines the line type from the first field.
+// Returns NO_LINE at the end of the file or on error (errorCode is set on error),
+// EMPTY_LINE for blank lines and for lines starting with '#',
+// otherwise the type of the line.
+// For a ucd line with a second field, that field is parsed into ucdVersion.
+PreparsedUCD::LineType
+PreparsedUCD::readLine(UErrorCode &errorCode) {
+ if(U_FAILURE(errorCode)) { return NO_LINE; }
+ // Select the next available line buffer.
+ // NOTE(review): isLineBufferAvailable() is not visible here; presumably it skips
+ // buffers that still back the defaults/block lines - confirm in ppucd.h.
+ while(!isLineBufferAvailable(lineIndex)) {
+ ++lineIndex;
+ if (lineIndex == kNumLineBuffers) {
+ lineIndex = 0;
+ }
+ }
+ char *line=lines[lineIndex];
+ *line=0;
+ lineLimit=fieldLimit=line;
+ lineType=NO_LINE;
+ // NOTE(review): a line longer than one buffer would be returned in pieces by fgets();
+ // there is no check for that here.
+ char *result=fgets(line, sizeof(lines[0]), file);
+ if(result==nullptr) {
+ // end of file, or a read error
+ if(ferror(file)) {
+ perror("error reading preparsed UCD");
+ fprintf(stderr, "error reading preparsed UCD before line %ld\n", (long)lineNumber);
+ errorCode=U_FILE_ACCESS_ERROR;
+ }
+ return NO_LINE;
+ }
+ ++lineNumber;
+ if(*line=='#') {
+ // comment line: treated like an empty line, contents left as read
+ fieldLimit=strchr(line, 0);
+ return lineType=EMPTY_LINE;
+ }
+ // Remove trailing \r\n.
+ char c;
+ char *limit=strchr(line, 0);
+ while(line<limit && ((c=*(limit-1))=='\n' || c=='\r')) { --limit; }
+ // Remove trailing white space.
+ while(line<limit && ((c=*(limit-1))==' ' || c=='\t')) { --limit; }
+ *limit=0;
+ lineLimit=limit;
+ if(line==limit) {
+ fieldLimit=limit;
+ return lineType=EMPTY_LINE;
+ }
+ // Split by ';'.
+ // Each ';' becomes a NUL, so every field is a NUL-terminated string inside the buffer.
+ char *semi=line;
+ while((semi=strchr(semi, ';'))!=nullptr) { *semi++=0; }
+ // fieldLimit now marks the end of the first field.
+ fieldLimit=strchr(line, 0);
+ // Determine the line type.
+ int32_t type;
+ for(type=EMPTY_LINE+1;; ++type) {
+ if(type==LINE_TYPE_COUNT) {
+ // the first field matches none of the known line type names
+ fprintf(stderr,
+ "error in preparsed UCD: unknown line type (first field) '%s' on line %ld\n",
+ line, (long)lineNumber);
+ errorCode=U_PARSE_ERROR;
+ return NO_LINE;
+ }
+ if(0==strcmp(line, lineTypeStrings[type])) {
+ break;
+ }
+ }
+ lineType=(LineType)type;
+ if(lineType==UNICODE_VERSION_LINE && fieldLimit<lineLimit) {
+ // the version string is the field right after the line type
+ u_versionFromString(ucdVersion, fieldLimit+1);
+ }
+ return lineType;
+}
+
+// Returns the first field of the current line
+// and remembers where it ends, for the next nextField() call.
+const char *
+PreparsedUCD::firstField() {
+    char *first=lines[lineIndex];
+    fieldLimit=first+strlen(first);
+    return first;
+}
+
+// Returns the field after the one returned last, or nullptr once the
+// previous field ended at the end of the line.
+const char *
+PreparsedUCD::nextField() {
+    if(fieldLimit==lineLimit) {
+        return nullptr;
+    }
+    // the next field starts right after the NUL that terminates the previous one
+    char *next=fieldLimit+1;
+    fieldLimit=next+strlen(next);
+    return next;
+}
+
+// Parses the code point range and the property fields of the current
+// defaults/block/cp/unassigned line.
+// Returns a pointer to the member object that holds the resulting properties
+// (defaultProps, blockProps or cpProps), or nullptr with errorCode set on failure.
+// newValues is cleared first and then passed to parseProperty() for every field.
+// NOTE(review): presumably parseProperty() adds each property that the line sets
+// to newValues - its full body is not visible here.
+const UniProps *
+PreparsedUCD::getProps(UnicodeSet &newValues, UErrorCode &errorCode) {
+ if(U_FAILURE(errorCode)) { return nullptr; }
+ newValues.clear();
+ if(!lineHasPropertyValues()) {
+ errorCode=U_ILLEGAL_ARGUMENT_ERROR;
+ return nullptr;
+ }
+ // Skip the line type field; the second field is the code point range.
+ firstField();
+ const char *field=nextField();
+ if(field==nullptr) {
+ // No range field after the type.
+ fprintf(stderr,
+ "error in preparsed UCD: missing default/block/cp range field "
+ "(no second field) on line %ld\n",
+ (long)lineNumber);
+ errorCode=U_PARSE_ERROR;
+ return nullptr;
+ }
+ UChar32 start, end;
+ if(!parseCodePointRange(field, start, end, errorCode)) { return nullptr; }
+ UniProps *props;
+ UBool insideBlock=false; // true if cp or unassigned range inside the block range.
+ // Select the target object and the properties that it starts from.
+ switch(lineType) {
+ case DEFAULTS_LINE:
+ // Should occur before any block/cp/unassigned line.
+ if(blockLineIndex>=0) {
+ fprintf(stderr,
+ "error in preparsed UCD: default line %ld after one or more block lines\n",
+ (long)lineNumber);
+ errorCode=U_PARSE_ERROR;
+ return nullptr;
+ }
+ if(defaultLineIndex>=0) {
+ fprintf(stderr,
+ "error in preparsed UCD: second line with default properties on line %ld\n",
+ (long)lineNumber);
+ errorCode=U_PARSE_ERROR;
+ return nullptr;
+ }
+ if(start!=0 || end!=0x10ffff) {
+ fprintf(stderr,
+ "error in preparsed UCD: default range must be 0..10FFFF, not '%s' on line %ld\n",
+ field, (long)lineNumber);
+ errorCode=U_PARSE_ERROR;
+ return nullptr;
+ }
+ props=&defaultProps;
+ // Remember which line buffer holds the defaults line.
+ defaultLineIndex=lineIndex;
+ break;
+ case BLOCK_LINE:
+ blockProps=defaultProps; // Block inherits default properties.
+ props=&blockProps;
+ // Remember which line buffer holds the current block line.
+ blockLineIndex=lineIndex;
+ break;
+ case CP_LINE:
+ case UNASSIGNED_LINE:
+ if(blockProps.start<=start && end<=blockProps.end) {
+ insideBlock=true;
+ if(lineType==CP_LINE) {
+ // Code point range fully inside the last block inherits the block properties.
+ cpProps=blockProps;
+ } else {
+ // Unassigned line inside the block is based on default properties
+ // which override block properties.
+ cpProps=defaultProps;
+ newValues=blockValues;
+ // Except, it inherits the one blk=Block property.
+ int32_t blkIndex=UCHAR_BLOCK-UCHAR_INT_START;
+ cpProps.intProps[blkIndex]=blockProps.intProps[blkIndex];
+ newValues.remove((UChar32)UCHAR_BLOCK);
+ }
+ } else if(start>blockProps.end || end<blockProps.start) {
+ // Code point range fully outside the last block inherits the default properties.
+ cpProps=defaultProps;
+ } else {
+ // Code point range partially overlapping with the last block is illegal.
+ fprintf(stderr,
+ "error in preparsed UCD: cp range %s on line %ld only "
+ "partially overlaps with block range %04lX..%04lX\n",
+ field, (long)lineNumber, (long)blockProps.start, (long)blockProps.end);
+ errorCode=U_PARSE_ERROR;
+ return nullptr;
+ }
+ props=&cpProps;
+ break;
+ default:
+ // Will not occur because of the range check above.
+ errorCode=U_ILLEGAL_ARGUMENT_ERROR;
+ return nullptr;
+ }
+ props->start=start;
+ props->end=end;
+ // Apply the remaining fields of the line, one property per field.
+ while((field=nextField())!=nullptr) {
+ if(!parseProperty(*props, field, newValues, errorCode)) { return nullptr; }
+ }
+ if(lineType==BLOCK_LINE) {
+ // Keep the block's set for later unassigned lines inside this block.
+ blockValues=newValues;
+ } else if(lineType==UNASSIGNED_LINE && insideBlock) {
+ // Unset newValues for values that are the same as the block values.
+ for(int32_t prop=0; prop<UCHAR_BINARY_LIMIT; ++prop) {
+ if(newValues.contains(prop) && cpProps.binProps[prop]==blockProps.binProps[prop]) {
+ newValues.remove(prop);
+ }
+ }
+ for(int32_t prop=UCHAR_INT_START; prop<UCHAR_INT_LIMIT; ++prop) {
+ int32_t index=prop-UCHAR_INT_START;
+ if(newValues.contains(prop) && cpProps.intProps[index]==blockProps.intProps[index]) {
+ newValues.remove(prop);
+ }
+ }
+ }
+ return props;
+}
+
+static const struct {
+ const char *name;
+ int32_t prop;
+} ppucdProperties[]={
+ { "Name_Alias", PPUCD_NAME_ALIAS },
+ { "Conditional_Case_Mappings", PPUCD_CONDITIONAL_CASE_MAPPINGS },
+ { "Turkic_Case_Folding", PPUCD_TURKIC_CASE_FOLDING }
+};
+
+/*
+ * Parses one property field of the current line and stores its value in props.
+ *
+ * Field syntax, as distinguished below:
+ *   "-Name"       binary property with value false
+ *   "Name"        binary property with value true
+ *   "Name=value"  any other property
+ * The name is resolved via pnames->getPropertyEnum() and, if that fails,
+ * via the ppucdProperties[] table; names found in neither are ignored.
+ *
+ * When a value was stored without error, the property is added to newValues.
+ * On a syntax or value error, a message is printed to stderr and errorCode
+ * is set to U_PARSE_ERROR.
+ *
+ * Returns true for "ok to continue parsing fields".
+ */
+UBool
+PreparsedUCD::parseProperty(UniProps &props, const char *field, UnicodeSet &newValues,
+ UErrorCode &errorCode) {
+ CharString pBuffer;
+ const char *p=field;
+ const char *v=strchr(p, '=');
+ int binaryValue; // 0 or 1 for binary-property syntax, -1 for "Name=value"
+ if(*p=='-') {
+ if(v!=nullptr) {
+ fprintf(stderr,
+ "error in preparsed UCD: mix of binary-property-no and "
+ "enum-property syntax '%s' on line %ld\n",
+ field, (long)lineNumber);
+ errorCode=U_PARSE_ERROR;
+ return false;
+ }
+ binaryValue=0;
+ ++p; // skip the '-' so that p points at the property name
+ } else if(v==nullptr) {
+ binaryValue=1;
+ } else {
+ binaryValue=-1;
+ // Copy out the property name rather than modifying the field (writing a NUL).
+ pBuffer.append(p, (int32_t)(v-p), errorCode);
+ p=pBuffer.data();
+ ++v; // v now points at the value text after the '='
+ }
+ int32_t prop=pnames->getPropertyEnum(p);
+ if(prop<0) {
+ // Not known to pnames: look for one of the ppucd-specific additions.
+ for(int32_t i=0;; ++i) {
+ if(i==UPRV_LENGTHOF(ppucdProperties)) {
+ // Ignore unknown property names.
+ return true;
+ }
+ if(0==uprv_stricmp(p, ppucdProperties[i].name)) {
+ prop=ppucdProperties[i].prop;
+ U_ASSERT(prop>=0);
+ break;
+ }
+ }
+ }
+ if(prop<UCHAR_BINARY_LIMIT) {
+ if(binaryValue>=0) {
+ props.binProps[prop]=(UBool)binaryValue;
+ } else {
+ // No binary value for a binary property.
+ fprintf(stderr,
+ "error in preparsed UCD: enum-property syntax '%s' "
+ "for binary property on line %ld\n",
+ field, (long)lineNumber);
+ errorCode=U_PARSE_ERROR;
+ }
+ } else if(binaryValue>=0) {
+ // Binary value for a non-binary property.
+ fprintf(stderr,
+ "error in preparsed UCD: binary-property syntax '%s' "
+ "for non-binary property on line %ld\n",
+ field, (long)lineNumber);
+ errorCode=U_PARSE_ERROR;
+ } else if (prop < UCHAR_INT_START) { // between UCHAR_BINARY_LIMIT and UCHAR_INT_START: rejected
+ fprintf(stderr,
+ "error in preparsed UCD: prop value is invalid: '%d' for line %ld\n",
+ prop, (long)lineNumber);
+ errorCode=U_PARSE_ERROR;
+ } else if(prop<UCHAR_INT_LIMIT) {
+ int32_t value=pnames->getPropertyValueEnum(prop, v);
+ if(value==UCHAR_INVALID_CODE && prop==UCHAR_CANONICAL_COMBINING_CLASS) {
+ // TODO: Make getPropertyValueEnum(UCHAR_CANONICAL_COMBINING_CLASS, v) work.
+ char *end;
+ unsigned long ccc=uprv_strtoul(v, &end, 10);
+ if(v<end && *end==0 && ccc<=254) { // the whole value must be a decimal number, at most 254
+ value=(int32_t)ccc;
+ }
+ }
+ if(value==UCHAR_INVALID_CODE) {
+ fprintf(stderr,
+ "error in preparsed UCD: '%s' is not a valid value on line %ld\n",
+ field, (long)lineNumber);
+ errorCode=U_PARSE_ERROR;
+ } else {
+ props.intProps[prop-UCHAR_INT_START]=value;
+ }
+ } else if(*v=='<') {
+ // Do not parse default values like <code point>, just set null values.
+ switch(prop) {
+ case UCHAR_BIDI_MIRRORING_GLYPH:
+ props.bmg=U_SENTINEL;
+ break;
+ case UCHAR_BIDI_PAIRED_BRACKET:
+ props.bpb=U_SENTINEL;
+ break;
+ case UCHAR_SIMPLE_CASE_FOLDING:
+ props.scf=U_SENTINEL;
+ break;
+ case UCHAR_SIMPLE_LOWERCASE_MAPPING:
+ props.slc=U_SENTINEL;
+ break;
+ case UCHAR_SIMPLE_TITLECASE_MAPPING:
+ props.stc=U_SENTINEL;
+ break;
+ case UCHAR_SIMPLE_UPPERCASE_MAPPING:
+ props.suc=U_SENTINEL;
+ break;
+ case UCHAR_CASE_FOLDING:
+ props.cf.remove();
+ break;
+ case UCHAR_LOWERCASE_MAPPING:
+ props.lc.remove();
+ break;
+ case UCHAR_TITLECASE_MAPPING:
+ props.tc.remove();
+ break;
+ case UCHAR_UPPERCASE_MAPPING:
+ props.uc.remove();
+ break;
+ case UCHAR_SCRIPT_EXTENSIONS:
+ props.scx.clear();
+ break;
+ default:
+ fprintf(stderr,
+ "error in preparsed UCD: '%s' is not a valid default value on line %ld\n",
+ field, (long)lineNumber);
+ errorCode=U_PARSE_ERROR;
+ }
+ } else {
+ char c;
+ switch(prop) {
+ case UCHAR_NUMERIC_VALUE:
+ props.numericValue=v; // aliases the field text, no copy is made
+ c=*v;
+ if('0'<=c && c<='9' && v[1]==0) { // a single ASCII digit also yields a digit value
+ props.digitValue=c-'0';
+ } else {
+ props.digitValue=-1;
+ }
+ break;
+ case UCHAR_NAME:
+ props.name=v; // aliases the field text, no copy is made
+ break;
+ case UCHAR_AGE:
+ u_versionFromString(props.age, v); // Writes 0.0.0.0 if v is not numeric.
+ break;
+ case UCHAR_BIDI_MIRRORING_GLYPH:
+ props.bmg=parseCodePoint(v, errorCode);
+ break;
+ case UCHAR_BIDI_PAIRED_BRACKET:
+ props.bpb=parseCodePoint(v, errorCode);
+ break;
+ case UCHAR_SIMPLE_CASE_FOLDING:
+ props.scf=parseCodePoint(v, errorCode);
+ break;
+ case UCHAR_SIMPLE_LOWERCASE_MAPPING:
+ props.slc=parseCodePoint(v, errorCode);
+ break;
+ case UCHAR_SIMPLE_TITLECASE_MAPPING:
+ props.stc=parseCodePoint(v, errorCode);
+ break;
+ case UCHAR_SIMPLE_UPPERCASE_MAPPING:
+ props.suc=parseCodePoint(v, errorCode);
+ break;
+ case UCHAR_CASE_FOLDING:
+ parseString(v, props.cf, errorCode);
+ break;
+ case UCHAR_LOWERCASE_MAPPING:
+ parseString(v, props.lc, errorCode);
+ break;
+ case UCHAR_TITLECASE_MAPPING:
+ parseString(v, props.tc, errorCode);
+ break;
+ case UCHAR_UPPERCASE_MAPPING:
+ parseString(v, props.uc, errorCode);
+ break;
+ case PPUCD_NAME_ALIAS:
+ props.nameAlias=v; // aliases the field text, no copy is made
+ break;
+ case PPUCD_CONDITIONAL_CASE_MAPPINGS:
+ case PPUCD_TURKIC_CASE_FOLDING:
+ // No need to parse their values: They are hardcoded in the runtime library.
+ break;
+ case UCHAR_SCRIPT_EXTENSIONS:
+ parseScriptExtensions(v, props.scx, errorCode);
+ break;
+ default:
+ // Ignore unhandled properties.
+ return true; // note: the property is not added to newValues on this path
+ }
+ }
+ if(U_SUCCESS(errorCode)) {
+ newValues.add((UChar32)prop);
+ return true;
+ } else {
+ return false;
+ }
+}
+
+// Reads the range field (the field after the line type) of the current
+// algnamesrange line and parses it into start and end.
+// Sets U_ILLEGAL_ARGUMENT_ERROR if the current line is of a different type,
+// and U_PARSE_ERROR if the line has no second field.
+// Returns true if the range was parsed.
+UBool
+PreparsedUCD::getRangeForAlgNames(UChar32 &start, UChar32 &end, UErrorCode &errorCode) {
+    if(U_FAILURE(errorCode)) {
+        return false;
+    }
+    if(lineType==ALG_NAMES_RANGE_LINE) {
+        // Rewind to the line type field, then step to the field after it.
+        firstField();
+        const char *rangeField=nextField();
+        if(rangeField!=nullptr) {
+            return parseCodePointRange(rangeField, start, end, errorCode);
+        }
+        // No range field after the type.
+        fprintf(stderr,
+                "error in preparsed UCD: missing algnamesrange range field "
+                "(no second field) on line %ld\n",
+                (long)lineNumber);
+        errorCode=U_PARSE_ERROR;
+    } else {
+        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
+    }
+    return false;
+}
+
+// Parses s as a hexadecimal code point.
+// The whole string must be consumed and the value must be below 0x110000.
+// Otherwise prints an error message, sets errorCode to U_PARSE_ERROR
+// and returns U_SENTINEL.
+UChar32
+PreparsedUCD::parseCodePoint(const char *s, UErrorCode &errorCode) {
+    char *end;
+    // Range-check the full-width strtoul() result. Narrowing it to 32 bits
+    // first would let an over-long value such as "100000041" wrap around to
+    // a small number and pass as a valid code point where unsigned long is
+    // wider than 32 bits.
+    unsigned long value=uprv_strtoul(s, &end, 16);
+    if(end<=s || *end!=0 || value>=0x110000) {
+        fprintf(stderr,
+                "error in preparsed UCD: '%s' is not a valid code point on line %ld\n",
+                s, (long)lineNumber);
+        errorCode=U_PARSE_ERROR;
+        return U_SENTINEL;
+    }
+    return (UChar32)value;
+}
+
+// Parses s with u_parseCodePointRange() and stores the result in start and end.
+// On failure, prints an error message, leaves start and end untouched
+// and returns false; errorCode is whatever u_parseCodePointRange() left in it.
+UBool
+PreparsedUCD::parseCodePointRange(const char *s, UChar32 &start, UChar32 &end, UErrorCode &errorCode) {
+    uint32_t first, last;
+    u_parseCodePointRange(s, &first, &last, &errorCode);
+    if(U_SUCCESS(errorCode)) {
+        start=(UChar32)first;
+        end=(UChar32)last;
+        return true;
+    }
+    fprintf(stderr,
+            "error in preparsed UCD: '%s' is not a valid code point range on line %ld\n",
+            s, (long)lineNumber);
+    return false;
+}
+
+// Parses s with u_parseString() directly into the buffer of uni.
+// If the string's current capacity is too small, the buffer is re-acquired
+// with the length reported by the first attempt and the parse is repeated.
+// Prints an error message if errorCode indicates a failure at the end.
+void
+PreparsedUCD::parseString(const char *s, UnicodeString &uni, UErrorCode &errorCode) {
+    // First attempt: write into whatever capacity the string already has.
+    char16_t *dest=toUCharPtr(uni.getBuffer(-1));
+    int32_t parsedLength=u_parseString(s, dest, uni.getCapacity(), nullptr, &errorCode);
+    if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
+        // Second attempt: ask for a buffer of the reported length.
+        uni.releaseBuffer(0);
+        errorCode=U_ZERO_ERROR;
+        dest=toUCharPtr(uni.getBuffer(parsedLength));
+        parsedLength=u_parseString(s, dest, uni.getCapacity(), nullptr, &errorCode);
+    }
+    uni.releaseBuffer(parsedLength);
+    if(U_SUCCESS(errorCode)) {
+        return;
+    }
+    fprintf(stderr,
+            "error in preparsed UCD: '%s' is not a valid Unicode string on line %ld\n",
+            s, (long)lineNumber);
+}
+
+// Parses a space-separated list of script codes from s into scx.
+// scx is cleared first. Each code is looked up as a value of UCHAR_SCRIPT
+// via pnames. An unknown code, a repeated code, or an empty result is
+// reported on stderr and sets errorCode to U_PARSE_ERROR.
+void
+PreparsedUCD::parseScriptExtensions(const char *s, UnicodeSet &scx, UErrorCode &errorCode) {
+    if(U_FAILURE(errorCode)) {
+        return;
+    }
+    scx.clear();
+    CharString token;
+    const char *rest=s;
+    do {
+        const char *space=strchr(rest, ' ');
+        const char *code=rest;
+        if(space!=nullptr) {
+            // Not the last code: copy it out so that it is NUL-terminated.
+            token.clear().append(rest, (int32_t)(space-rest), errorCode);
+            code=token.data();
+            if(U_FAILURE(errorCode)) {
+                return;
+            }
+        }
+        int32_t script=pnames->getPropertyValueEnum(UCHAR_SCRIPT, code);
+        if(script==UCHAR_INVALID_CODE) {
+            fprintf(stderr,
+                    "error in preparsed UCD: '%s' is not a valid script code on line %ld\n",
+                    code, (long)lineNumber);
+            errorCode=U_PARSE_ERROR;
+            return;
+        }
+        if(scx.contains(script)) {
+            fprintf(stderr,
+                    "error in preparsed UCD: scx has duplicate '%s' codes on line %ld\n",
+                    code, (long)lineNumber);
+            errorCode=U_PARSE_ERROR;
+            return;
+        }
+        scx.add(script);
+        // Continue after the space, or stop after the last code.
+        rest= space!=nullptr ? space+1 : nullptr;
+    } while(rest!=nullptr);
+    if(scx.isEmpty()) {
+        fprintf(stderr, "error in preparsed UCD: empty scx= on line %ld\n", (long)lineNumber);
+        errorCode=U_PARSE_ERROR;
+    }
+}
+
+U_NAMESPACE_END
diff --git a/intl/icu/source/tools/toolutil/ppucd.h b/intl/icu/source/tools/toolutil/ppucd.h
new file mode 100644
index 0000000000..d5c63fab49
--- /dev/null
+++ b/intl/icu/source/tools/toolutil/ppucd.h
@@ -0,0 +1,180 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/*
+*******************************************************************************
+* Copyright (C) 2011-2013, International Business Machines
+* Corporation and others. All Rights Reserved.
+*******************************************************************************
+* file name: ppucd.h
+* encoding: UTF-8
+* tab size: 8 (not used)
+* indentation:4
+*
+* created on: 2011dec11
+* created by: Markus W. Scherer
+*/
+
+#ifndef __PPUCD_H__
+#define __PPUCD_H__
+
+#include "unicode/utypes.h"
+#include "unicode/uniset.h"
+#include "unicode/unistr.h"
+
+#include <stdio.h>
+
+/**
+ * Additions to the uchar.h enum UProperty.
+ * The values continue from UCHAR_STRING_LIMIT.
+ */
+enum {
+ /** Name_Alias */
+ PPUCD_NAME_ALIAS=UCHAR_STRING_LIMIT,
+ PPUCD_CONDITIONAL_CASE_MAPPINGS, /**< Conditional_Case_Mappings */
+ PPUCD_TURKIC_CASE_FOLDING /**< Turkic_Case_Folding */
+};
+
+U_NAMESPACE_BEGIN
+
+class U_TOOLUTIL_API PropertyNames { // Abstract name-to-enum lookup; see PreparsedUCD::setPropertyNames().
+public:
+ virtual ~PropertyNames();
+ virtual int32_t getPropertyEnum(const char *name) const = 0; // PreparsedUCD treats a negative result as "name not known".
+ virtual int32_t getPropertyValueEnum(int32_t property, const char *name) const = 0; // PreparsedUCD treats UCHAR_INVALID_CODE as "not a valid value name".
+};
+
+struct U_TOOLUTIL_API UniProps { // Property values as filled in by PreparsedUCD from property fields.
+ UniProps();
+ ~UniProps();
+
+ int32_t getIntProp(int32_t prop) const { return intProps[prop-UCHAR_INT_START]; } // no range check: prop must be in UCHAR_INT_START..UCHAR_INT_LIMIT-1
+
+ UChar32 start, end; // presumably the code point range these values apply to -- confirm in ppucd.cpp
+ UBool binProps[UCHAR_BINARY_LIMIT]; // indexed by binary property enum
+ int32_t intProps[UCHAR_INT_LIMIT-UCHAR_INT_START]; // indexed by (property enum - UCHAR_INT_START)
+ UVersionInfo age;
+ UChar32 bmg, bpb; // Bidi_Mirroring_Glyph, Bidi_Paired_Bracket; U_SENTINEL for a default value like <code point>
+ UChar32 scf, slc, stc, suc; // simple case folding and lower/title/upper mappings; U_SENTINEL for a default value
+ int32_t digitValue; // 0..9 if the numeric value is a single ASCII digit, otherwise -1
+ const char *numericValue; // aliased: points into the text of the parsed line
+ const char *name; // aliased: points into the text of the parsed line
+ const char *nameAlias; // aliased: points into the text of the parsed line
+ UnicodeString cf, lc, tc, uc; // full case folding and lower/title/upper mappings; emptied for a default value
+ UnicodeSet scx; // Script_Extensions as a set of script codes
+};
+
+class U_TOOLUTIL_API PreparsedUCD { // Line-by-line reader for a preparsed UCD file.
+public:
+ enum LineType {
+ /** No line, end of file. */
+ NO_LINE,
+ /** Empty line. (Might contain a comment.) */
+ EMPTY_LINE,
+
+ /** ucd;6.1.0 */
+ UNICODE_VERSION_LINE,
+
+ /** property;Binary;Alpha;Alphabetic */
+ PROPERTY_LINE,
+ /** binary;N;No;F;False */
+ BINARY_LINE,
+ /** value;gc;Zs;Space_Separator */
+ VALUE_LINE,
+
+ /** defaults;0000..10FFFF;age=NA;bc=L;... */
+ DEFAULTS_LINE,
+ /** block;0000..007F;age=1.1;blk=ASCII;ea=Na;... */
+ BLOCK_LINE,
+ /** cp;0030;AHex;bc=EN;gc=Nd;na=DIGIT ZERO;... */
+ CP_LINE,
+ /** unassigned;E01F0..E0FFF;bc=BN;CWKCF;DI;GCB=CN;NFKC_CF= */
+ UNASSIGNED_LINE,
+
+ /** algnamesrange;4E00..9FCC;han;CJK UNIFIED IDEOGRAPH- */
+ ALG_NAMES_RANGE_LINE,
+
+ LINE_TYPE_COUNT
+ };
+
+ /**
+ * Constructor.
+ * Takes the name of the preparsed UCD file to be read.
+ * NOTE(review): presumably opens the file and reports a failure via errorCode;
+ * the implementation is in ppucd.cpp -- confirm there.
+ */
+ PreparsedUCD(const char *filename, UErrorCode &errorCode);
+
+ /** Destructor. */
+ ~PreparsedUCD();
+
+ /** Sets (aliases) a PropertyNames implementation. Caller retains ownership. */
+ void setPropertyNames(const PropertyNames *pn) { pnames=pn; }
+
+ /**
+ * Reads a line from the preparsed UCD file.
+ * Splits the line by replacing each ';' with a NUL.
+ */
+ LineType readLine(UErrorCode &errorCode);
+
+ /** Returns the number of the line read by readLine(). */
+ int32_t getLineNumber() const { return lineNumber; }
+
+ /** Returns the line's next field, or nullptr. */
+ const char *nextField();
+
+ /** Returns the Unicode version when or after the UNICODE_VERSION_LINE has been read. */
+ const UVersionInfo &getUnicodeVersion() const { return ucdVersion; }
+
+ /**
+ * Returns true if the current line has property values,
+ * that is, if its type is one of DEFAULTS_LINE, BLOCK_LINE, CP_LINE or UNASSIGNED_LINE.
+ */
+ UBool lineHasPropertyValues() const {
+ return DEFAULTS_LINE<=lineType && lineType<=UNASSIGNED_LINE;
+ }
+
+ /**
+ * Parses properties from the current line.
+ * Clears newValues and sets UProperty codes for property values mentioned
+ * on the current line (as opposed to being inherited).
+ * Returns a pointer to the filled-in UniProps, or nullptr if something went wrong.
+ * The returned UniProps are usable until the next line of the same type is read.
+ */
+ const UniProps *getProps(UnicodeSet &newValues, UErrorCode &errorCode);
+
+ /**
+ * Returns the code point range for the current algnamesrange line.
+ * Calls & parses nextField().
+ * Further nextField() calls will yield the range's type & prefix string.
+ * Returns U_SUCCESS(errorCode).
+ * Sets U_ILLEGAL_ARGUMENT_ERROR if the current line is not an ALG_NAMES_RANGE_LINE.
+ */
+ UBool getRangeForAlgNames(UChar32 &start, UChar32 &end, UErrorCode &errorCode);
+
+private:
+ UBool isLineBufferAvailable(int32_t i) { // true if i is neither defaultLineIndex nor blockLineIndex
+ return defaultLineIndex!=i && blockLineIndex!=i;
+ }
+
+ /** Resets the field iterator and returns the line's first field (the line type field). */
+ const char *firstField();
+
+ UBool parseProperty(UniProps &props, const char *field, UnicodeSet &newValues,
+ UErrorCode &errorCode);
+ UChar32 parseCodePoint(const char *s, UErrorCode &errorCode);
+ UBool parseCodePointRange(const char *s, UChar32 &start, UChar32 &end, UErrorCode &errorCode);
+ void parseString(const char *s, UnicodeString &uni, UErrorCode &errorCode);
+ void parseScriptExtensions(const char *s, UnicodeSet &scx, UErrorCode &errorCode);
+
+ static const int32_t kNumLineBuffers=3;
+
+ const PropertyNames *pnames; // aliased
+ FILE *file;
+ int32_t defaultLineIndex, blockLineIndex, lineIndex; // presumably indexes into lines[] -- confirm in ppucd.cpp
+ int32_t lineNumber;
+ LineType lineType;
+ char *fieldLimit;
+ char *lineLimit;
+
+ UVersionInfo ucdVersion;
+ UniProps defaultProps, blockProps, cpProps;
+ UnicodeSet blockValues;
+ // Multiple lines so that default and block properties can maintain pointers
+ // into their line buffers.
+ char lines[kNumLineBuffers][4096];
+};
+
+U_NAMESPACE_END
+
+#endif // __PPUCD_H__
diff --git a/intl/icu/source/tools/toolutil/sources.txt b/intl/icu/source/tools/toolutil/sources.txt
new file mode 100644
index 0000000000..d3288997e2
--- /dev/null
+++ b/intl/icu/source/tools/toolutil/sources.txt
@@ -0,0 +1,24 @@
+collationinfo.cpp
+dbgutil.cpp
+denseranges.cpp
+filestrm.cpp
+filetools.cpp
+flagparser.cpp
+package.cpp
+pkg_genc.cpp
+pkg_gencmn.cpp
+pkg_icu.cpp
+pkgitems.cpp
+ppucd.cpp
+swapimpl.cpp
+toolutil.cpp
+ucbuf.cpp
+ucln_tu.cpp
+ucm.cpp
+ucmstate.cpp
+udbgutil.cpp
+unewdata.cpp
+uoptions.cpp
+uparse.cpp
+writesrc.cpp
+xmlparser.cpp
diff --git a/intl/icu/source/tools/toolutil/swapimpl.cpp b/intl/icu/source/tools/toolutil/swapimpl.cpp
new file mode 100644
index 0000000000..9c58563965
--- /dev/null
+++ b/intl/icu/source/tools/toolutil/swapimpl.cpp
@@ -0,0 +1,1048 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/*
+*******************************************************************************
+*
+* Copyright (C) 2005-2014, International Business Machines
+* Corporation and others. All Rights Reserved.
+*
+*******************************************************************************
+* file name: swapimpl.cpp
+* encoding: UTF-8
+* tab size: 8 (not used)
+* indentation:4
+*
+* created on: 2005may05
+* created by: Markus W. Scherer
+*
+* Data file swapping functions moved here from the common library
+* because some data is hardcoded in ICU4C and needs not be swapped any more.
+* Moving the functions here simplifies testing (for code coverage) because
+* we need not jump through hoops (like adding snapshots of these files
+* to testdata).
+*
+* The declarations for these functions remain in the internal header files
+* in icu/source/common/
+*/
+
+#include "unicode/utypes.h"
+#include "unicode/putil.h"
+#include "unicode/udata.h"
+
+/* Explicit include statement for std_string.h is needed
+ * for compilation on certain platforms. (e.g. AIX/VACPP)
+ */
+#include "unicode/std_string.h"
+
+#include "cmemory.h"
+#include "cstring.h"
+#include "uinvchar.h"
+#include "uassert.h"
+#include "uarrsort.h"
+#include "ucmndata.h"
+#include "udataswp.h"
+#include "ulayout_props.h"
+
+/* swapping implementations in common */
+
+#include "emojiprops.h"
+#include "uresdata.h"
+#include "ucnv_io.h"
+#include "uprops.h"
+#include "ucase.h"
+#include "ubidi_props.h"
+#include "ucol_swp.h"
+#include "ucnv_bld.h"
+#include "unormimp.h"
+#include "normalizer2impl.h"
+#include "sprpimpl.h"
+#include "propname.h"
+#include "rbbidata.h"
+#include "utrie.h"
+#include "utrie2.h"
+#include "dictionarydata.h"
+
+/* swapping implementations in i18n */
+
+#if !UCONFIG_NO_NORMALIZATION
+#include "uspoof_impl.h"
+#endif
+
+U_NAMESPACE_USE
+
+/* definitions */
+
+/* Unicode property (value) aliases data swapping ---------------------------
+ *
+ * upname_swap() swaps pnames.icu data (dataFormat "pnam", formatVersion 2).
+ * The indexes[] and valueMaps[], that is, all bytes before the
+ * IX_BYTE_TRIES_OFFSET offset, are swapped as 32-bit units.
+ * The remaining bytes up to IX_TOTAL_SIZE are copied unchanged.
+ * If length is negative, this function writes nothing after the header
+ * and only computes the size.
+ * Returns headerSize+totalSize, or 0 on failure.
+ */
+
+static int32_t U_CALLCONV
+upname_swap(const UDataSwapper *ds,
+ const void *inData, int32_t length, void *outData,
+ UErrorCode *pErrorCode) {
+ /* udata_swapDataHeader checks the arguments */
+ int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
+ if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) {
+ return 0;
+ }
+
+ /* check data format and format version */
+ const UDataInfo *pInfo=
+ reinterpret_cast<const UDataInfo *>(
+ static_cast<const char *>(inData)+4); // the UDataInfo is read at byte offset 4 of the header
+ if(!(
+ pInfo->dataFormat[0]==0x70 && /* dataFormat="pnam" */
+ pInfo->dataFormat[1]==0x6e &&
+ pInfo->dataFormat[2]==0x61 &&
+ pInfo->dataFormat[3]==0x6d &&
+ pInfo->formatVersion[0]==2
+ )) {
+ udata_printError(ds, "upname_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as pnames.icu\n",
+ pInfo->dataFormat[0], pInfo->dataFormat[1],
+ pInfo->dataFormat[2], pInfo->dataFormat[3],
+ pInfo->formatVersion[0]);
+ *pErrorCode=U_UNSUPPORTED_ERROR;
+ return 0;
+ }
+
+ const uint8_t *inBytes=static_cast<const uint8_t *>(inData)+headerSize;
+ uint8_t *outBytes=static_cast<uint8_t *>(outData)+headerSize;
+
+ if(length>=0) {
+ length-=headerSize; // from here on, length counts the bytes after the header
+ // formatVersion 2 initially has indexes[8], 32 bytes.
+ if(length<32) {
+ udata_printError(ds, "upname_swap(): too few bytes (%d after header) for pnames.icu\n",
+ (int)length);
+ *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+ return 0;
+ }
+ }
+
+ const int32_t *inIndexes=reinterpret_cast<const int32_t *>(inBytes);
+ int32_t totalSize=udata_readInt32(ds, inIndexes[PropNameData::IX_TOTAL_SIZE]);
+ if(length>=0) {
+ if(length<totalSize) {
+ udata_printError(ds, "upname_swap(): too few bytes (%d after header, should be %d) "
+ "for pnames.icu\n",
+ (int)length, (int)totalSize);
+ *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+ return 0;
+ }
+
+ int32_t numBytesIndexesAndValueMaps=
+ udata_readInt32(ds, inIndexes[PropNameData::IX_BYTE_TRIES_OFFSET]); // NOTE(review): used as a byte count without a range check against totalSize
+
+ // Swap the indexes[] and the valueMaps[].
+ ds->swapArray32(ds, inBytes, numBytesIndexesAndValueMaps, outBytes, pErrorCode);
+
+ // Copy the rest of the data.
+ if(inBytes!=outBytes) {
+ uprv_memcpy(outBytes+numBytesIndexesAndValueMaps,
+ inBytes+numBytesIndexesAndValueMaps,
+ totalSize-numBytesIndexesAndValueMaps);
+ }
+
+ // We need not swap anything else:
+ //
+ // The ByteTries are already byte-serialized, and are fixed on ASCII.
+ // (On an EBCDIC machine, the input string is converted to lowercase ASCII
+ // while matching.)
+ //
+ // The name groups are mostly invariant characters, but since we only
+ // generate, and keep in subversion, ASCII versions of pnames.icu,
+ // and since only ICU4J uses the pnames.icu data file
+ // (the data is hardcoded in ICU4C) and ICU4J uses ASCII data files,
+ // we just copy those bytes too.
+ }
+
+ return headerSize+totalSize;
+}
+
+/* Unicode properties data swapping -----------------------------------------
+ *
+ * uprops_swap() swaps Unicode properties data (dataFormat "UPro",
+ * formatVersion 3..7). It reads the UPROPS_INDEX_COUNT indexes, which are
+ * offsets in 32-bit units, and swaps each section between consecutive
+ * offsets with the function that matches its unit size.
+ * If length is negative, this function writes nothing after the header
+ * and only computes the size.
+ * Returns the header size plus 4 times the UPROPS_RESERVED_INDEX_7 index,
+ * or 0 on failure.
+ */
+
+static int32_t U_CALLCONV
+uprops_swap(const UDataSwapper *ds,
+ const void *inData, int32_t length, void *outData,
+ UErrorCode *pErrorCode) {
+ const UDataInfo *pInfo;
+ int32_t headerSize, i;
+
+ int32_t dataIndexes[UPROPS_INDEX_COUNT];
+ const int32_t *inData32;
+
+ /* udata_swapDataHeader checks the arguments */
+ headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
+ if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) {
+ return 0;
+ }
+
+ /* check data format and format version */
+ pInfo=(const UDataInfo *)((const char *)inData+4);
+ if(!(
+ pInfo->dataFormat[0]==0x55 && /* dataFormat="UPro" */
+ pInfo->dataFormat[1]==0x50 &&
+ pInfo->dataFormat[2]==0x72 &&
+ pInfo->dataFormat[3]==0x6f &&
+ (3<=pInfo->formatVersion[0] && pInfo->formatVersion[0]<=7) &&
+ (pInfo->formatVersion[0]>=7 || /* the UTrie shift values are only checked before formatVersion 7 */
+ (pInfo->formatVersion[2]==UTRIE_SHIFT &&
+ pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT))
+ )) {
+ udata_printError(ds, "uprops_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not a Unicode properties file\n",
+ pInfo->dataFormat[0], pInfo->dataFormat[1],
+ pInfo->dataFormat[2], pInfo->dataFormat[3],
+ pInfo->formatVersion[0]);
+ *pErrorCode=U_UNSUPPORTED_ERROR;
+ return 0;
+ }
+
+ /* the properties file must contain at least the indexes array */
+ if(length>=0 && (length-headerSize)<(int32_t)sizeof(dataIndexes)) {
+ udata_printError(ds, "uprops_swap(): too few bytes (%d after header) for a Unicode properties file\n",
+ length-headerSize);
+ *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+ return 0;
+ }
+
+ /* read the indexes */
+ inData32=(const int32_t *)((const char *)inData+headerSize);
+ for(i=0; i<UPROPS_INDEX_COUNT; ++i) {
+ dataIndexes[i]=udata_readInt32(ds, inData32[i]);
+ }
+
+ /*
+ * comments are copied from the data format description in genprops/store.c
+ * indexes[] constants are in uprops.h
+ */
+ int32_t dataTop;
+ if(length>=0) {
+ int32_t *outData32;
+
+ /*
+ * In formatVersion 7, UPROPS_DATA_TOP_INDEX has the post-header data size.
+ * In earlier formatVersions, it is 0 and a lower dataIndexes entry
+ * has the top of the last item.
+ * The loop below walks down from UPROPS_DATA_TOP_INDEX and stops at the
+ * first non-zero entry (or at index 1), leaving its value in dataTop.
+ */
+ for(i=UPROPS_DATA_TOP_INDEX; i>0 && (dataTop=dataIndexes[i])==0; --i) {}
+
+ if((length-headerSize)<(4*dataTop)) {
+ udata_printError(ds, "uprops_swap(): too few bytes (%d after header) for a Unicode properties file\n",
+ length-headerSize);
+ *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+ return 0;
+ }
+
+ outData32=(int32_t *)((char *)outData+headerSize);
+
+ /* copy everything for inaccessible data (padding) */
+ if(inData32!=outData32) {
+ uprv_memcpy(outData32, inData32, 4*(size_t)dataTop);
+ }
+
+ /* swap the indexes[16] */
+ ds->swapArray32(ds, inData32, 4*UPROPS_INDEX_COUNT, outData32, pErrorCode);
+
+ /*
+ * swap the main properties UTrie
+ * PT serialized properties trie, see utrie.h (byte size: 4*(i0-16))
+ */
+ utrie_swapAnyVersion(ds,
+ inData32+UPROPS_INDEX_COUNT,
+ 4*(dataIndexes[UPROPS_PROPS32_INDEX]-UPROPS_INDEX_COUNT),
+ outData32+UPROPS_INDEX_COUNT,
+ pErrorCode);
+
+ /*
+ * swap the properties and exceptions words
+ * P const uint32_t props32[i1-i0];
+ * E const uint32_t exceptions[i2-i1];
+ */
+ ds->swapArray32(ds,
+ inData32+dataIndexes[UPROPS_PROPS32_INDEX],
+ 4*(dataIndexes[UPROPS_EXCEPTIONS_TOP_INDEX]-dataIndexes[UPROPS_PROPS32_INDEX]),
+ outData32+dataIndexes[UPROPS_PROPS32_INDEX],
+ pErrorCode);
+
+ /*
+ * swap the UChars
+ * U const char16_t uchars[2*(i3-i2)];
+ */
+ ds->swapArray16(ds,
+ inData32+dataIndexes[UPROPS_EXCEPTIONS_TOP_INDEX],
+ 4*(dataIndexes[UPROPS_ADDITIONAL_TRIE_INDEX]-dataIndexes[UPROPS_EXCEPTIONS_TOP_INDEX]),
+ outData32+dataIndexes[UPROPS_EXCEPTIONS_TOP_INDEX],
+ pErrorCode);
+
+ /*
+ * swap the additional UTrie
+ * i3 additionalTrieIndex; -- 32-bit unit index to the additional trie for more properties
+ */
+ utrie_swapAnyVersion(ds,
+ inData32+dataIndexes[UPROPS_ADDITIONAL_TRIE_INDEX],
+ 4*(dataIndexes[UPROPS_ADDITIONAL_VECTORS_INDEX]-dataIndexes[UPROPS_ADDITIONAL_TRIE_INDEX]),
+ outData32+dataIndexes[UPROPS_ADDITIONAL_TRIE_INDEX],
+ pErrorCode);
+
+ /*
+ * swap the properties vectors
+ * PV const uint32_t propsVectors[(i6-i4)/i5][i5]==uint32_t propsVectors[i6-i4];
+ */
+ ds->swapArray32(ds,
+ inData32+dataIndexes[UPROPS_ADDITIONAL_VECTORS_INDEX],
+ 4*(dataIndexes[UPROPS_SCRIPT_EXTENSIONS_INDEX]-dataIndexes[UPROPS_ADDITIONAL_VECTORS_INDEX]),
+ outData32+dataIndexes[UPROPS_ADDITIONAL_VECTORS_INDEX],
+ pErrorCode);
+
+ // swap the Script_Extensions data
+ // SCX const uint16_t scriptExtensions[2*(i7-i6)];
+ ds->swapArray16(ds,
+ inData32+dataIndexes[UPROPS_SCRIPT_EXTENSIONS_INDEX],
+ 4*(dataIndexes[UPROPS_RESERVED_INDEX_7]-dataIndexes[UPROPS_SCRIPT_EXTENSIONS_INDEX]),
+ outData32+dataIndexes[UPROPS_SCRIPT_EXTENSIONS_INDEX],
+ pErrorCode);
+ }
+
+ /* i7 reservedIndex7; -- 32-bit unit index to the top of the Script_Extensions data */
+ /* NOTE(review): the size returned here is based on index 7, while the bounds check and the copy above use dataTop -- confirm that both always agree. */
+ return headerSize+4*dataIndexes[UPROPS_RESERVED_INDEX_7];
+}
+
+/* Unicode case mapping data swapping ---------------------------------------
+ *
+ * ucase_swap() swaps case mapping data (dataFormat "cAsE", formatVersion 1..4).
+ * Sections are swapped in this order, directly after the header:
+ *   int32_t indexes[UCASE_IX_INDEX_TOP]
+ *   the trie, UCASE_IX_TRIE_SIZE bytes
+ *   uint16_t exceptions[] and unfold[]
+ * If length is negative, this function writes nothing after the header
+ * and only computes the size.
+ * Returns headerSize+indexes[UCASE_IX_LENGTH], or 0 on failure.
+ */
+
+static int32_t U_CALLCONV
+ucase_swap(const UDataSwapper *ds,
+ const void *inData, int32_t length, void *outData,
+ UErrorCode *pErrorCode) {
+ const UDataInfo *pInfo;
+ int32_t headerSize;
+
+ const uint8_t *inBytes;
+ uint8_t *outBytes;
+
+ const int32_t *inIndexes;
+ int32_t indexes[16];
+
+ int32_t i, offset, count, size;
+
+ /* udata_swapDataHeader checks the arguments */
+ headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
+ if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) {
+ return 0;
+ }
+
+ /* check data format and format version */
+ pInfo=(const UDataInfo *)((const char *)inData+4);
+ if(!(
+ pInfo->dataFormat[0]==UCASE_FMT_0 && /* dataFormat="cAsE" */
+ pInfo->dataFormat[1]==UCASE_FMT_1 &&
+ pInfo->dataFormat[2]==UCASE_FMT_2 &&
+ pInfo->dataFormat[3]==UCASE_FMT_3 &&
+ ((pInfo->formatVersion[0]==1 && /* the UTrie shift values are only checked for formatVersion 1 */
+ pInfo->formatVersion[2]==UTRIE_SHIFT &&
+ pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT) ||
+ (2<=pInfo->formatVersion[0] && pInfo->formatVersion[0]<=4))
+ )) {
+ udata_printError(ds, "ucase_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as case mapping data\n",
+ pInfo->dataFormat[0], pInfo->dataFormat[1],
+ pInfo->dataFormat[2], pInfo->dataFormat[3],
+ pInfo->formatVersion[0]);
+ *pErrorCode=U_UNSUPPORTED_ERROR;
+ return 0;
+ }
+
+ inBytes=(const uint8_t *)inData+headerSize;
+ outBytes=(uint8_t *)outData+headerSize;
+
+ inIndexes=(const int32_t *)inBytes;
+
+ if(length>=0) {
+ length-=headerSize; /* from here on, length counts the bytes after the header */
+ if(length<16*4) {
+ udata_printError(ds, "ucase_swap(): too few bytes (%d after header) for case mapping data\n",
+ length);
+ *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+ return 0;
+ }
+ }
+
+ /* read the first 16 indexes (ICU 3.2/format version 1: UCASE_IX_TOP==16, might grow) */
+ for(i=0; i<16; ++i) {
+ indexes[i]=udata_readInt32(ds, inIndexes[i]);
+ }
+
+ /* get the total length of the data */
+ size=indexes[UCASE_IX_LENGTH];
+
+ if(length>=0) {
+ if(length<size) {
+ udata_printError(ds, "ucase_swap(): too few bytes (%d after header) for all of case mapping data\n",
+ length);
+ *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+ return 0;
+ }
+
+ /* copy the data for inaccessible bytes */
+ if(inBytes!=outBytes) {
+ uprv_memcpy(outBytes, inBytes, size);
+ }
+
+ offset=0; /* running byte offset of the next section to swap */
+
+ /* swap the int32_t indexes[] */
+ count=indexes[UCASE_IX_INDEX_TOP]*4;
+ ds->swapArray32(ds, inBytes, count, outBytes, pErrorCode);
+ offset+=count;
+
+ /* swap the UTrie */
+ count=indexes[UCASE_IX_TRIE_SIZE];
+ utrie_swapAnyVersion(ds, inBytes+offset, count, outBytes+offset, pErrorCode);
+ offset+=count;
+
+ /* swap the uint16_t exceptions[] and unfold[] */
+ count=(indexes[UCASE_IX_EXC_LENGTH]+indexes[UCASE_IX_UNFOLD_LENGTH])*2;
+ ds->swapArray16(ds, inBytes+offset, count, outBytes+offset, pErrorCode);
+ offset+=count;
+
+ U_ASSERT(offset==size); /* the three sections are expected to account for the whole data */
+ }
+
+ return headerSize+size;
+}
+
+/* Unicode bidi/shaping data swapping ---------------------------------------
+ *
+ * ubidi_swap() swaps bidi/shaping data (dataFormat "BiDi", formatVersion 1 or 2).
+ * Sections are handled in this order, directly after the header:
+ *   int32_t indexes[UBIDI_IX_INDEX_TOP]   swapped
+ *   the trie, UBIDI_IX_TRIE_SIZE bytes    swapped
+ *   uint32_t mirrors[]                    swapped
+ *   uint8_t jgArray[] and jgArray2[]      skipped (only copied)
+ * If length is negative, this function writes nothing after the header
+ * and only computes the size.
+ * Returns headerSize+indexes[UBIDI_IX_LENGTH], or 0 on failure.
+ */
+
+static int32_t U_CALLCONV
+ubidi_swap(const UDataSwapper *ds,
+ const void *inData, int32_t length, void *outData,
+ UErrorCode *pErrorCode) {
+ const UDataInfo *pInfo;
+ int32_t headerSize;
+
+ const uint8_t *inBytes;
+ uint8_t *outBytes;
+
+ const int32_t *inIndexes;
+ int32_t indexes[16];
+
+ int32_t i, offset, count, size;
+
+ /* udata_swapDataHeader checks the arguments */
+ headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
+ if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) {
+ return 0;
+ }
+
+ /* check data format and format version */
+ pInfo=(const UDataInfo *)((const char *)inData+4);
+ if(!(
+ pInfo->dataFormat[0]==UBIDI_FMT_0 && /* dataFormat="BiDi" */
+ pInfo->dataFormat[1]==UBIDI_FMT_1 &&
+ pInfo->dataFormat[2]==UBIDI_FMT_2 &&
+ pInfo->dataFormat[3]==UBIDI_FMT_3 &&
+ ((pInfo->formatVersion[0]==1 && /* the UTrie shift values are only checked for formatVersion 1 */
+ pInfo->formatVersion[2]==UTRIE_SHIFT &&
+ pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT) ||
+ pInfo->formatVersion[0]==2)
+ )) {
+ udata_printError(ds, "ubidi_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as bidi/shaping data\n",
+ pInfo->dataFormat[0], pInfo->dataFormat[1],
+ pInfo->dataFormat[2], pInfo->dataFormat[3],
+ pInfo->formatVersion[0]);
+ *pErrorCode=U_UNSUPPORTED_ERROR;
+ return 0;
+ }
+
+ inBytes=(const uint8_t *)inData+headerSize;
+ outBytes=(uint8_t *)outData+headerSize;
+
+ inIndexes=(const int32_t *)inBytes;
+
+ if(length>=0) {
+ length-=headerSize; /* from here on, length counts the bytes after the header */
+ if(length<16*4) {
+ udata_printError(ds, "ubidi_swap(): too few bytes (%d after header) for bidi/shaping data\n",
+ length);
+ *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+ return 0;
+ }
+ }
+
+ /* read the first 16 indexes (ICU 3.4/format version 1: UBIDI_IX_TOP==16, might grow) */
+ for(i=0; i<16; ++i) {
+ indexes[i]=udata_readInt32(ds, inIndexes[i]);
+ }
+
+ /* get the total length of the data */
+ size=indexes[UBIDI_IX_LENGTH];
+
+ if(length>=0) {
+ if(length<size) {
+ udata_printError(ds, "ubidi_swap(): too few bytes (%d after header) for all of bidi/shaping data\n",
+ length);
+ *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+ return 0;
+ }
+
+ /* copy the data for inaccessible bytes */
+ if(inBytes!=outBytes) {
+ uprv_memcpy(outBytes, inBytes, size);
+ }
+
+ offset=0; /* running byte offset of the next section */
+
+ /* swap the int32_t indexes[] */
+ count=indexes[UBIDI_IX_INDEX_TOP]*4;
+ ds->swapArray32(ds, inBytes, count, outBytes, pErrorCode);
+ offset+=count;
+
+ /* swap the UTrie */
+ count=indexes[UBIDI_IX_TRIE_SIZE];
+ utrie_swapAnyVersion(ds, inBytes+offset, count, outBytes+offset, pErrorCode);
+ offset+=count;
+
+ /* swap the uint32_t mirrors[] */
+ count=indexes[UBIDI_IX_MIRROR_LENGTH]*4;
+ ds->swapArray32(ds, inBytes+offset, count, outBytes+offset, pErrorCode);
+ offset+=count;
+
+ /* just skip the uint8_t jgArray[] and jgArray2[] (single bytes; they were copied above if needed) */
+ count=indexes[UBIDI_IX_JG_LIMIT]-indexes[UBIDI_IX_JG_START];
+ offset+=count;
+ count=indexes[UBIDI_IX_JG_LIMIT2]-indexes[UBIDI_IX_JG_START2];
+ offset+=count;
+
+ U_ASSERT(offset==size); /* the sections are expected to account for the whole data */
+ }
+
+ return headerSize+size;
+}
+
+/* Unicode normalization data swapping --------------------------------------
+ *
+ * unorm_swap() swaps unorm.icu data (dataFormat "Norm", formatVersion 2).
+ * It is only compiled when UCONFIG_NO_NORMALIZATION is off.
+ * The total size is not stored in the data; it is computed from the
+ * per-section sizes and counts in the 32 indexes.
+ * If length is negative, this function writes nothing after the header
+ * and only computes the size.
+ * Returns headerSize plus the computed size, or 0 on failure.
+ */
+
+#if !UCONFIG_NO_NORMALIZATION
+
+static int32_t U_CALLCONV
+unorm_swap(const UDataSwapper *ds,
+ const void *inData, int32_t length, void *outData,
+ UErrorCode *pErrorCode) {
+ const UDataInfo *pInfo;
+ int32_t headerSize;
+
+ const uint8_t *inBytes;
+ uint8_t *outBytes;
+
+ const int32_t *inIndexes;
+ int32_t indexes[32];
+
+ int32_t i, offset, count, size;
+
+ /* udata_swapDataHeader checks the arguments */
+ headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
+ if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) {
+ return 0;
+ }
+
+ /* check data format and format version */
+ pInfo=(const UDataInfo *)((const char *)inData+4);
+ if(!(
+ pInfo->dataFormat[0]==0x4e && /* dataFormat="Norm" */
+ pInfo->dataFormat[1]==0x6f &&
+ pInfo->dataFormat[2]==0x72 &&
+ pInfo->dataFormat[3]==0x6d &&
+ pInfo->formatVersion[0]==2
+ )) {
+ udata_printError(ds, "unorm_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as unorm.icu\n",
+ pInfo->dataFormat[0], pInfo->dataFormat[1],
+ pInfo->dataFormat[2], pInfo->dataFormat[3],
+ pInfo->formatVersion[0]);
+ *pErrorCode=U_UNSUPPORTED_ERROR;
+ return 0;
+ }
+
+ inBytes=(const uint8_t *)inData+headerSize;
+ outBytes=(uint8_t *)outData+headerSize;
+
+ inIndexes=(const int32_t *)inBytes;
+
+ if(length>=0) {
+ length-=headerSize; /* from here on, length counts the bytes after the header */
+ if(length<32*4) {
+ udata_printError(ds, "unorm_swap(): too few bytes (%d after header) for unorm.icu\n",
+ length);
+ *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+ return 0;
+ }
+ }
+
+ /* read the first 32 indexes (ICU 2.8/format version 2.2: _NORM_INDEX_TOP==32, might grow) */
+ for(i=0; i<32; ++i) {
+ indexes[i]=udata_readInt32(ds, inIndexes[i]);
+ }
+
+ /* calculate the total length of the data */
+ size=
+ 32*4+ /* size of indexes[] */
+ indexes[_NORM_INDEX_TRIE_SIZE]+
+ indexes[_NORM_INDEX_UCHAR_COUNT]*2+
+ indexes[_NORM_INDEX_COMBINE_DATA_COUNT]*2+
+ indexes[_NORM_INDEX_FCD_TRIE_SIZE]+
+ indexes[_NORM_INDEX_AUX_TRIE_SIZE]+
+ indexes[_NORM_INDEX_CANON_SET_COUNT]*2;
+
+ if(length>=0) {
+ if(length<size) {
+ udata_printError(ds, "unorm_swap(): too few bytes (%d after header) for all of unorm.icu\n",
+ length);
+ *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+ return 0;
+ }
+
+ /* copy the data for inaccessible bytes */
+ if(inBytes!=outBytes) {
+ uprv_memcpy(outBytes, inBytes, size);
+ }
+
+ offset=0; /* running byte offset of the next section to swap */
+
+ /* swap the indexes[] */
+ count=32*4;
+ ds->swapArray32(ds, inBytes, count, outBytes, pErrorCode);
+ offset+=count;
+
+ /* swap the main UTrie */
+ count=indexes[_NORM_INDEX_TRIE_SIZE];
+ utrie_swap(ds, inBytes+offset, count, outBytes+offset, pErrorCode);
+ offset+=count;
+
+ /* swap the uint16_t extraData[] and the uint16_t combiningTable[] */
+ count=(indexes[_NORM_INDEX_UCHAR_COUNT]+indexes[_NORM_INDEX_COMBINE_DATA_COUNT])*2;
+ ds->swapArray16(ds, inBytes+offset, count, outBytes+offset, pErrorCode);
+ offset+=count;
+
+ /* swap the FCD UTrie */
+ count=indexes[_NORM_INDEX_FCD_TRIE_SIZE];
+ if(count!=0) {
+ utrie_swap(ds, inBytes+offset, count, outBytes+offset, pErrorCode);
+ offset+=count;
+ }
+
+ /* swap the aux UTrie */
+ count=indexes[_NORM_INDEX_AUX_TRIE_SIZE];
+ if(count!=0) {
+ utrie_swap(ds, inBytes+offset, count, outBytes+offset, pErrorCode);
+ offset+=count;
+ }
+
+ /* swap the uint16_t units counted by _NORM_INDEX_CANON_SET_COUNT (the combiningTable[] was already swapped together with extraData[] above) */
+ count=indexes[_NORM_INDEX_CANON_SET_COUNT]*2;
+ ds->swapArray16(ds, inBytes+offset, count, outBytes+offset, pErrorCode);
+ offset+=count;
+ }
+
+ return headerSize+size;
+}
+
+#endif
+
+// Unicode text layout properties data swapping --------------------------------
+
+// Swaps one text layout properties data item (dataFormat "Layo", format version 1).
+// If length < 0, the index/trie swapping below is skipped and only the size is computed;
+// otherwise the indexes and each trie are swapped from inData into outData.
+// Returns headerSize plus the data size read from the indexes, or 0 after setting
+// *pErrorCode when the header, the data format, or the available length is rejected.
+static int32_t U_CALLCONV
+ulayout_swap(const UDataSwapper *ds,
+ const void *inData, int32_t length, void *outData,
+ UErrorCode *pErrorCode) {
+ // udata_swapDataHeader checks the arguments.
+ int32_t headerSize = udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
+ if (pErrorCode == nullptr || U_FAILURE(*pErrorCode)) {
+ return 0;
+ }
+
+ // Check data format and format version.
+ // The UDataInfo is read at byte offset 4 from the start of inData.
+ const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData + 4);
+ if (!(
+ pInfo->dataFormat[0] == ULAYOUT_FMT_0 && // dataFormat="Layo"
+ pInfo->dataFormat[1] == ULAYOUT_FMT_1 &&
+ pInfo->dataFormat[2] == ULAYOUT_FMT_2 &&
+ pInfo->dataFormat[3] == ULAYOUT_FMT_3 &&
+ pInfo->formatVersion[0] == 1)) {
+ udata_printError(ds,
+ "ulayout_swap(): data format %02x.%02x.%02x.%02x (format version %02x) "
+ "is not recognized as text layout properties data\n",
+ pInfo->dataFormat[0], pInfo->dataFormat[1],
+ pInfo->dataFormat[2], pInfo->dataFormat[3],
+ pInfo->formatVersion[0]);
+ *pErrorCode = U_UNSUPPORTED_ERROR;
+ return 0;
+ }
+
+ // The payload starts right after the header, in both the input and the output.
+ const uint8_t *inBytes = (const uint8_t *)inData + headerSize;
+ uint8_t *outBytes = (uint8_t *)outData + headerSize;
+
+ const int32_t *inIndexes = (const int32_t *)inBytes;
+
+ if (length >= 0) {
+ // From here on, length counts only the bytes after the header.
+ length -= headerSize;
+ // At least 12 int32_t indexes must be present (see the indexesLength check below).
+ if (length < 12 * 4) {
+ udata_printError(ds,
+ "ulayout_swap(): too few bytes (%d after header) for text layout properties data\n",
+ length);
+ *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
+ return 0;
+ }
+ }
+
+ int32_t indexesLength = udata_readInt32(ds, inIndexes[ULAYOUT_IX_INDEXES_LENGTH]);
+ if (indexesLength < 12) {
+ udata_printError(ds,
+ "ulayout_swap(): too few indexes (%d) for text layout properties data\n",
+ indexesLength);
+ *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
+ return 0;
+ }
+
+ // Read the data offsets before swapping anything.
+ // Only the entries from ULAYOUT_IX_INPC_TRIE_TOP through ULAYOUT_IX_TRIES_TOP are filled in.
+ int32_t indexes[ULAYOUT_IX_TRIES_TOP + 1];
+ for (int32_t i = ULAYOUT_IX_INPC_TRIE_TOP; i <= ULAYOUT_IX_TRIES_TOP; ++i) {
+ indexes[i] = udata_readInt32(ds, inIndexes[i]);
+ }
+ // The last "top" offset is used as the total payload size in bytes.
+ int32_t size = indexes[ULAYOUT_IX_TRIES_TOP];
+
+ if (length >= 0) {
+ if (length < size) {
+ udata_printError(ds,
+ "ulayout_swap(): too few bytes (%d after header) "
+ "for all of text layout properties data\n",
+ length);
+ *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
+ return 0;
+ }
+
+ // Copy the data for inaccessible bytes.
+ if (inBytes != outBytes) {
+ uprv_memcpy(outBytes, inBytes, size);
+ }
+
+ // Swap the int32_t indexes[].
+ int32_t offset = 0;
+ int32_t count = indexesLength * 4;
+ ds->swapArray32(ds, inBytes, count, outBytes, pErrorCode);
+ offset += count;
+
+ // Swap each trie.
+ // Each index entry is the end ("top") offset of one trie; the trie starts at the previous top.
+ for (int32_t i = ULAYOUT_IX_INPC_TRIE_TOP; i <= ULAYOUT_IX_TRIES_TOP; ++i) {
+ int32_t top = indexes[i];
+ count = top - offset;
+ U_ASSERT(count >= 0);
+ // Segments shorter than 16 bytes are not handed to the trie swapper
+ // (presumably an empty or absent trie -- TODO confirm).
+ if (count >= 16) {
+ utrie_swapAnyVersion(ds, inBytes + offset, count, outBytes + offset, pErrorCode);
+ }
+ offset = top;
+ }
+
+ U_ASSERT(offset == size);
+ }
+
+ return headerSize + size;
+}
+
+// Unicode emoji properties data swapping --------------------------------------
+
+// Swaps one emoji properties data item (dataFormat "Emoj", format version 1).
+// If length < 0, the index/trie swapping below is skipped and only the size is computed;
+// otherwise the indexes, the code point trie and the string tries are swapped
+// from inData into outData.
+// Returns headerSize plus the total size read from the indexes, or 0 after setting
+// *pErrorCode when the header, the data format, or the available length is rejected.
+static int32_t U_CALLCONV
+uemoji_swap(const UDataSwapper *ds,
+ const void *inData, int32_t length, void *outData,
+ UErrorCode *pErrorCode) {
+ // udata_swapDataHeader checks the arguments.
+ int32_t headerSize = udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
+ if (pErrorCode == nullptr || U_FAILURE(*pErrorCode)) {
+ return 0;
+ }
+
+ // Check data format and format version.
+ // The UDataInfo is read at byte offset 4 from the start of inData.
+ const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData + 4);
+ if (!(
+ pInfo->dataFormat[0] == u'E' &&
+ pInfo->dataFormat[1] == u'm' &&
+ pInfo->dataFormat[2] == u'o' &&
+ pInfo->dataFormat[3] == u'j' &&
+ pInfo->formatVersion[0] == 1)) {
+ udata_printError(ds,
+ "uemoji_swap(): data format %02x.%02x.%02x.%02x (format version %02x) "
+ "is not recognized as emoji properties data\n",
+ pInfo->dataFormat[0], pInfo->dataFormat[1],
+ pInfo->dataFormat[2], pInfo->dataFormat[3],
+ pInfo->formatVersion[0]);
+ *pErrorCode = U_UNSUPPORTED_ERROR;
+ return 0;
+ }
+
+ // The payload starts right after the header, in both the input and the output.
+ const uint8_t *inBytes = (const uint8_t *)inData + headerSize;
+ uint8_t *outBytes = (uint8_t *)outData + headerSize;
+
+ const int32_t *inIndexes = (const int32_t *)inBytes;
+
+ if (length >= 0) {
+ // From here on, length counts only the bytes after the header.
+ length -= headerSize;
+ // We expect to read at least EmojiProps::IX_TOTAL_SIZE.
+ if (length < 14 * 4) {
+ udata_printError(ds,
+ "uemoji_swap(): too few bytes (%d after header) for emoji properties data\n",
+ length);
+ *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
+ return 0;
+ }
+ }
+
+ // First offset after indexes[].
+ // The number of int32_t indexes is derived from that byte offset.
+ int32_t cpTrieOffset = udata_readInt32(ds, inIndexes[EmojiProps::IX_CPTRIE_OFFSET]);
+ int32_t indexesLength = cpTrieOffset / 4;
+ if (indexesLength < 14) {
+ udata_printError(ds,
+ "uemoji_swap(): too few indexes (%d) for emoji properties data\n",
+ indexesLength);
+ *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
+ return 0;
+ }
+
+ // Read the data offsets before swapping anything.
+ // NOTE(review): storing cpTrieOffset in slot 0 assumes EmojiProps::IX_CPTRIE_OFFSET == 0 -- confirm.
+ int32_t indexes[EmojiProps::IX_TOTAL_SIZE + 1];
+ indexes[0] = cpTrieOffset;
+ for (int32_t i = 1; i <= EmojiProps::IX_TOTAL_SIZE; ++i) {
+ indexes[i] = udata_readInt32(ds, inIndexes[i]);
+ }
+ // The IX_TOTAL_SIZE entry is used as the total payload size in bytes.
+ int32_t size = indexes[EmojiProps::IX_TOTAL_SIZE];
+
+ if (length >= 0) {
+ if (length < size) {
+ udata_printError(ds,
+ "uemoji_swap(): too few bytes (%d after header) "
+ "for all of emoji properties data\n",
+ length);
+ *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
+ return 0;
+ }
+
+ // Copy the data for inaccessible bytes.
+ if (inBytes != outBytes) {
+ uprv_memcpy(outBytes, inBytes, size);
+ }
+
+ // Swap the int32_t indexes[].
+ // They occupy the bytes from 0 up to the code point trie offset.
+ int32_t offset = 0;
+ int32_t top = cpTrieOffset;
+ ds->swapArray32(ds, inBytes, top - offset, outBytes, pErrorCode);
+ offset = top;
+
+ // Swap the code point trie.
+ // Its end is the offset stored in the index entry following IX_CPTRIE_OFFSET.
+ top = indexes[EmojiProps::IX_CPTRIE_OFFSET + 1];
+ int32_t count = top - offset;
+ U_ASSERT(count >= 0);
+ // Segments shorter than 16 bytes are not handed to the trie swapper
+ // (presumably an empty or absent trie -- TODO confirm).
+ if (count >= 16) {
+ utrie_swapAnyVersion(ds, inBytes + offset, count, outBytes + offset, pErrorCode);
+ }
+ offset = top; // overwritten immediately below
+
+ // Swap all of the string tries.
+ // They are all serialized as arrays of 16-bit units.
+ // The whole range from the first string trie offset to the end of the last one
+ // is swapped with a single swapArray16() call.
+ offset = indexes[EmojiProps::IX_BASIC_EMOJI_TRIE_OFFSET];
+ top = indexes[EmojiProps::IX_RGI_EMOJI_ZWJ_SEQUENCE_TRIE_OFFSET + 1];
+ ds->swapArray16(ds, inBytes + offset, top - offset, outBytes + offset, pErrorCode);
+ offset = top;
+
+ U_ASSERT(offset == size);
+ }
+
+ return headerSize + size;
+}
+
+/*
+ * Swap 'Test' data from gentest.
+ *
+ * The payload after the header is a 16-bit value (2 bytes) followed by a
+ * 5-byte invariant-character string (4 chars plus the terminating NUL).
+ * If length<0, nothing in the payload is swapped and only the size is computed.
+ *
+ * @return headerSize plus the payload size, or 0 with *pErrorCode set on failure.
+ */
+static int32_t U_CALLCONV
+test_swap(const UDataSwapper *ds,
+          const void *inData, int32_t length, void *outData,
+          UErrorCode *pErrorCode) {
+    /* udata_swapDataHeader checks the arguments */
+    int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
+    if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) {
+        udata_printError(ds, "test_swap(): data header swap failed %s\n", pErrorCode != nullptr ? u_errorName(*pErrorCode) : "pErrorCode is nullptr");
+        return 0;
+    }
+
+    /* check data format and format version */
+    const UDataInfo *pInfo=(const UDataInfo *)((const char *)inData+4);
+    if(!(
+        pInfo->dataFormat[0]==0x54 &&   /* dataFormat="Test" */
+        pInfo->dataFormat[1]==0x65 &&
+        pInfo->dataFormat[2]==0x73 &&
+        pInfo->dataFormat[3]==0x74 &&
+        pInfo->formatVersion[0]==1
+    )) {
+        udata_printError(ds, "test_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as testdata\n",
+                         pInfo->dataFormat[0], pInfo->dataFormat[1],
+                         pInfo->dataFormat[2], pInfo->dataFormat[3],
+                         pInfo->formatVersion[0]);
+        *pErrorCode=U_UNSUPPORTED_ERROR;
+        return 0;
+    }
+
+    const uint8_t *inBytes=(const uint8_t *)inData+headerSize;
+    uint8_t *outBytes=(uint8_t *)outData+headerSize;
+
+    int32_t size16 = 2; // 16bit plus padding
+    int32_t sizeStr = 5; // 4 char inv-str plus null
+    int32_t size = size16 + sizeStr;
+
+    if(length>=0) {
+        /*
+         * Compare only the bytes after the header against the payload size,
+         * as the other swappers in this file do and as the message states.
+         * Without this, a truncated payload would pass the check.
+         */
+        length-=headerSize;
+        if(length<size) {
+            udata_printError(ds, "test_swap(): too few bytes (%d after header, wanted %d) for all of testdata\n",
+                             length, size);
+            *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+            return 0;
+        }
+
+        int32_t offset=0;
+        /* swap a 1 entry array */
+        ds->swapArray16(ds, inBytes+offset, size16, outBytes+offset, pErrorCode);
+        offset+=size16;
+        ds->swapInvChars(ds, inBytes+offset, sizeStr, outBytes+offset, pErrorCode);
+    }
+
+    return headerSize+size;
+}
+
+/* swap any data (except a .dat package) ------------------------------------ */
+
+/*
+ * Table that maps a 4-byte dataFormat signature to the swap function for that format.
+ * Some entries are compiled in only when the corresponding UCONFIG_NO_* switch is off.
+ */
+static const struct {
+ uint8_t dataFormat[4];
+ UDataSwapFn *swapFn;
+} swapFns[]={
+ { { 0x52, 0x65, 0x73, 0x42 }, ures_swap }, /* dataFormat="ResB" */
+#if !UCONFIG_NO_LEGACY_CONVERSION
+ { { 0x63, 0x6e, 0x76, 0x74 }, ucnv_swap }, /* dataFormat="cnvt" */
+#endif
+#if !UCONFIG_NO_CONVERSION
+ { { 0x43, 0x76, 0x41, 0x6c }, ucnv_swapAliases }, /* dataFormat="CvAl" */
+#endif
+#if !UCONFIG_NO_IDNA
+ { { 0x53, 0x50, 0x52, 0x50 }, usprep_swap }, /* dataFormat="SPRP" */
+#endif
+ /* insert data formats here, descending by expected frequency of occurrence */
+ { { 0x55, 0x50, 0x72, 0x6f }, uprops_swap }, /* dataFormat="UPro" */
+
+ { { UCASE_FMT_0, UCASE_FMT_1, UCASE_FMT_2, UCASE_FMT_3 },
+ ucase_swap }, /* dataFormat="cAsE" */
+
+ { { UBIDI_FMT_0, UBIDI_FMT_1, UBIDI_FMT_2, UBIDI_FMT_3 },
+ ubidi_swap }, /* dataFormat="BiDi" */
+
+#if !UCONFIG_NO_NORMALIZATION
+ { { 0x4e, 0x6f, 0x72, 0x6d }, unorm_swap }, /* dataFormat="Norm" */
+ { { 0x4e, 0x72, 0x6d, 0x32 }, unorm2_swap }, /* dataFormat="Nrm2" */
+#endif
+
+ { { ULAYOUT_FMT_0, ULAYOUT_FMT_1, ULAYOUT_FMT_2, ULAYOUT_FMT_3 },
+ ulayout_swap }, // dataFormat="Layo"
+
+ { { u'E', u'm', u'o', u'j' }, uemoji_swap }, // dataFormat="Emoj"
+
+#if !UCONFIG_NO_COLLATION
+ { { 0x55, 0x43, 0x6f, 0x6c }, ucol_swap }, /* dataFormat="UCol" */
+ { { 0x49, 0x6e, 0x76, 0x43 }, ucol_swapInverseUCA },/* dataFormat="InvC" */
+#endif
+#if !UCONFIG_NO_BREAK_ITERATION
+ { { 0x42, 0x72, 0x6b, 0x20 }, ubrk_swap }, /* dataFormat="Brk " */
+ { { 0x44, 0x69, 0x63, 0x74 }, udict_swap }, /* dataFormat="Dict" */
+#endif
+ { { 0x70, 0x6e, 0x61, 0x6d }, upname_swap }, /* dataFormat="pnam" */
+ { { 0x75, 0x6e, 0x61, 0x6d }, uchar_swapNames }, /* dataFormat="unam" */
+#if !UCONFIG_NO_NORMALIZATION
+ { { 0x43, 0x66, 0x75, 0x20 }, uspoof_swap }, /* dataFormat="Cfu " */
+#endif
+ { { 0x54, 0x65, 0x73, 0x74 }, test_swap } /* dataFormat="Test" */
+};
+
+/*
+ * Identify the data format of one ICU data piece from its header and dispatch
+ * to the matching entry in swapFns[].
+ *
+ * @param ds         the swapper, passed through to the format-specific function
+ * @param inData     start of the data piece, including its header
+ * @param length     passed through unchanged to the format-specific function
+ * @param outData    passed through unchanged to the format-specific function
+ * @param pErrorCode in/out error code; set to U_UNSUPPORTED_ERROR when no
+ *                   swapFns[] entry matches the dataFormat
+ * @return the value returned by the format-specific swap function, or 0 on failure
+ */
+U_CAPI int32_t U_EXPORT2
+udata_swap(const UDataSwapper *ds,
+           const void *inData, int32_t length, void *outData,
+           UErrorCode *pErrorCode) {
+    char dataFormatChars[4];
+    const UDataInfo *pInfo;
+    int32_t i, swappedLength;
+
+    if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) {
+        return 0;
+    }
+
+    /*
+     * Preflight the header first; checks for illegal arguments, too.
+     * Do not swap the header right away because the format-specific swapper
+     * will swap it, get the headerSize again, and also use the header
+     * information. Otherwise we would have to pass some of the information
+     * and not be able to use the UDataSwapFn signature.
+     */
+    udata_swapDataHeader(ds, inData, -1, nullptr, pErrorCode);
+
+    /*
+     * If we wanted udata_swap() to also handle non-loadable data like a UTrie,
+     * then we could check here for further known magic values and structures.
+     */
+    if(U_FAILURE(*pErrorCode)) {
+        return 0; /* the data format was not recognized */
+    }
+
+    pInfo=(const UDataInfo *)((const char *)inData+4);
+
+    {
+        /* convert the data format from ASCII to Unicode to the system charset */
+        char16_t u[4]={
+            pInfo->dataFormat[0], pInfo->dataFormat[1],
+            pInfo->dataFormat[2], pInfo->dataFormat[3]
+        };
+
+        if(uprv_isInvariantUString(u, 4)) {
+            u_UCharsToChars(u, dataFormatChars, 4);
+        } else {
+            /* not printable in the system charset: show placeholders in messages */
+            dataFormatChars[0]=dataFormatChars[1]=dataFormatChars[2]=dataFormatChars[3]='?';
+        }
+    }
+
+    /* dispatch to the swap function for the dataFormat */
+    for(i=0; i<UPRV_LENGTHOF(swapFns); ++i) {
+        if(0==memcmp(swapFns[i].dataFormat, pInfo->dataFormat, 4)) {
+            swappedLength=swapFns[i].swapFn(ds, inData, length, outData, pErrorCode);
+
+            if(U_FAILURE(*pErrorCode)) {
+                udata_printError(ds, "udata_swap(): failure swapping data format %02x.%02x.%02x.%02x (\"%c%c%c%c\") - %s\n",
+                                 pInfo->dataFormat[0], pInfo->dataFormat[1],
+                                 pInfo->dataFormat[2], pInfo->dataFormat[3],
+                                 dataFormatChars[0], dataFormatChars[1],
+                                 dataFormatChars[2], dataFormatChars[3],
+                                 u_errorName(*pErrorCode));
+            } else if(swappedLength<(length-15)) {
+                /*
+                 * swapped less than expected
+                 * (a shortfall of up to 15 bytes is not reported; presumably
+                 * this tolerates trailing padding -- TODO confirm)
+                 *
+                 * The argument list matches the format string exactly; this
+                 * message has no %s for an error name.
+                 */
+                udata_printError(ds, "udata_swap() warning: swapped only %d out of %d bytes - data format %02x.%02x.%02x.%02x (\"%c%c%c%c\")\n",
+                                 swappedLength, length,
+                                 pInfo->dataFormat[0], pInfo->dataFormat[1],
+                                 pInfo->dataFormat[2], pInfo->dataFormat[3],
+                                 dataFormatChars[0], dataFormatChars[1],
+                                 dataFormatChars[2], dataFormatChars[3]);
+            }
+
+            return swappedLength;
+        }
+    }
+
+    /* the dataFormat was not recognized */
+    udata_printError(ds, "udata_swap(): unknown data format %02x.%02x.%02x.%02x (\"%c%c%c%c\")\n",
+                     pInfo->dataFormat[0], pInfo->dataFormat[1],
+                     pInfo->dataFormat[2], pInfo->dataFormat[3],
+                     dataFormatChars[0], dataFormatChars[1],
+                     dataFormatChars[2], dataFormatChars[3]);
+
+    *pErrorCode=U_UNSUPPORTED_ERROR;
+    return 0;
+}
diff --git a/intl/icu/source/tools/toolutil/swapimpl.h b/intl/icu/source/tools/toolutil/swapimpl.h
new file mode 100644
index 0000000000..8c6474f662
--- /dev/null
+++ b/intl/icu/source/tools/toolutil/swapimpl.h
@@ -0,0 +1,45 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/*
+*******************************************************************************
+*
+* Copyright (C) 2005, International Business Machines
+* Corporation and others. All Rights Reserved.
+*
+*******************************************************************************
+* file name: swapimpl.h
+* encoding: UTF-8
+* tab size: 8 (not used)
+* indentation:4
+*
+* created on: 2005jul29
+* created by: Markus W. Scherer
+*
+* Declarations for data file swapping functions not declared in internal
+* library headers.
+*/
+
+#ifndef __SWAPIMPL_H__
+#define __SWAPIMPL_H__
+
+#include "unicode/utypes.h"
+#include "udataswp.h"
+
+/**
+ * Identifies and then transforms the ICU data piece in-place, or determines
+ * its length. See UDataSwapFn.
+ * This function handles single data pieces (but not .dat data packages)
+ * and internally dispatches to per-type swap functions.
+ * Sets a U_UNSUPPORTED_ERROR if the data format is not recognized.
+ *
+ * @see UDataSwapFn
+ * @see udata_openSwapper
+ * @see udata_openSwapperForInputData
+ * @internal ICU 2.8
+ */
+U_CAPI int32_t U_EXPORT2
+udata_swap(const UDataSwapper *ds,
+ const void *inData, int32_t length, void *outData,
+ UErrorCode *pErrorCode);
+
+#endif
diff --git a/intl/icu/source/tools/toolutil/toolutil.cpp b/intl/icu/source/tools/toolutil/toolutil.cpp
new file mode 100644
index 0000000000..7e7bdc78a1
--- /dev/null
+++ b/intl/icu/source/tools/toolutil/toolutil.cpp
@@ -0,0 +1,381 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/*
+*******************************************************************************
+*
+* Copyright (C) 1999-2014, International Business Machines
+* Corporation and others. All Rights Reserved.
+*
+*******************************************************************************
+* file name: toolutil.cpp
+* encoding: UTF-8
+* tab size: 8 (not used)
+* indentation:4
+*
+* created on: 1999nov19
+* created by: Markus W. Scherer
+*
+* 6/25/08 - Added Cygwin specific code in uprv_mkdir - Brian Rower
+*
+* This file contains utility functions for ICU tools like genccode.
+*/
+
+#include "unicode/platform.h"
+#if U_PLATFORM == U_PF_MINGW
+// *cough* - for struct stat
+#ifdef __STRICT_ANSI__
+#undef __STRICT_ANSI__
+#endif
+#endif
+
+#include <stdio.h>
+#include <sys/stat.h>
+#include <fstream>
+#include <time.h>
+#include "unicode/utypes.h"
+
+#ifndef U_TOOLUTIL_IMPLEMENTATION
+#error U_TOOLUTIL_IMPLEMENTATION not set - must be set for all ICU source files in common/ - see https://unicode-org.github.io/icu/userguide/howtouseicu
+#endif
+
+#if U_PLATFORM_USES_ONLY_WIN32_API
+# define VC_EXTRALEAN
+# define WIN32_LEAN_AND_MEAN
+# define NOUSER
+# define NOSERVICE
+# define NOIME
+# define NOMCX
+# if U_PLATFORM == U_PF_MINGW
+# define __NO_MINGW_LFS /* gets around missing 'off64_t' */
+# endif
+# include <windows.h>
+# include <direct.h>
+#else
+# include <sys/stat.h>
+# include <sys/types.h>
+#endif
+
+/* In MinGW environment, io.h needs to be included for _mkdir() */
+#if U_PLATFORM == U_PF_MINGW
+#include <io.h>
+#endif
+
+#include <errno.h>
+
+#include <cstddef>
+
+#include "unicode/errorcode.h"
+#include "unicode/putil.h"
+#include "cmemory.h"
+#include "cstring.h"
+#include "toolutil.h"
+
+U_NAMESPACE_BEGIN
+
+IcuToolErrorCode::~IcuToolErrorCode() {
+ // Safe because our handleFailure() does not throw exceptions.
+ if(isFailure()) { handleFailure(); }
+}
+
+void IcuToolErrorCode::handleFailure() const {
+ fprintf(stderr, "error at %s: %s\n", location, errorName());
+ exit(errorCode);
+}
+
+U_NAMESPACE_END
+
+// Cached result of getCurrentYear(); -1 until the first call has computed it.
+static int32_t currentYear = -1;
+
+// Returns the current year, computed once from gmtime() and cached afterwards.
+U_CAPI int32_t U_EXPORT2 getCurrentYear() {
+    if(currentYear != -1) {
+        return currentYear;
+    }
+    time_t now = time(nullptr);
+    const tm *utc = gmtime(&now);
+    // tm_year counts years since 1900.
+    currentYear = 1900 + utc->tm_year;
+    return currentYear;
+}
+
+
+/*
+ * On Windows-only platforms, if pathname refers to a file that has a short
+ * alternate name, build the long pathname (original directory prefix + long
+ * file name) in a static buffer and return that. The static buffer makes
+ * this function not thread-safe.
+ * In all other cases, including when the long pathname does not fit into
+ * the static buffer, the input pointer is returned unchanged.
+ */
+U_CAPI const char * U_EXPORT2
+getLongPathname(const char *pathname) {
+#if U_PLATFORM_USES_ONLY_WIN32_API
+    /* anticipate problems with "short" pathnames */
+    static WIN32_FIND_DATAA info;
+    HANDLE file=FindFirstFileA(pathname, &info);
+    if(file!=INVALID_HANDLE_VALUE) {
+        if(info.cAlternateFileName[0]!=0) {
+            /* this file has a short name, get and use the long one */
+            const char *basename=findBasename(pathname);
+            size_t prefixLength=(size_t)(basename-pathname);
+            size_t nameSize=uprv_strlen(info.cFileName)+1; /* includes the NUL */
+
+            /*
+             * cFileName is a fixed-size array; only prepend the directory
+             * prefix if the combined string fits, otherwise keep the
+             * caller's pathname rather than writing past the buffer.
+             */
+            if(prefixLength+nameSize<=sizeof(info.cFileName)) {
+                if(prefixLength!=0) {
+                    /* prepend the long filename with the original path */
+                    uprv_memmove(info.cFileName+prefixLength, info.cFileName, nameSize);
+                    uprv_memcpy(info.cFileName, pathname, prefixLength);
+                }
+                pathname=info.cFileName;
+            }
+        }
+        FindClose(file);
+    }
+#endif
+    return pathname;
+}
+
+/*
+ * Copies the directory part of path (everything before the last separator)
+ * into buffer and NUL-terminates it.
+ * Returns buffer on success. Returns nullptr if *status already indicates a
+ * failure, or sets U_BUFFER_OVERFLOW_ERROR and returns nullptr if the result
+ * plus its terminator does not fit into bufLen.
+ */
+U_CAPI const char * U_EXPORT2
+findDirname(const char *path, char *buffer, int32_t bufLen, UErrorCode* status) {
+    if(U_FAILURE(*status)) {
+        return nullptr;
+    }
+
+    /* locate the last separator, considering the alternate one where it differs */
+    const char *lastSep=uprv_strrchr(path, U_FILE_SEP_CHAR);
+#if U_FILE_ALT_SEP_CHAR!=U_FILE_SEP_CHAR
+    const char *lastAltSep=uprv_strrchr(path, U_FILE_ALT_SEP_CHAR);
+    if(lastAltSep!=nullptr && (lastSep==nullptr || lastSep<lastAltSep)) {
+        lastSep=lastAltSep;
+    }
+#endif
+
+    /* no separator at all: the directory name is '' */
+    const char *dirStart="";
+    int32_t dirLen=0;
+    if(lastSep!=nullptr) {
+        dirStart=path;
+        dirLen=static_cast<int32_t>(lastSep-path);
+        if(dirLen<1) {
+            dirLen=1; /* '/' or '/a' -> '/' */
+        }
+    }
+
+    /* need room for the directory name plus the terminating NUL */
+    if((dirLen+1)>bufLen) {
+        *status=U_BUFFER_OVERFLOW_ERROR;
+        return nullptr;
+    }
+    uprv_strncpy(buffer, dirStart, dirLen);
+    buffer[dirLen]=0;
+    return buffer;
+}
+
+/*
+ * Returns a pointer to the part of filename after the last file separator,
+ * or filename itself if it contains no separator.
+ */
+U_CAPI const char * U_EXPORT2
+findBasename(const char *filename) {
+    const char *basename=uprv_strrchr(filename, U_FILE_SEP_CHAR);
+
+#if U_FILE_ALT_SEP_CHAR!=U_FILE_SEP_CHAR
+    //be lenient about pathname separators on Windows, like official implementation of C++17 std::filesystem in MSVC
+    //there is no standard library call that searches for the last of two characters, so search twice
+    const char *alt_basename=uprv_strrchr(filename, U_FILE_ALT_SEP_CHAR);
+    //test for null explicitly before ordering the pointers: either search may
+    //have found nothing, and relational comparison with a null pointer is not
+    //well-defined (this is the same form that findDirname() uses)
+    if(alt_basename!=nullptr && (basename==nullptr || basename<alt_basename)) {
+        basename=alt_basename;
+    }
+#endif
+
+    if(basename!=nullptr) {
+        return basename+1;
+    } else {
+        return filename;
+    }
+}
+
+/*
+ * Creates the directory pathname.
+ * An already existing directory (errno == EEXIST) is not treated as an error.
+ * On failure, *status is set to U_FILE_ACCESS_ERROR.
+ */
+U_CAPI void U_EXPORT2
+uprv_mkdir(const char *pathname, UErrorCode *status) {
+#if U_PLATFORM_USES_ONLY_WIN32_API
+    int retVal = _mkdir(pathname);
+#else
+    /*
+     * Owner rwx plus others r-x.
+     * NOTE(review): the original expression listed (S_IROTH | S_IXOTH) twice;
+     * group r-x (S_IRGRP | S_IXGRP) may have been intended -- confirm before changing.
+     */
+    int retVal = mkdir(pathname, S_IRWXU | S_IROTH | S_IXOTH);
+#endif
+    if (retVal == 0 || errno == EEXIST) {
+        return;
+    }
+
+#if U_PF_MINGW <= U_PLATFORM && U_PLATFORM <= U_PF_CYGWIN
+    /*if using Cygwin and the mkdir says it failed...check if the directory already exists..*/
+    /* if it does...don't give the error, if it does not...give the error - Brian Rower - 6/25/08 */
+    struct stat st;
+
+    if(stat(pathname,&st) != 0) {
+        *status = U_FILE_ACCESS_ERROR;
+    }
+#else
+    *status = U_FILE_ACCESS_ERROR;
+#endif
+}
+
+#if !UCONFIG_NO_FILE_IO
+U_CAPI UBool U_EXPORT2
+uprv_fileExists(const char *file) {
+ struct stat stat_buf;
+ if (stat(file, &stat_buf) == 0) {
+ return true;
+ } else {
+ return false;
+ }
+}
+#endif
+
+/*
+ * Compares buffer[0..bufferLen) with the contents of the golden file.
+ * With overwrite set, the buffer is written to the golden file instead and
+ * -1 is returned.
+ * Returns -1 if buffer and file are identical, otherwise the index at which
+ * the comparison stopped.
+ */
+U_CAPI int32_t U_EXPORT2
+uprv_compareGoldenFiles(
+        const char* buffer, int32_t bufferLen,
+        const char* goldenFilePath,
+        bool overwrite) {
+
+    if (overwrite) {
+        std::ofstream golden(goldenFilePath);
+        golden.write(buffer, bufferLen);
+        golden.close();
+        return -1;
+    }
+
+    std::ifstream golden(goldenFilePath, std::ifstream::in);
+    int32_t matched = 0;
+    char fileChar;
+    for (;;) {
+        // Read first, then check the bound: reading one character past the
+        // buffer length is how a longer file is told apart from an equal one.
+        if (!golden.get(fileChar) || matched >= bufferLen) {
+            break;
+        }
+        if (fileChar != buffer[matched]) {
+            // Files differ at this position
+            break;
+        }
+        ++matched;
+    }
+    // Same contents only if the whole buffer matched and the file ended too.
+    int32_t result = (matched == bufferLen && golden.eof()) ? -1 : matched;
+    golden.close();
+    return result;
+}
+
+/*U_CAPI UDate U_EXPORT2
+uprv_getModificationDate(const char *pathname, UErrorCode *status)
+{
+ if(U_FAILURE(*status)) {
+ return;
+ }
+ // TODO: handle case where stat is not available
+ struct stat st;
+
+ if(stat(pathname,&st) != 0)
+ {
+ *status = U_FILE_ACCESS_ERROR;
+ } else {
+ return st.st_mtime;
+ }
+}
+*/
+
+/* tool memory helper ------------------------------------------------------- */
+
+// Growable array of fixed-size items used by the utm_*() functions.
+// utm_open() allocates the struct together with the initial item storage,
+// which begins at staticArray.
+struct UToolMemory {
+ char name[64]; // label printed in the utm_*() error messages
+ int32_t capacity, maxCapacity, size, idx; // items that fit now, upper limit, bytes per item, items allocated so far
+ void *array; // current item storage: staticArray at first, a separate heap block after growing
+ alignas(std::max_align_t) char staticArray[1]; // first byte of the initial storage
+};
+
+/*
+ * Allocates a UToolMemory with room for initialCapacity items of size bytes each.
+ * maxCapacity is raised to initialCapacity if it is smaller.
+ * name is copied into the object, truncated to fit the fixed-size name field,
+ * and is used in error messages.
+ * Prints an error and exits the process if the allocation fails.
+ */
+U_CAPI UToolMemory * U_EXPORT2
+utm_open(const char *name, int32_t initialCapacity, int32_t maxCapacity, int32_t size) {
+    UToolMemory *mem;
+
+    if(maxCapacity<initialCapacity) {
+        maxCapacity=initialCapacity;
+    }
+
+    /* multiply in size_t so that a large capacity*size does not overflow int32_t */
+    mem=(UToolMemory *)uprv_malloc(sizeof(UToolMemory)+(size_t)initialCapacity*size);
+    if(mem==nullptr) {
+        fprintf(stderr, "error: %s - out of memory\n", name);
+        exit(U_MEMORY_ALLOCATION_ERROR);
+    }
+    mem->array=mem->staticArray;
+
+    /* bounded copy: a name longer than the field must not overrun the struct */
+    uprv_strncpy(mem->name, name, sizeof(mem->name)-1);
+    mem->name[sizeof(mem->name)-1]=0;
+    mem->capacity=initialCapacity;
+    mem->maxCapacity=maxCapacity;
+    mem->size=size;
+    mem->idx=0;
+    return mem;
+}
+
+/*
+ * Frees a UToolMemory, including its item storage if that has been moved
+ * out of the initial in-object storage. A null pointer is ignored.
+ */
+U_CAPI void U_EXPORT2
+utm_close(UToolMemory *mem) {
+    if(mem==nullptr) {
+        return;
+    }
+    /* the initial storage is part of mem itself and must not be freed separately */
+    const bool ownsSeparateArray = mem->array!=mem->staticArray;
+    if(ownsSeparateArray) {
+        uprv_free(mem->array);
+    }
+    uprv_free(mem);
+}
+
+
+/* Returns the start of the current item storage. */
+U_CAPI void * U_EXPORT2
+utm_getStart(UToolMemory *mem) {
+    void *start=mem->array;
+    return start;
+}
+
+/* Returns the number of items allocated so far. */
+U_CAPI int32_t U_EXPORT2
+utm_countItems(UToolMemory *mem) {
+    const int32_t itemCount=mem->idx;
+    return itemCount;
+}
+
+
+/*
+ * Makes sure that mem can hold at least capacity items, growing the storage
+ * if necessary.
+ * Always returns true; prints an error and exits the process if capacity
+ * exceeds maxCapacity or if memory cannot be allocated.
+ */
+static UBool
+utm_hasCapacity(UToolMemory *mem, int32_t capacity) {
+    if(capacity<=mem->capacity) {
+        return true; /* already large enough */
+    }
+
+    if(mem->maxCapacity<capacity) {
+        fprintf(stderr, "error: %s - trying to use more than maxCapacity=%ld units\n",
+                mem->name, (long)mem->maxCapacity);
+        exit(U_MEMORY_ALLOCATION_ERROR);
+    }
+
+    /* choose the new capacity: the request, double the current, or the maximum */
+    int32_t grownCapacity;
+    if(capacity>=2*mem->capacity) {
+        grownCapacity=capacity;
+    } else if(mem->capacity<=mem->maxCapacity/3) {
+        grownCapacity=2*mem->capacity;
+    } else {
+        grownCapacity=mem->maxCapacity;
+    }
+
+    /* try to allocate a larger array */
+    const bool usesInitialStorage = mem->array==mem->staticArray;
+    if(usesInitialStorage) {
+        /* the initial storage cannot be reallocated: allocate anew and copy the items */
+        mem->array=uprv_malloc(grownCapacity*mem->size);
+        if(mem->array!=nullptr) {
+            uprv_memcpy(mem->array, mem->staticArray, (size_t)mem->idx*mem->size);
+        }
+    } else {
+        mem->array=uprv_realloc(mem->array, grownCapacity*mem->size);
+    }
+
+    if(mem->array==nullptr) {
+        fprintf(stderr, "error: %s - out of memory\n", mem->name);
+        exit(U_MEMORY_ALLOCATION_ERROR);
+    }
+    mem->capacity=grownCapacity;
+    return true;
+}
+
+/*
+ * Appends one zero-filled item and returns a pointer to it.
+ */
+U_CAPI void * U_EXPORT2
+utm_alloc(UToolMemory *mem) {
+    const int32_t itemIndex=mem->idx;
+    const int32_t nextIndex=itemIndex+1;
+    if(!utm_hasCapacity(mem, nextIndex)) {
+        return nullptr;
+    }
+    /* read mem->array only now: growing may have replaced the storage */
+    char *item=(char *)mem->array+itemIndex*mem->size;
+    mem->idx=nextIndex;
+    uprv_memset(item, 0, mem->size);
+    return item;
+}
+
+/*
+ * Appends n zero-filled items and returns a pointer to the first of them.
+ */
+U_CAPI void * U_EXPORT2
+utm_allocN(UToolMemory *mem, int32_t n) {
+    const int32_t firstIndex=mem->idx;
+    const int32_t endIndex=firstIndex+n;
+    if(!utm_hasCapacity(mem, endIndex)) {
+        return nullptr;
+    }
+    /* read mem->array only now: growing may have replaced the storage */
+    char *firstItem=(char *)mem->array+firstIndex*mem->size;
+    mem->idx=endIndex;
+    uprv_memset(firstItem, 0, n*mem->size);
+    return firstItem;
+}
diff --git a/intl/icu/source/tools/toolutil/toolutil.h b/intl/icu/source/tools/toolutil/toolutil.h
new file mode 100644
index 0000000000..b32a0b8762
--- /dev/null
+++ b/intl/icu/source/tools/toolutil/toolutil.h
@@ -0,0 +1,201 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/*
+*******************************************************************************
+*
+* Copyright (C) 1999-2013, International Business Machines
+* Corporation and others. All Rights Reserved.
+*
+*******************************************************************************
+* file name: toolutil.h
+* encoding: UTF-8
+* tab size: 8 (not used)
+* indentation:4
+*
+* created on: 1999nov19
+* created by: Markus W. Scherer
+*
+* This file defines utility functions for ICU tools like genccode.
+*/
+
+#ifndef __TOOLUTIL_H__
+#define __TOOLUTIL_H__
+
+#include "unicode/utypes.h"
+
+#ifdef __cplusplus
+
+#include "unicode/errorcode.h"
+
+U_NAMESPACE_BEGIN
+
+/**
+ * ErrorCode subclass for use in ICU command-line tools.
+ * The destructor calls handleFailure() which calls exit(errorCode) when isFailure().
+ */
+class U_TOOLUTIL_API IcuToolErrorCode : public ErrorCode {
+public:
+ /**
+ * @param loc A short string describing where the IcuToolErrorCode is used.
+ * Only the pointer is stored, not a copy of the string.
+ */
+ IcuToolErrorCode(const char *loc) : location(loc) {}
+ /** Declared here, defined out of line. */
+ virtual ~IcuToolErrorCode();
+protected:
+ /** Failure hook, declared as an override of the base-class function. */
+ virtual void handleFailure() const override;
+private:
+ /** The loc string passed to the constructor. */
+ const char *location;
+};
+
+U_NAMESPACE_END
+
+#endif
+
+/*
+ * For Windows, a path/filename may be the short (8.3) version
+ * of the "real", long one. In this case, the short one
+ * is abbreviated and contains a tilde etc.
+ * This function returns a pointer to the original pathname
+ * if it is the "real" one itself, and a pointer to a static
+ * buffer (not thread-safe) containing the long version
+ * if the pathname is indeed abbreviated.
+ *
+ * On platforms other than Windows, this function always returns
+ * the input pathname pointer.
+ *
+ * This function is especially useful in tools that are called
+ * by a batch file for loop, which yields short pathnames on Win9x.
+ */
+U_CAPI const char * U_EXPORT2
+getLongPathname(const char *pathname);
+
+/**
+ * Find the basename at the end of a pathname, i.e., the part
+ * after the last file separator, and return a pointer
+ * to this part of the pathname.
+ * If the pathname only contains a basename and no file separator,
+ * then the pathname pointer itself is returned.
+ **/
+U_CAPI const char * U_EXPORT2
+findBasename(const char *filename);
+
+/**
+ * Find the directory name of a pathname, that is, everything
+ * up to but not including the last file separator.
+ *
+ * If successful, copies the directory name into the output buffer along with
+ * a terminating NULL.
+ *
+ * If there isn't a directory name in the path, it returns an empty string.
+ * @param path the full pathname to inspect.
+ * @param buffer the output buffer
+ * @param bufLen the output buffer length
+ * @param status error code- may return U_BUFFER_OVERFLOW_ERROR if bufLen is too small.
+ * @return If successful, a pointer to the output buffer. If failure or bufLen is too small, NULL.
+ **/
+U_CAPI const char * U_EXPORT2
+findDirname(const char *path, char *buffer, int32_t bufLen, UErrorCode* status);
+
+/*
+ * Return the current year in the Gregorian calendar. Used for copyright generation.
+ */
+U_CAPI int32_t U_EXPORT2
+getCurrentYear();
+
+/*
+ * Creates a directory with pathname.
+ *
+ * @param status Set to an error code when mkdir failed.
+ */
+U_CAPI void U_EXPORT2
+uprv_mkdir(const char *pathname, UErrorCode *status);
+
+#if !UCONFIG_NO_FILE_IO
+/**
+ * Return true if the named item exists
+ * @param file filename
+ * @return true if named item (file, dir, etc) exists, false otherwise
+ */
+U_CAPI UBool U_EXPORT2
+uprv_fileExists(const char *file);
+#endif
+
+/**
+ * Performs a golden data test. Asserts that the contents of the buffer is equal
+ * to the data in goldenFilePath.
+ *
+ * Pass the value of the -G flag to "overwrite"; if true, new goldens will be
+ * written to the filesystem.
+ *
+ * @return The first index at which the files differ, or -1 if they are the same.
+ */
+U_CAPI int32_t U_EXPORT2
+uprv_compareGoldenFiles(
+ const char* buffer, int32_t bufferLen,
+ const char* goldenFilePath,
+ bool overwrite);
+
+/**
+ * Return the modification date for the specified file or directory.
+ * Return value is undefined if there was an error.
+ */
+/*U_CAPI UDate U_EXPORT2
+uprv_getModificationDate(const char *pathname, UErrorCode *status);
+*/
+/*
+ * Returns the modification
+ *
+ * @param status Set to an error code when mkdir failed.
+ */
+
+/*
+ * UToolMemory is used for generic, custom memory management.
+ * It is allocated with enough space for initialCapacity*size bytes starting
+ * at array.
+ * The initial array is declared with the maximum fundamental alignment
+ * (alignas(std::max_align_t)) so that its base address is aligned for any types.
+ * If size is a multiple of a data type size, then such items
+ * can be safely allocated inside the array, at offsets that
+ * are themselves multiples of size.
+ */
+struct UToolMemory;
+typedef struct UToolMemory UToolMemory;
+
+/**
+ * Open a UToolMemory object for allocation of initialCapacity to maxCapacity
+ * items with size bytes each.
+ */
+U_CAPI UToolMemory * U_EXPORT2
+utm_open(const char *name, int32_t initialCapacity, int32_t maxCapacity, int32_t size);
+
+/**
+ * Close a UToolMemory object.
+ */
+U_CAPI void U_EXPORT2
+utm_close(UToolMemory *mem);
+
+/**
+ * Get the pointer to the beginning of the array of items.
+ * The pointer becomes invalid after allocation of new items.
+ */
+U_CAPI void * U_EXPORT2
+utm_getStart(UToolMemory *mem);
+
+/**
+ * Get the current number of items.
+ */
+U_CAPI int32_t U_EXPORT2
+utm_countItems(UToolMemory *mem);
+
+/**
+ * Allocate one more item and return the pointer to its start in the array.
+ */
+U_CAPI void * U_EXPORT2
+utm_alloc(UToolMemory *mem);
+
+/**
+ * Allocate n items and return the pointer to the start of the first one in the array.
+ */
+U_CAPI void * U_EXPORT2
+utm_allocN(UToolMemory *mem, int32_t n);
+
+#endif
diff --git a/intl/icu/source/tools/toolutil/toolutil.vcxproj b/intl/icu/source/tools/toolutil/toolutil.vcxproj
new file mode 100644
index 0000000000..0995ef06f7
--- /dev/null
+++ b/intl/icu/source/tools/toolutil/toolutil.vcxproj
@@ -0,0 +1,272 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <PropertyGroup Label="Globals">
+ <ProjectGuid>{6B231032-3CB5-4EED-9210-810D666A23A0}</ProjectGuid>
+ </PropertyGroup>
+ <PropertyGroup Label="Configuration">
+ <ConfigurationType>DynamicLibrary</ConfigurationType>
+ <UseOfMfc>false</UseOfMfc>
+ <CharacterSet>MultiByte</CharacterSet>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+ <!-- The following import will include the 'default' configuration options for VS projects. -->
+ <Import Project="..\..\allinone\Build.Windows.ProjectConfiguration.props" />
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+ <ImportGroup Label="ExtensionSettings">
+ </ImportGroup>
+ <PropertyGroup Label="UserMacros" />
+ <PropertyGroup>
+ <_ProjectFileVersion>10.0.30319.1</_ProjectFileVersion>
+ <OutDir>.\$(Platform)\$(Configuration)\</OutDir>
+ <IntDir>.\$(Platform)\$(Configuration)\</IntDir>
+ <!-- The ICU projects use "Win32" to mean "x86", so we need to special case it. -->
+ <OutDir Condition="'$(Platform)'=='Win32'">.\x86\$(Configuration)\</OutDir>
+ <IntDir Condition="'$(Platform)'=='Win32'">.\x86\$(Configuration)\</IntDir>
+ <!-- Disable Incremental Linking for Release builds as it prevents Link-time Code Generation -->
+ <LinkIncremental Condition="'$(Configuration)'=='Debug'">true</LinkIncremental>
+ <LinkIncremental Condition="'$(Configuration)'=='Release'">false</LinkIncremental>
+ </PropertyGroup>
+ <!-- Options that are common to *all* project configurations -->
+ <ItemDefinitionGroup>
+ <ClCompile>
+ <AdditionalIncludeDirectories>..\..\..\include;..\..\common;..\..\i18n;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+ <PreprocessorDefinitions>U_TOOLUTIL_IMPLEMENTATION;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <DisableLanguageExtensions>false</DisableLanguageExtensions>
+ <WarningLevel>Level3</WarningLevel>
+ <CompileAs>Default</CompileAs>
+ </ClCompile>
+ </ItemDefinitionGroup>
+ <!-- Options that are common to all 'Debug' project configurations -->
+ <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
+ <ClCompile>
+ <BrowseInformation>true</BrowseInformation>
+ <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
+ <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
+ </ClCompile>
+ <Link>
+ <AdditionalDependencies>icuucd.lib;icuind.lib;%(AdditionalDependencies)</AdditionalDependencies>
+ </Link>
+ </ItemDefinitionGroup>
+ <!-- Options that are common to all 'Release' project configurations -->
+ <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
+ <ClCompile>
+ <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
+ <FunctionLevelLinking>true</FunctionLevelLinking>
+ </ClCompile>
+ <Link>
+ <AdditionalDependencies>icuuc.lib;icuin.lib;%(AdditionalDependencies)</AdditionalDependencies>
+ </Link>
+ </ItemDefinitionGroup>
+ <!-- Options that are common to all 'Win32' project configurations -->
+ <ItemDefinitionGroup Condition="'$(Platform)'=='Win32'">
+ <ClCompile>
+ <PrecompiledHeaderOutputFile>.\x86\$(Configuration)/toolutil.pch</PrecompiledHeaderOutputFile>
+ <AssemblerListingLocation>.\x86\$(Configuration)/</AssemblerListingLocation>
+ <ObjectFileName>.\x86\$(Configuration)/</ObjectFileName>
+ <ProgramDataBaseFileName>.\x86\$(Configuration)/</ProgramDataBaseFileName>
+ </ClCompile>
+ <Link>
+ <AdditionalLibraryDirectories>..\..\..\lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <Midl>
+ <TypeLibraryName>.\..\..\..\lib\icutu.tlb</TypeLibraryName>
+ </Midl>
+ <Link>
+ <OutputFile>..\..\..\bin\icutu$(IcuMajorVersion).dll</OutputFile>
+ <ProgramDatabaseFile>.\..\..\..\lib\icutu.pdb</ProgramDatabaseFile>
+ <DataExecutionPrevention>
+ </DataExecutionPrevention>
+ <ImportLibrary>..\..\..\lib\icutu.lib</ImportLibrary>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ <Midl>
+ <TypeLibraryName>.\..\..\..\lib\icutud.tlb</TypeLibraryName>
+ </Midl>
+ <Link>
+ <OutputFile>..\..\..\bin\icutu$(IcuMajorVersion)d.dll</OutputFile>
+ <ProgramDatabaseFile>.\..\..\..\lib\icutud.pdb</ProgramDatabaseFile>
+ <DataExecutionPrevention>
+ </DataExecutionPrevention>
+ <ImportLibrary>..\..\..\lib\icutud.lib</ImportLibrary>
+ </Link>
+ </ItemDefinitionGroup>
+ <!-- Options that are common to all 'x64' project configurations -->
+ <ItemDefinitionGroup Condition="'$(Platform)'=='x64'">
+ <ClCompile>
+ <PrecompiledHeaderOutputFile>.\x64\$(Configuration)/toolutil.pch</PrecompiledHeaderOutputFile>
+ <AssemblerListingLocation>.\x64\$(Configuration)/</AssemblerListingLocation>
+ <ObjectFileName>.\x64\$(Configuration)/</ObjectFileName>
+ <ProgramDataBaseFileName>.\x64\$(Configuration)/</ProgramDataBaseFileName>
+ </ClCompile>
+ <Link>
+ <AdditionalLibraryDirectories>..\..\..\lib64;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ <Midl>
+ <TypeLibraryName>.\..\..\..\lib64\icutu.tlb</TypeLibraryName>
+ </Midl>
+ <ClCompile>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ </ClCompile>
+ <Link>
+ <OutputFile>..\..\..\bin64\icutu$(IcuMajorVersion).dll</OutputFile>
+ <ProgramDatabaseFile>.\..\..\..\lib64\icutu.pdb</ProgramDatabaseFile>
+ <ImportLibrary>..\..\..\lib64\icutu.lib</ImportLibrary>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ <Midl>
+ <TypeLibraryName>.\..\..\..\lib64\icutud.tlb</TypeLibraryName>
+ </Midl>
+ <Link>
+ <OutputFile>..\..\..\bin64\icutu$(IcuMajorVersion)d.dll</OutputFile>
+ <ProgramDatabaseFile>.\..\..\..\lib64\icutud.pdb</ProgramDatabaseFile>
+ <ImportLibrary>..\..\..\lib64\icutud.lib</ImportLibrary>
+ </Link>
+ </ItemDefinitionGroup>
+ <!-- Options that are common to all 'ARM' project configurations -->
+ <ItemDefinitionGroup Condition="'$(Platform)'=='ARM'">
+ <ClCompile>
+ <PrecompiledHeaderOutputFile>.\ARM\$(Configuration)/toolutil.pch</PrecompiledHeaderOutputFile>
+ <AssemblerListingLocation>.\ARM\$(Configuration)/</AssemblerListingLocation>
+ <ObjectFileName>.\ARM\$(Configuration)/</ObjectFileName>
+ <ProgramDataBaseFileName>.\ARM\$(Configuration)/</ProgramDataBaseFileName>
+ </ClCompile>
+ <Link>
+ <AdditionalLibraryDirectories>.\..\..\..\libARM;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|ARM'">
+ <Midl>
+ <TypeLibraryName>..\..\..\libARM\icutu.tlb</TypeLibraryName>
+ </Midl>
+ <Link>
+ <OutputFile>..\..\..\binARM\icutu$(IcuMajorVersion).dll</OutputFile>
+ <ProgramDatabaseFile>.\..\..\..\libARM\icutu.pdb</ProgramDatabaseFile>
+ <ImportLibrary>..\..\..\libARM\icutu.lib</ImportLibrary>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|ARM'">
+ <Midl>
+ <TypeLibraryName>.\..\..\..\libARM\icutud.tlb</TypeLibraryName>
+ </Midl>
+ <ClCompile>
+ <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
+ </ClCompile>
+ <Link>
+ <OutputFile>..\..\..\binARM\icutu$(IcuMajorVersion)d.dll</OutputFile>
+ <ProgramDatabaseFile>.\..\..\..\libARM\icutud.pdb</ProgramDatabaseFile>
+ <ImportLibrary>..\..\..\libARM\icutud.lib</ImportLibrary>
+ </Link>
+ </ItemDefinitionGroup>
+ <!-- Options that are common to all 'ARM64' project configurations -->
+ <ItemDefinitionGroup Condition="'$(Platform)'=='ARM64'">
+ <ClCompile>
+ <PrecompiledHeaderOutputFile>.\ARM64\$(Configuration)/toolutil.pch</PrecompiledHeaderOutputFile>
+ <AssemblerListingLocation>.\ARM64\$(Configuration)/</AssemblerListingLocation>
+ <ObjectFileName>.\ARM64\$(Configuration)/</ObjectFileName>
+ <ProgramDataBaseFileName>.\ARM64\$(Configuration)/</ProgramDataBaseFileName>
+ </ClCompile>
+ <Link>
+ <AdditionalLibraryDirectories>.\..\..\..\libARM64;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|ARM64'">
+ <Midl>
+ <TypeLibraryName>.\..\..\..\libARM64\icutu.tlb</TypeLibraryName>
+ </Midl>
+ <Link>
+ <OutputFile>..\..\..\binARM64\icutu$(IcuMajorVersion).dll</OutputFile>
+ <ProgramDatabaseFile>.\..\..\..\libARM64\icutu.pdb</ProgramDatabaseFile>
+ <ImportLibrary>..\..\..\libARM64\icutu.lib</ImportLibrary>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|ARM64'">
+ <Midl>
+ <TypeLibraryName>.\..\..\..\libARM64\icutud.tlb</TypeLibraryName>
+ </Midl>
+ <ClCompile>
+ <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
+ </ClCompile>
+ <Link>
+ <OutputFile>..\..\..\binARM64\icutu$(IcuMajorVersion)d.dll</OutputFile>
+ <ProgramDatabaseFile>.\..\..\..\libARM64\icutud.pdb</ProgramDatabaseFile>
+ <ImportLibrary>..\..\..\libARM64\icutud.lib</ImportLibrary>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemGroup>
+ <ClCompile Include="collationinfo.cpp">
+ <DisableLanguageExtensions>false</DisableLanguageExtensions>
+ </ClCompile>
+ <ClCompile Include="denseranges.cpp" />
+ <ClCompile Include="filestrm.cpp" />
+ <ClCompile Include="filetools.cpp" />
+ <ClCompile Include="flagparser.cpp" />
+ <ClCompile Include="package.cpp" />
+ <ClCompile Include="pkg_genc.cpp">
+ <DisableLanguageExtensions>false</DisableLanguageExtensions>
+ </ClCompile>
+ <ClCompile Include="pkg_gencmn.cpp">
+ <DisableLanguageExtensions>false</DisableLanguageExtensions>
+ </ClCompile>
+ <ClCompile Include="pkg_icu.cpp" />
+ <ClCompile Include="pkgitems.cpp" />
+ <ClCompile Include="ppucd.cpp">
+ <DisableLanguageExtensions>false</DisableLanguageExtensions>
+ </ClCompile>
+ <ClCompile Include="swapimpl.cpp">
+ <DisableLanguageExtensions>false</DisableLanguageExtensions>
+ </ClCompile>
+ <ClCompile Include="toolutil.cpp">
+ <DisableLanguageExtensions>false</DisableLanguageExtensions>
+ </ClCompile>
+ <ClCompile Include="ucbuf.cpp" />
+ <ClCompile Include="ucm.cpp" />
+ <ClCompile Include="ucmstate.cpp" />
+ <ClCompile Include="unewdata.cpp" />
+ <ClCompile Include="uoptions.cpp" />
+ <ClCompile Include="uparse.cpp" />
+ <ClCompile Include="writesrc.cpp" />
+ <ClCompile Include="xmlparser.cpp">
+ <DisableLanguageExtensions>false</DisableLanguageExtensions>
+ </ClCompile>
+ <ClCompile Include="dbgutil.cpp">
+ <DisableLanguageExtensions>false</DisableLanguageExtensions>
+ </ClCompile>
+ <ClCompile Include="udbgutil.cpp">
+ <DisableLanguageExtensions>false</DisableLanguageExtensions>
+ </ClCompile>
+ <ClCompile Include="ucln_tu.cpp">
+ <DisableLanguageExtensions>false</DisableLanguageExtensions>
+ </ClCompile>
+ </ItemGroup>
+ <ItemGroup>
+ <ClInclude Include="collationinfo.h" />
+ <ClInclude Include="denseranges.h" />
+ <ClInclude Include="filestrm.h" />
+ <ClInclude Include="filetools.h" />
+ <ClInclude Include="flagparser.h" />
+ <ClInclude Include="package.h" />
+ <ClInclude Include="pkg_genc.h" />
+ <ClInclude Include="pkg_gencmn.h" />
+ <ClInclude Include="pkg_icu.h" />
+ <ClInclude Include="pkg_imp.h" />
+ <ClInclude Include="ppucd.h" />
+ <ClInclude Include="swapimpl.h" />
+ <ClInclude Include="toolutil.h" />
+ <ClInclude Include="ucbuf.h" />
+ <ClInclude Include="ucm.h" />
+ <ClInclude Include="unewdata.h" />
+ <ClInclude Include="uoptions.h" />
+ <ClInclude Include="uparse.h" />
+ <ClInclude Include="writesrc.h" />
+ <ClInclude Include="xmlparser.h" />
+ <ClInclude Include="dbgutil.h" />
+ <ClInclude Include="udbgutil.h" />
+ </ItemGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+ <ImportGroup Label="ExtensionTargets">
+ </ImportGroup>
+</Project>
diff --git a/intl/icu/source/tools/toolutil/ucbuf.cpp b/intl/icu/source/tools/toolutil/ucbuf.cpp
new file mode 100644
index 0000000000..1eb54e260e
--- /dev/null
+++ b/intl/icu/source/tools/toolutil/ucbuf.cpp
@@ -0,0 +1,788 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/*
+*******************************************************************************
+*
+* Copyright (C) 1998-2016, International Business Machines
+* Corporation and others. All Rights Reserved.
+*
+*******************************************************************************
+*
+* File ucbuf.cpp
+*
+* Modification History:
+*
+* Date Name Description
+* 05/10/01 Ram Creation.
+*******************************************************************************
+*/
+
+#include "unicode/utypes.h"
+#include "unicode/putil.h"
+#include "unicode/uchar.h"
+#include "unicode/ucnv.h"
+#include "unicode/ucnv_err.h"
+#include "unicode/ustring.h"
+#include "unicode/utf16.h"
+#include "filestrm.h"
+#include "cstring.h"
+#include "cmemory.h"
+#include "ustrfmt.h"
+#include "ucbuf.h"
+#include <stdio.h>
+
+#if !UCONFIG_NO_CONVERSION
+
+
+/* Size in bytes of the on-stack input block that ucbuf_fillucbuf() reads into in buffered mode. */
+#define MAX_IN_BUF 1000
+/* Capacity, in char16_t units, given to UCHARBUF::buffer by ucbuf_open() in buffered mode. */
+#define MAX_U_BUF 1500
+/* Size (without the terminating NUL) of the context snippets printed in warnings. */
+#define CONTEXT_LEN 20
+
+/* State of one input file that is read and converted to UTF-16 by the ucbuf_* functions. */
+struct UCHARBUF {
+ char16_t* buffer; /* start of the converted text; allocated in ucbuf_open(), freed in ucbuf_closebuf() */
+ char16_t* currentPos; /* next code unit that the read functions will hand out */
+ char16_t* bufLimit; /* one past the last valid code unit; ucbuf_fillucbuf() stores a NUL here */
+ int32_t bufCapacity; /* number of char16_t units allocated for buffer */
+ int32_t remaining; /* file bytes not read yet: starts at file size minus signatureLength */
+ int32_t signatureLength; /* length in bytes of the Unicode signature (BOM) found at the start, or 0 */
+ FileStream* in; /* the input stream; closed by ucbuf_close() */
+ UConverter* conv; /* converter from the file's charset to UTF-16; closed by ucbuf_close() */
+ UBool showWarning; /* when true, conversion and escape problems are reported on stderr */
+ UBool isBuffered; /* true: read in MAX_IN_BUF-byte blocks; false: read the whole file in one go */
+};
+
+/*
+ * Looks for a Unicode signature (BOM) at the start of an already opened stream.
+ *
+ * The stream is left positioned right after the signature bytes, or at the
+ * very beginning if no signature was found.
+ *
+ * @param in              stream to inspect
+ * @param cp              receives the charset name reported by ucnv_detectUnicodeSignature()
+ * @param conv            receives a converter opened for *cp, or nullptr if no signature was found
+ * @param signatureLength receives the number of signature bytes consumed by the converter
+ * @param error           set to U_INTERNAL_PROGRAM_ERROR if the signature does not convert to exactly U+FEFF
+ * @return false if no signature was detected, true otherwise (check *error as well)
+ */
+U_CAPI UBool U_EXPORT2
+ucbuf_autodetect_fs(FileStream* in, const char** cp, UConverter** conv, int32_t* signatureLength, UErrorCode* error){
+    char head[8];
+
+    /* sniff the first few bytes of the stream */
+    const int32_t headLength=T_FileStream_read(in, head, sizeof(head));
+    *cp = ucnv_detectUnicodeSignature(head, headLength, signatureLength, error);
+
+    /* go back to the start and skip only the bytes that belong to the signature */
+    T_FileStream_rewind(in);
+    if (*signatureLength > 0) {
+        T_FileStream_read(in, head, *signatureLength);
+    }
+
+    if(*cp==nullptr){
+        /* nothing recognized, so there is no converter to open */
+        *conv =nullptr;
+        return false;
+    }
+
+    /* open the converter for the detected Unicode charset */
+    *conv = ucnv_open(*cp,error);
+
+    /* feed the signature through the converter; room for a single unit is
+     * enough, so a buffer overflow is expected here and gets cleared */
+    char16_t bom[1]={ 0 };
+    char16_t* bomEnd = bom;
+    const char* consumed = head;
+    ucnv_toUnicode(*conv, &bomEnd, bom+1, &consumed, head+*signatureLength, nullptr, false, error);
+    *signatureLength = (int32_t)(consumed - head);
+    if(*error==U_BUFFER_OVERFLOW_ERROR) {
+        *error=U_ZERO_ERROR;
+    }
+
+    /* the converter must have produced exactly one unit, and it must be U+FEFF */
+    const UBool gotBom = (bomEnd==(bom+1) && bom[0]==0xfeff);
+    if(U_SUCCESS(*error) && !gotBom) {
+        *error=U_INTERNAL_PROGRAM_ERROR;
+    }
+
+    return true;
+}
+/*
+ * Tells whether cp names one of the charsets listed below.
+ * Names are compared with ucnv_compareNames().
+ */
+static UBool ucbuf_isCPKnown(const char* cp){
+    static const char* const knownNames[] = {
+        "UTF-8",
+        "UTF-16BE",
+        "UTF-16LE",
+        "UTF-16",
+        "UTF-32",
+        "UTF-32BE",
+        "UTF-32LE",
+        "SCSU",
+        "BOCU-1",
+        "UTF-7"
+    };
+    const size_t nameCount = sizeof(knownNames)/sizeof(knownNames[0]);
+    for(size_t i=0; i<nameCount; ++i){
+        if(ucnv_compareNames(knownNames[i],cp)==0){
+            return true;
+        }
+    }
+    return false;
+}
+
+/*
+ * Opens fileName for binary reading and detects its Unicode signature (BOM).
+ *
+ * @param fileName        file to open
+ * @param cp              receives the name of the detected charset
+ * @param conv            receives a converter for the detected charset; set to nullptr on failure
+ * @param signatureLength receives the length of the signature in bytes
+ * @param error           U_ILLEGAL_ARGUMENT_ERROR for a null argument,
+ *                        U_FILE_ACCESS_ERROR if the file cannot be opened,
+ *                        or whatever ucbuf_autodetect_fs() reports
+ * @return the open stream, positioned after the signature, or nullptr if no
+ *         signature was found or an error occurred (nothing is left open then)
+ */
+U_CAPI FileStream * U_EXPORT2
+ucbuf_autodetect(const char* fileName, const char** cp,UConverter** conv, int32_t* signatureLength,UErrorCode* error){
+    FileStream* in=nullptr;
+    if(error==nullptr || U_FAILURE(*error)){
+        return nullptr;
+    }
+    /* signatureLength is dereferenced unconditionally by ucbuf_autodetect_fs() */
+    if(conv==nullptr || cp==nullptr || fileName==nullptr || signatureLength==nullptr){
+        *error = U_ILLEGAL_ARGUMENT_ERROR;
+        return nullptr;
+    }
+    /* open the file */
+    in= T_FileStream_open(fileName,"rb");
+
+    if(in == nullptr){
+        *error=U_FILE_ACCESS_ERROR;
+        return nullptr;
+    }
+
+    /* ucbuf_autodetect_fs() can return true and still leave a failure in
+     * *error (converter could not be opened, signature did not convert to
+     * U+FEFF); treat that like "not detected" so stream and converter do not leak */
+    if(ucbuf_autodetect_fs(in,cp,conv,signatureLength,error) && U_SUCCESS(*error)) {
+        return in;
+    }
+    ucnv_close(*conv);
+    *conv=nullptr;
+    T_FileStream_close(in);
+    return nullptr;
+}
+
+/*
+ * Refills buf->buffer with text converted from buf->in.
+ *
+ * Code units between currentPos and bufLimit that have not been consumed yet
+ * are moved to the front of the buffer and the new text is appended after them.
+ * In buffered mode the bytes are read into an on-stack block of MAX_IN_BUF
+ * bytes; otherwise a heap block of the file's size is read in one go and
+ * freed again before returning.
+ *
+ * If the converter reports an error, the failure stays in *error, a warning
+ * with some byte context is printed to stderr when showWarning is set, and
+ * the same input is converted again with the substitution callback.
+ *
+ * @param buf   the UCHARBUF to refill
+ * @param error receives U_MEMORY_ALLOCATION_ERROR or the converter's error code
+ * @return buf, or nullptr if the temporary input block could not be allocated
+ */
+static UCHARBUF*
+ucbuf_fillucbuf( UCHARBUF* buf,UErrorCode* error){
+    char16_t* pTarget=nullptr;
+    char16_t* target=nullptr;
+    const char* source=nullptr;
+    char carr[MAX_IN_BUF] = {'\0'};
+    char* cbuf = carr;
+    int32_t inputRead=0;
+    int32_t outputWritten=0;
+    int32_t offset=0;
+    const char* sourceLimit =nullptr;
+    int32_t cbufSize=0;
+    pTarget = buf->buffer;
+    /* check if we arrived here without exhausting the buffer*/
+    if(buf->currentPos<buf->bufLimit){
+        offset = (int32_t)(buf->bufLimit-buf->currentPos);
+        memmove(buf->buffer,buf->currentPos,offset* sizeof(char16_t));
+    }
+
+#ifdef UCBUF_DEBUG
+    memset(pTarget+offset,0xff,sizeof(char16_t)*(MAX_IN_BUF-offset));
+#endif
+    if(buf->isBuffered){
+        cbufSize = MAX_IN_BUF;
+        /* read the file */
+        inputRead=T_FileStream_read(buf->in,cbuf,cbufSize-offset);
+        buf->remaining-=inputRead;
+
+    }else{
+        cbufSize = T_FileStream_size(buf->in);
+        cbuf = (char*)uprv_malloc(cbufSize);
+        if (cbuf == nullptr) {
+            *error = U_MEMORY_ALLOCATION_ERROR;
+            return nullptr;
+        }
+        inputRead= T_FileStream_read(buf->in,cbuf,cbufSize);
+        buf->remaining-=inputRead;
+    }
+
+    /* just to be sure...*/
+    if ( 0 == inputRead )
+        buf->remaining = 0;
+
+    target=pTarget;
+    /* convert the bytes */
+    if(buf->conv){
+        /* set the callback to stop */
+        UConverterToUCallback toUOldAction ;
+        void* toUOldContext;
+        void* toUNewContext=nullptr;
+        ucnv_setToUCallBack(buf->conv,
+           UCNV_TO_U_CALLBACK_STOP,
+           toUNewContext,
+           &toUOldAction,
+           (const void**)&toUOldContext,
+           error);
+        /* since state is saved in the converter we add offset to source*/
+        target = pTarget+offset;
+        source = cbuf;
+        sourceLimit = source + inputRead;
+        ucnv_toUnicode(buf->conv,&target,target+(buf->bufCapacity-offset),
+                        &source,sourceLimit,nullptr,
+                        (UBool)(buf->remaining==0),error);
+
+        if(U_FAILURE(*error)){
+            char context[CONTEXT_LEN+1];
+            char preContext[CONTEXT_LEN+1];
+            char postContext[CONTEXT_LEN+1];
+            int8_t len = CONTEXT_LEN;
+            int32_t start=0;
+            int32_t stop =0;
+            int32_t pos =0;
+            int32_t copyLen=0;
+            /* use erro1 to preserve the error code */
+            UErrorCode error1 =U_ZERO_ERROR;
+
+            if( buf->showWarning==true){
+                fprintf(stderr,"\n###WARNING: Encountered abnormal bytes while"
+                               " converting input stream to target encoding: %s\n",
+                               u_errorName(*error));
+            }
+
+
+            /* now get the context chars */
+            ucnv_getInvalidChars(buf->conv,context,&len,&error1);
+            context[len]= 0 ; /* null terminate the buffer */
+
+            pos = (int32_t)(source - cbuf - len);
+
+            /* for pre-context */
+            start = (pos <=CONTEXT_LEN)? 0 : (pos - (CONTEXT_LEN-1));
+            stop = pos-len;
+
+            /* stop falls before start when the invalid bytes sit at the very
+             * beginning of cbuf; never hand a negative length to memcpy */
+            copyLen = (stop>start)? (stop-start) : 0;
+            memcpy(preContext,cbuf+start,copyLen);
+            /* null terminate the buffer */
+            preContext[copyLen] = 0;
+
+            /* for post-context */
+            start = pos+len;
+            stop = (int32_t)(((pos+CONTEXT_LEN)<= (sourceLimit-cbuf) )? (pos+(CONTEXT_LEN-1)) : (sourceLimit-cbuf));
+
+            /* same guard as above */
+            copyLen = (stop>start)? (stop-start) : 0;
+            memcpy(postContext,source,copyLen);
+            /* null terminate the buffer */
+            postContext[copyLen] = 0;
+
+            if(buf->showWarning ==true){
+                /* print out the context */
+                fprintf(stderr,"\tPre-context: %s\n",preContext);
+                fprintf(stderr,"\tContext: %s\n",context);
+                fprintf(stderr,"\tPost-context: %s\n", postContext);
+            }
+
+            /* reset the converter */
+            ucnv_reset(buf->conv);
+
+            /* set the call back to substitute
+             * and restart conversion
+             */
+            ucnv_setToUCallBack(buf->conv,
+               UCNV_TO_U_CALLBACK_SUBSTITUTE,
+               toUNewContext,
+               &toUOldAction,
+               (const void**)&toUOldContext,
+               &error1);
+
+            /* reset source and target start positions */
+            target = pTarget+offset;
+            source = cbuf;
+
+            /* re convert */
+            ucnv_toUnicode(buf->conv,&target,target+(buf->bufCapacity-offset),
+                            &source,sourceLimit,nullptr,
+                            (UBool)(buf->remaining==0),&error1);
+
+        }
+        outputWritten = (int32_t)(target - pTarget);
+
+#ifdef UCBUF_DEBUG
+        {
+            int i;
+            target = pTarget;
+            /* walks the units just written (the loop used an undeclared variable before) */
+            for(i=0;i<outputWritten;i++){
+                /* printf("%c", (char)(*target++));*/
+            }
+        }
+#endif
+
+    }else{
+        u_charsToUChars(cbuf,target+offset,inputRead);
+        outputWritten=((buf->remaining>cbufSize)? cbufSize:inputRead+offset);
+    }
+    buf->currentPos = pTarget;
+    buf->bufLimit=pTarget+outputWritten;
+    *buf->bufLimit=0; /*NUL terminate*/
+    if(cbuf!=carr){
+        uprv_free(cbuf);
+    }
+    return buf;
+}
+
+
+
+/*
+ * Returns the next UTF-16 code unit from the stream.
+ *
+ * @param buf   the UCHARBUF to read from
+ * @param error must not indicate a failure on entry; receives errors from refilling
+ * @return the code unit, or U_EOF at the end of the input or on any error
+ */
+U_CAPI int32_t U_EXPORT2
+ucbuf_getc(UCHARBUF* buf,UErrorCode* error){
+    if(error==nullptr || U_FAILURE(*error)){
+        /* same sentinel as every other failure path below; a plain 0
+         * would be indistinguishable from a U+0000 read from the file */
+        return U_EOF;
+    }
+    if(buf->currentPos>=buf->bufLimit){
+        if(buf->remaining==0){
+            return U_EOF;
+        }
+        buf=ucbuf_fillucbuf(buf,error);
+        if(U_FAILURE(*error)){
+            return U_EOF;
+        }
+        /* the refill may have produced nothing; do not hand out the NUL
+         * terminator or step past bufLimit */
+        if(buf->currentPos>=buf->bufLimit){
+            return U_EOF;
+        }
+    }
+
+    return *(buf->currentPos++);
+}
+
+/*
+ * Returns the next code point from the stream, combining a lead surrogate
+ * with the code unit that follows it.
+ *
+ * @param buf   the UCHARBUF to read from
+ * @param error must not indicate a failure on entry; receives errors from refilling
+ * @return the code point, or U_EOF at the end of the input or on any error
+ */
+U_CAPI int32_t U_EXPORT2
+ucbuf_getc32(UCHARBUF* buf,UErrorCode* error){
+    int32_t retVal = (int32_t)U_EOF;
+    if(error==nullptr || U_FAILURE(*error)){
+        /* use the EOF sentinel; 0 would look like a valid U+0000 */
+        return U_EOF;
+    }
+    /* Top up the buffer while fewer than two units are available so that a
+     * surrogate pair is not split, but only if the file has more to give.
+     * A single unit left at the very end of the file is still returned below. */
+    if(buf->currentPos+1>=buf->bufLimit && buf->remaining!=0){
+        buf=ucbuf_fillucbuf(buf,error);
+        if(U_FAILURE(*error)){
+            return U_EOF;
+        }
+    }
+    if(buf->currentPos>=buf->bufLimit){
+        return U_EOF;
+    }
+    /* only pair up when a second unit really is inside the buffer; the slot
+     * at bufLimit holds the NUL terminator, not input */
+    if(U16_IS_LEAD(*(buf->currentPos)) && buf->currentPos+1<buf->bufLimit){
+        retVal=U16_GET_SUPPLEMENTARY(buf->currentPos[0],buf->currentPos[1]);
+        buf->currentPos+=2;
+    }else{
+        retVal = *(buf->currentPos++);
+    }
+    return retVal;
+}
+
+/*
+ * Callback for u_unescapeAt(): context is the UCHARBUF, and offset is
+ * counted from its current read position.
+ */
+static char16_t U_CALLCONV
+_charAt(int32_t offset, void *context) {
+    UCHARBUF* ucharBuf = (UCHARBUF*) context;
+    return ucharBuf->currentPos[offset];
+}
+
+/*
+ * Reads the next code unit and, if it is a backslash, tries to unescape the
+ * sequence that follows it with u_unescapeAt().
+ *
+ * - A unit other than a backslash is returned as is.
+ * - If u_unescapeAt() fails, *error is set to U_ILLEGAL_ESCAPE_SEQUENCE and
+ *   the backslash is returned.
+ * - If the unescaped value merely equals the unit after the backslash
+ *   (e.g. \' \\ \"), the backslash is returned and that unit is not consumed.
+ * - Otherwise the escape is consumed and its value returned.
+ *
+ * NOTE(review): on a null or failed error code this returns false, i.e. 0,
+ * not U_EOF. Errors from ucbuf_fillucbuf() are not checked here.
+ */
+U_CAPI int32_t U_EXPORT2
+ucbuf_getcx32(UCHARBUF* buf,UErrorCode* error) {
+ int32_t length;
+ int32_t offset;
+ UChar32 c32,c1,c2;
+ if(error==nullptr || U_FAILURE(*error)){
+ return false;
+ }
+ /* Refill the buffer when at most two units are left in it */
+ if (buf->currentPos >=buf->bufLimit-2) {
+ ucbuf_fillucbuf(buf,error);
+ }
+
+ /* Get the next character in the buffer */
+ if (buf->currentPos < buf->bufLimit) {
+ c1 = *(buf->currentPos)++;
+ } else {
+ c1 = U_EOF;
+ }
+
+ /* peek at the unit after c1; at bufLimit this is the NUL that ucbuf_fillucbuf() stores there */
+ c2 = *(buf->currentPos);
+
+ /* If it isn't a backslash, return it */
+ if (c1 != 0x005C) {
+ return c1;
+ }
+
+ /* Determine the amount of data in the buffer */
+ length = (int32_t)(buf->bufLimit - buf->currentPos);
+
+ /* The longest escape sequence is \Uhhhhhhhh; make sure
+ we have at least that many characters */
+ if (length < 10) {
+
+ /* fill the buffer */
+ ucbuf_fillucbuf(buf,error);
+ /* ucbuf_fillucbuf() resets currentPos to buffer, so this is again the number of unread units */
+ length = (int32_t)(buf->bufLimit - buf->buffer);
+ }
+
+ /* Process the escape */
+ offset = 0;
+ c32 = u_unescapeAt(_charAt, &offset, length, (void*)buf);
+
+ /* check if u_unescapeAt unescaped and converted
+ * to c32 or not
+ */
+ if(c32==(UChar32)0xFFFFFFFF){
+ if(buf->showWarning) {
+ char context[CONTEXT_LEN+1];
+ int32_t len = CONTEXT_LEN;
+ if(length < len) {
+ len = length;
+ }
+ context[len]= 0 ; /* null terminate the buffer */
+ u_UCharsToChars( buf->currentPos, context, len);
+ fprintf(stderr,"Bad escape: [%c%s]...\n", (int)c1, context);
+ }
+ *error= U_ILLEGAL_ESCAPE_SEQUENCE;
+ return c1;
+ }else if(c32!=c2 || (c32==0x0075 && c2==0x0075 && c1==0x005C) /* for \u0075 c2=0x0075 and c32==0x0075*/){
+ /* Update the current buffer position */
+ buf->currentPos += offset;
+ }else{
+ /* unescaping failed so we just return
+ * c1 and not consume the buffer
+ * this is useful for rules with escapes
+ * in resource bundles
+ * eg: \' \\ \"
+ */
+ return c1;
+ }
+
+ return c32;
+}
+
+/*
+ * Opens fileName ("-" selects stdin), sets up a converter for it and reads
+ * the first portion of text into a newly allocated UCHARBUF.
+ *
+ * If *cp is null or empty, the charset is autodetected from the file's
+ * Unicode signature and *cp is updated; if *cp names one of the charsets
+ * known to ucbuf_isCPKnown(), a signature at the start of the file is skipped.
+ *
+ * @param fileName    file to read, or "-" for stdin
+ * @param cp          in/out: name of the file's charset
+ * @param showWarning true to have warnings printed to stderr
+ * @param buffered    true to read the file block by block, false to read it all at once
+ * @param error       receives the error code
+ * @return the new UCHARBUF, or nullptr on error; everything opened here is released in that case
+ */
+U_CAPI UCHARBUF* U_EXPORT2
+ucbuf_open(const char* fileName,const char** cp,UBool showWarning, UBool buffered, UErrorCode* error){
+    if(error==nullptr || U_FAILURE(*error)){
+        return nullptr;
+    }
+    if(cp==nullptr || fileName==nullptr){
+        *error = U_ILLEGAL_ARGUMENT_ERROR;
+        return nullptr;
+    }
+
+    FileStream* stream = nullptr;
+    if (uprv_strcmp(fileName, "-")==0) {
+        stream = T_FileStream_stdin();
+    }else{
+        stream = T_FileStream_open(fileName, "rb");
+    }
+    if(stream==nullptr){
+        *error =U_FILE_ACCESS_ERROR;
+        return nullptr;
+    }
+
+    UCHARBUF* ucb =(UCHARBUF*) uprv_malloc(sizeof(UCHARBUF));
+    const int32_t totalBytes = T_FileStream_size(stream);
+    if(ucb == nullptr){
+        *error = U_MEMORY_ALLOCATION_ERROR;
+        T_FileStream_close(stream);
+        return nullptr;
+    }
+    ucb->in=stream;
+    ucb->conv=nullptr;
+    ucb->showWarning = showWarning;
+    ucb->isBuffered = buffered;
+    ucb->signatureLength=0;
+
+    if(*cp==nullptr || **cp=='\0'){
+        /* don't have code page name... try to autodetect */
+        ucbuf_autodetect_fs(stream,cp,&ucb->conv,&ucb->signatureLength,error);
+    }else if(ucbuf_isCPKnown(*cp)){
+        /* the caller's name wins; the detection only serves to discard the BOM */
+        const char* detectedCp = nullptr;
+        ucbuf_autodetect_fs(stream,&detectedCp,&ucb->conv,&ucb->signatureLength,error);
+    }
+    if(U_SUCCESS(*error) && ucb->conv==nullptr) {
+        ucb->conv=ucnv_open(*cp,error);
+    }
+    if(U_FAILURE(*error)){
+        /* the text buffer does not exist yet, so take the pieces apart by hand */
+        ucnv_close(ucb->conv);
+        uprv_free(ucb);
+        T_FileStream_close(stream);
+        return nullptr;
+    }
+
+    if((ucb->conv==nullptr) && (ucb->showWarning==true)){
+        fprintf(stderr,"###WARNING: No converter defined. Using codepage of system.\n");
+    }
+
+    ucb->remaining=totalBytes-ucb->signatureLength;
+    ucb->bufCapacity = ucb->isBuffered
+        ? MAX_U_BUF
+        : ucb->remaining+ucb->signatureLength+1/*for terminating nul*/;
+    ucb->buffer=(char16_t*) uprv_malloc(U_SIZEOF_UCHAR * ucb->bufCapacity );
+    if (ucb->buffer == nullptr) {
+        *error = U_MEMORY_ALLOCATION_ERROR;
+        ucbuf_close(ucb);
+        return nullptr;
+    }
+    ucb->currentPos=ucb->buffer;
+    ucb->bufLimit=ucb->buffer;
+
+    /* *error has not been written since the failure check above, so this branch cannot be taken as the code stands */
+    if(U_FAILURE(*error)){
+        fprintf(stderr, "Could not open codepage [%s]: %s\n", *cp, u_errorName(*error));
+        ucbuf_close(ucb);
+        return nullptr;
+    }
+
+    /* load the first portion of text */
+    ucbuf_fillucbuf(ucb,error);
+    if(U_FAILURE(*error)){
+        ucbuf_close(ucb);
+        return nullptr;
+    }
+    return ucb;
+}
+
+
+
+/*
+ * Pushes c back by moving the read position one unit backwards.
+ * The call is silently ignored when the read position is at the start of the
+ * buffer or when the unit before it is not c.
+ *
+ * TODO: this method will fail if at the
+ * beginning of buffer and the uchar to unget
+ * is from the previous buffer. Need to implement
+ * system to take care of that situation.
+ */
+U_CAPI void U_EXPORT2
+ucbuf_ungetc(int32_t c,UCHARBUF* buf){
+    /* nothing to step back to at the beginning of the buffer */
+    if(buf->currentPos==buf->buffer){
+        return;
+    }
+    /* step back only if the previous unit really is the one being returned */
+    char16_t* previous = buf->currentPos-1;
+    if(*previous==c){
+        buf->currentPos = previous;
+    }
+}
+
+/* Releases the char16_t text buffer and leaves buf->buffer null. */
+static void
+ucbuf_closebuf(UCHARBUF* buf){
+    char16_t* storage = buf->buffer;
+    buf->buffer = nullptr;
+    uprv_free(storage);
+}
+
+/*
+ * Closes the converter and the input stream, frees the text buffer and then
+ * the UCHARBUF itself. A null buf is ignored.
+ */
+U_CAPI void U_EXPORT2
+ucbuf_close(UCHARBUF* buf){
+    if(buf==nullptr){
+        return;
+    }
+    if(buf->conv!=nullptr){
+        ucnv_close(buf->conv);
+    }
+    T_FileStream_close(buf->in);
+    ucbuf_closebuf(buf);
+    uprv_free(buf);
+}
+
+/*
+ * Rewinds the file stream and empties the text buffer so that reading starts
+ * over. If the file began with a Unicode signature, its bytes are read again
+ * and passed through the converter.
+ *
+ * @param buf   the UCHARBUF to rewind; a null pointer is ignored
+ * @param error set to U_INTERNAL_PROGRAM_ERROR if the signature can no longer
+ *              be read back as exactly U+FEFF
+ */
+U_CAPI void U_EXPORT2
+ucbuf_rewind(UCHARBUF* buf,UErrorCode* error){
+    if(error==nullptr || U_FAILURE(*error) || buf==nullptr){
+        return;
+    }
+
+    /* drop the buffered text and go back to the first byte of the file */
+    buf->currentPos=buf->buffer;
+    buf->bufLimit=buf->buffer;
+    T_FileStream_rewind(buf->in);
+    buf->remaining=T_FileStream_size(buf->in)-buf->signatureLength;
+
+    ucnv_resetToUnicode(buf->conv);
+    if(buf->signatureLength<=0) {
+        return;
+    }
+
+    /* read the signature bytes */
+    char sigBytes[8];
+    const int32_t sigRead=T_FileStream_read(buf->in, sigBytes, buf->signatureLength);
+
+    /* convert and ignore initial U+FEFF, and the buffer overflow */
+    char16_t bom[1]={ 0 };
+    char16_t* bomEnd=bom;
+    const char* consumed=sigBytes;
+    ucnv_toUnicode(buf->conv, &bomEnd, bom+1, &consumed, sigBytes+sigRead, nullptr, false, error);
+    if(*error==U_BUFFER_OVERFLOW_ERROR) {
+        *error=U_ZERO_ERROR;
+    }
+
+    /* verify that we successfully read exactly U+FEFF */
+    if(U_SUCCESS(*error) && (sigRead!=buf->signatureLength || bomEnd!=(bom+1) || bom[0]!=0xfeff)) {
+        *error=U_INTERNAL_PROGRAM_ERROR;
+    }
+}
+
+
+/*
+ * Returns a size for the text behind buf, or 0 for a null buf.
+ * Unbuffered: the number of code units currently held in the buffer.
+ * Buffered: the file size without the signature, divided by the converter's
+ * minimum number of bytes per character.
+ */
+U_CAPI int32_t U_EXPORT2
+ucbuf_size(UCHARBUF* buf){
+    if(buf==nullptr){
+        return 0;
+    }
+    if(!buf->isBuffered){
+        return (int32_t)(buf->bufLimit - buf->buffer);
+    }
+    const int32_t payloadBytes = T_FileStream_size(buf->in)-buf->signatureLength;
+    return payloadBytes/ucnv_getMinCharSize(buf->conv);
+}
+
+/*
+ * Gives access to the internal text buffer.
+ *
+ * @param buf   the UCHARBUF
+ * @param len   receives the number of code units between the start of the buffer and its limit
+ * @param error set to U_ILLEGAL_ARGUMENT_ERROR if buf or len is null
+ * @return the start of the internal buffer, or nullptr on error
+ */
+U_CAPI const char16_t* U_EXPORT2
+ucbuf_getBuffer(UCHARBUF* buf,int32_t* len,UErrorCode* error){
+    if(error==nullptr || U_FAILURE(*error)){
+        return nullptr;
+    }
+    if(buf==nullptr || len==nullptr){
+        *error = U_ILLEGAL_ARGUMENT_ERROR;
+        return nullptr;
+    }
+    const char16_t* textStart = buf->buffer;
+    *len = (int32_t)(buf->bufLimit - textStart);
+    return textStart;
+}
+
+/*
+ * Combines inputDir and fileName into a path in the caller's buffer.
+ *
+ * @param inputDir directory name; may be empty, in which case the result is fileName alone
+ * @param fileName file name to append
+ * @param target   output buffer; may be null only if *len is not greater than 0
+ * @param len      in: capacity of target; out: the required capacity when
+ *                 U_BUFFER_OVERFLOW_ERROR is reported
+ * @param status   U_ILLEGAL_ARGUMENT_ERROR for null arguments,
+ *                 U_BUFFER_OVERFLOW_ERROR if target is null or too small
+ * @return target, or nullptr on error
+ */
+U_CAPI const char* U_EXPORT2
+ucbuf_resolveFileName(const char* inputDir, const char* fileName, char* target, int32_t* len, UErrorCode* status){
+    int32_t requiredLen = 0;
+    int32_t dirlen = 0;
+    int32_t filelen = 0;
+    if(status==nullptr || U_FAILURE(*status)){
+        return nullptr;
+    }
+
+    if(inputDir == nullptr || fileName == nullptr || len==nullptr || (target==nullptr && *len>0)){
+        *status = U_ILLEGAL_ARGUMENT_ERROR;
+        return nullptr;
+    }
+
+
+    dirlen = (int32_t)uprv_strlen(inputDir);
+    filelen = (int32_t)uprv_strlen(fileName);
+    if(dirlen == 0) {
+        /* An empty directory has no last character to look at; testing
+         * inputDir[dirlen-1] would read before the start of the string.
+         * The result is simply fileName. */
+        requiredLen = filelen + 1;
+        if((*len < requiredLen) || target==nullptr){
+            *len = requiredLen;
+            *status = U_BUFFER_OVERFLOW_ERROR;
+            return nullptr;
+        }
+
+        target[0] = '\0';
+    } else if(inputDir[dirlen-1] != U_FILE_SEP_CHAR) {
+        requiredLen = dirlen + filelen + 2;
+        if((*len < requiredLen) || target==nullptr){
+            *len = requiredLen;
+            *status = U_BUFFER_OVERFLOW_ERROR;
+            return nullptr;
+        }
+
+        target[0] = '\0';
+        /*
+         * append the input dir to openFileName if the first char in
+         * filename is not file separation char and the last char input directory is not '.'.
+         * This is to support :
+         * genrb -s. /home/icu/data
+         * genrb -s. icu/data
+         * The user cannot mix notations like
+         * genrb -s. /icu/data --- the absolute path specified. -s redundant
+         * user should use
+         * genrb -s. icu/data --- start from CWD and look in icu/data dir
+         */
+        if( (fileName[0] != U_FILE_SEP_CHAR) && (inputDir[dirlen-1] !='.')){
+            uprv_strcpy(target, inputDir);
+            target[dirlen] = U_FILE_SEP_CHAR;
+        }
+        target[dirlen + 1] = '\0';
+    } else {
+        /* inputDir already ends with a separator */
+        requiredLen = dirlen + filelen + 1;
+        if((*len < requiredLen) || target==nullptr){
+            *len = requiredLen;
+            *status = U_BUFFER_OVERFLOW_ERROR;
+            return nullptr;
+        }
+
+        uprv_strcpy(target, inputDir);
+    }
+
+    uprv_strcat(target, fileName);
+    return target;
+}
+/*
+ * Single code units that end a line in ucbuf_readline(), following
+ * Unicode TR 13. The two-unit CR+LF combination is not covered here and
+ * has to be handled by the caller.
+ */
+static UBool ucbuf_isCharNewLine(char16_t c){
+    return c==0x000A /* LF */
+        || c==0x000D /* CR */
+        || c==0x000C /* FF */
+        || c==0x0085 /* NEL */
+        || c==0x2028 /* LS */
+        || c==0x2029 /* PS */;
+}
+
+/*
+ * Returns a pointer into the internal buffer at the start of the next line
+ * and advances the read position past that line.
+ *
+ * *len receives the number of code units from the start of the line up to
+ * and including its terminator; for CR LF the count includes the CR but not
+ * the LF, although both are skipped. The scan also stops when the end of
+ * the buffered text is reached.
+ *
+ * buf, len and err are dereferenced without null checks.
+ *
+ * @return the start of the line, or nullptr at the end of the input or on error
+ */
+U_CAPI const char16_t* U_EXPORT2
+ucbuf_readline(UCHARBUF* buf,int32_t* len,UErrorCode* err){
+ char16_t* temp = buf->currentPos;
+ char16_t* savePos =nullptr;
+ char16_t c=0x0000;
+ if(buf->isBuffered){
+ /* The input is buffered we have to do more
+ * for returning a pointer U_TRUNCATED_CHAR_FOUND
+ */
+ for(;;){
+ c = *temp++;
+ /* NOTE(review): remaining reaches 0 as soon as the last block has been
+ * read from the file, so text still waiting in the buffer at that
+ * point looks like it is never returned - confirm. */
+ if(buf->remaining==0){
+ return nullptr; /* end of file is reached return nullptr */
+ }
+ if(temp>=buf->bufLimit && buf->currentPos == buf->buffer){
+ *err= U_TRUNCATED_CHAR_FOUND;
+ return nullptr;
+ }else{
+ /* NOTE(review): this refill runs on every iteration that does not hit
+ * the case above. ucbuf_fillucbuf() moves the unread units to the start
+ * of the buffer and resets currentPos, while temp keeps its old value -
+ * confirm that this is intended. */
+ ucbuf_fillucbuf(buf,err);
+ if(U_FAILURE(*err)){
+ return nullptr;
+ }
+ }
+ /*
+ * According to TR 13 readLine functions must interpret
+ * CR, CR+LF, LF, NEL, PS, LS or FF as line separators
+ */
+ /* Windows CR LF */
+ if(c ==0x0d && temp <= buf->bufLimit && *temp == 0x0a ){
+ /* the length is taken before temp steps over the LF */
+ *len = (int32_t)(temp++ - buf->currentPos);
+ savePos = buf->currentPos;
+ buf->currentPos = temp;
+ return savePos;
+ }
+ /* else */
+
+ if (temp>=buf->bufLimit|| ucbuf_isCharNewLine(c)){ /* Unipad inserts 2028 line separators! */
+ *len = (int32_t)(temp - buf->currentPos);
+ savePos = buf->currentPos;
+ buf->currentPos = temp;
+ return savePos;
+ }
+ }
+ }else{
+ /* we know that all input is read into the internal
+ * buffer so we can safely return pointers
+ */
+ for(;;){
+ c = *temp++;
+
+ /* currentPos only changes when a line is returned, so this tests whether the call started at the end of the text */
+ if(buf->currentPos==buf->bufLimit){
+ return nullptr; /* end of file is reached return nullptr */
+ }
+ /* Windows CR LF */
+ if(c ==0x0d && temp <= buf->bufLimit && *temp == 0x0a ){
+ /* the length is taken before temp steps over the LF */
+ *len = (int32_t)(temp++ - buf->currentPos);
+ savePos = buf->currentPos;
+ buf->currentPos = temp;
+ return savePos;
+ }
+ /* else */
+ if (temp>=buf->bufLimit|| ucbuf_isCharNewLine(c)) { /* Unipad inserts 2028 line separators! */
+ *len = (int32_t)(temp - buf->currentPos);
+ savePos = buf->currentPos;
+ buf->currentPos = temp;
+ return savePos;
+ }
+ }
+ }
+ /* not reached */
+ /* A compiler warning will appear if all paths don't contain a return statement. */
+/* return nullptr;*/
+}
+#endif
diff --git a/intl/icu/source/tools/toolutil/ucbuf.h b/intl/icu/source/tools/toolutil/ucbuf.h
new file mode 100644
index 0000000000..117920b794
--- /dev/null
+++ b/intl/icu/source/tools/toolutil/ucbuf.h
@@ -0,0 +1,218 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/*
+*******************************************************************************
+*
+* Copyright (C) 1998-2016, International Business Machines
+* Corporation and others. All Rights Reserved.
+*
+*******************************************************************************
+*
+* File ucbuf.h
+*
+* Modification History:
+*
+* Date Name Description
+* 05/10/01 Ram Creation.
+*
+* This API reads in files and returns UChars
+*******************************************************************************
+*/
+
+#include "unicode/localpointer.h"
+#include "unicode/ucnv.h"
+#include "filestrm.h"
+
+#if !UCONFIG_NO_CONVERSION
+
+#ifndef UCBUF_H
+#define UCBUF_H 1
+
+typedef struct UCHARBUF UCHARBUF;
+/**
+ * End of file value
+ */
+#define U_EOF ((int32_t)0xFFFFFFFF)
+/**
+ * Error value if a sequence cannot be unescaped
+ */
+#define U_ERR ((int32_t)0xFFFFFFFE)
+
+typedef struct ULine ULine;
+
+struct ULine {
+ UChar *name; /* pointer to UChar text; NOTE(review): ownership is not visible in this header */
+ int32_t len; /* presumably the number of UChars at name -- TODO confirm against the users of ULine */
+};
+
+/**
+ * Opens the UCHARBUF with the given file stream and code page for conversion
+ * @param fileName Name of the file to open.
+ * @param codepage The encoding of the file stream to convert to Unicode.
+ * If *codepage is NULL on input the API will try to autodetect
+ * popular Unicode encodings
+ * @param showWarning Flag to print out warnings to STDOUT
+ * @param buffered If true performs a buffered read of the input file. If false reads
+ * the whole file into memory and converts it.
+ * @param err is a pointer to a valid <code>UErrorCode</code> value. If this value
+ * indicates a failure on entry, the function will immediately return.
+ * On exit the value will indicate the success of the operation.
+ * @return pointer to the newly opened UCHARBUF
+ */
+U_CAPI UCHARBUF* U_EXPORT2
+ucbuf_open(const char* fileName,const char** codepage,UBool showWarning, UBool buffered, UErrorCode* err);
+
+/**
+ * Gets a UTF-16 code unit at the current position from the converted buffer
+ * and increments the current position
+ * @param buf Pointer to UCHARBUF structure
+ * @param err is a pointer to a valid <code>UErrorCode</code> value. If this value
+ * indicates a failure on entry, the function will immediately return.
+ * On exit the value will indicate the success of the operation.
+ */
+U_CAPI int32_t U_EXPORT2
+ucbuf_getc(UCHARBUF* buf,UErrorCode* err);
+
+/**
+ * Gets a UTF-32 code point at the current position from the converted buffer
+ * and increments the current position
+ * @param buf Pointer to UCHARBUF structure
+ * @param err is a pointer to a valid <code>UErrorCode</code> value. If this value
+ * indicates a failure on entry, the function will immediately return.
+ * On exit the value will indicate the success of the operation.
+ */
+U_CAPI int32_t U_EXPORT2
+ucbuf_getc32(UCHARBUF* buf,UErrorCode* err);
+
+/**
+ * Gets a UTF-16 code unit at the current position from the converted buffer after
+ * unescaping and increments the current position. If the escape sequence is for UTF-32
+ * code point (\\Uxxxxxxxx) then a UTF-32 codepoint is returned
+ * @param buf Pointer to UCHARBUF structure
+ * @param err is a pointer to a valid <code>UErrorCode</code> value. If this value
+ * indicates a failure on entry, the function will immediately return.
+ * On exit the value will indicate the success of the operation.
+ */
+U_CAPI int32_t U_EXPORT2
+ucbuf_getcx32(UCHARBUF* buf,UErrorCode* err);
+
+/**
+ * Gets a pointer to the current position in the internal buffer and length of the line.
+ * It is imperative to make a copy of the returned buffer before performing operations on it.
+ * @param buf Pointer to UCHARBUF structure
+ * @param len Output param to receive the len of the buffer returned till end of the line
+ * @param err is a pointer to a valid <code>UErrorCode</code> value. If this value
+ * indicates a failure on entry, the function will immediately return.
+ * On exit the value will indicate the success of the operation.
+ * Error: U_TRUNCATED_CHAR_FOUND
+ * @return Pointer to the internal buffer, NULL if EOF
+ */
+U_CAPI const UChar* U_EXPORT2
+ucbuf_readline(UCHARBUF* buf,int32_t* len, UErrorCode* err);
+
+
+/**
+ * Resets the buffers and the underlying file stream.
+ * @param buf Pointer to UCHARBUF structure
+ * @param err is a pointer to a valid <code>UErrorCode</code> value. If this value
+ * indicates a failure on entry, the function will immediately return.
+ * On exit the value will indicate the success of the operation.
+ */
+U_CAPI void U_EXPORT2
+ucbuf_rewind(UCHARBUF* buf,UErrorCode* err);
+
+/**
+ * Returns a pointer to the internal converted buffer
+ * @param buf Pointer to UCHARBUF structure
+ * @param len Pointer to int32_t to receive the length of buffer
+ * @param err is a pointer to a valid <code>UErrorCode</code> value. If this value
+ * indicates a failure on entry, the function will immediately return.
+ * On exit the value will indicate the success of the operation.
+ * @return Pointer to internal UChar buffer
+ */
+U_CAPI const UChar* U_EXPORT2
+ucbuf_getBuffer(UCHARBUF* buf,int32_t* len,UErrorCode* err);
+
+/**
+ * Closes the UCHARBUF structure members and cleans up the malloc'ed memory
+ * @param buf Pointer to UCHARBUF structure
+ */
+U_CAPI void U_EXPORT2
+ucbuf_close(UCHARBUF* buf);
+
+#if U_SHOW_CPLUSPLUS_API
+
+U_NAMESPACE_BEGIN
+
+/**
+ * \class LocalUCHARBUFPointer
+ * "Smart pointer" class, closes a UCHARBUF via ucbuf_close().
+ * For most methods see the LocalPointerBase base class.
+ *
+ * @see LocalPointerBase
+ * @see LocalPointer
+ */
+U_DEFINE_LOCAL_OPEN_POINTER(LocalUCHARBUFPointer, UCHARBUF, ucbuf_close);
+
+U_NAMESPACE_END
+
+#endif
+
+/**
+ * Rewinds the buffer by one codepoint. Does not rewind over escaped characters.
+ */
+U_CAPI void U_EXPORT2
+ucbuf_ungetc(int32_t ungetChar,UCHARBUF* buf);
+
+
+/**
+ * Autodetects the encoding of the file stream. Only Unicode charsets are autodetected.
+ * Some Unicode charsets are stateful and need byte identifiers to be converted also to bring
+ * the converter to correct state for converting the rest of the stream. So the UConverter parameter
+ * is necessary.
+ * If the charset was autodetected, the caller must close both the input FileStream
+ * and the converter.
+ *
+ * @param fileName The name of the file to be opened and whose encoding is to be autodetected
+ * @param conv Output param to receive the opened converter if autodetected; NULL otherwise.
+ * @param cp Output param to receive the detected encoding
+ * @param status is a pointer to a valid <code>UErrorCode</code> value. If this value
+ * indicates a failure on entry, the function will immediately return.
+ * On exit the value will indicate the success of the operation.
+ * @return The input FileStream if its charset was autodetected; NULL otherwise.
+ */
+U_CAPI FileStream * U_EXPORT2
+ucbuf_autodetect(const char* fileName, const char** cp,UConverter** conv,
+int32_t* signatureLength, UErrorCode* status);
+
+/**
+ * Autodetects the encoding of the file stream. Only Unicode charsets are autodetected.
+ * Some Unicode charsets are stateful and need byte identifiers to be converted also to bring
+ * the converter to correct state for converting the rest of the stream. So the UConverter parameter
+ * is necessary.
+ * If the charset was autodetected, the caller must close the converter.
+ *
+ * @param in The file stream whose encoding is to be detected
+ * @param conv Output param to receive the opened converter if autodetected; NULL otherwise.
+ * @param cp Output param to receive the detected encoding
+ * @param status is a pointer to a valid <code>UErrorCode</code> value. If this value
+ * indicates a failure on entry, the function will immediately return.
+ * On exit the value will indicate the success of the operation.
+ * @return Boolean whether the Unicode charset was autodetected.
+ */
+
+U_CAPI UBool U_EXPORT2
+ucbuf_autodetect_fs(FileStream* in, const char** cp, UConverter** conv, int32_t* signatureLength, UErrorCode* status);
+
+/**
+ * Returns the approximate size in UChars required for converting the file to UChars
+ */
+U_CAPI int32_t U_EXPORT2
+ucbuf_size(UCHARBUF* buf);
+
+U_CAPI const char* U_EXPORT2
+ucbuf_resolveFileName(const char* inputDir, const char* fileName, char* target, int32_t* len, UErrorCode* status);
+
+#endif
+#endif
+
diff --git a/intl/icu/source/tools/toolutil/ucln_tu.cpp b/intl/icu/source/tools/toolutil/ucln_tu.cpp
new file mode 100644
index 0000000000..4727227ebf
--- /dev/null
+++ b/intl/icu/source/tools/toolutil/ucln_tu.cpp
@@ -0,0 +1,19 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/********************************************************************
+ * COPYRIGHT:
+ * Copyright (c) 2007-2014, International Business Machines Corporation and
+ * others. All Rights Reserved.
+ ********************************************************************/
+
+
+/** Auto-client **/
+#define UCLN_TYPE UCLN_TOOLUTIL
+#include "ucln_imp.h"
+
+/* Declared first so that the definition below has a preceding prototype. */
+int uprv_dummyFunction_TU();
+
+/*
+ * Placeholder whose only purpose is to give this translation unit some
+ * content, so that compilers do not complain about an empty file.
+ * Always returns 0.
+ */
+int uprv_dummyFunction_TU() {
+    return 0;
+}
diff --git a/intl/icu/source/tools/toolutil/ucm.cpp b/intl/icu/source/tools/toolutil/ucm.cpp
new file mode 100644
index 0000000000..272570e72f
--- /dev/null
+++ b/intl/icu/source/tools/toolutil/ucm.cpp
@@ -0,0 +1,1195 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/*
+*******************************************************************************
+*
+* Copyright (C) 2003-2013, International Business Machines
+* Corporation and others. All Rights Reserved.
+*
+*******************************************************************************
+* file name: ucm.c
+* encoding: UTF-8
+* tab size: 8 (not used)
+* indentation:4
+*
+* created on: 2003jun20
+* created by: Markus W. Scherer
+*
+* This file reads a .ucm file, stores its mappings and sorts them.
+* It implements handling of Unicode conversion mappings from .ucm files
+* for makeconv, canonucm, rptp2ucm, etc.
+*
+* Unicode code point sequences with a length of more than 1,
+* as well as byte sequences with more than 4 bytes or more than one complete
+* character sequence are handled to support m:n mappings.
+*/
+
+#include "unicode/utypes.h"
+#include "unicode/ustring.h"
+#include "cstring.h"
+#include "cmemory.h"
+#include "filestrm.h"
+#include "uarrsort.h"
+#include "ucnvmbcs.h"
+#include "ucnv_bld.h"
+#include "ucnv_ext.h"
+#include "uparse.h"
+#include "ucm.h"
+#include <stdio.h>
+
+#if !UCONFIG_NO_CONVERSION
+
+/* -------------------------------------------------------------------------- */
+
+/*
+ * Writes one mapping to f as a single line: every code point as <UXXXX>,
+ * one space, every byte as \xXX and, when the precision flag m->f is not
+ * negative, " |" followed by that flag. The line always ends with a newline.
+ * Only m->uLen, m->bLen and m->f are read from m; the code points and bytes
+ * themselves come from the separate array parameters.
+ */
+static void
+printMapping(UCMapping *m, UChar32 *codePoints, uint8_t *bytes, FILE *f) {
+    for(int32_t k=0; k<m->uLen; ++k) {
+        fprintf(f, "<U%04lX>", (long)codePoints[k]);
+    }
+
+    fputc(' ', f);
+
+    for(int32_t k=0; k<m->bLen; ++k) {
+        fprintf(f, "\\x%02X", bytes[k]);
+    }
+
+    if(m->f<0) {
+        /* no precision flag: just terminate the line */
+        fputs("\n", f);
+        return;
+    }
+    fprintf(f, " |%u\n", m->f);
+}
+
+/*
+ * Prints the mapping m, which belongs to table, to f.
+ * Looks up the mapping's code points and bytes via the UCM_GET_* accessors
+ * and hands them to printMapping().
+ */
+U_CAPI void U_EXPORT2
+ucm_printMapping(UCMTable *table, UCMapping *m, FILE *f) {
+    UChar32 *codePoints=UCM_GET_CODE_POINTS(table, m);
+    uint8_t *bytes=UCM_GET_BYTES(table, m);
+
+    printMapping(m, codePoints, bytes, f);
+}
+
+/*
+ * Prints every mapping of the table to f.
+ * With byUnicode set, the mappings are printed in the order in which they are
+ * stored in table->mappings; otherwise they are printed in the order given by
+ * the index array table->reverseMap (which is only read in that case).
+ */
+U_CAPI void U_EXPORT2
+ucm_printTable(UCMTable *table, FILE *f, UBool byUnicode) {
+    UCMapping *const mappings=table->mappings;
+    const int32_t count=table->mappingsLength;
+
+    if(!byUnicode) {
+        const int32_t *order=table->reverseMap;
+        for(int32_t k=0; k<count; ++k) {
+            ucm_printMapping(table, mappings+order[k], f);
+        }
+        return;
+    }
+
+    for(int32_t k=0; k<count; ++k) {
+        ucm_printMapping(table, mappings+k, f);
+    }
+}
+
+/* mapping comparisons ------------------------------------------------------ */
+
+/*
+ * Orders two mappings by their Unicode code point sequences.
+ * Returns a negative value, 0, or a positive value if l sorts before, equal
+ * to, or after r. Sequences are compared code point by code point; if one is
+ * a prefix of the other, the shorter one sorts first.
+ */
+static int32_t
+compareUnicode(UCMTable *lTable, const UCMapping *l,
+               UCMTable *rTable, const UCMapping *r) {
+    /* fast path: two single code points are stored directly in the mappings */
+    if(l->uLen==1 && r->uLen==1) {
+        return l->u-r->u;
+    }
+
+    const UChar32 *left=UCM_GET_CODE_POINTS(lTable, l);
+    const UChar32 *right=UCM_GET_CODE_POINTS(rTable, r);
+
+    /* only the common leading part can be compared element-wise */
+    const int32_t common= l->uLen<=r->uLen ? l->uLen : r->uLen;
+
+    for(int32_t k=0; k<common; ++k) {
+        if(left[k]!=right[k]) {
+            return left[k]-right[k];
+        }
+    }
+
+    /* equal so far: the lengths decide */
+    return l->uLen-r->uLen;
+}
+
+/*
+ * Orders two mappings by their byte sequences.
+ * Returns a negative value, 0, or a positive value if l sorts before, equal
+ * to, or after r.
+ *
+ * lexical==true: bytes are compared first and the lengths only break ties.
+ * This is used for sorting in the builder, to allow an efficient search for
+ * a byte sequence that could be a prefix of a previously entered one.
+ *
+ * lexical==false: the lengths are compared first, then the bytes.
+ * This is for compatibility with old .ucm tools like canonucm and rptp2ucm.
+ */
+static int32_t
+compareBytes(UCMTable *lTable, const UCMapping *l,
+             UCMTable *rTable, const UCMapping *r,
+             UBool lexical) {
+    const int32_t lengthDiff=l->bLen-r->bLen;
+    int32_t common;
+
+    if(!lexical) {
+        /* different lengths already decide the order */
+        if(lengthDiff!=0) {
+            return lengthDiff;
+        }
+        common=l->bLen;
+    } else {
+        /* compare only as many bytes as the shorter sequence has */
+        common= lengthDiff<=0 ? l->bLen : r->bLen;
+    }
+
+    const uint8_t *left=UCM_GET_BYTES(lTable, l);
+    const uint8_t *right=UCM_GET_BYTES(rTable, r);
+
+    for(int32_t k=0; k<common; ++k) {
+        const int32_t diff=left[k]-right[k];
+        if(diff!=0) {
+            return diff;
+        }
+    }
+
+    /* all compared bytes are equal: the lengths decide */
+    return lengthDiff;
+}
+
+/*
+ * Full ordering of two UCMappings, used for sorting.
+ * uFirst==true:  Unicode, then bytes (lengths first, like canonucm), then flag.
+ * uFirst==false: bytes (lexically, for the builder), then Unicode, then flag.
+ * Returns a negative value, 0, or a positive value.
+ */
+static int32_t
+compareMappings(UCMTable *lTable, const UCMapping *l,
+                UCMTable *rTable, const UCMapping *r,
+                UBool uFirst) {
+    /* primary key */
+    const int32_t primary= uFirst ?
+        compareUnicode(lTable, l, rTable, r) :
+        compareBytes(lTable, l, rTable, r, true);
+    if(primary!=0) {
+        return primary;
+    }
+
+    /* secondary key: the other side of the mapping */
+    const int32_t secondary= uFirst ?
+        compareBytes(lTable, l, rTable, r, false) :
+        compareUnicode(lTable, l, rTable, r);
+    if(secondary!=0) {
+        return secondary;
+    }
+
+    /* last resort: the precision flags */
+    return l->f-r->f;
+}
+U_CDECL_BEGIN
+/*
+ * uprv_sortArray() comparator for sorting the mappings array itself:
+ * context is the UCMTable, left and right point directly at UCMappings.
+ */
+static int32_t U_CALLCONV
+compareMappingsUnicodeFirst(const void *context, const void *left, const void *right) {
+    UCMTable *table=(UCMTable *)context;
+    const UCMapping *lhs=(const UCMapping *)left;
+    const UCMapping *rhs=(const UCMapping *)right;
+
+    return compareMappings(table, lhs, table, rhs, true);
+}
+
+/*
+ * uprv_sortArray() comparator for sorting the reverseMap:
+ * context is the UCMTable, left and right point at int32_t indexes into
+ * table->mappings, so the mappings are reached through one indirection.
+ */
+static int32_t U_CALLCONV
+compareMappingsBytesFirst(const void *context, const void *left, const void *right) {
+    UCMTable *table=(UCMTable *)context;
+    const int32_t leftIndex=*(const int32_t *)left;
+    const int32_t rightIndex=*(const int32_t *)right;
+    const UCMapping *lhs=table->mappings+leftIndex;
+    const UCMapping *rhs=table->mappings+rightIndex;
+
+    return compareMappings(table, lhs, table, rhs, false);
+}
+U_CDECL_END
+
+/*
+ * Sorts the table unless it is already marked as sorted:
+ * 1. t->mappings is sorted in place, Unicode first.
+ * 2. t->reverseMap is (allocated if necessary and) filled with the indexes
+ *    0..mappingsLength-1 and then sorted by the mappings' bytes first.
+ * Exits the process if the reverseMap cannot be allocated or if sorting fails.
+ */
+U_CAPI void U_EXPORT2
+ucm_sortTable(UCMTable *t) {
+    if(t->isSorted) {
+        return;
+    }
+
+    UErrorCode status=U_ZERO_ERROR;
+
+    /* 1. sort by Unicode first */
+    uprv_sortArray(t->mappings, t->mappingsLength, sizeof(UCMapping),
+                   compareMappingsUnicodeFirst, t,
+                   false, &status);
+
+    /* build the reverseMap */
+    if(t->reverseMap==nullptr) {
+        /*
+         * Sized by mappingsCapacity rather than mappingsLength, so that the
+         * reverseMap need not be reallocated each time mappings are added
+         * (see ucm_moveMappings() and ucm_addMapping()).
+         */
+        t->reverseMap=(int32_t *)uprv_malloc(t->mappingsCapacity*sizeof(int32_t));
+        if(t->reverseMap==nullptr) {
+            fprintf(stderr, "ucm error: unable to allocate reverseMap\n");
+            exit(U_MEMORY_ALLOCATION_ERROR);
+        }
+    }
+    int32_t *const indexes=t->reverseMap;
+    for(int32_t k=0; k<t->mappingsLength; ++k) {
+        indexes[k]=k;
+    }
+
+    /* 2. sort reverseMap by mappings bytes first */
+    uprv_sortArray(indexes, t->mappingsLength, sizeof(int32_t),
+                   compareMappingsBytesFirst, t,
+                   false, &status);
+
+    /* a single check after both sorts */
+    if(U_FAILURE(status)) {
+        fprintf(stderr, "ucm error: sortTable()/uprv_sortArray() fails - %s\n",
+                u_errorName(status));
+        exit(status);
+    }
+
+    t->isSorted=true;
+}
+
+/*
+ * Removes all mappings with a non-zero move flag from the base table.
+ * Those flagged with UCM_MOVE_TO_EXT are first added to the extension table,
+ * if ext is not nullptr. A removed mapping is overwritten with the last base
+ * mapping, so the base table loses its sort order and is marked unsorted.
+ */
+U_CAPI void U_EXPORT2
+ucm_moveMappings(UCMTable *base, UCMTable *ext) {
+    UCMapping *cur=base->mappings;
+    UCMapping *limit=cur+base->mappingsLength;
+
+    while(cur<limit) {
+        const int8_t flag=cur->moveFlag;
+        if(flag==0) {
+            /* this mapping stays */
+            ++cur;
+            continue;
+        }
+
+        /* reset the move flag */
+        cur->moveFlag=0;
+
+        if(ext!=nullptr && (flag&UCM_MOVE_TO_EXT)) {
+            /* add the mapping to the extension table */
+            ucm_addMapping(ext, cur, UCM_GET_CODE_POINTS(base, cur), UCM_GET_BYTES(base, cur));
+        }
+
+        /*
+         * Remove this mapping: move the last base mapping down over it.
+         * cur is not advanced, so the moved mapping is examined next.
+         */
+        UCMapping *const last=limit-1;
+        if(cur<last) {
+            uprv_memcpy(cur, last, sizeof(UCMapping));
+        }
+        limit=last;
+        --base->mappingsLength;
+        base->isSorted=false;
+    }
+}
+
+enum {
+ NEEDS_MOVE=1, /* result bit: at least one mapping was flagged for moving or removal */
+ HAS_ERRORS=2 /* result bit: at least one conflict was reported on stderr */
+};
+
+/*
+ * Walks the base and the extension mappings in parallel, in compareUnicode()
+ * order, and flags conflicts between their Unicode input sequences.
+ * The caller visible in this file, ucm_checkBaseExt(), sorts both tables first.
+ * Only mappings with a precision flag f of 0, 1, 2 or 4 take part.
+ *
+ * - Base sequence sorts before the extension sequence:
+ *   with intersectBase set (and, if intersectBase==2, only for base mappings
+ *   with more than one byte) the base mapping is flagged UCM_MOVE_TO_EXT;
+ *   otherwise, if the base sequence is a proper prefix of the extension
+ *   sequence, the base mapping is flagged UCM_MOVE_TO_EXT when moveToExt is
+ *   set, or else an error is printed.
+ * - Equal sequences: an extension mapping with the same flag and bytes is
+ *   flagged UCM_REMOVE_MAPPING; otherwise the base mapping is flagged
+ *   UCM_MOVE_TO_EXT when intersectBase is set, or else an error is printed.
+ *
+ * The walk ends as soon as either table is exhausted.
+ * Returns a bit set of NEEDS_MOVE and HAS_ERRORS. baseStates is not used.
+ * NOTE(review): intersectBase is declared UBool but is compared against 2;
+ * presumably callers pass 2 for a DBCS extension -- confirm.
+ */
+static uint8_t
+checkBaseExtUnicode(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
+ UBool moveToExt, UBool intersectBase) {
+ (void)baseStates;
+
+ UCMapping *mb, *me, *mbLimit, *meLimit;
+ int32_t cmp;
+ uint8_t result;
+
+ mb=base->mappings;
+ mbLimit=mb+base->mappingsLength;
+
+ me=ext->mappings;
+ meLimit=me+ext->mappingsLength;
+
+ result=0;
+
+ for(;;) {
+ /* skip irrelevant mappings on both sides */
+ for(;;) {
+ if(mb==mbLimit) {
+ return result;
+ }
+
+ if((0<=mb->f && mb->f<=2) || mb->f==4) {
+ break;
+ }
+
+ ++mb;
+ }
+
+ for(;;) {
+ if(me==meLimit) {
+ return result;
+ }
+
+ if((0<=me->f && me->f<=2) || me->f==4) {
+ break;
+ }
+
+ ++me;
+ }
+
+ /* compare the base and extension mappings */
+ cmp=compareUnicode(base, mb, ext, me);
+ if(cmp<0) {
+ if(intersectBase && (intersectBase!=2 || mb->bLen>1)) {
+ /*
+ * mapping in base but not in ext, move it
+ *
+ * if ext is DBCS, move DBCS mappings here
+ * and check SBCS ones for Unicode prefix below
+ */
+ mb->moveFlag|=UCM_MOVE_TO_EXT;
+ result|=NEEDS_MOVE;
+
+ /* does mb map from an input sequence that is a prefix of me's? */
+ } else if( mb->uLen<me->uLen &&
+ 0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen) /* 4 bytes per code point */
+ ) {
+ if(moveToExt) {
+ /* mark this mapping to be moved to the extension table */
+ mb->moveFlag|=UCM_MOVE_TO_EXT;
+ result|=NEEDS_MOVE;
+ } else {
+ fprintf(stderr,
+ "ucm error: the base table contains a mapping whose input sequence\n"
+ " is a prefix of the input sequence of an extension mapping\n");
+ ucm_printMapping(base, mb, stderr);
+ ucm_printMapping(ext, me, stderr);
+ result|=HAS_ERRORS;
+ }
+ }
+
+ ++mb;
+ } else if(cmp==0) {
+ /*
+ * same output: remove the extension mapping,
+ * otherwise treat as an error
+ */
+ if( mb->f==me->f && mb->bLen==me->bLen &&
+ 0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen)
+ ) {
+ me->moveFlag|=UCM_REMOVE_MAPPING;
+ result|=NEEDS_MOVE;
+ } else if(intersectBase) {
+ /* mapping in base but not in ext, move it */
+ mb->moveFlag|=UCM_MOVE_TO_EXT;
+ result|=NEEDS_MOVE;
+ } else {
+ fprintf(stderr,
+ "ucm error: the base table contains a mapping whose input sequence\n"
+ " is the same as the input sequence of an extension mapping\n"
+ " but it maps differently\n");
+ ucm_printMapping(base, mb, stderr);
+ ucm_printMapping(ext, me, stderr);
+ result|=HAS_ERRORS;
+ }
+
+ ++mb; /* only the base side advances; me is compared again with the next base mapping */
+ } else /* cmp>0 */ {
+ ++me;
+ }
+ }
+}
+
+/*
+ * Walks the base and the extension mappings in parallel through their
+ * reverseMap index arrays, in lexical compareBytes() order, and flags
+ * conflicts between their byte input sequences.
+ * The caller visible in this file, ucm_checkBaseExt(), sorts both tables first.
+ * Only mappings with a precision flag f of 0 or 3 take part; with
+ * intersectBase==2, single-byte base mappings are skipped as well.
+ *
+ * - Base bytes sort before the extension bytes:
+ *   with intersectBase set the base mapping is flagged UCM_MOVE_TO_EXT;
+ *   otherwise, if the base bytes are a proper prefix of the extension bytes
+ *   (never counted for a single byte when baseStates->outputType is
+ *   MBCS_OUTPUT_2_SISO), the base mapping is flagged UCM_MOVE_TO_EXT when
+ *   moveToExt is set, or else an error is printed.
+ * - Equal bytes: an extension mapping with the same flag and code points is
+ *   flagged UCM_REMOVE_MAPPING; otherwise the base mapping is flagged
+ *   UCM_MOVE_TO_EXT when intersectBase is set, or else an error is printed.
+ *
+ * The walk ends as soon as either table is exhausted.
+ * Returns a bit set of NEEDS_MOVE and HAS_ERRORS.
+ */
+static uint8_t
+checkBaseExtBytes(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
+ UBool moveToExt, UBool intersectBase) {
+ UCMapping *mb, *me;
+ int32_t *baseMap, *extMap;
+ int32_t b, e, bLimit, eLimit, cmp;
+ uint8_t result;
+ UBool isSISO;
+
+ baseMap=base->reverseMap;
+ extMap=ext->reverseMap;
+
+ b=e=0;
+ bLimit=base->mappingsLength;
+ eLimit=ext->mappingsLength;
+
+ result=0;
+
+ isSISO=(UBool)(baseStates->outputType==MBCS_OUTPUT_2_SISO);
+
+ for(;;) {
+ /* skip irrelevant mappings on both sides */
+ for(;; ++b) {
+ if(b==bLimit) {
+ return result;
+ }
+ mb=base->mappings+baseMap[b];
+
+ if(intersectBase==2 && mb->bLen==1) {
+ /*
+ * comparing a base against a DBCS extension:
+ * leave SBCS base mappings alone
+ */
+ continue;
+ }
+
+ if(mb->f==0 || mb->f==3) {
+ break;
+ }
+ }
+
+ for(;;) {
+ if(e==eLimit) {
+ return result;
+ }
+ me=ext->mappings+extMap[e];
+
+ if(me->f==0 || me->f==3) {
+ break;
+ }
+
+ ++e;
+ }
+
+ /* compare the base and extension mappings */
+ cmp=compareBytes(base, mb, ext, me, true);
+ if(cmp<0) {
+ if(intersectBase) {
+ /* mapping in base but not in ext, move it */
+ mb->moveFlag|=UCM_MOVE_TO_EXT;
+ result|=NEEDS_MOVE;
+
+ /*
+ * does mb map from an input sequence that is a prefix of me's?
+ * for SI/SO tables, a single byte is never a prefix because it
+ * occurs in a separate single-byte state
+ */
+ } else if( mb->bLen<me->bLen &&
+ (!isSISO || mb->bLen>1) &&
+ 0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen)
+ ) {
+ if(moveToExt) {
+ /* mark this mapping to be moved to the extension table */
+ mb->moveFlag|=UCM_MOVE_TO_EXT;
+ result|=NEEDS_MOVE;
+ } else {
+ fprintf(stderr,
+ "ucm error: the base table contains a mapping whose input sequence\n"
+ " is a prefix of the input sequence of an extension mapping\n");
+ ucm_printMapping(base, mb, stderr);
+ ucm_printMapping(ext, me, stderr);
+ result|=HAS_ERRORS;
+ }
+ }
+
+ ++b;
+ } else if(cmp==0) {
+ /*
+ * same output: remove the extension mapping,
+ * otherwise treat as an error
+ */
+ if( mb->f==me->f && mb->uLen==me->uLen &&
+ 0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen) /* 4 bytes per code point */
+ ) {
+ me->moveFlag|=UCM_REMOVE_MAPPING;
+ result|=NEEDS_MOVE;
+ } else if(intersectBase) {
+ /* mapping in base but not in ext, move it */
+ mb->moveFlag|=UCM_MOVE_TO_EXT;
+ result|=NEEDS_MOVE;
+ } else {
+ fprintf(stderr,
+ "ucm error: the base table contains a mapping whose input sequence\n"
+ " is the same as the input sequence of an extension mapping\n"
+ " but it maps differently\n");
+ ucm_printMapping(base, mb, stderr);
+ ucm_printMapping(ext, me, stderr);
+ result|=HAS_ERRORS;
+ }
+
+ ++b; /* only the base side advances; me is compared again with the next base mapping */
+ } else /* cmp>0 */ {
+ ++e;
+ }
+ }
+}
+
+/*
+ * Checks the byte sequence of every mapping in the table with
+ * ucm_countChars() against baseStates. Each mapping for which the count is
+ * less than 1 is printed to stderr.
+ * Returns true if no such mapping was found, false otherwise.
+ */
+U_CAPI UBool U_EXPORT2
+ucm_checkValidity(UCMTable *table, UCMStates *baseStates) {
+    UCMapping *const limit=table->mappings+table->mappingsLength;
+    UBool allValid=true;
+
+    for(UCMapping *cur=table->mappings; cur<limit; ++cur) {
+        const int32_t chars=ucm_countChars(baseStates, UCM_GET_BYTES(table, cur), cur->bLen);
+        if(chars<1) {
+            /* keep going so that every offending mapping is reported */
+            ucm_printMapping(table, cur, stderr);
+            allValid=false;
+        }
+    }
+
+    return allValid;
+}
+
+/*
+ * Checks a base table against an extension table and resolves what can be
+ * resolved by moving mappings.
+ *
+ * Both tables must use explicit precision flags; otherwise an error is
+ * printed and false is returned. Both tables are sorted, then compared by
+ * Unicode and by bytes. If either comparison reports errors, false is
+ * returned. If mappings were flagged, the flagged extension mappings are
+ * removed, the flagged base mappings are removed from base (and added to
+ * moveTarget if it is not nullptr), and all tables involved are sorted again.
+ */
+U_CAPI UBool U_EXPORT2
+ucm_checkBaseExt(UCMStates *baseStates,
+                 UCMTable *base, UCMTable *ext, UCMTable *moveTarget,
+                 UBool intersectBase) {
+    /* if we have an extension table, we must always use precision flags */
+    if((base->flagsType&UCM_FLAGS_IMPLICIT)!=0) {
+        fprintf(stderr, "ucm error: the base table contains mappings without precision flags\n");
+        return false;
+    }
+    if((ext->flagsType&UCM_FLAGS_IMPLICIT)!=0) {
+        fprintf(stderr, "ucm error: extension table contains mappings without precision flags\n");
+        return false;
+    }
+
+    /* checking requires both tables to be sorted */
+    ucm_sortTable(base);
+    ucm_sortTable(ext);
+
+    /* mappings may only be moved if there is somewhere to move them to */
+    const UBool moveToExt=(UBool)(moveTarget!=nullptr);
+
+    /* run both checks and merge their result bits */
+    const uint8_t outcome=
+        checkBaseExtUnicode(baseStates, base, ext, moveToExt, intersectBase)|
+        checkBaseExtBytes(baseStates, base, ext, moveToExt, intersectBase);
+
+    if((outcome&HAS_ERRORS)!=0) {
+        return false;
+    }
+    if((outcome&NEEDS_MOVE)==0) {
+        return true;
+    }
+
+    ucm_moveMappings(ext, nullptr);
+    ucm_moveMappings(base, moveTarget);
+    ucm_sortTable(base);
+    ucm_sortTable(ext);
+    if(moveTarget!=nullptr) {
+        ucm_sortTable(moveTarget);
+    }
+
+    return true;
+}
+
+/* merge tables for rptp2ucm ------------------------------------------------ */
+
+/*
+ * Merges the mappings of toUTable into fromUTable and sets precision flags.
+ * Both tables are sorted first and then walked in parallel in
+ * compareMappings(..., true) order:
+ * - a mapping present in both tables is left alone (roundtrip);
+ * - a mapping only in fromUTable gets f=2 if its bytes equal subchar
+ *   (subcharLength bytes) or if it is the single byte subchar1 (when
+ *   subchar1!=0), and f=1 otherwise;
+ * - a mapping only in toUTable gets f=3 (set on the toUTable mapping itself)
+ *   and is added to fromUTable, unless it maps to the single code point
+ *   U+FFFD or U+001A, in which case it is ignored.
+ * Only the mappings that fromUTable had on entry are walked; the added ones
+ * are appended behind them. fromUTable is marked unsorted at the end.
+ */
+U_CAPI void U_EXPORT2
+ucm_mergeTables(UCMTable *fromUTable, UCMTable *toUTable,
+ const uint8_t *subchar, int32_t subcharLength,
+ uint8_t subchar1) {
+ UCMapping *fromUMapping, *toUMapping;
+ int32_t fromUIndex, toUIndex, fromUTop, toUTop, cmp;
+
+ ucm_sortTable(fromUTable);
+ ucm_sortTable(toUTable);
+
+ fromUMapping=fromUTable->mappings;
+ toUMapping=toUTable->mappings;
+
+ fromUTop=fromUTable->mappingsLength; /* fixed on entry: appended mappings are not walked */
+ toUTop=toUTable->mappingsLength;
+
+ fromUIndex=toUIndex=0;
+
+ while(fromUIndex<fromUTop && toUIndex<toUTop) {
+ cmp=compareMappings(fromUTable, fromUMapping, toUTable, toUMapping, true);
+ if(cmp==0) {
+ /* equal: roundtrip, nothing to do (flags are initially 0) */
+ ++fromUMapping;
+ ++toUMapping;
+
+ ++fromUIndex;
+ ++toUIndex;
+ } else if(cmp<0) {
+ /*
+ * the fromU mapping does not have a toU counterpart:
+ * fallback Unicode->codepage
+ */
+ if( (fromUMapping->bLen==subcharLength &&
+ 0==uprv_memcmp(UCM_GET_BYTES(fromUTable, fromUMapping), subchar, subcharLength)) ||
+ (subchar1!=0 && fromUMapping->bLen==1 && fromUMapping->b.bytes[0]==subchar1)
+ ) {
+ fromUMapping->f=2; /* SUB mapping */
+ } else {
+ fromUMapping->f=1; /* normal fallback */
+ }
+
+ ++fromUMapping;
+ ++fromUIndex;
+ } else {
+ /*
+ * the toU mapping does not have a fromU counterpart:
+ * (reverse) fallback codepage->Unicode, copy it to the fromU table
+ */
+
+ /* ignore reverse fallbacks to Unicode SUB */
+ if(!(toUMapping->uLen==1 && (toUMapping->u==0xfffd || toUMapping->u==0x1a))) {
+ toUMapping->f=3; /* reverse fallback */
+ ucm_addMapping(fromUTable, toUMapping, UCM_GET_CODE_POINTS(toUTable, toUMapping), UCM_GET_BYTES(toUTable, toUMapping));
+
+ /* the table may have been reallocated */
+ fromUMapping=fromUTable->mappings+fromUIndex;
+ }
+
+ ++toUMapping;
+ ++toUIndex;
+ }
+ }
+
+ /* either one or both tables are exhausted */
+ while(fromUIndex<fromUTop) {
+ /* leftover fromU mappings are fallbacks */
+ if( (fromUMapping->bLen==subcharLength &&
+ 0==uprv_memcmp(UCM_GET_BYTES(fromUTable, fromUMapping), subchar, subcharLength)) ||
+ (subchar1!=0 && fromUMapping->bLen==1 && fromUMapping->b.bytes[0]==subchar1)
+ ) {
+ fromUMapping->f=2; /* SUB mapping */
+ } else {
+ fromUMapping->f=1; /* normal fallback */
+ }
+
+ ++fromUMapping;
+ ++fromUIndex;
+ }
+
+ while(toUIndex<toUTop) {
+ /* leftover toU mappings are reverse fallbacks */
+
+ /* ignore reverse fallbacks to Unicode SUB */
+ if(!(toUMapping->uLen==1 && (toUMapping->u==0xfffd || toUMapping->u==0x1a))) {
+ toUMapping->f=3; /* reverse fallback */
+ ucm_addMapping(fromUTable, toUMapping, UCM_GET_CODE_POINTS(toUTable, toUMapping), UCM_GET_BYTES(toUTable, toUMapping));
+ /* fromUMapping is not used any more here, so it need not be refreshed */
+ }
+
+ ++toUMapping;
+ ++toUIndex;
+ }
+
+ fromUTable->isSorted=false;
+}
+
+/* separate extension mappings out of base table for rptp2ucm --------------- */
+
+/*
+ * Goes through all mappings of ucm->base and decides which ones stay there:
+ * - for isSISO, single-byte mappings for 0x0e and 0x0f are reported as
+ *   illegal and flagged for removal;
+ * - a mapping for which ucm_mappingType() returns a negative value is
+ *   printed to stderr and makes the function fail;
+ * - a mapping with a positive type is flagged for moving to the extension table.
+ * Returns false if an illegal byte sequence was found. Otherwise, if anything
+ * was flagged, moves the mappings and returns the result of
+ * ucm_checkBaseExt(); if nothing was flagged, sorts the base table and
+ * returns true.
+ */
+U_CAPI UBool U_EXPORT2
+ucm_separateMappings(UCMFile *ucm, UBool isSISO) {
+    UCMTable *const base=ucm->base;
+    UCMapping *const limit=base->mappings+base->mappingsLength;
+    UBool mustMove=false;
+    UBool allLegal=true;
+
+    for(UCMapping *cur=base->mappings; cur<limit; ++cur) {
+        if(isSISO && cur->bLen==1 && (cur->b.bytes[0]==0xe || cur->b.bytes[0]==0xf)) {
+            fprintf(stderr, "warning: removing illegal mapping from an SI/SO-stateful table\n");
+            ucm_printMapping(base, cur, stderr);
+            cur->moveFlag|=UCM_REMOVE_MAPPING;
+            mustMove=true;
+            continue;
+        }
+
+        const int32_t mappingType=ucm_mappingType(
+            &ucm->states, cur,
+            UCM_GET_CODE_POINTS(base, cur), UCM_GET_BYTES(base, cur));
+        if(mappingType<0) {
+            /* illegal byte sequence */
+            printMapping(cur, UCM_GET_CODE_POINTS(base, cur), UCM_GET_BYTES(base, cur), stderr);
+            allLegal=false;
+        } else if(mappingType>0) {
+            cur->moveFlag|=UCM_MOVE_TO_EXT;
+            mustMove=true;
+        }
+    }
+
+    if(!allLegal) {
+        return false;
+    }
+    if(!mustMove) {
+        ucm_sortTable(ucm->base);
+        return true;
+    }
+
+    ucm_moveMappings(ucm->base, ucm->ext);
+    return ucm_checkBaseExt(&ucm->states, ucm->base, ucm->ext, ucm->ext, false);
+}
+
+/* ucm parser --------------------------------------------------------------- */
+
+/* Returns whether c is one of the hexadecimal digits 0-9, a-f or A-F. */
+static UBool
+ucm_isHexDigitChar(char c) {
+    return (UBool)(('0'<=c && c<='9') || ('a'<=c && c<='f') || ('A'<=c && c<='F'));
+}
+
+/*
+ * Parses a sequence of bytes written as \xXX (exactly 2 hex digits each),
+ * optionally separated by '+', starting at *ps.
+ *
+ * @param bytes receives the parsed bytes, at most UCNV_EXT_MAX_BYTES
+ * @param line  the whole input line, used only for error messages
+ * @param ps    in: start of the byte sequence; out (only on success): the
+ *              first character after the last parsed byte
+ * @return the number of bytes parsed (0 if *ps does not start with a
+ *         backslash), or -1 after printing an error to stderr
+ */
+U_CAPI int8_t U_EXPORT2
+ucm_parseBytes(uint8_t bytes[UCNV_EXT_MAX_BYTES], const char *line, const char **ps) {
+    const char *s=*ps;
+    char *end;
+    uint8_t byte;
+    int8_t bLen;
+
+    bLen=0;
+    for(;;) {
+        /* skip an optional plus sign */
+        if(bLen>0 && *s=='+') {
+            ++s;
+        }
+        if(*s!='\\') {
+            break;
+        }
+
+        /*
+         * Check both digit positions explicitly before converting:
+         * the conversion function on its own would skip leading white space
+         * and accept a sign, so that input like "\x 1" or "\x-1" would pass
+         * the end==s+4 test. The checks short-circuit, so s[3] is only read
+         * if s[2] is a hex digit (and therefore not the terminating NUL).
+         */
+        if( s[1]!='x' ||
+            !ucm_isHexDigitChar(s[2]) || !ucm_isHexDigitChar(s[3]) ||
+            (byte=(uint8_t)uprv_strtoul(s+2, &end, 16), end)!=s+4
+        ) {
+            fprintf(stderr, "ucm error: byte must be formatted as \\xXX (2 hex digits) - \"%s\"\n", line);
+            return -1;
+        }
+
+        if(bLen==UCNV_EXT_MAX_BYTES) {
+            fprintf(stderr, "ucm error: too many bytes on \"%s\"\n", line);
+            return -1;
+        }
+        bytes[bLen++]=byte;
+        s=end;
+    }
+
+    *ps=s;
+    return bLen;
+}
+
+/*
+ * Parses one mapping line; the line must not be empty. The line consists of
+ * one or more code points written as <UXXXX> (optionally separated by '+'),
+ * white space, one or more bytes written as \xXX (see ucm_parseBytes()), and
+ * an optional precision flag written as |0..|4 anywhere after the bytes.
+ *
+ * @param m          receives uLen, bLen and f (f is -1 if there is no flag);
+ *                   m->u is set only for a single code point, and m->b.bytes
+ *                   only if there are at most 4 bytes
+ * @param codePoints receives the parsed code points
+ * @param bytes      receives the parsed bytes
+ * @param line       the NUL-terminated line to parse
+ * @return true on success; false after printing an error to stderr
+ */
+U_CAPI UBool U_EXPORT2
+ucm_parseMappingLine(UCMapping *m,
+                     UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
+                     uint8_t bytes[UCNV_EXT_MAX_BYTES],
+                     const char *line) {
+    const char *s;
+    char *end;
+    unsigned long value;
+    UChar32 cp;
+    int32_t u16Length;
+    int8_t uLen, bLen, f;
+
+    s=line;
+    uLen=bLen=0;
+
+    /* parse code points */
+    for(;;) {
+        /* skip an optional plus sign */
+        if(uLen>0 && *s=='+') {
+            ++s;
+        }
+        if(*s!='<') {
+            break;
+        }
+
+        if( s[1]!='U' ||
+            (value=uprv_strtoul(s+2, &end, 16), end)==s+2 ||
+            *end!='>'
+        ) {
+            fprintf(stderr, "ucm error: Unicode code point must be formatted as <UXXXX> (1..6 hex digits) - \"%s\"\n", line);
+            return false;
+        }
+        /*
+         * Range-check the full parsed value before narrowing it to UChar32:
+         * narrowing first would drop the high bits of an over-long number
+         * and could turn it into a valid-looking code point.
+         * The surrogate test only runs for values that fit into a UChar32.
+         */
+        if(value>0x10ffff || U_IS_SURROGATE((UChar32)value)) {
+            fprintf(stderr, "ucm error: Unicode code point must be 0..d7ff or e000..10ffff - \"%s\"\n", line);
+            return false;
+        }
+        cp=(UChar32)value;
+
+        if(uLen==UCNV_EXT_MAX_UCHARS) {
+            fprintf(stderr, "ucm error: too many code points on \"%s\"\n", line);
+            return false;
+        }
+        codePoints[uLen++]=cp;
+        s=end+1;
+    }
+
+    if(uLen==0) {
+        fprintf(stderr, "ucm error: no Unicode code points on \"%s\"\n", line);
+        return false;
+    } else if(uLen==1) {
+        m->u=codePoints[0];
+    } else {
+        /* preflight only: the UTF-16 length must not exceed the limit either */
+        UErrorCode errorCode=U_ZERO_ERROR;
+        u_strFromUTF32(nullptr, 0, &u16Length, codePoints, uLen, &errorCode);
+        if( (U_FAILURE(errorCode) && errorCode!=U_BUFFER_OVERFLOW_ERROR) ||
+            u16Length>UCNV_EXT_MAX_UCHARS
+        ) {
+            fprintf(stderr, "ucm error: too many UChars on \"%s\"\n", line);
+            return false;
+        }
+    }
+
+    s=u_skipWhitespace(s);
+
+    /* parse bytes */
+    bLen=ucm_parseBytes(bytes, line, &s);
+
+    if(bLen<0) {
+        /* ucm_parseBytes() has already printed the error */
+        return false;
+    } else if(bLen==0) {
+        fprintf(stderr, "ucm error: no bytes on \"%s\"\n", line);
+        return false;
+    } else if(bLen<=4) {
+        /* short byte sequences are stored in the mapping itself */
+        uprv_memcpy(m->b.bytes, bytes, bLen);
+    }
+
+    /* skip everything until the fallback indicator, even the start of a comment */
+    for(;;) {
+        if(*s==0) {
+            f=-1; /* no fallback indicator */
+            break;
+        } else if(*s=='|') {
+            f=(int8_t)(s[1]-'0');
+            /* the unsigned comparison also rejects characters below '0' */
+            if((uint8_t)f>4) {
+                fprintf(stderr, "ucm error: fallback indicator must be |0..|4 - \"%s\"\n", line);
+                return false;
+            }
+            break;
+        }
+        ++s;
+    }
+
+    m->uLen=uLen;
+    m->bLen=bLen;
+    m->f=f;
+    return true;
+}
+
+/* general APIs ------------------------------------------------------------- */
+
+/*
+ * Allocates a new UCMTable with all of its bytes set to zero.
+ * Exits the process if the allocation fails, so the result is never nullptr.
+ */
+U_CAPI UCMTable * U_EXPORT2
+ucm_openTable() {
+    UCMTable *const fresh=(UCMTable *)uprv_malloc(sizeof(UCMTable));
+
+    if(fresh==nullptr) {
+        fprintf(stderr, "ucm error: unable to allocate a UCMTable\n");
+        exit(U_MEMORY_ALLOCATION_ERROR);
+    }
+
+    /* memset() returns its first argument, i.e. the zeroed table */
+    return (UCMTable *)memset(fresh, 0, sizeof(*fresh));
+}
+
+/*
+ * Frees the table together with its mappings, codePoints, bytes and
+ * reverseMap arrays. Does nothing if table is nullptr.
+ */
+U_CAPI void U_EXPORT2
+ucm_closeTable(UCMTable *table) {
+    if(table==nullptr) {
+        return;
+    }
+
+    uprv_free(table->mappings);
+    uprv_free(table->codePoints);
+    uprv_free(table->bytes);
+    uprv_free(table->reverseMap);
+    uprv_free(table);
+}
+
+/*
+ * Empties the table without freeing anything: the lengths, flagsType and
+ * unicodeMask are set to 0 and the table is marked unsorted. The allocated
+ * arrays and their capacities are left untouched.
+ * Does nothing if table is nullptr.
+ */
+U_CAPI void U_EXPORT2
+ucm_resetTable(UCMTable *table) {
+    if(table==nullptr) {
+        return;
+    }
+
+    table->mappingsLength=0;
+    table->flagsType=0;
+    table->unicodeMask=0;
+    table->codePointsLength=0;
+    table->bytesLength=0;
+    table->isSorted=false;
+}
+
+/**
+ * Append a copy of the mapping *m to the table.
+ *
+ * The mappings array starts with room for 1000 entries and grows tenfold
+ * whenever it is full. Code points of multi-code point mappings (uLen>1)
+ * and the bytes of long byte sequences (bLen>4) are copied into the table's
+ * side arrays, and m->u or m->b.idx is overwritten with the start index
+ * there, so *m is modified by this call.
+ * For uLen==1 and bLen<=4 the values already stored in m->u and m->b.bytes
+ * are copied unchanged; this function does not fill them in.
+ *
+ * The side arrays are allocated once with room for 10000 units and are
+ * never enlarged. Overflowing them, or any failed allocation, prints an
+ * error and exits the process.
+ *
+ * The table's unicodeMask and flagsType are updated and the table is
+ * marked as unsorted.
+ */
+U_CAPI void U_EXPORT2
+ucm_addMapping(UCMTable *table,
+ UCMapping *m,
+ UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
+ uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
+ UCMapping *tm;
+ UChar32 c;
+ int32_t idx;
+
+ if(table->mappingsLength>=table->mappingsCapacity) {
+ /* make the mappings array larger */
+ if(table->mappingsCapacity==0) {
+ table->mappingsCapacity=1000;
+ } else {
+ table->mappingsCapacity*=10;
+ }
+ table->mappings=(UCMapping *)uprv_realloc(table->mappings,
+ table->mappingsCapacity*sizeof(UCMapping));
+ if(table->mappings==nullptr) {
+ fprintf(stderr, "ucm error: unable to allocate %d UCMappings\n",
+ (int)table->mappingsCapacity);
+ exit(U_MEMORY_ALLOCATION_ERROR);
+ }
+
+ if(table->reverseMap!=nullptr) {
+ /* the reverseMap must be reallocated in a new sort */
+ uprv_free(table->reverseMap);
+ table->reverseMap=nullptr;
+ }
+ }
+
+ /* allocate the code points side array when the first multi-code point mapping arrives */
+ if(m->uLen>1 && table->codePointsCapacity==0) {
+ table->codePointsCapacity=10000;
+ table->codePoints=(UChar32 *)uprv_malloc(table->codePointsCapacity*4);
+ if(table->codePoints==nullptr) {
+ fprintf(stderr, "ucm error: unable to allocate %d UChar32s\n",
+ (int)table->codePointsCapacity);
+ exit(U_MEMORY_ALLOCATION_ERROR);
+ }
+ }
+
+ /* allocate the bytes side array when the first mapping with more than 4 bytes arrives */
+ if(m->bLen>4 && table->bytesCapacity==0) {
+ table->bytesCapacity=10000;
+ table->bytes=(uint8_t *)uprv_malloc(table->bytesCapacity);
+ if(table->bytes==nullptr) {
+ fprintf(stderr, "ucm error: unable to allocate %d bytes\n",
+ (int)table->bytesCapacity);
+ exit(U_MEMORY_ALLOCATION_ERROR);
+ }
+ }
+
+ /* store multiple code points in the side array and remember their start index in m->u */
+ if(m->uLen>1) {
+ idx=table->codePointsLength;
+ table->codePointsLength+=m->uLen;
+ if(table->codePointsLength>table->codePointsCapacity) {
+ fprintf(stderr, "ucm error: too many code points in multiple-code point mappings\n");
+ exit(U_MEMORY_ALLOCATION_ERROR);
+ }
+
+ uprv_memcpy(table->codePoints+idx, codePoints, (size_t)m->uLen*4);
+ m->u=idx;
+ }
+
+ /* store more than 4 bytes in the side array and remember their start index in m->b.idx */
+ if(m->bLen>4) {
+ idx=table->bytesLength;
+ table->bytesLength+=m->bLen;
+ if(table->bytesLength>table->bytesCapacity) {
+ fprintf(stderr, "ucm error: too many bytes in mappings with >4 charset bytes\n");
+ exit(U_MEMORY_ALLOCATION_ERROR);
+ }
+
+ uprv_memcpy(table->bytes+idx, bytes, m->bLen);
+ m->b.idx=idx;
+ }
+
+ /* set unicodeMask */
+ for(idx=0; idx<m->uLen; ++idx) {
+ c=codePoints[idx];
+ if(c>=0x10000) {
+ table->unicodeMask|=UCNV_HAS_SUPPLEMENTARY; /* there are supplementary code points */
+ } else if(U_IS_SURROGATE(c)) {
+ table->unicodeMask|=UCNV_HAS_SURROGATES; /* there are surrogate code points */
+ }
+ }
+
+ /* set flagsType */
+ if(m->f<0) {
+ table->flagsType|=UCM_FLAGS_IMPLICIT;
+ } else {
+ table->flagsType|=UCM_FLAGS_EXPLICIT;
+ }
+
+ /* copy the (possibly updated) mapping into the next free slot */
+ tm=table->mappings+table->mappingsLength++;
+ uprv_memcpy(tm, m, sizeof(UCMapping));
+
+ table->isSorted=false;
+}
+
+/**
+ * Allocate a UCMFile with empty base and extension tables and with the
+ * state information set to its defaults: state 0 is a "direct" state,
+ * no conversion type or output type yet, and min/max character lengths of 1.
+ * A failed allocation is reported on stderr and ends the process.
+ */
+U_CAPI UCMFile * U_EXPORT2
+ucm_open() {
+    UCMFile *file=static_cast<UCMFile *>(uprv_malloc(sizeof *file));
+    if(!file) {
+        fprintf(stderr, "ucm error: unable to allocate a UCMFile\n");
+        exit(U_MEMORY_ALLOCATION_ERROR);
+    }
+    memset(file, 0, sizeof *file);
+
+    file->base=ucm_openTable();
+    file->ext=ucm_openTable();
+
+    /* defaults that differ from the all-zero state */
+    UCMStates *st=&file->states;
+    st->stateFlags[0]=MBCS_STATE_FLAG_DIRECT;
+    st->conversionType=UCNV_UNSUPPORTED_CONVERTER;
+    st->outputType=-1;
+    st->maxCharLength=1;
+    st->minCharLength=1;
+
+    return file;
+}
+
+/** Close the base and extension tables of a UCMFile and free it; null is ignored. */
+U_CAPI void U_EXPORT2
+ucm_close(UCMFile *ucm) {
+    if(ucm==nullptr) {
+        return;
+    }
+
+    ucm_closeTable(ucm->base);
+    ucm_closeTable(ucm->ext);
+    uprv_free(ucm);
+}
+
+/**
+ * Classify a mapping as suitable for the base table or not.
+ *
+ * @param baseStates state table used to validate and count the bytes
+ * @param m          the mapping; its uLen, bLen and f are examined
+ * @param codePoints not used
+ * @param bytes      the charset bytes of the mapping
+ * @return -1 if ucm_countChars() reports fewer than 1 character for the bytes,
+ *         0 if the mapping is suitable for a base table,
+ *         1 if it needs to go into an extension table
+ */
+U_CAPI int32_t U_EXPORT2
+ucm_mappingType(UCMStates *baseStates,
+ UCMapping *m,
+ UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
+ uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
+ (void)codePoints;
+ /* check validity of the bytes and count the characters in them */
+ int32_t count=ucm_countChars(baseStates, bytes, m->bLen);
+ if(count<1) {
+ /* illegal byte sequence */
+ return -1;
+ }
+
+ /*
+ * Suitable for an ICU conversion base table means:
+ * - a 1:1 mapping (1 Unicode code point : 1 byte sequence)
+ * - precision flag 0..3
+ * - SBCS: any 1:1 mapping
+ * (the table stores additional bits to distinguish mapping types)
+ * - MBCS: not a |2 SUB mapping for <subchar1>
+ * - MBCS: not a |1 fallback to 0x00
+ * - MBCS: not a multi-byte mapping with leading 0x00 bytes
+ *
+ * Further restrictions for fromUnicode tables
+ * are enforced in makeconv (MBCSOkForBaseFromUnicode()).
+ *
+ * All of the MBCS fromUnicode specific tests could be removed from here,
+ * but the ones above are for unusual mappings, and removing the tests
+ * from here would change canonucm output which seems gratuitous.
+ * (Markus Scherer 2006-nov-28)
+ *
+ * Exception: All implicit mappings (f<0) that need to be moved
+ * because of fromUnicode restrictions _must_ be moved here because
+ * makeconv uses a hack for moving mappings only for the fromUnicode table
+ * that only works with non-negative values of f.
+ */
+ if( m->uLen==1 && count==1 && m->f<=3 &&
+ (baseStates->maxCharLength==1 ||
+ !((m->f==2 && m->bLen==1) ||
+ (m->f==1 && bytes[0]==0) ||
+ (m->f<=1 && m->bLen>1 && bytes[0]==0)))
+ ) {
+ return 0; /* suitable for a base table */
+ } else {
+ return 1; /* needs to go into an extension table */
+ }
+}
+
+/**
+ * Add a mapping to the base table if that is requested and the mapping is
+ * suitable for it, otherwise to the extension table.
+ *
+ * @return false (after printing the mapping to stderr) for a |2 mapping
+ *         from multiple code points or for an illegal byte sequence;
+ *         true once the mapping has been added
+ */
+U_CAPI UBool U_EXPORT2
+ucm_addMappingAuto(UCMFile *ucm, UBool forBase, UCMStates *baseStates,
+                   UCMapping *m,
+                   UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
+                   uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
+    if(m->f==2 && m->uLen>1) {
+        fprintf(stderr, "ucm error: illegal <subchar1> |2 mapping from multiple code points\n");
+        printMapping(m, codePoints, bytes, stderr);
+        return false;
+    }
+
+    /*
+     * Without base states (not used - adding a mapping for an extension-only
+     * table before its base table is read) the mapping counts as
+     * "needs to go into an extension table".
+     */
+    int32_t mappingType=1;
+    if(baseStates!=nullptr) {
+        /* check validity of the bytes and count the characters in them */
+        mappingType=ucm_mappingType(baseStates, m, codePoints, bytes);
+        if(mappingType<0) {
+            /* illegal byte sequence */
+            printMapping(m, codePoints, bytes, stderr);
+            return false;
+        }
+    }
+
+    /* the base table is used only if it is both requested and suitable */
+    UCMTable *target=(forBase && mappingType==0) ? ucm->base : ucm->ext;
+    ucm_addMapping(target, m, codePoints, bytes);
+
+    return true;
+}
+
+/**
+ * Parse one mapping line and add the mapping to the base or extension table.
+ * Lines that start with '#', and lines with nothing but white space before
+ * the end of the string or a CR/LF, are accepted without adding anything.
+ *
+ * @return false if the line cannot be parsed or the mapping cannot be added
+ */
+U_CAPI UBool U_EXPORT2
+ucm_addMappingFromLine(UCMFile *ucm, const char *line, UBool forBase, UCMStates *baseStates) {
+    /* ignore comment lines */
+    if(line[0]=='#') {
+        return true;
+    }
+
+    /* ignore empty lines */
+    const char *firstChar=u_skipWhitespace(line);
+    switch(*firstChar) {
+    case 0:
+    case '\n':
+    case '\r':
+        return true;
+    default:
+        break;
+    }
+
+    UCMapping m={ 0, {0}, 0, 0, 0, 0 };
+    UChar32 codePoints[UCNV_EXT_MAX_UCHARS];
+    uint8_t bytes[UCNV_EXT_MAX_BYTES];
+
+    if(!ucm_parseMappingLine(&m, codePoints, bytes, line)) {
+        return false;
+    }
+    return ucm_addMappingAuto(ucm, forBase, baseStates, &m, codePoints, bytes);
+}
+
+/**
+ * Read mapping lines from convFile up to and including the "END CHARMAP"
+ * line and add each mapping to the file's tables.
+ * Does nothing if *pErrorCode already indicates a failure.
+ * Sets *pErrorCode to U_INVALID_TABLE_FORMAT if any line is rejected or if
+ * the input ends before "END CHARMAP"; reading continues after a rejected line.
+ */
+U_CAPI void U_EXPORT2
+ucm_readTable(UCMFile *ucm, FileStream* convFile,
+              UBool forBase, UCMStates *baseStates,
+              UErrorCode *pErrorCode) {
+    char line[500];
+
+    if(U_FAILURE(*pErrorCode)) {
+        return;
+    }
+
+    UBool allValid=true;
+    UBool foundEnd=false;
+
+    while(!foundEnd && T_FileStream_readLine(convFile, line, sizeof(line))) {
+        /* remove CR LF */
+        char *tail=uprv_strchr(line, 0);
+        for(; line<tail && (tail[-1]=='\r' || tail[-1]=='\n'); --tail) {
+        }
+        *tail=0;
+
+        if(line[0]==0 || line[0]=='#') {
+            /* empty and comment lines carry no mapping */
+        } else if(uprv_strcmp(line, "END CHARMAP")==0) {
+            /* the end of the mapping table */
+            foundEnd=true;
+        } else if(!ucm_addMappingFromLine(ucm, line, forBase, baseStates)) {
+            allValid=false;
+        }
+    }
+
+    if(!foundEnd) {
+        /* the input ran out before the terminating line */
+        fprintf(stderr, "incomplete charmap section\n");
+        allValid=false;
+    }
+
+    if(!allValid) {
+        *pErrorCode=U_INVALID_TABLE_FORMAT;
+    }
+}
+#endif
diff --git a/intl/icu/source/tools/toolutil/ucm.h b/intl/icu/source/tools/toolutil/ucm.h
new file mode 100644
index 0000000000..8ea90604d4
--- /dev/null
+++ b/intl/icu/source/tools/toolutil/ucm.h
@@ -0,0 +1,302 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/*
+ *******************************************************************************
+ * Copyright (C) 2003-2013, International Business Machines
+ * Corporation and others. All Rights Reserved.
+ *******************************************************************************
+ * file name: ucm.h
+ * encoding: UTF-8
+ * tab size: 8 (not used)
+ * indentation:4
+ *
+ * created on: 2003jun20
+ * created by: Markus W. Scherer
+ *
+ * Definitions for the .ucm file parser and handler module ucm.c.
+ */
+
+#ifndef __UCM_H__
+#define __UCM_H__
+
+#include "unicode/utypes.h"
+#include "ucnvmbcs.h"
+#include "ucnv_ext.h"
+#include "filestrm.h"
+#include <stdio.h>
+
+#if !UCONFIG_NO_CONVERSION
+
+U_CDECL_BEGIN
+
+/* constants for UCMapping.moveFlag, acted upon by ucm_moveMappings() */
+enum {
+ UCM_MOVE_TO_EXT=1, /* take the mapping out of the base table and add it to the extension table */
+ UCM_REMOVE_MAPPING=2 /* presumably: take the mapping out of the base table without adding it elsewhere - TODO confirm in ucm_moveMappings() */
+};
+
+/*
+ * Per-mapping data structure
+ *
+ * u        if uLen==1: Unicode code point
+ *          else index into UCMTable.codePoints of uLen code points
+ * b        if bLen<=4: up to 4 bytes, stored in b.bytes
+ *          else b.idx is the index into UCMTable.bytes of bLen bytes
+ * uLen     number of code points
+ * bLen     number of charset bytes
+ * f        flag for roundtrip (0), fallback (1), sub mapping (2), reverse fallback (3)
+ *          or "good one-way" mapping (4).
+ *          Same values as in the source file after |
+ *          A negative value means that the source line had no | indicator.
+ * moveFlag 0, or UCM_MOVE_TO_EXT / UCM_REMOVE_MAPPING to have
+ *          ucm_moveMappings() take the mapping out of the base table
+ *
+ * Use UCM_GET_CODE_POINTS() and UCM_GET_BYTES() to read u and b
+ * without handling the two storage forms by hand.
+ */
+typedef struct UCMapping {
+ UChar32 u;
+ union {
+ uint32_t idx;
+ uint8_t bytes[4];
+ } b;
+ int8_t uLen, bLen, f, moveFlag;
+} UCMapping;
+
+/*
+ * constants for UCMTable.flagsType
+ * The values are combined with bitwise OR as mappings are added:
+ * UCM_FLAGS_MIXED (3) equals UCM_FLAGS_EXPLICIT (1) | UCM_FLAGS_IMPLICIT (2),
+ * so the order of the enum constants must not be changed.
+ */
+enum {
+ UCM_FLAGS_INITIAL, /* no mappings parsed yet */
+ UCM_FLAGS_EXPLICIT, /* .ucm file has mappings with | fallback indicators */
+ UCM_FLAGS_IMPLICIT, /* .ucm file has mappings without | fallback indicators, later wins */
+ UCM_FLAGS_MIXED /* both implicit and explicit */
+};
+
+/*
+ * A table of mappings together with the side arrays that hold the data
+ * which does not fit into a UCMapping itself.
+ * Each array has a capacity (allocated units) and a length (used units).
+ */
+typedef struct UCMTable {
+ /* the mappings, in insertion order until the table is sorted */
+ UCMapping *mappings;
+ int32_t mappingsCapacity, mappingsLength;
+
+ /* code points of the mappings with uLen>1, addressed by UCMapping.u */
+ UChar32 *codePoints;
+ int32_t codePointsCapacity, codePointsLength;
+
+ /* bytes of the mappings with bLen>4, addressed by UCMapping.b.idx */
+ uint8_t *bytes;
+ int32_t bytesCapacity, bytesLength;
+
+ /* index map for mapping by bytes first */
+ int32_t *reverseMap;
+
+ uint8_t unicodeMask; /* UCNV_HAS_SUPPLEMENTARY and UCNV_HAS_SURROGATES bits, set as mappings are added */
+ int8_t flagsType; /* UCM_FLAGS_INITIAL etc. */
+ UBool isSorted; /* cleared whenever a mapping is added or the table is reset */
+} UCMTable;
+
+/*
+ * constants for UCMStates.stateFlags[]
+ * The low 4 bits (mask 0xf) hold the kind of state,
+ * MBCS_STATE_FLAG_READY is a separate bit on top of that.
+ */
+enum {
+ MBCS_STATE_FLAG_DIRECT=1, /* initial state, set by the "initial" directive and for state 0 by default */
+ MBCS_STATE_FLAG_SURROGATES, /* set by the "surrogates" directive */
+
+ MBCS_STATE_FLAG_READY=16 /* the offset sum of this state has been computed */
+};
+
+/*
+ * State table information of a .ucm file.
+ *
+ * stateTable        one row per state with one entry per byte value
+ * stateFlags        MBCS_STATE_FLAG_ values per state
+ * stateOffsetSum    per-state offset sums, computed during state post-processing
+ * countStates       number of rows filled in by ucm_addState()
+ * minCharLength     from <mb_cur_min>, 1 by default
+ * maxCharLength     from <mb_cur_max>, 1 by default
+ * countToUCodeUnits set during state post-processing
+ * conversionType    from <uconv_class>, UCNV_UNSUPPORTED_CONVERTER until then
+ * outputType        -1 until <mb_cur_max> sets it to maxCharLength-1
+ */
+typedef struct UCMStates {
+ int32_t stateTable[MBCS_MAX_STATE_COUNT][256];
+ uint32_t stateFlags[MBCS_MAX_STATE_COUNT],
+ stateOffsetSum[MBCS_MAX_STATE_COUNT];
+
+ int32_t countStates, minCharLength, maxCharLength, countToUCodeUnits;
+ int8_t conversionType, outputType;
+} UCMStates;
+
+/*
+ * Contents of one .ucm file: the base and extension mapping tables
+ * (both created by ucm_open() and released by ucm_close()),
+ * the state table information, and the value of the <icu:base> header field.
+ */
+typedef struct UCMFile {
+ UCMTable *base, *ext;
+ UCMStates states;
+
+ char baseName[UCNV_MAX_CONVERTER_NAME_LENGTH]; /* NUL-terminated; empty if there is no <icu:base> */
+} UCMFile;
+
+/* simple accesses ---------------------------------------------------------- */
+
+/*
+ * Pointer to the first code point of mapping m, which lives either in the
+ * mapping itself (uLen==1) or in the code points array of table t.
+ * The arguments are evaluated more than once.
+ */
+#define UCM_GET_CODE_POINTS(t, m) \
+ (((m)->uLen==1) ? &(m)->u : (t)->codePoints+(m)->u)
+
+/*
+ * Pointer to the first byte of mapping m, which lives either in the
+ * mapping itself (bLen<=4) or in the bytes array of table t.
+ * The arguments are evaluated more than once.
+ */
+#define UCM_GET_BYTES(t, m) \
+ (((m)->bLen<=4) ? (m)->b.bytes : (t)->bytes+(m)->b.idx)
+
+/* APIs --------------------------------------------------------------------- */
+
+/**
+ * Allocate a UCMFile with empty base and extension tables.
+ * Prints an error and exits the process if the allocation fails.
+ */
+U_CAPI UCMFile * U_EXPORT2
+ucm_open(void);
+
+/** Close both tables of the file and free it; a null pointer is ignored. */
+U_CAPI void U_EXPORT2
+ucm_close(UCMFile *ucm);
+
+/**
+ * Parse one line of the header section of a .ucm file.
+ * The line is modified in place; *pKey and *pValue receive pointers into it.
+ * Malformed lines and illegal values print an error and exit the process.
+ *
+ * @return true if the line was empty or contained a header field that was
+ *         stored into ucm; false at the "CHARMAP" line (*pKey and *pValue
+ *         are not set then) or for a header field that is not handled here
+ */
+U_CAPI UBool U_EXPORT2
+ucm_parseHeaderLine(UCMFile *ucm,
+ char *line, char **pKey, char **pValue);
+
+/* @return -1 illegal bytes 0 suitable for base table 1 needs to go into extension table */
+U_CAPI int32_t U_EXPORT2
+ucm_mappingType(UCMStates *baseStates,
+ UCMapping *m,
+ UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
+ uint8_t bytes[UCNV_EXT_MAX_BYTES]);
+
+/* add a mapping to the base or extension table as appropriate */
+U_CAPI UBool U_EXPORT2
+ucm_addMappingAuto(UCMFile *ucm, UBool forBase, UCMStates *baseStates,
+ UCMapping *m,
+ UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
+ uint8_t bytes[UCNV_EXT_MAX_BYTES]);
+
+/**
+ * Parse a mapping line and add the mapping to the base or extension table
+ * as appropriate. Empty lines and lines starting with '#' are accepted
+ * without adding anything.
+ *
+ * @return false if the line cannot be parsed or the mapping cannot be added
+ */
+U_CAPI UBool U_EXPORT2
+ucm_addMappingFromLine(UCMFile *ucm, const char *line, UBool forBase, UCMStates *baseStates);
+
+
+/**
+ * Allocate an empty, zero-initialized table.
+ * Prints an error and exits the process if the allocation fails.
+ */
+U_CAPI UCMTable * U_EXPORT2
+ucm_openTable(void);
+
+/** Free a table and the arrays it owns; a null pointer is ignored. */
+U_CAPI void U_EXPORT2
+ucm_closeTable(UCMTable *table);
+
+/**
+ * Empty a table for reuse: lengths, flags type, Unicode mask and sorted
+ * flag are cleared while the arrays stay allocated. A null pointer is ignored.
+ */
+U_CAPI void U_EXPORT2
+ucm_resetTable(UCMTable *table);
+
+U_CAPI void U_EXPORT2
+ucm_sortTable(UCMTable *t);
+
+/*
+ * Remove mappings with their move flag set from the base table
+ * and move some of them (with UCM_MOVE_TO_EXT) to the extension table.
+ */
+U_CAPI void U_EXPORT2
+ucm_moveMappings(UCMTable *base, UCMTable *ext);
+
+/**
+ * Read a table from a .ucm file, from after the CHARMAP line to
+ * including the END CHARMAP line.
+ */
+U_CAPI void U_EXPORT2
+ucm_readTable(UCMFile *ucm, FileStream* convFile,
+ UBool forBase, UCMStates *baseStates,
+ UErrorCode *pErrorCode);
+
+/**
+ * Check the validity of mappings against a base table's states;
+ * necessary for extension-only tables that were read before their base tables.
+ */
+U_CAPI UBool U_EXPORT2
+ucm_checkValidity(UCMTable *ext, UCMStates *baseStates);
+
+/**
+ * Check a base table against an extension table.
+ * Set the moveTarget!=NULL if it is possible to move mappings from the base.
+ * This is the case where base and extension tables are parsed from a single file
+ * (moveTarget==ext)
+ * or when delta file mappings are subtracted from a base table.
+ *
+ * When a base table cannot be modified because a delta file is parsed in makeconv,
+ * then set moveTarget=NULL.
+ *
+ * if(intersectBase) then mappings that exist in the base table but not in
+ * the extension table are moved to moveTarget instead of showing an error.
+ *
+ * Special mode:
+ * If intersectBase==2 for a DBCS extension table, then SBCS mappings are
+ * not moved out of the base unless their Unicode input requires it.
+ * This helps ucmkbase generate base tables for DBCS-only extension .cnv files.
+ *
+ * For both tables in the same file, the extension table is automatically
+ * built.
+ * For separate files, the extension file can use a complete mapping table (.ucm file),
+ * so that common mappings need not be stripped out manually.
+ *
+ *
+ * Sort both tables, and then for each mapping direction:
+ *
+ * If intersectBase is true and the base table contains a mapping
+ * that does not exist in the extension table, then this mapping is moved
+ * to moveTarget.
+ *
+ * - otherwise -
+ *
+ * If the base table contains a mapping for which the input sequence is
+ * the same as the extension input, then
+ * - if the output is the same: remove the extension mapping
+ * - else: error
+ *
+ * If the base table contains a mapping for which the input sequence is
+ * a prefix of the extension input, then
+ * - if moveTarget!=NULL: move the base mapping to the moveTarget table
+ * - else: error
+ *
+ * @return false in case of an irreparable error
+ */
+U_CAPI UBool U_EXPORT2
+ucm_checkBaseExt(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
+ UCMTable *moveTarget, UBool intersectBase);
+
+U_CAPI void U_EXPORT2
+ucm_printTable(UCMTable *table, FILE *f, UBool byUnicode);
+
+U_CAPI void U_EXPORT2
+ucm_printMapping(UCMTable *table, UCMapping *m, FILE *f);
+
+
+/**
+ * Parse the state table row s and append it as the next state.
+ * Prints an error and exits the process if the row cannot be parsed
+ * or if there are already MBCS_MAX_STATE_COUNT states.
+ */
+U_CAPI void U_EXPORT2
+ucm_addState(UCMStates *states, const char *s);
+
+U_CAPI void U_EXPORT2
+ucm_processStates(UCMStates *states, UBool ignoreSISOCheck);
+
+U_CAPI int32_t U_EXPORT2
+ucm_countChars(UCMStates *states,
+ const uint8_t *bytes, int32_t length);
+
+
+U_CAPI int8_t U_EXPORT2
+ucm_parseBytes(uint8_t bytes[UCNV_EXT_MAX_BYTES], const char *line, const char **ps);
+
+U_CAPI UBool U_EXPORT2
+ucm_parseMappingLine(UCMapping *m,
+ UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
+ uint8_t bytes[UCNV_EXT_MAX_BYTES],
+ const char *line);
+
+/**
+ * Append a copy of *m to the table.
+ * For uLen>1 the code points, and for bLen>4 the bytes, are copied into the
+ * table's side arrays and m->u or m->b.idx is overwritten with their index,
+ * so *m is modified. For the short forms, m->u and m->b.bytes must already
+ * be filled in. Allocation failures and full side arrays print an error and
+ * exit the process.
+ */
+U_CAPI void U_EXPORT2
+ucm_addMapping(UCMTable *table,
+ UCMapping *m,
+ UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
+ uint8_t bytes[UCNV_EXT_MAX_BYTES]);
+
+/* very makeconv-specific functions ----------------------------------------- */
+
+/* finalize and optimize states after the toUnicode mappings are processed */
+U_CAPI void U_EXPORT2
+ucm_optimizeStates(UCMStates *states,
+ uint16_t **pUnicodeCodeUnits,
+ _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks,
+ UBool verbose);
+
+/* moved here because it is used inside ucmstate.c */
+U_CAPI int32_t U_EXPORT2
+ucm_findFallback(_MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks,
+ uint32_t offset);
+
+/* very rptp2ucm-specific functions ----------------------------------------- */
+
+/*
+ * Input: Separate tables with mappings from/to Unicode,
+ * subchar and subchar1 (0 if none).
+ * All mappings must have flag 0.
+ *
+ * Output: fromUTable will contain the union of mappings with the correct
+ * precision flags, and be sorted.
+ */
+U_CAPI void U_EXPORT2
+ucm_mergeTables(UCMTable *fromUTable, UCMTable *toUTable,
+ const uint8_t *subchar, int32_t subcharLength,
+ uint8_t subchar1);
+
+U_CAPI UBool U_EXPORT2
+ucm_separateMappings(UCMFile *ucm, UBool isSISO);
+
+U_CDECL_END
+
+#endif
+
+#endif
+
diff --git a/intl/icu/source/tools/toolutil/ucmstate.cpp b/intl/icu/source/tools/toolutil/ucmstate.cpp
new file mode 100644
index 0000000000..08782f68d1
--- /dev/null
+++ b/intl/icu/source/tools/toolutil/ucmstate.cpp
@@ -0,0 +1,1053 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/*
+*******************************************************************************
+*
+* Copyright (C) 2003-2012, International Business Machines
+* Corporation and others. All Rights Reserved.
+*
+*******************************************************************************
+* file name: ucmstate.c
+* encoding: UTF-8
+* tab size: 8 (not used)
+* indentation:4
+*
+* created on: 2003oct09
+* created by: Markus W. Scherer
+*
+* This file handles ICU .ucm file state information as part of the ucm module.
+* Most of this code used to be in makeconv.c.
+*/
+
+#include "unicode/utypes.h"
+#include "cstring.h"
+#include "cmemory.h"
+#include "uarrsort.h"
+#include "ucnvmbcs.h"
+#include "ucnv_ext.h"
+#include "uparse.h"
+#include "ucm.h"
+#include <stdio.h>
+
+#if !UCONFIG_NO_CONVERSION
+
+/* MBCS state handling ------------------------------------------------------ */
+
+/*
+ * state table row grammar (ebnf-style):
+ * (whitespace is allowed between all tokens)
+ *
+ * row=[[firstentry ','] entry (',' entry)*]
+ * firstentry="initial" | "surrogates"
+ * (initial state (default for state 0), output is all surrogate pairs)
+ * entry=range [':' nextstate] ['.' action]
+ * range=number ['-' number]
+ * nextstate=number
+ * (0..7f)
+ * action='u' | 's' | 'p' | 'i'
+ * (unassigned, state change only, surrogate pair, illegal)
+ * number=(1- or 2-digit hexadecimal number)
+ *
+ * Parse one state table row.
+ *
+ * s      the row text
+ * state  receives the 256 entries of the row; every byte value that is not
+ *        covered by an entry stays "illegal"
+ * pFlags in/out state flags: set to MBCS_STATE_FLAG_DIRECT by "initial";
+ *        set to MBCS_STATE_FLAG_SURROGATES by "surrogates", which is only
+ *        recognized while *pFlags is 0; the resulting value determines
+ *        how "final valid" entries are adjusted
+ *
+ * Returns nullptr if the whole row was parsed (an empty row is fine),
+ * otherwise a pointer into s at the text that could not be parsed.
+ */
+static const char *
+parseState(const char *s, int32_t state[256], uint32_t *pFlags) {
+ const char *t;
+ uint32_t start, end, i;
+ int32_t entry;
+
+ /* initialize the state: all illegal with U+ffff */
+ for(i=0; i<256; ++i) {
+ state[i]=MBCS_ENTRY_FINAL(0, MBCS_STATE_ILLEGAL, 0xffff);
+ }
+
+ /* skip leading white space */
+ s=u_skipWhitespace(s);
+
+ /* is there an "initial" or "surrogates" directive? */
+ if(uprv_strncmp("initial", s, 7)==0) {
+ *pFlags=MBCS_STATE_FLAG_DIRECT;
+ s=u_skipWhitespace(s+7);
+ if(*s++!=',') {
+ return s-1; /* a directive must be followed by a comma */
+ }
+ } else if(*pFlags==0 && uprv_strncmp("surrogates", s, 10)==0) {
+ *pFlags=MBCS_STATE_FLAG_SURROGATES;
+ s=u_skipWhitespace(s+10);
+ if(*s++!=',') {
+ return s-1; /* a directive must be followed by a comma */
+ }
+ } else if(*s==0) {
+ /* empty state row: all-illegal */
+ return nullptr;
+ }
+
+ for(;;) {
+ /* read an entry, the start of the range first */
+ s=u_skipWhitespace(s);
+ start=uprv_strtoul(s, (char **)&t, 16);
+ if(s==t || 0xff<start) {
+ return s;
+ }
+ s=u_skipWhitespace(t);
+
+ /* read the end of the range if there is one */
+ if(*s=='-') {
+ s=u_skipWhitespace(s+1);
+ end=uprv_strtoul(s, (char **)&t, 16);
+ if(s==t || end<start || 0xff<end) {
+ return s;
+ }
+ s=u_skipWhitespace(t);
+ } else {
+ end=start;
+ }
+
+ /* determine the state entry for this range */
+ if(*s!=':' && *s!='.') {
+ /* the default is: final state with valid entries */
+ entry=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_16, 0);
+ } else {
+ entry=MBCS_ENTRY_TRANSITION(0, 0);
+ if(*s==':') {
+ /* get the next state, default to 0 */
+ s=u_skipWhitespace(s+1);
+ i=uprv_strtoul(s, (char **)&t, 16);
+ if(s!=t) {
+ if(0x7f<i) {
+ return s;
+ }
+ s=u_skipWhitespace(t);
+ entry=MBCS_ENTRY_SET_STATE(entry, i);
+ }
+ }
+
+ /* get the state action, default to valid */
+ if(*s=='.') {
+ /* this is a final state */
+ entry=MBCS_ENTRY_SET_FINAL(entry);
+
+ s=u_skipWhitespace(s+1);
+ if(*s=='u') {
+ /* unassigned set U+fffe */
+ entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_UNASSIGNED, 0xfffe);
+ s=u_skipWhitespace(s+1);
+ } else if(*s=='p') {
+ /* surrogate pair output, except in a "direct" state where it is treated as plain valid */
+ if(*pFlags!=MBCS_STATE_FLAG_DIRECT) {
+ entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_VALID_16_PAIR);
+ } else {
+ entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_VALID_16);
+ }
+ s=u_skipWhitespace(s+1);
+ } else if(*s=='s') {
+ entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_CHANGE_ONLY);
+ s=u_skipWhitespace(s+1);
+ } else if(*s=='i') {
+ /* illegal set U+ffff */
+ entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_ILLEGAL, 0xffff);
+ s=u_skipWhitespace(s+1);
+ } else {
+ /* default to valid */
+ entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_VALID_16);
+ }
+ } else {
+ /* this is an intermediate state, nothing to do */
+ }
+ }
+
+ /* adjust "final valid" states according to the state flags */
+ if(MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16) {
+ switch(*pFlags) {
+ case 0:
+ /* no adjustment */
+ break;
+ case MBCS_STATE_FLAG_DIRECT:
+ /* set the valid-direct code point to "unassigned"==0xfffe */
+ entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_VALID_DIRECT_16, 0xfffe);
+ break;
+ case MBCS_STATE_FLAG_SURROGATES:
+ entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_VALID_16_PAIR, 0);
+ break;
+ default:
+ break;
+ }
+ }
+
+ /* set this entry for the range */
+ for(i=start; i<=end; ++i) {
+ state[i]=entry;
+ }
+
+ /* a comma continues the row; anything else but the end of the string is an error */
+ if(*s==',') {
+ ++s;
+ } else {
+ return *s==0 ? nullptr : s;
+ }
+ }
+}
+
+/**
+ * Parse the state table row s into the next free state of states.
+ * Prints an error and exits the process if all MBCS_MAX_STATE_COUNT states
+ * are in use or if the row cannot be parsed.
+ */
+U_CAPI void U_EXPORT2
+ucm_addState(UCMStates *states, const char *s) {
+    const int32_t index=states->countStates;
+
+    if(index==MBCS_MAX_STATE_COUNT) {
+        fprintf(stderr, "ucm error: too many states (maximum %u)\n", MBCS_MAX_STATE_COUNT);
+        exit(U_INVALID_TABLE_FORMAT);
+    }
+
+    /* a non-null result points at the text that could not be parsed */
+    const char *errorPosition=parseState(s, states->stateTable[index], &states->stateFlags[index]);
+    if(errorPosition!=nullptr) {
+        fprintf(stderr, "ucm error: parse error in state definition at '%s'\n", errorPosition);
+        exit(U_INVALID_TABLE_FORMAT);
+    }
+
+    states->countStates=index+1;
+}
+
+/**
+ * Parse one line from the header section of a .ucm file.
+ *
+ * The line is modified in place: a comment, trailing CR/LF and trailing
+ * spaces/tabs are cut off, and NUL terminators are written after the key
+ * and, for a quoted value, over the closing quote.
+ * The header fields <uconv_class>, <mb_cur_max>, <mb_cur_min>, <icu:state>
+ * and <icu:base> are stored into ucm. Malformed lines and illegal values,
+ * including an <icu:base> name that does not fit into ucm->baseName,
+ * print an error and exit the process.
+ *
+ * @param ucm    file object whose states and baseName are updated
+ * @param line   NUL-terminated, writable header line
+ * @param pKey   receives a pointer into line to the key name (without <>)
+ * @param pValue receives a pointer into line to the value (without quotes)
+ * @return true if the line was empty or its header field was handled here;
+ *         false at the start of the mapping section ("CHARMAP", in which
+ *         case *pKey and *pValue are not set) or for a header field that
+ *         is not handled here
+ */
+U_CAPI UBool U_EXPORT2
+ucm_parseHeaderLine(UCMFile *ucm,
+                    char *line, char **pKey, char **pValue) {
+    UCMStates *states;
+    char *s, *end;
+    char c;
+
+    states=&ucm->states;
+
+    /* remove comments and trailing CR and LF and remove whitespace from the end */
+    for(end=line; (c=*end)!=0; ++end) {
+        if(c=='#' || c=='\r' || c=='\n') {
+            break;
+        }
+    }
+    while(end>line && (*(end-1)==' ' || *(end-1)=='\t')) {
+        --end;
+    }
+    *end=0;
+
+    /* skip leading white space and ignore empty lines */
+    s=(char *)u_skipWhitespace(line);
+    if(*s==0) {
+        return true;
+    }
+
+    /*
+     * stop at the beginning of the mapping section;
+     * a string comparison stops at the terminator of a short line
+     * where a memory comparison would read beyond it
+     */
+    if(uprv_strncmp(s, "CHARMAP", 7)==0) {
+        return false;
+    }
+
+    /* get the key name, bracketed in <> */
+    if(*s!='<') {
+        fprintf(stderr, "ucm error: no header field <key> in line \"%s\"\n", line);
+        exit(U_INVALID_TABLE_FORMAT);
+    }
+    *pKey=++s;
+    while(*s!='>') {
+        if(*s==0) {
+            fprintf(stderr, "ucm error: incomplete header field <key> in line \"%s\"\n", line);
+            exit(U_INVALID_TABLE_FORMAT);
+        }
+        ++s;
+    }
+    *s=0;
+
+    /* get the value string, possibly quoted */
+    s=(char *)u_skipWhitespace(s+1);
+    if(*s!='"') {
+        *pValue=s;
+    } else {
+        /* remove the quotes */
+        *pValue=s+1;
+        if(end>*pValue && *(end-1)=='"') {
+            *--end=0;
+        }
+    }
+
+    /* collect the information from the header field, ignore unknown keys */
+    if(uprv_strcmp(*pKey, "uconv_class")==0) {
+        if(uprv_strcmp(*pValue, "DBCS")==0) {
+            states->conversionType=UCNV_DBCS;
+        } else if(uprv_strcmp(*pValue, "SBCS")==0) {
+            states->conversionType = UCNV_SBCS;
+        } else if(uprv_strcmp(*pValue, "MBCS")==0) {
+            states->conversionType = UCNV_MBCS;
+        } else if(uprv_strcmp(*pValue, "EBCDIC_STATEFUL")==0) {
+            states->conversionType = UCNV_EBCDIC_STATEFUL;
+        } else {
+            fprintf(stderr, "ucm error: unknown <uconv_class> %s\n", *pValue);
+            exit(U_INVALID_TABLE_FORMAT);
+        }
+        return true;
+    } else if(uprv_strcmp(*pKey, "mb_cur_max")==0) {
+        /* exactly one digit 1..4 */
+        c=**pValue;
+        if('1'<=c && c<='4' && (*pValue)[1]==0) {
+            states->maxCharLength=(int8_t)(c-'0');
+            states->outputType=(int8_t)(states->maxCharLength-1);
+        } else {
+            fprintf(stderr, "ucm error: illegal <mb_cur_max> %s\n", *pValue);
+            exit(U_INVALID_TABLE_FORMAT);
+        }
+        return true;
+    } else if(uprv_strcmp(*pKey, "mb_cur_min")==0) {
+        /* exactly one digit 1..4 */
+        c=**pValue;
+        if('1'<=c && c<='4' && (*pValue)[1]==0) {
+            states->minCharLength=(int8_t)(c-'0');
+        } else {
+            fprintf(stderr, "ucm error: illegal <mb_cur_min> %s\n", *pValue);
+            exit(U_INVALID_TABLE_FORMAT);
+        }
+        return true;
+    } else if(uprv_strcmp(*pKey, "icu:state")==0) {
+        /* if an SBCS/DBCS/EBCDIC_STATEFUL converter has icu:state, then turn it into MBCS */
+        switch(states->conversionType) {
+        case UCNV_SBCS:
+        case UCNV_DBCS:
+        case UCNV_EBCDIC_STATEFUL:
+            states->conversionType=UCNV_MBCS;
+            break;
+        case UCNV_MBCS:
+            break;
+        default:
+            fprintf(stderr, "ucm error: <icu:state> entry for non-MBCS table or before the <uconv_class> line\n");
+            exit(U_INVALID_TABLE_FORMAT);
+        }
+
+        if(states->maxCharLength==0) {
+            fprintf(stderr, "ucm error: <icu:state> before the <mb_cur_max> line\n");
+            exit(U_INVALID_TABLE_FORMAT);
+        }
+        ucm_addState(states, *pValue);
+        return true;
+    } else if(uprv_strcmp(*pKey, "icu:base")==0) {
+        if(**pValue==0) {
+            fprintf(stderr, "ucm error: <icu:base> without a base table name\n");
+            exit(U_INVALID_TABLE_FORMAT);
+        }
+        /* the name comes straight from the file: make sure that it fits, including its terminator */
+        if(uprv_strlen(*pValue)>=sizeof(ucm->baseName)) {
+            fprintf(stderr, "ucm error: <icu:base> name \"%s\" is too long (maximum %d characters)\n",
+                    *pValue, (int)sizeof(ucm->baseName)-1);
+            exit(U_INVALID_TABLE_FORMAT);
+        }
+        uprv_strcpy(ucm->baseName, *pValue);
+        return true;
+    }
+
+    return false;
+}
+
+/* post-processing ---------------------------------------------------------- */
+
+/*
+ * Compute the offsets for all state table entries.
+ *
+ * Writes the running offset into each final "valid" entry and each
+ * transition entry of every state, stores each state's total in
+ * stateOffsetSum[] and marks the state with MBCS_STATE_FLAG_READY.
+ * Prints an error and exits the process if some state never becomes ready,
+ * which is reported as a loop in the state table.
+ *
+ * Returns the value that is also stored in states->countToUCodeUnits:
+ * the sum over state 0 and all further "direct" states, rounded up to an
+ * even number.
+ */
+static int32_t
+sumUpStates(UCMStates *states) {
+ int32_t entry, sum, state, cell, count;
+ UBool allStatesReady;
+
+ /*
+ * Sum up the offsets for all states.
+ * In each final state (where there are only final entries),
+ * the offsets add up directly.
+ * In all other state table rows, for each transition entry to another state,
+ * the offsets sum of that state needs to be added.
+ * This is achieved in at most countStates iterations.
+ */
+ allStatesReady=false;
+ for(count=states->countStates; !allStatesReady && count>=0; --count) {
+ allStatesReady=true;
+ for(state=states->countStates-1; state>=0; --state) {
+ if(!(states->stateFlags[state]&MBCS_STATE_FLAG_READY)) {
+ allStatesReady=false;
+ sum=0;
+
+ /* at first, add up only the final delta offsets to keep them <512 */
+ for(cell=0; cell<256; ++cell) {
+ entry=states->stateTable[state][cell];
+ if(MBCS_ENTRY_IS_FINAL(entry)) {
+ switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
+ case MBCS_STATE_VALID_16:
+ states->stateTable[state][cell]=MBCS_ENTRY_FINAL_SET_VALUE(entry, sum);
+ sum+=1;
+ break;
+ case MBCS_STATE_VALID_16_PAIR:
+ states->stateTable[state][cell]=MBCS_ENTRY_FINAL_SET_VALUE(entry, sum);
+ sum+=2;
+ break;
+ default:
+ /* no addition */
+ break;
+ }
+ }
+ }
+
+ /* now, add up the delta offsets for the transitional entries */
+ for(cell=0; cell<256; ++cell) {
+ entry=states->stateTable[state][cell];
+ if(MBCS_ENTRY_IS_TRANSITION(entry)) {
+ if(states->stateFlags[MBCS_ENTRY_TRANSITION_STATE(entry)]&MBCS_STATE_FLAG_READY) {
+ states->stateTable[state][cell]=MBCS_ENTRY_TRANSITION_SET_OFFSET(entry, sum);
+ sum+=states->stateOffsetSum[MBCS_ENTRY_TRANSITION_STATE(entry)];
+ } else {
+ /* that next state does not have a sum yet, we cannot finish the one for this state */
+ sum=-1;
+ break;
+ }
+ }
+ }
+
+ /* sum==-1 means: try this state again in the next iteration */
+ if(sum!=-1) {
+ states->stateOffsetSum[state]=sum;
+ states->stateFlags[state]|=MBCS_STATE_FLAG_READY;
+ }
+ }
+ }
+ }
+
+ if(!allStatesReady) {
+ fprintf(stderr, "ucm error: the state table contains loops\n");
+ exit(U_INVALID_TABLE_FORMAT);
+ }
+
+ /*
+ * For all "direct" (i.e., initial) states>0,
+ * the offsets need to be increased by the sum of
+ * the previous initial states.
+ */
+ sum=states->stateOffsetSum[0];
+ for(state=1; state<states->countStates; ++state) {
+ if((states->stateFlags[state]&0xf)==MBCS_STATE_FLAG_DIRECT) {
+ int32_t sum2=sum; /* the sum of the initial states before this one */
+ sum+=states->stateOffsetSum[state];
+ for(cell=0; cell<256; ++cell) {
+ entry=states->stateTable[state][cell];
+ if(MBCS_ENTRY_IS_TRANSITION(entry)) {
+ states->stateTable[state][cell]=MBCS_ENTRY_TRANSITION_ADD_OFFSET(entry, sum2);
+ }
+ }
+ }
+ }
+
+ /* round up to the next even number to have the following data 32-bit-aligned */
+ return states->countToUCodeUnits=(sum+1)&~1;
+}
+
+U_CAPI void U_EXPORT2
+ucm_processStates(UCMStates *states, UBool ignoreSISOCheck) {
+ int32_t entry, state, cell, count;
+
+ if(states->conversionType==UCNV_UNSUPPORTED_CONVERTER) {
+ fprintf(stderr, "ucm error: missing conversion type (<uconv_class>)\n");
+ exit(U_INVALID_TABLE_FORMAT);
+ }
+
+ if(states->countStates==0) {
+ switch(states->conversionType) {
+ case UCNV_SBCS:
+ /* SBCS: use MBCS data structure with a default state table */
+ if(states->maxCharLength!=1) {
+ fprintf(stderr, "error: SBCS codepage with max B/char!=1\n");
+ exit(U_INVALID_TABLE_FORMAT);
+ }
+ states->conversionType=UCNV_MBCS;
+ ucm_addState(states, "0-ff");
+ break;
+ case UCNV_MBCS:
+ fprintf(stderr, "ucm error: missing state table information (<icu:state>) for MBCS\n");
+ exit(U_INVALID_TABLE_FORMAT);
+ break;
+ case UCNV_EBCDIC_STATEFUL:
+ /* EBCDIC_STATEFUL: use MBCS data structure with a default state table */
+ if(states->minCharLength!=1 || states->maxCharLength!=2) {
+ fprintf(stderr, "error: DBCS codepage with min B/char!=1 or max B/char!=2\n");
+ exit(U_INVALID_TABLE_FORMAT);
+ }
+ states->conversionType=UCNV_MBCS;
+ ucm_addState(states, "0-ff, e:1.s, f:0.s");
+ ucm_addState(states, "initial, 0-3f:4, e:1.s, f:0.s, 40:3, 41-fe:2, ff:4");
+ ucm_addState(states, "0-40:1.i, 41-fe:1., ff:1.i");
+ ucm_addState(states, "0-ff:1.i, 40:1.");
+ ucm_addState(states, "0-ff:1.i");
+ break;
+ case UCNV_DBCS:
+ /* DBCS: use MBCS data structure with a default state table */
+ if(states->minCharLength!=2 || states->maxCharLength!=2) {
+ fprintf(stderr, "error: DBCS codepage with min or max B/char!=2\n");
+ exit(U_INVALID_TABLE_FORMAT);
+ }
+ states->conversionType = UCNV_MBCS;
+ ucm_addState(states, "0-3f:3, 40:2, 41-fe:1, ff:3");
+ ucm_addState(states, "41-fe");
+ ucm_addState(states, "40");
+ ucm_addState(states, "");
+ break;
+ default:
+ fprintf(stderr, "ucm error: unknown charset structure\n");
+ exit(U_INVALID_TABLE_FORMAT);
+ break;
+ }
+ }
+
+ /*
+ * check that the min/max character lengths are reasonable;
+ * to do this right, all paths through the state table would have to be
+ * recursively walked while keeping track of the sequence lengths,
+ * but these simple checks cover most state tables in practice
+ */
+ if(states->maxCharLength<states->minCharLength) {
+ fprintf(stderr, "ucm error: max B/char < min B/char\n");
+ exit(U_INVALID_TABLE_FORMAT);
+ }
+
+ /* count non-direct states and compare with max B/char */
+ count=0;
+ for(state=0; state<states->countStates; ++state) {
+ if((states->stateFlags[state]&0xf)!=MBCS_STATE_FLAG_DIRECT) {
+ ++count;
+ }
+ }
+ if(states->maxCharLength>count+1) {
+ fprintf(stderr, "ucm error: max B/char too large\n");
+ exit(U_INVALID_TABLE_FORMAT);
+ }
+
+ if(states->minCharLength==1) {
+ int32_t action;
+
+ /*
+ * if there are single-byte characters,
+ * then the initial state must have direct result states
+ */
+ for(cell=0; cell<256; ++cell) {
+ entry=states->stateTable[0][cell];
+ if( MBCS_ENTRY_IS_FINAL(entry) &&
+ ((action=MBCS_ENTRY_FINAL_ACTION(entry))==MBCS_STATE_VALID_DIRECT_16 ||
+ action==MBCS_STATE_UNASSIGNED)
+ ) {
+ break;
+ }
+ }
+
+ if(cell==256) {
+ fprintf(stderr, "ucm warning: min B/char too small\n");
+ }
+ }
+
+ /*
+ * make sure that all "next state" values are within limits
+ * and that all next states after final ones have the "direct"
+ * flag of initial states
+ */
+ for(state=states->countStates-1; state>=0; --state) {
+ for(cell=0; cell<256; ++cell) {
+ entry=states->stateTable[state][cell];
+ if((uint8_t)MBCS_ENTRY_STATE(entry)>=states->countStates) {
+ fprintf(stderr, "ucm error: state table entry [%x][%x] has a next state of %x that is too high\n",
+ (int)state, (int)cell, (int)MBCS_ENTRY_STATE(entry));
+ exit(U_INVALID_TABLE_FORMAT);
+ }
+ if(MBCS_ENTRY_IS_FINAL(entry) && (states->stateFlags[MBCS_ENTRY_STATE(entry)]&0xf)!=MBCS_STATE_FLAG_DIRECT) {
+ fprintf(stderr, "ucm error: state table entry [%x][%x] is final but has a non-initial next state of %x\n",
+ (int)state, (int)cell, (int)MBCS_ENTRY_STATE(entry));
+ exit(U_INVALID_TABLE_FORMAT);
+ } else if(MBCS_ENTRY_IS_TRANSITION(entry) && (states->stateFlags[MBCS_ENTRY_STATE(entry)]&0xf)==MBCS_STATE_FLAG_DIRECT) {
+ fprintf(stderr, "ucm error: state table entry [%x][%x] is not final but has an initial next state of %x\n",
+ (int)state, (int)cell, (int)MBCS_ENTRY_STATE(entry));
+ exit(U_INVALID_TABLE_FORMAT);
+ }
+ }
+ }
+
+ /* is this an SI/SO (like EBCDIC-stateful) state table? */
+ if(states->countStates>=2 && (states->stateFlags[1]&0xf)==MBCS_STATE_FLAG_DIRECT) {
+ if(states->maxCharLength!=2) {
+ fprintf(stderr, "ucm error: SI/SO codepages must have max 2 bytes/char (not %x)\n", (int)states->maxCharLength);
+ exit(U_INVALID_TABLE_FORMAT);
+ }
+ if(states->countStates<3) {
+ fprintf(stderr, "ucm error: SI/SO codepages must have at least 3 states (not %x)\n", (int)states->countStates);
+ exit(U_INVALID_TABLE_FORMAT);
+ }
+ /* are the SI/SO all in the right places? */
+ if( ignoreSISOCheck ||
+ (states->stateTable[0][0xe]==MBCS_ENTRY_FINAL(1, MBCS_STATE_CHANGE_ONLY, 0) &&
+ states->stateTable[0][0xf]==MBCS_ENTRY_FINAL(0, MBCS_STATE_CHANGE_ONLY, 0) &&
+ states->stateTable[1][0xe]==MBCS_ENTRY_FINAL(1, MBCS_STATE_CHANGE_ONLY, 0) &&
+ states->stateTable[1][0xf]==MBCS_ENTRY_FINAL(0, MBCS_STATE_CHANGE_ONLY, 0))
+ ) {
+ states->outputType=MBCS_OUTPUT_2_SISO;
+ } else {
+ fprintf(stderr, "ucm error: SI/SO codepages must have in states 0 and 1 transitions e:1.s, f:0.s\n");
+ exit(U_INVALID_TABLE_FORMAT);
+ }
+ state=2;
+ } else {
+ state=1;
+ }
+
+ /* check that no unexpected state is a "direct" one */
+ while(state<states->countStates) {
+ if((states->stateFlags[state]&0xf)==MBCS_STATE_FLAG_DIRECT) {
+ fprintf(stderr, "ucm error: state %d is 'initial' - not supported except for SI/SO codepages\n", (int)state);
+ exit(U_INVALID_TABLE_FORMAT);
+ }
+ ++state;
+ }
+
+ sumUpStates(states);
+}
+
+/*
+ * Look up the toUnicode fallback entry whose offset equals the given one.
+ * The table is scanned front to back because it is not sorted yet.
+ * Returns the index of the first match, or -1 when there is none
+ * (which includes the common case of an empty table).
+ */
+U_CAPI int32_t U_EXPORT2
+ucm_findFallback(_MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks,
+                 uint32_t offset) {
+    int32_t idx=0;
+
+    while(idx<countToUFallbacks) {
+        if(toUFallbacks[idx].offset==offset) {
+            return idx;
+        }
+        ++idx;
+    }
+
+    /* no entry carries this offset */
+    return -1;
+}
+
+/*
+ * This function tries to compact toUnicode tables for 2-byte codepages
+ * by finding lead bytes with all-unassigned trail bytes and adding another state
+ * for them.
+ *
+ * Outline:
+ * 1. pick the lead state (state 1 for SI/SO codepages, otherwise state 0) and
+ *    the trail state that most lead bytes transition to
+ * 2. count the code units used by lead bytes whose trail bytes are all unassigned;
+ *    give up if that does not outweigh the 1024 bytes of one more state row
+ * 3. append a new all-unassigned copy of the trail state and redirect those lead bytes to it
+ * 4. re-sum the state table, allocate a new code units array of the new size,
+ *    and copy the still-assigned code units over, adjusting fallback offsets
+ *
+ * On success *pUnicodeCodeUnits is replaced (set to nullptr if the new sum is 0)
+ * and the old array is freed. If the new array cannot be allocated,
+ * the old state table and code units array are put back.
+ */
+static void
+compactToUnicode2(UCMStates *states,
+ uint16_t **pUnicodeCodeUnits,
+ _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks,
+ UBool verbose) {
+ int32_t (*oldStateTable)[256];
+ uint16_t count[256];
+ uint16_t *oldUnicodeCodeUnits;
+ int32_t entry, offset, oldOffset, trailOffset, oldTrailOffset, savings, sum;
+ int32_t i, j, leadState, trailState, newState, fallback;
+ uint16_t unit;
+
+ /* find the lead state */
+ if(states->outputType==MBCS_OUTPUT_2_SISO) {
+ /* use the DBCS lead state for SI/SO codepages */
+ leadState=1;
+ } else {
+ leadState=0;
+ }
+
+ /* find the main trail state: the most used target state */
+ /* count[] is first used per target state, then reused per lead byte below */
+ uprv_memset(count, 0, sizeof(count));
+ for(i=0; i<256; ++i) {
+ entry=states->stateTable[leadState][i];
+ if(MBCS_ENTRY_IS_TRANSITION(entry)) {
+ ++count[MBCS_ENTRY_TRANSITION_STATE(entry)];
+ }
+ }
+ trailState=0;
+ for(i=1; i<states->countStates; ++i) {
+ if(count[i]>count[trailState]) {
+ trailState=i;
+ }
+ }
+
+ /* count possible savings from lead bytes with all-unassigned results in all trail bytes */
+ uprv_memset(count, 0, sizeof(count));
+ savings=0;
+ /* for each lead byte */
+ for(i=0; i<256; ++i) {
+ entry=states->stateTable[leadState][i];
+ if(MBCS_ENTRY_IS_TRANSITION(entry) &&
+ (MBCS_ENTRY_TRANSITION_STATE(entry))==static_cast<uint32_t>(trailState)) {
+ /* the offset is different for each lead byte */
+ offset=MBCS_ENTRY_TRANSITION_OFFSET(entry);
+ /* for each trail byte for this lead byte */
+ /* setting j=999 ends this loop early; j!=256 afterwards means "has assignments" */
+ for(j=0; j<256; ++j) {
+ entry=states->stateTable[trailState][j];
+ switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
+ case MBCS_STATE_VALID_16:
+ entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
+ if((*pUnicodeCodeUnits)[entry]==0xfffe && ucm_findFallback(toUFallbacks, countToUFallbacks, entry)<0) {
+ ++count[i];
+ } else {
+ j=999; /* do not count for this lead byte because there are assignments */
+ }
+ break;
+ case MBCS_STATE_VALID_16_PAIR:
+ entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
+ if((*pUnicodeCodeUnits)[entry]==0xfffe) {
+ count[i]+=2;
+ } else {
+ j=999; /* do not count for this lead byte because there are assignments */
+ }
+ break;
+ default:
+ break;
+ }
+ }
+ if(j==256) {
+ /* all trail bytes for this lead byte are unassigned */
+ savings+=count[i];
+ } else {
+ count[i]=0;
+ }
+ }
+ }
+ /* subtract from the possible savings the cost of an additional state */
+ /* 1024 = 256 state table entries of 4 bytes each */
+ savings=savings*2-1024; /* count bytes, not 16-bit words */
+ if(savings<=0) {
+ return;
+ }
+ if(verbose) {
+ printf("compacting toUnicode data saves %ld bytes\n", (long)savings);
+ }
+ if(states->countStates>=MBCS_MAX_STATE_COUNT) {
+ fprintf(stderr, "cannot compact toUnicode because the maximum number of states is reached\n");
+ return;
+ }
+
+ /* make a copy of the state table */
+ /* it is needed to find old offsets while copying, and to revert on allocation failure */
+ oldStateTable=(int32_t (*)[256])uprv_malloc(states->countStates*1024);
+ if(oldStateTable==nullptr) {
+ fprintf(stderr, "cannot compact toUnicode: out of memory\n");
+ return;
+ }
+ uprv_memcpy(oldStateTable, states->stateTable, states->countStates*1024);
+
+ /* add the new state */
+ /*
+ * this function does not catch the degenerate case where all lead bytes
+ * have all-unassigned trail bytes and the lead state could be removed
+ */
+ newState=states->countStates++;
+ states->stateFlags[newState]=0;
+ /* copy the old trail state, turning all assigned states into unassigned ones */
+ for(i=0; i<256; ++i) {
+ entry=states->stateTable[trailState][i];
+ switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
+ case MBCS_STATE_VALID_16:
+ case MBCS_STATE_VALID_16_PAIR:
+ states->stateTable[newState][i]=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_UNASSIGNED, 0xfffe);
+ break;
+ default:
+ states->stateTable[newState][i]=entry;
+ break;
+ }
+ }
+
+ /* in the lead state, redirect all lead bytes with all-unassigned trail bytes to the new state */
+ /* count[i] is nonzero only for lead bytes that passed the all-unassigned test above */
+ for(i=0; i<256; ++i) {
+ if(count[i]>0) {
+ states->stateTable[leadState][i]=MBCS_ENTRY_SET_STATE(states->stateTable[leadState][i], newState);
+ }
+ }
+
+ /* sum up the new state table */
+ /* the READY flags are cleared first so that sumUpStates() processes every state again */
+ for(i=0; i<states->countStates; ++i) {
+ states->stateFlags[i]&=~MBCS_STATE_FLAG_READY;
+ }
+ sum=sumUpStates(states);
+
+ /* allocate a new, smaller code units array */
+ oldUnicodeCodeUnits=*pUnicodeCodeUnits;
+ if(sum==0) {
+ /* nothing left to store: drop the array entirely */
+ *pUnicodeCodeUnits=nullptr;
+ if(oldUnicodeCodeUnits!=nullptr) {
+ uprv_free(oldUnicodeCodeUnits);
+ }
+ uprv_free(oldStateTable);
+ return;
+ }
+ *pUnicodeCodeUnits=(uint16_t *)uprv_malloc(sum*sizeof(uint16_t));
+ if(*pUnicodeCodeUnits==nullptr) {
+ fprintf(stderr, "cannot compact toUnicode: out of memory allocating %ld 16-bit code units\n",
+ (long)sum);
+ /* revert to the old state table */
+ *pUnicodeCodeUnits=oldUnicodeCodeUnits;
+ --states->countStates;
+ uprv_memcpy(states->stateTable, oldStateTable, states->countStates*1024);
+ uprv_free(oldStateTable);
+ return;
+ }
+ /* start with every code unit unassigned; assigned ones are copied in below */
+ for(i=0; i<sum; ++i) {
+ (*pUnicodeCodeUnits)[i]=0xfffe;
+ }
+
+ /* copy the code units for all assigned characters */
+ /*
+ * The old state table has the same lead _and_ trail states for assigned characters!
+ * The differences are in the offsets, and in the trail states for some unassigned characters.
+ * For each character with an assigned state in the new table, it was assigned in the old one.
+ * Only still-assigned characters are copied.
+ * Note that fallback mappings need to get their offset values adjusted.
+ */
+
+ /* for each initial state */
+ /* note: leadState and trailState are reused as loop variables from here on */
+ for(leadState=0; leadState<states->countStates; ++leadState) {
+ if((states->stateFlags[leadState]&0xf)==MBCS_STATE_FLAG_DIRECT) {
+ /* for each lead byte from there */
+ for(i=0; i<256; ++i) {
+ entry=states->stateTable[leadState][i];
+ if(MBCS_ENTRY_IS_TRANSITION(entry)) {
+ trailState=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
+ /* the new state does not have assigned states */
+ if(trailState!=newState) {
+ trailOffset=MBCS_ENTRY_TRANSITION_OFFSET(entry);
+ oldTrailOffset=MBCS_ENTRY_TRANSITION_OFFSET(oldStateTable[leadState][i]);
+ /* for each trail byte */
+ for(j=0; j<256; ++j) {
+ entry=states->stateTable[trailState][j];
+ /* copy assigned-character code units and adjust fallback offsets */
+ switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
+ case MBCS_STATE_VALID_16:
+ offset=trailOffset+MBCS_ENTRY_FINAL_VALUE_16(entry);
+ /* find the old offset according to the old state table */
+ oldOffset=oldTrailOffset+MBCS_ENTRY_FINAL_VALUE_16(oldStateTable[trailState][j]);
+ unit=(*pUnicodeCodeUnits)[offset]=oldUnicodeCodeUnits[oldOffset];
+ if(unit==0xfffe && (fallback=ucm_findFallback(toUFallbacks, countToUFallbacks, oldOffset))>=0) {
+ /* bit 31 marks this fallback as already moved so a later lookup of an old offset cannot match it */
+ toUFallbacks[fallback].offset=0x80000000|offset;
+ }
+ break;
+ case MBCS_STATE_VALID_16_PAIR:
+ offset=trailOffset+MBCS_ENTRY_FINAL_VALUE_16(entry);
+ /* find the old offset according to the old state table */
+ oldOffset=oldTrailOffset+MBCS_ENTRY_FINAL_VALUE_16(oldStateTable[trailState][j]);
+ /* a pair occupies two consecutive code units */
+ (*pUnicodeCodeUnits)[offset++]=oldUnicodeCodeUnits[oldOffset++];
+ (*pUnicodeCodeUnits)[offset]=oldUnicodeCodeUnits[oldOffset];
+ break;
+ default:
+ break;
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ /* remove temporary flags from fallback offsets that protected them from being modified twice */
+ for(i=0; i<countToUFallbacks; ++i) {
+ toUFallbacks[i].offset&=0x7fffffff;
+ }
+
+ /* free temporary memory */
+ uprv_free(oldUnicodeCodeUnits);
+ uprv_free(oldStateTable);
+}
+
+/*
+ * Recursive worker for compactToUnicodeHelper().
+ * Examines every byte of the given state; transitions are followed recursively
+ * with the accumulated offset and byte prefix b.
+ *
+ * Returns:
+ *   >0  number of bytes used in unicodeCodeUnits[] that could be saved,
+ *       because all sequences from this state are unassigned
+ *   <0  there are assignments in unicodeCodeUnits[]
+ *    0  no use of unicodeCodeUnits[]
+ *
+ * Sub-states that are entirely unassigned are reported with printf() as they are found.
+ */
+static int32_t
+findUnassigned(UCMStates *states,
+               uint16_t *unicodeCodeUnits,
+               _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks,
+               int32_t state, int32_t offset, uint32_t b) {
+    int32_t ownBytes=0;     /* savings from final entries of this state */
+    int32_t nestedBytes=0;  /* savings reported by the states below */
+    UBool sawAssigned=false;
+
+    for(int32_t byte=0; byte<256; ++byte) {
+        int32_t cellEntry=states->stateTable[state][byte];
+
+        if(MBCS_ENTRY_IS_TRANSITION(cellEntry)) {
+            /* always descend, even after an assignment was seen, so that nested ranges still get reported */
+            int32_t nested=findUnassigned(states,
+                                          unicodeCodeUnits,
+                                          toUFallbacks, countToUFallbacks,
+                                          MBCS_ENTRY_TRANSITION_STATE(cellEntry),
+                                          offset+MBCS_ENTRY_TRANSITION_OFFSET(cellEntry),
+                                          (b<<8)|(uint32_t)byte);
+            if(nested<0) {
+                sawAssigned=true;
+            } else if(nested>0) {
+                printf(" all-unassigned sequences from prefix 0x%02lx state %ld use %ld bytes\n",
+                       (unsigned long)((b<<8)|byte), (long)state, (long)nested);
+                nestedBytes+=nested;
+            }
+            continue;
+        }
+
+        if(sawAssigned) {
+            /* the result is already known to be negative; final entries need no further inspection */
+            continue;
+        }
+
+        int32_t action=MBCS_ENTRY_FINAL_ACTION(cellEntry);
+        if(action==MBCS_STATE_VALID_16) {
+            int32_t unitIndex=offset+MBCS_ENTRY_FINAL_VALUE_16(cellEntry);
+            if(unicodeCodeUnits[unitIndex]==0xfffe &&
+               ucm_findFallback(toUFallbacks, countToUFallbacks, unitIndex)<0) {
+                ownBytes+=2;
+            } else {
+                sawAssigned=true;
+            }
+        } else if(action==MBCS_STATE_VALID_16_PAIR) {
+            int32_t unitIndex=offset+MBCS_ENTRY_FINAL_VALUE_16(cellEntry);
+            if(unicodeCodeUnits[unitIndex]==0xfffe) {
+                ownBytes+=4;
+            } else {
+                sawAssigned=true;
+            }
+        }
+    }
+
+    return sawAssigned ? -1 : ownBytes+nestedBytes;
+}
+
+/*
+ * Diagnostic helper for finding compaction opportunities:
+ * starting from every initial ("direct") state, reports how many bytes of
+ * unicodeCodeUnits[] are spent on byte sequences that are all unassigned.
+ */
+static void
+compactToUnicodeHelper(UCMStates *states,
+                       uint16_t *unicodeCodeUnits,
+                       _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks) {
+    for(int32_t initial=0; initial<states->countStates; ++initial) {
+        if((states->stateFlags[initial]&0xf)!=MBCS_STATE_FLAG_DIRECT) {
+            continue; /* not an initial state */
+        }
+
+        int32_t unusedBytes=findUnassigned(states,
+                                           unicodeCodeUnits,
+                                           toUFallbacks, countToUFallbacks,
+                                           initial, 0, 0);
+        if(unusedBytes>0) {
+            printf(" all-unassigned sequences from initial state %ld use %ld bytes\n",
+                   (long)initial, (long)unusedBytes);
+        }
+    }
+}
+
+U_CDECL_BEGIN
+/*
+ * Comparator for uprv_sortArray(): orders _MBCSToUFallback items by ascending offset.
+ * context is unused.
+ * Returns a negative value, 0, or a positive value if fb1's offset is
+ * less than, equal to, or greater than fb2's.
+ */
+static int32_t U_CALLCONV
+compareFallbacks(const void *context, const void *fb1, const void *fb2) {
+    (void)context;
+    const uint32_t left=((const _MBCSToUFallback *)fb1)->offset;
+    const uint32_t right=((const _MBCSToUFallback *)fb2)->offset;
+
+    /*
+     * Compare explicitly instead of returning left-right:
+     * the unsigned difference narrowed to int32_t has the wrong sign
+     * whenever the two offsets are 2^31 or more apart.
+     */
+    if(left<right) {
+        return -1;
+    }
+    return left>right ? 1 : 0;
+}
+U_CDECL_END
+
+/*
+ * Post-process a complete state table:
+ * 1. rewrite "valid direct 16" final entries that map to U+fffe as "unassigned" entries
+ * 2. try to compact the toUnicode code units (2-byte codepages), or, in verbose mode,
+ *    only report compaction opportunities (more than 2 bytes per character)
+ * 3. sort the toUnicode fallbacks by offset
+ */
+U_CAPI void U_EXPORT2
+ucm_optimizeStates(UCMStates *states,
+                   uint16_t **pUnicodeCodeUnits,
+                   _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks,
+                   UBool verbose) {
+    /* step 1: test each state table entry */
+    for(int32_t s=0; s<states->countStates; ++s) {
+        for(int32_t byte=0; byte<256; ++byte) {
+            const int32_t current=states->stateTable[s][byte];
+            /*
+             * A final entry with the MBCS_STATE_VALID_DIRECT_16 action code whose
+             * code point is "unassigned" (0xfffe) gets the "unassigned" action code
+             * instead, with bits 26..23 set to zero and U+fffe.
+             * The next-state field is masked out for the comparison.
+             */
+            if(MBCS_ENTRY_SET_STATE(current, 0)==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, 0xfffe)) {
+                states->stateTable[s][byte]=MBCS_ENTRY_FINAL_SET_ACTION(current, MBCS_STATE_UNASSIGNED);
+            }
+        }
+    }
+
+    /* step 2: try to compact the toUnicode tables */
+    if(states->maxCharLength==2) {
+        compactToUnicode2(states, pUnicodeCodeUnits, toUFallbacks, countToUFallbacks, verbose);
+    } else if(states->maxCharLength>2 && verbose) {
+        compactToUnicodeHelper(states, *pUnicodeCodeUnits, toUFallbacks, countToUFallbacks);
+    }
+
+    /*
+     * step 3: sort toUFallbacks
+     *
+     * It should be safe to sort them before compactToUnicode2() is called,
+     * because it should not change the relative order of the offset values
+     * that it adjusts, but they need to be sorted at some point, and
+     * it is safest here.
+     */
+    if(countToUFallbacks<=0) {
+        return;
+    }
+    UErrorCode errorCode=U_ZERO_ERROR; /* nothing bad will happen... */
+    uprv_sortArray(toUFallbacks, countToUFallbacks,
+                   sizeof(_MBCSToUFallback),
+                   compareFallbacks, nullptr, false, &errorCode);
+}
+
+/* use a complete state table ----------------------------------------------- */
+
+/*
+ * Count the characters that a byte sequence represents according to the state table.
+ *
+ * @param states state table to walk; must contain at least one state
+ * @param bytes  the byte sequence
+ * @param length number of bytes in the sequence
+ * @return the number of complete characters, or -1 (after printing a message to stderr) if
+ *         - there is no state information,
+ *         - a byte leads to an illegal, state-change-only or reserved entry,
+ *         - the sequence ends in the middle of a character, or
+ *         - an SI/SO multi-character result is not made up of double-byte characters only
+ */
+U_CAPI int32_t U_EXPORT2
+ucm_countChars(UCMStates *states,
+               const uint8_t *bytes, int32_t length) {
+    int32_t i, entry, count;
+    uint8_t state;
+    UBool inCharacter;
+
+    count=0;
+    state=0;
+    inCharacter=false;
+
+    if(states->countStates==0) {
+        fprintf(stderr, "ucm error: there is no state information!\n");
+        return -1;
+    }
+
+    /* for SI/SO (like EBCDIC-stateful), double-byte sequences start in state 1 */
+    if(length==2 && states->outputType==MBCS_OUTPUT_2_SISO) {
+        state=1;
+    }
+
+    /*
+     * Walk down the state table like in conversion,
+     * much like getNextUChar().
+     * We assume that c<=0x10ffff.
+     */
+    for(i=0; i<length; ++i) {
+        entry=states->stateTable[state][bytes[i]];
+        if(MBCS_ENTRY_IS_TRANSITION(entry)) {
+            state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
+            /*
+             * Remember that a character has been started.
+             * The accumulated transition offset cannot be used for this:
+             * it stays 0 when the transitions taken carry offset 0.
+             */
+            inCharacter=true;
+        } else {
+            switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
+            case MBCS_STATE_ILLEGAL:
+                fprintf(stderr, "ucm error: byte sequence ends in illegal state\n");
+                return -1;
+            case MBCS_STATE_CHANGE_ONLY:
+                fprintf(stderr, "ucm error: byte sequence ends in state-change-only\n");
+                return -1;
+            case MBCS_STATE_UNASSIGNED:
+            case MBCS_STATE_FALLBACK_DIRECT_16:
+            case MBCS_STATE_VALID_DIRECT_16:
+            case MBCS_STATE_FALLBACK_DIRECT_20:
+            case MBCS_STATE_VALID_DIRECT_20:
+            case MBCS_STATE_VALID_16:
+            case MBCS_STATE_VALID_16_PAIR:
+                /* count a complete character and prepare for a new one */
+                ++count;
+                state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry);
+                inCharacter=false;
+                break;
+            default:
+                /* reserved, must never occur */
+                fprintf(stderr, "ucm error: byte sequence reached reserved action code, entry: 0x%02lx\n", (unsigned long)entry);
+                return -1;
+            }
+        }
+    }
+
+    /* the last byte was a lead/intermediate byte: the final character is incomplete */
+    if(inCharacter) {
+        fprintf(stderr, "ucm error: byte sequence too short, ends in non-final state %u\n", (unsigned int)state);
+        return -1;
+    }
+
+    /*
+     * for SI/SO (like EBCDIC-stateful), multiple-character results
+     * must consist of only double-byte sequences
+     */
+    if(count>1 && states->outputType==MBCS_OUTPUT_2_SISO && length!=2*count) {
+        fprintf(stderr, "ucm error: SI/SO (like EBCDIC-stateful) result with %d characters does not contain all DBCS\n", (int)count);
+        return -1;
+    }
+
+    return count;
+}
+#endif
+
diff --git a/intl/icu/source/tools/toolutil/udbgutil.cpp b/intl/icu/source/tools/toolutil/udbgutil.cpp
new file mode 100644
index 0000000000..3f4bf3718e
--- /dev/null
+++ b/intl/icu/source/tools/toolutil/udbgutil.cpp
@@ -0,0 +1,769 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/********************************************************************
+ * COPYRIGHT:
+ * Copyright (c) 2007-2016, International Business Machines Corporation and
+ * others. All Rights Reserved.
+ ********************************************************************/
+
+#include "udbgutil.h"
+#include <string.h>
+#include "ustr_imp.h"
+#include "cmemory.h"
+#include "cstring.h"
+#include "putilimp.h"
+#include "unicode/ulocdata.h"
+#include "unicode/ucnv.h"
+#include "unicode/unistr.h"
+#include "cstr.h"
+
+/*
+To add a new enum type
+ (For example: UShoeSize with values USHOE_WIDE=0, USHOE_REGULAR, USHOE_NARROW, USHOE_COUNT)
+
+ 0. Make sure that all lines you add are protected with appropriate uconfig guards,
+ such as '#if !UCONFIG_NO_SHOES'.
+ 1. udbgutil.h: add UDBG_UShoeSize to the UDebugEnumType enum before UDBG_ENUM_COUNT
+ ( The subsequent steps involve this file, udbgutil.cpp )
+ 2. Find the marker "Add new enum types above this line"
+ 3. Before that marker, add a #include of any header file you need.
+ 4. Each enum type has three things in this section: a #define, a count_, and an array of Fields.
+ It may help to copy and paste a previous definition.
+ 5. In the case of the USHOE_... strings above, "USHOE_" is common to all values- six characters
+ " #define LEN_USHOE 6 "
+ 6 characters will strip off "USHOE_" leaving enum values of WIDE, REGULAR, and NARROW.
+ 6. Define the 'count_' variable, with the number of enum values. If the enum has a _MAX or _COUNT value,
+ that can be helpful for automatically defining the count. Otherwise define it manually.
+ " static const int32_t count_UShoeSize = USHOE_COUNT; "
+ 7. Define the field names, in order.
+ " static const Field names_UShoeSize[] = {
+ " FIELD_NAME_STR( LEN_USHOE, USHOE_WIDE ),
+ " FIELD_NAME_STR( LEN_USHOE, USHOE_REGULAR ),
+ " FIELD_NAME_STR( LEN_USHOE, USHOE_NARROW ),
+ " };
+ ( The following command was used for converting ucol.h into partially correct entities )
+ grep "^[ ]*UCOL" < unicode/ucol.h |
+ sed -e 's%^[ ]*\([A-Z]*\)_\([A-Z_]*\).*% FIELD_NAME_STR( LEN_\1, \1_\2 ),%g'
+ 8. Now, a bit farther down, add the name of the enum itself to the end of names_UDebugEnumType
+ ( UDebugEnumType is an enum, too!)
+ names_UDebugEnumType[] { ...
+ " FIELD_NAME_STR( LEN_UDBG, UDBG_UShoeSize ), "
+ 9. Find the function _udbg_enumCount and add the count macro:
+ " COUNT_CASE(UShoeSize) "
+ 10. Find the function _udbg_enumFields and add the field macro:
+ " FIELD_CASE(UShoeSize) "
+ 11. verify that your test code, and Java data generation, works properly.
+*/
+
+/**
+ * Structure representing an enum value: its full name as a string,
+ * the length of the prefix that the name shares with its siblings,
+ * and its numeric value.
+ * Entries are written with FIELD_NAME_STR, which relies on this member order.
+ */
+struct Field {
+ int32_t prefix; /**< how many characters to remove in the prefix - e.g. 5 for "UCAL_" */
+ const char *str; /**< The actual string value */
+ int32_t num; /**< The numeric value */
+};
+
+/**
+ * Define another field name. Used in an array of Field s
+ * Expands to a brace initializer { prefix length, stringified name, value },
+ * so the enum constant is written only once.
+ * @param y the common prefix length (i.e. 6 for "USHOE_" )
+ * @param x the actual enum value - it will be copied in both string and symbolic form.
+ * @see Field
+ */
+#define FIELD_NAME_STR(y,x) { y, #x, x }
+
+
+// TODO: Currently, this whole functionality goes away with UCONFIG_NO_FORMATTING. Should be split up.
+#if !UCONFIG_NO_FORMATTING
+
+// Calendar
+#include "unicode/ucal.h"
+
+// 'UCAL_' = 5
+// Name table for UCalendarDateFields.
+// count_ is the enum's own count constant; the array length is the number of names listed here.
+// The two are looked up separately by _udbg_enumCount().
+#define LEN_UCAL 5 /* UCAL_ */
+static const int32_t count_UCalendarDateFields = UCAL_FIELD_COUNT;
+static const Field names_UCalendarDateFields[] =
+{
+ FIELD_NAME_STR( LEN_UCAL, UCAL_ERA ),
+ FIELD_NAME_STR( LEN_UCAL, UCAL_YEAR ),
+ FIELD_NAME_STR( LEN_UCAL, UCAL_MONTH ),
+ FIELD_NAME_STR( LEN_UCAL, UCAL_WEEK_OF_YEAR ),
+ FIELD_NAME_STR( LEN_UCAL, UCAL_WEEK_OF_MONTH ),
+ FIELD_NAME_STR( LEN_UCAL, UCAL_DATE ),
+ FIELD_NAME_STR( LEN_UCAL, UCAL_DAY_OF_YEAR ),
+ FIELD_NAME_STR( LEN_UCAL, UCAL_DAY_OF_WEEK ),
+ FIELD_NAME_STR( LEN_UCAL, UCAL_DAY_OF_WEEK_IN_MONTH ),
+ FIELD_NAME_STR( LEN_UCAL, UCAL_AM_PM ),
+ FIELD_NAME_STR( LEN_UCAL, UCAL_HOUR ),
+ FIELD_NAME_STR( LEN_UCAL, UCAL_HOUR_OF_DAY ),
+ FIELD_NAME_STR( LEN_UCAL, UCAL_MINUTE ),
+ FIELD_NAME_STR( LEN_UCAL, UCAL_SECOND ),
+ FIELD_NAME_STR( LEN_UCAL, UCAL_MILLISECOND ),
+ FIELD_NAME_STR( LEN_UCAL, UCAL_ZONE_OFFSET ),
+ FIELD_NAME_STR( LEN_UCAL, UCAL_DST_OFFSET ),
+ FIELD_NAME_STR( LEN_UCAL, UCAL_YEAR_WOY ),
+ FIELD_NAME_STR( LEN_UCAL, UCAL_DOW_LOCAL ),
+ FIELD_NAME_STR( LEN_UCAL, UCAL_EXTENDED_YEAR ),
+ FIELD_NAME_STR( LEN_UCAL, UCAL_JULIAN_DAY ),
+ FIELD_NAME_STR( LEN_UCAL, UCAL_MILLISECONDS_IN_DAY ),
+ FIELD_NAME_STR( LEN_UCAL, UCAL_IS_LEAP_MONTH ),
+// the next entry is only compiled in when draft API is not hidden
+#ifndef U_HIDE_DRAFT_API
+ FIELD_NAME_STR( LEN_UCAL, UCAL_ORDINAL_MONTH ),
+#endif // U_HIDE_DRAFT_API
+};
+
+
+// Name table for UCalendarMonths; shares the "UCAL_" prefix length with the date fields.
+// There is no count constant used here, so the count is derived from the last value listed.
+static const int32_t count_UCalendarMonths = UCAL_UNDECIMBER+1;
+static const Field names_UCalendarMonths[] =
+{
+ FIELD_NAME_STR( LEN_UCAL, UCAL_JANUARY ),
+ FIELD_NAME_STR( LEN_UCAL, UCAL_FEBRUARY ),
+ FIELD_NAME_STR( LEN_UCAL, UCAL_MARCH ),
+ FIELD_NAME_STR( LEN_UCAL, UCAL_APRIL ),
+ FIELD_NAME_STR( LEN_UCAL, UCAL_MAY ),
+ FIELD_NAME_STR( LEN_UCAL, UCAL_JUNE ),
+ FIELD_NAME_STR( LEN_UCAL, UCAL_JULY ),
+ FIELD_NAME_STR( LEN_UCAL, UCAL_AUGUST ),
+ FIELD_NAME_STR( LEN_UCAL, UCAL_SEPTEMBER ),
+ FIELD_NAME_STR( LEN_UCAL, UCAL_OCTOBER ),
+ FIELD_NAME_STR( LEN_UCAL, UCAL_NOVEMBER ),
+ FIELD_NAME_STR( LEN_UCAL, UCAL_DECEMBER ),
+ FIELD_NAME_STR( LEN_UCAL, UCAL_UNDECIMBER)
+};
+
+#include "unicode/udat.h"
+
+// Name table for UDateFormatStyle.
+// Only the four regular styles are listed; the count is derived from UDAT_SHORT, the last one listed.
+#define LEN_UDAT 5 /* "UDAT_" */
+static const int32_t count_UDateFormatStyle = UDAT_SHORT+1;
+static const Field names_UDateFormatStyle[] =
+{
+ FIELD_NAME_STR( LEN_UDAT, UDAT_FULL ),
+ FIELD_NAME_STR( LEN_UDAT, UDAT_LONG ),
+ FIELD_NAME_STR( LEN_UDAT, UDAT_MEDIUM ),
+ FIELD_NAME_STR( LEN_UDAT, UDAT_SHORT ),
+ /* end regular */
+ /*
+ * negative enums.. leave out for now.
+ FIELD_NAME_STR( LEN_UDAT, UDAT_NONE ),
+ FIELD_NAME_STR( LEN_UDAT, UDAT_PATTERN ),
+ */
+};
+
+#endif
+
+#include "unicode/uloc.h"
+
+// Name table for UAcceptResult. The count is written out by hand
+// and has to be kept in step with the entries below.
+#define LEN_UAR 12 /* "ULOC_ACCEPT_" */
+static const int32_t count_UAcceptResult = 3;
+static const Field names_UAcceptResult[] =
+{
+ FIELD_NAME_STR( LEN_UAR, ULOC_ACCEPT_FAILED ),
+ FIELD_NAME_STR( LEN_UAR, ULOC_ACCEPT_VALID ),
+ FIELD_NAME_STR( LEN_UAR, ULOC_ACCEPT_FALLBACK ),
+};
+
+#if !UCONFIG_NO_COLLATION
+#include "unicode/ucol.h"
+// Name table for UColAttributeValue.
+// The "gap" markers and commented-out entries show that not every value below
+// UCOL_ATTRIBUTE_VALUE_COUNT has a name here, so an array index is not the enum value;
+// use the num member of an entry for that.
+// NOTE(review): presumably the array is therefore shorter than count_ - confirm against ucol.h.
+#define LEN_UCOL 5 /* UCOL_ */
+static const int32_t count_UColAttributeValue = UCOL_ATTRIBUTE_VALUE_COUNT;
+static const Field names_UColAttributeValue[] = {
+ FIELD_NAME_STR( LEN_UCOL, UCOL_PRIMARY ),
+ FIELD_NAME_STR( LEN_UCOL, UCOL_SECONDARY ),
+ FIELD_NAME_STR( LEN_UCOL, UCOL_TERTIARY ),
+// FIELD_NAME_STR( LEN_UCOL, UCOL_CE_STRENGTH_LIMIT ),
+ FIELD_NAME_STR( LEN_UCOL, UCOL_QUATERNARY ),
+ // gap
+ FIELD_NAME_STR( LEN_UCOL, UCOL_IDENTICAL ),
+// FIELD_NAME_STR( LEN_UCOL, UCOL_STRENGTH_LIMIT ),
+ FIELD_NAME_STR( LEN_UCOL, UCOL_OFF ),
+ FIELD_NAME_STR( LEN_UCOL, UCOL_ON ),
+ // gap
+ FIELD_NAME_STR( LEN_UCOL, UCOL_SHIFTED ),
+ FIELD_NAME_STR( LEN_UCOL, UCOL_NON_IGNORABLE ),
+ // gap
+ FIELD_NAME_STR( LEN_UCOL, UCOL_LOWER_FIRST ),
+ FIELD_NAME_STR( LEN_UCOL, UCOL_UPPER_FIRST ),
+};
+
+#endif
+
+
+#if UCONFIG_ENABLE_PLUGINS
+#include "unicode/icuplug.h"
+
+// Name table for UPlugReason (only compiled when plugins are enabled).
+#define LEN_UPLUG_REASON 13 /* UPLUG_REASON_ */
+static const int32_t count_UPlugReason = UPLUG_REASON_COUNT;
+static const Field names_UPlugReason[] = {
+ FIELD_NAME_STR( LEN_UPLUG_REASON, UPLUG_REASON_QUERY ),
+ FIELD_NAME_STR( LEN_UPLUG_REASON, UPLUG_REASON_LOAD ),
+ FIELD_NAME_STR( LEN_UPLUG_REASON, UPLUG_REASON_UNLOAD ),
+};
+
+// Name table for UPlugLevel (only compiled when plugins are enabled).
+#define LEN_UPLUG_LEVEL 12 /* UPLUG_LEVEL_ */
+static const int32_t count_UPlugLevel = UPLUG_LEVEL_COUNT;
+static const Field names_UPlugLevel[] = {
+ FIELD_NAME_STR( LEN_UPLUG_LEVEL, UPLUG_LEVEL_INVALID ),
+ FIELD_NAME_STR( LEN_UPLUG_LEVEL, UPLUG_LEVEL_UNKNOWN ),
+ FIELD_NAME_STR( LEN_UPLUG_LEVEL, UPLUG_LEVEL_LOW ),
+ FIELD_NAME_STR( LEN_UPLUG_LEVEL, UPLUG_LEVEL_HIGH ),
+};
+#endif
+
+// Name table for UDebugEnumType itself: one entry per enum type that this file describes.
+// Each entry is guarded by the same uconfig condition as the table it refers to.
+#define LEN_UDBG 5 /* "UDBG_" */
+static const int32_t count_UDebugEnumType = UDBG_ENUM_COUNT;
+static const Field names_UDebugEnumType[] =
+{
+ FIELD_NAME_STR( LEN_UDBG, UDBG_UDebugEnumType ),
+#if !UCONFIG_NO_FORMATTING
+ FIELD_NAME_STR( LEN_UDBG, UDBG_UCalendarDateFields ),
+ FIELD_NAME_STR( LEN_UDBG, UDBG_UCalendarMonths ),
+ FIELD_NAME_STR( LEN_UDBG, UDBG_UDateFormatStyle ),
+#endif
+#if UCONFIG_ENABLE_PLUGINS
+ FIELD_NAME_STR( LEN_UDBG, UDBG_UPlugReason ),
+ FIELD_NAME_STR( LEN_UDBG, UDBG_UPlugLevel ),
+#endif
+ FIELD_NAME_STR( LEN_UDBG, UDBG_UAcceptResult ),
+#if !UCONFIG_NO_COLLATION
+ FIELD_NAME_STR( LEN_UDBG, UDBG_UColAttributeValue ),
+#endif
+};
+
+
+// --- Add new enum types above this line ---
+
+// Switch-case generators for the two lookup functions below.
+// COUNT_CASE(x) expects a UBool named 'actual' in scope: true selects count_x,
+// false selects the number of entries in names_x.
+// The *_FAIL_CASE variants produce a case that reports "not available" (-1 / nullptr).
+#define COUNT_CASE(x) case UDBG_##x: return (actual?count_##x:UPRV_LENGTHOF(names_##x));
+#define COUNT_FAIL_CASE(x) case UDBG_##x: return -1;
+
+// FIELD_CASE(x) returns the names_x table for enum type x.
+#define FIELD_CASE(x) case UDBG_##x: return names_##x;
+#define FIELD_FAIL_CASE(x) case UDBG_##x: return nullptr;
+
+// low level
+
+/**
+ * Look up one of the two counts kept for an enum type.
+ * @param type type of item
+ * @param actual true: for the actual enum's type (UCAL_FIELD_COUNT, etc), or false for the string count
+ * @return the requested count, or -1 if the type has no table in this build
+ */
+static int32_t _udbg_enumCount(UDebugEnumType type, UBool actual) {
+ switch(type) {
+ COUNT_CASE(UDebugEnumType)
+#if !UCONFIG_NO_FORMATTING
+ COUNT_CASE(UCalendarDateFields)
+ COUNT_CASE(UCalendarMonths)
+ COUNT_CASE(UDateFormatStyle)
+#endif
+#if UCONFIG_ENABLE_PLUGINS
+ COUNT_CASE(UPlugReason)
+ COUNT_CASE(UPlugLevel)
+#endif
+ COUNT_CASE(UAcceptResult)
+#if !UCONFIG_NO_COLLATION
+ COUNT_CASE(UColAttributeValue)
+#endif
+ // COUNT_FAIL_CASE(UNonExistentEnum)
+ default:
+ return -1;
+ }
+}
+
+/**
+ * Look up the name table for an enum type.
+ * @param type type of item
+ * @return pointer to the first Field of the type's names_ array,
+ *         or nullptr if the type has no table in this build
+ */
+static const Field* _udbg_enumFields(UDebugEnumType type) {
+ switch(type) {
+ FIELD_CASE(UDebugEnumType)
+#if !UCONFIG_NO_FORMATTING
+ FIELD_CASE(UCalendarDateFields)
+ FIELD_CASE(UCalendarMonths)
+ FIELD_CASE(UDateFormatStyle)
+#endif
+#if UCONFIG_ENABLE_PLUGINS
+ FIELD_CASE(UPlugReason)
+ FIELD_CASE(UPlugLevel)
+#endif
+ FIELD_CASE(UAcceptResult)
+ // FIELD_FAIL_CASE(UNonExistentEnum)
+#if !UCONFIG_NO_COLLATION
+ FIELD_CASE(UColAttributeValue)
+#endif
+ default:
+ return nullptr;
+ }
+}
+
+// implementation
+
+/**
+ * @return the number of named entries in this file's table for the given enum type,
+ *         or -1 if the type has no table
+ */
+int32_t udbg_enumCount(UDebugEnumType type) {
+ return _udbg_enumCount(type, false);
+}
+
+/**
+ * @return the count taken from the enum's own definition (the count_ constant),
+ *         which can be compared with udbg_enumCount(); -1 if the type has no table
+ */
+int32_t udbg_enumExpectedCount(UDebugEnumType type) {
+ return _udbg_enumCount(type, true);
+}
+
+/**
+ * Get the name of the field-th table entry of an enum type, with the common prefix removed.
+ * @param type  the enum type
+ * @param field index into the type's name table (not the enum's numeric value)
+ * @return the short name, or nullptr if the index is out of range or the type is unsupported
+ */
+const char * udbg_enumName(UDebugEnumType type, int32_t field) {
+    // an unsupported type has a count of -1, so every index fails this range check
+    if(field<0 || field>=_udbg_enumCount(type,false)) {
+        return nullptr;
+    }
+
+    const Field *table = _udbg_enumFields(type);
+    if(table == nullptr) {
+        return nullptr;
+    }
+
+    const Field &entry = table[field];
+    return entry.str + entry.prefix;
+}
+
+/**
+ * Get the numeric enum value stored in the field-th table entry of an enum type.
+ * @param type  the enum type
+ * @param field index into the type's name table
+ * @return the entry's numeric value, or -1 if the index is out of range or the type is unsupported
+ */
+int32_t udbg_enumArrayValue(UDebugEnumType type, int32_t field) {
+    // an unsupported type has a count of -1, so every index fails this range check
+    if(field<0 || field>=_udbg_enumCount(type,false)) {
+        return -1;
+    }
+
+    const Field *table = _udbg_enumFields(type);
+    if(table == nullptr) {
+        return -1;
+    }
+
+    return table[field].num;
+}
+
+/**
+ * Find the numeric value of an enum constant by its name.
+ * The name is first compared against the short names (common prefix removed),
+ * then against the full names including the prefix.
+ * @param type  the enum type to search
+ * @param value the name to look for
+ * @return the numeric value of the first matching entry, or -1 if value is nullptr,
+ *         the type is out of range or unsupported, or no entry matches
+ */
+int32_t udbg_enumByName(UDebugEnumType type, const char *value) {
+    if(value==nullptr) {
+        return -1; // nothing to compare against; strcmp() must not see a null pointer
+    }
+    if(type<0||type>=_udbg_enumCount(UDBG_UDebugEnumType, true)) {
+        return -1; // type out of range
+    }
+    const Field *fields = _udbg_enumFields(type);
+    if(fields == nullptr) {
+        return -1; // no table for this type
+    }
+
+    // the table size does not change while searching
+    const int32_t count = _udbg_enumCount(type, false);
+
+    // try without the prefix
+    for(int32_t field = 0; field<count; field++) {
+        if(!strcmp(value, fields[field].str + fields[field].prefix)) {
+            return fields[field].num;
+        }
+    }
+    // try with the prefix
+    for(int32_t field = 0; field<count; field++) {
+        if(!strcmp(value, fields[field].str)) {
+            return fields[field].num;
+        }
+    }
+    // fail
+    return -1;
+}
+
+/* platform info */
+/**
+ * Return a short name for the platform this code was compiled for.
+ * The choice is made at compile time from the U_PLATFORM macros;
+ * the first matching branch wins.
+ * @return a string literal; the caller must not modify or free it
+ */
+U_CAPI const char *udbg_getPlatform()
+{
+#if U_PLATFORM_USES_ONLY_WIN32_API
+ return "Windows";
+#elif U_PLATFORM == U_PF_CYGWIN
+ return "Cygwin";
+#elif U_PLATFORM == U_PF_UNKNOWN
+ return "unknown";
+#elif U_PLATFORM == U_PF_DARWIN
+ return "Darwin";
+#elif U_PLATFORM == U_PF_BSD
+ return "BSD";
+#elif U_PLATFORM == U_PF_QNX
+ return "QNX";
+#elif U_PLATFORM == U_PF_LINUX
+ return "Linux";
+#elif U_PLATFORM == U_PF_ANDROID
+ return "Android";
+#elif U_PLATFORM == U_PF_CLASSIC_MACOS
+ return "MacOS (Classic)";
+#elif U_PLATFORM == U_PF_OS390
+ return "IBM z";
+#elif U_PLATFORM == U_PF_OS400
+ return "IBM i";
+#else
+ /* any platform without a branch of its own */
+ return "Other (POSIX-like)";
+#endif
+}
+
+struct USystemParams;
+
+/**
+ * Signature of the functions that produce a parameter's value as a string.
+ * The value is written to target (of targetCapacity chars); the functions in this file
+ * return the value's length and report problems through status.
+ */
+typedef int32_t U_CALLCONV USystemParameterCallback(const USystemParams *param, char *target, int32_t targetCapacity, UErrorCode *status);
+
+/**
+ * One row of the system parameter table.
+ */
+struct USystemParams {
+ const char *paramName; /* name of the parameter, e.g. "copyright" */
+ USystemParameterCallback *paramFunction; /* function that produces the value */
+ const char *paramStr; /* string argument: the value itself for paramStatic, the radix letter (d, x, o, b) for paramInteger */
+ int32_t paramInt; /* integer argument: the value for paramInteger */
+};
+
+/* parameter types */
+/* Parameter function that always yields the empty string (length 0). */
+U_CAPI int32_t
+paramEmpty(const USystemParams * /* param */, char *target, int32_t targetCapacity, UErrorCode *status) {
+    if(U_SUCCESS(*status)) {
+        return u_terminateChars(target, targetCapacity, 0, status);
+    }
+    return 0;
+}
+
+/*
+ * Parameter function that copies the row's fixed string (param->paramStr) to target.
+ * A null paramStr is treated like the empty string.
+ * Returns the full length of the string, even if it did not fit;
+ * returns 0 without touching target if status already indicates a failure.
+ */
+U_CAPI int32_t
+paramStatic(const USystemParams *param, char *target, int32_t targetCapacity, UErrorCode *status) {
+    if(param->paramStr==nullptr) return paramEmpty(param,target,targetCapacity,status);
+    if(U_FAILURE(*status))return 0;
+    int32_t len = static_cast<int32_t>(uprv_strlen(param->paramStr));
+    /*
+     * Copy only into a real buffer: a negative capacity would otherwise be
+     * converted to a huge unsigned byte count for the copy.
+     * The terminate call below still sees the original capacity and length.
+     */
+    if(target!=nullptr && targetCapacity>0) {
+        uprv_strncpy(target,param->paramStr,uprv_min(len,targetCapacity));
+    }
+    return u_terminateChars(target, targetCapacity, len, status);
+}
+
+static const char *nullString = "(null)";
+
+/*
+ * Copy a C string into the caller's buffer.
+ * - a null str is replaced by "(null)"
+ * - if status already indicates a failure, the name of that error code is copied instead
+ * At most targetCapacity chars are written; the return value comes from u_terminateChars(),
+ * which is given the full length of the string that was (to be) copied.
+ */
+static int32_t stringToStringBuffer(char *target, int32_t targetCapacity, const char *str, UErrorCode *status) {
+    if(str==nullptr) str=nullString;
+
+    // on failure, report the error's name rather than the requested string
+    const char *text = U_SUCCESS(*status) ? str : u_errorName(*status);
+    int32_t len = static_cast<int32_t>(uprv_strlen(text));
+
+    // copy only into a real buffer: a negative capacity would otherwise be
+    // converted to a huge unsigned byte count for the copy
+    if(target!=nullptr && targetCapacity>0) {
+        uprv_strncpy(target,text,uprv_min(len,targetCapacity));
+    }
+    return u_terminateChars(target, targetCapacity, len, status);
+}
+
+/*
+ * Format n in the given radix and hand the digits to stringToStringBuffer().
+ * Returns 0 without formatting anything if status already indicates a failure.
+ */
+static int32_t integerToStringBuffer(char *target, int32_t targetCapacity, int32_t n, int32_t radix, UErrorCode *status) {
+    if(U_SUCCESS(*status)) {
+        char digits[300];
+        T_CString_integerToString(digits,n,radix);
+        return stringToStringBuffer(target,targetCapacity,digits,status);
+    }
+    return 0;
+}
+
+/*
+ * Parameter function that formats the row's integer (param->paramInt).
+ * The first character of param->paramStr selects the radix:
+ * 'd' (or a null paramStr) decimal, 'x' hexadecimal, 'o' octal, 'b' binary.
+ * Any other character sets U_INTERNAL_PROGRAM_ERROR and returns 0.
+ */
+U_CAPI int32_t
+paramInteger(const USystemParams *param, char *target, int32_t targetCapacity, UErrorCode *status) {
+    if(U_FAILURE(*status)) {
+        return 0;
+    }
+
+    int32_t radix;
+    switch(param->paramStr==nullptr ? 'd' : param->paramStr[0]) {
+    case 'd':
+        radix=10;
+        break;
+    case 'x':
+        radix=16;
+        break;
+    case 'o':
+        radix=8;
+        break;
+    case 'b':
+        radix=2;
+        break;
+    default:
+        *status = U_INTERNAL_PROGRAM_ERROR;
+        return 0;
+    }
+    return integerToStringBuffer(target,targetCapacity,param->paramInt, radix,status);
+}
+
+
+/*
+ * Parameter function that yields the CLDR version, as reported by
+ * ulocdata_getCLDRVersion(), in string form.
+ * Returns 0 if status indicates a failure before or after that call.
+ */
+U_CAPI int32_t
+paramCldrVersion(const USystemParams * /* param */, char *target, int32_t targetCapacity, UErrorCode *status) {
+    if(U_FAILURE(*status)) {
+        return 0;
+    }
+
+    UVersionInfo cldrVersion;
+    ulocdata_getCLDRVersion(cldrVersion, status);
+    if(U_FAILURE(*status)) {
+        return 0;
+    }
+
+    char text[200]="";
+    u_versionToString(cldrVersion, text);
+    return stringToStringBuffer(target,targetCapacity,text,status);
+}
+
+
+#if !UCONFIG_NO_FORMATTING
+/**
+ * Parameter function: the ID returned by ucal_getDefaultTimeZone(), converted
+ * to char with u_UCharsToChars().
+ * Returns 0 (leaving target untouched) if status is a failure or the ID is empty.
+ * Sets U_BUFFER_OVERFLOW_ERROR and returns 0 if the ID does not fit, together
+ * with its terminator, into the local 100-unit buffers.
+ */
+U_CAPI int32_t
+paramTimezoneDefault(const USystemParams * /* param */, char *target, int32_t targetCapacity, UErrorCode *status) {
+    if(U_FAILURE(*status))return 0;
+    char16_t buf[100];
+    char buf2[100];
+    const int32_t capacity = static_cast<int32_t>(UPRV_LENGTHOF(buf));
+
+    const int32_t len = ucal_getDefaultTimeZone(buf, capacity, status);
+    if(U_FAILURE(*status) || len<=0) {
+        return 0;
+    }
+    if(len>=capacity) {
+        /*
+         * The ID filled the whole buffer, so there is no terminator (the call
+         * reports that as a warning, which still counts as success).
+         * Converting len+1 units below would run past the end of buf and buf2.
+         */
+        *status = U_BUFFER_OVERFLOW_ERROR;
+        return 0;
+    }
+    /* len+1 so that the terminating NUL is converted as well */
+    u_UCharsToChars(buf, buf2, len+1);
+    return stringToStringBuffer(target,targetCapacity, buf2,status);
+}
+#endif
+
+/**
+ * Parameter function: the default locale (uloc_getDefault()) converted with
+ * uloc_toLanguageTag(), non-strict. Returns 0 if status is already a failure.
+ */
+U_CAPI int32_t
+paramLocaleDefaultBcp47(const USystemParams * /* param */, char *target, int32_t targetCapacity, UErrorCode *status) {
+    if(U_SUCCESS(*status)) {
+        return uloc_toLanguageTag(uloc_getDefault(),target,targetCapacity,false,status);
+    }
+    return 0;
+}
+
+
+/*
+ * simple 1-liner param functions
+ *
+ * STRING_PARAM(func, str) defines a parameter function named func that
+ * evaluates the expression str and passes the result to stringToStringBuffer().
+ * The expression is evaluated inside the generated function, so it may refer
+ * to that function's arguments (paramTimezoneVersion uses `status`).
+ */
+#define STRING_PARAM(func, str) U_CAPI int32_t \
+ func(const USystemParams *, char *target, int32_t targetCapacity, UErrorCode *status) \
+ { return stringToStringBuffer(target,targetCapacity,(str),status); }
+
+STRING_PARAM(paramIcudataPath, u_getDataDirectory())
+STRING_PARAM(paramPlatform, udbg_getPlatform())
+STRING_PARAM(paramLocaleDefault, uloc_getDefault())
+#if !UCONFIG_NO_CONVERSION
+STRING_PARAM(paramConverterDefault, ucnv_getDefaultName())
+#endif
+
+#if !UCONFIG_NO_FORMATTING
+STRING_PARAM(paramTimezoneVersion, ucal_getTZDataVersion(status))
+#endif
+
+/*
+ * Table of all system parameters, in the order in which they are enumerated
+ * by udbg_getSystemParameterNameByIndex() / udbg_getSystemParameterValueByIndex().
+ * Each entry is { name, function producing the value, string argument, integer argument }.
+ * For paramInteger entries the string argument is the radix selector
+ * ("d", "x", "o" or "b") and the integer argument is the value itself.
+ * Entries inside #if blocks only exist in builds with the corresponding feature/macro.
+ */
+static const USystemParams systemParams[] = {
+ { "copyright", paramStatic, U_COPYRIGHT_STRING,0 },
+ { "product", paramStatic, "icu4c",0 },
+ { "product.full", paramStatic, "International Components for Unicode for C/C++",0 },
+ { "version", paramStatic, U_ICU_VERSION,0 },
+ { "version.unicode", paramStatic, U_UNICODE_VERSION,0 },
+ { "platform.number", paramInteger, "d",U_PLATFORM},
+ { "platform.type", paramPlatform, nullptr ,0},
+ { "locale.default", paramLocaleDefault, nullptr, 0},
+ { "locale.default.bcp47", paramLocaleDefaultBcp47, nullptr, 0},
+#if !UCONFIG_NO_CONVERSION
+ { "converter.default", paramConverterDefault, nullptr, 0},
+#endif
+ { "icudata.name", paramStatic, U_ICUDATA_NAME, 0},
+ { "icudata.path", paramIcudataPath, nullptr, 0},
+
+ { "cldr.version", paramCldrVersion, nullptr, 0},
+
+#if !UCONFIG_NO_FORMATTING
+ { "tz.version", paramTimezoneVersion, nullptr, 0},
+ { "tz.default", paramTimezoneDefault, nullptr, 0},
+#endif
+
+ { "cpu.bits", paramInteger, "d", (sizeof(void*))*8},
+ { "cpu.big_endian", paramInteger, "b", U_IS_BIG_ENDIAN},
+ { "os.wchar_width", paramInteger, "d", U_SIZEOF_WCHAR_T},
+ { "os.charset_family", paramInteger, "d", U_CHARSET_FAMILY},
+#if defined (U_HOST)
+ { "os.host", paramStatic, U_HOST, 0},
+#endif
+#if defined (U_BUILD)
+ { "build.build", paramStatic, U_BUILD, 0},
+#endif
+#if defined (U_CC)
+ { "build.cc", paramStatic, U_CC, 0},
+#endif
+#if defined (U_CXX)
+ { "build.cxx", paramStatic, U_CXX, 0},
+#endif
+#if defined (CYGWINMSVC)
+ { "build.cygwinmsvc", paramInteger, "b", 1},
+#endif
+ { "uconfig.internal_digitlist", paramInteger, "b", 1}, /* always 1 */
+ { "uconfig.have_parseallinput", paramInteger, "b", UCONFIG_HAVE_PARSEALLINPUT},
+
+
+};
+
+#define U_SYSPARAM_COUNT UPRV_LENGTHOF(systemParams)
+
+/* Name of the i-th entry of systemParams, or nullptr when i is outside the table. */
+U_CAPI const char *udbg_getSystemParameterNameByIndex(int32_t i) {
+    const int32_t count = (int32_t)U_SYSPARAM_COUNT;
+    return (0<=i && i<count) ? systemParams[i].paramName : nullptr;
+}
+
+
+/*
+ * Value of the i-th entry of systemParams: calls the entry's parameter function
+ * with the caller's buffer. Returns 0 (buffer and status untouched) when i is
+ * outside the table.
+ */
+U_CAPI int32_t udbg_getSystemParameterValueByIndex(int32_t i, char *buffer, int32_t bufferCapacity, UErrorCode *status) {
+    if(i<0 || i>=(int32_t)U_SYSPARAM_COUNT) {
+        return 0;
+    }
+    const USystemParams *entry = &(systemParams[i]);
+    return entry->paramFunction(entry,buffer,bufferCapacity,status);
+}
+
+/**
+ * Write all system parameters to out as an <icuSystemParams> XML fragment:
+ * one <param> element per parameter, or an XML comment carrying the error name
+ * when a parameter's value could not be obtained.
+ * Values are written as-is (no XML escaping).
+ */
+U_CAPI void udbg_writeIcuInfo(FILE *out) {
+    char str[2000];
+    /* leave room for a terminator that is always ours to set */
+    const int32_t capacity = (int32_t)sizeof(str)-1;
+    /* todo: API for writing DTD? */
+    fprintf(out, " <icuSystemParams type=\"icu4c\">\n");
+    const char *paramName;
+    for(int32_t i=0;(paramName=udbg_getSystemParameterNameByIndex(i))!=nullptr;i++) {
+        UErrorCode status2 = U_ZERO_ERROR;
+        /*
+         * A parameter function may succeed without writing anything
+         * (e.g. an empty value): start from an empty string so that no
+         * stale or uninitialized content is printed.
+         */
+        str[0] = 0;
+        udbg_getSystemParameterValueByIndex(i, str,capacity,&status2);
+        /*
+         * A value of exactly `capacity` chars is reported as a warning
+         * (still a success) and is not NUL-terminated by the callee.
+         */
+        str[capacity] = 0;
+        if(U_SUCCESS(status2)) {
+            fprintf(out," <param name=\"%s\">%s</param>\n", paramName,str);
+        } else {
+            fprintf(out," <!-- n=\"%s\" ERROR: %s -->\n", paramName, u_errorName(status2));
+        }
+    }
+    fprintf(out, " </icuSystemParams>\n");
+}
+
+#define UNICODE_BUG_URL "https://unicode-org.atlassian.net/browse/"
+#define OLD_CLDR_PREFIX "cldrbug:"
+#define CLDR_BUG_PREFIX "CLDR-"
+#define ICU_BUG_PREFIX "ICU-"
+
+
+
+#include <set>
+#include <map>
+#include <string>
+#include <ostream>
+#include <iostream>
+
+/**
+ * Collects "known issue" reports, grouped by ticket and then by the place
+ * ("where") that reported them, for printing as one summary.
+ */
+class KnownIssues {
+public:
+ KnownIssues();
+ ~KnownIssues();
+ // Record msg under ticket/where. The ticket id is normalized with mapTicketId().
+ // firstForTicket / firstForWhere (each optional) report whether the ticket,
+ // respectively the ticket/where pair, was new to the table.
+ void add(const char *ticket, const char *where, const char16_t *msg, UBool *firstForTicket, UBool *firstForWhere);
+ void add(const char *ticket, const char *where, const char *msg, UBool *firstForTicket, UBool *firstForWhere);
+ // Print the whole table to std::cout; returns false (printing nothing) if the table is empty.
+ UBool print();
+private:
+ // ticket id -> where -> set of distinct messages
+ std::map< std::string,
+ std::map < std::string, std::set < std::string > > > fTable;
+};
+
+/* A new table contains no tickets. */
+KnownIssues::KnownIssues() : fTable() {}
+
+/* Nothing to release beyond what fTable's own destructor handles. */
+KnownIssues::~KnownIssues() {}
+
+/**
+ * Normalize a ticket id to the current ICU-1234 / CLDR-1234 form.
+ * Map cldrbug:1234 to CLDR-1234
+ * Map 1234 to ICU-1234
+ * Any other id is returned unchanged.
+ *
+ * @param ticketStr ticket id as passed by the caller; must not be nullptr
+ * @return the normalized ticket id
+ */
+static std::string mapTicketId(const char *ticketStr) {
+    std::string ticket(ticketStr);
+    // TODO: Can remove this function once all logKnownIssue calls are switched over
+    // to the ICU-1234 and CLDR-1234 format.
+    const size_t oldPrefixLength = uprv_strlen(OLD_CLDR_PREFIX);
+    // compare() anchored at offset 0 is a real "starts with" test; rfind(prefix)==0
+    // scans from the end and misses the prefix when the same text occurs again later.
+    if(ticket.compare(0, oldPrefixLength, OLD_CLDR_PREFIX) == 0) {
+        // map cldrbug:1234 to CLDR-1234
+        ticket.replace(0, oldPrefixLength, CLDR_BUG_PREFIX);
+    } else if(::isdigit(static_cast<unsigned char>(ticket[0]))) {
+        // map 1234 to ICU-1234
+        // (the cast keeps isdigit() defined for chars with the high bit set)
+        ticket.insert(0, ICU_BUG_PREFIX);
+    }
+    return ticket;
+}
+
+/**
+ * Record a known issue whose message is a UTF-16 string.
+ * The ticket entry is created if needed; a null where stops after that step,
+ * and a null or empty msg stops after the where entry has been created.
+ * firstForTicket / firstForWhere, when not null, receive whether the ticket,
+ * respectively the where entry under it, did not exist before this call.
+ */
+void KnownIssues::add(const char *ticketStr, const char *where, const char16_t *msg, UBool *firstForTicket, UBool *firstForWhere)
+{
+    const std::string ticket = mapTicketId(ticketStr);
+
+    const bool isNewTicket = (fTable.count(ticket) == 0);
+    if(firstForTicket!=nullptr) {
+        *firstForTicket = isNewTicket;
+    }
+    // operator[] creates the (empty) entry for a new ticket
+    std::map < std::string, std::set < std::string > > &places = fTable[ticket];
+    if(where==nullptr) {
+        return;
+    }
+
+    const std::string place(where);
+    const bool isNewPlace = (places.count(place) == 0);
+    if(firstForWhere!=nullptr) {
+        *firstForWhere = isNewPlace;
+    }
+    std::set < std::string > &messages = places[place];
+    if(msg==nullptr || !*msg) {
+        return;
+    }
+
+    const icu::UnicodeString ustr(msg);
+    messages.insert(std::string(icu::CStr(ustr)()));
+}
+
+/**
+ * Record a known issue whose message is a char string.
+ * The ticket entry is created if needed; a null where stops after that step,
+ * and a null or empty msg stops after the where entry has been created.
+ * firstForTicket / firstForWhere, when not null, receive whether the ticket,
+ * respectively the where entry under it, did not exist before this call.
+ */
+void KnownIssues::add(const char *ticketStr, const char *where, const char *msg, UBool *firstForTicket, UBool *firstForWhere)
+{
+    const std::string ticket = mapTicketId(ticketStr);
+
+    const bool isNewTicket = (fTable.count(ticket) == 0);
+    if(firstForTicket!=nullptr) {
+        *firstForTicket = isNewTicket;
+    }
+    // operator[] creates the (empty) entry for a new ticket
+    std::map < std::string, std::set < std::string > > &places = fTable[ticket];
+    if(where==nullptr) {
+        return;
+    }
+
+    const std::string place(where);
+    const bool isNewPlace = (places.count(place) == 0);
+    if(firstForWhere!=nullptr) {
+        *firstForWhere = isNewPlace;
+    }
+    std::set < std::string > &messages = places[place];
+    if(msg==nullptr || !*msg) {
+        return;
+    }
+
+    messages.insert(std::string(msg));
+}
+
+/**
+ * Print the table to std::cout: a "KNOWN ISSUES" heading, then for each ticket
+ * its id (followed by a URL for ICU-/CLDR- tickets), each place that reported
+ * it, and the quoted messages recorded for that place.
+ * @return false, printing nothing, if the table is empty; true otherwise
+ */
+UBool KnownIssues::print()
+{
+    if(fTable.empty()) {
+        return false;
+    }
+
+    std::cout << "KNOWN ISSUES" << std::endl;
+    for(const auto &ticketEntry : fTable) {
+        const std::string &ticketid = ticketEntry.first;
+        std::cout << "[" << ticketid << "] ";
+        const bool isUnicodeOrgBug =
+            ticketid.rfind(ICU_BUG_PREFIX) == 0 || ticketid.rfind(CLDR_BUG_PREFIX) == 0;
+        if(isUnicodeOrgBug) {
+            std::cout << UNICODE_BUG_URL << ticketid;
+        } // Else: some other kind of bug. Allow this, but without a URL.
+        std::cout << std::endl;
+
+        for(const auto &whereEntry : ticketEntry.second) {
+            std::cout << " " << whereEntry.first << std::endl;
+            for(const std::string &message : whereEntry.second) {
+                std::cout << " " << '"' << message << '"' << std::endl;
+            }
+        }
+    }
+    return true;
+}
+
+/*
+ * C entry point for KnownIssues::add() with a UTF-16 message.
+ * A null ptr creates a new table; the (new or existing) table is returned.
+ */
+U_CAPI void *udbg_knownIssue_openU(void *ptr, const char *ticket, char *where, const char16_t *msg, UBool *firstForTicket,
+                                   UBool *firstForWhere) {
+    KnownIssues *table = (ptr!=nullptr) ? static_cast<KnownIssues*>(ptr) : new KnownIssues();
+    table->add(ticket, where, msg, firstForTicket, firstForWhere);
+    return static_cast<void*>(table);
+}
+
+/*
+ * C entry point for KnownIssues::add() with a char message.
+ * A null ptr creates a new table; the (new or existing) table is returned.
+ */
+U_CAPI void *udbg_knownIssue_open(void *ptr, const char *ticket, char *where, const char *msg, UBool *firstForTicket,
+                                  UBool *firstForWhere) {
+    KnownIssues *table = (ptr!=nullptr) ? static_cast<KnownIssues*>(ptr) : new KnownIssues();
+    table->add(ticket, where, msg, firstForTicket, firstForWhere);
+    return static_cast<void*>(table);
+}
+
+/*
+ * Print the known-issue table behind ptr to std::cout.
+ * Returns true if there were any issues to print; false for a null ptr
+ * or an empty table (KnownIssues::print() reports the latter).
+ */
+U_CAPI UBool udbg_knownIssue_print(void *ptr) {
+    KnownIssues *t = static_cast<KnownIssues*>(ptr);
+    if(t==nullptr) {
+        return false;
+    }
+    return t->print();
+}
+
+/* Destroy the known-issue table behind ptr; deleting a null pointer is a no-op. */
+U_CAPI void udbg_knownIssue_close(void *ptr) {
+    delete static_cast<KnownIssues*>(ptr);
+}
diff --git a/intl/icu/source/tools/toolutil/udbgutil.h b/intl/icu/source/tools/toolutil/udbgutil.h
new file mode 100644
index 0000000000..e3ed513839
--- /dev/null
+++ b/intl/icu/source/tools/toolutil/udbgutil.h
@@ -0,0 +1,147 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/*
+************************************************************************
+* Copyright (c) 2008-2015, International Business Machines
+* Corporation and others. All Rights Reserved.
+************************************************************************
+*/
+
+/** C Utilities to aid in debugging **/
+
+#ifndef _UDBGUTIL_H
+#define _UDBGUTIL_H
+
+#include "unicode/utypes.h"
+#include <stdio.h>
+
+/**
+ * Selects which enum the udbg_enum* functions below operate on.
+ * Which enumerators exist depends on the UCONFIG_* settings of the build.
+ */
+enum UDebugEnumType {
+ UDBG_UDebugEnumType = 0, /* Self-referential, strings for UDebugEnumType. Count=ENUM_COUNT. */
+#if !UCONFIG_NO_FORMATTING
+ UDBG_UCalendarDateFields, /* UCalendarDateFields. Count=UCAL_FIELD_COUNT. Unsupported if UCONFIG_NO_FORMATTING. */
+ UDBG_UCalendarMonths, /* UCalendarMonths. Count= (UCAL_UNDECIMBER+1) */
+ UDBG_UDateFormatStyle, /* Count = UDAT_SHORT=1 */
+#endif
+#if UCONFIG_ENABLE_PLUGINS
+ UDBG_UPlugReason, /* Count = UPLUG_REASON_COUNT */
+ UDBG_UPlugLevel, /* COUNT = UPLUG_LEVEL_COUNT */
+#endif
+ UDBG_UAcceptResult, /* Count = ULOC_ACCEPT_FALLBACK+1=3 */
+
+ /* All following enums may be discontiguous. */
+
+#if !UCONFIG_NO_COLLATION
+ UDBG_UColAttributeValue, /* UCOL_ATTRIBUTE_VALUE_COUNT */
+#endif
+ UDBG_ENUM_COUNT, /* number of enum types listed above (not itself a type) */
+ UDBG_HIGHEST_CONTIGUOUS_ENUM = UDBG_UAcceptResult, /**< last enum in this list with contiguous (testable) values. */
+ UDBG_INVALID_ENUM = -1 /** Invalid enum value **/
+};
+
+typedef enum UDebugEnumType UDebugEnumType;
+
+/**
+ * Return how many enum values are contained for this type.
+ * Should be equal to the appropriate _COUNT constant or there is an error.
+ * @param type the type of enum
+ * @return the count, or -1 if unsupported.
+ */
+U_CAPI int32_t U_EXPORT2 udbg_enumCount(UDebugEnumType type);
+
+/**
+ * Convert an enum to a string
+ * @param type type of enum
+ * @param field field number
+ * @return string of the format "ERA", "YEAR", etc, or NULL if out of range or unsupported
+ */
+U_CAPI const char * U_EXPORT2 udbg_enumName(UDebugEnumType type, int32_t field);
+
+/**
+ * For consistency checking: return how many enum values should be contained for this type.
+ * This is equal to the appropriate _COUNT constant or there is an error.
+ * @param type the type of enum
+ * @return the expected count, or -1 if unsupported.
+ */
+U_CAPI int32_t U_EXPORT2 udbg_enumExpectedCount(UDebugEnumType type);
+
+/**
+ * For consistency checking, returns the expected enum ordinal value for the given index value.
+ * @param type which type
+ * @param field field number
+ * @return should be equal to 'field' or -1 if out of range.
+ */
+U_CAPI int32_t U_EXPORT2 udbg_enumArrayValue(UDebugEnumType type, int32_t field);
+
+/**
+ * Locate the specified field value by name.
+ * @param type which type
+ * @param name name of string (case sensitive)
+ * @return should be a field value or -1 if not found.
+ */
+U_CAPI int32_t U_EXPORT2 udbg_enumByName(UDebugEnumType type, const char *name);
+
+
+/**
+ * Return the Platform (U_PLATFORM) as a string
+ */
+U_CAPI const char *udbg_getPlatform(void);
+
+/**
+ * Get the nth system parameter's name
+ * @param i index of name, starting from zero
+ * @return name, or NULL if off the end
+ * @see udbg_getSystemParameterValueByIndex
+ */
+U_CAPI const char *udbg_getSystemParameterNameByIndex(int32_t i);
+
+/**
+ * Get the nth system parameter's value, in a user supplied buffer
+ * @param i index of value, starting from zero
+ * @param buffer user supplied buffer that receives the value
+ * @param bufferCapacity capacity of buffer
+ * @param status error status
+ * @return length written (standard termination rules), or 0 if i is off the end
+ * @see udbg_getSystemParameterNameByIndex
+ */
+U_CAPI int32_t udbg_getSystemParameterValueByIndex(int32_t i, char *buffer, int32_t bufferCapacity, UErrorCode *status);
+
+/**
+ * Write ICU info as XML
+ */
+U_CAPI void udbg_writeIcuInfo(FILE *f);
+
+/**
+ * \def UDBG_KNOWNISSUE_LEN
+ * Length of output buffer for udbg_knownIssueURLFrom
+ */
+#define UDBG_KNOWNISSUE_LEN 255
+
+/**
+ * Open (or reopen) a 'known issue' table, and record one issue in it.
+ * @param ptr pointer to 'table'. Opaque. If NULL, a new table is created.
+ * @param ticket ticket id. "cldrbug:1234" is recorded as "CLDR-1234", and an id
+ *               starting with a digit gets an "ICU-" prefix.
+ * @param where place reporting the issue. If NULL, only the ticket is recorded.
+ * @param msg UTF-16 message to record for ticket/where. Ignored if NULL or empty.
+ * @param firstForTicket if not NULL, receives whether the ticket was new to the table.
+ * @param firstForWhere if not NULL (and where is not NULL), receives whether
+ *                      'where' was new for this ticket.
+ * @return new or existing ptr
+ */
+U_CAPI void *udbg_knownIssue_openU(void *ptr, const char *ticket, char *where, const UChar *msg, UBool *firstForTicket,
+ UBool *firstForWhere);
+
+
+/**
+ * Open (or reopen) a 'known issue' table, and record one issue in it.
+ * @param ptr pointer to 'table'. Opaque. If NULL, a new table is created.
+ * @param ticket ticket id. "cldrbug:1234" is recorded as "CLDR-1234", and an id
+ *               starting with a digit gets an "ICU-" prefix.
+ * @param where place reporting the issue. If NULL, only the ticket is recorded.
+ * @param msg message to record for ticket/where. Ignored if NULL or empty.
+ * @param firstForTicket if not NULL, receives whether the ticket was new to the table.
+ * @param firstForWhere if not NULL (and where is not NULL), receives whether
+ *                      'where' was new for this ticket.
+ * @return new or existing ptr
+ */
+U_CAPI void *udbg_knownIssue_open(void *ptr, const char *ticket, char *where, const char *msg, UBool *firstForTicket,
+ UBool *firstForWhere);
+
+/**
+ * Print 'known issue' table, to std::cout.
+ * @param ptr pointer from udbg_knownIssue_open or udbg_knownIssue_openU; may be NULL
+ * @return true if there were any issues.
+ */
+U_CAPI UBool udbg_knownIssue_print(void *ptr);
+
+/**
+ * Close 'known issue' table.
+ * @param ptr
+ */
+U_CAPI void udbg_knownIssue_close(void *ptr);
+
+
+#endif
diff --git a/intl/icu/source/tools/toolutil/unewdata.cpp b/intl/icu/source/tools/toolutil/unewdata.cpp
new file mode 100644
index 0000000000..27414d2eba
--- /dev/null
+++ b/intl/icu/source/tools/toolutil/unewdata.cpp
@@ -0,0 +1,286 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/*
+*******************************************************************************
+*
+* Copyright (C) 1999-2010, International Business Machines
+* Corporation and others. All Rights Reserved.
+*
+*******************************************************************************
+* file name: unewdata.c
+* encoding: UTF-8
+* tab size: 8 (not used)
+* indentation:4
+*
+* created on: 1999oct25
+* created by: Markus W. Scherer
+*/
+
+#include <stdio.h>
+#include "unicode/utypes.h"
+#include "unicode/putil.h"
+#include "unicode/ustring.h"
+#include "cmemory.h"
+#include "cstring.h"
+#include "filestrm.h"
+#include "unicode/udata.h"
+#include "unewdata.h"
+
+/*
+ * State of a data file being written.
+ * udata_create() writes the first 4 bytes of the file straight from this
+ * struct, starting at &headerSize, so headerSize, magic1 and magic2 must stay
+ * adjacent and in this order.
+ */
+struct UNewDataMemory {
+ FileStream *file; /* output stream, opened by udata_create(), closed by udata_finish() */
+ uint16_t headerSize; /* header size including padding to a multiple of 16 */
+ uint8_t magic1, magic2; /* set to 0xda, 0x27 by udata_create() */
+};
+
+/*
+ * Create the file "<dir><sep><name>.<type>" (dir and ".<type>" are optional)
+ * and write the data header: header size and magic bytes, *pInfo, the optional
+ * comment (with its NUL), and zero padding up to a multiple of 16 bytes.
+ *
+ * Returns the new UNewDataMemory, or nullptr with *pErrorCode set:
+ *   U_ILLEGAL_ARGUMENT_ERROR  - name is null/empty or pInfo is null
+ *   U_MEMORY_ALLOCATION_ERROR - the UNewDataMemory could not be allocated
+ *   U_BUFFER_OVERFLOW_ERROR   - the full path does not fit into the filename buffer
+ *   U_FILE_ACCESS_ERROR       - the file could not be opened for writing
+ * Nothing is done (nullptr returned) if pErrorCode is null or already a failure.
+ */
+U_CAPI UNewDataMemory * U_EXPORT2
+udata_create(const char *dir, const char *type, const char *name,
+             const UDataInfo *pInfo,
+             const char *comment,
+             UErrorCode *pErrorCode) {
+    UNewDataMemory *pData;
+    uint16_t headerSize, commentLength;
+    char filename[512];
+    uint8_t bytes[16];
+    int32_t length;
+
+    if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) {
+        return nullptr;
+    } else if(name==nullptr || *name==0 || pInfo==nullptr) {
+        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
+        return nullptr;
+    }
+
+    /* allocate the data structure */
+    pData=(UNewDataMemory *)uprv_malloc(sizeof(UNewDataMemory));
+    if(pData==nullptr) {
+        *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
+        return nullptr;
+    }
+
+    char dirSepChar = U_FILE_SEP_CHAR;
+#if (U_FILE_SEP_CHAR != U_FILE_ALT_SEP_CHAR)
+    // We may need to append a different directory separator when building for Cygwin or MSYS2.
+    if(dir && *dir) {
+        if(!uprv_strchr(dir, U_FILE_SEP_CHAR) && uprv_strchr(dir, U_FILE_ALT_SEP_CHAR)) {
+            dirSepChar = U_FILE_ALT_SEP_CHAR;
+        }
+    }
+#endif
+
+    /* Check that the full path won't be too long */
+    length = 0; /* Start with nothing */
+    if(dir != nullptr && *dir !=0) /* Add directory length if one was given */
+    {
+        length += static_cast<int32_t>(strlen(dir));
+
+        /* Add 1 if dir doesn't end with path sep */
+        if (dir[strlen(dir) - 1]!= dirSepChar) {
+            length++;
+        }
+    }
+    length += static_cast<int32_t>(strlen(name)); /* Add the filename length */
+
+    if(type != nullptr && *type !=0) { /* Add the type length if given... */
+        /* ...plus 1 for the '.' that is written between name and type below */
+        length += 1 + static_cast<int32_t>(strlen(type));
+    }
+
+
+    /* LDH buffer Length error check */
+    if(length > ((int32_t)sizeof(filename) - 1))
+    {
+        *pErrorCode = U_BUFFER_OVERFLOW_ERROR;
+        uprv_free(pData);
+        return nullptr;
+    }
+
+    /* open the output file */
+    if(dir!=nullptr && *dir!=0) { /* if dir has a value, we prepend it to the filename */
+        char *p=filename+strlen(dir);
+        uprv_strcpy(filename, dir);
+        if (*(p-1)!=dirSepChar) {
+            *p++=dirSepChar;
+            *p=0;
+        }
+    } else { /* otherwise, we'll output to the current dir */
+        filename[0]=0;
+    }
+    uprv_strcat(filename, name);
+    if(type!=nullptr && *type!=0) {
+        uprv_strcat(filename, ".");
+        uprv_strcat(filename, type);
+    }
+    pData->file=T_FileStream_open(filename, "wb");
+    if(pData->file==nullptr) {
+        uprv_free(pData);
+        *pErrorCode=U_FILE_ACCESS_ERROR;
+        return nullptr;
+    }
+
+    /* write the header information */
+    headerSize=(uint16_t)(pInfo->size+4);
+    if(comment!=nullptr && *comment!=0) {
+        commentLength=(uint16_t)(uprv_strlen(comment)+1);
+        headerSize+=commentLength;
+    } else {
+        commentLength=0;
+    }
+
+    /* write the size of the header, take padding into account */
+    pData->headerSize=(uint16_t)((headerSize+15)&~0xf);
+    pData->magic1=0xda;
+    pData->magic2=0x27;
+    /* 4 bytes: headerSize followed by magic1 and magic2 (adjacent struct fields) */
+    T_FileStream_write(pData->file, &pData->headerSize, 4);
+
+    /* write the information data */
+    T_FileStream_write(pData->file, pInfo, pInfo->size);
+
+    /* write the comment */
+    if(commentLength>0) {
+        T_FileStream_write(pData->file, comment, commentLength);
+    }
+
+    /* write padding bytes to align the data section to 16 bytes */
+    headerSize&=0xf;
+    if(headerSize!=0) {
+        headerSize=(uint16_t)(16-headerSize);
+        uprv_memset(bytes, 0, headerSize);
+        T_FileStream_write(pData->file, bytes, headerSize);
+    }
+
+    return pData;
+}
+
+/*
+ * Close the file behind pData and free pData.
+ * Returns the file size minus the header size; if the stream reports an error,
+ * sets U_FILE_ACCESS_ERROR and returns the size without the header subtracted.
+ * Returns 0 and does nothing if pErrorCode is null or a failure, or pData is null.
+ */
+U_CAPI uint32_t U_EXPORT2
+udata_finish(UNewDataMemory *pData, UErrorCode *pErrorCode) {
+    if(pErrorCode==nullptr || U_FAILURE(*pErrorCode) || pData==nullptr) {
+        return 0;
+    }
+
+    uint32_t dataLength=0;
+    FileStream *file=pData->file;
+    if(file!=nullptr) {
+        /* fflush(pData->file);*/
+        dataLength=T_FileStream_size(file);
+        if(T_FileStream_error(file)) {
+            *pErrorCode=U_FILE_ACCESS_ERROR;
+        } else {
+            dataLength-=pData->headerSize;
+        }
+        T_FileStream_close(file);
+    }
+    uprv_free(pData);
+
+    return dataLength;
+}
+
+/*
+ * dummy UDataInfo cf. udata.h
+ * Used by udata_createDummy(): the platform properties (endianness, charset
+ * family, UChar size) are those of the build, while the data format and both
+ * version fields are all zero.
+ */
+static const UDataInfo dummyDataInfo = {
+ sizeof(UDataInfo),
+ 0,
+
+ U_IS_BIG_ENDIAN,
+ U_CHARSET_FAMILY,
+ U_SIZEOF_UCHAR,
+ 0,
+
+ { 0, 0, 0, 0 }, /* dummy dataFormat */
+ { 0, 0, 0, 0 }, /* dummy formatVersion */
+ { 0, 0, 0, 0 } /* dummy dataVersion */
+};
+
+/*
+ * Write a data file that consists of a header with dummyDataInfo and no data.
+ * Does nothing if pErrorCode is null or already a failure.
+ * On failure, prints a message to stderr and terminates the process with the
+ * error code as exit status.
+ */
+U_CAPI void U_EXPORT2
+udata_createDummy(const char *dir, const char *type, const char *name, UErrorCode *pErrorCode) {
+    /* same guard as udata_create() / udata_finish() */
+    if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) {
+        return;
+    }
+    udata_finish(udata_create(dir, type, name, &dummyDataInfo, nullptr, pErrorCode), pErrorCode);
+    if(U_FAILURE(*pErrorCode)) {
+        /* dir and type may legitimately be null, and a null name is itself an
+         * error reported here: never hand a null pointer to %s */
+        fprintf(stderr, "error %s writing dummy data file %s" U_FILE_SEP_STRING "%s.%s\n",
+                u_errorName(*pErrorCode),
+                dir!=nullptr ? dir : "",
+                name!=nullptr ? name : "",
+                type!=nullptr ? type : "");
+        exit(*pErrorCode);
+    }
+}
+
+/* Write one byte; does nothing without an open file. */
+U_CAPI void U_EXPORT2
+udata_write8(UNewDataMemory *pData, uint8_t byte) {
+    if(pData==nullptr || pData->file==nullptr) {
+        return;
+    }
+    T_FileStream_write(pData->file, &byte, 1);
+}
+
+/* Write the 2 bytes of word as they are laid out in memory; does nothing without an open file. */
+U_CAPI void U_EXPORT2
+udata_write16(UNewDataMemory *pData, uint16_t word) {
+    if(pData==nullptr || pData->file==nullptr) {
+        return;
+    }
+    T_FileStream_write(pData->file, &word, 2);
+}
+
+/* Write the 4 bytes of wyde as they are laid out in memory; does nothing without an open file. */
+U_CAPI void U_EXPORT2
+udata_write32(UNewDataMemory *pData, uint32_t wyde) {
+    if(pData==nullptr || pData->file==nullptr) {
+        return;
+    }
+    T_FileStream_write(pData->file, &wyde, 4);
+}
+
+/* Write length bytes starting at s; does nothing without an open file or for length<=0. */
+U_CAPI void U_EXPORT2
+udata_writeBlock(UNewDataMemory *pData, const void *s, int32_t length) {
+    if(pData==nullptr || pData->file==nullptr || length<=0) {
+        return;
+    }
+    T_FileStream_write(pData->file, s, length);
+}
+
+/* Write length bytes of 0xaa, in chunks of at most 16; does nothing without an open file. */
+U_CAPI void U_EXPORT2
+udata_writePadding(UNewDataMemory *pData, int32_t length) {
+    static const uint8_t padding[16]={
+        0xaa, 0xaa, 0xaa, 0xaa,
+        0xaa, 0xaa, 0xaa, 0xaa,
+        0xaa, 0xaa, 0xaa, 0xaa,
+        0xaa, 0xaa, 0xaa, 0xaa
+    };
+    if(pData==nullptr || pData->file==nullptr) {
+        return;
+    }
+    /* full 16-byte chunks first, then the remainder (if any) as the last write */
+    while(length>0) {
+        const int32_t chunk = (length>=16) ? 16 : length;
+        T_FileStream_write(pData->file, padding, chunk);
+        length-=chunk;
+    }
+}
+
+/*
+ * Write length chars of s (no terminator is added); length==-1 means
+ * "up to the terminating NUL". Does nothing without an open file.
+ */
+U_CAPI void U_EXPORT2
+udata_writeString(UNewDataMemory *pData, const char *s, int32_t length) {
+    if(pData==nullptr || pData->file==nullptr) {
+        return;
+    }
+    const int32_t charCount = (length==-1) ? (int32_t)uprv_strlen(s) : length;
+    if(charCount>0) {
+        T_FileStream_write(pData->file, s, charCount);
+    }
+}
+
+/*
+ * Write length 16-bit code units of s (no terminator is added); length==-1
+ * means "up to the terminating NUL". Does nothing without an open file.
+ */
+U_CAPI void U_EXPORT2
+udata_writeUString(UNewDataMemory *pData, const char16_t *s, int32_t length) {
+    if(pData==nullptr || pData->file==nullptr) {
+        return;
+    }
+    const int32_t unitCount = (length==-1) ? u_strlen(s) : length;
+    if(unitCount>0) {
+        T_FileStream_write(pData->file, s, unitCount*sizeof(char16_t));
+    }
+}
+
+/*
+ * Hey, Emacs, please set the following:
+ *
+ * Local Variables:
+ * indent-tabs-mode: nil
+ * End:
+ *
+ */
+
diff --git a/intl/icu/source/tools/toolutil/unewdata.h b/intl/icu/source/tools/toolutil/unewdata.h
new file mode 100644
index 0000000000..137fb49584
--- /dev/null
+++ b/intl/icu/source/tools/toolutil/unewdata.h
@@ -0,0 +1,113 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/*
+*******************************************************************************
+*
+* Copyright (C) 1999-2010, International Business Machines
+* Corporation and others. All Rights Reserved.
+*
+*******************************************************************************
+* file name: unewdata.h
+* encoding: UTF-8
+* tab size: 8 (not used)
+* indentation:4
+*
+* created on: 1999oct25
+* created by: Markus W. Scherer
+*/
+
+#ifndef __UNEWDATA_H__
+#define __UNEWDATA_H__
+
+#include "unicode/utypes.h"
+#include "unicode/udata.h"
+
+/* API for writing data -----------------------------------------------------*/
+
+/** @memo Forward declaration of the data memory creation type. */
+typedef struct UNewDataMemory UNewDataMemory;
+
+/**
+ * Create a new binary data file.
+ * The file-writing <code>udata_</code> functions facilitate writing
+ * binary data files that can be read by ICU's <code>udata</code> API.
+ * This function opens a new file with a filename determined from its
+ * parameters - of the form "name.type".
+ * It then writes a short header, followed by the <code>UDataInfo</code>
+ * structure and, optionally, by the comment string.
+ * It then writes padding bytes to round up to a multiple of 16 bytes.
+ * Subsequent write operations will thus start at an offset in the file
+ * that is a multiple of 16. <code>udata_getMemory()</code> will return
+ * a pointer to this same starting offset.
+ *
+ * See udata.h .
+ *
+ * @param dir A string that specifies the directory where the data will be
+ * written. If <code>NULL</code> or empty, no directory is
+ * prepended and the file is written to the current directory.
+ * @param type A string that specifies the type of data to be written.
+ * For example, resource bundles are written with type "res",
+ * conversion tables with type "cnv".
+ * This may be <code>NULL</code> or empty.
+ * @param name A string that specifies the name of the data.
+ * It must not be <code>NULL</code> or empty.
+ * @param pInfo A pointer to a correctly filled <code>UDataInfo</code>
+ * structure that will be copied into the file.
+ * @param comment A string (e.g., a copyright statement) that will be
+ * copied into the file if it is not <code>NULL</code>
+ * or empty. This string serves only as a comment in the binary
+ * file. It will not be accessible by any API.
+ * @param pErrorCode An ICU UErrorCode parameter. It must not be <code>NULL</code>.
+ * @return The new data memory object, to be closed with
+ * <code>udata_finish()</code>, or <code>NULL</code> if an error occurred.
+ */
+U_CAPI UNewDataMemory * U_EXPORT2
+udata_create(const char *dir, const char *type, const char *name,
+ const UDataInfo *pInfo,
+ const char *comment,
+ UErrorCode *pErrorCode);
+
+/**
+ * @memo Close a newly written binary file.
+ * Also frees pData. Returns the length of the file minus the size of its header.
+ */
+U_CAPI uint32_t U_EXPORT2
+udata_finish(UNewDataMemory *pData, UErrorCode *pErrorCode);
+
+/** @memo Write a dummy data file. */
+U_CAPI void U_EXPORT2
+udata_createDummy(const char *dir, const char *type, const char *name, UErrorCode *pErrorCode);
+
+/** @memo Write an 8-bit byte to the file. */
+U_CAPI void U_EXPORT2
+udata_write8(UNewDataMemory *pData, uint8_t byte);
+
+/** @memo Write a 16-bit word to the file. */
+U_CAPI void U_EXPORT2
+udata_write16(UNewDataMemory *pData, uint16_t word);
+
+/** @memo Write a 32-bit word to the file. */
+U_CAPI void U_EXPORT2
+udata_write32(UNewDataMemory *pData, uint32_t wyde);
+
+/** @memo Write a block of bytes to the file. */
+U_CAPI void U_EXPORT2
+udata_writeBlock(UNewDataMemory *pData, const void *s, int32_t length);
+
+/** @memo Write a block of arbitrary padding bytes to the file. */
+U_CAPI void U_EXPORT2
+udata_writePadding(UNewDataMemory *pData, int32_t length);
+
+/**
+ * @memo Write a <code>char*</code> string of platform "invariant characters" to the file.
+ * A length of -1 means that s is NUL-terminated; the terminator is not written.
+ */
+U_CAPI void U_EXPORT2
+udata_writeString(UNewDataMemory *pData, const char *s, int32_t length);
+
+/**
+ * @memo Write a <code>UChar*</code> string of Unicode character code units to the file.
+ * length counts code units; -1 means that s is NUL-terminated (the terminator is not written).
+ */
+U_CAPI void U_EXPORT2
+udata_writeUString(UNewDataMemory *pData, const UChar *s, int32_t length);
+
+
+/*
+ * Hey, Emacs, please set the following:
+ *
+ * Local Variables:
+ * indent-tabs-mode: nil
+ * End:
+ *
+ */
+
+#endif
diff --git a/intl/icu/source/tools/toolutil/uoptions.cpp b/intl/icu/source/tools/toolutil/uoptions.cpp
new file mode 100644
index 0000000000..808164ae4d
--- /dev/null
+++ b/intl/icu/source/tools/toolutil/uoptions.cpp
@@ -0,0 +1,133 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/*
+*******************************************************************************
+*
+* Copyright (C) 2000-2015, International Business Machines
+* Corporation and others. All Rights Reserved.
+*
+*******************************************************************************
+* file name: uoptions.c
+* encoding: UTF-8
+* tab size: 8 (not used)
+* indentation:4
+*
+* created on: 2000apr17
+* created by: Markus W. Scherer
+*
+* This file provides a command line argument parser.
+*/
+
+#include "unicode/utypes.h"
+#include "cstring.h"
+#include "uoptions.h"
+
+/*
+ * Parse the command line arguments argv[1..argc-1] against options[0..optionCount-1].
+ *
+ * - An argument that starts with '-' and has at least one more character is an
+ *   option; a lone "-" is treated as a non-option argument.
+ * - "--name" is matched against UOption.longName, "-abc" is processed as the
+ *   short options a, b, c (UOption.shortName).
+ * - A bare "--" stops option processing; everything after it is a non-option.
+ * - For an option that takes an argument, the rest of the same argv[] is used
+ *   (short options only), otherwise the next argv[] unless that one looks like
+ *   an option itself.
+ * - For each option found, doesOccur is set to 1, value is set if an argument
+ *   was found, and optionFn (if any) is called with the option's context.
+ * - Non-option arguments are moved to the front of argv[], after argv[0].
+ *
+ * Returns the number of arguments left in argv[] (including argv[0]), or a
+ * negative value -i on error, where i is the argv[] index being processed:
+ * unknown option, missing required argument, or optionFn returned a negative value.
+ */
+U_CAPI int U_EXPORT2
+u_parseArgs(int argc, char* argv[],
+ int optionCount, UOption options[]) {
+ char *arg;
+ int i=1, remaining=1;
+ char c, stopOptions=0;
+
+ while(i<argc) {
+ arg=argv[i];
+ if(!stopOptions && *arg=='-' && (c=arg[1])!=0) {
+ /* process an option */
+ UOption *option=nullptr;
+ arg+=2;
+ if(c=='-') {
+ /* process a long option */
+ if(*arg==0) {
+ /* stop processing options after "--" */
+ stopOptions=1;
+ } else {
+ /* search for the option string */
+ int j;
+ for(j=0; j<optionCount; ++j) {
+ if(options[j].longName && uprv_strcmp(arg, options[j].longName)==0) {
+ option=options+j;
+ break;
+ }
+ }
+ if(option==nullptr) {
+ /* no option matches */
+ return -i;
+ }
+ option->doesOccur=1;
+
+ if(option->hasArg!=UOPT_NO_ARG) {
+ /* parse the argument for the option, if any */
+ if(i+1<argc && !(argv[i+1][0]=='-' && argv[i+1][1]!=0)) {
+ /* argument in the next argv[], and there is not an option in there */
+ option->value=argv[++i];
+ } else if(option->hasArg==UOPT_REQUIRES_ARG) {
+ /* there is no argument, but one is required: return with error */
+ option->doesOccur=0;
+ return -i;
+ }
+ }
+
+ if(option->optionFn!=nullptr && option->optionFn(option->context, option)<0) {
+ /* the option function was called and returned an error */
+ option->doesOccur=0;
+ return -i;
+ }
+ }
+ } else {
+ /* process one or more short options */
+ do {
+ /* search for the option letter */
+ int j;
+ for(j=0; j<optionCount; ++j) {
+ if(c==options[j].shortName) {
+ option=options+j;
+ break;
+ }
+ }
+ if(option==nullptr) {
+ /* no option matches */
+ return -i;
+ }
+ option->doesOccur=1;
+
+ if(option->hasArg!=UOPT_NO_ARG) {
+ /* parse the argument for the option, if any */
+ if(*arg!=0) {
+ /* argument following in the same argv[] */
+ option->value=arg;
+ /* do not process the rest of this arg as option letters */
+ break;
+ } else if(i+1<argc && !(argv[i+1][0]=='-' && argv[i+1][1]!=0)) {
+ /* argument in the next argv[], and there is not an option in there */
+ option->value=argv[++i];
+ /* this break is redundant because we know that *arg==0 */
+ break;
+ } else if(option->hasArg==UOPT_REQUIRES_ARG) {
+ /* there is no argument, but one is required: return with error */
+ option->doesOccur=0;
+ return -i;
+ }
+ }
+
+ if(option->optionFn!=nullptr && option->optionFn(option->context, option)<0) {
+ /* the option function was called and returned an error */
+ option->doesOccur=0;
+ return -i;
+ }
+
+ /* get the next option letter */
+ option=nullptr;
+ c=*arg++;
+ } while(c!=0);
+ }
+
+ /* go to next argv[] */
+ ++i;
+ } else {
+ /* move a non-option up in argv[] */
+ argv[remaining++]=arg;
+ ++i;
+ }
+ }
+ return remaining;
+}
diff --git a/intl/icu/source/tools/toolutil/uoptions.h b/intl/icu/source/tools/toolutil/uoptions.h
new file mode 100644
index 0000000000..d00e3da924
--- /dev/null
+++ b/intl/icu/source/tools/toolutil/uoptions.h
@@ -0,0 +1,143 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/*
+*******************************************************************************
+*
+* Copyright (C) 2000-2011, International Business Machines
+* Corporation and others. All Rights Reserved.
+*
+*******************************************************************************
+* file name: uoptions.h
+* encoding: UTF-8
+* tab size: 8 (not used)
+* indentation:4
+*
+* created on: 2000apr17
+* created by: Markus W. Scherer
+*
+* This file provides a command line argument parser.
+*/
+
+#ifndef __UOPTIONS_H__
+#define __UOPTIONS_H__
+
+#include "unicode/utypes.h"
+
+/* This should usually be called before calling u_parseArgs */
+/*#if U_PLATFORM == U_PF_OS390 && (U_CHARSET_FAMILY == U_ASCII_FAMILY)*/
+ /* translate args from EBCDIC to ASCII */
+/*# define U_MAIN_INIT_ARGS(argc, argv) __argvtoascii_a(argc, argv)*/
+/*#elif defined(XP_MAC_CONSOLE)*/
+#if defined(XP_MAC_CONSOLE)
+# include <console.h>
+ /* Get the arguments from the GUI, since old Macs don't have a console Window. */
+# define U_MAIN_INIT_ARGS(argc, argv) argc = ccommand((char***)&argv)
+#else
+ /* Normally we do nothing. */
+# define U_MAIN_INIT_ARGS(argc, argv)
+#endif
+
+
+
+/* forward declarations for the function declaration */
+struct UOption;
+typedef struct UOption UOption;
+
+/* function to be called for a command line option */
+typedef int UOptionFn(void *context, UOption *option);
+
+/* values of UOption.hasArg */
+enum { UOPT_NO_ARG, UOPT_REQUIRES_ARG, UOPT_OPTIONAL_ARG };
+
+/*
+ * structure describing a command line option
+ *
+ * The caller fills in longName, shortName, hasArg and optionally
+ * optionFn/context; u_parseArgs() writes value and doesOccur.
+ */
+struct UOption {
+ const char *longName; /* "foo" for --foo */
+ const char *value; /* output placeholder, will point to the argument string, if any */
+ UOptionFn *optionFn; /* function to be called when this option occurs */
+ void *context; /* parameter for the function */
+ char shortName; /* 'f' for -f */
+ char hasArg; /* enum value: option takes no/requires/may have argument */
+ /* (one of UOPT_NO_ARG, UOPT_REQUIRES_ARG, UOPT_OPTIONAL_ARG, stored in a char) */
+ char doesOccur; /* boolean for "this one occurred" */
+ /* (the parser sets it to 1 when the option is seen and back to 0 when it reports an error for it) */
+};
+
+/*
+ * macro for an entry in a declaration of UOption[]
+ *
+ * Expands to a positional initializer in UOption field order:
+ * longName, value, optionFn, context, shortName, hasArg, doesOccur.
+ * value, optionFn and context start out as NULL and doesOccur as 0.
+ */
+#define UOPTION_DEF(longName, shortName, hasArg) \
+ { longName, NULL, NULL, NULL, shortName, hasArg, 0 }
+
+/* ICU Tools option definitions */
+#define UOPTION_HELP_H UOPTION_DEF("help", 'h', UOPT_NO_ARG)
+#define UOPTION_HELP_QUESTION_MARK UOPTION_DEF("help", '?', UOPT_NO_ARG)
+#define UOPTION_VERBOSE UOPTION_DEF("verbose", 'v', UOPT_NO_ARG)
+#define UOPTION_QUIET UOPTION_DEF("quiet", 'q', UOPT_NO_ARG)
+#define UOPTION_VERSION UOPTION_DEF("version", 'V', UOPT_NO_ARG)
+#define UOPTION_COPYRIGHT UOPTION_DEF("copyright", 'c', UOPT_NO_ARG)
+
+#define UOPTION_DESTDIR UOPTION_DEF("destdir", 'd', UOPT_REQUIRES_ARG)
+#define UOPTION_SOURCEDIR UOPTION_DEF("sourcedir", 's', UOPT_REQUIRES_ARG)
+#define UOPTION_ENCODING UOPTION_DEF("encoding", 'e', UOPT_REQUIRES_ARG)
+#define UOPTION_ICUDATADIR UOPTION_DEF("icudatadir", 'i', UOPT_REQUIRES_ARG)
+#define UOPTION_WRITE_JAVA UOPTION_DEF("write-java", 'j', UOPT_OPTIONAL_ARG)
+#define UOPTION_PACKAGE_NAME UOPTION_DEF("package-name", 'p', UOPT_REQUIRES_ARG)
+#define UOPTION_BUNDLE_NAME UOPTION_DEF("bundle-name", 'b', UOPT_REQUIRES_ARG)
+
+/**
+ * C Command line argument parser.
+ *
+ * This function takes the argv[argc] command line and a description of
+ * the program's options in form of an array of UOption structures.
+ * Each UOption defines a long and a short name (a string and a character)
+ * for options like "--foo" and "-f".
+ *
+ * Each option is marked with whether it does not take an argument,
+ * requires one, or optionally takes one. The argument may follow in
+ * the same argv[] entry for short options, or it may always follow
+ * in the next argv[] entry.
+ *
+ * An argument is in the next argv[] entry for both long and short name
+ * options, except it is taken from directly behind the short name in
+ * its own argv[] entry if there are characters following the option letter.
+ * An argument in its own argv[] entry must not begin with a '-'
+ * unless it is only the '-' itself. There is no restriction of the
+ * argument format if it is part of the short name option's argv[] entry.
+ *
+ * The argument is stored in the value field of the corresponding
+ * UOption entry, and the doesOccur field is set to 1 if the option
+ * is found at all.
+ *
+ * Short name options without arguments can be collapsed into a single
+ * argv[] entry. After an option letter takes an argument, following
+ * letters will be taken as its argument.
+ *
+ * If the same option is found several times, then the last
+ * argument value will be stored in the value field.
+ *
+ * For each option, a function can be called. This could be used
+ * for options that occur multiple times and all arguments are to
+ * be collected.
+ *
+ * All options are removed from the argv[] array itself. If the parser
+ * is successful, then it returns the number of remaining non-option
+ * strings (including argv[0]).
+ * argv[0], the program name, is never read or modified.
+ *
+ * An option "--" ends option processing; everything after this
+ * remains in the argv[] array.
+ *
+ * An option string "-" alone is treated as a non-option.
+ *
+ * If an option is not recognized or an argument missing, then
+ * the parser returns with the negative index of the argv[] entry
+ * where the error was detected.
+ *
+ * The OS/400 compiler requires that argv either be "char* argv[]",
+ * or "const char* const argv[]", and it will not accept,
+ * "const char* argv[]" as a definition for main().
+ *
+ * @param argv This parameter is modified
+ * @param options This parameter is modified
+ */
+U_CAPI int U_EXPORT2
+u_parseArgs(int argc, char* argv[],
+ int optionCount, UOption options[]);
+
+#endif
diff --git a/intl/icu/source/tools/toolutil/uparse.cpp b/intl/icu/source/tools/toolutil/uparse.cpp
new file mode 100644
index 0000000000..5aee48b5a4
--- /dev/null
+++ b/intl/icu/source/tools/toolutil/uparse.cpp
@@ -0,0 +1,383 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/*
+*******************************************************************************
+*
+* Copyright (C) 2000-2012, International Business Machines
+* Corporation and others. All Rights Reserved.
+*
+*******************************************************************************
+* file name: uparse.cpp
+* encoding: UTF-8
+* tab size: 8 (not used)
+* indentation:4
+*
+* created on: 2000apr18
+* created by: Markus W. Scherer
+*
+* This file provides a parser for files that are delimited by one single
+* character like ';' or TAB. Example: the Unicode Character Properties files
+* like UnicodeData.txt are semicolon-delimited.
+*/
+
+#include "unicode/utypes.h"
+#include "unicode/uchar.h"
+#include "unicode/ustring.h"
+#include "unicode/utf16.h"
+#include "cstring.h"
+#include "filestrm.h"
+#include "uparse.h"
+#include "ustr_imp.h"
+
+#include <stdio.h>
+
+/* Return a pointer to the first character at or after s that is not invariant whitespace. */
+U_CAPI const char * U_EXPORT2
+u_skipWhitespace(const char *s) {
+    for(; U_IS_INV_WHITESPACE(*s); ++s) {
+        /* nothing to do: the loop header advances s */
+    }
+    return s;
+}
+
+/*
+ * Overwrite trailing invariant whitespace in s with NUL bytes.
+ * Returns a pointer to the new terminating NUL.
+ */
+U_CAPI char * U_EXPORT2
+u_rtrim(char *s) {
+    /* start at the terminating NUL and walk backwards */
+    char *limit=uprv_strchr(s, 0);
+    for(;;) {
+        if(limit==s || !U_IS_INV_WHITESPACE(limit[-1])) {
+            return limit;
+        }
+        *--limit=0;
+    }
+}
+
+/*
+ * Recognize a "# @missing:" prefix, with optional white space between
+ * the '#', '@', "missing" and ':' tokens.
+ * If the prefix is present, return the pointer to the first non-whitespace
+ * character after the colon; otherwise return the original pointer.
+ * Unicode 5.0 adds such lines in some data files to document
+ * default property values.
+ */
+static const char *
+getMissingLimit(const char *s) {
+    const char *p=u_skipWhitespace(s);
+    if(*p!='#') {
+        return s;
+    }
+    p=u_skipWhitespace(p+1);
+    if(*p!='@') {
+        return s;
+    }
+    p=u_skipWhitespace(p+1);
+    if(0!=strncmp(p, "missing", 7)) {
+        return s;
+    }
+    p=u_skipWhitespace(p+7);
+    if(*p!=':') {
+        return s;
+    }
+    return u_skipWhitespace(p+1);
+}
+
+/*
+ * Read a delimited text file line by line, split each data line into
+ * fieldCount fields and call lineFn for it.
+ *
+ * filename: nullptr, "" or "-" selects stdin (which is not closed afterwards).
+ * fields: caller-provided array; for field i, fields[i][0] is set to the field start
+ * and fields[i][1] to its limit (the delimiter or the end of the line).
+ * pErrorCode: must not indicate a failure on entry. It is overwritten for every
+ * line read: U_ZERO_ERROR for a normal line, U_USING_DEFAULT_WARNING for
+ * a "# @missing:" line. Parsing stops with U_PARSE_ERROR if a line has
+ * too few fields, or with whatever failure lineFn sets.
+ *
+ * Lines are read into a fixed 10000-byte buffer.
+ */
+U_CAPI void U_EXPORT2
+u_parseDelimitedFile(const char *filename, char delimiter,
+ char *fields[][2], int32_t fieldCount,
+ UParseLineFn *lineFn, void *context,
+ UErrorCode *pErrorCode) {
+ FileStream *file;
+ char line[10000];
+ char *start, *limit;
+ int32_t i, length;
+
+ if(U_FAILURE(*pErrorCode)) {
+ return;
+ }
+
+ if(fields==nullptr || lineFn==nullptr || fieldCount<=0) {
+ *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
+ return;
+ }
+
+ /* filename is reset to nullptr for stdin so that the stream is not closed at the end */
+ if(filename==nullptr || *filename==0 || (*filename=='-' && filename[1]==0)) {
+ filename=nullptr;
+ file=T_FileStream_stdin();
+ } else {
+ file=T_FileStream_open(filename, "r");
+ }
+ if(file==nullptr) {
+ *pErrorCode=U_FILE_ACCESS_ERROR;
+ return;
+ }
+
+ while(T_FileStream_readLine(file, line, sizeof(line))!=nullptr) {
+ /* remove trailing newline characters */
+ length=(int32_t)(u_rtrim(line)-line);
+
+ /*
+ * detect a line with # @missing:
+ * start parsing after that, or else from the beginning of the line
+ * set the default warning for @missing lines
+ */
+ start=(char *)getMissingLimit(line);
+ if(start==line) {
+ *pErrorCode=U_ZERO_ERROR;
+ } else {
+ *pErrorCode=U_USING_DEFAULT_WARNING;
+ }
+
+ /* skip this line if it is empty or a comment */
+ if(*start==0 || *start=='#') {
+ continue;
+ }
+
+ /* remove in-line comments */
+ limit=uprv_strchr(start, '#');
+ if(limit!=nullptr) {
+ /* get white space before the pound sign */
+ while(limit>start && U_IS_INV_WHITESPACE(*(limit-1))) {
+ --limit;
+ }
+
+ /* truncate the line */
+ *limit=0;
+ }
+
+ /* skip lines with only whitespace */
+ if(u_skipWhitespace(start)[0]==0) {
+ continue;
+ }
+
+ /* for each field, call the corresponding field function */
+ for(i=0; i<fieldCount; ++i) {
+ /* set the limit pointer of this field */
+ limit=start;
+ while(*limit!=delimiter && *limit!=0) {
+ ++limit;
+ }
+
+ /* set the field start and limit in the fields array */
+ fields[i][0]=start;
+ fields[i][1]=limit;
+
+ /* set start to the beginning of the next field, if any */
+ start=limit;
+ if(*start!=0) {
+ ++start;
+ } else if(i+1<fieldCount) {
+ /* the line ended before all fieldCount fields were found */
+ *pErrorCode=U_PARSE_ERROR;
+ limit=line+length;
+ i=fieldCount;
+ break;
+ }
+ }
+
+ /* too few fields? */
+ if(U_FAILURE(*pErrorCode)) {
+ break;
+ }
+
+ /* call the field function */
+ lineFn(context, fields, fieldCount, pErrorCode);
+ if(U_FAILURE(*pErrorCode)) {
+ break;
+ }
+ }
+
+ /* filename==nullptr means that we read from stdin, which stays open */
+ if(filename!=nullptr) {
+ T_FileStream_close(file);
+ }
+}
+
+/*
+ * Parse a whitespace-separated list of hexadecimal code points that ends
+ * with ';' or with the end of the string.
+ * Store them as a UTF-32 string in dest[destCapacity].
+ *
+ * Returns the number of code points stored. If there are more code points
+ * than fit into dest, *pErrorCode is set to U_BUFFER_OVERFLOW_ERROR and the
+ * returned count stays at destCapacity.
+ * Malformed input, or a value above U+10FFFF, sets U_PARSE_ERROR and returns 0.
+ */
+U_CAPI int32_t U_EXPORT2
+u_parseCodePoints(const char *s,
+                  uint32_t *dest, int32_t destCapacity,
+                  UErrorCode *pErrorCode) {
+    char *end;
+    unsigned long parsed;
+    int32_t count;
+
+    if(U_FAILURE(*pErrorCode)) {
+        return 0;
+    }
+    if(s==nullptr || destCapacity<0 || (destCapacity>0 && dest==nullptr)) {
+        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
+        return 0;
+    }
+
+    count=0;
+    for(;;) {
+        s=u_skipWhitespace(s);
+        if(*s==';' || *s==0) {
+            return count;
+        }
+
+        /*
+         * Read one code point.
+         * Range-check the full-width result before narrowing it to 32 bits:
+         * where unsigned long is wider than 32 bits, an early cast would let
+         * a value like 100000041 wrap around to 0x41 and pass the check.
+         */
+        parsed=uprv_strtoul(s, &end, 16);
+        if(end<=s || (!U_IS_INV_WHITESPACE(*end) && *end!=';' && *end!=0) || parsed>=0x110000) {
+            *pErrorCode=U_PARSE_ERROR;
+            return 0;
+        }
+
+        /* append it to the destination array */
+        if(count<destCapacity) {
+            dest[count++]=(uint32_t)parsed;
+        } else {
+            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
+        }
+
+        /* go to the following characters */
+        s=end;
+    }
+}
+
+/*
+ * Parse a whitespace-separated list of hexadecimal code points that ends
+ * with ';' or with the end of the string.
+ * Store them as a UTF-16 string in dest[destCapacity].
+ * Set the first code point in *pFirst (0xffffffff if there is none);
+ * pFirst may be nullptr.
+ *
+ * If the string does not fit, *pErrorCode is set to
+ * U_STRING_NOT_TERMINATED_WARNING (no room for the NUL) or
+ * U_BUFFER_OVERFLOW_ERROR. Malformed input, or a value above U+10FFFF,
+ * sets U_PARSE_ERROR and returns 0.
+ *
+ * @return The length of the string in numbers of UChars.
+ */
+U_CAPI int32_t U_EXPORT2
+u_parseString(const char *s,
+              char16_t *dest, int32_t destCapacity,
+              uint32_t *pFirst,
+              UErrorCode *pErrorCode) {
+    char *end;
+    unsigned long parsed;
+    uint32_t value;
+    int32_t destLength;
+
+    if(U_FAILURE(*pErrorCode)) {
+        return 0;
+    }
+    if(s==nullptr || destCapacity<0 || (destCapacity>0 && dest==nullptr)) {
+        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
+        return 0;
+    }
+
+    if(pFirst!=nullptr) {
+        *pFirst=0xffffffff;
+    }
+
+    destLength=0;
+    for(;;) {
+        s=u_skipWhitespace(s);
+        if(*s==';' || *s==0) {
+            if(destLength<destCapacity) {
+                dest[destLength]=0;
+            } else if(destLength==destCapacity) {
+                *pErrorCode=U_STRING_NOT_TERMINATED_WARNING;
+            } else {
+                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
+            }
+            return destLength;
+        }
+
+        /*
+         * Read one code point.
+         * Range-check the full-width result before narrowing it to 32 bits:
+         * where unsigned long is wider than 32 bits, an early cast would let
+         * a value like 100000041 wrap around to 0x41 and pass the check.
+         */
+        parsed=uprv_strtoul(s, &end, 16);
+        if(end<=s || (!U_IS_INV_WHITESPACE(*end) && *end!=';' && *end!=0) || parsed>=0x110000) {
+            *pErrorCode=U_PARSE_ERROR;
+            return 0;
+        }
+        value=(uint32_t)parsed;
+
+        /* store the first code point */
+        if(pFirst!=nullptr) {
+            *pFirst=value;
+            pFirst=nullptr;
+        }
+
+        /* append it to the destination array, or only count its length if it does not fit */
+        if((destLength+U16_LENGTH(value))<=destCapacity) {
+            U16_APPEND_UNSAFE(dest, destLength, value);
+        } else {
+            destLength+=U16_LENGTH(value);
+        }
+
+        /* go to the following characters */
+        s=end;
+    }
+}
+
+/*
+ * Read a range like start or start..end (hexadecimal code points).
+ *
+ * *pStart and *pEnd receive the range boundaries (equal for a single code point).
+ * *terminator receives the position directly after the last number parsed.
+ * Returns the number of code points in the range (end-start+1).
+ *
+ * Sets U_ILLEGAL_ARGUMENT_ERROR if any pointer argument is nullptr, and
+ * U_PARSE_ERROR for malformed input, a value above U+10FFFF, or end<start;
+ * returns 0 in these cases.
+ */
+U_CAPI int32_t U_EXPORT2
+u_parseCodePointRangeAnyTerminator(const char *s,
+                                   uint32_t *pStart, uint32_t *pEnd,
+                                   const char **terminator,
+                                   UErrorCode *pErrorCode) {
+    char *end;
+    unsigned long parsed;
+    uint32_t value;
+
+    if(U_FAILURE(*pErrorCode)) {
+        return 0;
+    }
+    /* terminator is written unconditionally below, so it must be checked as well */
+    if(s==nullptr || pStart==nullptr || pEnd==nullptr || terminator==nullptr) {
+        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
+        return 0;
+    }
+
+    /*
+     * Read the start code point.
+     * Range-check the full-width result before narrowing it to 32 bits:
+     * where unsigned long is wider than 32 bits, an early cast would let
+     * a value like 100000041 wrap around to 0x41 and pass the check.
+     */
+    s=u_skipWhitespace(s);
+    parsed=uprv_strtoul(s, &end, 16);
+    if(end<=s || parsed>=0x110000) {
+        *pErrorCode=U_PARSE_ERROR;
+        return 0;
+    }
+    *pStart=*pEnd=(uint32_t)parsed;
+
+    /* is there a "..end"? */
+    s=u_skipWhitespace(end);
+    if(*s!='.' || s[1]!='.') {
+        *terminator=end;
+        return 1;
+    }
+    s=u_skipWhitespace(s+2);
+
+    /* read the end code point, with the same full-width range check */
+    parsed=uprv_strtoul(s, &end, 16);
+    if(end<=s || parsed>=0x110000) {
+        *pErrorCode=U_PARSE_ERROR;
+        return 0;
+    }
+    value=(uint32_t)parsed;
+    *pEnd=value;
+
+    /* is this a valid range? */
+    if(value<*pStart) {
+        *pErrorCode=U_PARSE_ERROR;
+        return 0;
+    }
+
+    *terminator=end;
+    return value-*pStart+1;
+}
+
+/*
+ * Parse a code point range with u_parseCodePointRangeAnyTerminator() and
+ * additionally require that only white space separates it from a ';'
+ * or from the end of the string; otherwise set U_PARSE_ERROR and return 0.
+ */
+U_CAPI int32_t U_EXPORT2
+u_parseCodePointRange(const char *s,
+                      uint32_t *pStart, uint32_t *pEnd,
+                      UErrorCode *pErrorCode) {
+    const char *terminator;
+    int32_t rangeLength=
+        u_parseCodePointRangeAnyTerminator(s, pStart, pEnd, &terminator, pErrorCode);
+    if(U_FAILURE(*pErrorCode)) {
+        /* pass the inner function's result through unchanged */
+        return rangeLength;
+    }
+
+    terminator=u_skipWhitespace(terminator);
+    if(*terminator==';' || *terminator==0) {
+        return rangeLength;
+    }
+    *pErrorCode=U_PARSE_ERROR;
+    return 0;
+}
+
+/*
+ * Convert a string of hexadecimal digit pairs (e.g. "e4b8ad") into bytes.
+ *
+ * source/sLen: input characters; sLen==-1 means that source is NUL-terminated.
+ * dest/destCapacity: output buffer; bytes that do not fit are counted but not stored.
+ * Returns the result of u_terminateChars() for the number of bytes parsed.
+ *
+ * If a group cannot be parsed as hexadecimal, *status is set to U_PARSE_ERROR
+ * and 0 is returned (instead of silently repeating the previous byte value).
+ */
+U_CAPI int32_t U_EXPORT2
+u_parseUTF8(const char *source, int32_t sLen, char *dest, int32_t destCapacity, UErrorCode *status) {
+    int32_t pos = 0;
+    int32_t i = 0;
+    unsigned int value = 0;
+    if(sLen == -1) {
+        sLen = (int32_t)strlen(source);
+    }
+
+    while(pos < sLen) {
+        /* sscanf() returns the number of converted items; anything but 1 means "no hex digits here" */
+        if(sscanf(source + pos, "%2x", &value) != 1) {
+            if(status != nullptr) {
+                *status = U_PARSE_ERROR;
+            }
+            return 0;
+        }
+        if(i < destCapacity) {
+            dest[i] = (char)value;
+        }
+        i++;
+        /* a trailing group may have only one digit: never step past source+sLen */
+        pos += (sLen - pos >= 2) ? 2 : 1;
+    }
+    return u_terminateChars(dest, destCapacity, i, status);
+}
diff --git a/intl/icu/source/tools/toolutil/uparse.h b/intl/icu/source/tools/toolutil/uparse.h
new file mode 100644
index 0000000000..df0e79a21f
--- /dev/null
+++ b/intl/icu/source/tools/toolutil/uparse.h
@@ -0,0 +1,153 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/*
+*******************************************************************************
+*
+* Copyright (C) 2000-2010, International Business Machines
+* Corporation and others. All Rights Reserved.
+*
+*******************************************************************************
+* file name: uparse.h
+* encoding: UTF-8
+* tab size: 8 (not used)
+* indentation:4
+*
+* created on: 2000apr18
+* created by: Markus W. Scherer
+*
+* This file provides a parser for files that are delimited by one single
+* character like ';' or TAB. Example: the Unicode Character Properties files
+* like UnicodeData.txt are semicolon-delimited.
+*/
+
+#ifndef __UPARSE_H__
+#define __UPARSE_H__
+
+#include "unicode/utypes.h"
+
+/**
+ * Is c an invariant-character whitespace?
+ * @param c invariant character
+ */
+#define U_IS_INV_WHITESPACE(c) ((c)==' ' || (c)=='\t' || (c)=='\r' || (c)=='\n')
+
+U_CDECL_BEGIN
+
+/**
+ * Skip invariant whitespace characters: space ' ', TAB '\t', CR '\r' and LF '\n'.
+ *
+ * @param s Pointer to characters.
+ * @return Pointer to first character at or after s that is not one of these whitespace characters.
+ */
+U_CAPI const char * U_EXPORT2
+u_skipWhitespace(const char *s);
+
+/**
+ * Trim whitespace (including line endings) from the end of the string.
+ *
+ * @param s Pointer to the string.
+ * @return Pointer to the new end of the string.
+ */
+U_CAPI char * U_EXPORT2
+u_rtrim(char *s);
+
+/** Function type for u_parseDelimitedFile(). */
+typedef void U_CALLCONV
+UParseLineFn(void *context,
+ char *fields[][2],
+ int32_t fieldCount,
+ UErrorCode *pErrorCode);
+
+/**
+ * Parser for files that are similar to UnicodeData.txt:
+ * This function opens the file and reads it line by line. It skips empty lines
+ * and comment lines that start with a '#'.
+ * All other lines are separated into fields with one delimiter character
+ * (semicolon for Unicode Properties files) between two fields. The last field in
+ * a line does not need to be terminated with a delimiter.
+ *
+ * For each line, after segmenting it, a line function is called.
+ * It gets passed the array of field start and limit pointers that is
+ * passed into this parser and filled by it for each line.
+ * For each field i of the line, the start pointer in fields[i][0]
+ * points to the beginning of the field, while the limit pointer in fields[i][1]
+ * points behind the field, i.e., to the delimiter or the line end.
+ *
+ * The context parameter of the line function is
+ * the same as the one for the parse function.
+ *
+ * The line function may modify the contents of the fields including the
+ * limit characters.
+ *
+ * If the file cannot be opened, or there is a parsing error or a field function
+ * sets *pErrorCode, then the parser returns with *pErrorCode set to an error code.
+ */
+U_CAPI void U_EXPORT2
+u_parseDelimitedFile(const char *filename, char delimiter,
+ char *fields[][2], int32_t fieldCount,
+ UParseLineFn *lineFn, void *context,
+ UErrorCode *pErrorCode);
+
+/**
+ * Parse a string of code points like 0061 0308 0300.
+ * s must end with either ';' or NUL.
+ *
+ * @return Number of code points.
+ */
+U_CAPI int32_t U_EXPORT2
+u_parseCodePoints(const char *s,
+ uint32_t *dest, int32_t destCapacity,
+ UErrorCode *pErrorCode);
+
+/**
+ * Parse a list of code points like 0061 0308 0300
+ * into a UChar * string.
+ * s must end with either ';' or NUL.
+ *
+ * Set the first code point in *pFirst.
+ *
+ * @param s Input char * string.
+ * @param dest Output string buffer.
+ * @param destCapacity Capacity of dest in numbers of UChars.
+ * @param pFirst If pFirst!=NULL the *pFirst will be set to the first
+ * code point in the string.
+ * @param pErrorCode ICU error code.
+ * @return The length of the string in numbers of UChars.
+ */
+U_CAPI int32_t U_EXPORT2
+u_parseString(const char *s,
+ UChar *dest, int32_t destCapacity,
+ uint32_t *pFirst,
+ UErrorCode *pErrorCode);
+
+/**
+ * Parse a code point range like
+ * 0085 or
+ * 4E00..9FA5.
+ *
+ * s must contain such a range and end with either ';' or NUL.
+ *
+ * @return Length of code point range, end-start+1
+ */
+U_CAPI int32_t U_EXPORT2
+u_parseCodePointRange(const char *s,
+ uint32_t *pStart, uint32_t *pEnd,
+ UErrorCode *pErrorCode);
+
+/**
+ * Same as u_parseCodePointRange() but the range may be terminated by
+ * any character. The position of the terminating character is returned via
+ * the *terminator output parameter.
+ */
+U_CAPI int32_t U_EXPORT2
+u_parseCodePointRangeAnyTerminator(const char *s,
+ uint32_t *pStart, uint32_t *pEnd,
+ const char **terminator,
+ UErrorCode *pErrorCode);
+
+/**
+ * Parse a string of two-digit hexadecimal values into bytes.
+ *
+ * @param source Input char * string of hexadecimal digits.
+ * @param sLen Length of source, or -1 if source is NUL-terminated.
+ * @param dest Output buffer; each pair of input characters yields one byte.
+ * @param destCapacity Capacity of dest in bytes; further bytes are counted but not stored.
+ * @param status ICU error code, passed on to u_terminateChars().
+ * @return The value returned by u_terminateChars() for the number of bytes parsed.
+ */
+U_CAPI int32_t U_EXPORT2
+u_parseUTF8(const char *source, int32_t sLen, char *dest, int32_t destCapacity, UErrorCode *status);
+
+U_CDECL_END
+
+#endif
diff --git a/intl/icu/source/tools/toolutil/writesrc.cpp b/intl/icu/source/tools/toolutil/writesrc.cpp
new file mode 100644
index 0000000000..55c2f277b3
--- /dev/null
+++ b/intl/icu/source/tools/toolutil/writesrc.cpp
@@ -0,0 +1,515 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/*
+*******************************************************************************
+*
+* Copyright (C) 2005-2012, International Business Machines
+* Corporation and others. All Rights Reserved.
+*
+*******************************************************************************
+* file name: writesrc.cpp
+* encoding: UTF-8
+* tab size: 8 (not used)
+* indentation:4
+*
+* created on: 2005apr23
+* created by: Markus W. Scherer
+*
+* Helper functions for writing source code for data.
+*/
+
+#include <stdio.h>
+#include <time.h>
+
+// The C99 standard suggested that C++ implementations not define PRId64 etc. constants
+// unless this macro is defined.
+// See the Notes at https://en.cppreference.com/w/cpp/types/integer .
+// Similar to defining __STDC_LIMIT_MACROS in unicode/ptypes.h .
+#ifndef __STDC_FORMAT_MACROS
+# define __STDC_FORMAT_MACROS
+#endif
+#include <cinttypes>
+
+#include "unicode/utypes.h"
+#include "unicode/putil.h"
+#include "unicode/ucptrie.h"
+#include "unicode/errorcode.h"
+#include "unicode/uniset.h"
+#include "unicode/usetiter.h"
+#include "unicode/utf16.h"
+#include "utrie2.h"
+#include "cstring.h"
+#include "writesrc.h"
+#include "util.h"
+
+U_NAMESPACE_BEGIN
+
+// Out-of-line definition of the empty destructor
+// (NOTE(review): the class is presumably declared in writesrc.h -- confirm).
+ValueNameGetter::~ValueNameGetter() {}
+
+U_NAMESPACE_END
+
+U_NAMESPACE_USE
+
+/*
+ * Open path/filename for writing ("w") and return the stream.
+ * If path is nullptr, filename is used as is; otherwise path and filename are
+ * joined, with U_FILE_SEP_CHAR in between if path does not already end with
+ * a separator.
+ * On failure (name too long for the internal buffer, or fopen() error)
+ * a message is written to stderr and nullptr is returned.
+ */
+static FILE *
+usrc_createWithoutHeader(const char *path, const char *filename) {
+    char buffer[1024];
+    const char *p;
+    char *q;
+    FILE *f;
+    char c;
+
+    if(path==nullptr) {
+        p=filename;
+    } else {
+        /*
+         * Refuse names that would overflow the buffer.
+         * +2: room for an inserted separator and the terminating NUL.
+         */
+        if(uprv_strlen(path)+uprv_strlen(filename)+2>sizeof(buffer)) {
+            fprintf(
+                stderr,
+                "usrc_create(%s, %s): path too long\n",
+                path, filename);
+            return nullptr;
+        }
+
+        /* concatenate path and filename, with U_FILE_SEP_CHAR in between if necessary */
+        uprv_strcpy(buffer, path);
+        q=buffer+uprv_strlen(buffer);
+        if(q>buffer && (c=*(q-1))!=U_FILE_SEP_CHAR && c!=U_FILE_ALT_SEP_CHAR) {
+            *q++=U_FILE_SEP_CHAR;
+        }
+        uprv_strcpy(q, filename);
+        p=buffer;
+    }
+
+    f=fopen(p, "w");
+    if (f==nullptr) {
+        fprintf(
+            stderr,
+            "usrc_create(%s, %s): unable to create file\n",
+            path!=nullptr ? path : "", filename);
+    }
+    return f;
+}
+
+/*
+ * Create the file and, if that succeeded, write the copyright header and the
+ * file name/generator lines as "//" comments. Returns nullptr if the file
+ * could not be created.
+ */
+U_CAPI FILE * U_EXPORT2
+usrc_create(const char *path, const char *filename, int32_t copyrightYear, const char *generator) {
+    FILE *f = usrc_createWithoutHeader(path, filename);
+    if (f != nullptr) {
+        usrc_writeCopyrightHeader(f, "//", copyrightYear);
+        usrc_writeFileNameGeneratedBy(f, "//", filename, generator);
+    }
+    return f;
+}
+
+/*
+ * Create the file and, if that succeeded, write the copyright header and the
+ * file name/generator lines as "#" comments. Returns nullptr if the file
+ * could not be created.
+ */
+U_CAPI FILE * U_EXPORT2
+usrc_createTextData(const char *path, const char *filename, int32_t copyrightYear, const char *generator) {
+    FILE *f = usrc_createWithoutHeader(path, filename);
+    if (f != nullptr) {
+        usrc_writeCopyrightHeader(f, "#", copyrightYear);
+        usrc_writeFileNameGeneratedBy(f, "#", filename, generator);
+    }
+    return f;
+}
+
+/*
+ * Write the Unicode copyright and license lines, each preceded by the comment
+ * prefix. For copyright years up to and including 2016, the IBM copyright
+ * lines are written as well.
+ */
+U_CAPI void U_EXPORT2
+usrc_writeCopyrightHeader(FILE *f, const char *prefix, int32_t copyrightYear) {
+    fprintf(f,
+        "%s Copyright (C) %d and later: Unicode, Inc. and others.\n"
+        "%s License & terms of use: http://www.unicode.org/copyright.html\n",
+        prefix, copyrightYear, prefix);
+    if (copyrightYear > 2016) {
+        return;
+    }
+    fprintf(f,
+        "%s Copyright (C) 1999-2016, International Business Machines\n"
+        "%s Corporation and others. All Rights Reserved.\n",
+        prefix, prefix);
+}
+
+/*
+ * Write comment lines with the file name and a "machine-generated by" note.
+ * Each line starts with the comment prefix.
+ * If generator is nullptr, the current local date (YYYY-MM-DD) is written in
+ * its place; if the date cannot be determined, an empty string is written.
+ */
+U_CAPI void U_EXPORT2
+usrc_writeFileNameGeneratedBy(
+        FILE *f,
+        const char *prefix,
+        const char *filename,
+        const char *generator) {
+    char buffer[1024];
+
+    const char *pattern =
+        "%s\n"
+        "%s file name: %s\n"
+        "%s\n"
+        "%s machine-generated by: %s\n"
+        "\n";
+
+    if(generator==nullptr) {
+        /* only consult the clock when the date is actually needed */
+        const struct tm *lt;
+        time_t t;
+
+        time(&t);
+        lt=localtime(&t);
+        /*
+         * localtime() may return a null pointer, and strftime() returns 0
+         * (leaving the buffer contents unspecified) if it fails.
+         */
+        if(lt==nullptr || strftime(buffer, sizeof(buffer), "%Y-%m-%d", lt)==0) {
+            buffer[0]=0;
+        }
+        generator=buffer;
+    }
+    fprintf(f, pattern, prefix, prefix, filename, prefix, prefix, generator);
+}
+
+/*
+ * Write an array of integers as a comma-separated list.
+ *
+ * prefix: if not nullptr, used as an fprintf() format with length as a long argument.
+ * p/width/length: array of length items with width 8, 16, 32 or 64 bits each;
+ * for any other width an error is printed to stderr and nothing is written to f.
+ * indent: written after each line break; a line break follows every 16 items.
+ * postfix: if not nullptr, written after the last item.
+ *
+ * Values up to 9 are written in decimal, larger ones in hexadecimal with 0x.
+ */
+U_CAPI void U_EXPORT2
+usrc_writeArray(FILE *f,
+                const char *prefix,
+                const void *p, int32_t width, int32_t length,
+                const char *indent,
+                const char *postfix) {
+    /* reject unsupported widths before anything is written */
+    if(width!=8 && width!=16 && width!=32 && width!=64) {
+        fprintf(stderr, "usrc_writeArray(width=%ld) unrecognized width\n", (long)width);
+        return;
+    }
+
+    if(prefix!=nullptr) {
+        fprintf(f, prefix, (long)length);
+    }
+
+    int32_t itemsOnLine=0;
+    for(int32_t idx=0; idx<length; ++idx) {
+        /* separator before every item except the first one */
+        if(idx>0) {
+            if(itemsOnLine<16) {
+                fputc(',', f);
+            } else {
+                fputs(",\n", f);
+                fputs(indent, f);
+                itemsOnLine=0;
+            }
+        }
+
+        int64_t item; // Signed due to TOML!
+        switch(width) {
+        case 8:
+            item=static_cast<const uint8_t *>(p)[idx];
+            break;
+        case 16:
+            item=static_cast<const uint16_t *>(p)[idx];
+            break;
+        case 32:
+            item=static_cast<const uint32_t *>(p)[idx];
+            break;
+        default: /* 64, all other widths were rejected above */
+            item=static_cast<const int64_t *>(p)[idx]; // Signed due to TOML!
+            break;
+        }
+
+        if(item<=9) {
+            fprintf(f, "%" PRId64, item);
+        } else {
+            fprintf(f, "0x%" PRIx64, item);
+        }
+        ++itemsOnLine;
+    }
+
+    if(postfix!=nullptr) {
+        fputs(postfix, f);
+    }
+}
+
+/*
+ * Write the arrays of a UTrie2 with usrc_writeArray().
+ * A trie without data32 is written as one 16-bit array that covers
+ * indexLength+dataLength items; otherwise the 16-bit index array and
+ * the 32-bit data array are written separately.
+ */
+U_CAPI void U_EXPORT2
+usrc_writeUTrie2Arrays(FILE *f,
+                       const char *indexPrefix, const char *data32Prefix,
+                       const UTrie2 *pTrie,
+                       const char *postfix) {
+    if(pTrie->data32!=nullptr) {
+        /* 32-bit trie */
+        usrc_writeArray(f, indexPrefix, pTrie->index, 16, pTrie->indexLength, "", postfix);
+        usrc_writeArray(f, data32Prefix, pTrie->data32, 32, pTrie->dataLength, "", postfix);
+        return;
+    }
+
+    /* 16-bit trie */
+    const int32_t totalLength=pTrie->indexLength+pTrie->dataLength;
+    usrc_writeArray(f, indexPrefix, pTrie->index, 16, totalLength, "", postfix);
+}
+
+/*
+ * Write the fields of a UTrie2 as source text, between the optional prefix and postfix.
+ *
+ * indexName/data32Name: names written for the index and data32 array fields.
+ * For a trie without data32, the data16 field is written as indexName+indexLength
+ * and the data32 field as nullptr; data32Name is not used in that case.
+ */
+U_CAPI void U_EXPORT2
+usrc_writeUTrie2Struct(FILE *f,
+ const char *prefix,
+ const UTrie2 *pTrie,
+ const char *indexName, const char *data32Name,
+ const char *postfix) {
+ if(prefix!=nullptr) {
+ fputs(prefix, f);
+ }
+ if(pTrie->data32==nullptr) {
+ /* 16-bit trie */
+ fprintf(
+ f,
+ " %s,\n" /* index */
+ " %s+%ld,\n" /* data16 */
+ " nullptr,\n", /* data32 */
+ indexName,
+ indexName,
+ (long)pTrie->indexLength);
+ } else {
+ /* 32-bit trie */
+ fprintf(
+ f,
+ " %s,\n" /* index */
+ " nullptr,\n" /* data16 */
+ " %s,\n", /* data32 */
+ indexName,
+ data32Name);
+ }
+ /* the remaining fields; the casts match the argument types to the %ld/%hx/%lx conversions */
+ fprintf(
+ f,
+ " %ld,\n" /* indexLength */
+ " %ld,\n" /* dataLength */
+ " 0x%hx,\n" /* index2NullOffset */
+ " 0x%hx,\n" /* dataNullOffset */
+ " 0x%lx,\n" /* initialValue */
+ " 0x%lx,\n" /* errorValue */
+ " 0x%lx,\n" /* highStart */
+ " 0x%lx,\n" /* highValueIndex */
+ " nullptr, 0, false, false, 0, nullptr\n",
+ (long)pTrie->indexLength, (long)pTrie->dataLength,
+ (short)pTrie->index2NullOffset, (short)pTrie->dataNullOffset,
+ (long)pTrie->initialValue, (long)pTrie->errorValue,
+ (long)pTrie->highStart, (long)pTrie->highValueIndex);
+ if(postfix!=nullptr) {
+ fputs(postfix, f);
+ }
+}
+
+/*
+ * Write the index array (16 bits wide) and the data array of a UCPTrie with
+ * usrc_writeArray(). The data item width follows pTrie->valueWidth;
+ * an unknown valueWidth is passed on as width 0.
+ * Continuation lines are indented for TOML output only.
+ */
+U_CAPI void U_EXPORT2
+usrc_writeUCPTrieArrays(FILE *f,
+                        const char *indexPrefix, const char *dataPrefix,
+                        const UCPTrie *pTrie,
+                        const char *postfix,
+                        UTargetSyntax syntax) {
+    const char* indent = "";
+    if (syntax == UPRV_TARGET_SYNTAX_TOML) {
+        indent = " ";
+    }
+    usrc_writeArray(f, indexPrefix, pTrie->index, 16, pTrie->indexLength, indent, postfix);
+
+    int32_t width;
+    switch(pTrie->valueWidth) {
+    case UCPTRIE_VALUE_BITS_16:
+        width=16;
+        break;
+    case UCPTRIE_VALUE_BITS_32:
+        width=32;
+        break;
+    case UCPTRIE_VALUE_BITS_8:
+        width=8;
+        break;
+    default:
+        width=0;
+        break;
+    }
+    usrc_writeArray(f, dataPrefix, pTrie->data.ptr0, width, pTrie->dataLength, indent, postfix);
+}
+
+/*
+ * Write the scalar fields of a UCPTrie, between the optional prefix and postfix.
+ *
+ * For UPRV_TARGET_SYNTAX_CCODE, the index and data array names are written first,
+ * followed by the fields as initializer values.
+ * For any other syntax value, the fields are written as "name = value" lines,
+ * and indexName/dataName are not used.
+ */
+U_CAPI void U_EXPORT2
+usrc_writeUCPTrieStruct(FILE *f,
+ const char *prefix,
+ const UCPTrie *pTrie,
+ const char *indexName, const char *dataName,
+ const char *postfix,
+ UTargetSyntax syntax) {
+ if(prefix!=nullptr) {
+ fputs(prefix, f);
+ }
+ if (syntax == UPRV_TARGET_SYNTAX_CCODE) {
+ fprintf(
+ f,
+ " %s,\n" // index
+ " { %s },\n", // data (union)
+ indexName,
+ dataName);
+ }
+ // Both patterns consume the same nine arguments in the same order.
+ const char* pattern =
+ (syntax == UPRV_TARGET_SYNTAX_CCODE) ?
+ " %ld, %ld,\n" // indexLength, dataLength
+ " 0x%lx, 0x%x,\n" // highStart, shifted12HighStart
+ " %d, %d,\n" // type, valueWidth
+ " 0, 0,\n" // reserved32, reserved16
+ " 0x%x, 0x%lx,\n" // index3NullOffset, dataNullOffset
+ " 0x%lx,\n" // nullValue
+ :
+ "indexLength = %ld\n"
+ "dataLength = %ld\n"
+ "highStart = 0x%lx\n"
+ "shifted12HighStart = 0x%x\n"
+ "type = %d\n"
+ "valueWidth = %d\n"
+ "index3NullOffset = 0x%x\n"
+ "dataNullOffset = 0x%lx\n"
+ "nullValue = 0x%lx\n"
+ ;
+ fprintf(
+ f,
+ pattern,
+ (long)pTrie->indexLength, (long)pTrie->dataLength,
+ (long)pTrie->highStart, pTrie->shifted12HighStart,
+ pTrie->type, pTrie->valueWidth,
+ pTrie->index3NullOffset, (long)pTrie->dataNullOffset,
+ (long)pTrie->nullValue);
+ if(postfix!=nullptr) {
+ fputs(postfix, f);
+ }
+}
+
+/*
+ * Write a complete UCPTrie: first its arrays via usrc_writeUCPTrieArrays(),
+ * then its fields via usrc_writeUCPTrieStruct().
+ *
+ * name: used to build the C identifiers <name>_trieIndex, <name>_trieData and
+ * <name>_trie for UPRV_TARGET_SYNTAX_CCODE; not used for TOML.
+ * The prefix/postfix strings are formatted into fixed 100-byte buffers with snprintf().
+ */
+U_CAPI void U_EXPORT2
+usrc_writeUCPTrie(FILE *f, const char *name, const UCPTrie *pTrie, UTargetSyntax syntax) {
+ // data item width in bits; 0 if valueWidth is not one of the known constants
+ int32_t width=
+ pTrie->valueWidth==UCPTRIE_VALUE_BITS_16 ? 16 :
+ pTrie->valueWidth==UCPTRIE_VALUE_BITS_32 ? 32 :
+ pTrie->valueWidth==UCPTRIE_VALUE_BITS_8 ? 8 : 0;
+ char line[100], line2[100], line3[100], line4[100];
+
+ // line/line2: prefixes for the index and data arrays, line3: their common postfix.
+ // The doubled %% leaves a %ld in the C prefixes for usrc_writeArray() to fill in the array length.
+ switch (syntax) {
+ case UPRV_TARGET_SYNTAX_CCODE:
+ snprintf(line, sizeof(line), "static const uint16_t %s_trieIndex[%%ld]={\n", name);
+ snprintf(line2, sizeof(line2), "static const uint%d_t %s_trieData[%%ld]={\n", (int)width, name);
+ snprintf(line3, sizeof(line3), "\n};\n\n");
+ break;
+ case UPRV_TARGET_SYNTAX_TOML:
+ snprintf(line, sizeof(line), "index = [\n ");
+ snprintf(line2, sizeof(line2), "data_%d = [\n ", (int)width);
+ snprintf(line3, sizeof(line3), "\n]\n");
+ break;
+ default:
+ UPRV_UNREACHABLE_EXIT;
+ }
+ usrc_writeUCPTrieArrays(f, line, line2, pTrie, line3, syntax);
+
+ // line: struct prefix, line2/line3: index and data array names, line4: struct postfix.
+ switch (syntax) {
+ case UPRV_TARGET_SYNTAX_CCODE:
+ snprintf(line, sizeof(line), "static const UCPTrie %s_trie={\n", name);
+ snprintf(line2, sizeof(line2), "%s_trieIndex", name);
+ snprintf(line3, sizeof(line3), "%s_trieData", name);
+ snprintf(line4, sizeof(line4), "};\n\n");
+ break;
+ case UPRV_TARGET_SYNTAX_TOML:
+ line[0] = 0;
+ line2[0] = 0;
+ line3[0] = 0;
+ line4[0] = 0;
+ break;
+ default:
+ UPRV_UNREACHABLE_EXIT;
+ }
+ usrc_writeUCPTrieStruct(f, line, pTrie, line2, line3, line4, syntax);
+}
+
+/*
+ * Write the contents of a set as TOML: a "ranges" array of inclusive
+ * [start, end] code point pairs and, if the set contains strings,
+ * a "strings" array after it.
+ */
+U_CAPI void U_EXPORT2
+usrc_writeUnicodeSet(
+        FILE *f,
+        const USet *pSet,
+        UTargetSyntax syntax) {
+    // ccode is not yet supported
+    U_ASSERT(syntax == UPRV_TARGET_SYNTAX_TOML);
+
+    // Write out a list of ranges
+    const UnicodeSet* set = UnicodeSet::fromUSet(pSet);
+    UnicodeSetIterator it(*set);
+    fprintf(f, "# Inclusive ranges of the code points in the set.\n");
+    fprintf(f, "ranges = [\n");
+    bool seenFirstString = false;
+    while (it.nextRange()) {
+        if (!it.isString()) {
+            // The assertion expects all ranges to come before the first string.
+            U_ASSERT(!seenFirstString);
+            UChar32 rangeStart = it.getCodepoint();
+            UChar32 rangeEnd = it.getCodepointEnd();
+            fprintf(f, " [0x%x, 0x%x],\n", rangeStart, rangeEnd);
+            continue;
+        }
+
+        // The first string closes the ranges array and opens the strings array.
+        if (!seenFirstString) {
+            seenFirstString = true;
+            fprintf(f, "]\nstrings = [\n");
+        }
+        const UnicodeString& str = it.getString();
+        fprintf(f, " ");
+        usrc_writeStringAsASCII(f, str.getBuffer(), str.length(), syntax);
+        fprintf(f, ",\n");
+    }
+    fprintf(f, "]\n");
+}
+
+/*
+ * Write the ranges of a code point map as a TOML "ranges" array.
+ * Each entry has the range start a, the range end b and the value v;
+ * if valueNameGetter is not nullptr, the name it returns for v is added.
+ */
+U_CAPI void U_EXPORT2
+usrc_writeUCPMap(
+        FILE *f,
+        const UCPMap *pMap,
+        icu::ValueNameGetter *valueNameGetter,
+        UTargetSyntax syntax) {
+    // ccode is not yet supported
+    U_ASSERT(syntax == UPRV_TARGET_SYNTAX_TOML);
+    (void) syntax; // silence unused variable errors
+
+    // Print out list of ranges
+    fprintf(f, "# Code points `a` through `b` have value `v`, corresponding to `name`.\n");
+    fprintf(f, "ranges = [\n");
+    UChar32 rangeStart = 0;
+    for (;;) {
+        uint32_t value;
+        UChar32 rangeEnd =
+            ucpmap_getRange(pMap, rangeStart, UCPMAP_RANGE_NORMAL, 0, nullptr, nullptr, &value);
+        if (rangeEnd < 0) {
+            // no more ranges
+            break;
+        }
+        if (valueNameGetter == nullptr) {
+            fprintf(f, " {a=0x%x, b=0x%x, v=%u},\n", rangeStart, rangeEnd, value);
+        } else {
+            const char *name = valueNameGetter->getName(value);
+            fprintf(f, " {a=0x%x, b=0x%x, v=%u, name=\"%s\"},\n", rangeStart, rangeEnd, value, name);
+        }
+        rangeStart = rangeEnd + 1;
+    }
+    fprintf(f, "]\n");
+}
+
+/*
+ * Write an array of bytes as a comma-separated list.
+ * Bytes below 0x20 are written as decimal numbers, all others as
+ * character literals in single quotes.
+ *
+ * prefix: if not nullptr, used as an fprintf() format with length as a long argument.
+ * postfix: if not nullptr, written after the last item.
+ *
+ * Line breaks are inserted after at most 32 items, and earlier at the
+ * positions described in the loop below.
+ */
+U_CAPI void U_EXPORT2
+usrc_writeArrayOfMostlyInvChars(FILE *f,
+ const char *prefix,
+ const char *p, int32_t length,
+ const char *postfix) {
+ int32_t i, col;
+ int prev2, prev, c;
+
+ if(prefix!=nullptr) {
+ fprintf(f, prefix, (long)length);
+ }
+ /* prev and prev2 hold the previous two byte values; -1 means that there is none yet */
+ prev2=prev=-1;
+ for(i=col=0; i<length; ++i, ++col) {
+ c=(uint8_t)p[i];
+ if(i>0) {
+ /* Break long lines. Try to break at interesting places, to minimize revision diffs. */
+ if(
+ /* Very long line. */
+ col>=32 ||
+ /* Long line, break after terminating NUL. */
+ (col>=24 && prev2>=0x20 && prev==0) ||
+ /* Medium-long line, break before non-NUL, non-character byte. */
+ (col>=16 && (prev==0 || prev>=0x20) && 0<c && c<0x20)
+ ) {
+ fputs(",\n", f);
+ col=0;
+ } else {
+ fputc(',', f);
+ }
+ }
+ fprintf(f, c<0x20 ? "%u" : "'%c'", c);
+ prev2=prev;
+ prev=c;
+ }
+ if(postfix!=nullptr) {
+ fputs(postfix, f);
+ }
+}
+
+/*
+ * Write a UTF-16 string as a double-quoted ASCII string literal.
+ * Double quotes and backslashes are escaped with a backslash, code points
+ * that ICU_Utility::isUnprintable() reports are written in the form produced
+ * by ICU_Utility::escapeUnprintable(), and all other characters are written
+ * as they are.
+ */
+U_CAPI void U_EXPORT2
+usrc_writeStringAsASCII(FILE *f,
+        const char16_t* ptr, int32_t length,
+        UTargetSyntax) {
+    // For now, assume all UTargetSyntax values are valid here.
+    fprintf(f, "\"");
+    int32_t i = 0;
+    UChar32 cp;
+    while (i < length) {
+        U16_NEXT(ptr, i, length, cp);
+        if (cp == u'"') {
+            fprintf(f, "\\\"");
+        } else if (cp == u'\\') {
+            // A raw backslash would start an escape sequence in the quoted
+            // output (both in TOML basic strings and in C string literals).
+            fprintf(f, "\\\\");
+        } else if (ICU_Utility::isUnprintable(cp)) {
+            UnicodeString u16result;
+            ICU_Utility::escapeUnprintable(u16result, cp);
+            std::string u8result;
+            u16result.toUTF8String(u8result);
+            fprintf(f, "%s", u8result.data());
+        } else {
+            U_ASSERT(cp < 0x80);
+            char s[2] = {static_cast<char>(cp), 0};
+            fprintf(f, "%s", s);
+        }
+    }
+    fprintf(f, "\"");
+}
diff --git a/intl/icu/source/tools/toolutil/writesrc.h b/intl/icu/source/tools/toolutil/writesrc.h
new file mode 100644
index 0000000000..9c0be5a100
--- /dev/null
+++ b/intl/icu/source/tools/toolutil/writesrc.h
@@ -0,0 +1,198 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/*
+*******************************************************************************
+*
+* Copyright (C) 2005-2012, International Business Machines
+* Corporation and others. All Rights Reserved.
+*
+*******************************************************************************
+* file name: writesrc.h
+* encoding: UTF-8
+* tab size: 8 (not used)
+* indentation:4
+*
+* created on: 2005apr23
+* created by: Markus W. Scherer
+*
+* Helper functions for writing source code for data.
+*/
+
+#ifndef __WRITESRC_H__
+#define __WRITESRC_H__
+
+#include <stdio.h>
+#include "unicode/utypes.h"
+#include "unicode/ucpmap.h"
+#include "unicode/ucptrie.h"
+#include "unicode/umutablecptrie.h"
+#include "unicode/uset.h"
+#include "utrie2.h"
+
+/**
+ * An input to some of the functions in this file specifying whether to write data
+ * as C/C++ code initializers or as TOML.
+ */
+typedef enum UTargetSyntax {
+ /** Write the data as C/C++ code initializers. */
+ UPRV_TARGET_SYNTAX_CCODE = 0,
+ /** Write the data as TOML. */
+ UPRV_TARGET_SYNTAX_TOML = 1,
+} UTargetSyntax;
+
+/**
+ * Creates a source text file and writes a header comment with the ICU copyright.
+ * Writes a C/Java-style comment with the generator name.
+ */
+U_CAPI FILE * U_EXPORT2
+usrc_create(const char *path, const char *filename, int32_t copyrightYear, const char *generator);
+
+/**
+ * Creates a source text file and writes a header comment with the ICU copyright.
+ * Writes the comment with # lines, as used in scripts and text data.
+ */
+U_CAPI FILE * U_EXPORT2
+usrc_createTextData(const char *path, const char *filename, int32_t copyrightYear, const char *generator);
+
+/**
+ * Writes the ICU copyright to a file stream, with configurable year and comment style.
+ */
+U_CAPI void U_EXPORT2
+usrc_writeCopyrightHeader(FILE *f, const char *prefix, int32_t copyrightYear);
+
+/**
+ * Writes information about the file being machine-generated.
+ */
+U_CAPI void U_EXPORT2
+usrc_writeFileNameGeneratedBy(
+ FILE *f,
+ const char *prefix,
+ const char *filename,
+ const char *generator);
+
+/**
+ * Writes the contents of an array of 8/16/32/64-bit words.
+ * The prefix and postfix are optional (can be NULL) and are written first/last.
+ * The prefix may contain a %ld or similar field for the array length.
+ * The {} and declaration etc. need to be included in prefix/postfix or
+ * printed before and after the array contents.
+ */
+U_CAPI void U_EXPORT2
+usrc_writeArray(FILE *f,
+ const char *prefix,
+ const void *p, int32_t width, int32_t length,
+ const char *indent,
+ const char *postfix);
+
+/**
+ * Calls usrc_writeArray() for the index and data arrays of a frozen UTrie2.
+ * Only the index array is written for a 16-bit UTrie2. In this case, dataPrefix
+ * is ignored and can be NULL.
+ */
+U_CAPI void U_EXPORT2
+usrc_writeUTrie2Arrays(FILE *f,
+ const char *indexPrefix, const char *dataPrefix,
+ const UTrie2 *pTrie,
+ const char *postfix);
+
+/**
+ * Writes the UTrie2 struct values.
+ * The {} and declaration etc. need to be included in prefix/postfix or
+ * printed before and after the array contents.
+ */
+U_CAPI void U_EXPORT2
+usrc_writeUTrie2Struct(FILE *f,
+ const char *prefix,
+ const UTrie2 *pTrie,
+ const char *indexName, const char *dataName,
+ const char *postfix);
+
+/**
+ * Calls usrc_writeArray() for the index and data arrays of a UCPTrie.
+ */
+U_CAPI void U_EXPORT2
+usrc_writeUCPTrieArrays(FILE *f,
+ const char *indexPrefix, const char *dataPrefix,
+ const UCPTrie *pTrie,
+ const char *postfix,
+ UTargetSyntax syntax);
+
+/**
+ * Writes the UCPTrie struct values.
+ * The {} and declaration etc. need to be included in prefix/postfix or
+ * printed before and after the array contents.
+ */
+U_CAPI void U_EXPORT2
+usrc_writeUCPTrieStruct(FILE *f,
+ const char *prefix,
+ const UCPTrie *pTrie,
+ const char *indexName, const char *dataName,
+ const char *postfix,
+ UTargetSyntax syntax);
+
+/**
+ * Writes the UCPTrie arrays and struct values.
+ */
+U_CAPI void U_EXPORT2
+usrc_writeUCPTrie(FILE *f, const char *name, const UCPTrie *pTrie, UTargetSyntax syntax);
+
+/**
+ * Writes the UnicodeSet range and string lists.
+ */
+U_CAPI void U_EXPORT2
+usrc_writeUnicodeSet(
+ FILE *f,
+ const USet *pSet,
+ UTargetSyntax syntax);
+
+#ifdef __cplusplus
+
+U_NAMESPACE_BEGIN
+
+/**
+ * Abstract callback interface that maps a numeric value to a name string.
+ * Implementations override getName(); usrc_writeUCPMap() accepts an optional
+ * instance and uses it to look up value name strings.
+ */
+class U_TOOLUTIL_API ValueNameGetter {
+public:
+ virtual ~ValueNameGetter();
+ /** Returns the name string for the given value. */
+ virtual const char *getName(uint32_t value) = 0;
+};
+
+U_NAMESPACE_END
+
+/**
+ * Writes the UCPMap ranges list.
+ *
+ * The "valueNameGetter" argument is optional; ignored if nullptr.
+ * If present, it will be used to look up value name strings.
+ */
+U_CAPI void U_EXPORT2
+usrc_writeUCPMap(
+ FILE *f,
+ const UCPMap *pMap,
+ icu::ValueNameGetter *valueNameGetter,
+ UTargetSyntax syntax);
+
+#endif // __cplusplus
+
+/**
+ * Writes the contents of an array of mostly invariant characters.
+ * Characters 0..0x1f are printed as numbers,
+ * others as characters with single quotes: '%c'.
+ *
+ * The prefix and postfix are optional (can be NULL) and are written first/last.
+ * The prefix may contain a %ld or similar field for the array length.
+ * The {} and declaration etc. need to be included in prefix/postfix or
+ * printed before and after the array contents.
+ */
+U_CAPI void U_EXPORT2
+usrc_writeArrayOfMostlyInvChars(FILE *f,
+ const char *prefix,
+ const char *p, int32_t length,
+ const char *postfix);
+
+/**
+ * Writes a syntactically valid Unicode string in all ASCII, escaping quotes
+ * and non-ASCII characters.
+ */
+U_CAPI void U_EXPORT2
+usrc_writeStringAsASCII(FILE *f,
+ const UChar* ptr, int32_t length,
+ UTargetSyntax syntax);
+
+#endif
diff --git a/intl/icu/source/tools/toolutil/xmlparser.cpp b/intl/icu/source/tools/toolutil/xmlparser.cpp
new file mode 100644
index 0000000000..edb85bdab0
--- /dev/null
+++ b/intl/icu/source/tools/toolutil/xmlparser.cpp
@@ -0,0 +1,827 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/*
+*******************************************************************************
+*
+* Copyright (C) 2004-2010, International Business Machines
+* Corporation and others. All Rights Reserved.
+*
+*******************************************************************************
+* file name: xmlparser.cpp
+* encoding: UTF-8
+* tab size: 8 (not used)
+* indentation:4
+*
+* created on: 2004jul21
+* created by: Andy Heninger
+*/
+
+#include <stdio.h>
+#include "unicode/uchar.h"
+#include "unicode/ucnv.h"
+#include "unicode/regex.h"
+#include "filestrm.h"
+#include "xmlparser.h"
+
+#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION
+
+// character constants
+enum {
+ x_QUOT=0x22, // '"'
+ x_AMP=0x26, // '&'
+ x_APOS=0x27, // '\''
+ x_LT=0x3c, // '<'
+ x_GT=0x3e, // '>'
+ x_l=0x6c // 'l', used by parseFile() to locate the end of "<?xml"
+};
+
+#define XML_SPACES "[ \\u0009\\u000d\\u000a]"
+
+// XML #4
+#define XML_NAMESTARTCHAR "[[A-Z]:_[a-z][\\u00c0-\\u00d6][\\u00d8-\\u00f6]" \
+ "[\\u00f8-\\u02ff][\\u0370-\\u037d][\\u037F-\\u1FFF][\\u200C-\\u200D]" \
+ "[\\u2070-\\u218F][\\u2C00-\\u2FEF][\\u3001-\\uD7FF][\\uF900-\\uFDCF]" \
+ "[\\uFDF0-\\uFFFD][\\U00010000-\\U000EFFFF]]"
+
+// XML #5
+#define XML_NAMECHAR "[" XML_NAMESTARTCHAR "\\-.[0-9]\\u00b7[\\u0300-\\u036f][\\u203f-\\u2040]]"
+
+// XML #6
+#define XML_NAME XML_NAMESTARTCHAR "(?:" XML_NAMECHAR ")*"
+
+U_NAMESPACE_BEGIN
+
+UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLParser)
+UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLElement)
+
+//
+// UXMLParser constructor. Mostly just initializes the ICU regexes that are
+// used for parsing. Every matcher is built with the same status argument,
+// as are the name table and the element stack; the body itself is empty.
+//
+UXMLParser::UXMLParser(UErrorCode &status) :
+ // XML Declaration. XML Production #23.
+ // example: "<?xml version=1.0 encoding="utf-16" ?>
+ // This is a sloppy implementation - just look for the leading <?xml and the closing ?>
+ // allow for a possible leading BOM.
+ mXMLDecl(UnicodeString("(?s)\\uFEFF?<\\?xml.+?\\?>", -1, US_INV), 0, status),
+
+ // XML Comment production #15
+ // example: "<!-- whatever -->
+ // note, does not detect an illegal "--" within comments
+ mXMLComment(UnicodeString("(?s)<!--.+?-->", -1, US_INV), 0, status),
+
+ // XML Spaces
+ // production [3]
+ mXMLSP(UnicodeString(XML_SPACES "+", -1, US_INV), 0, status),
+
+ // XML Doctype decl production #28
+ // example "<!DOCTYPE foo SYSTEM "somewhere" >
+ // or "<!DOCTYPE foo [internal dtd]>
+ // TODO: we don't actually parse the DOCTYPE or internal subsets.
+ // Some internal dtd subsets could confuse this simple-minded
+ // attempt at skipping over them, specifically, occurrences
+ // of closing square brackets. These could appear in comments,
+ // or in parameter entity declarations, for example.
+ mXMLDoctype(UnicodeString(
+ "(?s)<!DOCTYPE.*?(>|\\[.*?\\].*?>)", -1, US_INV
+ ), 0, status),
+
+ // XML PI production #16
+ // example "<?target stuff?>
+ mXMLPI(UnicodeString("(?s)<\\?.+?\\?>", -1, US_INV), 0, status),
+
+ // XML Element Start Productions #40, #41
+ // example <foo att1='abc' att2="d e f" >
+ // capture #1: the tag name
+ //
+ mXMLElemStart (UnicodeString("(?s)<(" XML_NAME ")" // match "<tag_name"
+ "(?:"
+ XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*" // match "ATTR_NAME = "
+ "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))" // match '"attribute value"'
+ ")*" // * for zero or more attributes.
+ XML_SPACES "*?>", -1, US_INV), 0, status), // match " >"
+
+ // XML Element End production #42
+ // example </foo>
+ // capture #1: the tag name
+ mXMLElemEnd (UnicodeString("</(" XML_NAME ")" XML_SPACES "*>", -1, US_INV), 0, status),
+
+ // XML Element Empty production #44
+ // example <foo att1="abc" att2="d e f" />
+ // capture #1: the tag name
+ mXMLElemEmpty (UnicodeString("(?s)<(" XML_NAME ")" // match "<tag_name"
+ "(?:"
+ XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*" // match "ATTR_NAME = "
+ "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))" // match '"attribute value"'
+ ")*" // * for zero or more attributes.
+ XML_SPACES "*?/>", -1, US_INV), 0, status), // match " />"
+
+
+ // XMLCharData. Everything but '<'. Note that & will be dealt with later.
+ mXMLCharData(UnicodeString("(?s)[^<]*", -1, US_INV), 0, status),
+
+ // Attribute name = "value". XML Productions 10, 40/41
+ // Capture group 1 is name,
+ // 2 is the attribute value, including the quotes.
+ //
+ // Note that attributes are scanned twice. The first time is with
+ // the regex for an entire element start. There, the attributes
+ // are checked syntactically, but not separated out one by one.
+ // Here, we match a single attribute, and make its name and
+ // attribute value available to the parser code.
+ mAttrValue(UnicodeString(XML_SPACES "+(" XML_NAME ")" XML_SPACES "*=" XML_SPACES "*"
+ "((?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))", -1, US_INV), 0, status),
+
+
+ // Matches a single XML white space character. createElement() uses it to
+ // replace each one with a plain space during attribute value normalization.
+ mAttrNormalizer(UnicodeString(XML_SPACES, -1, US_INV), 0, status),
+
+ // Match any of the new-line sequences in content.
+ // All are changed to \u000a.
+ mNewLineNormalizer(UnicodeString("\\u000d\\u000a|\\u000d\\u0085|\\u000a|\\u000d|\\u0085|\\u2028", -1, US_INV), 0, status),
+
+ // & char references
+ // We will figure out what we've got based on which capture group has content.
+ // The last one is a catchall for unrecognized entity references..
+ // Capture groups: 1=amp; 2=lt; 3=gt; 4=apos; 5=quot; 6=hex digits 7=decimal digits 8=catchall
+ mAmps(UnicodeString("&(?:(amp;)|(lt;)|(gt;)|(apos;)|(quot;)|#x([0-9A-Fa-f]{1,8});|#([0-9]{1,8});|(.))"),
+ 0, status),
+
+ // Table of interned element and attribute names, see intern().
+ fNames(status),
+ // Stack of the enclosing elements of the element currently being parsed, see parse().
+ fElementStack(status),
+ fOneLF((char16_t)0x0a) // Plain new-line string, used in new line normalization.
+ {
+ }
+
+UXMLParser *
+UXMLParser::createParser(UErrorCode &errorCode) {
+    // Factory function: no parser is created when the caller already carries
+    // a failure code. Otherwise the constructor reports problems via errorCode.
+    return U_FAILURE(errorCode) ? nullptr : new UXMLParser(errorCode);
+}
+
+// Destructor. The body is empty: no explicit cleanup is performed here.
+UXMLParser::~UXMLParser() {}
+
+//
+// parseFile
+//    Reads the named file, converts its bytes to Unicode and hands the
+//    resulting string to parse().
+//
+//    The charset is determined in this order:
+//      1. a Unicode signature (BOM) at the start of the file,
+//      2. the encoding="..." attribute of the XML declaration, found by
+//         reading the first block of the file as ISO-8859-1,
+//      3. UTF-8 as the default.
+//
+//    Returns the root element, or nullptr with errorCode set on failure
+//    (U_FILE_ACCESS_ERROR if the file cannot be opened).
+//
+UXMLElement *
+UXMLParser::parseFile(const char *filename, UErrorCode &errorCode) {
+    char bytes[4096], charsetBuffer[100];
+    FileStream *f;
+    const char *charset, *pb;
+    UnicodeString src;
+    UConverter *cnv=nullptr;
+    char16_t *buffer, *pu;
+    int32_t fileLength, bytesLength, length, capacity;
+    UBool flush;
+
+    if(U_FAILURE(errorCode)) {
+        return nullptr;
+    }
+
+    f=T_FileStream_open(filename, "rb");
+    if(f==nullptr) {
+        errorCode=U_FILE_ACCESS_ERROR;
+        return nullptr;
+    }
+
+    bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes));
+    if(bytesLength<(int32_t)sizeof(bytes)) {
+        // we have already read the entire file
+        fileLength=bytesLength;
+    } else {
+        // get the file length
+        fileLength=T_FileStream_size(f);
+    }
+
+    /*
+     * get the charset:
+     * 1. Unicode signature
+     * 2. treat as ISO-8859-1 and read XML encoding="charset"
+     * 3. default to UTF-8
+     */
+    charset=ucnv_detectUnicodeSignature(bytes, bytesLength, nullptr, &errorCode);
+    if(U_SUCCESS(errorCode) && charset!=nullptr) {
+        // open converter according to Unicode signature
+        cnv=ucnv_open(charset, &errorCode);
+    } else {
+        // read as Latin-1 and parse the XML declaration and encoding
+        cnv=ucnv_open("ISO-8859-1", &errorCode);
+        if(U_FAILURE(errorCode)) {
+            // unexpected error opening Latin-1 converter
+            goto exit;
+        }
+
+        buffer=toUCharPtr(src.getBuffer(bytesLength));
+        if(buffer==nullptr) {
+            // unexpected failure to reserve some string capacity
+            errorCode=U_MEMORY_ALLOCATION_ERROR;
+            goto exit;
+        }
+        pb=bytes;
+        pu=buffer;
+        ucnv_toUnicode(
+            cnv,
+            &pu, buffer+src.getCapacity(),
+            &pb, bytes+bytesLength,
+            nullptr, true, &errorCode);
+        src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0);
+        ucnv_close(cnv);
+        cnv=nullptr;
+        if(U_FAILURE(errorCode)) {
+            // unexpected error in conversion from Latin-1
+            src.remove();
+            goto exit;
+        }
+
+        // parse XML declaration
+        if(mXMLDecl.reset(src).lookingAt(0, errorCode)) {
+            int32_t declEnd=mXMLDecl.end(errorCode);
+            // go beyond <?xml
+            int32_t pos=src.indexOf((char16_t)x_l)+1;
+
+            mAttrValue.reset(src);
+            while(pos<declEnd && mAttrValue.lookingAt(pos, errorCode)) {  // loop runs once per attribute on this element.
+                UnicodeString attName = mAttrValue.group(1, errorCode);
+                UnicodeString attValue = mAttrValue.group(2, errorCode);
+
+                // Trim the quotes from the att value. These are left over from the original regex
+                // that parsed the attribute, which couldn't conveniently strip them.
+                attValue.remove(0,1);                    // one char from the beginning
+                attValue.truncate(attValue.length()-1);  // and one from the end.
+
+                if(attName==UNICODE_STRING("encoding", 8)) {
+                    length=attValue.extract(0, 0x7fffffff, charsetBuffer, (int32_t)sizeof(charsetBuffer));
+                    if(length>=(int32_t)sizeof(charsetBuffer)) {
+                        // The name does not fit, so charsetBuffer is not
+                        // NUL-terminated and must not be used as a C string.
+                        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
+                        goto exit;
+                    }
+                    charset=charsetBuffer;
+                    break;
+                }
+                pos = mAttrValue.end(2, errorCode);
+            }
+        }
+
+        // This must run whether or not an XML declaration was found:
+        // the Latin-1 converter has been closed, so without it a file that has
+        // neither a signature nor a declaration would be left with no converter.
+        if(charset==nullptr) {
+            // default to UTF-8
+            charset="UTF-8";
+        }
+        cnv=ucnv_open(charset, &errorCode);
+    }
+
+    if(U_FAILURE(errorCode)) {
+        // unable to open the converter
+        goto exit;
+    }
+
+    // convert the file contents
+    capacity=fileLength;        // estimated capacity
+    src.getBuffer(capacity);
+    src.releaseBuffer(0);       // zero length
+    flush=false;
+    for(;;) {
+        // convert contents of bytes[bytesLength]
+        pb=bytes;
+        for(;;) {
+            length=src.length();
+            buffer=toUCharPtr(src.getBuffer(capacity));
+            if(buffer==nullptr) {
+                // unexpected failure to reserve some string capacity
+                errorCode=U_MEMORY_ALLOCATION_ERROR;
+                goto exit;
+            }
+
+            pu=buffer+length;
+            ucnv_toUnicode(
+                cnv, &pu, buffer+src.getCapacity(),
+                &pb, bytes+bytesLength,
+                nullptr, flush, &errorCode);
+            if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
+                // The output written so far is valid and pb has already moved
+                // past the bytes it came from, so keep it; then grow and retry.
+                src.releaseBuffer((int32_t)(pu-buffer));
+                errorCode=U_ZERO_ERROR;
+                capacity=(3*src.getCapacity())/2; // increase capacity by 50%
+            } else {
+                src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0);
+                break;
+            }
+        }
+
+        if(U_FAILURE(errorCode)) {
+            break; // conversion error
+        }
+
+        if(flush) {
+            break; // completely converted the file
+        }
+
+        // read next block
+        bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes));
+        if(bytesLength==0) {
+            // reached end of file, convert once more to flush the converter
+            flush=true;
+        }
+    }
+
+exit:
+    ucnv_close(cnv);
+    T_FileStream_close(f);
+
+    if(U_SUCCESS(errorCode)) {
+        return parse(src, errorCode);
+    } else {
+        return nullptr;
+    }
+}
+
+//
+// parse
+//    Parses the XML document in src and returns its root element, which the
+//    caller owns. On failure the partial tree is deleted, nullptr is returned
+//    and, via error(), a message is printed and status is set.
+//
+UXMLElement *
+UXMLParser::parse(const UnicodeString &src, UErrorCode &status) {
+    if(U_FAILURE(status)) {
+        return nullptr;
+    }
+
+    UXMLElement *root = nullptr;
+    fPos = 0; // TODO use just a local pos variable and pass it into functions
+              // where necessary?
+
+    // An earlier parse() that failed inside nested elements leaves pointers to
+    // its (since deleted) elements on the stack. Drop them, otherwise closing
+    // the root of this document would pop one of them and use it.
+    while (!fElementStack.empty()) {
+        fElementStack.pop();
+    }
+
+    // set all matchers to work on the input string
+    mXMLDecl.reset(src);
+    mXMLComment.reset(src);
+    mXMLSP.reset(src);
+    mXMLDoctype.reset(src);
+    mXMLPI.reset(src);
+    mXMLElemStart.reset(src);
+    mXMLElemEnd.reset(src);
+    mXMLElemEmpty.reset(src);
+    mXMLCharData.reset(src);
+    mAttrValue.reset(src);
+    mAttrNormalizer.reset(src);
+    mNewLineNormalizer.reset(src);
+    mAmps.reset(src);
+
+    // Consume the XML Declaration, if present.
+    if (mXMLDecl.lookingAt(fPos, status)) {
+        fPos = mXMLDecl.end(status);
+    }
+
+    // Consume "misc" [XML production 27] appearing before DocType
+    parseMisc(status);
+
+    // Consume a DocType declaration, if present.
+    if (mXMLDoctype.lookingAt(fPos, status)) {
+        fPos = mXMLDoctype.end(status);
+    }
+
+    // Consume additional "misc" [XML production 27] appearing after the DocType
+    parseMisc(status);
+
+    // Get the root element
+    if (mXMLElemEmpty.lookingAt(fPos, status)) {
+        // Root is an empty element (no nested elements or content)
+        root = createElement(mXMLElemEmpty, status);
+        fPos = mXMLElemEmpty.end(status);
+    } else {
+        if (mXMLElemStart.lookingAt(fPos, status) == false) {
+            error("Root Element expected", status);
+            goto errorExit;
+        }
+        root = createElement(mXMLElemStart, status);
+        UXMLElement *el = root;
+
+        //
+        // This is the loop that consumes the root element of the document,
+        //      including all nested content.   Nested elements are handled by
+        //      explicit pushes/pops of the element stack; there is no recursion
+        //      in the control flow of this code.
+        //      "el" always refers to the current element, the one to which content
+        //      is being added.  It is above the top of the element stack.
+        for (;;) {
+            // Nested Element Start
+            if (mXMLElemStart.lookingAt(fPos, status)) {
+                UXMLElement *t = createElement(mXMLElemStart, status);
+                el->fChildren.addElement(t, status);
+                t->fParent = el;
+                fElementStack.push(el, status);
+                el = t;
+                continue;
+            }
+
+            // Text Content.  String is concatenated onto the current node's content,
+            //                but only if it contains something other than spaces.
+            UnicodeString s = scanContent(status);
+            if (s.length() > 0) {
+                mXMLSP.reset(s);
+                if (mXMLSP.matches(status) == false) {
+                    // This chunk of text contains something other than just
+                    //  white space. Make a child node for it.
+                    replaceCharRefs(s, status);
+                    el->fChildren.addElement(s.clone(), status);
+                }
+                mXMLSP.reset(src);    // The matchers need to stay set to the main input string.
+                continue;
+            }
+
+            // Comments.  Discard.
+            if (mXMLComment.lookingAt(fPos, status)) {
+                fPos = mXMLComment.end(status);
+                continue;
+            }
+
+            // PIs.  Discard.
+            if (mXMLPI.lookingAt(fPos, status)) {
+                fPos = mXMLPI.end(status);
+                continue;
+            }
+
+            // Element End
+            if (mXMLElemEnd.lookingAt(fPos, status)) {
+                fPos = mXMLElemEnd.end(0, status);
+                const UnicodeString name = mXMLElemEnd.group(1, status);
+                if (name != *el->fName) {
+                    error("Element start / end tag mismatch", status);
+                    goto errorExit;
+                }
+                if (fElementStack.empty()) {
+                    // Close of the root element.  We're done with the doc.
+                    el = nullptr;
+                    break;
+                }
+                el = (UXMLElement *)fElementStack.pop();
+                continue;
+            }
+
+            // Empty Element.  Stored as a child of the current element, but not stacked.
+            if (mXMLElemEmpty.lookingAt(fPos, status)) {
+                UXMLElement *t = createElement(mXMLElemEmpty, status);
+                el->fChildren.addElement(t, status);
+                continue;
+            }
+
+            // Hit something within the document that doesn't match anything.
+            // It's an error.
+            error("Unrecognized markup", status);
+            break;
+        }
+
+        if (el != nullptr || !fElementStack.empty()) {
+            // We bailed out early, for some reason.
+            error("Root element not closed.", status);
+            goto errorExit;
+        }
+    }
+
+    // Root Element parse is complete.
+    // Consume the annoying xml "Misc" that can appear at the end of the doc.
+    parseMisc(status);
+
+    // We should have reached the end of the input
+    if (fPos != src.length()) {
+        error("Extra content at the end of the document", status);
+        goto errorExit;
+    }
+
+    // Success!
+    return root;
+
+errorExit:
+    delete root;
+    return nullptr;
+}
+
+//
+// createElement
+// We've just matched an element start tag. Create and fill in a UXMLElement object
+// for it.
+//
+UXMLElement *
+UXMLParser::createElement(RegexMatcher &mEl, UErrorCode &status) {
+    // Capture group 1 of the tag matcher is the element name.
+    const UnicodeString *tagName = intern(mEl.group(1, status), status);
+    UXMLElement *element = new UXMLElement(this, tagName, status);
+
+    const UnicodeString plainSpace((char16_t)0x0020);
+
+    // Walk the attributes, starting right behind the tag name.
+    // Each pass through the loop handles one attribute and then continues
+    // behind its (quoted) value.
+    for (int32_t cursor = mEl.end(1, status);
+         mAttrValue.lookingAt(cursor, status);
+         cursor = mAttrValue.end(2, status)) {
+        UnicodeString name = mAttrValue.group(1, status);
+        UnicodeString value = mAttrValue.group(2, status);
+
+        // Group 2 still carries the surrounding quotes; drop the first
+        // and the last character.
+        value.remove(0, 1);
+        value.truncate(value.length() - 1);
+
+        // XML attribute value normalization, see
+        // http://www.w3.org/TR/2004/REC-xml11-20040204/#AVNormalize
+        // A non-validating parser treats all attributes as type CDATA,
+        // which keeps this simple.
+
+        // Step 1: every new-line sequence becomes a single line feed.
+        mNewLineNormalizer.reset(value);
+        value = mNewLineNormalizer.replaceAll(fOneLF, status);
+
+        // Step 2: every XML white space character becomes a plain \u0020 space.
+        mAttrNormalizer.reset(value);
+        value = mAttrNormalizer.replaceAll(plainSpace, status);
+
+        // Step 3: resolve character entities.
+        replaceCharRefs(value, status);
+
+        // Record the interned name and a heap copy of the value on the element.
+        element->fAttNames.addElement((void *)intern(name, status), status);
+        element->fAttValues.addElement(value.clone(), status);
+    }
+
+    // Continue parsing behind the complete tag.
+    fPos = mEl.end(0, status);
+    return element;
+}
+
+//
+// parseMisc
+// Consume XML "Misc" [production #27]
+// which is any combination of space, PI and comments
+// Need to watch end-of-input because xml MISC stuff is allowed after
+// the document element, so we WILL scan off the end in this function
+//
+void
+UXMLParser::parseMisc(UErrorCode &status) {
+    // Keep skipping processing instructions, white space and comments until
+    // either the input is used up or something else is found at fPos.
+    while (fPos < mXMLPI.input().length()) {
+        if (mXMLPI.lookingAt(fPos, status)) {
+            fPos = mXMLPI.end(status);
+        } else if (mXMLSP.lookingAt(fPos, status)) {
+            fPos = mXMLSP.end(status);
+        } else if (mXMLComment.lookingAt(fPos, status)) {
+            fPos = mXMLComment.end(status);
+        } else {
+            return;
+        }
+    }
+}
+
+//
+// Scan for document content.
+//
+UnicodeString
+UXMLParser::scanContent(UErrorCode &status) {
+    UnicodeString content;
+    if (!mXMLCharData.lookingAt(fPos, status)) {
+        // No character data here: report an empty string, leave fPos alone.
+        return content;
+    }
+
+    content = mXMLCharData.group((int32_t)0, status);
+
+    // New-lines are normalized before any char ref substitution takes place.
+    mNewLineNormalizer.reset(content);
+    content = mNewLineNormalizer.replaceAll(fOneLF, status);
+
+    // TODO: handle CDATA
+    fPos = mXMLCharData.end(0, status);
+    return content;
+}
+
+//
+// replaceCharRefs
+//
+// replace the char entities &lt; &amp; &#123; &#x12ab; etc. in a string
+// with the corresponding actual character.
+//
+void
+UXMLParser::replaceCharRefs(UnicodeString &s, UErrorCode &status) {
+    UnicodeString result;
+    UnicodeString replacement;   // the literal text that replaces one reference
+    UnicodeString escaped;       // "replacement", protected for appendReplacement()
+    int32_t i;
+
+    mAmps.reset(s);
+    // See the initialization for the regex matcher mAmps.
+    //    Which entity we've matched is determined by which capture group has content,
+    //    which is flagged by start() of that group not being -1.
+    while (mAmps.find()) {
+        if (mAmps.start(1, status) != -1) {
+            replacement.setTo((char16_t)x_AMP);
+        } else if (mAmps.start(2, status) != -1) {
+            replacement.setTo((char16_t)x_LT);
+        } else if (mAmps.start(3, status) != -1) {
+            replacement.setTo((char16_t)x_GT);
+        } else if (mAmps.start(4, status) != -1) {
+            replacement.setTo((char16_t)x_APOS);
+        } else if (mAmps.start(5, status) != -1) {
+            replacement.setTo((char16_t)x_QUOT);
+        } else if (mAmps.start(6, status) != -1) {
+            UnicodeString hexString = mAmps.group(6, status);
+            // Up to 8 hex digits: accumulate unsigned, a signed 32-bit value
+            // would overflow on the last shift.
+            uint32_t val = 0;
+            for (i=0; i<hexString.length(); i++) {
+                val = (val << 4) + (uint32_t)u_digit(hexString.charAt(i), 16);
+            }
+            // TODO: some verification that the character is valid
+            if (val > 0x10ffff) {
+                // Not a code point: the reference is dropped from the text.
+                replacement.remove();
+            } else {
+                replacement.setTo((UChar32)val);
+            }
+        } else if (mAmps.start(7, status) != -1) {
+            UnicodeString decimalString = mAmps.group(7, status);
+            // Up to 8 decimal digits, which always fits.
+            uint32_t val = 0;
+            for (i=0; i<decimalString.length(); i++) {
+                val = val*10 + (uint32_t)u_digit(decimalString.charAt(i), 10);
+            }
+            // TODO: some verification that the character is valid
+            if (val > 0x10ffff) {
+                // Not a code point: the reference is dropped from the text.
+                replacement.remove();
+            } else {
+                replacement.setTo((UChar32)val);
+            }
+        } else {
+            // An unrecognized &entity;  Leave it alone.
+            //  TODO:  check that it really looks like an entity, and is not some
+            //         random & in the text.
+            replacement = mAmps.group((int32_t)0, status);
+        }
+
+        // appendReplacement() interprets its replacement text: '$' starts a
+        // capture group reference and '\' escapes the next character.
+        // The replacement here is literal text, so escape those two characters.
+        escaped.remove();
+        for (i=0; i<replacement.length(); i++) {
+            char16_t c = replacement.charAt(i);
+            if (c == 0x24 /* $ */ || c == 0x5c /* \ */) {
+                escaped.append((char16_t)0x5c);
+            }
+            escaped.append(c);
+        }
+        mAmps.appendReplacement(result, escaped, status);
+    }
+    mAmps.appendTail(result);
+    s = result;
+}
+
+//
+// error
+//    Prints the message together with the 1-based number of the input line
+//    that contains the current parse position to stderr.
+//    Sets status to U_PARSE_ERROR unless it already holds a failure.
+//
+void
+UXMLParser::error(const char *message, UErrorCode &status) {
+    // TODO: something better here...
+    const UnicodeString &src=mXMLDecl.input();
+
+    // The line number is one more than the number of line feeds before fPos.
+    // Counting from index 0 includes a line feed at the very start of the
+    // input and yields line 1 (not 0) for an error at position 0.
+    int32_t limit = fPos < src.length() ? fPos : src.length();
+    int line = 1;
+    int32_t ci = src.indexOf((char16_t)0x0a, 0);
+    while (ci >= 0 && ci < limit) {
+        line++;
+        ci = src.indexOf((char16_t)0x0a, ci+1);
+    }
+    fprintf(stderr, "Error: %s at line %d\n", message, line);
+    if (U_SUCCESS(status)) {
+        status = U_PARSE_ERROR;
+    }
+}
+
+// intern strings like in Java
+
+const UnicodeString *
+UXMLParser::intern(const UnicodeString &s, UErrorCode &errorCode) {
+    // Look the name up; if it is not in the table yet, add it and look again.
+    // Either way the result is the key pointer stored in the hash table.
+    const UHashElement *entry = fNames.find(s);
+    if (entry == nullptr) {
+        fNames.puti(s, 1, errorCode);
+        entry = fNames.find(s);
+    }
+    return (const UnicodeString *)entry->key.pointer;
+}
+
+const UnicodeString *
+UXMLParser::findName(const UnicodeString &s) const {
+    // Lookup only, nothing is added: a known name yields its hashed key
+    // pointer, an unknown name yields nullptr.
+    const UHashElement *entry = fNames.find(s);
+    return entry == nullptr ? nullptr : (const UnicodeString *)entry->key.pointer;
+}
+
+// UXMLElement ------------------------------------------------------------- ***
+
+// Constructor. Stores the parser and the name pointer as given (the name is
+// not copied) and creates the attribute and child vectors with errorCode.
+// The new element has no parent yet.
+UXMLElement::UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode) :
+ fParser(parser),
+ fName(name),
+ fAttNames(errorCode),
+ fAttValues(errorCode),
+ fChildren(errorCode),
+ fParent(nullptr)
+{
+}
+
+UXMLElement::~UXMLElement() {
+    // The attribute names belong to the UXMLParser and stay untouched.
+    // Attribute values and child nodes are deleted here, last one first.
+    for (int32_t remaining = fAttValues.size(); remaining > 0; ) {
+        --remaining;
+        delete (UObject *)fAttValues.elementAt(remaining);
+    }
+    for (int32_t remaining = fChildren.size(); remaining > 0; ) {
+        --remaining;
+        delete (UObject *)fChildren.elementAt(remaining);
+    }
+}
+
+// Returns the tag name of this element, i.e. the string that fName points to.
+const UnicodeString &
+UXMLElement::getTagName() const {
+ return *fName;
+}
+
+// Returns the text of this element as a new string.
+// The work is done by appendText() on an initially empty string;
+// "recurse" is passed through to it unchanged.
+UnicodeString
+UXMLElement::getText(UBool recurse) const {
+ UnicodeString text;
+ appendText(text, recurse);
+ return text;
+}
+
+void
+UXMLElement::appendText(UnicodeString &text, UBool recurse) const {
+    // Visit the children in order. Text children are appended directly;
+    // all other children are elements and are only descended into on request.
+    const int32_t count = fChildren.size();
+    for (int32_t index = 0; index < count; ++index) {
+        const UObject *child = (const UObject *)fChildren.elementAt(index);
+        const UnicodeString *str = dynamic_cast<const UnicodeString *>(child);
+        if (str != nullptr) {
+            text.append(*str);
+            continue;
+        }
+        if (recurse) {
+            // not a string, so it must be a UXMLElement
+            ((const UXMLElement *)child)->appendText(text, recurse);
+        }
+    }
+}
+
+// Returns the number of attributes, i.e. the size of the attribute name vector.
+int32_t
+UXMLElement::countAttributes() const {
+ return fAttNames.size();
+}
+
+const UnicodeString *
+UXMLElement::getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const {
+    // An index outside of the attribute list yields nullptr and leaves
+    // both output strings untouched.
+    if (i < 0 || fAttNames.size() <= i) {
+        return nullptr;
+    }
+
+    // Copy name and value of the i-th attribute into the output strings.
+    const UnicodeString *storedName = (const UnicodeString *)fAttNames.elementAt(i);
+    const UnicodeString *storedValue = (const UnicodeString *)fAttValues.elementAt(i);
+    name.setTo(*storedName);
+    value.setTo(*storedValue);
+    return &value;  // or return (UnicodeString *)fAttValues.elementAt(i);
+}
+
+const UnicodeString *
+UXMLElement::getAttribute(const UnicodeString &name) const {
+    // Attribute names are interned, so it is enough to compare pointers
+    // instead of string contents.
+    const UnicodeString *interned = fParser->findName(name);
+    if (interned == nullptr) {
+        // The parser has never seen this name at all.
+        return nullptr;
+    }
+
+    const int32_t count = fAttNames.size();
+    int32_t index = 0;
+    while (index < count && interned != (const UnicodeString *)fAttNames.elementAt(index)) {
+        ++index;
+    }
+    if (index == count) {
+        return nullptr;
+    }
+    return (const UnicodeString *)fAttValues.elementAt(index);
+}
+
+// Returns the number of child nodes, i.e. the size of the children vector.
+int32_t
+UXMLElement::countChildren() const {
+ return fChildren.size();
+}
+
+const UObject *
+UXMLElement::getChild(int32_t i, UXMLNodeType &type) const {
+    // An index outside of the child list yields nullptr and leaves "type" untouched.
+    if (i < 0 || fChildren.size() <= i) {
+        return nullptr;
+    }
+
+    // A child that is not a UXMLElement is reported as a string node.
+    const UObject *child = (const UObject *)fChildren.elementAt(i);
+    const bool isElement = dynamic_cast<const UXMLElement *>(child) != nullptr;
+    type = isElement ? UXML_NODE_TYPE_ELEMENT : UXML_NODE_TYPE_STRING;
+    return child;
+}
+
+const UXMLElement *
+UXMLElement::nextChildElement(int32_t &i) const {
+    if (i < 0) {
+        return nullptr;
+    }
+
+    // Step through the children from index i on. i is advanced past every
+    // child that is looked at, so on return it is the index behind the
+    // element that was found, or the child count if there was none.
+    for (const int32_t count = fChildren.size(); i < count; ) {
+        const UObject *child = (const UObject *)fChildren.elementAt(i++);
+        const UXMLElement *found = dynamic_cast<const UXMLElement *>(child);
+        if (found != nullptr) {
+            return found;
+        }
+    }
+    return nullptr;
+}
+
+const UXMLElement *
+UXMLElement::getChildElement(const UnicodeString &name) const {
+    // Element names are interned, so it is enough to compare pointers
+    // instead of string contents.
+    const UnicodeString *interned = fParser->findName(name);
+    if (interned == nullptr) {
+        // The parser has never seen this name at all.
+        return nullptr;
+    }
+
+    // Return the first child that is an element with that very name.
+    const int32_t count = fChildren.size();
+    for (int32_t index = 0; index < count; ++index) {
+        const UObject *child = (const UObject *)fChildren.elementAt(index);
+        const UXMLElement *candidate = dynamic_cast<const UXMLElement *>(child);
+        if (candidate != nullptr && candidate->fName == interned) {
+            return candidate;
+        }
+    }
+    return nullptr;
+}
+
+U_NAMESPACE_END
+
+#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION */
+
diff --git a/intl/icu/source/tools/toolutil/xmlparser.h b/intl/icu/source/tools/toolutil/xmlparser.h
new file mode 100644
index 0000000000..d0dcd9a48a
--- /dev/null
+++ b/intl/icu/source/tools/toolutil/xmlparser.h
@@ -0,0 +1,247 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/*
+*******************************************************************************
+*
+* Copyright (C) 2004-2005, International Business Machines
+* Corporation and others. All Rights Reserved.
+*
+*******************************************************************************
+* file name: xmlparser.h
+* encoding: UTF-8
+* tab size: 8 (not used)
+* indentation:4
+*
+* created on: 2004jul21
+* created by: Andy Heninger
+*
+* Tiny XML parser using ICU and intended for use in ICU tests and in build tools.
+* Not suitable for production use. Not supported.
+* Not conformant. Not efficient.
+* But very small.
+*/
+
+#ifndef __XMLPARSER_H__
+#define __XMLPARSER_H__
+
+#include "unicode/uobject.h"
+#include "unicode/unistr.h"
+#include "unicode/regex.h"
+#include "uvector.h"
+#include "hash.h"
+
+#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION
+
+enum UXMLNodeType {
+ /** Node type string (text contents), stored as a UnicodeString. */
+ UXML_NODE_TYPE_STRING,
+ /** Node type element, stored as a UXMLElement. */
+ UXML_NODE_TYPE_ELEMENT,
+ UXML_NODE_TYPE_COUNT /**< Number of node types above; not a node type itself. */
+};
+
+U_NAMESPACE_BEGIN
+
+class UXMLParser;
+
+/**
+ * This class represents an element node in a parsed XML tree.
+ */
+class U_TOOLUTIL_API UXMLElement : public UObject {
+public:
+ /**
+ * Destructor.
+ */
+ virtual ~UXMLElement();
+
+ /**
+ * Get the tag name of this element.
+ */
+ const UnicodeString &getTagName() const;
+ /**
+ * Get the text contents of the element.
+ * Append the contents of all text child nodes.
+ * @param recurse If true, also recursively appends the contents of all
+ * text child nodes of element children.
+ * @return The text contents.
+ */
+ UnicodeString getText(UBool recurse) const;
+ /**
+ * Get the number of attributes.
+ */
+ int32_t countAttributes() const;
+ /**
+ * Get the i-th attribute.
+ * @param i Index of the attribute.
+ * @param name Output parameter, receives the attribute name.
+ * @param value Output parameter, receives the attribute value.
+ * @return A pointer to the attribute value (may be &value or a pointer to an
+ * internal string object), or nullptr if i is out of bounds.
+ */
+ const UnicodeString *getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const;
+ /**
+ * Get the value of the attribute with the given name.
+ * @param name Attribute name to be looked up.
+ * @return A pointer to the attribute value, or nullptr if this element
+ * does not have this attribute.
+ */
+ const UnicodeString *getAttribute(const UnicodeString &name) const;
+ /**
+ * Get the number of child nodes.
+ */
+ int32_t countChildren() const;
+ /**
+ * Get the i-th child node.
+ * @param i Index of the child node.
+ * @param type Output parameter, receives the child node type (unchanged if i is out of bounds).
+ * @return A pointer to the child node object, or nullptr if i is out of bounds.
+ */
+ const UObject *getChild(int32_t i, UXMLNodeType &type) const;
+ /**
+ * Get the next child element node, skipping non-element child nodes.
+ * @param i Enumeration index, advanced by each call; initialize to 0 before getting the first child element.
+ * @return A pointer to the next child element, or nullptr if there is none.
+ */
+ const UXMLElement *nextChildElement(int32_t &i) const;
+ /**
+ * Get the immediate child element with the given name.
+ * If there are multiple child elements with this name, then return
+ * the first one.
+ * @param name Element name to be looked up.
+ * @return A pointer to the element node, or nullptr if this element
+ * does not have this immediate child element.
+ */
+ const UXMLElement *getChildElement(const UnicodeString &name) const;
+
+ /**
+ * ICU "poor man's RTTI", returns a UClassID for the actual class.
+ */
+ virtual UClassID getDynamicClassID() const override;
+
+ /**
+ * ICU "poor man's RTTI", returns a UClassID for this class.
+ */
+ static UClassID U_EXPORT2 getStaticClassID();
+
+private:
+ // prevent default construction etc.
+ UXMLElement();
+ UXMLElement(const UXMLElement &other);
+ UXMLElement &operator=(const UXMLElement &other);
+
+ void appendText(UnicodeString &text, UBool recurse) const;
+
+ friend class UXMLParser;
+
+ UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode);
+
+ const UXMLParser *fParser; // Parser queried (via findName) to resolve names to interned strings
+ const UnicodeString *fName; // The tag name of this element (owned by the UXMLParser)
+ UnicodeString fContent; // The text content of this node. All element content is
+ // concatenated even when there are intervening nested elements
+ // (which doesn't happen with most xml files we care about)
+ // Sections of content containing only white space are dropped,
+ // which gets rid of the bogus white space content from
+ // elements which are primarily containers for nested elements.
+ UVector fAttNames; // A vector containing the names of this element's attributes
+ // The names are UnicodeString objects, owned by the UXMLParser.
+ UVector fAttValues; // A vector containing the attribute values for
+ // this element's attributes. The order is the same
+ // as that of the attribute name vector.
+
+ UVector fChildren; // The child nodes of this element (a Vector)
+
+ UXMLElement *fParent; // A pointer to the parent element of this element.
+};
+
+/**
+ * A simple XML parser; it is neither efficient nor conformant and only useful for
+ * restricted types of XML documents.
+ *
+ * The parse methods parse whole documents and return the parse trees via their
+ * root elements.
+ */
+class U_TOOLUTIL_API UXMLParser : public UObject {
+public:
+ /**
+ * Create an XML parser.
+ */
+ static UXMLParser *createParser(UErrorCode &errorCode);
+ /**
+ * Destructor.
+ */
+ virtual ~UXMLParser();
+
+ /**
+ * Parse an XML document, create the entire document tree, and
+ * return a pointer to the root element of the parsed tree.
+ * The caller must delete the element.
+ */
+ UXMLElement *parse(const UnicodeString &src, UErrorCode &errorCode);
+ /**
+ * Parse an XML file, create the entire document tree, and
+ * return a pointer to the root element of the parsed tree.
+ * The caller must delete the element.
+ */
+ UXMLElement *parseFile(const char *filename, UErrorCode &errorCode);
+
+ /**
+ * ICU "poor man's RTTI", returns a UClassID for the actual class.
+ */
+ virtual UClassID getDynamicClassID() const override;
+
+ /**
+ * ICU "poor man's RTTI", returns a UClassID for this class.
+ */
+ static UClassID U_EXPORT2 getStaticClassID();
+
+private:
+ // prevent default construction etc.
+ UXMLParser();
+ UXMLParser(const UXMLParser &other);
+ UXMLParser &operator=(const UXMLParser &other);
+
+ // constructor (private; public creation goes through createParser())
+ UXMLParser(UErrorCode &status);
+
+ void parseMisc(UErrorCode &status);
+ UXMLElement *createElement(RegexMatcher &mEl, UErrorCode &status);
+ void error(const char *message, UErrorCode &status);
+ UnicodeString scanContent(UErrorCode &status);
+ void replaceCharRefs(UnicodeString &s, UErrorCode &status);
+
+ const UnicodeString *intern(const UnicodeString &s, UErrorCode &errorCode);
+public:
+ // public for UXMLElement only
+ const UnicodeString *findName(const UnicodeString &s) const;
+private:
+
+ // There is one ICU regex matcher for each of the major XML syntax items
+ // that are recognized.
+ RegexMatcher mXMLDecl;
+ RegexMatcher mXMLComment;
+ RegexMatcher mXMLSP;
+ RegexMatcher mXMLDoctype;
+ RegexMatcher mXMLPI;
+ RegexMatcher mXMLElemStart;
+ RegexMatcher mXMLElemEnd;
+ RegexMatcher mXMLElemEmpty;
+ RegexMatcher mXMLCharData;
+ RegexMatcher mAttrValue;
+ RegexMatcher mAttrNormalizer;
+ RegexMatcher mNewLineNormalizer;
+ RegexMatcher mAmps;
+
+ Hashtable fNames; // interned element/attribute name strings
+ UStack fElementStack; // Stack holds the parent elements when nested
+ // elements are being parsed. All items on this
+ // stack are of type UXMLElement.
+ int32_t fPos; // String index of the current scan position in
+ // xml source. NOTE(review): this used to say "in fSrc", but no fSrc member is declared in this class.
+ UnicodeString fOneLF; // presumably a single line-feed string used for newline normalization -- TODO confirm
+};
+
+U_NAMESPACE_END
+#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION */
+
+#endif