diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 17:32:43 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 17:32:43 +0000 |
commit | 6bf0a5cb5034a7e684dcc3500e841785237ce2dd (patch) | |
tree | a68f146d7fa01f0134297619fbe7e33db084e0aa /intl/icu/source/tools/genrb | |
parent | Initial commit. (diff) | |
download | thunderbird-upstream.tar.xz thunderbird-upstream.zip |
Adding upstream version 1:115.7.0.upstream/1%115.7.0upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'intl/icu/source/tools/genrb')
31 files changed, 11300 insertions, 0 deletions
diff --git a/intl/icu/source/tools/genrb/Makefile.in b/intl/icu/source/tools/genrb/Makefile.in new file mode 100644 index 0000000000..336d839448 --- /dev/null +++ b/intl/icu/source/tools/genrb/Makefile.in @@ -0,0 +1,114 @@ +################################################################################# +## Makefile.in for ICU - tools/genrb # +## Copyright (C) 2016 and later: Unicode, Inc. and others. # +## License & terms of use: http://www.unicode.org/copyright.html # +## Copyright (c) 1999-2014, International Business Machines Corporation and # +## others. All Rights Reserved. # +################################################################################# + +## Source directory information +srcdir = @srcdir@ +top_srcdir = @top_srcdir@ + +top_builddir = ../.. + +include $(top_builddir)/icudefs.mk + +## Build directory information +subdir = tools/genrb + +TARGET_STUB_NAME = genrb +DERB_STUB_NAME = derb + +SECTION = 1 + +MAN_FILES = $(TARGET_STUB_NAME).$(SECTION) +@ICUIO_TRUE@MAN_FILES += $(DERB_STUB_NAME).$(SECTION) + +## Extra files to remove for 'make clean' +CLEANFILES = *~ $(MAN_FILES) $(DEPS) $(DERB_DEPS) + +## Target information +TARGET = $(BINDIR)/$(TARGET_STUB_NAME)$(EXEEXT) +# derb depends on icuio +@ICUIO_TRUE@DERB = $(BINDIR)/$(DERB_STUB_NAME)$(EXEEXT) + +CPPFLAGS += -I$(srcdir) -I$(top_srcdir)/common -I$(top_srcdir)/i18n -I$(srcdir)/../toolutil -I$(top_srcdir)/io +CPPFLAGS += -DUNISTR_FROM_CHAR_EXPLICIT=explicit -DUNISTR_FROM_STRING_EXPLICIT=explicit +LIBS = $(LIBICUTOOLUTIL) $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS) $(LIB_M) + +SOURCES = $(shell cat $(srcdir)/sources.txt) +OBJECTS = $(patsubst %.cpp,%.o,$(patsubst %.c,%.o, $(SOURCES))) +DERB_SOURCES = derb.cpp +DERB_OBJ = $(DERB_SOURCES:.cpp=.o) + +DEPS = $(OBJECTS:.o=.d) +DERB_DEPS = $(DERB_OBJ:.o=.d) + +-include Makefile.local + +## List of phony targets +.PHONY : all all-local install install-local clean clean-local \ +distclean distclean-local dist dist-local check check-local install-man + 
+## Clear suffix list +.SUFFIXES : + +## List of standard targets +all: all-local +install: install-local +clean: clean-local +distclean : distclean-local +dist: dist-local +check: all check-local + +all-local: $(TARGET) $(DERB) $(MAN_FILES) + +install-local: all-local install-man + $(MKINSTALLDIRS) $(DESTDIR)$(bindir) + $(INSTALL) $(TARGET) $(DESTDIR)$(bindir) +@ICUIO_TRUE@ $(INSTALL) $(DERB) $(DESTDIR)$(bindir) + +install-man: $(MAN_FILES) + $(MKINSTALLDIRS) $(DESTDIR)$(mandir)/man$(SECTION) + $(INSTALL_DATA) $? $(DESTDIR)$(mandir)/man$(SECTION) + +dist-local: + +clean-local: + test -z "$(CLEANFILES)" || $(RMV) $(CLEANFILES) + $(RMV) $(TARGET) $(DERB) $(OBJECTS) $(DERB_OBJ) + +distclean-local: clean-local + $(RMV) Makefile + +check-local: all-local + +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + cd $(top_builddir) \ + && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status + +$(TARGET) : $(OBJECTS) + $(LINK.cc) $(OUTOPT)$@ $^ $(LIBS) + $(POST_BUILD_STEP) + +$(DERB) : $(DERB_OBJ) + $(LINK.cc) $(OUTOPT)$@ $^ $(LIBICUIO) $(LIBS) + $(POST_BUILD_STEP) + +# This line is needed to serialize builds when the gmake -j option is used. +$(TARGET_STUB_NAME).$(SECTION): $(DERB_STUB_NAME).$(SECTION) + +%.$(SECTION): $(srcdir)/%.$(SECTION).in + cd $(top_builddir) \ + && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status + + +ifeq (,$(MAKECMDGOALS)) +-include $(DEPS) +-include $(DERB_DEPS) +else +ifneq ($(patsubst %clean,,$(MAKECMDGOALS)),) +-include $(DEPS) +endif +endif diff --git a/intl/icu/source/tools/genrb/derb.1.in b/intl/icu/source/tools/genrb/derb.1.in new file mode 100644 index 0000000000..725b571ce2 --- /dev/null +++ b/intl/icu/source/tools/genrb/derb.1.in @@ -0,0 +1,198 @@ +.\" Hey, Emacs! This is -*-nroff-*- you know... +.\" +.\" derb.1: manual page for the derb utility +.\" +.\" Copyright (C) 2016 and later: Unicode, Inc. and others. 
+.\" License & terms of use: http://www.unicode.org/copyright.html +.\" Copyright (C) 2000-2014 IBM, Inc. and others. +.\" +.TH DERB 1 "7 Mar 2014" "ICU MANPAGE" "ICU @VERSION@ Manual" +.SH NAME +.B derb +\- disassemble a resource bundle +.SH SYNOPSIS +.B derb +[ +.BR "\-h\fP, \fB\-?\fP, \fB\-\-help" +] +[ +.BR "\-V\fP, \fB\-\-version" +] +[ +.BR "\-v\fP, \fB\-\-verbose" +] +[ +.BI "\-e\fP, \fB\-\-encoding" " encoding" +] +[ +.BI "\-\-bom" +] +[ +.BI "\-t\fP, \fB\-\-truncate" " \fR[ \fPsize\fR ]\fP" +] +[ +.BI "\-s\fP, \fB\-\-sourcedir" " source" +] +[ +.BI "\-d\fP, \fB\-\-destdir" " destination" +] +[ +.BI "\-i\fP, \fB\-\-icudatadir" " directory" +] +[ +.BI "\-c\fP, \fB\-\-to\-stdout" +] +.IR bundle " \.\.\." +.SH DESCRIPTION +.B derb +reads the compiled resource +.I bundle +files passed on the command line and writes them back in text form. +The resulting text files have a +.B .txt +extension while compiled resource bundle source files typically have a +.B .res +extension. +.PP +It is customary to name the resource bundles by their locale name, +i.e. to use a locale identifier for the +.I bundle +filename, e.g. +.B ja_JP.res +for Japanese (Japan) data, or +.B root.res +for the root bundle. +This is especially important for +.B derb +since the locale name is not accessible directly from the compiled +resource bundle, and to know which locale to ask for when opening +the bundle. +.B derb +will produce a file whose base name is the base name of the compiled resource file itself. +If the +.BI "\-\-to\-stdout\fP, \fB\-c\fP" +option is used, however, the text will be written on the standard output. +.SH OPTIONS +.TP +.BR "\-h\fP, \fB\-?\fP, \fB\-\-help" +Print help about usage and exit. +.TP +.BR "\-V\fP, \fB\-\-version" +Print the version of +.B derb +and exit. +.TP +.BR "\-v\fP, \fB\-\-verbose" +Display extra informative messages during execution. +.TP +.BR "\-A\fP, \fB\-\-suppressAliases" +Don't follow aliases when producing output. 
+.TP +.BI "\-e\fP, \fB\-\-encoding" " encoding" +Set the encoding used to write output files to +.IR encoding . +The default encoding is the invariant (subset of ASCII or EBCDIC) +codepage for the system (see section +.BR "INVARIANT CHARACTERS" ). +The choice of the encoding does not affect the data, just their +representation. Characters that cannot be represented in the +.I encoding +will be represented using +.BI \eu "hhhh" +escape sequences. +.TP +.BI "\-\-bom" +Write a byte order mark (BOM) at the beginning of the file. +.TP +.BI "\-l\fP, \fB\-\-locale" " locale" +Set the +.I locale +for the resource bundle, which is used both in the generated text and +as the base name of the output file. +.TP +.BI "\-t\fP, \fB\-\-truncate" " \fR[ \fPsize\fR ]\fP" +Truncate individual resources (strings or binary data) to +.I size +bytes. The default if +.I size +is not specified is +.B 80 +bytes. +.TP +.BI "\-s\fP, \fB\-\-sourcedir" " source" +Set the source directory to +.IR source . +The default source directory is the current directory. +If +.B - +is passed for +.IR source , +then the +.I bundle +will be looked for in its default location, specified by +the +.B ICU_DATA +environment variable (or defaulting to +the location set when ICU was built if +.B ICU_DATA +is not set). +.TP +.BI "\-d\fP, \fB\-\-destdir" " destination" +Set the destination directory to +.IR destination . +The default destination directory is the current directory. +.TP +.BI "\-i\fP, \fB\-\-icudatadir" " directory" +Look for any necessary ICU data files in +.IR directory . +For example, when processing collation overrides, the file +.B ucadata.dat +must be located. +The default ICU data directory is specified by the environment variable +.BR ICU_DATA . +.TP +.BI "\-c\fP, \fB\-\-to\-stdout" +Write the disassembled +.I bundle +on standard output instead of into a file. 
+.SH CAVEATS +When the option +.BI \-\-bom +is used, the character +.B U+FEFF +is written in the destination +.I encoding +regardless of whether it is a Unicode transformation format (UTF) or not. +This option should only be used with a UTF encoding, as byte order marks +are not meaningful for other encodings. +.SH INVARIANT CHARACTERS +The +.B invariant character set +consists of the following set of characters, expressed as a standard POSIX +regular expression: +.BR "[a-z]|[A-Z]|[0-9]|_| |+|-|*|/" . +This is the set which is guaranteed to be available regardless of code page. +.SH ENVIRONMENT +.TP 10 +.B ICU_DATA +Specifies the directory containing ICU data. Defaults to +.BR @thepkgicudatadir@/@PACKAGE@/@VERSION@/ . +Some tools in ICU depend on the presence of the trailing slash. It is thus +important to make sure that it is present if +.B ICU_DATA +is set. +.SH AUTHORS +Vladimir Weinstein +.br +Yves Arrouye +.SH VERSION +1.0 +.SH COPYRIGHT +Copyright (C) 2002 IBM, Inc. and others. +.SH SEE ALSO +.BR genrb (1) + diff --git a/intl/icu/source/tools/genrb/derb.cpp b/intl/icu/source/tools/genrb/derb.cpp new file mode 100644 index 0000000000..3b28289569 --- /dev/null +++ b/intl/icu/source/tools/genrb/derb.cpp @@ -0,0 +1,657 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 1999-2016, International Business Machines +* Corporation and others. All Rights Reserved. 
+* +******************************************************************************* +* file name: derb.cpp +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2000sep6 +* created by: Vladimir Weinstein as an ICU workshop example +* maintained by: Yves Arrouye <yves@realnames.com> +*/ + +#include "unicode/stringpiece.h" +#include "unicode/ucnv.h" +#include "unicode/unistr.h" +#include "unicode/ustring.h" +#include "unicode/putil.h" +#include "unicode/ustdio.h" + +#include "charstr.h" +#include "uresimp.h" +#include "cmemory.h" +#include "cstring.h" +#include "uoptions.h" +#include "toolutil.h" +#include "ustrfmt.h" + +#if !UCONFIG_NO_FORMATTING + +#define DERB_VERSION "1.1" + +#define DERB_DEFAULT_TRUNC 80 + +static const int32_t indentsize = 4; +static int32_t truncsize = DERB_DEFAULT_TRUNC; +static UBool opt_truncate = false; + +static const char *getEncodingName(const char *encoding); +static void reportError(const char *pname, UErrorCode *status, const char *when); +static char16_t *quotedString(const char16_t *string); +static void printOutBundle(UFILE *out, UResourceBundle *resource, int32_t indent, const char *pname, UErrorCode *status); +static void printString(UFILE *out, const char16_t *str, int32_t len); +static void printCString(UFILE *out, const char *str, int32_t len); +static void printIndent(UFILE *out, int32_t indent); +static void printHex(UFILE *out, uint8_t what); + +static UOption options[]={ + UOPTION_HELP_H, + UOPTION_HELP_QUESTION_MARK, +/* 2 */ UOPTION_ENCODING, +/* 3 */ { "to-stdout", nullptr, nullptr, nullptr, 'c', UOPT_NO_ARG, 0 } , +/* 4 */ { "truncate", nullptr, nullptr, nullptr, 't', UOPT_OPTIONAL_ARG, 0 }, +/* 5 */ UOPTION_VERBOSE, +/* 6 */ UOPTION_DESTDIR, +/* 7 */ UOPTION_SOURCEDIR, +/* 8 */ { "bom", nullptr, nullptr, nullptr, 0, UOPT_NO_ARG, 0 }, +/* 9 */ UOPTION_ICUDATADIR, +/* 10 */ UOPTION_VERSION, +/* 11 */ { "suppressAliases", nullptr, nullptr, nullptr, 'A', UOPT_NO_ARG, 0 }, +}; + +static UBool 
verbose = false; +static UBool suppressAliases = false; +static UFILE *ustderr = nullptr; + +extern int +main(int argc, char* argv[]) { + const char *encoding = nullptr; + const char *outputDir = nullptr; /* nullptr = no output directory, use current */ + const char *inputDir = "."; + int tostdout = 0; + int prbom = 0; + + const char *pname; + + UResourceBundle *bundle = nullptr; + int32_t i = 0; + + const char* arg; + + /* Get the name of tool. */ + pname = uprv_strrchr(*argv, U_FILE_SEP_CHAR); +#if U_FILE_SEP_CHAR != U_FILE_ALT_SEP_CHAR + if (!pname) { + pname = uprv_strrchr(*argv, U_FILE_ALT_SEP_CHAR); + } +#endif + if (!pname) { + pname = *argv; + } else { + ++pname; + } + + /* error handling, printing usage message */ + argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options); + + /* error handling, printing usage message */ + if(argc<0) { + fprintf(stderr, + "%s: error in command line argument \"%s\"\n", pname, + argv[-argc]); + } + if(argc<0 || options[0].doesOccur || options[1].doesOccur) { + fprintf(argc < 0 ? stderr : stdout, + "%csage: %s [ -h, -?, --help ] [ -V, --version ]\n" + " [ -v, --verbose ] [ -e, --encoding encoding ] [ --bom ]\n" + " [ -t, --truncate [ size ] ]\n" + " [ -s, --sourcedir source ] [ -d, --destdir destination ]\n" + " [ -i, --icudatadir directory ] [ -c, --to-stdout ]\n" + " [ -A, --suppressAliases]\n" + " bundle ...\n", argc < 0 ? 'u' : 'U', + pname); + return argc<0 ? 
U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; + } + + if(options[10].doesOccur) { + fprintf(stderr, + "%s version %s (ICU version %s).\n" + "%s\n", + pname, DERB_VERSION, U_ICU_VERSION, U_COPYRIGHT_STRING); + return U_ZERO_ERROR; + } + if(options[2].doesOccur) { + encoding = options[2].value; + } + + if (options[3].doesOccur) { + if(options[2].doesOccur) { + fprintf(stderr, "%s: Error: don't specify an encoding (-e) when writing to stdout (-c).\n", pname); + return 3; + } + tostdout = 1; + } + + if(options[4].doesOccur) { + opt_truncate = true; + if(options[4].value != nullptr) { + truncsize = atoi(options[4].value); /* user defined printable size */ + } else { + truncsize = DERB_DEFAULT_TRUNC; /* we'll use default omitting size */ + } + } else { + opt_truncate = false; + } + + if(options[5].doesOccur) { + verbose = true; + } + + if (options[6].doesOccur) { + outputDir = options[6].value; + } + + if(options[7].doesOccur) { + inputDir = options[7].value; /* we'll use users resources */ + } + + if (options[8].doesOccur) { + prbom = 1; + } + + if (options[9].doesOccur) { + u_setDataDirectory(options[9].value); + } + + if (options[11].doesOccur) { + suppressAliases = true; + } + + fflush(stderr); // use ustderr now. 
+ ustderr = u_finit(stderr, nullptr, nullptr); + + for (i = 1; i < argc; ++i) { + static const char16_t sp[] = { 0x0020 }; /* " " */ + + arg = getLongPathname(argv[i]); + + if (verbose) { + u_fprintf(ustderr, "processing bundle \"%s\"\n", argv[i]); + } + + icu::CharString locale; + UErrorCode status = U_ZERO_ERROR; + { + const char *p = findBasename(arg); + const char *q = uprv_strrchr(p, '.'); + if (q == nullptr) { + locale.append(p, status); + } else { + locale.append(p, (int32_t)(q - p), status); + } + } + if (U_FAILURE(status)) { + return status; + } + + icu::CharString infile; + const char *thename = nullptr; + UBool fromICUData = !uprv_strcmp(inputDir, "-"); + if (!fromICUData) { + UBool absfilename = *arg == U_FILE_SEP_CHAR; +#if U_PLATFORM_HAS_WIN32_API + if (!absfilename) { + absfilename = (uprv_strlen(arg) > 2 && isalpha(arg[0]) + && arg[1] == ':' && arg[2] == U_FILE_SEP_CHAR); + } +#endif + if (absfilename) { + thename = arg; + } else { + const char *q = uprv_strrchr(arg, U_FILE_SEP_CHAR); +#if U_FILE_SEP_CHAR != U_FILE_ALT_SEP_CHAR + if (q == nullptr) { + q = uprv_strrchr(arg, U_FILE_ALT_SEP_CHAR); + } +#endif + infile.append(inputDir, status); + if(q != nullptr) { + infile.appendPathPart(icu::StringPiece(arg, (int32_t)(q - arg)), status); + } + if (U_FAILURE(status)) { + return status; + } + thename = infile.data(); + } + } + if (thename) { + bundle = ures_openDirect(thename, locale.data(), &status); + } else { + bundle = ures_open(fromICUData ? 
0 : inputDir, locale.data(), &status); + } + if (U_SUCCESS(status)) { + UFILE *out = nullptr; + + const char *filename = 0; + const char *ext = 0; + + if (locale.isEmpty() || !tostdout) { + filename = findBasename(arg); + ext = uprv_strrchr(filename, '.'); + if (!ext) { + ext = uprv_strchr(filename, 0); + } + } + + if (tostdout) { + out = u_get_stdout(); + } else { + icu::CharString thefile; + if (outputDir) { + thefile.append(outputDir, status); + } + thefile.appendPathPart(filename, status); + if (*ext) { + thefile.truncate(thefile.length() - (int32_t)uprv_strlen(ext)); + } + thefile.append(".txt", status); + if (U_FAILURE(status)) { + return status; + } + + out = u_fopen(thefile.data(), "w", nullptr, encoding); + if (!out) { + u_fprintf(ustderr, "%s: couldn't create %s\n", pname, thefile.data()); + u_fclose(ustderr); + return 4; + } + } + + // now, set the callback. + ucnv_setFromUCallBack(u_fgetConverter(out), UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_C, 0, 0, &status); + if (U_FAILURE(status)) { + u_fprintf(ustderr, "%s: couldn't configure converter for encoding\n", pname); + u_fclose(ustderr); + if(!tostdout) { + u_fclose(out); + } + return 3; + } + + if (prbom) { /* XXX: Should be done only for UTFs */ + u_fputc(0xFEFF, out); + } + u_fprintf(out, "// -*- Coding: %s; -*-\n//\n", encoding ? 
encoding : getEncodingName(ucnv_getDefaultName())); + u_fprintf(out, "// This file was dumped by derb(8) from "); + if (thename) { + u_fprintf(out, "%s", thename); + } else if (fromICUData) { + u_fprintf(out, "the ICU internal %s locale", locale.data()); + } + + u_fprintf(out, "\n// derb(8) by Vladimir Weinstein and Yves Arrouye\n\n"); + + if (!locale.isEmpty()) { + u_fprintf(out, "%s", locale.data()); + } else { + u_fprintf(out, "%.*s%.*S", (int32_t)(ext - filename), filename, UPRV_LENGTHOF(sp), sp); + } + printOutBundle(out, bundle, 0, pname, &status); + + if (!tostdout) { + u_fclose(out); + } + } + else { + reportError(pname, &status, "opening resource file"); + } + + ures_close(bundle); + } + + return 0; +} + +static char16_t *quotedString(const char16_t *string) { + int len = u_strlen(string); + int alen = len; + const char16_t *sp; + char16_t *newstr, *np; + + for (sp = string; *sp; ++sp) { + switch (*sp) { + case '\n': + case 0x0022: + ++alen; + break; + } + } + + newstr = (char16_t *) uprv_malloc((1 + alen) * U_SIZEOF_UCHAR); + for (sp = string, np = newstr; *sp; ++sp) { + switch (*sp) { + case '\n': + *np++ = 0x005C; + *np++ = 0x006E; + break; + + case 0x0022: + *np++ = 0x005C; + U_FALLTHROUGH; + default: + *np++ = *sp; + break; + } + } + *np = 0; + + return newstr; +} + + +static void printString(UFILE *out, const char16_t *str, int32_t len) { + u_file_write(str, len, out); +} + +static void printCString(UFILE *out, const char *str, int32_t len) { + if(len==-1) { + u_fprintf(out, "%s", str); + } else { + u_fprintf(out, "%.*s", len, str); + } +} + +static void printIndent(UFILE *out, int32_t indent) { + icu::UnicodeString inchar(indent, 0x20, indent); + printString(out, inchar.getBuffer(), indent); +} + +static void printHex(UFILE *out, uint8_t what) { + static const char map[] = "0123456789ABCDEF"; + char16_t hex[2]; + + hex[0] = map[what >> 4]; + hex[1] = map[what & 0xf]; + + printString(out, hex, 2); +} + +static void printOutAlias(UFILE *out, 
UResourceBundle *parent, Resource r, const char *key, int32_t indent, const char *pname, UErrorCode *status) { + static const char16_t cr[] = { 0xA }; // LF + int32_t len = 0; + const char16_t* thestr = res_getAlias(&(parent->getResData()), r, &len); + char16_t *string = quotedString(thestr); + if(opt_truncate && len > truncsize) { + char msg[128]; + printIndent(out, indent); + snprintf(msg, sizeof(msg), "// WARNING: this resource, size %li is truncated to %li\n", + (long)len, (long)truncsize/2); + printCString(out, msg, -1); + len = truncsize; + } + if(U_SUCCESS(*status)) { + static const char16_t openStr[] = { 0x003A, 0x0061, 0x006C, 0x0069, 0x0061, 0x0073, 0x0020, 0x007B, 0x0020, 0x0022 }; /* ":alias { \"" */ + static const char16_t closeStr[] = { 0x0022, 0x0020, 0x007D, 0x0020 }; /* "\" } " */ + printIndent(out, indent); + if(key != nullptr) { + printCString(out, key, -1); + } + printString(out, openStr, UPRV_LENGTHOF(openStr)); + printString(out, string, len); + printString(out, closeStr, UPRV_LENGTHOF(closeStr)); + if(verbose) { + printCString(out, " // ALIAS", -1); + } + printString(out, cr, UPRV_LENGTHOF(cr)); + } else { + reportError(pname, status, "getting binary value"); + } + uprv_free(string); +} + +static void printOutBundle(UFILE *out, UResourceBundle *resource, int32_t indent, const char *pname, UErrorCode *status) +{ + static const char16_t cr[] = { 0xA }; // LF + +/* int32_t noOfElements = ures_getSize(resource);*/ + int32_t i = 0; + const char *key = ures_getKey(resource); + + switch(ures_getType(resource)) { + case URES_STRING : + { + int32_t len=0; + const char16_t* thestr = ures_getString(resource, &len, status); + char16_t *string = quotedString(thestr); + + /* TODO: String truncation */ + if(opt_truncate && len > truncsize) { + char msg[128]; + printIndent(out, indent); + snprintf(msg, sizeof(msg), "// WARNING: this resource, size %li is truncated to %li\n", + (long)len, (long)(truncsize/2)); + printCString(out, msg, -1); + len = 
truncsize/2; + } + printIndent(out, indent); + if(key != nullptr) { + static const char16_t openStr[] = { 0x0020, 0x007B, 0x0020, 0x0022 }; /* " { \"" */ + static const char16_t closeStr[] = { 0x0022, 0x0020, 0x007D }; /* "\" }" */ + printCString(out, key, (int32_t)uprv_strlen(key)); + printString(out, openStr, UPRV_LENGTHOF(openStr)); + printString(out, string, len); + printString(out, closeStr, UPRV_LENGTHOF(closeStr)); + } else { + static const char16_t openStr[] = { 0x0022 }; /* "\"" */ + static const char16_t closeStr[] = { 0x0022, 0x002C }; /* "\"," */ + + printString(out, openStr, UPRV_LENGTHOF(openStr)); + printString(out, string, (int32_t)(u_strlen(string))); + printString(out, closeStr, UPRV_LENGTHOF(closeStr)); + } + + if(verbose) { + printCString(out, "// STRING", -1); + } + printString(out, cr, UPRV_LENGTHOF(cr)); + + uprv_free(string); + } + break; + + case URES_INT : + { + static const char16_t openStr[] = { 0x003A, 0x0069, 0x006E, 0x0074, 0x0020, 0x007B, 0x0020 }; /* ":int { " */ + static const char16_t closeStr[] = { 0x0020, 0x007D }; /* " }" */ + char16_t num[20]; + + printIndent(out, indent); + if(key != nullptr) { + printCString(out, key, -1); + } + printString(out, openStr, UPRV_LENGTHOF(openStr)); + uprv_itou(num, 20, ures_getInt(resource, status), 10, 0); + printString(out, num, u_strlen(num)); + printString(out, closeStr, UPRV_LENGTHOF(closeStr)); + + if(verbose) { + printCString(out, "// INT", -1); + } + printString(out, cr, UPRV_LENGTHOF(cr)); + break; + } + case URES_BINARY : + { + int32_t len = 0; + const int8_t *data = (const int8_t *)ures_getBinary(resource, &len, status); + if(opt_truncate && len > truncsize) { + char msg[128]; + printIndent(out, indent); + snprintf(msg, sizeof(msg), "// WARNING: this resource, size %li is truncated to %li\n", + (long)len, (long)(truncsize/2)); + printCString(out, msg, -1); + len = truncsize; + } + if(U_SUCCESS(*status)) { + static const char16_t openStr[] = { 0x003A, 0x0062, 0x0069, 0x006E, 0x0061, 
0x0072, 0x0079, 0x0020, 0x007B, 0x0020 }; /* ":binary { " */ + static const char16_t closeStr[] = { 0x0020, 0x007D, 0x0020 }; /* " } " */ + printIndent(out, indent); + if(key != nullptr) { + printCString(out, key, -1); + } + printString(out, openStr, UPRV_LENGTHOF(openStr)); + for(i = 0; i<len; i++) { + printHex(out, *data++); + } + printString(out, closeStr, UPRV_LENGTHOF(closeStr)); + if(verbose) { + printCString(out, " // BINARY", -1); + } + printString(out, cr, UPRV_LENGTHOF(cr)); + } else { + reportError(pname, status, "getting binary value"); + } + } + break; + case URES_INT_VECTOR : + { + int32_t len = 0; + const int32_t *data = ures_getIntVector(resource, &len, status); + if(U_SUCCESS(*status)) { + static const char16_t openStr[] = { 0x003A, 0x0069, 0x006E, 0x0074, 0x0076, 0x0065, 0x0063, 0x0074, 0x006F, 0x0072, 0x0020, 0x007B, 0x0020 }; /* ":intvector { " */ + static const char16_t closeStr[] = { 0x0020, 0x007D, 0x0020 }; /* " } " */ + char16_t num[20]; + + printIndent(out, indent); + if(key != nullptr) { + printCString(out, key, -1); + } + printString(out, openStr, UPRV_LENGTHOF(openStr)); + for(i = 0; i < len - 1; i++) { + int32_t numLen = uprv_itou(num, 20, data[i], 10, 0); + num[numLen++] = 0x002C; /* ',' */ + num[numLen++] = 0x0020; /* ' ' */ + num[numLen] = 0; + printString(out, num, u_strlen(num)); + } + if(len > 0) { + uprv_itou(num, 20, data[len - 1], 10, 0); + printString(out, num, u_strlen(num)); + } + printString(out, closeStr, UPRV_LENGTHOF(closeStr)); + if(verbose) { + printCString(out, "// INTVECTOR", -1); + } + printString(out, cr, UPRV_LENGTHOF(cr)); + } else { + reportError(pname, status, "getting int vector"); + } + } + break; + case URES_TABLE : + case URES_ARRAY : + { + static const char16_t openStr[] = { 0x007B }; /* "{" */ + static const char16_t closeStr[] = { 0x007D, '\n' }; /* "}\n" */ + + UResourceBundle *t = nullptr; + ures_resetIterator(resource); + printIndent(out, indent); + if(key != nullptr) { + printCString(out, key, -1); 
+ } + printString(out, openStr, UPRV_LENGTHOF(openStr)); + if(verbose) { + if(ures_getType(resource) == URES_TABLE) { + printCString(out, "// TABLE", -1); + } else { + printCString(out, "// ARRAY", -1); + } + } + printString(out, cr, UPRV_LENGTHOF(cr)); + + if(suppressAliases == false) { + while(U_SUCCESS(*status) && ures_hasNext(resource)) { + t = ures_getNextResource(resource, t, status); + if(U_SUCCESS(*status)) { + printOutBundle(out, t, indent+indentsize, pname, status); + } else { + reportError(pname, status, "While processing table"); + *status = U_ZERO_ERROR; + } + } + } else { /* we have to use low level access to do this */ + Resource r; + int32_t resSize = ures_getSize(resource); + UBool isTable = (UBool)(ures_getType(resource) == URES_TABLE); + for(i = 0; i < resSize; i++) { + /* need to know if it's an alias */ + if(isTable) { + r = res_getTableItemByIndex(&resource->getResData(), resource->fRes, i, &key); + } else { + r = res_getArrayItem(&resource->getResData(), resource->fRes, i); + } + if(U_SUCCESS(*status)) { + if(res_getPublicType(r) == URES_ALIAS) { + printOutAlias(out, resource, r, key, indent+indentsize, pname, status); + } else { + t = ures_getByIndex(resource, i, t, status); + printOutBundle(out, t, indent+indentsize, pname, status); + } + } else { + reportError(pname, status, "While processing table"); + *status = U_ZERO_ERROR; + } + } + } + + printIndent(out, indent); + printString(out, closeStr, UPRV_LENGTHOF(closeStr)); + ures_close(t); + } + break; + default: + break; + } + +} + +static const char *getEncodingName(const char *encoding) { + UErrorCode err; + const char *enc; + + err = U_ZERO_ERROR; + if (!(enc = ucnv_getStandardName(encoding, "MIME", &err))) { + err = U_ZERO_ERROR; + if (!(enc = ucnv_getStandardName(encoding, "IANA", &err))) { + // do nothing + } + } + + return enc; +} + +static void reportError(const char *pname, UErrorCode *status, const char *when) { + u_fprintf(ustderr, "%s: error %d while %s: %s\n", pname, *status, 
when, u_errorName(*status)); +} + +#else +extern int +main(int argc, char* argv[]) { + /* Changing stdio.h ustdio.h requires that formatting not be disabled. */ + return 3; +} +#endif /* !UCONFIG_NO_FORMATTING */ + +/* + * Local Variables: + * indent-tabs-mode: nil + * End: + */ diff --git a/intl/icu/source/tools/genrb/derb.vcxproj b/intl/icu/source/tools/genrb/derb.vcxproj new file mode 100644 index 0000000000..f5ba9bf22f --- /dev/null +++ b/intl/icu/source/tools/genrb/derb.vcxproj @@ -0,0 +1,80 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <PropertyGroup Label="Globals"> + <ProjectGuid>{D3065ADB-8820-4CC7-9B6C-9510833961A3}</ProjectGuid> + </PropertyGroup> + <PropertyGroup Label="Configuration"> + <ConfigurationType>Application</ConfigurationType> + <UseOfMfc>false</UseOfMfc> + <CharacterSet>MultiByte</CharacterSet> + </PropertyGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> + <!-- The following import will include the 'default' configuration options for VS projects. --> + <Import Project="..\..\allinone\Build.Windows.ProjectConfiguration.props" /> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> + <ImportGroup Label="ExtensionSettings"> + </ImportGroup> + <PropertyGroup Label="UserMacros" /> + <PropertyGroup> + <_ProjectFileVersion>10.0.30319.1</_ProjectFileVersion> + <OutDir>.\$(Platform)\$(Configuration)\</OutDir> + <IntDir>.\$(Platform)\$(Configuration)\</IntDir> + <!-- The ICU projects use "Win32" to mean "x86", so we need to special case it. 
--> + <OutDir Condition="'$(Platform)'=='Win32'">.\x86\$(Configuration)\</OutDir> + <IntDir Condition="'$(Platform)'=='Win32'">.\x86\$(Configuration)\</IntDir> + <!-- Disable Incremental Linking for Release builds as it prevents Link-time Code Generation --> + <LinkIncremental Condition="'$(Configuration)'=='Debug'">true</LinkIncremental> + <LinkIncremental Condition="'$(Configuration)'=='Release'">false</LinkIncremental> + </PropertyGroup> + <!-- Options that are common to *all* configurations --> + <ItemDefinitionGroup> + <Midl> + <TypeLibraryName>$(OutDir)/derb.tlb</TypeLibraryName> + </Midl> + <ClCompile> + <WarningLevel>Level3</WarningLevel> + <CompileAs>Default</CompileAs> + <DisableLanguageExtensions>true</DisableLanguageExtensions> + <AdditionalIncludeDirectories>..\..\i18n;..\..\common;..\toolutil;..\..\io;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> + <PrecompiledHeaderOutputFile>$(OutDir)/derb.pch</PrecompiledHeaderOutputFile> + <AssemblerListingLocation>$(OutDir)/</AssemblerListingLocation> + <ObjectFileName>$(OutDir)/</ObjectFileName> + <ProgramDataBaseFileName>$(OutDir)/derb.pdb</ProgramDataBaseFileName> + </ClCompile> + <Link> + <SubSystem>Console</SubSystem> + <OutputFile>$(OutDir)/derb.exe</OutputFile> + <AdditionalLibraryDirectories>..\..\..\$(IcuLibOutputDir);%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories> + </Link> + <CustomBuildStep> + <Command>copy "$(TargetPath)" ..\..\..\$(IcuBinOutputDir)</Command> + <Outputs>..\..\..\$(IcuBinOutputDir)\$(TargetFileName);%(Outputs)</Outputs> + </CustomBuildStep> + </ItemDefinitionGroup> + <!-- Options that are common to all 'Debug' project configurations --> + <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'"> + <ClCompile> + <BrowseInformation>true</BrowseInformation> + <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary> + </ClCompile> + <Link> + 
<AdditionalDependencies>icuucd.lib;icuind.lib;icuiod.lib;icutud.lib;%(AdditionalDependencies)</AdditionalDependencies> + </Link> + </ItemDefinitionGroup> + <!-- Options that are common to all 'Release' project configurations --> + <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'"> + <ClCompile> + <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary> + <FunctionLevelLinking>true</FunctionLevelLinking> + </ClCompile> + <Link> + <AdditionalDependencies>icuuc.lib;icuin.lib;icuio.lib;icutu.lib;%(AdditionalDependencies)</AdditionalDependencies> + </Link> + </ItemDefinitionGroup> + <ItemGroup> + <ClCompile Include="derb.cpp" /> + </ItemGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> + <ImportGroup Label="ExtensionTargets"> + </ImportGroup> +</Project>
\ No newline at end of file diff --git a/intl/icu/source/tools/genrb/derb.vcxproj.filters b/intl/icu/source/tools/genrb/derb.vcxproj.filters new file mode 100644 index 0000000000..c62d612888 --- /dev/null +++ b/intl/icu/source/tools/genrb/derb.vcxproj.filters @@ -0,0 +1,22 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <ItemGroup> + <Filter Include="Source Files"> + <UniqueIdentifier>{b10d3c34-0b4c-43e9-9c28-e17fdabee575}</UniqueIdentifier> + <Extensions>cpp;c;cxx;rc;def;r;odl;idl;hpj;bat</Extensions> + </Filter> + <Filter Include="Header Files"> + <UniqueIdentifier>{0f0a70a2-7e7e-4e7a-88ab-b3bf739fabed}</UniqueIdentifier> + <Extensions>h;hpp;hxx;hm;inl</Extensions> + </Filter> + <Filter Include="Resource Files"> + <UniqueIdentifier>{ac6d5215-57af-486d-81ed-badc17745780}</UniqueIdentifier> + <Extensions>ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe</Extensions> + </Filter> + </ItemGroup> + <ItemGroup> + <ClCompile Include="derb.cpp"> + <Filter>Source Files</Filter> + </ClCompile> + </ItemGroup> +</Project>
\ No newline at end of file diff --git a/intl/icu/source/tools/genrb/errmsg.c b/intl/icu/source/tools/genrb/errmsg.c new file mode 100644 index 0000000000..a99d797ec5 --- /dev/null +++ b/intl/icu/source/tools/genrb/errmsg.c @@ -0,0 +1,75 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 1998-2011, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* +* File error.c +* +* Modification History: +* +* Date Name Description +* 05/28/99 stephen Creation. +******************************************************************************* +*/ + +#include <stdarg.h> +#include <stdbool.h> +#include <stdio.h> +#include "cstring.h" +#include "errmsg.h" +#include "toolutil.h" + +U_CFUNC void error(uint32_t linenumber, const char *msg, ...) +{ + va_list va; + + va_start(va, msg); + fprintf(stderr, "%s:%u: ", gCurrentFileName, (int)linenumber); + vfprintf(stderr, msg, va); + fprintf(stderr, "\n"); + va_end(va); +} + +static UBool gShowWarning = true; + +U_CFUNC void setShowWarning(UBool val) +{ + gShowWarning = val; +} + +U_CFUNC UBool getShowWarning(){ + return gShowWarning; +} + +static UBool gStrict =false; +U_CFUNC UBool isStrict(){ + return gStrict; +} +U_CFUNC void setStrict(UBool val){ + gStrict = val; +} +static UBool gVerbose =false; +U_CFUNC UBool isVerbose(){ + return gVerbose; +} +U_CFUNC void setVerbose(UBool val){ + gVerbose = val; +} +U_CFUNC void warning(uint32_t linenumber, const char *msg, ...) 
+{ + if (gShowWarning) + { + va_list va; + + va_start(va, msg); + fprintf(stderr, "%s:%u: warning: ", gCurrentFileName, (int)linenumber); + vfprintf(stderr, msg, va); + fprintf(stderr, "\n"); + va_end(va); + } +} diff --git a/intl/icu/source/tools/genrb/errmsg.h b/intl/icu/source/tools/genrb/errmsg.h new file mode 100644 index 0000000000..e01b9558f0 --- /dev/null +++ b/intl/icu/source/tools/genrb/errmsg.h @@ -0,0 +1,46 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 1998-2016, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* +* File error.h +* +* Modification History: +* +* Date Name Description +* 05/28/99 stephen Creation. +******************************************************************************* +*/ + +#ifndef ERROR_H +#define ERROR_H 1 + +#include "unicode/utypes.h" + +U_CDECL_BEGIN + +extern const char *gCurrentFileName; + +U_CFUNC void error(uint32_t linenumber, const char *msg, ...); +U_CFUNC void warning(uint32_t linenumber, const char *msg, ...); + +/* Show warnings? */ +U_CFUNC void setShowWarning(UBool val); +U_CFUNC UBool getShowWarning(void); + +/* strict */ +U_CFUNC void setStrict(UBool val); +U_CFUNC UBool isStrict(void); + +/* verbosity */ +U_CFUNC void setVerbose(UBool val); +U_CFUNC UBool isVerbose(void); + +U_CDECL_END + +#endif diff --git a/intl/icu/source/tools/genrb/filterrb.cpp b/intl/icu/source/tools/genrb/filterrb.cpp new file mode 100644 index 0000000000..dcc02fc621 --- /dev/null +++ b/intl/icu/source/tools/genrb/filterrb.cpp @@ -0,0 +1,239 @@ +// © 2018 and later: Unicode, Inc. and others. 
+// License & terms of use: http://www.unicode.org/copyright.html + +#include <iostream> +#include <stack> + +#include "filterrb.h" +#include "errmsg.h" + + +const char* PathFilter::kEInclusionNames[] = { + "INCLUDE", + "PARTIAL", + "EXCLUDE" +}; + + +ResKeyPath::ResKeyPath() {} + +ResKeyPath::ResKeyPath(const std::string& path, UErrorCode& status) { + if (path.empty() || path[0] != '/') { + std::cerr << "genrb error: path must start with /: " << path << std::endl; + status = U_PARSE_ERROR; + return; + } + if (path.length() == 1) { + return; + } + size_t i; + size_t j = 0; + while (true) { + i = j + 1; + j = path.find('/', i); + std::string key = path.substr(i, j - i); + if (key.empty()) { + std::cerr << "genrb error: empty subpaths and trailing slashes are not allowed: " << path << std::endl; + status = U_PARSE_ERROR; + return; + } + push(key); + if (j == std::string::npos) { + break; + } + } +} + +void ResKeyPath::push(const std::string& key) { + fPath.push_back(key); +} + +void ResKeyPath::pop() { + fPath.pop_back(); +} + +const std::list<std::string>& ResKeyPath::pieces() const { + return fPath; +} + +std::ostream& operator<<(std::ostream& out, const ResKeyPath& value) { + if (value.pieces().empty()) { + out << "/"; + } else for (auto& key : value.pieces()) { + out << "/" << key; + } + return out; +} + + +PathFilter::~PathFilter() = default; + + +void SimpleRuleBasedPathFilter::addRule(const std::string& ruleLine, UErrorCode& status) { + if (ruleLine.empty()) { + std::cerr << "genrb error: empty filter rules are not allowed" << std::endl; + status = U_PARSE_ERROR; + return; + } + bool inclusionRule = false; + if (ruleLine[0] == '+') { + inclusionRule = true; + } else if (ruleLine[0] != '-') { + std::cerr << "genrb error: rules must start with + or -: " << ruleLine << std::endl; + status = U_PARSE_ERROR; + return; + } + ResKeyPath path(ruleLine.substr(1), status); + addRule(path, inclusionRule, status); +} + +void SimpleRuleBasedPathFilter::addRule(const 
ResKeyPath& path, bool inclusionRule, UErrorCode& status) { + if (U_FAILURE(status)) { + return; + } + fRoot.applyRule(path, path.pieces().begin(), inclusionRule, status); +} + +PathFilter::EInclusion SimpleRuleBasedPathFilter::match(const ResKeyPath& path) const { + const Tree* node = &fRoot; + + // defaultResult "bubbles up" the nearest "definite" inclusion/exclusion rule + EInclusion defaultResult = INCLUDE; + if (node->fIncluded != PARTIAL) { + // rules handled here: "+/" and "-/" + defaultResult = node->fIncluded; + } + + // isLeaf is whether the filter tree can provide no additional information + // even if additional subpaths are added to the given key + bool isLeaf = false; + + for (auto& key : path.pieces()) { + auto child = node->fChildren.find(key); + // Leaf case 1: input path descends outside the filter tree + if (child == node->fChildren.end()) { + if (node->fWildcard) { + // A wildcard pattern is present; continue checking + node = node->fWildcard.get(); + } else { + isLeaf = true; + break; + } + } else { + node = &child->second; + } + if (node->fIncluded != PARTIAL) { + defaultResult = node->fIncluded; + } + } + + // Leaf case 2: input path exactly matches a filter leaf + if (node->isLeaf()) { + isLeaf = true; + } + + // Always return PARTIAL if we are not at a leaf + if (!isLeaf) { + return PARTIAL; + } + + // If leaf node is PARTIAL, return the default + if (node->fIncluded == PARTIAL) { + return defaultResult; + } + + return node->fIncluded; +} + + +SimpleRuleBasedPathFilter::Tree::Tree(const Tree& other) + : fIncluded(other.fIncluded), fChildren(other.fChildren) { + // Note: can't use the default copy assignment because of the std::unique_ptr + if (other.fWildcard) { + fWildcard.reset(new Tree(*other.fWildcard)); + } +} + +bool SimpleRuleBasedPathFilter::Tree::isLeaf() const { + return fChildren.empty() && !fWildcard; +} + +void SimpleRuleBasedPathFilter::Tree::applyRule( + const ResKeyPath& path, + std::list<std::string>::const_iterator it, + 
bool inclusionRule, + UErrorCode& status) { + + // Base Case + if (it == path.pieces().end()) { + if (isVerbose() && (fIncluded != PARTIAL || !isLeaf())) { + std::cout << "genrb info: rule on path " << path + << " overrides previous rules" << std::endl; + } + fIncluded = inclusionRule ? INCLUDE : EXCLUDE; + fChildren.clear(); + fWildcard.reset(); + return; + } + + // Recursive Step + auto& key = *it; + if (key == "*") { + // Case 1: Wildcard + if (!fWildcard) { + fWildcard.reset(new Tree()); + } + // Apply the rule to fWildcard and also to all existing children. + it++; + fWildcard->applyRule(path, it, inclusionRule, status); + for (auto& child : fChildren) { + child.second.applyRule(path, it, inclusionRule, status); + } + it--; + + } else { + // Case 2: Normal Key + auto search = fChildren.find(key); + if (search == fChildren.end()) { + if (fWildcard) { + // Deep-copy the existing wildcard tree into the new key + search = fChildren.emplace(key, Tree(*fWildcard)).first; + } else { + search = fChildren.emplace(key, Tree()).first; + } + } + it++; + search->second.applyRule(path, it, inclusionRule, status); + it--; + } +} + +void SimpleRuleBasedPathFilter::Tree::print(std::ostream& out, int32_t indent) const { + for (int32_t i=0; i<indent; i++) out << "\t"; + out << "included: " << kEInclusionNames[fIncluded] << std::endl; + for (auto& child : fChildren) { + for (int32_t i=0; i<indent; i++) out << "\t"; + out << child.first << ": {" << std::endl; + child.second.print(out, indent + 1); + for (int32_t i=0; i<indent; i++) out << "\t"; + out << "}" << std::endl; + } + if (fWildcard) { + for (int32_t i=0; i<indent; i++) out << "\t"; + out << "* {" << std::endl; + fWildcard->print(out, indent + 1); + for (int32_t i=0; i<indent; i++) out << "\t"; + out << "}" << std::endl; + } +} + +void SimpleRuleBasedPathFilter::print(std::ostream& out) const { + out << "SimpleRuleBasedPathFilter {" << std::endl; + fRoot.print(out, 1); + out << "}" << std::endl; +} + +std::ostream& 
operator<<(std::ostream& out, const SimpleRuleBasedPathFilter& value) { + value.print(out); + return out; +} diff --git a/intl/icu/source/tools/genrb/filterrb.h b/intl/icu/source/tools/genrb/filterrb.h new file mode 100644 index 0000000000..cf54766041 --- /dev/null +++ b/intl/icu/source/tools/genrb/filterrb.h @@ -0,0 +1,180 @@ +// © 2018 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html + +#ifndef __FILTERRB_H__ +#define __FILTERRB_H__ + +#include <list> +#include <map> +#include <memory> +#include <ostream> +#include <string> + +#include "unicode/utypes.h" + + +/** + * Represents an absolute path into a resource bundle. + * For example: "/units/length/meter" + */ +class ResKeyPath { +public: + /** Constructs an empty path (top of tree) */ + ResKeyPath(); + + /** Constructs from a string path */ + ResKeyPath(const std::string& path, UErrorCode& status); + + void push(const std::string& key); + void pop(); + + const std::list<std::string>& pieces() const; + + private: + std::list<std::string> fPath; +}; + +std::ostream& operator<<(std::ostream& out, const ResKeyPath& value); + + +/** + * Interface used to determine whether to include or reject pieces of a + * resource bundle based on their absolute path. + */ +class PathFilter { +public: + enum EInclusion { + INCLUDE, + PARTIAL, + EXCLUDE + }; + + static const char* kEInclusionNames[]; + + virtual ~PathFilter(); + + /** + * Returns an EInclusion on whether or not the given path should be included. + * + * INCLUDE = include the whole subtree + * PARTIAL = recurse into the subtree + * EXCLUDE = reject the whole subtree + */ + virtual EInclusion match(const ResKeyPath& path) const = 0; +}; + + +/** + * Implementation of PathFilter for a list of inclusion/exclusion rules. + * + * The wildcard pattern "*" means that the subsequent filters are applied to + * every other tree sharing the same parent. 
+ * + * For example, given this list of filter rules: + */ +// -/alabama +// +/alabama/alaska/arizona +// -/fornia/hawaii +// -/mississippi +// +/mississippi/michigan +// +/mississippi/*/maine +// -/mississippi/*/iowa +// +/mississippi/louisiana/iowa +/* + * You get the following structure: + * + * SimpleRuleBasedPathFilter { + * included: PARTIAL + * alabama: { + * included: EXCLUDE + * alaska: { + * included: PARTIAL + * arizona: { + * included: INCLUDE + * } + * } + * } + * fornia: { + * included: PARTIAL + * hawaii: { + * included: EXCLUDE + * } + * } + * mississippi: { + * included: EXCLUDE + * louisiana: { + * included: PARTIAL + * iowa: { + * included: INCLUDE + * } + * maine: { + * included: INCLUDE + * } + * } + * michigan: { + * included: INCLUDE + * iowa: { + * included: EXCLUDE + * } + * maine: { + * included: INCLUDE + * } + * } + * * { + * included: PARTIAL + * iowa: { + * included: EXCLUDE + * } + * maine: { + * included: INCLUDE + * } + * } + * } + * } + */ +class SimpleRuleBasedPathFilter : public PathFilter { +public: + void addRule(const std::string& ruleLine, UErrorCode& status); + void addRule(const ResKeyPath& path, bool inclusionRule, UErrorCode& status); + + EInclusion match(const ResKeyPath& path) const override; + + void print(std::ostream& out) const; + +private: + struct Tree { + + Tree() = default; + + /** Copy constructor */ + Tree(const Tree& other); + + /** + * Information on the USER-SPECIFIED inclusion/exclusion. 
+ * + * INCLUDE = this path exactly matches a "+" rule + * PARTIAL = this path does not match any rule, but subpaths exist + * EXCLUDE = this path exactly matches a "-" rule + */ + EInclusion fIncluded = PARTIAL; + std::map<std::string, Tree> fChildren; + std::unique_ptr<Tree> fWildcard; + + void applyRule( + const ResKeyPath& path, + std::list<std::string>::const_iterator it, + bool inclusionRule, + UErrorCode& status); + + bool isLeaf() const; + + void print(std::ostream& out, int32_t indent) const; + }; + + Tree fRoot; +}; + +std::ostream& operator<<(std::ostream& out, const SimpleRuleBasedPathFilter& value); + + +#endif //__FILTERRB_H__ diff --git a/intl/icu/source/tools/genrb/genrb.1.in b/intl/icu/source/tools/genrb/genrb.1.in new file mode 100644 index 0000000000..a457719238 --- /dev/null +++ b/intl/icu/source/tools/genrb/genrb.1.in @@ -0,0 +1,148 @@ +.\" Hey, Emacs! This is -*-nroff-*- you know... +.\" +.\" genrb.1: manual page for the genrb utility +.\" +.\" Copyright (C) 2016 and later: Unicode, Inc. and others. +.\" License & terms of use: http://www.unicode.org/copyright.html +.\" Copyright (C) 2000-2002 IBM, Inc. and others. +.\" +.\" Manual page by Yves Arrouye <yves@realnames.com>. +.\" +.TH GENRB 1 "16 April 2002" "ICU MANPAGE" "ICU @VERSION@ Manual" +.SH NAME +.B genrb +\- compile a resource bundle +.SH SYNOPSIS +.B genrb +[ +.BR "\-h\fP, \fB\-?\fP, \fB\-\-help" +] +[ +.BR "\-V\fP, \fB\-\-version" +] +[ +.BR "\-v\fP, \fB\-\-verbose" +] +[ +.BI "\-e\fP, \fB\-\-encoding" " encoding" +] +[ +.BI "\-j\fP, \fB\-\-write\-java" " \fR[ \fPencoding\fR ]\fP" +] +[ +.BI "\-s\fP, \fB\-\-sourcedir" " source" +] +[ +.BI "\-d\fP, \fB\-\-destdir" " destination" +] +[ +.BI "\-i\fP, \fB\-\-icudatadir" " directory" +] +.IR bundle " \.\.\." +.SH DESCRIPTION +.B genrb +converts the resource +.I bundle +source files passed on the command line to their binary form or to +a Java source file for use with ICU4J. 
+The resulting binary files have a +.B .res +extension while resource bundle source files typically have a +.B .txt +extension. Java source files have a +.B .java +extension and follow the ICU4J naming conventions. +.PP +It is customary to name the resource bundles by their locale name, +i.e. to use a locale identifier for the +.I bundle +filename, e.g. +.B ja_JP.txt +for Japanese (Japan) data, or +.B root.txt +for the root bundle. +In any case, +.B genrb +will produce a file whose base name is the name of the locale found +in the resource file, not the base name of the resource file itself. +.PP +The binary files can be read directly by ICU, or used by +.BR pkgdata (1) +for incorporation into a larger archive or library. +.SH OPTIONS +.TP +.BR "\-h\fP, \fB\-?\fP, \fB\-\-help" +Print help about usage and exit. +.TP +.BR "\-V\fP, \fB\-\-version" +Print the version of +.B genrb +and exit. +.TP +.BR "\-v\fP, \fB\-\-verbose" +Display extra informative messages during execution. +.TP +.BI "\-e\fP, \fB\-\-encoding" " encoding" +Set the encoding used to read input files to +.IR encoding . +The default encoding is the invariant (subset of ASCII or EBCDIC) +codepage for the system (see section +.BR "INVARIANT CHARACTERS" ). +The encodings UTF-8, UTF-16BE, and UTF-16LE are automatically detected +if a byte order mark (BOM) is present. +.TP +.BI "\-j\fP, \fB\-\-write\-java" " \fR[ \fPencoding\fR ]\fP" +Generate Java source code for use with ICU4J. An optional +.I encoding +for the Java file can be given. +.TP +.BI "\-s\fP, \fB\-\-sourcedir" " source" +Set the source directory to +.IR source . +The default source directory is specified by the environment variable +.BR ICU_DATA , +or the location set when ICU was built if +.B ICU_DATA +is not set. +.TP +.BI "\-d\fP, \fB\-\-destdir" " destination" +Set the destination directory to +.IR destination .
+The default destination directory is specified by the environment variable +.BR ICU_DATA +or is the location set when ICU was built if +.B ICU_DATA +is not set. +.TP +.BI "\-i\fP, \fB\-\-icudatadir" " directory" +Look for any necessary ICU data files in +.IR directory . +For example, when processing collation overrides, the file +.B ucadata.dat +must be located. +The default ICU data directory is specified by the environment variable +.BR ICU_DATA . +.SH INVARIANT CHARACTERS +The +.B invariant character set +consists of the following set of characters, expressed as a standard POSIX +regular expression: +.BR "[a-z]|[A-Z]|[0-9]|_| |+|-|*|/" . +This is the set which is guaranteed to be available regardless of code page. +.SH ENVIRONMENT +.TP 10 +.B ICU_DATA +Specifies the directory containing ICU data. Defaults to +.BR @thepkgicudatadir@/@PACKAGE@/@VERSION@/ . +Some tools in ICU depend on the presence of the trailing slash. It is thus +important to make sure that it is present if +.B ICU_DATA +is set. +.SH VERSION +@VERSION@ +.SH COPYRIGHT +Copyright (C) 2000-2002 IBM, Inc. and others. +.SH SEE ALSO +.BR derb (1) +.br +.BR pkgdata (1) diff --git a/intl/icu/source/tools/genrb/genrb.cpp b/intl/icu/source/tools/genrb/genrb.cpp new file mode 100644 index 0000000000..fbf396d468 --- /dev/null +++ b/intl/icu/source/tools/genrb/genrb.cpp @@ -0,0 +1,869 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 1998-2016, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* +* File genrb.cpp +* +* Modification History: +* +* Date Name Description +* 05/25/99 stephen Creation. 
+* 5/10/01 Ram removed ustdio dependency +******************************************************************************* +*/ + +#include <fstream> +#include <iostream> +#include <list> +#include <string> + +#include <assert.h> +#include "genrb.h" +#include "unicode/localpointer.h" +#include "unicode/uclean.h" +#include "unicode/utf16.h" +#include "charstr.h" +#include "cmemory.h" +#include "filterrb.h" +#include "reslist.h" +#include "ucmndata.h" /* TODO: for reading the pool bundle */ +#include "collationroot.h" + +U_NAMESPACE_USE + +/* Protos */ +void processFile(const char *filename, const char* cp, + const char *inputDir, const char *outputDir, const char *filterDir, + const char *packageName, + SRBRoot *newPoolBundle, UBool omitBinaryCollation, UErrorCode &status); +static char *make_res_filename(const char *filename, const char *outputDir, + const char *packageName, UErrorCode &status); + +/* File suffixes */ +#define RES_SUFFIX ".res" +#define COL_SUFFIX ".col" + +const char *gCurrentFileName = nullptr; +#ifdef XP_MAC_CONSOLE +#include <console.h> +#endif + +void ResFile::close() { + delete[] fBytes; + fBytes = nullptr; + delete fStrings; + fStrings = nullptr; +} + +enum +{ + HELP1, + HELP2, + VERBOSE, + QUIET, + VERSION, + SOURCEDIR, + DESTDIR, + ENCODING, + ICUDATADIR, + WRITE_JAVA, + COPYRIGHT, + JAVA_PACKAGE, + BUNDLE_NAME, + WRITE_XLIFF, + STRICT, + NO_BINARY_COLLATION, + LANGUAGE, + NO_COLLATION_RULES, + FORMAT_VERSION, + WRITE_POOL_BUNDLE, + USE_POOL_BUNDLE, + INCLUDE_UNIHAN_COLL, + FILTERDIR, + ICU4X_MODE, + UCADATA +}; + +UOption options[]={ + UOPTION_HELP_H, + UOPTION_HELP_QUESTION_MARK, + UOPTION_VERBOSE, + UOPTION_QUIET, + UOPTION_VERSION, + UOPTION_SOURCEDIR, + UOPTION_DESTDIR, + UOPTION_ENCODING, + UOPTION_ICUDATADIR, + UOPTION_WRITE_JAVA, + UOPTION_COPYRIGHT, + UOPTION_DEF("java-package", '\x01', UOPT_REQUIRES_ARG), + UOPTION_BUNDLE_NAME, + UOPTION_DEF("write-xliff", 'x', UOPT_OPTIONAL_ARG), + UOPTION_DEF("strict", 'k', UOPT_NO_ARG), /* 14 */ 
+ UOPTION_DEF("noBinaryCollation", 'C', UOPT_NO_ARG),/* 15 */ + UOPTION_DEF("language", 'l', UOPT_REQUIRES_ARG), /* 16 */ + UOPTION_DEF("omitCollationRules", 'R', UOPT_NO_ARG),/* 17 */ + UOPTION_DEF("formatVersion", '\x01', UOPT_REQUIRES_ARG),/* 18 */ + UOPTION_DEF("writePoolBundle", '\x01', UOPT_OPTIONAL_ARG),/* 19 */ + UOPTION_DEF("usePoolBundle", '\x01', UOPT_OPTIONAL_ARG),/* 20 */ + UOPTION_DEF("includeUnihanColl", '\x01', UOPT_NO_ARG),/* 21 */ /* temporary, don't display in usage info */ + UOPTION_DEF("filterDir", '\x01', UOPT_OPTIONAL_ARG), /* 22 */ + UOPTION_DEF("icu4xMode", 'X', UOPT_NO_ARG),/* 23 */ + UOPTION_DEF("ucadata", '\x01', UOPT_REQUIRES_ARG),/* 24 */ + }; + +static UBool write_java = false; +static UBool write_xliff = false; +static const char* outputEnc =""; + +static ResFile poolBundle; + +/*added by Jing*/ +static const char* language = nullptr; +static const char* xliffOutputFileName = nullptr; +int +main(int argc, + char* argv[]) +{ + UErrorCode status = U_ZERO_ERROR; + const char *arg = nullptr; + const char *outputDir = nullptr; /* nullptr = no output directory, use current */ + const char *inputDir = nullptr; + const char *filterDir = nullptr; + const char *encoding = ""; + int i; + UBool illegalArg = false; + + U_MAIN_INIT_ARGS(argc, argv); + + options[JAVA_PACKAGE].value = "com.ibm.icu.impl.data"; + options[BUNDLE_NAME].value = "LocaleElements"; + argc = u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options); + + /* error handling, printing usage message */ + if(argc<0) { + fprintf(stderr, "%s: error in command line argument \"%s\"\n", argv[0], argv[-argc]); + illegalArg = true; + } else if(argc<2) { + illegalArg = true; + } + if(options[WRITE_POOL_BUNDLE].doesOccur && options[USE_POOL_BUNDLE].doesOccur) { + fprintf(stderr, "%s: cannot combine --writePoolBundle and --usePoolBundle\n", argv[0]); + illegalArg = true; + } + if (options[ICU4X_MODE].doesOccur && !options[UCADATA].doesOccur) { + fprintf(stderr, "%s: --icu4xMode requires 
--ucadata\n", argv[0]); + illegalArg = true; + } + if(options[FORMAT_VERSION].doesOccur) { + const char *s = options[FORMAT_VERSION].value; + if(uprv_strlen(s) != 1 || (s[0] < '1' && '3' < s[0])) { + fprintf(stderr, "%s: unsupported --formatVersion %s\n", argv[0], s); + illegalArg = true; + } else if(s[0] == '1' && + (options[WRITE_POOL_BUNDLE].doesOccur || options[USE_POOL_BUNDLE].doesOccur) + ) { + fprintf(stderr, "%s: cannot combine --formatVersion 1 with --writePoolBundle or --usePoolBundle\n", argv[0]); + illegalArg = true; + } else { + setFormatVersion(s[0] - '0'); + } + } + + if((options[JAVA_PACKAGE].doesOccur || options[BUNDLE_NAME].doesOccur) && + !options[WRITE_JAVA].doesOccur) { + fprintf(stderr, + "%s error: command line argument --java-package or --bundle-name " + "without --write-java\n", + argv[0]); + illegalArg = true; + } + + if(options[VERSION].doesOccur) { + fprintf(stderr, + "%s version %s (ICU version %s).\n" + "%s\n", + argv[0], GENRB_VERSION, U_ICU_VERSION, U_COPYRIGHT_STRING); + if(!illegalArg) { + return U_ZERO_ERROR; + } + } + + if(illegalArg || options[HELP1].doesOccur || options[HELP2].doesOccur) { + /* + * Broken into chunks because the C89 standard says the minimum + * required supported string length is 509 bytes. + */ + fprintf(stderr, + "Usage: %s [OPTIONS] [FILES]\n" + "\tReads the list of resource bundle source files and creates\n" + "\tbinary version of resource bundles (.res files)\n", + argv[0]); + fprintf(stderr, + "Options:\n" + "\t-h or -? 
or --help this usage text\n" + "\t-q or --quiet do not display warnings\n" + "\t-v or --verbose print extra information when processing files\n" + "\t-V or --version prints out version number and exits\n" + "\t-c or --copyright include copyright notice\n"); + fprintf(stderr, + "\t-e or --encoding encoding of source files\n" + "\t-d or --destdir destination directory, followed by the path, defaults to '%s'\n" + "\t-s or --sourcedir source directory for files followed by path, defaults to '%s'\n" + "\t-i or --icudatadir directory for locating any needed intermediate data files,\n" + "\t followed by path, defaults to '%s'\n", + u_getDataDirectory(), u_getDataDirectory(), u_getDataDirectory()); + fprintf(stderr, + "\t-j or --write-java write a Java ListResourceBundle for ICU4J, followed by optional encoding\n" + "\t defaults to ASCII and \\uXXXX format.\n" + "\t --java-package For --write-java: package name for writing the ListResourceBundle,\n" + "\t defaults to com.ibm.icu.impl.data\n"); + fprintf(stderr, + "\t-b or --bundle-name For --write-java: root resource bundle name for writing the ListResourceBundle,\n" + "\t defaults to LocaleElements\n" + "\t-x or --write-xliff write an XLIFF file for the resource bundle. 
Followed by\n" + "\t an optional output file name.\n" + "\t-k or --strict use pedantic parsing of syntax\n" + /*added by Jing*/ + "\t-l or --language for XLIFF: language code compliant with BCP 47.\n"); + fprintf(stderr, + "\t-C or --noBinaryCollation do not generate binary collation image;\n" + "\t makes .res file smaller but collator instantiation much slower;\n" + "\t maintains ability to get tailoring rules\n" + "\t-R or --omitCollationRules do not include collation (tailoring) rules;\n" + "\t makes .res file smaller and maintains collator instantiation speed\n" + "\t but tailoring rules will not be available (they are rarely used)\n"); + fprintf(stderr, + "\t --formatVersion write a .res file compatible with the requested formatVersion (single digit);\n" + "\t for example, --formatVersion 1\n"); + fprintf(stderr, + "\t --writePoolBundle [directory] write a pool.res file with all of the keys of all input bundles\n" + "\t --usePoolBundle [directory] point to keys from the pool.res keys pool bundle if they are available there;\n" + "\t makes .res files smaller but dependent on the pool bundle\n" + "\t (--writePoolBundle and --usePoolBundle cannot be combined)\n"); + fprintf(stderr, + "\t --filterDir Input directory where filter files are available.\n" + "\t For more on filter files, see ICU Data Build Tool.\n"); + + return illegalArg ? 
U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; + } + + if(options[VERBOSE].doesOccur) { + setVerbose(true); + } + + if(options[QUIET].doesOccur) { + setShowWarning(false); + } + if(options[STRICT].doesOccur) { + setStrict(true); + } + if(options[COPYRIGHT].doesOccur){ + setIncludeCopyright(true); + } + + if(options[SOURCEDIR].doesOccur) { + inputDir = options[SOURCEDIR].value; + } + + if(options[DESTDIR].doesOccur) { + outputDir = options[DESTDIR].value; + } + + if (options[FILTERDIR].doesOccur) { + filterDir = options[FILTERDIR].value; + } + + if(options[ENCODING].doesOccur) { + encoding = options[ENCODING].value; + } + + if(options[ICUDATADIR].doesOccur) { + u_setDataDirectory(options[ICUDATADIR].value); + } + /* Initialize ICU */ + u_init(&status); + if (U_FAILURE(status) && status != U_FILE_ACCESS_ERROR) { + /* Note: u_init() will try to open ICU property data. + * failures here are expected when building ICU from scratch. + * ignore them. + */ + fprintf(stderr, "%s: can not initialize ICU. 
status = %s\n", + argv[0], u_errorName(status)); + exit(1); + } + status = U_ZERO_ERROR; + if(options[WRITE_JAVA].doesOccur) { + write_java = true; + outputEnc = options[WRITE_JAVA].value; + } + + if(options[WRITE_XLIFF].doesOccur) { + write_xliff = true; + if(options[WRITE_XLIFF].value != nullptr){ + xliffOutputFileName = options[WRITE_XLIFF].value; + } + } + + if (options[UCADATA].doesOccur) { +#if !UCONFIG_NO_COLLATION + CollationRoot::forceLoadFromFile(options[UCADATA].value, status); +#else + fprintf(stderr, "--ucadata was used with UCONFIG_NO_COLLATION\n"); + return status; +#endif + } + + initParser(); + + /*added by Jing*/ + if(options[LANGUAGE].doesOccur) { + language = options[LANGUAGE].value; + } + + LocalPointer<SRBRoot> newPoolBundle; + if(options[WRITE_POOL_BUNDLE].doesOccur) { + newPoolBundle.adoptInsteadAndCheckErrorCode(new SRBRoot(nullptr, true, status), status); + if(U_FAILURE(status)) { + fprintf(stderr, "unable to create an empty bundle for the pool keys: %s\n", u_errorName(status)); + return status; + } else { + const char *poolResName = "pool.res"; + char *nameWithoutSuffix = static_cast<char *>(uprv_malloc(uprv_strlen(poolResName) + 1)); + if (nameWithoutSuffix == nullptr) { + fprintf(stderr, "out of memory error\n"); + return U_MEMORY_ALLOCATION_ERROR; + } + uprv_strcpy(nameWithoutSuffix, poolResName); + *uprv_strrchr(nameWithoutSuffix, '.') = 0; + newPoolBundle->fLocale = nameWithoutSuffix; + } + } + + if(options[USE_POOL_BUNDLE].doesOccur) { + const char *poolResName = "pool.res"; + FileStream *poolFile; + int32_t poolFileSize; + int32_t indexLength; + /* + * TODO: Consolidate inputDir/filename handling from main() and processFile() + * into a common function, and use it here as well. + * Try to create toolutil functions for dealing with dir/filenames and + * loading ICU data files without udata_open(). + * Share code with icupkg? + * Also, make_res_filename() seems to be unused. Review and remove. 
+ */ + CharString poolFileName; + if (options[USE_POOL_BUNDLE].value!=nullptr) { + poolFileName.append(options[USE_POOL_BUNDLE].value, status); + } else if (inputDir) { + poolFileName.append(inputDir, status); + } + poolFileName.appendPathPart(poolResName, status); + if (U_FAILURE(status)) { + return status; + } + poolFile = T_FileStream_open(poolFileName.data(), "rb"); + if (poolFile == nullptr) { + fprintf(stderr, "unable to open pool bundle file %s\n", poolFileName.data()); + return 1; + } + poolFileSize = T_FileStream_size(poolFile); + if (poolFileSize < 32) { + fprintf(stderr, "the pool bundle file %s is too small\n", poolFileName.data()); + return 1; + } + poolBundle.fBytes = new uint8_t[(poolFileSize + 15) & ~15]; + if (poolFileSize > 0 && poolBundle.fBytes == nullptr) { + fprintf(stderr, "unable to allocate memory for the pool bundle file %s\n", poolFileName.data()); + return U_MEMORY_ALLOCATION_ERROR; + } + + UDataSwapper *ds; + const DataHeader *header; + int32_t bytesRead = T_FileStream_read(poolFile, poolBundle.fBytes, poolFileSize); + if (bytesRead != poolFileSize) { + fprintf(stderr, "unable to read the pool bundle file %s\n", poolFileName.data()); + return 1; + } + /* + * Swap the pool bundle so that a single checked-in file can be used. + * The swapper functions also test that the data looks like + * a well-formed .res file. 
+ */ + ds = udata_openSwapperForInputData(poolBundle.fBytes, bytesRead, + U_IS_BIG_ENDIAN, U_CHARSET_FAMILY, &status); + if (U_FAILURE(status)) { + fprintf(stderr, "udata_openSwapperForInputData(pool bundle %s) failed: %s\n", + poolFileName.data(), u_errorName(status)); + return status; + } + ures_swap(ds, poolBundle.fBytes, bytesRead, poolBundle.fBytes, &status); + udata_closeSwapper(ds); + if (U_FAILURE(status)) { + fprintf(stderr, "ures_swap(pool bundle %s) failed: %s\n", + poolFileName.data(), u_errorName(status)); + return status; + } + header = (const DataHeader *)poolBundle.fBytes; + if (header->info.formatVersion[0] < 2) { + fprintf(stderr, "invalid format of pool bundle file %s\n", poolFileName.data()); + return U_INVALID_FORMAT_ERROR; + } + const int32_t *pRoot = (const int32_t *)( + (const char *)header + header->dataHeader.headerSize); + poolBundle.fIndexes = pRoot + 1; + indexLength = poolBundle.fIndexes[URES_INDEX_LENGTH] & 0xff; + if (indexLength <= URES_INDEX_POOL_CHECKSUM) { + fprintf(stderr, "insufficient indexes[] in pool bundle file %s\n", poolFileName.data()); + return U_INVALID_FORMAT_ERROR; + } + int32_t keysBottom = 1 + indexLength; + int32_t keysTop = poolBundle.fIndexes[URES_INDEX_KEYS_TOP]; + poolBundle.fKeys = (const char *)(pRoot + keysBottom); + poolBundle.fKeysLength = (keysTop - keysBottom) * 4; + poolBundle.fChecksum = poolBundle.fIndexes[URES_INDEX_POOL_CHECKSUM]; + + for (i = 0; i < poolBundle.fKeysLength; ++i) { + if (poolBundle.fKeys[i] == 0) { + ++poolBundle.fKeysCount; + } + } + + // 16BitUnits[] begins with strings-v2. + // The strings-v2 may optionally be terminated by what looks like + // an explicit string length that exceeds the number of remaining 16-bit units. 
+ int32_t stringUnitsLength = (poolBundle.fIndexes[URES_INDEX_16BIT_TOP] - keysTop) * 2; + if (stringUnitsLength >= 2 && getFormatVersion() >= 3) { + poolBundle.fStrings = new PseudoListResource(nullptr, status); + if (poolBundle.fStrings == nullptr) { + fprintf(stderr, "unable to allocate memory for the pool bundle strings %s\n", + poolFileName.data()); + return U_MEMORY_ALLOCATION_ERROR; + } + // The PseudoListResource constructor call did not allocate further memory. + assert(U_SUCCESS(status)); + const char16_t *p = (const char16_t *)(pRoot + keysTop); + int32_t remaining = stringUnitsLength; + do { + int32_t first = *p; + int8_t numCharsForLength; + int32_t length; + if (!U16_IS_TRAIL(first)) { + // NUL-terminated + numCharsForLength = 0; + for (length = 0; + length < remaining && p[length] != 0; + ++length) {} + } else if (first < 0xdfef) { + numCharsForLength = 1; + length = first & 0x3ff; + } else if (first < 0xdfff && remaining >= 2) { + numCharsForLength = 2; + length = ((first - 0xdfef) << 16) | p[1]; + } else if (first == 0xdfff && remaining >= 3) { + numCharsForLength = 3; + length = ((int32_t)p[1] << 16) | p[2]; + } else { + break; // overrun + } + // Check for overrun before changing remaining, + // so that it is always accurate after the loop body. + if ((numCharsForLength + length) >= remaining || + p[numCharsForLength + length] != 0) { + break; // overrun or explicitly terminated + } + int32_t poolStringIndex = stringUnitsLength - remaining; + // Maximum pool string index when suffix-sharing the last character. 
+ int32_t maxStringIndex = poolStringIndex + numCharsForLength + length - 1; + if (maxStringIndex >= RES_MAX_OFFSET) { + // pool string index overrun + break; + } + p += numCharsForLength; + remaining -= numCharsForLength; + if (length != 0) { + StringResource *sr = + new StringResource(poolStringIndex, numCharsForLength, + p, length, status); + if (sr == nullptr) { + fprintf(stderr, "unable to allocate memory for a pool bundle string %s\n", + poolFileName.data()); + return U_MEMORY_ALLOCATION_ERROR; + } + poolBundle.fStrings->add(sr); + poolBundle.fStringIndexLimit = maxStringIndex + 1; + // The StringResource constructor did not allocate further memory. + assert(U_SUCCESS(status)); + } + p += length + 1; + remaining -= length + 1; + } while (remaining > 0); + if (poolBundle.fStrings->fCount == 0) { + delete poolBundle.fStrings; + poolBundle.fStrings = nullptr; + } + } + + T_FileStream_close(poolFile); + setUsePoolBundle(true); + if (isVerbose() && poolBundle.fStrings != nullptr) { + printf("number of shared strings: %d\n", (int)poolBundle.fStrings->fCount); + int32_t length = poolBundle.fStringIndexLimit + 1; // incl. last NUL + printf("16-bit units for strings: %6d = %6d bytes\n", + (int)length, (int)length * 2); + } + } + + if(!options[FORMAT_VERSION].doesOccur && getFormatVersion() == 3 && + poolBundle.fStrings == nullptr && + !options[WRITE_POOL_BUNDLE].doesOccur) { + // If we just default to formatVersion 3 + // but there are no pool bundle strings to share + // and we do not write a pool bundle, + // then write formatVersion 2 which is just as good. 
+ setFormatVersion(2); + } + + if(options[INCLUDE_UNIHAN_COLL].doesOccur) { + puts("genrb option --includeUnihanColl ignored: \n" + "CLDR 26/ICU 54 unihan data is small, except\n" + "the ucadata-unihan.icu version of the collation root data\n" + "is about 300kB larger than the ucadata-implicithan.icu version."); + } + + if((argc-1)!=1) { + printf("genrb number of files: %d\n", argc - 1); + } + /* generate the binary files */ + for(i = 1; i < argc; ++i) { + status = U_ZERO_ERROR; + arg = getLongPathname(argv[i]); + + CharString theCurrentFileName; + if (inputDir) { + theCurrentFileName.append(inputDir, status); + } + theCurrentFileName.appendPathPart(arg, status); + if (U_FAILURE(status)) { + break; + } + + gCurrentFileName = theCurrentFileName.data(); + if (isVerbose()) { + printf("Processing file \"%s\"\n", theCurrentFileName.data()); + } + processFile(arg, encoding, inputDir, outputDir, filterDir, nullptr, + newPoolBundle.getAlias(), + options[NO_BINARY_COLLATION].doesOccur, status); + } + + poolBundle.close(); + + if(U_SUCCESS(status) && options[WRITE_POOL_BUNDLE].doesOccur) { + const char* writePoolDir; + if (options[WRITE_POOL_BUNDLE].value!=nullptr) { + writePoolDir = options[WRITE_POOL_BUNDLE].value; + } else { + writePoolDir = outputDir; + } + char outputFileName[256]; + newPoolBundle->write(writePoolDir, nullptr, outputFileName, sizeof(outputFileName), status); + if(U_FAILURE(status)) { + fprintf(stderr, "unable to write the pool bundle: %s\n", u_errorName(status)); + } + } + + u_cleanup(); + + /* Don't return warnings as a failure */ + if (U_SUCCESS(status)) { + return 0; + } + + return status; +} + +/* Process a file */ +void +processFile(const char *filename, const char *cp, + const char *inputDir, const char *outputDir, const char *filterDir, + const char *packageName, + SRBRoot *newPoolBundle, + UBool omitBinaryCollation, UErrorCode &status) { + LocalPointer<SRBRoot> data; + LocalUCHARBUFPointer ucbuf; + CharString openFileName; + CharString 
inputDirBuf; + + char outputFileName[256]; + int32_t dirlen = 0; + + if (U_FAILURE(status)) { + return; + } + if(filename==nullptr){ + status=U_ILLEGAL_ARGUMENT_ERROR; + return; + } + + if(inputDir == nullptr) { + const char *filenameBegin = uprv_strrchr(filename, U_FILE_SEP_CHAR); + if (filenameBegin != nullptr) { + /* + * When a filename ../../../data/root.txt is specified, + * we presume that the input directory is ../../../data + * This is very important when the resource file includes + * another file, like UCARules.txt or thaidict.brk. + */ + int32_t filenameSize = (int32_t)(filenameBegin - filename + 1); + inputDirBuf.append(filename, filenameSize, status); + + inputDir = inputDirBuf.data(); + dirlen = inputDirBuf.length(); + } + }else{ + dirlen = (int32_t)uprv_strlen(inputDir); + + if(inputDir[dirlen-1] != U_FILE_SEP_CHAR) { + /* + * append the input dir to openFileName if the first char in + * filename is not file separation char and the last char input directory is not '.'. + * This is to support : + * genrb -s. /home/icu/data + * genrb -s. icu/data + * The user cannot mix notations like + * genrb -s. /icu/data --- the absolute path specified. -s redundant + * user should use + * genrb -s. icu/data --- start from CWD and look in icu/data dir + */ + if( (filename[0] != U_FILE_SEP_CHAR) && (inputDir[dirlen-1] !='.')){ + openFileName.append(inputDir, status); + } + } else { + openFileName.append(inputDir, status); + } + } + openFileName.appendPathPart(filename, status); + + // Test for CharString failure + if (U_FAILURE(status)) { + return; + } + + ucbuf.adoptInstead(ucbuf_open(openFileName.data(), &cp,getShowWarning(),true, &status)); + if(status == U_FILE_ACCESS_ERROR) { + + fprintf(stderr, "couldn't open file %s\n", openFileName.data()); + return; + } + if (ucbuf.isNull() || U_FAILURE(status)) { + fprintf(stderr, "An error occurred processing file %s. 
Error: %s\n", + openFileName.data(), u_errorName(status)); + return; + } + /* auto detected popular encodings? */ + if (cp!=nullptr && isVerbose()) { + printf("autodetected encoding %s\n", cp); + } + /* Parse the data into an SRBRoot */ + data.adoptInstead(parse(ucbuf.getAlias(), inputDir, outputDir, filename, + !omitBinaryCollation, options[NO_COLLATION_RULES].doesOccur, options[ICU4X_MODE].doesOccur, &status)); + + if (data.isNull() || U_FAILURE(status)) { + fprintf(stderr, "couldn't parse the file %s. Error:%s\n", filename, u_errorName(status)); + return; + } + + // Run filtering before writing pool bundle + if (filterDir != nullptr) { + CharString filterFileName(filterDir, status); + filterFileName.appendPathPart(filename, status); + if (U_FAILURE(status)) { + return; + } + + // Open the file and read it into filter + SimpleRuleBasedPathFilter filter; + std::ifstream f(filterFileName.data()); + if (f.fail()) { + std::cerr << "genrb error: unable to open " << filterFileName.data() << std::endl; + status = U_FILE_ACCESS_ERROR; + return; + } + std::string currentLine; + while (std::getline(f, currentLine)) { + // Ignore # comments and empty lines + if (currentLine.empty() || currentLine[0] == '#') { + continue; + } + filter.addRule(currentLine, status); + if (U_FAILURE(status)) { + return; + } + } + + if (isVerbose()) { + filter.print(std::cout); + } + + // Apply the filter to the data + ResKeyPath path; + data->fRoot->applyFilter(filter, path, data.getAlias()); + } + + if(options[WRITE_POOL_BUNDLE].doesOccur) { + data->fWritePoolBundle = newPoolBundle; + data->compactKeys(status); + int32_t newKeysLength; + const char *newKeys = data->getKeyBytes(&newKeysLength); + newPoolBundle->addKeyBytes(newKeys, newKeysLength, status); + if(U_FAILURE(status)) { + fprintf(stderr, "bundle_compactKeys(%s) or bundle_getKeyBytes() failed: %s\n", + filename, u_errorName(status)); + return; + } + /* count the number of just-added key strings */ + for(const char *newKeysLimit = 
newKeys + newKeysLength; newKeys < newKeysLimit; ++newKeys) { + if(*newKeys == 0) { + ++newPoolBundle->fKeysCount; + } + } + } + + if(options[USE_POOL_BUNDLE].doesOccur) { + data->fUsePoolBundle = &poolBundle; + } + + /* Determine the target rb filename */ + uprv_free(make_res_filename(filename, outputDir, packageName, status)); + if(U_FAILURE(status)) { + fprintf(stderr, "couldn't make the res fileName for bundle %s. Error:%s\n", + filename, u_errorName(status)); + return; + } + if(write_java== true){ + bundle_write_java(data.getAlias(), outputDir, outputEnc, + outputFileName, sizeof(outputFileName), + options[JAVA_PACKAGE].value, options[BUNDLE_NAME].value, &status); + }else if(write_xliff ==true){ + bundle_write_xml(data.getAlias(), outputDir, outputEnc, + filename, outputFileName, sizeof(outputFileName), + language, xliffOutputFileName, &status); + }else{ + /* Write the data to the file */ + data->write(outputDir, packageName, outputFileName, sizeof(outputFileName), status); + } + if (U_FAILURE(status)) { + fprintf(stderr, "couldn't write bundle %s. 
Error:%s\n", outputFileName, u_errorName(status)); + } +} + +/* Generate the target .res file name from the input file name */ +static char* +make_res_filename(const char *filename, + const char *outputDir, + const char *packageName, + UErrorCode &status) { + char *basename; + char *dirname; + char *resName; + + int32_t pkgLen = 0; /* length of package prefix */ + + + if (U_FAILURE(status)) { + return 0; + } + + if(packageName != nullptr) + { + pkgLen = (int32_t)(1 + uprv_strlen(packageName)); + } + + /* setup */ + basename = dirname = resName = 0; + + /* determine basename, and compiled file names */ + basename = (char*) uprv_malloc(sizeof(char) * (uprv_strlen(filename) + 1)); + if(basename == 0) { + status = U_MEMORY_ALLOCATION_ERROR; + goto finish; + } + + get_basename(basename, filename); + + dirname = (char*) uprv_malloc(sizeof(char) * (uprv_strlen(filename) + 1)); + if(dirname == 0) { + status = U_MEMORY_ALLOCATION_ERROR; + goto finish; + } + + get_dirname(dirname, filename); + + if (outputDir == nullptr) { + /* output in same dir as .txt */ + resName = (char*) uprv_malloc(sizeof(char) * (uprv_strlen(dirname) + + pkgLen + + uprv_strlen(basename) + + uprv_strlen(RES_SUFFIX) + 8)); + if(resName == 0) { + status = U_MEMORY_ALLOCATION_ERROR; + goto finish; + } + + uprv_strcpy(resName, dirname); + + if(packageName != nullptr) + { + uprv_strcat(resName, packageName); + uprv_strcat(resName, "_"); + } + + uprv_strcat(resName, basename); + + } else { + int32_t dirlen = (int32_t)uprv_strlen(outputDir); + int32_t basenamelen = (int32_t)uprv_strlen(basename); + + resName = (char*) uprv_malloc(sizeof(char) * (dirlen + pkgLen + basenamelen + 8)); + + if (resName == nullptr) { + status = U_MEMORY_ALLOCATION_ERROR; + goto finish; + } + + uprv_strcpy(resName, outputDir); + + if(outputDir[dirlen] != U_FILE_SEP_CHAR) { + resName[dirlen] = U_FILE_SEP_CHAR; + resName[dirlen + 1] = '\0'; + } + + if(packageName != nullptr) + { + uprv_strcat(resName, packageName); + 
uprv_strcat(resName, "_"); + } + + uprv_strcat(resName, basename); + } + +finish: + uprv_free(basename); + uprv_free(dirname); + + return resName; +} + +/* + * Local Variables: + * indent-tabs-mode: nil + * End: + */ diff --git a/intl/icu/source/tools/genrb/genrb.h b/intl/icu/source/tools/genrb/genrb.h new file mode 100644 index 0000000000..019020a34a --- /dev/null +++ b/intl/icu/source/tools/genrb/genrb.h @@ -0,0 +1,52 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2002-2015, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* +* File genrb.h +*/ + +#ifndef GENRB_H +#define GENRB_H + +#include <stdio.h> +#include "unicode/utypes.h" +#include "unicode/putil.h" +#include "cmemory.h" +#include "cstring.h" +#include "filestrm.h" + + +#include "ucbuf.h" +#include "errmsg.h" +#include "parse.h" +#include "rbutil.h" + +#include "toolutil.h" +#include "uoptions.h" + +#include "unicode/ucol.h" +#include "unicode/uloc.h" + +/* The version of genrb */ +#define GENRB_VERSION "56" + +U_CDECL_BEGIN + +U_CAPI void processFile( + const char *filename, + const char* cp, + const char *inputDir, + const char *outputDir, + const char *packageName, + UBool omitBinaryCollation, + UErrorCode *status); + +U_CDECL_END + +#endif diff --git a/intl/icu/source/tools/genrb/genrb.vcxproj b/intl/icu/source/tools/genrb/genrb.vcxproj new file mode 100644 index 0000000000..66651c11d0 --- /dev/null +++ b/intl/icu/source/tools/genrb/genrb.vcxproj @@ -0,0 +1,113 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <PropertyGroup Label="Globals"> + <ProjectGuid>{97521D06-EC47-45D4-8BD0-9E16B3F93B2A}</ProjectGuid> 
+ </PropertyGroup> + <PropertyGroup Label="Configuration"> + <ConfigurationType>Application</ConfigurationType> + <UseOfMfc>false</UseOfMfc> + <CharacterSet>MultiByte</CharacterSet> + </PropertyGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> + <!-- The following import will include the 'default' configuration options for VS projects. --> + <Import Project="..\..\allinone\Build.Windows.ProjectConfiguration.props" /> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> + <ImportGroup Label="ExtensionSettings"> + </ImportGroup> + <PropertyGroup Label="UserMacros" /> + <PropertyGroup> + <_ProjectFileVersion>10.0.30319.1</_ProjectFileVersion> + <OutDir>.\$(Platform)\$(Configuration)\</OutDir> + <IntDir>.\$(Platform)\$(Configuration)\</IntDir> + <!-- The ICU projects use "Win32" to mean "x86", so we need to special case it. --> + <OutDir Condition="'$(Platform)'=='Win32'">.\x86\$(Configuration)\</OutDir> + <IntDir Condition="'$(Platform)'=='Win32'">.\x86\$(Configuration)\</IntDir> + <!-- Disable Incremental Linking for Release builds as it prevents Link-time Code Generation --> + <LinkIncremental Condition="'$(Configuration)'=='Debug'">true</LinkIncremental> + <LinkIncremental Condition="'$(Configuration)'=='Release'">false</LinkIncremental> + </PropertyGroup> + <!-- Options that are common to *all* configurations --> + <ItemDefinitionGroup> + <Midl> + <TypeLibraryName>$(OutDir)/genrb.tlb</TypeLibraryName> + </Midl> + <ClCompile> + <WarningLevel>Level3</WarningLevel> + <CompileAs>Default</CompileAs> + <DisableLanguageExtensions>true</DisableLanguageExtensions> + <AdditionalIncludeDirectories>..\..\..\include;..\..\common;..\toolutil;..\..\i18n;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> + <PrecompiledHeaderOutputFile>$(OutDir)/genrb.pch</PrecompiledHeaderOutputFile> + <AssemblerListingLocation>$(OutDir)/</AssemblerListingLocation> + <ObjectFileName>$(OutDir)/</ObjectFileName> + 
<ProgramDataBaseFileName>$(OutDir)/genrb.pdb</ProgramDataBaseFileName> + </ClCompile> + <Link> + <SubSystem>Console</SubSystem> + <OutputFile>$(OutDir)/genrb.exe</OutputFile> + <AdditionalLibraryDirectories>..\..\..\$(IcuLibOutputDir);%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories> + </Link> + <CustomBuildStep> + <Command>copy "$(TargetPath)" ..\..\..\$(IcuBinOutputDir)</Command> + <Outputs>..\..\..\$(IcuBinOutputDir)\$(TargetFileName);%(Outputs)</Outputs> + </CustomBuildStep> + </ItemDefinitionGroup> + <!-- Options that are common to all 'Debug' project configurations --> + <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'"> + <ClCompile> + <BrowseInformation>true</BrowseInformation> + <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary> + </ClCompile> + <Link> + <AdditionalDependencies>icuucd.lib;icuind.lib;icutud.lib;%(AdditionalDependencies)</AdditionalDependencies> + </Link> + </ItemDefinitionGroup> + <!-- Options that are common to all 'Release' project configurations --> + <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'"> + <ClCompile> + <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary> + <FunctionLevelLinking>true</FunctionLevelLinking> + </ClCompile> + <Link> + <AdditionalDependencies>icuuc.lib;icuin.lib;icutu.lib;%(AdditionalDependencies)</AdditionalDependencies> + </Link> + </ItemDefinitionGroup> + <ItemGroup> + <ClCompile Include="errmsg.c" /> + <ClCompile Include="filterrb.cpp" /> + <ClCompile Include="genrb.cpp" /> + <ClCompile Include="parse.cpp"> + <DisableLanguageExtensions>false</DisableLanguageExtensions> + </ClCompile> + <ClCompile Include="prscmnts.cpp"> + <DisableLanguageExtensions>false</DisableLanguageExtensions> + </ClCompile> + <ClCompile Include="rbutil.c" /> + <ClCompile Include="read.c" /> + <ClCompile Include="reslist.cpp"> + <DisableLanguageExtensions>false</DisableLanguageExtensions> + </ClCompile> + <ClCompile Include="rle.c" /> + <ClCompile Include="ustr.c" /> + <ClCompile 
Include="wrtjava.cpp"> + <DisableLanguageExtensions>false</DisableLanguageExtensions> + </ClCompile> + <ClCompile Include="wrtxml.cpp"> + <DisableLanguageExtensions>false</DisableLanguageExtensions> + </ClCompile> + </ItemGroup> + <ItemGroup> + <ClInclude Include="errmsg.h" /> + <ClInclude Include="genrb.h" /> + <ClInclude Include="filterrb.h" /> + <ClInclude Include="parse.h" /> + <ClInclude Include="prscmnts.h" /> + <ClInclude Include="rbutil.h" /> + <ClInclude Include="read.h" /> + <ClInclude Include="reslist.h" /> + <ClInclude Include="rle.h" /> + <ClInclude Include="ustr.h" /> + </ItemGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> + <ImportGroup Label="ExtensionTargets"> + </ImportGroup> +</Project>
\ No newline at end of file diff --git a/intl/icu/source/tools/genrb/genrb.vcxproj.filters b/intl/icu/source/tools/genrb/genrb.vcxproj.filters new file mode 100644 index 0000000000..1f2f5b3b8c --- /dev/null +++ b/intl/icu/source/tools/genrb/genrb.vcxproj.filters @@ -0,0 +1,87 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <ItemGroup> + <Filter Include="Source Files"> + <UniqueIdentifier>{2dee2c2f-25a5-43f0-985f-de4ba26925b4}</UniqueIdentifier> + <Extensions>cpp;c;cxx;rc;def;r;odl;idl;hpj;bat</Extensions> + </Filter> + <Filter Include="Header Files"> + <UniqueIdentifier>{7156c811-7116-4eef-8bb1-0400c51f9fd3}</UniqueIdentifier> + <Extensions>h;hpp;hxx;hm;inl</Extensions> + </Filter> + <Filter Include="Resource Files"> + <UniqueIdentifier>{df647868-56cc-475d-a3f6-1d1f50aa5e4f}</UniqueIdentifier> + <Extensions>ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe</Extensions> + </Filter> + </ItemGroup> + <ItemGroup> + <ClCompile Include="errmsg.c"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="filterrb.cpp"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="genrb.cpp"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="parse.cpp"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="prscmnts.cpp"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="rbutil.c"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="read.c"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="reslist.cpp"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="rle.c"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="ustr.c"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="wrtjava.cpp"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="wrtxml.cpp"> + <Filter>Source Files</Filter> + </ClCompile> + </ItemGroup> + 
<ItemGroup> + <ClInclude Include="errmsg.h"> + <Filter>Header Files</Filter> + </ClInclude> + <ClInclude Include="filterrb.h"> + <Filter>Header Files</Filter> + </ClInclude> + <ClInclude Include="genrb.h"> + <Filter>Header Files</Filter> + </ClInclude> + <ClInclude Include="parse.h"> + <Filter>Header Files</Filter> + </ClInclude> + <ClInclude Include="prscmnts.h"> + <Filter>Header Files</Filter> + </ClInclude> + <ClInclude Include="rbutil.h"> + <Filter>Header Files</Filter> + </ClInclude> + <ClInclude Include="read.h"> + <Filter>Header Files</Filter> + </ClInclude> + <ClInclude Include="reslist.h"> + <Filter>Header Files</Filter> + </ClInclude> + <ClInclude Include="rle.h"> + <Filter>Header Files</Filter> + </ClInclude> + <ClInclude Include="ustr.h"> + <Filter>Header Files</Filter> + </ClInclude> + </ItemGroup> +</Project>
\ No newline at end of file diff --git a/intl/icu/source/tools/genrb/parse.cpp b/intl/icu/source/tools/genrb/parse.cpp new file mode 100644 index 0000000000..1e82bda6e5 --- /dev/null +++ b/intl/icu/source/tools/genrb/parse.cpp @@ -0,0 +1,2435 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 1998-2015, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* +* File parse.cpp +* +* Modification History: +* +* Date Name Description +* 05/26/99 stephen Creation. +* 02/25/00 weiv Overhaul to write udata +* 5/10/01 Ram removed ustdio dependency +* 06/10/2001 Dominic Ludlam <dom@recoil.org> Rewritten +******************************************************************************* +*/ + +// Safer use of UnicodeString. +#include <cstdint> +#include "unicode/umachine.h" +#ifndef UNISTR_FROM_CHAR_EXPLICIT +# define UNISTR_FROM_CHAR_EXPLICIT explicit +#endif + +// Less important, but still a good idea. 
+#ifndef UNISTR_FROM_STRING_EXPLICIT +# define UNISTR_FROM_STRING_EXPLICIT explicit +#endif + +#include <assert.h> +#include "parse.h" +#include "errmsg.h" +#include "uhash.h" +#include "cmemory.h" +#include "cstring.h" +#include "uinvchar.h" +#include "read.h" +#include "ustr.h" +#include "reslist.h" +#include "rbt_pars.h" +#include "genrb.h" +#include "unicode/normalizer2.h" +#include "unicode/stringpiece.h" +#include "unicode/unistr.h" +#include "unicode/ustring.h" +#include "unicode/uscript.h" +#include "unicode/utf16.h" +#include "unicode/putil.h" +#include "charstr.h" +#include "collationbuilder.h" +#include "collationdata.h" +#include "collationdatareader.h" +#include "collationdatawriter.h" +#include "collationfastlatinbuilder.h" +#include "collationinfo.h" +#include "collationroot.h" +#include "collationruleparser.h" +#include "collationtailoring.h" +#include <stdio.h> +#include "writesrc.h" + +/* Number of tokens to read ahead of the current stream position */ +#define MAX_LOOKAHEAD 3 + +#define CR 0x000D +#define LF 0x000A +#define SPACE 0x0020 +#define TAB 0x0009 +#define ESCAPE 0x005C +#define HASH 0x0023 +#define QUOTE 0x0027 +#define ZERO 0x0030 +#define STARTCOMMAND 0x005B +#define ENDCOMMAND 0x005D +#define OPENSQBRACKET 0x005B +#define CLOSESQBRACKET 0x005D + +#define ICU4X_DIACRITIC_BASE 0x0300 +#define ICU4X_DIACRITIC_LIMIT 0x034F + +using icu::CharString; +using icu::LocalMemory; +using icu::LocalPointer; +using icu::LocalUCHARBUFPointer; +using icu::StringPiece; +using icu::UnicodeString; + +struct Lookahead +{ + enum ETokenType type; + struct UString value; + struct UString comment; + uint32_t line; +}; + +/* keep in sync with token defines in read.h */ +const char *tokenNames[TOK_TOKEN_COUNT] = +{ + "string", /* A string token, such as "MonthNames" */ + "'{'", /* An opening brace character */ + "'}'", /* A closing brace character */ + "','", /* A comma */ + "':'", /* A colon */ + + "<end of file>", /* End of the file has been reached 
successfully */ + "<end of line>" +}; + +/* Just to store "TRUE" */ +//static const char16_t trueValue[] = {0x0054, 0x0052, 0x0055, 0x0045, 0x0000}; + +typedef struct { + struct Lookahead lookahead[MAX_LOOKAHEAD + 1]; + uint32_t lookaheadPosition; + UCHARBUF *buffer; + struct SRBRoot *bundle; + const char *inputdir; + uint32_t inputdirLength; + const char *outputdir; + uint32_t outputdirLength; + const char *filename; + UBool makeBinaryCollation; + UBool omitCollationRules; + UBool icu4xMode; +} ParseState; + +typedef struct SResource * +ParseResourceFunction(ParseState* state, char *tag, uint32_t startline, const struct UString* comment, UErrorCode *status); + +static struct SResource *parseResource(ParseState* state, char *tag, const struct UString *comment, UErrorCode *status); + +/* The nature of the lookahead buffer: + There are MAX_LOOKAHEAD + 1 slots, used as a circular buffer. This provides + MAX_LOOKAHEAD lookahead tokens and a slot for the current token and value. + When getToken is called, the current pointer is moved to the next slot and the + old slot is filled with the next token from the reader by calling getNextToken. + The token values are stored in the slot, which means that token values don't + survive a call to getToken, ie. 
+ + UString *value; + + getToken(&value, nullptr, status); + getToken(nullptr, nullptr, status); bad - value is now a different string +*/ +static void +initLookahead(ParseState* state, UCHARBUF *buf, UErrorCode *status) +{ /* Start at slot 0, remember the input buffer, reset the line counter, then pre-read MAX_LOOKAHEAD tokens into slots 0 .. MAX_LOOKAHEAD-1. */ + static uint32_t initTypeStrings = 0; /* one-time flag; nothing else in this function depends on it */ + uint32_t i; + + if (!initTypeStrings) + { + initTypeStrings = 1; + } + + state->lookaheadPosition = 0; + state->buffer = buf; + + resetLineNumber(); + + for (i = 0; i < MAX_LOOKAHEAD; i++) + { + state->lookahead[i].type = getNextToken(state->buffer, &state->lookahead[i].value, &state->lookahead[i].line, &state->lookahead[i].comment, status); + if (U_FAILURE(*status)) + { + return; + } + } + /* Reached only when no read failed; any non-failure code left in *status is reset. */ + *status = U_ZERO_ERROR; +} + +static void +cleanupLookahead(ParseState* state) +{ /* Calls ustr_deinit on the value and comment strings of every slot, all MAX_LOOKAHEAD + 1 of them. */ + uint32_t i; + for (i = 0; i <= MAX_LOOKAHEAD; i++) + { + ustr_deinit(&state->lookahead[i].value); + ustr_deinit(&state->lookahead[i].comment); + } + +} + +static enum ETokenType +getToken(ParseState* state, struct UString **tokenValue, struct UString* comment, uint32_t *linenumber, UErrorCode *status) +{ /* Return the type of the current slot and advance. tokenValue, comment and linenumber are optional outputs (nullptr skips them); *tokenValue points into the slot rather than at a copy, while comment receives a copy. */ + enum ETokenType result; + uint32_t i; + + result = state->lookahead[state->lookaheadPosition].type; + + if (tokenValue != nullptr) + { + *tokenValue = &state->lookahead[state->lookaheadPosition].value; + } + + if (linenumber != nullptr) + { + *linenumber = state->lookahead[state->lookaheadPosition].line; + } + + if (comment != nullptr) + { + ustr_cpy(comment, &(state->lookahead[state->lookaheadPosition].comment), status); + } + /* i is the slot just behind the current one in the ring (position - 1 modulo MAX_LOOKAHEAD + 1). It is emptied and refilled with the next token from the reader, so the slot returned by this call stays intact until the following call. */ + i = (state->lookaheadPosition + MAX_LOOKAHEAD) % (MAX_LOOKAHEAD + 1); + state->lookaheadPosition = (state->lookaheadPosition + 1) % (MAX_LOOKAHEAD + 1); + ustr_setlen(&state->lookahead[i].comment, 0, status); + ustr_setlen(&state->lookahead[i].value, 0, status); + state->lookahead[i].type = getNextToken(state->buffer, &state->lookahead[i].value, &state->lookahead[i].line, &state->lookahead[i].comment, status); + + /* printf("getToken, returning %s\n", tokenNames[result]); */ + + return result; +} + +static 
enum ETokenType +peekToken(ParseState* state, uint32_t lookaheadCount, struct UString **tokenValue, uint32_t *linenumber, struct UString *comment, UErrorCode *status) +{ + uint32_t i = (state->lookaheadPosition + lookaheadCount) % (MAX_LOOKAHEAD + 1); + + if (U_FAILURE(*status)) + { + return TOK_ERROR; + } + + if (lookaheadCount >= MAX_LOOKAHEAD) + { + *status = U_INTERNAL_PROGRAM_ERROR; + return TOK_ERROR; + } + + if (tokenValue != nullptr) + { + *tokenValue = &state->lookahead[i].value; + } + + if (linenumber != nullptr) + { + *linenumber = state->lookahead[i].line; + } + + if(comment != nullptr){ + ustr_cpy(comment, &(state->lookahead[state->lookaheadPosition].comment), status); + } + + return state->lookahead[i].type; +} + +static void +expect(ParseState* state, enum ETokenType expectedToken, struct UString **tokenValue, struct UString *comment, uint32_t *linenumber, UErrorCode *status) +{ + uint32_t line; + + enum ETokenType token = getToken(state, tokenValue, comment, &line, status); + + if (linenumber != nullptr) + { + *linenumber = line; + } + + if (U_FAILURE(*status)) + { + return; + } + + if (token != expectedToken) + { + *status = U_INVALID_FORMAT_ERROR; + error(line, "expecting %s, got %s", tokenNames[expectedToken], tokenNames[token]); + } + else + { + *status = U_ZERO_ERROR; + } +} + +static char *getInvariantString(ParseState* state, uint32_t *line, struct UString *comment, + int32_t &stringLength, UErrorCode *status) +{ + struct UString *tokenValue; + char *result; + + expect(state, TOK_STRING, &tokenValue, comment, line, status); + + if (U_FAILURE(*status)) + { + return nullptr; + } + + if(!uprv_isInvariantUString(tokenValue->fChars, tokenValue->fLength)) { + *status = U_INVALID_FORMAT_ERROR; + error(*line, "invariant characters required for table keys, binary data, etc."); + return nullptr; + } + + result = static_cast<char *>(uprv_malloc(tokenValue->fLength+1)); + + if (result == nullptr) + { + *status = U_MEMORY_ALLOCATION_ERROR; + return 
nullptr; + } + + u_UCharsToChars(tokenValue->fChars, result, tokenValue->fLength+1); + stringLength = tokenValue->fLength; + return result; +} + +static struct SResource * +parseUCARules(ParseState* state, char *tag, uint32_t startline, const struct UString* /*comment*/, UErrorCode *status) +{ + struct SResource *result = nullptr; + struct UString *tokenValue; + FileStream *file = nullptr; + char filename[256] = { '\0' }; + char cs[128] = { '\0' }; + uint32_t line; + UBool quoted = false; + UCHARBUF *ucbuf=nullptr; + UChar32 c = 0; + const char* cp = nullptr; + char16_t *pTarget = nullptr; + char16_t *target = nullptr; + char16_t *targetLimit = nullptr; + int32_t size = 0; + + expect(state, TOK_STRING, &tokenValue, nullptr, &line, status); + + if(isVerbose()){ + printf(" %s at line %i \n", (tag == nullptr) ? "(null)" : tag, (int)startline); + } + + if (U_FAILURE(*status)) + { + return nullptr; + } + /* make the filename including the directory */ + if (state->inputdir != nullptr) + { + uprv_strcat(filename, state->inputdir); + + if (state->inputdir[state->inputdirLength - 1] != U_FILE_SEP_CHAR) + { + uprv_strcat(filename, U_FILE_SEP_STRING); + } + } + + u_UCharsToChars(tokenValue->fChars, cs, tokenValue->fLength); + + expect(state, TOK_CLOSE_BRACE, nullptr, nullptr, nullptr, status); + + if (U_FAILURE(*status)) + { + return nullptr; + } + uprv_strcat(filename, cs); + + if(state->omitCollationRules) { + return res_none(); + } + + ucbuf = ucbuf_open(filename, &cp, getShowWarning(),false, status); + + if (U_FAILURE(*status)) { + error(line, "An error occurred while opening the input file %s\n", filename); + return nullptr; + } + + /* We allocate more space than actually required + * since the actual size needed for storing UChars + * is not known in UTF-8 byte stream + */ + size = ucbuf_size(ucbuf) + 1; + pTarget = (char16_t*) uprv_malloc(U_SIZEOF_UCHAR * size); + uprv_memset(pTarget, 0, size*U_SIZEOF_UCHAR); + target = pTarget; + targetLimit = pTarget+size; + + /* 
read the rules into the buffer */ + while (target < targetLimit) + { + c = ucbuf_getc(ucbuf, status); + if(c == QUOTE) { + quoted = (UBool)!quoted; + } + /* weiv (06/26/2002): adding the following: + * - preserving spaces in commands [...] + * - # comments until the end of line + */ + if (c == STARTCOMMAND && !quoted) + { + /* preserve commands + * closing bracket will be handled by the + * append at the end of the loop + */ + while(c != ENDCOMMAND) { + U_APPEND_CHAR32_ONLY(c, target); + c = ucbuf_getc(ucbuf, status); + } + } + else if (c == HASH && !quoted) { + /* skip comments */ + while(c != CR && c != LF) { + c = ucbuf_getc(ucbuf, status); + } + continue; + } + else if (c == ESCAPE) + { + c = unescape(ucbuf, status); + + if (c == (UChar32)U_ERR) + { + uprv_free(pTarget); + T_FileStream_close(file); + return nullptr; + } + } + else if (!quoted && (c == SPACE || c == TAB || c == CR || c == LF)) + { + /* ignore spaces carriage returns + * and line feed unless in the form \uXXXX + */ + continue; + } + + /* Append char16_t * after dissembling if c > 0xffff*/ + if (c != (UChar32)U_EOF) + { + U_APPEND_CHAR32_ONLY(c, target); + } + else + { + break; + } + } + + /* terminate the string */ + if(target < targetLimit){ + *target = 0x0000; + } + + result = string_open(state->bundle, tag, pTarget, (int32_t)(target - pTarget), nullptr, status); + + + ucbuf_close(ucbuf); + uprv_free(pTarget); + T_FileStream_close(file); + + return result; +} + +static struct SResource * +parseTransliterator(ParseState* state, char *tag, uint32_t startline, const struct UString* /*comment*/, UErrorCode *status) +{ + struct SResource *result = nullptr; + struct UString *tokenValue; + FileStream *file = nullptr; + char filename[256] = { '\0' }; + char cs[128] = { '\0' }; + uint32_t line; + UCHARBUF *ucbuf=nullptr; + const char* cp = nullptr; + char16_t *pTarget = nullptr; + const char16_t *pSource = nullptr; + int32_t size = 0; + + expect(state, TOK_STRING, &tokenValue, nullptr, &line, status); + 
+ if(isVerbose()){ + printf(" %s at line %i \n", (tag == nullptr) ? "(null)" : tag, (int)startline); + } + + if (U_FAILURE(*status)) + { + return nullptr; + } + /* make the filename including the directory */ + if (state->inputdir != nullptr) + { + uprv_strcat(filename, state->inputdir); + + if (state->inputdir[state->inputdirLength - 1] != U_FILE_SEP_CHAR) + { + uprv_strcat(filename, U_FILE_SEP_STRING); + } + } + + u_UCharsToChars(tokenValue->fChars, cs, tokenValue->fLength); + + expect(state, TOK_CLOSE_BRACE, nullptr, nullptr, nullptr, status); + + if (U_FAILURE(*status)) + { + return nullptr; + } + uprv_strcat(filename, cs); + + + ucbuf = ucbuf_open(filename, &cp, getShowWarning(),false, status); + + if (U_FAILURE(*status)) { + error(line, "An error occurred while opening the input file %s\n", filename); + return nullptr; + } + + /* We allocate more space than actually required + * since the actual size needed for storing UChars + * is not known in UTF-8 byte stream + */ + pSource = ucbuf_getBuffer(ucbuf, &size, status); + pTarget = (char16_t*) uprv_malloc(U_SIZEOF_UCHAR * (size + 1)); + uprv_memset(pTarget, 0, size*U_SIZEOF_UCHAR); + +#if !UCONFIG_NO_TRANSLITERATION + size = utrans_stripRules(pSource, size, pTarget, status); +#else + size = 0; + fprintf(stderr, " Warning: writing empty transliteration data ( UCONFIG_NO_TRANSLITERATION ) \n"); +#endif + result = string_open(state->bundle, tag, pTarget, size, nullptr, status); + + ucbuf_close(ucbuf); + uprv_free(pTarget); + T_FileStream_close(file); + + return result; +} +static ArrayResource* dependencyArray = nullptr; + +static struct SResource * +parseDependency(ParseState* state, char *tag, uint32_t startline, const struct UString* comment, UErrorCode *status) +{ + struct SResource *result = nullptr; + struct SResource *elem = nullptr; + struct UString *tokenValue; + uint32_t line; + char filename[256] = { '\0' }; + char cs[128] = { '\0' }; + + expect(state, TOK_STRING, &tokenValue, nullptr, &line, status); + 
+ if(isVerbose()){ + printf(" %s at line %i \n", (tag == nullptr) ? "(null)" : tag, (int)startline); + } + + if (U_FAILURE(*status)) + { + return nullptr; + } + /* make the filename including the directory */ + if (state->outputdir != nullptr) + { + uprv_strcat(filename, state->outputdir); + + if (state->outputdir[state->outputdirLength - 1] != U_FILE_SEP_CHAR) + { + uprv_strcat(filename, U_FILE_SEP_STRING); + } + } + + u_UCharsToChars(tokenValue->fChars, cs, tokenValue->fLength); + + if (U_FAILURE(*status)) + { + return nullptr; + } + uprv_strcat(filename, cs); + if(!T_FileStream_file_exists(filename)){ + if(isStrict()){ + error(line, "The dependency file %s does not exist. Please make sure it exists.\n",filename); + }else{ + warning(line, "The dependency file %s does not exist. Please make sure it exists.\n",filename); + } + } + if(dependencyArray==nullptr){ + dependencyArray = array_open(state->bundle, "%%DEPENDENCY", nullptr, status); + } + if(tag!=nullptr){ + result = string_open(state->bundle, tag, tokenValue->fChars, tokenValue->fLength, comment, status); + } + elem = string_open(state->bundle, nullptr, tokenValue->fChars, tokenValue->fLength, comment, status); + + dependencyArray->add(elem); + + if (U_FAILURE(*status)) + { + return nullptr; + } + expect(state, TOK_CLOSE_BRACE, nullptr, nullptr, nullptr, status); + return result; +} +static struct SResource * +parseString(ParseState* state, char *tag, uint32_t startline, const struct UString* comment, UErrorCode *status) +{ + struct UString *tokenValue; + struct SResource *result = nullptr; + +/* if (tag != nullptr && uprv_strcmp(tag, "%%UCARULES") == 0) + { + return parseUCARules(tag, startline, status); + }*/ + if(isVerbose()){ + printf(" string %s at line %i \n", (tag == nullptr) ? 
"(null)" : tag, (int)startline); + } + expect(state, TOK_STRING, &tokenValue, nullptr, nullptr, status); + + if (U_SUCCESS(*status)) + { + /* create the string now - tokenValue doesn't survive a call to getToken (and therefore + doesn't survive expect either) */ + + result = string_open(state->bundle, tag, tokenValue->fChars, tokenValue->fLength, comment, status); + if(U_SUCCESS(*status) && result) { + expect(state, TOK_CLOSE_BRACE, nullptr, nullptr, nullptr, status); + + if (U_FAILURE(*status)) + { + res_close(result); + return nullptr; + } + } + } + + return result; +} + +static struct SResource * +parseAlias(ParseState* state, char *tag, uint32_t startline, const struct UString *comment, UErrorCode *status) +{ + struct UString *tokenValue; + struct SResource *result = nullptr; + + expect(state, TOK_STRING, &tokenValue, nullptr, nullptr, status); + + if(isVerbose()){ + printf(" alias %s at line %i \n", (tag == nullptr) ? "(null)" : tag, (int)startline); + } + + if (U_SUCCESS(*status)) + { + /* create the string now - tokenValue doesn't survive a call to getToken (and therefore + doesn't survive expect either) */ + + result = alias_open(state->bundle, tag, tokenValue->fChars, tokenValue->fLength, comment, status); + + expect(state, TOK_CLOSE_BRACE, nullptr, nullptr, nullptr, status); + + if (U_FAILURE(*status)) + { + res_close(result); + return nullptr; + } + } + + return result; +} + +#if !UCONFIG_NO_COLLATION + +namespace { + +static struct SResource* resLookup(struct SResource* res, const char* key){ + if (res == res_none() || !res->isTable()) { + return nullptr; + } + + TableResource *list = static_cast<TableResource *>(res); + SResource *current = list->fFirst; + while (current != nullptr) { + if (uprv_strcmp(((list->fRoot->fKeys) + (current->fKey)), key) == 0) { + return current; + } + current = current->fNext; + } + return nullptr; +} + +class GenrbImporter : public icu::CollationRuleParser::Importer { +public: + GenrbImporter(const char *in, const char 
*out) : inputDir(in), outputDir(out) {} + virtual ~GenrbImporter(); + virtual void getRules( + const char *localeID, const char *collationType, + UnicodeString &rules, + const char *&errorReason, UErrorCode &errorCode) override; + +private: + const char *inputDir; + const char *outputDir; +}; + +GenrbImporter::~GenrbImporter() {} + +void +GenrbImporter::getRules( + const char *localeID, const char *collationType, + UnicodeString &rules, + const char *& /*errorReason*/, UErrorCode &errorCode) { + CharString filename(localeID, errorCode); + for(int32_t i = 0; i < filename.length(); i++){ + if(filename[i] == '-'){ + filename.data()[i] = '_'; + } + } + filename.append(".txt", errorCode); + if (U_FAILURE(errorCode)) { + return; + } + CharString inputDirBuf; + CharString openFileName; + if(inputDir == nullptr) { + const char *filenameBegin = uprv_strrchr(filename.data(), U_FILE_SEP_CHAR); + if (filenameBegin != nullptr) { + /* + * When a filename ../../../data/root.txt is specified, + * we presume that the input directory is ../../../data + * This is very important when the resource file includes + * another file, like UCARules.txt or thaidict.brk. + */ + StringPiece dir = filename.toStringPiece(); + const char *filenameLimit = filename.data() + filename.length(); + dir.remove_suffix((int32_t)(filenameLimit - filenameBegin)); + inputDirBuf.append(dir, errorCode); + inputDir = inputDirBuf.data(); + } + }else{ + int32_t dirlen = (int32_t)uprv_strlen(inputDir); + + if((filename[0] != U_FILE_SEP_CHAR) && (inputDir[dirlen-1] !='.')) { + /* + * append the input dir to openFileName if the first char in + * filename is not file separator char and the last char input directory is not '.'. + * This is to support : + * genrb -s. /home/icu/data + * genrb -s. icu/data + * The user cannot mix notations like + * genrb -s. /icu/data --- the absolute path specified. -s redundant + * user should use + * genrb -s. 
icu/data --- start from CWD and look in icu/data dir + */ + openFileName.append(inputDir, dirlen, errorCode); + if(inputDir[dirlen-1] != U_FILE_SEP_CHAR) { + openFileName.append(U_FILE_SEP_CHAR, errorCode); + } + } + } + openFileName.append(filename, errorCode); + if(U_FAILURE(errorCode)) { + return; + } + // printf("GenrbImporter::getRules(%s, %s) reads %s\n", localeID, collationType, openFileName.data()); + const char* cp = ""; + LocalUCHARBUFPointer ucbuf( + ucbuf_open(openFileName.data(), &cp, getShowWarning(), true, &errorCode)); + if(errorCode == U_FILE_ACCESS_ERROR) { + fprintf(stderr, "couldn't open file %s\n", openFileName.data()); + return; + } + if (ucbuf.isNull() || U_FAILURE(errorCode)) { + fprintf(stderr, "An error occurred processing file %s. Error: %s\n", openFileName.data(), u_errorName(errorCode)); + return; + } + + /* Parse the data into an SRBRoot */ + LocalPointer<SRBRoot> data( + parse(ucbuf.getAlias(), inputDir, outputDir, filename.data(), false, false, false, &errorCode)); + if (U_FAILURE(errorCode)) { + return; + } + + struct SResource *root = data->fRoot; + struct SResource *collations = resLookup(root, "collations"); + if (collations != nullptr) { + struct SResource *collation = resLookup(collations, collationType); + if (collation != nullptr) { + struct SResource *sequence = resLookup(collation, "Sequence"); + if (sequence != nullptr && sequence->isString()) { + // No string pointer aliasing so that we need not hold onto the resource bundle. + StringResource *sr = static_cast<StringResource *>(sequence); + rules = sr->fString; + } + } + } +} + +// Quick-and-dirty escaping function. +// Assumes that we are on an ASCII-based platform. 
+static void +escape(const char16_t *s, char *buffer, size_t n) { + int32_t length = u_strlen(s); + int32_t i = 0; + for (;;) { + UChar32 c; + U16_NEXT(s, i, length, c); + if (c == 0) { + *buffer = 0; + return; + } else if (0x20 <= c && c <= 0x7e) { + // printable ASCII + *buffer++ = (char)c; // assumes ASCII-based platform + } else { + buffer += snprintf(buffer, n, "\\u%04X", (int)c); + } + } +} + +} // namespace + +static FILE* +openTOML(const char* outputdir, const char* name, const char* collationType, const char* structType, UErrorCode *status) { + CharString baseName; + baseName.append(name, *status); + baseName.append("_", *status); + baseName.append(collationType, *status); + baseName.append("_", *status); + baseName.append(structType, *status); + + CharString outFileName; + if (outputdir && *outputdir) { + outFileName.append(outputdir, *status).ensureEndsWithFileSeparator(*status); + } + outFileName.append(baseName, *status); + outFileName.append(".toml", *status); + if (U_FAILURE(*status)) { + return nullptr; + } + + FILE* f = fopen(outFileName.data(), "w"); + if (!f) { + *status = U_FILE_ACCESS_ERROR; + return nullptr; + } + usrc_writeFileNameGeneratedBy(f, "#", baseName.data(), "genrb -X"); + + return f; +} + +static void +writeCollationMetadataTOML(const char* outputdir, const char* name, const char* collationType, const uint32_t metadataBits, UErrorCode *status) { + FILE* f = openTOML(outputdir, name, collationType, "meta", status); + if (!f) { + return; + } + // printf("writeCollationMetadataTOML %s %s\n", name, collationType); + fprintf(f, "bits = 0x%X\n", metadataBits); + fclose(f); +} + +static UChar32 +writeCollationDiacriticsTOML(const char* outputdir, const char* name, const char* collationType, const icu::CollationData* data, UErrorCode *status) { + UChar32 limit = ICU4X_DIACRITIC_LIMIT; + FILE* f = openTOML(outputdir, name, collationType, "dia", status); + if (!f) { + return limit; + } + // printf("writeCollationDiacriticsTOML %s %s\n", name, 
collationType); + uint16_t secondaries[ICU4X_DIACRITIC_LIMIT-ICU4X_DIACRITIC_BASE]; + for (UChar32 c = ICU4X_DIACRITIC_BASE; c < ICU4X_DIACRITIC_LIMIT; ++c) { + uint16_t secondary = 0; + uint32_t ce32 = data->getCE32(c); + if (ce32 == icu::Collation::FALLBACK_CE32) { + ce32 = data->base->getCE32(c); + } + if (c == 0x0340 || c == 0x0341 || c == 0x0343 || c == 0x0344) { + // These never occur in NFD data + } else if (!icu::Collation::isSimpleOrLongCE32(ce32)) { + if (uprv_strcmp(name, "root") == 0) { + printf("UNSUPPORTED DIACRITIC CE32 in root: TAG: %X CE32: %X char: %X\n", icu::Collation::tagFromCE32(ce32), ce32, c); + fclose(f); + *status = U_INTERNAL_PROGRAM_ERROR; + return limit; + } + limit = c; + break; + } else { + uint64_t ce = uint64_t(icu::Collation::ceFromCE32(ce32)); + if ((ce & 0xFFFFFFFF0000FFFF) != uint64_t(icu::Collation::COMMON_TERTIARY_CE)) { + // Not a CE where only the secondary weight differs from the expected + // pattern. + limit = c; + break; + } + secondary = uint16_t(ce >> 16); + } + secondaries[c - ICU4X_DIACRITIC_BASE] = secondary; + + } + usrc_writeArray(f, "secondaries = [\n ", secondaries, 16, limit-ICU4X_DIACRITIC_BASE, " ", "\n]\n"); + fclose(f); + return limit; +} + +static void +writeCollationReorderingTOML(const char* outputdir, const char* name, const char* collationType, const icu::CollationSettings* settings, UErrorCode *status) { + FILE* f = openTOML(outputdir, name, collationType, "reord", status); + if (!f) { + return; + } + // printf("writeCollationReorderingTOML %s %s\n", name, collationType); + fprintf(f, "min_high_no_reorder = 0x%X\n", settings->minHighNoReorder); + usrc_writeArray(f, "reorder_table = [\n ", settings->reorderTable, 8, 256, " ", "\n]\n"); + usrc_writeArray(f, "reorder_ranges = [\n ", settings->reorderRanges, 32, settings->reorderRangesLength, " ", "\n]\n"); + fclose(f); +} + + +static void +writeCollationJamoTOML(const char* outputdir, const char* name, const char* collationType, const icu::CollationData* 
data, UErrorCode *status) { + FILE* f = openTOML(outputdir, name, collationType, "jamo", status); + if (!f) { + printf("writeCollationJamoTOML FAILED TO OPEN FILE %s %s\n", name, collationType); + return; + } + uint32_t jamo[0x1200-0x1100]; + for (UChar32 c = 0x1100; c < 0x1200; ++c) { + uint32_t ce32 = data->getCE32(c); + if (ce32 == icu::Collation::FALLBACK_CE32) { + ce32 = data->base->getCE32(c); + } + // Can't reject complex CE32s, because search collations have expansions. + // These expansions refer to the tailoring, which foils the reuse of the + // these jamo tables. + // XXX Figure out what to do. Perhaps instead of having Latin mini expansions, + // there should be Hangul mini expansions. + // XXX in any case, validate that modern jamo are self-contained. + jamo[c - 0x1100] = ce32; + + } + usrc_writeArray(f, "ce32s = [\n ", jamo, 32, 0x1200-0x1100, " ", "\n]\n"); + fclose(f); +} + +static UBool +convertTrie(const void *context, UChar32 start, UChar32 end, uint32_t value) { + if (start >= 0x1100 && start < 0x1200 && end >= 0x1100 && end < 0x1200) { + // Range entirely in conjoining jamo block. 
+ return true; + } + icu::IcuToolErrorCode status("genrb: convertTrie"); + umutablecptrie_setRange((UMutableCPTrie*)context, start, end, value, status); + return !U_FAILURE(*status); +} + +static void +writeCollationDataTOML(const char* outputdir, const char* name, const char* collationType, const icu::CollationData* data, UBool root, UChar32 diacriticLimit, UErrorCode *status) { + FILE* f = openTOML(outputdir, name, collationType, "data", status); + if (!f) { + return; + } + // printf("writeCollationDataTOML %s %s\n", name, collationType); + + icu::UnicodeSet tailoringSet; + + if (data->base) { + tailoringSet.addAll(*(data->unsafeBackwardSet)); + tailoringSet.removeAll(*(data->base->unsafeBackwardSet)); + } else { + tailoringSet.addAll(*(data->unsafeBackwardSet)); + } + + // Use the same value for out-of-range and default in the hope of not having to allocate + // different blocks, since ICU4X never does out-of-range queries. + uint32_t trieDefault = root ? icu::Collation::UNASSIGNED_CE32 : icu::Collation::FALLBACK_CE32; + icu::LocalUMutableCPTriePointer builder(umutablecptrie_open(trieDefault, trieDefault, status)); + + utrie2_enum(data->trie, nullptr, &convertTrie, builder.getAlias()); + + // If the diacritic table was cut short, copy CE32s between the lowered + // limit and the max limit from the root to the tailoring. As of June 2022, + // no collation in CLDR needs this. + for (UChar32 c = diacriticLimit; c < ICU4X_DIACRITIC_LIMIT; ++c) { + if (c == 0x0340 || c == 0x0341 || c == 0x0343 || c == 0x0344) { + // These never occur in NFD data. + continue; + } + uint32_t ce32 = data->getCE32(c); + if (ce32 == icu::Collation::FALLBACK_CE32) { + ce32 = data->base->getCE32(c); + umutablecptrie_set(builder.getAlias(), c, ce32, status); + } + } + + // Ensure that the range covered by the diacritic table isn't duplicated + // in the trie. 
+ for (UChar32 c = ICU4X_DIACRITIC_BASE; c < diacriticLimit; ++c) { + if (umutablecptrie_get(builder.getAlias(), c) != trieDefault) { + umutablecptrie_set(builder.getAlias(), c, trieDefault, status); + } + } + + icu::LocalUCPTriePointer utrie(umutablecptrie_buildImmutable( + builder.getAlias(), + UCPTRIE_TYPE_SMALL, + UCPTRIE_VALUE_BITS_32, + status)); + usrc_writeArray(f, "contexts = [\n ", data->contexts, 16, data->contextsLength, " ", "\n]\n"); + usrc_writeArray(f, "ce32s = [\n ", data->ce32s, 32, data->ce32sLength, " ", "\n]\n"); + usrc_writeArray(f, "ces = [\n ", data->ces, 64, data->cesLength, " ", "\n]\n"); + fprintf(f, "[trie]\n"); + usrc_writeUCPTrie(f, "trie", utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML); + + fclose(f); +} + +static void +writeCollationSpecialPrimariesTOML(const char* outputdir, const char* name, const char* collationType, const icu::CollationData* data, UErrorCode *status) { + FILE* f = openTOML(outputdir, name, collationType, "prim", status); + if (!f) { + return; + } + // printf("writeCollationSpecialPrimariesTOML %s %s\n", name, collationType); + + uint16_t lastPrimaries[4]; + for (int32_t i = 0; i < 4; ++i) { + // getLastPrimaryForGroup subtracts one from a 16-bit value, so we add one + // back to get a value that fits in 16 bits. 
+ lastPrimaries[i] = (uint16_t)((data->getLastPrimaryForGroup(UCOL_REORDER_CODE_FIRST + i) + 1) >> 16); + } + + uint32_t numericPrimary = data->numericPrimary; + if (numericPrimary & 0xFFFFFF) { + printf("Lower 24 bits set in numeric primary"); + *status = U_INTERNAL_PROGRAM_ERROR; + return; + } + + usrc_writeArray(f, "last_primaries = [\n ", lastPrimaries, 16, 4, " ", "\n]\n"); + fprintf(f, "numeric_primary = 0x%X\n", numericPrimary >> 24); + fclose(f); +} + +static void +writeCollationTOML(const char* outputdir, const char* name, const char* collationType, const icu::CollationData* data, const icu::CollationSettings* settings, UErrorCode *status) { + UBool tailored = false; + UBool tailoredDiacritics = false; + UBool lithuanianDotAbove = (uprv_strcmp(name, "lt") == 0); + UBool reordering = false; + UBool isRoot = uprv_strcmp(name, "root") == 0; + UChar32 diacriticLimit = ICU4X_DIACRITIC_LIMIT; + if (!data->base && isRoot) { + diacriticLimit = writeCollationDiacriticsTOML(outputdir, name, collationType, data, status); + if (U_FAILURE(*status)) { + return; + } + writeCollationJamoTOML(outputdir, name, collationType, data, status); + if (U_FAILURE(*status)) { + return; + } + writeCollationSpecialPrimariesTOML(outputdir, name, collationType, data, status); + if (U_FAILURE(*status)) { + return; + } + } else if (data->base && !lithuanianDotAbove) { + for (UChar32 c = ICU4X_DIACRITIC_BASE; c < ICU4X_DIACRITIC_LIMIT; ++c) { + if (c == 0x0340 || c == 0x0341 || c == 0x0343 || c == 0x0344) { + // These never occur in NFD data. + continue; + } + uint32_t ce32 = data->getCE32(c); + if ((ce32 != icu::Collation::FALLBACK_CE32) && (ce32 != data->base->getCE32(c))) { + tailoredDiacritics = true; + diacriticLimit = writeCollationDiacriticsTOML(outputdir, name, collationType, data, status); + if (U_FAILURE(*status)) { + return; + } + break; + } + } + } + + if (settings->hasReordering()) { + reordering = true; + // Note: There are duplicate reorderings. 
Expecting the ICU4X provider + // to take care of deduplication. + writeCollationReorderingTOML(outputdir, name, collationType, settings, status); + if (U_FAILURE(*status)) { + return; + } + } + + // Write collation data if either base is non-null or the name is root. + // Languages that only reorder scripts are otherwise root-like and have + // null base. + if (data->base || isRoot) { + tailored = !isRoot; + writeCollationDataTOML(outputdir, name, collationType, data, (!data->base && isRoot), diacriticLimit, status); + if (U_FAILURE(*status)) { + return; + } + } + + uint32_t maxVariable = (uint32_t)settings->getMaxVariable(); + if (maxVariable >= 4) { + printf("Max variable out of range"); + *status = U_INTERNAL_PROGRAM_ERROR; + return; + } + + uint32_t metadataBits = maxVariable; + if (tailored) { + metadataBits |= (1 << 3); + } + if (tailoredDiacritics) { + metadataBits |= (1 << 4); + } + if (reordering) { + metadataBits |= (1 << 5); + } + if (lithuanianDotAbove) { + metadataBits |= (1 << 6); + } + if ((settings->options & icu::CollationSettings::BACKWARD_SECONDARY) != 0) { + metadataBits |= (1 << 7); + } + if (settings->getAlternateHandling() == UCOL_SHIFTED) { + metadataBits |= (1 << 8); + } + switch (settings->getCaseFirst()) { + case UCOL_OFF: + break; + case UCOL_UPPER_FIRST: + metadataBits |= (1 << 9); + metadataBits |= (1 << 10); + break; + case UCOL_LOWER_FIRST: + metadataBits |= (1 << 9); + break; + default: + *status = U_INTERNAL_PROGRAM_ERROR; + return; + } + + writeCollationMetadataTOML(outputdir, name, collationType, metadataBits, status); +} + +#endif // !UCONFIG_NO_COLLATION + +static TableResource * +addCollation(ParseState* state, TableResource *result, const char *collationType, + uint32_t startline, UErrorCode *status) +{ + // TODO: Use LocalPointer for result, or make caller close it when there is a failure. 
+ struct SResource *member = nullptr; + struct UString *tokenValue; + struct UString comment; + enum ETokenType token; + char subtag[1024]; + UnicodeString rules; + UBool haveRules = false; + UVersionInfo version; + uint32_t line; + + /* '{' . (name resource)* '}' */ + version[0]=0; version[1]=0; version[2]=0; version[3]=0; + + for (;;) + { + ustr_init(&comment); + token = getToken(state, &tokenValue, &comment, &line, status); + + if (token == TOK_CLOSE_BRACE) + { + break; + } + + if (token != TOK_STRING) + { + res_close(result); + *status = U_INVALID_FORMAT_ERROR; + + if (token == TOK_EOF) + { + error(startline, "unterminated table"); + } + else + { + error(line, "Unexpected token %s", tokenNames[token]); + } + + return nullptr; + } + + u_UCharsToChars(tokenValue->fChars, subtag, u_strlen(tokenValue->fChars) + 1); + + if (U_FAILURE(*status)) + { + res_close(result); + return nullptr; + } + + member = parseResource(state, subtag, nullptr, status); + + if (U_FAILURE(*status)) + { + res_close(result); + return nullptr; + } + if (result == nullptr) + { + // Ignore the parsed resources, continue parsing. + } + else if (uprv_strcmp(subtag, "Version") == 0 && member->isString()) + { + StringResource *sr = static_cast<StringResource *>(member); + char ver[40]; + int32_t length = sr->length(); + + if (length >= UPRV_LENGTHOF(ver)) + { + length = UPRV_LENGTHOF(ver) - 1; + } + + sr->fString.extract(0, length, ver, UPRV_LENGTHOF(ver), US_INV); + u_versionFromString(version, ver); + + result->add(member, line, *status); + member = nullptr; + } + else if(uprv_strcmp(subtag, "%%CollationBin")==0) + { + /* discard duplicate %%CollationBin if any*/ + } + else if (uprv_strcmp(subtag, "Sequence") == 0 && member->isString()) + { + StringResource *sr = static_cast<StringResource *>(member); + rules = sr->fString; + haveRules = true; + // Defer building the collator until we have seen + // all sub-elements of the collation table, including the Version. 
+ /* in order to achieve smaller data files, we can direct genrb */ + /* to omit collation rules */ + if(!state->omitCollationRules) { + result->add(member, line, *status); + member = nullptr; + } + } + else // Just copy non-special items. + { + result->add(member, line, *status); + member = nullptr; + } + res_close(member); // TODO: use LocalPointer + if (U_FAILURE(*status)) + { + res_close(result); + return nullptr; + } + } + + if (!haveRules) { return result; } + +#if UCONFIG_NO_COLLATION || UCONFIG_NO_FILE_IO + warning(line, "Not building collation elements because of UCONFIG_NO_COLLATION and/or UCONFIG_NO_FILE_IO, see uconfig.h"); + (void)collationType; +#else + // CLDR ticket #3949, ICU ticket #8082: + // Do not build collation binary data for for-import-only "private" collation rule strings. + if (uprv_strncmp(collationType, "private-", 8) == 0) { + if(isVerbose()) { + printf("Not building %s~%s collation binary\n", state->filename, collationType); + } + return result; + } + + if(!state->makeBinaryCollation) { + if(isVerbose()) { + printf("Not building %s~%s collation binary\n", state->filename, collationType); + } + return result; + } + UErrorCode intStatus = U_ZERO_ERROR; + UParseError parseError; + uprv_memset(&parseError, 0, sizeof(parseError)); + GenrbImporter importer(state->inputdir, state->outputdir); + const icu::CollationTailoring *base = icu::CollationRoot::getRoot(intStatus); + if(U_FAILURE(intStatus)) { + error(line, "failed to load root collator (ucadata.icu) - %s", u_errorName(intStatus)); + res_close(result); + return nullptr; // TODO: use LocalUResourceBundlePointer for result + } + icu::CollationBuilder builder(base, state->icu4xMode, intStatus); + if(state->icu4xMode || (uprv_strncmp(collationType, "search", 6) == 0)) { + builder.disableFastLatin(); // build fast-Latin table unless search collator or ICU4X + } + LocalPointer<icu::CollationTailoring> t( + builder.parseAndBuild(rules, version, &importer, &parseError, intStatus)); + 
if(U_FAILURE(intStatus)) { + const char *reason = builder.getErrorReason(); + if(reason == nullptr) { reason = ""; } + error(line, "CollationBuilder failed at %s~%s/Sequence rule offset %ld: %s %s", + state->filename, collationType, + (long)parseError.offset, u_errorName(intStatus), reason); + if(parseError.preContext[0] != 0 || parseError.postContext[0] != 0) { + // Print pre- and post-context. + char preBuffer[100], postBuffer[100]; + escape(parseError.preContext, preBuffer, sizeof(preBuffer)); + escape(parseError.postContext, postBuffer, sizeof(postBuffer)); + error(line, " error context: \"...%s\" ! \"%s...\"", preBuffer, postBuffer); + } + if(isStrict() || t.isNull()) { + *status = intStatus; + res_close(result); + return nullptr; + } + } + if (state->icu4xMode) { + char *nameWithoutSuffix = static_cast<char *>(uprv_malloc(uprv_strlen(state->filename) + 1)); + if (nameWithoutSuffix == nullptr) { + *status = U_MEMORY_ALLOCATION_ERROR; + res_close(result); + return nullptr; + } + uprv_strcpy(nameWithoutSuffix, state->filename); + *uprv_strrchr(nameWithoutSuffix, '.') = 0; + + writeCollationTOML(state->outputdir, nameWithoutSuffix, collationType, t->data, t->settings, status); + uprv_free(nameWithoutSuffix); + } + icu::LocalMemory<uint8_t> buffer; + int32_t capacity = 100000; + uint8_t *dest = buffer.allocateInsteadAndCopy(capacity); + if(dest == nullptr) { + fprintf(stderr, "memory allocation (%ld bytes) for file contents failed\n", + (long)capacity); + *status = U_MEMORY_ALLOCATION_ERROR; + res_close(result); + return nullptr; + } + int32_t indexes[icu::CollationDataReader::IX_TOTAL_SIZE + 1]; + int32_t totalSize = icu::CollationDataWriter::writeTailoring( + *t, *t->settings, indexes, dest, capacity, intStatus); + if(intStatus == U_BUFFER_OVERFLOW_ERROR) { + intStatus = U_ZERO_ERROR; + capacity = totalSize; + dest = buffer.allocateInsteadAndCopy(capacity); + if(dest == nullptr) { + fprintf(stderr, "memory allocation (%ld bytes) for file contents failed\n", + 
(long)capacity); + *status = U_MEMORY_ALLOCATION_ERROR; + res_close(result); + return nullptr; + } + totalSize = icu::CollationDataWriter::writeTailoring( + *t, *t->settings, indexes, dest, capacity, intStatus); + } + if(U_FAILURE(intStatus)) { + fprintf(stderr, "CollationDataWriter::writeTailoring() failed: %s\n", + u_errorName(intStatus)); + res_close(result); + return nullptr; + } + if(isVerbose()) { + printf("%s~%s collation tailoring part sizes:\n", state->filename, collationType); + icu::CollationInfo::printSizes(totalSize, indexes); + if(t->settings->hasReordering()) { + printf("%s~%s collation reordering ranges:\n", state->filename, collationType); + icu::CollationInfo::printReorderRanges( + *t->data, t->settings->reorderCodes, t->settings->reorderCodesLength); + } +#if 0 // debugging output + } else { + printf("%s~%s collation tailoring part sizes:\n", state->filename, collationType); + icu::CollationInfo::printSizes(totalSize, indexes); +#endif + } + struct SResource *collationBin = bin_open(state->bundle, "%%CollationBin", totalSize, dest, nullptr, nullptr, status); + result->add(collationBin, line, *status); + if (U_FAILURE(*status)) { + res_close(result); + return nullptr; + } +#endif + return result; +} + +static UBool +keepCollationType(const char * /*type*/) { + return true; +} + +static struct SResource * +parseCollationElements(ParseState* state, char *tag, uint32_t startline, UBool newCollation, UErrorCode *status) +{ + TableResource *result = nullptr; + struct SResource *member = nullptr; + struct UString *tokenValue; + struct UString comment; + enum ETokenType token; + char subtag[1024], typeKeyword[1024]; + uint32_t line; + + result = table_open(state->bundle, tag, nullptr, status); + + if (result == nullptr || U_FAILURE(*status)) + { + return nullptr; + } + if(isVerbose()){ + printf(" collation elements %s at line %i \n", (tag == nullptr) ? 
"(null)" : tag, (int)startline); + } + if(!newCollation) { + return addCollation(state, result, "(no type)", startline, status); + } + else { + for(;;) { + ustr_init(&comment); + token = getToken(state, &tokenValue, &comment, &line, status); + + if (token == TOK_CLOSE_BRACE) + { + return result; + } + + if (token != TOK_STRING) + { + res_close(result); + *status = U_INVALID_FORMAT_ERROR; + + if (token == TOK_EOF) + { + error(startline, "unterminated table"); + } + else + { + error(line, "Unexpected token %s", tokenNames[token]); + } + + return nullptr; + } + + u_UCharsToChars(tokenValue->fChars, subtag, u_strlen(tokenValue->fChars) + 1); + + if (U_FAILURE(*status)) + { + res_close(result); + return nullptr; + } + + if (uprv_strcmp(subtag, "default") == 0) + { + member = parseResource(state, subtag, nullptr, status); + + if (U_FAILURE(*status)) + { + res_close(result); + return nullptr; + } + + result->add(member, line, *status); + } + else + { + token = peekToken(state, 0, &tokenValue, &line, &comment, status); + /* this probably needs to be refactored or recursively use the parser */ + /* first we assume that our collation table won't have the explicit type */ + /* then, we cannot handle aliases */ + if(token == TOK_OPEN_BRACE) { + token = getToken(state, &tokenValue, &comment, &line, status); + TableResource *collationRes; + if (keepCollationType(subtag)) { + collationRes = table_open(state->bundle, subtag, nullptr, status); + } else { + collationRes = nullptr; + } + // need to parse the collation data regardless + collationRes = addCollation(state, collationRes, subtag, startline, status); + if (collationRes != nullptr) { + result->add(collationRes, startline, *status); + } + } else if(token == TOK_COLON) { /* right now, we'll just try to see if we have aliases */ + /* we could have a table too */ + token = peekToken(state, 1, &tokenValue, &line, &comment, status); + u_UCharsToChars(tokenValue->fChars, typeKeyword, u_strlen(tokenValue->fChars) + 1); + 
if(uprv_strcmp(typeKeyword, "alias") == 0) { + member = parseResource(state, subtag, nullptr, status); + if (U_FAILURE(*status)) + { + res_close(result); + return nullptr; + } + + result->add(member, line, *status); + } else { + res_close(result); + *status = U_INVALID_FORMAT_ERROR; + return nullptr; + } + } else { + res_close(result); + *status = U_INVALID_FORMAT_ERROR; + return nullptr; + } + } + + /*member = string_open(bundle, subtag, tokenValue->fChars, tokenValue->fLength, status);*/ + + /*expect(TOK_CLOSE_BRACE, nullptr, nullptr, status);*/ + + if (U_FAILURE(*status)) + { + res_close(result); + return nullptr; + } + } + } +} + +/* Necessary, because CollationElements requires the bundle->fRoot member to be present which, + if this weren't special-cased, wouldn't be set until the entire file had been processed. */ +static struct SResource * +realParseTable(ParseState* state, TableResource *table, char *tag, uint32_t startline, UErrorCode *status) +{ + struct SResource *member = nullptr; + struct UString *tokenValue=nullptr; + struct UString comment; + enum ETokenType token; + char subtag[1024]; + uint32_t line; + UBool readToken = false; + + /* '{' . (name resource)* '}' */ + + if(isVerbose()){ + printf(" parsing table %s at line %i \n", (tag == nullptr) ? 
"(null)" : tag, (int)startline); + } + for (;;) + { + ustr_init(&comment); + token = getToken(state, &tokenValue, &comment, &line, status); + + if (token == TOK_CLOSE_BRACE) + { + if (!readToken && isVerbose()) { + warning(startline, "Encountered empty table"); + } + return table; + } + + if (token != TOK_STRING) + { + *status = U_INVALID_FORMAT_ERROR; + + if (token == TOK_EOF) + { + error(startline, "unterminated table"); + } + else + { + error(line, "unexpected token %s", tokenNames[token]); + } + + return nullptr; + } + + if(uprv_isInvariantUString(tokenValue->fChars, -1)) { + u_UCharsToChars(tokenValue->fChars, subtag, u_strlen(tokenValue->fChars) + 1); + } else { + *status = U_INVALID_FORMAT_ERROR; + error(line, "invariant characters required for table keys"); + return nullptr; + } + + if (U_FAILURE(*status)) + { + error(line, "parse error. Stopped parsing tokens with %s", u_errorName(*status)); + return nullptr; + } + + member = parseResource(state, subtag, &comment, status); + + if (member == nullptr || U_FAILURE(*status)) + { + error(line, "parse error. Stopped parsing resource with %s", u_errorName(*status)); + return nullptr; + } + + table->add(member, line, *status); + + if (U_FAILURE(*status)) + { + error(line, "parse error. Stopped parsing table with %s", u_errorName(*status)); + return nullptr; + } + readToken = true; + ustr_deinit(&comment); + } + + /* not reached */ + /* A compiler warning will appear if all paths don't contain a return statement. 
*/ +/* *status = U_INTERNAL_PROGRAM_ERROR; + return nullptr;*/ +} + +static struct SResource * +parseTable(ParseState* state, char *tag, uint32_t startline, const struct UString *comment, UErrorCode *status) +{ + if (tag != nullptr && uprv_strcmp(tag, "CollationElements") == 0) + { + return parseCollationElements(state, tag, startline, false, status); + } + if (tag != nullptr && uprv_strcmp(tag, "collations") == 0) + { + return parseCollationElements(state, tag, startline, true, status); + } + if(isVerbose()){ + printf(" table %s at line %i \n", (tag == nullptr) ? "(null)" : tag, (int)startline); + } + + TableResource *result = table_open(state->bundle, tag, comment, status); + + if (result == nullptr || U_FAILURE(*status)) + { + return nullptr; + } + return realParseTable(state, result, tag, startline, status); +} + +static struct SResource * +parseArray(ParseState* state, char *tag, uint32_t startline, const struct UString *comment, UErrorCode *status) +{ + struct SResource *member = nullptr; + struct UString *tokenValue; + struct UString memberComments; + enum ETokenType token; + UBool readToken = false; + + ArrayResource *result = array_open(state->bundle, tag, comment, status); + + if (result == nullptr || U_FAILURE(*status)) + { + return nullptr; + } + if(isVerbose()){ + printf(" array %s at line %i \n", (tag == nullptr) ? "(null)" : tag, (int)startline); + } + + ustr_init(&memberComments); + + /* '{' . 
resource [','] '}' */ + for (;;) + { + /* reset length */ + ustr_setlen(&memberComments, 0, status); + + /* check for end of array, but don't consume next token unless it really is the end */ + token = peekToken(state, 0, &tokenValue, nullptr, &memberComments, status); + + + if (token == TOK_CLOSE_BRACE) + { + getToken(state, nullptr, nullptr, nullptr, status); + if (!readToken) { + warning(startline, "Encountered empty array"); + } + break; + } + + if (token == TOK_EOF) + { + res_close(result); + *status = U_INVALID_FORMAT_ERROR; + error(startline, "unterminated array"); + return nullptr; + } + + /* string arrays are a special case */ + if (token == TOK_STRING) + { + getToken(state, &tokenValue, &memberComments, nullptr, status); + member = string_open(state->bundle, nullptr, tokenValue->fChars, tokenValue->fLength, &memberComments, status); + } + else + { + member = parseResource(state, nullptr, &memberComments, status); + } + + if (member == nullptr || U_FAILURE(*status)) + { + res_close(result); + return nullptr; + } + + result->add(member); + + /* eat optional comma if present */ + token = peekToken(state, 0, nullptr, nullptr, nullptr, status); + + if (token == TOK_COMMA) + { + getToken(state, nullptr, nullptr, nullptr, status); + } + + if (U_FAILURE(*status)) + { + res_close(result); + return nullptr; + } + readToken = true; + } + + ustr_deinit(&memberComments); + return result; +} + +static struct SResource * +parseIntVector(ParseState* state, char *tag, uint32_t startline, const struct UString *comment, UErrorCode *status) +{ + enum ETokenType token; + char *string; + int32_t value; + UBool readToken = false; + char *stopstring; + struct UString memberComments; + + IntVectorResource *result = intvector_open(state->bundle, tag, comment, status); + + if (result == nullptr || U_FAILURE(*status)) + { + return nullptr; + } + + if(isVerbose()){ + printf(" vector %s at line %i \n", (tag == nullptr) ? 
"(null)" : tag, (int)startline); + } + ustr_init(&memberComments); + /* '{' . string [','] '}' */ + for (;;) + { + ustr_setlen(&memberComments, 0, status); + + /* check for end of array, but don't consume next token unless it really is the end */ + token = peekToken(state, 0, nullptr, nullptr,&memberComments, status); + + if (token == TOK_CLOSE_BRACE) + { + /* it's the end, consume the close brace */ + getToken(state, nullptr, nullptr, nullptr, status); + if (!readToken) { + warning(startline, "Encountered empty int vector"); + } + ustr_deinit(&memberComments); + return result; + } + + int32_t stringLength; + string = getInvariantString(state, nullptr, nullptr, stringLength, status); + + if (U_FAILURE(*status)) + { + res_close(result); + return nullptr; + } + + /* For handling illegal char in the Intvector */ + value = uprv_strtoul(string, &stopstring, 0);/* make intvector support decimal,hexdigit,octal digit ranging from -2^31-2^32-1*/ + int32_t len = (int32_t)(stopstring-string); + + if(len==stringLength) + { + result->add(value, *status); + uprv_free(string); + token = peekToken(state, 0, nullptr, nullptr, nullptr, status); + } + else + { + uprv_free(string); + *status=U_INVALID_CHAR_FOUND; + } + + if (U_FAILURE(*status)) + { + res_close(result); + return nullptr; + } + + /* the comma is optional (even though it is required to prevent the reader from concatenating + consecutive entries) so that a missing comma on the last entry isn't an error */ + if (token == TOK_COMMA) + { + getToken(state, nullptr, nullptr, nullptr, status); + } + readToken = true; + } + + /* not reached */ + /* A compiler warning will appear if all paths don't contain a return statement. 
*/ +/* intvector_close(result, status); + *status = U_INTERNAL_PROGRAM_ERROR; + return nullptr;*/ +} + +static struct SResource * +parseBinary(ParseState* state, char *tag, uint32_t startline, const struct UString *comment, UErrorCode *status) +{ + uint32_t line; + int32_t stringLength; + LocalMemory<char> string(getInvariantString(state, &line, nullptr, stringLength, status)); + if (string.isNull() || U_FAILURE(*status)) + { + return nullptr; + } + + expect(state, TOK_CLOSE_BRACE, nullptr, nullptr, nullptr, status); + if (U_FAILURE(*status)) + { + return nullptr; + } + + if(isVerbose()){ + printf(" binary %s at line %i \n", (tag == nullptr) ? "(null)" : tag, (int)startline); + } + + LocalMemory<uint8_t> value; + int32_t count = 0; + if (stringLength > 0 && value.allocateInsteadAndCopy(stringLength) == nullptr) + { + *status = U_MEMORY_ALLOCATION_ERROR; + return nullptr; + } + + char toConv[3] = {'\0', '\0', '\0'}; + for (int32_t i = 0; i < stringLength;) + { + // Skip spaces (which may have been line endings). 
+ char c0 = string[i++]; + if (c0 == ' ') { continue; } + if (i == stringLength) { + *status=U_INVALID_CHAR_FOUND; + error(line, "Encountered invalid binary value (odd number of hex digits)"); + return nullptr; + } + toConv[0] = c0; + toConv[1] = string[i++]; + + char *stopstring; + value[count++] = (uint8_t) uprv_strtoul(toConv, &stopstring, 16); + uint32_t len=(uint32_t)(stopstring-toConv); + + if(len!=2) + { + *status=U_INVALID_CHAR_FOUND; + error(line, "Encountered invalid binary value (not all pairs of hex digits)"); + return nullptr; + } + } + + if (count == 0) { + warning(startline, "Encountered empty binary value"); + return bin_open(state->bundle, tag, 0, nullptr, "", comment, status); + } else { + return bin_open(state->bundle, tag, count, value.getAlias(), nullptr, comment, status); + } +} + +static struct SResource * +parseInteger(ParseState* state, char *tag, uint32_t startline, const struct UString *comment, UErrorCode *status) +{ + struct SResource *result = nullptr; + int32_t value; + char *string; + char *stopstring; + + int32_t stringLength; + string = getInvariantString(state, nullptr, nullptr, stringLength, status); + + if (string == nullptr || U_FAILURE(*status)) + { + return nullptr; + } + + expect(state, TOK_CLOSE_BRACE, nullptr, nullptr, nullptr, status); + + if (U_FAILURE(*status)) + { + uprv_free(string); + return nullptr; + } + + if(isVerbose()){ + printf(" integer %s at line %i \n", (tag == nullptr) ? "(null)" : tag, (int)startline); + } + + if (stringLength == 0) + { + warning(startline, "Encountered empty integer. 
Default value is 0."); + } + + /* Allow integer support for hexdecimal, octal digit and decimal*/ + /* and handle illegal char in the integer*/ + value = uprv_strtoul(string, &stopstring, 0); + int32_t len = (int32_t)(stopstring-string); + if(len==stringLength) + { + result = int_open(state->bundle, tag, value, comment, status); + } + else + { + *status=U_INVALID_CHAR_FOUND; + } + uprv_free(string); + + return result; +} + +static struct SResource * +parseImport(ParseState* state, char *tag, uint32_t startline, const struct UString* comment, UErrorCode *status) +{ + uint32_t line; + int32_t stringLength; + LocalMemory<char> filename(getInvariantString(state, &line, nullptr, stringLength, status)); + if (U_FAILURE(*status)) + { + return nullptr; + } + + expect(state, TOK_CLOSE_BRACE, nullptr, nullptr, nullptr, status); + + if (U_FAILURE(*status)) + { + return nullptr; + } + + if(isVerbose()){ + printf(" import %s at line %i \n", (tag == nullptr) ? "(null)" : tag, (int)startline); + } + + /* Open the input file for reading */ + CharString fullname; + if (state->inputdir != nullptr) { + fullname.append(state->inputdir, *status); + } + fullname.appendPathPart(filename.getAlias(), *status); + if (U_FAILURE(*status)) { + return nullptr; + } + + FileStream *file = T_FileStream_open(fullname.data(), "rb"); + if (file == nullptr) + { + error(line, "couldn't open input file %s", filename.getAlias()); + *status = U_FILE_ACCESS_ERROR; + return nullptr; + } + + int32_t len = T_FileStream_size(file); + LocalMemory<uint8_t> data; + if(data.allocateInsteadAndCopy(len) == nullptr) + { + *status = U_MEMORY_ALLOCATION_ERROR; + T_FileStream_close (file); + return nullptr; + } + + /* int32_t numRead = */ T_FileStream_read(file, data.getAlias(), len); + T_FileStream_close (file); + + return bin_open(state->bundle, tag, len, data.getAlias(), fullname.data(), comment, status); +} + +static struct SResource * +parseInclude(ParseState* state, char *tag, uint32_t startline, const struct 
UString* comment, UErrorCode *status) +{ + struct SResource *result; + int32_t len=0; + char *filename; + uint32_t line; + char16_t *pTarget = nullptr; + + UCHARBUF *ucbuf; + char *fullname = nullptr; + const char* cp = nullptr; + const char16_t* uBuffer = nullptr; + + int32_t stringLength; + filename = getInvariantString(state, &line, nullptr, stringLength, status); + + if (U_FAILURE(*status)) + { + return nullptr; + } + + expect(state, TOK_CLOSE_BRACE, nullptr, nullptr, nullptr, status); + + if (U_FAILURE(*status)) + { + uprv_free(filename); + return nullptr; + } + + if(isVerbose()){ + printf(" include %s at line %i \n", (tag == nullptr) ? "(null)" : tag, (int)startline); + } + + fullname = (char *) uprv_malloc(state->inputdirLength + stringLength + 2); + /* test for nullptr */ + if(fullname == nullptr) + { + *status = U_MEMORY_ALLOCATION_ERROR; + uprv_free(filename); + return nullptr; + } + + if(state->inputdir!=nullptr){ + if (state->inputdir[state->inputdirLength - 1] != U_FILE_SEP_CHAR) + { + + uprv_strcpy(fullname, state->inputdir); + + fullname[state->inputdirLength] = U_FILE_SEP_CHAR; + fullname[state->inputdirLength + 1] = '\0'; + + uprv_strcat(fullname, filename); + } + else + { + uprv_strcpy(fullname, state->inputdir); + uprv_strcat(fullname, filename); + } + }else{ + uprv_strcpy(fullname,filename); + } + + ucbuf = ucbuf_open(fullname, &cp,getShowWarning(),false,status); + + if (U_FAILURE(*status)) { + error(line, "couldn't open input file %s\n", filename); + return nullptr; + } + + uBuffer = ucbuf_getBuffer(ucbuf,&len,status); + result = string_open(state->bundle, tag, uBuffer, len, comment, status); + + ucbuf_close(ucbuf); + + uprv_free(pTarget); + + uprv_free(filename); + uprv_free(fullname); + + return result; +} + + + + + +U_STRING_DECL(k_type_string, "string", 6); +U_STRING_DECL(k_type_binary, "binary", 6); +U_STRING_DECL(k_type_bin, "bin", 3); +U_STRING_DECL(k_type_table, "table", 5); +U_STRING_DECL(k_type_table_no_fallback, "table(nofallback)", 
17); +U_STRING_DECL(k_type_int, "int", 3); +U_STRING_DECL(k_type_integer, "integer", 7); +U_STRING_DECL(k_type_array, "array", 5); +U_STRING_DECL(k_type_alias, "alias", 5); +U_STRING_DECL(k_type_intvector, "intvector", 9); +U_STRING_DECL(k_type_import, "import", 6); +U_STRING_DECL(k_type_include, "include", 7); + +/* Various non-standard processing plugins that create one or more special resources. */ +U_STRING_DECL(k_type_plugin_uca_rules, "process(uca_rules)", 18); +U_STRING_DECL(k_type_plugin_collation, "process(collation)", 18); +U_STRING_DECL(k_type_plugin_transliterator, "process(transliterator)", 23); +U_STRING_DECL(k_type_plugin_dependency, "process(dependency)", 19); + +typedef enum EResourceType +{ + RESTYPE_UNKNOWN, + RESTYPE_STRING, + RESTYPE_BINARY, + RESTYPE_TABLE, + RESTYPE_TABLE_NO_FALLBACK, + RESTYPE_INTEGER, + RESTYPE_ARRAY, + RESTYPE_ALIAS, + RESTYPE_INTVECTOR, + RESTYPE_IMPORT, + RESTYPE_INCLUDE, + RESTYPE_PROCESS_UCA_RULES, + RESTYPE_PROCESS_COLLATION, + RESTYPE_PROCESS_TRANSLITERATOR, + RESTYPE_PROCESS_DEPENDENCY, + RESTYPE_RESERVED +} EResourceType; + +static struct { + const char *nameChars; /* only used for debugging */ + const char16_t *nameUChars; + ParseResourceFunction *parseFunction; +} gResourceTypes[] = { + {"Unknown", nullptr, nullptr}, + {"string", k_type_string, parseString}, + {"binary", k_type_binary, parseBinary}, + {"table", k_type_table, parseTable}, + {"table(nofallback)", k_type_table_no_fallback, nullptr}, /* parseFunction will never be called */ + {"integer", k_type_integer, parseInteger}, + {"array", k_type_array, parseArray}, + {"alias", k_type_alias, parseAlias}, + {"intvector", k_type_intvector, parseIntVector}, + {"import", k_type_import, parseImport}, + {"include", k_type_include, parseInclude}, + {"process(uca_rules)", k_type_plugin_uca_rules, parseUCARules}, + {"process(collation)", k_type_plugin_collation, nullptr /* not implemented yet */}, + {"process(transliterator)", k_type_plugin_transliterator, 
parseTransliterator}, + {"process(dependency)", k_type_plugin_dependency, parseDependency}, + {"reserved", nullptr, nullptr} +}; + +void initParser() +{ + U_STRING_INIT(k_type_string, "string", 6); + U_STRING_INIT(k_type_binary, "binary", 6); + U_STRING_INIT(k_type_bin, "bin", 3); + U_STRING_INIT(k_type_table, "table", 5); + U_STRING_INIT(k_type_table_no_fallback, "table(nofallback)", 17); + U_STRING_INIT(k_type_int, "int", 3); + U_STRING_INIT(k_type_integer, "integer", 7); + U_STRING_INIT(k_type_array, "array", 5); + U_STRING_INIT(k_type_alias, "alias", 5); + U_STRING_INIT(k_type_intvector, "intvector", 9); + U_STRING_INIT(k_type_import, "import", 6); + U_STRING_INIT(k_type_include, "include", 7); + + U_STRING_INIT(k_type_plugin_uca_rules, "process(uca_rules)", 18); + U_STRING_INIT(k_type_plugin_collation, "process(collation)", 18); + U_STRING_INIT(k_type_plugin_transliterator, "process(transliterator)", 23); + U_STRING_INIT(k_type_plugin_dependency, "process(dependency)", 19); +} + +static inline UBool isTable(enum EResourceType type) { + return (UBool)(type==RESTYPE_TABLE || type==RESTYPE_TABLE_NO_FALLBACK); +} + +static enum EResourceType +parseResourceType(ParseState* state, UErrorCode *status) +{ + struct UString *tokenValue; + struct UString comment; + enum EResourceType result = RESTYPE_UNKNOWN; + uint32_t line=0; + ustr_init(&comment); + expect(state, TOK_STRING, &tokenValue, &comment, &line, status); + + if (U_FAILURE(*status)) + { + return RESTYPE_UNKNOWN; + } + + *status = U_ZERO_ERROR; + + /* Search for normal types */ + result=RESTYPE_UNKNOWN; + while ((result=(EResourceType)(result+1)) < RESTYPE_RESERVED) { + if (u_strcmp(tokenValue->fChars, gResourceTypes[result].nameUChars) == 0) { + break; + } + } + /* Now search for the aliases */ + if (u_strcmp(tokenValue->fChars, k_type_int) == 0) { + result = RESTYPE_INTEGER; + } + else if (u_strcmp(tokenValue->fChars, k_type_bin) == 0) { + result = RESTYPE_BINARY; + } + else if (result == RESTYPE_RESERVED) { 
+ char tokenBuffer[1024]; + u_austrncpy(tokenBuffer, tokenValue->fChars, sizeof(tokenBuffer)); + tokenBuffer[sizeof(tokenBuffer) - 1] = 0; + *status = U_INVALID_FORMAT_ERROR; + error(line, "unknown resource type '%s'", tokenBuffer); + } + + return result; +} + +/* parse a non-top-level resource */ +static struct SResource * +parseResource(ParseState* state, char *tag, const struct UString *comment, UErrorCode *status) +{ + enum ETokenType token; + enum EResourceType resType = RESTYPE_UNKNOWN; + ParseResourceFunction *parseFunction = nullptr; + struct UString *tokenValue; + uint32_t startline; + uint32_t line; + + + token = getToken(state, &tokenValue, nullptr, &startline, status); + + if(isVerbose()){ + printf(" resource %s at line %i \n", (tag == nullptr) ? "(null)" : tag, (int)startline); + } + + /* name . [ ':' type ] '{' resource '}' */ + /* This function parses from the colon onwards. If the colon is present, parse the + type then try to parse a resource of that type. If there is no explicit type, + work it out using the lookahead tokens. */ + switch (token) + { + case TOK_EOF: + *status = U_INVALID_FORMAT_ERROR; + error(startline, "Unexpected EOF encountered"); + return nullptr; + + case TOK_ERROR: + *status = U_INVALID_FORMAT_ERROR; + return nullptr; + + case TOK_COLON: + resType = parseResourceType(state, status); + expect(state, TOK_OPEN_BRACE, &tokenValue, nullptr, &startline, status); + + if (U_FAILURE(*status)) + { + return nullptr; + } + + break; + + case TOK_OPEN_BRACE: + break; + + default: + *status = U_INVALID_FORMAT_ERROR; + error(startline, "syntax error while reading a resource, expected '{' or ':'"); + return nullptr; + } + + + if (resType == RESTYPE_UNKNOWN) + { + /* No explicit type, so try to work it out. At this point, we've read the first '{'. 
+ We could have any of the following: + { { => array (nested) + { :/} => array + { string , => string array + + { string { => table + + { string :/{ => table + { string } => string + */ + + token = peekToken(state, 0, nullptr, &line, nullptr,status); + + if (U_FAILURE(*status)) + { + return nullptr; + } + + if (token == TOK_OPEN_BRACE || token == TOK_COLON ||token ==TOK_CLOSE_BRACE ) + { + resType = RESTYPE_ARRAY; + } + else if (token == TOK_STRING) + { + token = peekToken(state, 1, nullptr, &line, nullptr, status); + + if (U_FAILURE(*status)) + { + return nullptr; + } + + switch (token) + { + case TOK_COMMA: resType = RESTYPE_ARRAY; break; + case TOK_OPEN_BRACE: resType = RESTYPE_TABLE; break; + case TOK_CLOSE_BRACE: resType = RESTYPE_STRING; break; + case TOK_COLON: resType = RESTYPE_TABLE; break; + default: + *status = U_INVALID_FORMAT_ERROR; + error(line, "Unexpected token after string, expected ',', '{' or '}'"); + return nullptr; + } + } + else + { + *status = U_INVALID_FORMAT_ERROR; + error(line, "Unexpected token after '{'"); + return nullptr; + } + + /* printf("Type guessed as %s\n", resourceNames[resType]); */ + } else if(resType == RESTYPE_TABLE_NO_FALLBACK) { + *status = U_INVALID_FORMAT_ERROR; + error(startline, "error: %s resource type not valid except on top bundle level", gResourceTypes[resType].nameChars); + return nullptr; + } + + + /* We should now know what we need to parse next, so call the appropriate parser + function and return. 
*/ + parseFunction = gResourceTypes[resType].parseFunction; + if (parseFunction != nullptr) { + return parseFunction(state, tag, startline, comment, status); + } + else { + *status = U_INTERNAL_PROGRAM_ERROR; + error(startline, "internal error: %s resource type found and not handled", gResourceTypes[resType].nameChars); + } + + return nullptr; +} + +/* parse the top-level resource */ +struct SRBRoot * +parse(UCHARBUF *buf, const char *inputDir, const char *outputDir, const char *filename, + UBool makeBinaryCollation, UBool omitCollationRules, UBool icu4xMode, UErrorCode *status) +{ + struct UString *tokenValue; + struct UString comment; + uint32_t line; + enum EResourceType bundleType; + enum ETokenType token; + ParseState state; + uint32_t i; + + + for (i = 0; i < MAX_LOOKAHEAD + 1; i++) + { + ustr_init(&state.lookahead[i].value); + ustr_init(&state.lookahead[i].comment); + } + + initLookahead(&state, buf, status); + + state.inputdir = inputDir; + state.inputdirLength = (state.inputdir != nullptr) ? (uint32_t)uprv_strlen(state.inputdir) : 0; + state.outputdir = outputDir; + state.outputdirLength = (state.outputdir != nullptr) ? 
(uint32_t)uprv_strlen(state.outputdir) : 0; + state.filename = filename; + state.makeBinaryCollation = makeBinaryCollation; + state.omitCollationRules = omitCollationRules; + state.icu4xMode = icu4xMode; + + ustr_init(&comment); + expect(&state, TOK_STRING, &tokenValue, &comment, nullptr, status); + + state.bundle = new SRBRoot(&comment, false, *status); + + if (state.bundle == nullptr || U_FAILURE(*status)) + { + delete state.bundle; + + return nullptr; + } + + + state.bundle->setLocale(tokenValue->fChars, *status); + + /* The following code is to make Empty bundle work no matter with :table specifer or not */ + token = getToken(&state, nullptr, nullptr, &line, status); + if(token==TOK_COLON) { + *status=U_ZERO_ERROR; + bundleType=parseResourceType(&state, status); + + if(isTable(bundleType)) + { + expect(&state, TOK_OPEN_BRACE, nullptr, nullptr, &line, status); + } + else + { + *status=U_PARSE_ERROR; + error(line, "parse error. Stopped parsing with %s", u_errorName(*status)); + } + } + else + { + /* not a colon */ + if(token==TOK_OPEN_BRACE) + { + *status=U_ZERO_ERROR; + bundleType=RESTYPE_TABLE; + } + else + { + /* neither colon nor open brace */ + *status=U_PARSE_ERROR; + bundleType=RESTYPE_UNKNOWN; + error(line, "parse error, did not find open-brace '{' or colon ':', stopped with %s", u_errorName(*status)); + } + } + + if (U_FAILURE(*status)) + { + delete state.bundle; + return nullptr; + } + + if(bundleType==RESTYPE_TABLE_NO_FALLBACK) { + /* + * Parse a top-level table with the table(nofallback) declaration. + * This is the same as a regular table, but also sets the + * URES_ATT_NO_FALLBACK flag in indexes[URES_INDEX_ATTRIBUTES] . 
+ */ + state.bundle->fNoFallback=true; + } + /* top-level tables need not handle special table names like "collations" */ + assert(!state.bundle->fIsPoolBundle); + assert(state.bundle->fRoot->fType == URES_TABLE); + TableResource *rootTable = static_cast<TableResource *>(state.bundle->fRoot); + realParseTable(&state, rootTable, nullptr, line, status); + if(dependencyArray!=nullptr){ + rootTable->add(dependencyArray, 0, *status); + dependencyArray = nullptr; + } + if (U_FAILURE(*status)) + { + delete state.bundle; + res_close(dependencyArray); + return nullptr; + } + + if (getToken(&state, nullptr, nullptr, &line, status) != TOK_EOF) + { + warning(line, "extraneous text after resource bundle (perhaps unmatched braces)"); + if(isStrict()){ + *status = U_INVALID_FORMAT_ERROR; + return nullptr; + } + } + + cleanupLookahead(&state); + ustr_deinit(&comment); + return state.bundle; +} diff --git a/intl/icu/source/tools/genrb/parse.h b/intl/icu/source/tools/genrb/parse.h new file mode 100644 index 0000000000..fa90ede9d2 --- /dev/null +++ b/intl/icu/source/tools/genrb/parse.h @@ -0,0 +1,38 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 1998-2014, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* +* File parse.h +* +* Modification History: +* +* Date Name Description +* 05/26/99 stephen Creation. 
+******************************************************************************* +*/ + +#ifndef PARSE_H +#define PARSE_H 1 + +#include "unicode/utypes.h" +#include "filestrm.h" +#include "ucbuf.h" + +U_CDECL_BEGIN +/* One time parser initialization */ +void initParser(); + +/* Parse a ResourceBundle text file */ +struct SRBRoot* parse(UCHARBUF *buf, const char* inputDir, const char* outputDir, + const char *filename, + UBool makeBinaryCollation, UBool omitCollationRules, UBool icu4xMode, UErrorCode *status); + +U_CDECL_END + +#endif diff --git a/intl/icu/source/tools/genrb/prscmnts.cpp b/intl/icu/source/tools/genrb/prscmnts.cpp new file mode 100644 index 0000000000..ea55352b41 --- /dev/null +++ b/intl/icu/source/tools/genrb/prscmnts.cpp @@ -0,0 +1,248 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* + ******************************************************************************* + * Copyright (C) 2003-2014, International Business Machines + * Corporation and others. All Rights Reserved. + ******************************************************************************* + * + * File prscmnts.cpp + * + * Modification History: + * + * Date Name Description + * 08/22/2003 ram Creation. + ******************************************************************************* + */ + +// Safer use of UnicodeString. +#ifndef UNISTR_FROM_CHAR_EXPLICIT +# define UNISTR_FROM_CHAR_EXPLICIT explicit +#endif + +// Less important, but still a good idea. 
+#ifndef UNISTR_FROM_STRING_EXPLICIT +# define UNISTR_FROM_STRING_EXPLICIT explicit +#endif + +#include "unicode/regex.h" +#include "unicode/unistr.h" +#include "unicode/parseerr.h" +#include "prscmnts.h" +#include <stdio.h> +#include <stdlib.h> + +U_NAMESPACE_USE + +#if UCONFIG_NO_REGULAR_EXPRESSIONS==0 /* donot compile when RegularExpressions not available */ + +#define MAX_SPLIT_STRINGS 20 + +const char *patternStrings[UPC_LIMIT]={ + "^translate\\s*(.*)", + "^note\\s*(.*)" +}; + +U_CFUNC int32_t +removeText(char16_t *source, int32_t srcLen, + UnicodeString patString,uint32_t options, + UnicodeString replaceText, UErrorCode *status){ + + if(status == nullptr || U_FAILURE(*status)){ + return 0; + } + + UnicodeString src(source, srcLen); + + RegexMatcher myMatcher(patString, src, options, *status); + if(U_FAILURE(*status)){ + return 0; + } + UnicodeString dest; + + + dest = myMatcher.replaceAll(replaceText,*status); + + + return dest.extract(source, srcLen, *status); + +} +U_CFUNC int32_t +trim(char16_t *src, int32_t srcLen, UErrorCode *status){ + srcLen = removeText(src, srcLen, UnicodeString("^[ \\r\\n]+ "), 0, UnicodeString(), status); // remove leading new lines + srcLen = removeText(src, srcLen, UnicodeString("^\\s+"), 0, UnicodeString(), status); // remove leading spaces + srcLen = removeText(src, srcLen, UnicodeString("\\s+$"), 0, UnicodeString(), status); // remove trailing spcaes + return srcLen; +} + +U_CFUNC int32_t +removeCmtText(char16_t* source, int32_t srcLen, UErrorCode* status){ + srcLen = trim(source, srcLen, status); + UnicodeString patString("^\\s*?\\*\\s*?"); // remove pattern like " * " at the beginning of the line + srcLen = removeText(source, srcLen, patString, UREGEX_MULTILINE, UnicodeString(), status); + return removeText(source, srcLen, UnicodeString("[ \\r\\n]+"), 0, UnicodeString(" "), status);// remove new lines; +} + +U_CFUNC int32_t +getText(const char16_t* source, int32_t srcLen, + char16_t** dest, int32_t destCapacity, + 
UnicodeString patternString, + UErrorCode* status){ + + if(status == nullptr || U_FAILURE(*status)){ + return 0; + } + + UnicodeString stringArray[MAX_SPLIT_STRINGS]; + RegexPattern *pattern = RegexPattern::compile(UnicodeString("@"), 0, *status); + UnicodeString src (source,srcLen); + + if (U_FAILURE(*status)) { + return 0; + } + pattern->split(src, stringArray, MAX_SPLIT_STRINGS, *status); + + RegexMatcher matcher(patternString, UREGEX_DOTALL, *status); + if (U_FAILURE(*status)) { + return 0; + } + for(int32_t i=0; i<MAX_SPLIT_STRINGS; i++){ + matcher.reset(stringArray[i]); + if(matcher.lookingAt(*status)){ + UnicodeString out = matcher.group(1, *status); + + return out.extract(*dest, destCapacity,*status); + } + } + return 0; +} + + +#define AT_SIGN 0x0040 + +U_CFUNC int32_t +getDescription( const char16_t* source, int32_t srcLen, + char16_t** dest, int32_t destCapacity, + UErrorCode* status){ + if(status == nullptr || U_FAILURE(*status)){ + return 0; + } + + UnicodeString stringArray[MAX_SPLIT_STRINGS]; + RegexPattern *pattern = RegexPattern::compile(UnicodeString("@"), UREGEX_MULTILINE, *status); + UnicodeString src(source, srcLen); + + if (U_FAILURE(*status)) { + return 0; + } + pattern->split(src, stringArray,MAX_SPLIT_STRINGS , *status); + + if(stringArray[0].indexOf((char16_t)AT_SIGN)==-1){ + int32_t destLen = stringArray[0].extract(*dest, destCapacity, *status); + return trim(*dest, destLen, status); + } + return 0; +} + +U_CFUNC int32_t +getCount(const char16_t* source, int32_t srcLen, + UParseCommentsOption option, UErrorCode *status){ + + if(status == nullptr || U_FAILURE(*status)){ + return 0; + } + + UnicodeString stringArray[MAX_SPLIT_STRINGS]; + RegexPattern *pattern = RegexPattern::compile(UnicodeString("@"), UREGEX_MULTILINE, *status); + UnicodeString src (source, srcLen); + + + if (U_FAILURE(*status)) { + return 0; + } + int32_t retLen = pattern->split(src, stringArray, MAX_SPLIT_STRINGS, *status); + + UnicodeString 
patternString(patternStrings[option]); + RegexMatcher matcher(patternString, UREGEX_DOTALL, *status); + if (U_FAILURE(*status)) { + return 0; + } + int32_t count = 0; + for(int32_t i=0; i<retLen; i++){ + matcher.reset(stringArray[i]); + if(matcher.lookingAt(*status)){ + count++; + } + } + if(option == UPC_TRANSLATE && count > 1){ + fprintf(stderr, "Multiple @translate tags cannot be supported.\n"); + exit(U_UNSUPPORTED_ERROR); + } + return count; +} + +U_CFUNC int32_t +getAt(const char16_t* source, int32_t srcLen, + char16_t** dest, int32_t destCapacity, + int32_t index, + UParseCommentsOption option, + UErrorCode* status){ + + if(status == nullptr || U_FAILURE(*status)){ + return 0; + } + + UnicodeString stringArray[MAX_SPLIT_STRINGS]; + RegexPattern *pattern = RegexPattern::compile(UnicodeString("@"), UREGEX_MULTILINE, *status); + UnicodeString src (source, srcLen); + + + if (U_FAILURE(*status)) { + return 0; + } + int32_t retLen = pattern->split(src, stringArray, MAX_SPLIT_STRINGS, *status); + + UnicodeString patternString(patternStrings[option]); + RegexMatcher matcher(patternString, UREGEX_DOTALL, *status); + if (U_FAILURE(*status)) { + return 0; + } + int32_t count = 0; + for(int32_t i=0; i<retLen; i++){ + matcher.reset(stringArray[i]); + if(matcher.lookingAt(*status)){ + if(count == index){ + UnicodeString out = matcher.group(1, *status); + return out.extract(*dest, destCapacity,*status); + } + count++; + + } + } + return 0; + +} + +U_CFUNC int32_t +getTranslate( const char16_t* source, int32_t srcLen, + char16_t** dest, int32_t destCapacity, + UErrorCode* status){ + UnicodeString notePatternString("^translate\\s*?(.*)"); + + int32_t destLen = getText(source, srcLen, dest, destCapacity, notePatternString, status); + return trim(*dest, destLen, status); +} + +U_CFUNC int32_t +getNote(const char16_t* source, int32_t srcLen, + char16_t** dest, int32_t destCapacity, + UErrorCode* status){ + + UnicodeString notePatternString("^note\\s*?(.*)"); + int32_t destLen = 
getText(source, srcLen, dest, destCapacity, notePatternString, status); + return trim(*dest, destLen, status); + +} + +#endif /* UCONFIG_NO_REGULAR_EXPRESSIONS */ + diff --git a/intl/icu/source/tools/genrb/prscmnts.h b/intl/icu/source/tools/genrb/prscmnts.h new file mode 100644 index 0000000000..43195d2d30 --- /dev/null +++ b/intl/icu/source/tools/genrb/prscmnts.h @@ -0,0 +1,66 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 1998-2016, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* +* File read.h +* +* Modification History: +* +* Date Name Description +* 05/26/99 stephen Creation. +* 5/10/01 Ram removed ustdio dependency +******************************************************************************* +*/ + +#ifndef PRSCMNTS_H +#define PRSCMNTS_H 1 + +#include "unicode/utypes.h" + +#if UCONFIG_NO_REGULAR_EXPRESSIONS==0 /* donot compile when no RegularExpressions are available */ + +enum UParseCommentsOption { + UPC_TRANSLATE, + UPC_NOTE, + UPC_LIMIT +}; + +typedef enum UParseCommentsOption UParseCommentsOption; + +U_CFUNC int32_t +getNote(const UChar* source, int32_t srcLen, + UChar** dest, int32_t destCapacity, + UErrorCode* status); +U_CFUNC int32_t +removeCmtText(UChar* source, int32_t srcLen, UErrorCode* status); + +U_CFUNC int32_t +getDescription( const UChar* source, int32_t srcLen, + UChar** dest, int32_t destCapacity, + UErrorCode* status); +U_CFUNC int32_t +getTranslate( const UChar* source, int32_t srcLen, + UChar** dest, int32_t destCapacity, + UErrorCode* status); + +U_CFUNC int32_t +getAt(const UChar* source, int32_t srcLen, + UChar** dest, int32_t destCapacity, + int32_t index, + UParseCommentsOption option, + UErrorCode* status); + +U_CFUNC int32_t 
+getCount(const UChar* source, int32_t srcLen, + UParseCommentsOption option, UErrorCode *status); + +#endif /* UCONFIG_NO_REGULAR_EXPRESSIONS */ + +#endif + diff --git a/intl/icu/source/tools/genrb/rbutil.c b/intl/icu/source/tools/genrb/rbutil.c new file mode 100644 index 0000000000..ed3e66b250 --- /dev/null +++ b/intl/icu/source/tools/genrb/rbutil.c @@ -0,0 +1,119 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 1998-2008, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* +* File util.c +* +* Modification History: +* +* Date Name Description +* 06/10/99 stephen Creation. +* 02/07/08 Spieth Correct XLIFF generation on EBCDIC platform +* +******************************************************************************* +*/ + +#include "unicode/putil.h" +#include "rbutil.h" +#include "cmemory.h" +#include "cstring.h" + + +/* go from "/usr/local/include/curses.h" to "/usr/local/include" */ +void +get_dirname(char *dirname, + const char *filename) +{ + const char *lastSlash = uprv_strrchr(filename, U_FILE_SEP_CHAR); + if (lastSlash != NULL) { + lastSlash++; + } + + if(lastSlash>filename) { + uprv_strncpy(dirname, filename, (lastSlash - filename)); + *(dirname + (lastSlash - filename)) = '\0'; + } else { + *dirname = '\0'; + } +} + +/* go from "/usr/local/include/curses.h" to "curses" */ +void +get_basename(char *basename, + const char *filename) +{ + /* strip off any leading directory portions */ + const char *lastSlash = uprv_strrchr(filename, U_FILE_SEP_CHAR); + if (lastSlash != NULL) { + lastSlash++; + } + char *lastDot; + + if(lastSlash>filename) { + uprv_strcpy(basename, lastSlash); + } else { + uprv_strcpy(basename, filename); + } + + /* strip off any suffix */ + lastDot = 
uprv_strrchr(basename, '.'); + + if(lastDot != NULL) { + *lastDot = '\0'; + } +} + +#define MAX_DIGITS 10 +int32_t +itostr(char * buffer, int32_t i, uint32_t radix, int32_t pad) +{ + const char digits[16] = {'0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f'}; + int32_t length = 0; + int32_t num = 0; + int32_t save = i; + int digit; + int32_t j; + char temp; + + /* if i is negative make it positive */ + if(i<0){ + i=-i; + } + + do{ + digit = (int)(i % radix); + buffer[length++]= digits[digit]; + i=i/radix; + } while(i); + + while (length < pad){ + buffer[length++] = '0';/*zero padding */ + } + + /* if i is negative add the negative sign */ + if(save < 0){ + buffer[length++]='-'; + } + + /* null terminate the buffer */ + if(length<MAX_DIGITS){ + buffer[length] = 0x0000; + } + + num= (pad>=length) ? pad :length; + + + /* Reverses the string */ + for (j = 0; j < (num / 2); j++){ + temp = buffer[(length-1) - j]; + buffer[(length-1) - j] = buffer[j]; + buffer[j] = temp; + } + return length; +} diff --git a/intl/icu/source/tools/genrb/rbutil.h b/intl/icu/source/tools/genrb/rbutil.h new file mode 100644 index 0000000000..9a12c50959 --- /dev/null +++ b/intl/icu/source/tools/genrb/rbutil.h @@ -0,0 +1,33 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 1998-2016, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* +* File rbutil.h +* +* Modification History: +* +* Date Name Description +* 06/10/99 stephen Creation. 
+******************************************************************************* +*/ + +#ifndef UTIL_H +#define UTIL_H 1 + +#include "unicode/utypes.h" + +U_CDECL_BEGIN + +void get_dirname(char *dirname, const char *filename); +void get_basename(char *basename, const char *filename); +int32_t itostr(char * buffer, int32_t i, uint32_t radix, int32_t pad); + +U_CDECL_END + +#endif /* ! UTIL_H */ diff --git a/intl/icu/source/tools/genrb/read.c b/intl/icu/source/tools/genrb/read.c new file mode 100644 index 0000000000..0d4a318a89 --- /dev/null +++ b/intl/icu/source/tools/genrb/read.c @@ -0,0 +1,479 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 1998-2012, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* +* File read.c +* +* Modification History: +* +* Date Name Description +* 05/26/99 stephen Creation. 
+* 5/10/01 Ram removed ustdio dependency +******************************************************************************* +*/ + +#include <stdbool.h> + +#include "read.h" +#include "errmsg.h" +#include "toolutil.h" +#include "unicode/ustring.h" +#include "unicode/utf16.h" + +#define OPENBRACE 0x007B +#define CLOSEBRACE 0x007D +#define COMMA 0x002C +#define QUOTE 0x0022 +#define ESCAPE 0x005C +#define SLASH 0x002F +#define ASTERISK 0x002A +#define SPACE 0x0020 +#define COLON 0x003A +#define BADBOM 0xFFFE +#define CR 0x000D +#define LF 0x000A + +static int32_t lineCount; + +/* Protos */ +static enum ETokenType getStringToken(UCHARBUF *buf, + UChar32 initialChar, + struct UString *token, + UErrorCode *status); + +static UChar32 getNextChar (UCHARBUF *buf, UBool skipwhite, struct UString *token, UErrorCode *status); +static void seekUntilNewline (UCHARBUF *buf, struct UString *token, UErrorCode *status); +static void seekUntilEndOfComment (UCHARBUF *buf, struct UString *token, UErrorCode *status); +static UBool isWhitespace (UChar32 c); +static UBool isNewline (UChar32 c); + +U_CFUNC void resetLineNumber() { + lineCount = 1; +} + +/* Read and return the next token from the stream. If the token is of + type eString, fill in the token parameter with the token. If the + token is eError, then the status parameter will contain the + specific error. This will be eItemNotFound at the end of file, + indicating that all tokens have been returned. This method will + never return eString twice in a row; instead, multiple adjacent + string tokens will be merged into one, with no intervening + space. 
*/
/*
 * Parameters: buf is the input buffer; token receives the text of a string
 * token; *linenumber receives the current value of lineCount; comment is
 * handed to getNextChar() while leading whitespace and comments are skipped.
 * Returns TOK_ERROR immediately when *status is already a failure.
 */
U_CFUNC enum ETokenType
getNextToken(UCHARBUF* buf,
             struct UString *token,
             uint32_t *linenumber, /* out: linenumber of token */
             struct UString *comment,
             UErrorCode *status) {
    enum ETokenType result;
    UChar32         c;

    if (U_FAILURE(*status)) {
        return TOK_ERROR;
    }

    /* Skip whitespace */
    c = getNextChar(buf, true, comment, status);

    if (U_FAILURE(*status)) {
        return TOK_ERROR;
    }

    /* line of the first character of the token */
    *linenumber = lineCount;

    switch(c) {
    case BADBOM:
        return TOK_ERROR;
    case OPENBRACE:
        return TOK_OPEN_BRACE;
    case CLOSEBRACE:
        return TOK_CLOSE_BRACE;
    case COMMA:
        return TOK_COMMA;
    case U_EOF:
        return TOK_EOF;
    case COLON:
        return TOK_COLON;

    default:
        /* anything else starts a string token */
        result = getStringToken(buf, c, token, status);
    }

    /* reading a string token may have advanced lineCount; report the line
       reached after it */
    *linenumber = lineCount;
    return result;
}

/* Copy a string token into the given UnicodeString.  Upon entry, we
   have already read the first character of the string token, which is
   not a whitespace character (but may be a QUOTE or ESCAPE). This
   function reads all subsequent characters that belong with this
   string, and copy them into the token parameter. The other
   important, and slightly convoluted purpose of this function is to
   merge adjacent strings.  It looks forward a bit, and if the next
   non comment, non whitespace item is a string, it reads it in as
   well.  If two adjacent strings are quoted, they are merged without
   intervening space. Otherwise a single SPACE character is
   inserted. */
static enum ETokenType getStringToken(UCHARBUF* buf,
                                      UChar32 initialChar,
                                      struct UString *token,
                                      UErrorCode *status) {
    UBool    lastStringWasQuoted;
    UChar32  c;
    /* scratch buffer for the UTF-16 form of one code point */
    UChar    target[3] = { '\0' };
    UChar    *pTarget   = target;
    int      len=0;
    /* set when unescape() itself returned ESCAPE: the next character is then
       appended literally */
    UBool    isFollowingCharEscaped=false;
    /* set when the current CR/LF was produced by unescape(), i.e. it is not
       a physical line break in the input */
    UBool    isNLUnescaped = false;
    UChar32  prevC=0;

    /* We are guaranteed on entry that initialChar is not a whitespace
       character. If we are at the EOF, or have some other problem, it
       doesn't matter; we still want to validly return the initialChar
       (if nothing else) as a string token. */

    if (U_FAILURE(*status)) {
        return TOK_ERROR;
    }

    /* setup */
    lastStringWasQuoted = false;
    c = initialChar;
    ustr_setlen(token, 0, status);

    if (U_FAILURE(*status)) {
        return TOK_ERROR;
    }

    /* each iteration consumes one quoted or unquoted segment */
    for (;;) {
        if (c == QUOTE) {
            /* a quoted segment after an unquoted one is joined with a space */
            if (!lastStringWasQuoted && token->fLength > 0) {
                ustr_ucat(token, SPACE, status);

                if (U_FAILURE(*status)) {
                    return TOK_ERROR;
                }
            }

            lastStringWasQuoted = true;

            /* read up to the closing, unescaped QUOTE */
            for (;;) {
                c = ucbuf_getc(buf,status);

                /* EOF reached */
                if (c == U_EOF) {
                    return TOK_EOF;
                }

                /* Unterminated quoted strings */
                if (U_FAILURE(*status)) {
                    return TOK_ERROR;
                }

                if (c == QUOTE && !isFollowingCharEscaped) {
                    break;
                }

                if (c == ESCAPE  && !isFollowingCharEscaped) {
                    pTarget = target;
                    c       = unescape(buf, status);

                    if (c == U_ERR) {
                        return TOK_ERROR;
                    }
                    if(c == CR || c == LF){
                        isNLUnescaped = true;
                    }
                }

                if(c==ESCAPE && !isFollowingCharEscaped){
                    /* append nothing now; take the next character literally */
                    isFollowingCharEscaped = true;
                }else{
                    U_APPEND_CHAR32(c, pTarget,len);
                    pTarget = target;
                    ustr_uscat(token, pTarget,len, status);
                    isFollowingCharEscaped = false;
                    len=0;
                    if(c == CR || c == LF){
                        /* count only line breaks that did not come from an
                           escape and do not directly follow a CR */
                        if(isNLUnescaped == false && prevC!=CR){
                            lineCount++;
                        }
                        isNLUnescaped = false;
                    }
                }

                if (U_FAILURE(*status)) {
                    return TOK_ERROR;
                }
                prevC = c;
            }
        } else {
            /* an unquoted segment is always separated from earlier text by a space */
            if (token->fLength > 0) {
                ustr_ucat(token, SPACE, status);

                if (U_FAILURE(*status)) {
                    return TOK_ERROR;
                }
            }

            /* mixing quoted and unquoted strings:
             * warn in normal mode and error in
             * pedantic (strict) mode
             */
            if(lastStringWasQuoted){
                if(getShowWarning()){
                    warning(lineCount, "Mixing quoted and unquoted strings");
                }
                if(isStrict()){
                    return TOK_ERROR;
                }

            }

            lastStringWasQuoted = false;

            /* if we reach here we are mixing
             * quoted and unquoted strings
             * warn in normal mode and error in
             * pedantic mode
             */

            if (c == ESCAPE) {
                pTarget = target;
                c       = unescape(buf, status);

                /* EOF reached */
                /* NOTE(review): the other unescape() call sites test U_ERR,
                   this one tests U_EOF -- confirm which is intended */
                if (c == U_EOF) {
                    return TOK_ERROR;
                }
            }

            /* append the first character of the unquoted segment */
            U_APPEND_CHAR32(c, pTarget,len);
            pTarget = target;
            ustr_uscat(token, pTarget,len, status);
            len=0;

            if (U_FAILURE(*status)) {
                return TOK_ERROR;
            }

            /* read the rest of the segment, up to whitespace or a delimiter */
            for (;;) {
                /* DON'T skip whitespace */
                c = getNextChar(buf, false, NULL, status);

                /* EOF reached */
                if (c == U_EOF) {
                    ucbuf_ungetc(c, buf);
                    return TOK_STRING;
                }

                if (U_FAILURE(*status)) {
                    return TOK_STRING;
                }

                /* delimiters are pushed back for the caller to read */
                if (c == QUOTE
                        || c == OPENBRACE
                        || c == CLOSEBRACE
                        || c == COMMA
                        || c == COLON) {
                    ucbuf_ungetc(c, buf);
                    break;
                }

                /* whitespace ends the segment and is consumed */
                if (isWhitespace(c)) {
                    break;
                }

                if (c == ESCAPE) {
                    pTarget = target;
                    c       = unescape(buf, status);

                    if (c == U_ERR) {
                        return TOK_ERROR;
                    }
                }

                U_APPEND_CHAR32(c, pTarget,len);
                pTarget = target;
                ustr_uscat(token, pTarget,len, status);
                len=0;
                if (U_FAILURE(*status)) {
                    return TOK_ERROR;
                }
            }
        }

        /* DO skip whitespace */
        c = getNextChar(buf, true, NULL, status);

        if (U_FAILURE(*status)) {
            return TOK_STRING;
        }

        /* a structural character ends the token; any other character starts
           another segment that is merged into the same token */
        if (c == OPENBRACE || c == CLOSEBRACE || c == COMMA || c == COLON) {
            ucbuf_ungetc(c, buf);
            return TOK_STRING;
        }
    }
}

/* Retrieve the next character.  If skipwhite is
   true, whitespace is skipped as well.
*/ +static UChar32 getNextChar(UCHARBUF* buf, + UBool skipwhite, + struct UString *token, + UErrorCode *status) { + UChar32 c, c2; + + if (U_FAILURE(*status)) { + return U_EOF; + } + + for (;;) { + c = ucbuf_getc(buf,status); + + if (c == U_EOF) { + return U_EOF; + } + + if (skipwhite && isWhitespace(c)) { + continue; + } + + /* This also handles the get() failing case */ + if (c != SLASH) { + return c; + } + + c = ucbuf_getc(buf,status); /* "/c" */ + + if (c == U_EOF) { + return U_EOF; + } + + switch (c) { + case SLASH: /* "//" */ + seekUntilNewline(buf, NULL, status); + break; + + case ASTERISK: /* " / * " */ + c2 = ucbuf_getc(buf, status); /* "/ * c" */ + if(c2 == ASTERISK){ /* "/ * *" */ + /* parse multi-line comment and store it in token*/ + seekUntilEndOfComment(buf, token, status); + } else { + ucbuf_ungetc(c2, buf); /* c2 is the non-asterisk following "/ *". Include c2 back in buffer. */ + seekUntilEndOfComment(buf, NULL, status); + } + break; + + default: + ucbuf_ungetc(c, buf); /* "/c" - put back the c */ + /* If get() failed this is a NOP */ + return SLASH; + } + + } +} + +static void seekUntilNewline(UCHARBUF* buf, + struct UString *token, + UErrorCode *status) { + UChar32 c; + + if (U_FAILURE(*status)) { + return; + } + + do { + c = ucbuf_getc(buf,status); + /* add the char to token */ + if(token!=NULL){ + ustr_u32cat(token, c, status); + } + } while (!isNewline(c) && c != U_EOF && *status == U_ZERO_ERROR); +} + +static void seekUntilEndOfComment(UCHARBUF *buf, + struct UString *token, + UErrorCode *status) { + UChar32 c, d; + uint32_t line; + + if (U_FAILURE(*status)) { + return; + } + + line = lineCount; + + do { + c = ucbuf_getc(buf, status); + + if (c == ASTERISK) { + d = ucbuf_getc(buf, status); + + if (d != SLASH) { + ucbuf_ungetc(d, buf); + } else { + break; + } + } + /* add the char to token */ + if(token!=NULL){ + ustr_u32cat(token, c, status); + } + /* increment the lineCount */ + isNewline(c); + + } while (c != U_EOF && *status == 
U_ZERO_ERROR); + + if (c == U_EOF) { + *status = U_INVALID_FORMAT_ERROR; + error(line, "unterminated comment detected"); + } +} + +U_CFUNC UChar32 unescape(UCHARBUF *buf, UErrorCode *status) { + if (U_FAILURE(*status)) { + return U_EOF; + } + + /* We expect to be called after the ESCAPE has been seen, but + * u_fgetcx needs an ESCAPE to do its magic. */ + ucbuf_ungetc(ESCAPE, buf); + + return ucbuf_getcx32(buf, status); +} + +static UBool isWhitespace(UChar32 c) { + switch (c) { + /* ' ', '\t', '\n', '\r', 0x2029, 0xFEFF */ + case 0x000A: + case 0x2029: + lineCount++; + case 0x000D: + case 0x0020: + case 0x0009: + case 0xFEFF: + return true; + + default: + return false; + } +} + +static UBool isNewline(UChar32 c) { + switch (c) { + /* '\n', '\r', 0x2029 */ + case 0x000A: + case 0x2029: + lineCount++; + case 0x000D: + return true; + + default: + return false; + } +} diff --git a/intl/icu/source/tools/genrb/read.h b/intl/icu/source/tools/genrb/read.h new file mode 100644 index 0000000000..e5b8d155da --- /dev/null +++ b/intl/icu/source/tools/genrb/read.h @@ -0,0 +1,54 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 1998-2011, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* +* File read.h +* +* Modification History: +* +* Date Name Description +* 05/26/99 stephen Creation. +* 5/10/01 Ram removed ustdio dependency +******************************************************************************* +*/ + +#ifndef READ_H +#define READ_H 1 + +#include "unicode/utypes.h" +#include "ustr.h" +#include "ucbuf.h" + +/* The types of tokens which may be returned by getNextToken. 
+ NOTE: Keep these in sync with tokenNames in parse.c */ +enum ETokenType +{ + TOK_STRING, /* A string token, such as "MonthNames" */ + TOK_OPEN_BRACE, /* An opening brace character */ + TOK_CLOSE_BRACE, /* A closing brace character */ + TOK_COMMA, /* A comma */ + TOK_COLON, /* A colon */ + + TOK_EOF, /* End of the file has been reached successfully */ + TOK_ERROR, /* An error, such an unterminated quoted string */ + TOK_TOKEN_COUNT /* Number of "real" token types */ +}; + +U_CFUNC UChar32 unescape(UCHARBUF *buf, UErrorCode *status); + +U_CFUNC void resetLineNumber(void); + +U_CFUNC enum ETokenType +getNextToken(UCHARBUF *buf, + struct UString *token, + uint32_t *linenumber, /* out: linenumber of token */ + struct UString *comment, + UErrorCode *status); + +#endif diff --git a/intl/icu/source/tools/genrb/reslist.cpp b/intl/icu/source/tools/genrb/reslist.cpp new file mode 100644 index 0000000000..e1c2d25061 --- /dev/null +++ b/intl/icu/source/tools/genrb/reslist.cpp @@ -0,0 +1,1794 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2000-2015, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* +* File reslist.cpp +* +* Modification History: +* +* Date Name Description +* 02/21/00 weiv Creation. +******************************************************************************* +*/ + +// Safer use of UnicodeString. +#ifndef UNISTR_FROM_CHAR_EXPLICIT +# define UNISTR_FROM_CHAR_EXPLICIT explicit +#endif + +// Less important, but still a good idea. 
+#ifndef UNISTR_FROM_STRING_EXPLICIT +# define UNISTR_FROM_STRING_EXPLICIT explicit +#endif + +#include <assert.h> +#include <iostream> +#include <set> +#include <stdio.h> + +#include "unicode/localpointer.h" +#include "reslist.h" +#include "unewdata.h" +#include "unicode/ures.h" +#include "unicode/putil.h" +#include "errmsg.h" +#include "filterrb.h" +#include "toolutil.h" + +#include "uarrsort.h" +#include "uelement.h" +#include "uhash.h" +#include "uinvchar.h" +#include "ustr_imp.h" +#include "unicode/utf16.h" +#include "uassert.h" + +/* + * Align binary data at a 16-byte offset from the start of the resource bundle, + * to be safe for any data type it may contain. + */ +#define BIN_ALIGNMENT 16 + +// This numeric constant must be at least 1. +// If StringResource.fNumUnitsSaved == 0 then the string occurs only once, +// and it makes no sense to move it to the pool bundle. +// The larger the threshold for fNumUnitsSaved +// the smaller the savings, and the smaller the pool bundle. +// We trade some total size reduction to reduce the pool bundle a bit, +// so that one can reasonably save data size by +// removing bundle files without rebuilding the pool bundle. +// This can also help to keep the pool and total (pool+local) string indexes +// within 16 bits, that is, within range of Table16 and Array16 containers. +#ifndef GENRB_MIN_16BIT_UNITS_SAVED_FOR_POOL_STRING +# define GENRB_MIN_16BIT_UNITS_SAVED_FOR_POOL_STRING 10 +#endif + +U_NAMESPACE_USE + +static UBool gIncludeCopyright = false; +static UBool gUsePoolBundle = false; +static UBool gIsDefaultFormatVersion = true; +static int32_t gFormatVersion = 3; + +/* How do we store string values? 
*/ +enum { + STRINGS_UTF16_V1, /* formatVersion 1: int length + UChars + NUL + padding to 4 bytes */ + STRINGS_UTF16_V2 /* formatVersion 2 & up: optional length in 1..3 UChars + UChars + NUL */ +}; + +static const int32_t MAX_IMPLICIT_STRING_LENGTH = 40; /* do not store the length explicitly for such strings */ + +static const ResFile kNoPoolBundle; + +/* + * res_none() returns the address of kNoResource, + * for use in non-error cases when no resource is to be added to the bundle. + * (nullptr is used in error cases.) + */ +static SResource kNoResource; // TODO: const + +static UDataInfo dataInfo= { + sizeof(UDataInfo), + 0, + + U_IS_BIG_ENDIAN, + U_CHARSET_FAMILY, + sizeof(char16_t), + 0, + + {0x52, 0x65, 0x73, 0x42}, /* dataFormat="ResB" */ + {1, 3, 0, 0}, /* formatVersion */ + {1, 4, 0, 0} /* dataVersion take a look at version inside parsed resb*/ +}; + +static const UVersionInfo gFormatVersions[4] = { /* indexed by a major-formatVersion integer */ + { 0, 0, 0, 0 }, + { 1, 3, 0, 0 }, + { 2, 0, 0, 0 }, + { 3, 0, 0, 0 } +}; +// Remember to update genrb.h GENRB_VERSION when changing the data format. +// (Or maybe we should remove GENRB_VERSION and report the ICU version number?) + +static uint8_t calcPadding(uint32_t size) { + /* returns space we need to pad */ + return (uint8_t) ((size % sizeof(uint32_t)) ? 
(sizeof(uint32_t) - (size % sizeof(uint32_t))) : 0); + +} + +void setIncludeCopyright(UBool val){ + gIncludeCopyright=val; +} + +UBool getIncludeCopyright(){ + return gIncludeCopyright; +} + +void setFormatVersion(int32_t formatVersion) { + gIsDefaultFormatVersion = false; + gFormatVersion = formatVersion; +} + +int32_t getFormatVersion() { + return gFormatVersion; +} + +void setUsePoolBundle(UBool use) { + gUsePoolBundle = use; +} + +// TODO: return const pointer, or find another way to express "none" +struct SResource* res_none() { + return &kNoResource; +} + +SResource::SResource() + : fType(URES_NONE), fWritten(false), fRes(RES_BOGUS), fRes16(-1), fKey(-1), fKey16(-1), + line(0), fNext(nullptr) { + ustr_init(&fComment); +} + +SResource::SResource(SRBRoot *bundle, const char *tag, int8_t type, const UString* comment, + UErrorCode &errorCode) + : fType(type), fWritten(false), fRes(RES_BOGUS), fRes16(-1), + fKey(bundle != nullptr ? bundle->addTag(tag, errorCode) : -1), fKey16(-1), + line(0), fNext(nullptr) { + ustr_init(&fComment); + if(comment != nullptr) { + ustr_cpy(&fComment, comment, &errorCode); + } +} + +SResource::~SResource() { + ustr_deinit(&fComment); +} + +ContainerResource::~ContainerResource() { + SResource *current = fFirst; + while (current != nullptr) { + SResource *next = current->fNext; + delete current; + current = next; + } +} + +TableResource::~TableResource() {} + +// TODO: clarify that containers adopt new items, even in error cases; use LocalPointer +void TableResource::add(SResource *res, int linenumber, UErrorCode &errorCode) { + if (U_FAILURE(errorCode) || res == nullptr || res == &kNoResource) { + return; + } + + /* remember this linenumber to report to the user if there is a duplicate key */ + res->line = linenumber; + + /* here we need to traverse the list */ + ++fCount; + + /* is the list still empty? 
*/ + if (fFirst == nullptr) { + fFirst = res; + res->fNext = nullptr; + return; + } + + const char *resKeyString = fRoot->fKeys + res->fKey; + + SResource *current = fFirst; + + SResource *prev = nullptr; + while (current != nullptr) { + const char *currentKeyString = fRoot->fKeys + current->fKey; + int diff; + /* + * formatVersion 1: compare key strings in native-charset order + * formatVersion 2 and up: compare key strings in ASCII order + */ + if (gFormatVersion == 1 || U_CHARSET_FAMILY == U_ASCII_FAMILY) { + diff = uprv_strcmp(currentKeyString, resKeyString); + } else { + diff = uprv_compareInvCharsAsAscii(currentKeyString, resKeyString); + } + if (diff < 0) { + prev = current; + current = current->fNext; + } else if (diff > 0) { + /* we're either in front of the list, or in the middle */ + if (prev == nullptr) { + /* front of the list */ + fFirst = res; + } else { + /* middle of the list */ + prev->fNext = res; + } + + res->fNext = current; + return; + } else { + /* Key already exists! ERROR! 
*/ + error(linenumber, "duplicate key '%s' in table, first appeared at line %d", currentKeyString, current->line); + errorCode = U_UNSUPPORTED_ERROR; + return; + } + } + + /* end of list */ + prev->fNext = res; + res->fNext = nullptr; +} + +ArrayResource::~ArrayResource() {} + +void ArrayResource::add(SResource *res) { + if (res != nullptr && res != &kNoResource) { + if (fFirst == nullptr) { + fFirst = res; + } else { + fLast->fNext = res; + } + fLast = res; + ++fCount; + } +} + +PseudoListResource::~PseudoListResource() {} + +void PseudoListResource::add(SResource *res) { + if (res != nullptr && res != &kNoResource) { + res->fNext = fFirst; + fFirst = res; + ++fCount; + } +} + +StringBaseResource::StringBaseResource(SRBRoot *bundle, const char *tag, int8_t type, + const char16_t *value, int32_t len, + const UString* comment, UErrorCode &errorCode) + : SResource(bundle, tag, type, comment, errorCode) { + if (len == 0 && gFormatVersion > 1) { + fRes = URES_MAKE_EMPTY_RESOURCE(type); + fWritten = true; + return; + } + + fString.setTo(ConstChar16Ptr(value), len); + fString.getTerminatedBuffer(); // Some code relies on NUL-termination. + if (U_SUCCESS(errorCode) && fString.isBogus()) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + } +} + +StringBaseResource::StringBaseResource(SRBRoot *bundle, int8_t type, + const icu::UnicodeString &value, UErrorCode &errorCode) + : SResource(bundle, nullptr, type, nullptr, errorCode), fString(value) { + if (value.isEmpty() && gFormatVersion > 1) { + fRes = URES_MAKE_EMPTY_RESOURCE(type); + fWritten = true; + return; + } + + fString.getTerminatedBuffer(); // Some code relies on NUL-termination. + if (U_SUCCESS(errorCode) && fString.isBogus()) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + } +} + +// Pool bundle string, alias the buffer. Guaranteed NUL-terminated and not empty. 
+StringBaseResource::StringBaseResource(int8_t type, const char16_t *value, int32_t len, + UErrorCode &errorCode) + : SResource(nullptr, nullptr, type, nullptr, errorCode), fString(true, value, len) { + assert(len > 0); + assert(!fString.isBogus()); +} + +StringBaseResource::~StringBaseResource() {} + +static int32_t U_CALLCONV +string_hash(const UElement key) { + const StringResource *res = static_cast<const StringResource *>(key.pointer); + return res->fString.hashCode(); +} + +static UBool U_CALLCONV +string_comp(const UElement key1, const UElement key2) { + const StringResource *res1 = static_cast<const StringResource *>(key1.pointer); + const StringResource *res2 = static_cast<const StringResource *>(key2.pointer); + return res1->fString == res2->fString; +} + +StringResource::~StringResource() {} + +AliasResource::~AliasResource() {} + +IntResource::IntResource(SRBRoot *bundle, const char *tag, int32_t value, + const UString* comment, UErrorCode &errorCode) + : SResource(bundle, tag, URES_INT, comment, errorCode) { + fValue = value; + fRes = URES_MAKE_RESOURCE(URES_INT, value & RES_MAX_OFFSET); + fWritten = true; +} + +IntResource::~IntResource() {} + +IntVectorResource::IntVectorResource(SRBRoot *bundle, const char *tag, + const UString* comment, UErrorCode &errorCode) + : SResource(bundle, tag, URES_INT_VECTOR, comment, errorCode), + fCount(0), fSize(RESLIST_INT_VECTOR_INIT_SIZE), + fArray(new uint32_t[fSize]) { + if (fArray == nullptr) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return; + } +} + +IntVectorResource::~IntVectorResource() { + delete[] fArray; +} + +void IntVectorResource::add(int32_t value, UErrorCode &errorCode) { + if (fCount == fSize) { + uint32_t* tmp = new uint32_t[2 * fSize]; + if (tmp == nullptr) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return; + } + uprv_memcpy(tmp, fArray, fSize * sizeof(uint32_t)); + delete[] fArray; + fArray = tmp; + fSize *= 2; + } + if (U_SUCCESS(errorCode)) { + fArray[fCount++] = value; + } +} + 
+BinaryResource::BinaryResource(SRBRoot *bundle, const char *tag, + uint32_t length, uint8_t *data, const char* fileName, + const UString* comment, UErrorCode &errorCode) + : SResource(bundle, tag, URES_BINARY, comment, errorCode), + fLength(length), fData(nullptr), fFileName(nullptr) { + if (U_FAILURE(errorCode)) { + return; + } + if (fileName != nullptr && *fileName != 0){ + fFileName = new char[uprv_strlen(fileName)+1]; + if (fFileName == nullptr) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return; + } + uprv_strcpy(fFileName, fileName); + } + if (length > 0) { + fData = new uint8_t[length]; + if (fData == nullptr) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return; + } + uprv_memcpy(fData, data, length); + } else { + if (gFormatVersion > 1) { + fRes = URES_MAKE_EMPTY_RESOURCE(URES_BINARY); + fWritten = true; + } + } +} + +BinaryResource::~BinaryResource() { + delete[] fData; + delete[] fFileName; +} + +/* Writing Functions */ + +void +StringResource::handlePreflightStrings(SRBRoot *bundle, UHashtable *stringSet, + UErrorCode &errorCode) { + assert(fSame == nullptr); + fSame = static_cast<StringResource *>(uhash_get(stringSet, this)); + if (fSame != nullptr) { + // This is a duplicate of a pool bundle string or of an earlier-visited string. + if (++fSame->fNumCopies == 1) { + assert(fSame->fWritten); + int32_t poolStringIndex = (int32_t)RES_GET_OFFSET(fSame->fRes); + if (poolStringIndex >= bundle->fPoolStringIndexLimit) { + bundle->fPoolStringIndexLimit = poolStringIndex + 1; + } + } + return; + } + /* Put this string into the set for finding duplicates. */ + fNumCopies = 1; + uhash_put(stringSet, this, this, &errorCode); + + if (bundle->fStringsForm != STRINGS_UTF16_V1) { + int32_t len = length(); + if (len <= MAX_IMPLICIT_STRING_LENGTH && + !U16_IS_TRAIL(fString[0]) && fString.indexOf((char16_t)0) < 0) { + /* + * This string will be stored without an explicit length. + * Runtime will detect !U16_IS_TRAIL(s[0]) and call u_strlen(). 
+ */ + fNumCharsForLength = 0; + } else if (len <= 0x3ee) { + fNumCharsForLength = 1; + } else if (len <= 0xfffff) { + fNumCharsForLength = 2; + } else { + fNumCharsForLength = 3; + } + bundle->f16BitStringsLength += fNumCharsForLength + len + 1; /* +1 for the NUL */ + } +} + +void +ContainerResource::handlePreflightStrings(SRBRoot *bundle, UHashtable *stringSet, + UErrorCode &errorCode) { + for (SResource *current = fFirst; current != nullptr; current = current->fNext) { + current->preflightStrings(bundle, stringSet, errorCode); + } +} + +void +SResource::preflightStrings(SRBRoot *bundle, UHashtable *stringSet, UErrorCode &errorCode) { + if (U_FAILURE(errorCode)) { + return; + } + if (fRes != RES_BOGUS) { + /* + * The resource item word was already precomputed, which means + * no further data needs to be written. + * This might be an integer, or an empty string/binary/etc. + */ + return; + } + handlePreflightStrings(bundle, stringSet, errorCode); +} + +void +SResource::handlePreflightStrings(SRBRoot * /*bundle*/, UHashtable * /*stringSet*/, + UErrorCode & /*errorCode*/) { + /* Neither a string nor a container. 
*/ +} + +int32_t +SRBRoot::makeRes16(uint32_t resWord) const { + if (resWord == 0) { + return 0; /* empty string */ + } + uint32_t type = RES_GET_TYPE(resWord); + int32_t offset = (int32_t)RES_GET_OFFSET(resWord); + if (type == URES_STRING_V2) { + assert(offset > 0); + if (offset < fPoolStringIndexLimit) { + if (offset < fPoolStringIndex16Limit) { + return offset; + } + } else { + offset = offset - fPoolStringIndexLimit + fPoolStringIndex16Limit; + if (offset <= 0xffff) { + return offset; + } + } + } + return -1; +} + +int32_t +SRBRoot::mapKey(int32_t oldpos) const { + const KeyMapEntry *map = fKeyMap; + if (map == nullptr) { + return oldpos; + } + int32_t i, start, limit; + + /* do a binary search for the old, pre-compactKeys() key offset */ + start = fUsePoolBundle->fKeysCount; + limit = start + fKeysCount; + while (start < limit - 1) { + i = (start + limit) / 2; + if (oldpos < map[i].oldpos) { + limit = i; + } else { + start = i; + } + } + assert(oldpos == map[start].oldpos); + return map[start].newpos; +} + +/* + * Only called for UTF-16 v1 strings and duplicate UTF-16 v2 strings. + * For unique UTF-16 v2 strings, write16() sees fRes != RES_BOGUS + * and exits early. + */ +void +StringResource::handleWrite16(SRBRoot * /*bundle*/) { + SResource *same; + if ((same = fSame) != nullptr) { + /* This is a duplicate. 
*/ + assert(same->fRes != RES_BOGUS && same->fWritten); + fRes = same->fRes; + fWritten = same->fWritten; + } +} + +void +ContainerResource::writeAllRes16(SRBRoot *bundle) { + for (SResource *current = fFirst; current != nullptr; current = current->fNext) { + bundle->f16BitUnits.append((char16_t)current->fRes16); + } + fWritten = true; +} + +void +ArrayResource::handleWrite16(SRBRoot *bundle) { + if (fCount == 0 && gFormatVersion > 1) { + fRes = URES_MAKE_EMPTY_RESOURCE(URES_ARRAY); + fWritten = true; + return; + } + + int32_t res16 = 0; + for (SResource *current = fFirst; current != nullptr; current = current->fNext) { + current->write16(bundle); + res16 |= current->fRes16; + } + if (fCount <= 0xffff && res16 >= 0 && gFormatVersion > 1) { + fRes = URES_MAKE_RESOURCE(URES_ARRAY16, bundle->f16BitUnits.length()); + bundle->f16BitUnits.append((char16_t)fCount); + writeAllRes16(bundle); + } +} + +void +TableResource::handleWrite16(SRBRoot *bundle) { + if (fCount == 0 && gFormatVersion > 1) { + fRes = URES_MAKE_EMPTY_RESOURCE(URES_TABLE); + fWritten = true; + return; + } + /* Find the smallest table type that fits the data. 
*/ + int32_t key16 = 0; + int32_t res16 = 0; + for (SResource *current = fFirst; current != nullptr; current = current->fNext) { + current->write16(bundle); + key16 |= current->fKey16; + res16 |= current->fRes16; + } + if(fCount > (uint32_t)bundle->fMaxTableLength) { + bundle->fMaxTableLength = fCount; + } + if (fCount <= 0xffff && key16 >= 0) { + if (res16 >= 0 && gFormatVersion > 1) { + /* 16-bit count, key offsets and values */ + fRes = URES_MAKE_RESOURCE(URES_TABLE16, bundle->f16BitUnits.length()); + bundle->f16BitUnits.append((char16_t)fCount); + for (SResource *current = fFirst; current != nullptr; current = current->fNext) { + bundle->f16BitUnits.append((char16_t)current->fKey16); + } + writeAllRes16(bundle); + } else { + /* 16-bit count, 16-bit key offsets, 32-bit values */ + fTableType = URES_TABLE; + } + } else { + /* 32-bit count, key offsets and values */ + fTableType = URES_TABLE32; + } +} + +void +PseudoListResource::handleWrite16(SRBRoot * /*bundle*/) { + fRes = URES_MAKE_EMPTY_RESOURCE(URES_TABLE); + fWritten = true; +} + +void +SResource::write16(SRBRoot *bundle) { + if (fKey >= 0) { + // A tagged resource has a non-negative key index into the parsed key strings. + // compactKeys() built a map from parsed key index to the final key index. + // After the mapping, negative key indexes are used for shared pool bundle keys. + fKey = bundle->mapKey(fKey); + // If the key index fits into a Key16 for a Table or Table16, + // then set the fKey16 field accordingly. + // Otherwise keep it at -1. + if (fKey >= 0) { + if (fKey < bundle->fLocalKeyLimit) { + fKey16 = fKey; + } + } else { + int32_t poolKeyIndex = fKey & 0x7fffffff; + if (poolKeyIndex <= 0xffff) { + poolKeyIndex += bundle->fLocalKeyLimit; + if (poolKeyIndex <= 0xffff) { + fKey16 = poolKeyIndex; + } + } + } + } + /* + * fRes != RES_BOGUS: + * The resource item word was already precomputed, which means + * no further data needs to be written. 
+ * This might be an integer, or an empty or UTF-16 v2 string, + * an empty binary, etc. + */ + if (fRes == RES_BOGUS) { + handleWrite16(bundle); + } + // Compute fRes16 for precomputed as well as just-computed fRes. + fRes16 = bundle->makeRes16(fRes); +} + +void +SResource::handleWrite16(SRBRoot * /*bundle*/) { + /* Only a few resource types write 16-bit units. */ +} + +/* + * Only called for UTF-16 v1 strings, and for aliases. + * For UTF-16 v2 strings, preWrite() sees fRes != RES_BOGUS + * and exits early. + */ +void +StringBaseResource::handlePreWrite(uint32_t *byteOffset) { + /* Write the UTF-16 v1 string. */ + fRes = URES_MAKE_RESOURCE(fType, *byteOffset >> 2); + *byteOffset += 4 + (length() + 1) * U_SIZEOF_UCHAR; +} + +void +IntVectorResource::handlePreWrite(uint32_t *byteOffset) { + if (fCount == 0 && gFormatVersion > 1) { + fRes = URES_MAKE_EMPTY_RESOURCE(URES_INT_VECTOR); + fWritten = true; + } else { + fRes = URES_MAKE_RESOURCE(URES_INT_VECTOR, *byteOffset >> 2); + *byteOffset += (1 + fCount) * 4; + } +} + +void +BinaryResource::handlePreWrite(uint32_t *byteOffset) { + uint32_t pad = 0; + uint32_t dataStart = *byteOffset + sizeof(fLength); + + if (dataStart % BIN_ALIGNMENT) { + pad = (BIN_ALIGNMENT - dataStart % BIN_ALIGNMENT); + *byteOffset += pad; /* pad == 4 or 8 or 12 */ + } + fRes = URES_MAKE_RESOURCE(URES_BINARY, *byteOffset >> 2); + *byteOffset += 4 + fLength; +} + +void +ContainerResource::preWriteAllRes(uint32_t *byteOffset) { + for (SResource *current = fFirst; current != nullptr; current = current->fNext) { + current->preWrite(byteOffset); + } +} + +void +ArrayResource::handlePreWrite(uint32_t *byteOffset) { + preWriteAllRes(byteOffset); + fRes = URES_MAKE_RESOURCE(URES_ARRAY, *byteOffset >> 2); + *byteOffset += (1 + fCount) * 4; +} + +void +TableResource::handlePreWrite(uint32_t *byteOffset) { + preWriteAllRes(byteOffset); + if (fTableType == URES_TABLE) { + /* 16-bit count, 16-bit key offsets, 32-bit values */ + fRes = 
URES_MAKE_RESOURCE(URES_TABLE, *byteOffset >> 2); + *byteOffset += 2 + fCount * 6; + } else { + /* 32-bit count, key offsets and values */ + fRes = URES_MAKE_RESOURCE(URES_TABLE32, *byteOffset >> 2); + *byteOffset += 4 + fCount * 8; + } +} + +void +SResource::preWrite(uint32_t *byteOffset) { + if (fRes != RES_BOGUS) { + /* + * The resource item word was already precomputed, which means + * no further data needs to be written. + * This might be an integer, or an empty or UTF-16 v2 string, + * an empty binary, etc. + */ + return; + } + handlePreWrite(byteOffset); + *byteOffset += calcPadding(*byteOffset); +} + +void +SResource::handlePreWrite(uint32_t * /*byteOffset*/) { + assert(false); +} + +/* + * Only called for UTF-16 v1 strings, and for aliases. For UTF-16 v2 strings, + * write() sees fWritten and exits early. + */ +void +StringBaseResource::handleWrite(UNewDataMemory *mem, uint32_t *byteOffset) { + /* Write the UTF-16 v1 string. */ + int32_t len = length(); + udata_write32(mem, len); + udata_writeUString(mem, getBuffer(), len + 1); + *byteOffset += 4 + (len + 1) * U_SIZEOF_UCHAR; + fWritten = true; +} + +void +ContainerResource::writeAllRes(UNewDataMemory *mem, uint32_t *byteOffset) { + uint32_t i = 0; + for (SResource *current = fFirst; current != nullptr; ++i, current = current->fNext) { + current->write(mem, byteOffset); + } + assert(i == fCount); +} + +void +ContainerResource::writeAllRes32(UNewDataMemory *mem, uint32_t *byteOffset) { + for (SResource *current = fFirst; current != nullptr; current = current->fNext) { + udata_write32(mem, current->fRes); + } + *byteOffset += fCount * 4; +} + +void +ArrayResource::handleWrite(UNewDataMemory *mem, uint32_t *byteOffset) { + writeAllRes(mem, byteOffset); + udata_write32(mem, fCount); + *byteOffset += 4; + writeAllRes32(mem, byteOffset); +} + +void +IntVectorResource::handleWrite(UNewDataMemory *mem, uint32_t *byteOffset) { + udata_write32(mem, fCount); + for(uint32_t i = 0; i < fCount; ++i) { + 
udata_write32(mem, fArray[i]); + } + *byteOffset += (1 + fCount) * 4; +} + +void +BinaryResource::handleWrite(UNewDataMemory *mem, uint32_t *byteOffset) { + uint32_t pad = 0; + uint32_t dataStart = *byteOffset + sizeof(fLength); + + if (dataStart % BIN_ALIGNMENT) { + pad = (BIN_ALIGNMENT - dataStart % BIN_ALIGNMENT); + udata_writePadding(mem, pad); /* pad == 4 or 8 or 12 */ + *byteOffset += pad; + } + + udata_write32(mem, fLength); + if (fLength > 0) { + udata_writeBlock(mem, fData, fLength); + } + *byteOffset += 4 + fLength; +} + +void +TableResource::handleWrite(UNewDataMemory *mem, uint32_t *byteOffset) { + writeAllRes(mem, byteOffset); + if(fTableType == URES_TABLE) { + udata_write16(mem, (uint16_t)fCount); + for (SResource *current = fFirst; current != nullptr; current = current->fNext) { + udata_write16(mem, current->fKey16); + } + *byteOffset += (1 + fCount)* 2; + if ((fCount & 1) == 0) { + /* 16-bit count and even number of 16-bit key offsets need padding before 32-bit resource items */ + udata_writePadding(mem, 2); + *byteOffset += 2; + } + } else /* URES_TABLE32 */ { + udata_write32(mem, fCount); + for (SResource *current = fFirst; current != nullptr; current = current->fNext) { + udata_write32(mem, (uint32_t)current->fKey); + } + *byteOffset += (1 + fCount)* 4; + } + writeAllRes32(mem, byteOffset); +} + +void +SResource::write(UNewDataMemory *mem, uint32_t *byteOffset) { + if (fWritten) { + assert(fRes != RES_BOGUS); + return; + } + handleWrite(mem, byteOffset); + uint8_t paddingSize = calcPadding(*byteOffset); + if (paddingSize > 0) { + udata_writePadding(mem, paddingSize); + *byteOffset += paddingSize; + } + fWritten = true; +} + +void +SResource::handleWrite(UNewDataMemory * /*mem*/, uint32_t * /*byteOffset*/) { + assert(false); +} + +void SRBRoot::write(const char *outputDir, const char *outputPkg, + char *writtenFilename, int writtenFilenameLen, + UErrorCode &errorCode) { + UNewDataMemory *mem = nullptr; + uint32_t byteOffset = 0; + uint32_t top, 
size; + char dataName[1024]; + int32_t indexes[URES_INDEX_TOP]; + + compactKeys(errorCode); + /* + * Add padding bytes to fKeys so that fKeysTop is 4-aligned. + * Safe because the capacity is a multiple of 4. + */ + while (fKeysTop & 3) { + fKeys[fKeysTop++] = (char)0xaa; + } + /* + * In URES_TABLE, use all local key offsets that fit into 16 bits, + * and use the remaining 16-bit offsets for pool key offsets + * if there are any. + * If there are no local keys, then use the whole 16-bit space + * for pool key offsets. + * Note: This cannot be changed without changing the major formatVersion. + */ + if (fKeysBottom < fKeysTop) { + if (fKeysTop <= 0x10000) { + fLocalKeyLimit = fKeysTop; + } else { + fLocalKeyLimit = 0x10000; + } + } else { + fLocalKeyLimit = 0; + } + + UHashtable *stringSet; + if (gFormatVersion > 1) { + stringSet = uhash_open(string_hash, string_comp, string_comp, &errorCode); + if (U_SUCCESS(errorCode) && + fUsePoolBundle != nullptr && fUsePoolBundle->fStrings != nullptr) { + for (SResource *current = fUsePoolBundle->fStrings->fFirst; + current != nullptr; + current = current->fNext) { + StringResource *sr = static_cast<StringResource *>(current); + sr->fNumCopies = 0; + sr->fNumUnitsSaved = 0; + uhash_put(stringSet, sr, sr, &errorCode); + } + } + fRoot->preflightStrings(this, stringSet, errorCode); + } else { + stringSet = nullptr; + } + if (fStringsForm == STRINGS_UTF16_V2 && f16BitStringsLength > 0) { + compactStringsV2(stringSet, errorCode); + } + uhash_close(stringSet); + if (U_FAILURE(errorCode)) { + return; + } + + int32_t formatVersion = gFormatVersion; + if (fPoolStringIndexLimit != 0) { + int32_t sum = fPoolStringIndexLimit + fLocalStringIndexLimit; + if ((sum - 1) > RES_MAX_OFFSET) { + errorCode = U_BUFFER_OVERFLOW_ERROR; + return; + } + if (fPoolStringIndexLimit < 0x10000 && sum <= 0x10000) { + // 16-bit indexes work for all pool + local strings. 
+ fPoolStringIndex16Limit = fPoolStringIndexLimit; + } else { + // Set the pool index threshold so that 16-bit indexes work + // for some pool strings and some local strings. + fPoolStringIndex16Limit = (int32_t)( + ((int64_t)fPoolStringIndexLimit * 0xffff) / sum); + } + } else if (gIsDefaultFormatVersion && formatVersion == 3 && !fIsPoolBundle) { + // If we just default to formatVersion 3 + // but there are no pool bundle strings to share + // and we do not write a pool bundle, + // then write formatVersion 2 which is just as good. + formatVersion = 2; + } + + fRoot->write16(this); + if (f16BitUnits.isBogus()) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return; + } + if (f16BitUnits.length() & 1) { + f16BitUnits.append((char16_t)0xaaaa); /* pad to multiple of 4 bytes */ + } + + byteOffset = fKeysTop + f16BitUnits.length() * 2; + fRoot->preWrite(&byteOffset); + + /* total size including the root item */ + top = byteOffset; + + if (writtenFilename && writtenFilenameLen) { + *writtenFilename = 0; + } + + if (writtenFilename) { + int32_t off = 0, len = 0; + if (outputDir) { + uprv_strncpy(writtenFilename, outputDir, writtenFilenameLen); + } + if (writtenFilenameLen -= len) { + off += len; + writtenFilename[off] = U_FILE_SEP_CHAR; + if (--writtenFilenameLen) { + ++off; + if(outputPkg != nullptr) + { + uprv_strcpy(writtenFilename+off, outputPkg); + off += (int32_t)uprv_strlen(outputPkg); + writtenFilename[off] = '_'; + ++off; + } + + len = (int32_t)uprv_strlen(fLocale); + if (len > writtenFilenameLen) { + len = writtenFilenameLen; + } + uprv_strncpy(writtenFilename + off, fLocale, writtenFilenameLen - off); + if (writtenFilenameLen -= len) { + off += len; + uprv_strncpy(writtenFilename + off, ".res", writtenFilenameLen - off); + } + } + } + } + + if(outputPkg) + { + uprv_strcpy(dataName, outputPkg); + uprv_strcat(dataName, "_"); + uprv_strcat(dataName, fLocale); + } + else + { + uprv_strcpy(dataName, fLocale); + } + + uprv_memcpy(dataInfo.formatVersion, gFormatVersions 
+ formatVersion, sizeof(UVersionInfo)); + + mem = udata_create(outputDir, "res", dataName, + &dataInfo, (gIncludeCopyright==true)? U_COPYRIGHT_STRING:nullptr, &errorCode); + if(U_FAILURE(errorCode)){ + return; + } + + /* write the root item */ + udata_write32(mem, fRoot->fRes); + + /* + * formatVersion 1.1 (ICU 2.8): + * write int32_t indexes[] after root and before the key strings + * to make it easier to parse resource bundles in icuswap or from Java etc. + */ + uprv_memset(indexes, 0, sizeof(indexes)); + indexes[URES_INDEX_LENGTH]= fIndexLength; + indexes[URES_INDEX_KEYS_TOP]= fKeysTop>>2; + indexes[URES_INDEX_RESOURCES_TOP]= (int32_t)(top>>2); + indexes[URES_INDEX_BUNDLE_TOP]= indexes[URES_INDEX_RESOURCES_TOP]; + indexes[URES_INDEX_MAX_TABLE_LENGTH]= fMaxTableLength; + + /* + * formatVersion 1.2 (ICU 3.6): + * write indexes[URES_INDEX_ATTRIBUTES] with URES_ATT_NO_FALLBACK set or not set + * the memset() above initialized all indexes[] to 0 + */ + if (fNoFallback) { + indexes[URES_INDEX_ATTRIBUTES]=URES_ATT_NO_FALLBACK; + } + /* + * formatVersion 2.0 (ICU 4.4): + * more compact string value storage, optional pool bundle + */ + if (URES_INDEX_16BIT_TOP < fIndexLength) { + indexes[URES_INDEX_16BIT_TOP] = (fKeysTop>>2) + (f16BitUnits.length()>>1); + } + if (URES_INDEX_POOL_CHECKSUM < fIndexLength) { + if (fIsPoolBundle) { + indexes[URES_INDEX_ATTRIBUTES] |= URES_ATT_IS_POOL_BUNDLE | URES_ATT_NO_FALLBACK; + uint32_t checksum = computeCRC((const char *)(fKeys + fKeysBottom), + (uint32_t)(fKeysTop - fKeysBottom), 0); + if (f16BitUnits.length() <= 1) { + // no pool strings to checksum + } else if (U_IS_BIG_ENDIAN) { + checksum = computeCRC(reinterpret_cast<const char *>(f16BitUnits.getBuffer()), + (uint32_t)f16BitUnits.length() * 2, checksum); + } else { + // Swap to big-endian so we get the same checksum on all platforms + // (except for charset family, due to the key strings). 
+ UnicodeString s(f16BitUnits); + assert(!s.isBogus()); + // .getBuffer(capacity) returns a mutable buffer + char16_t* p = s.getBuffer(f16BitUnits.length()); + for (int32_t count = f16BitUnits.length(); count > 0; --count) { + uint16_t x = *p; + *p++ = (uint16_t)((x << 8) | (x >> 8)); + } + s.releaseBuffer(f16BitUnits.length()); + checksum = computeCRC((const char *)s.getBuffer(), + (uint32_t)f16BitUnits.length() * 2, checksum); + } + indexes[URES_INDEX_POOL_CHECKSUM] = (int32_t)checksum; + } else if (gUsePoolBundle) { + indexes[URES_INDEX_ATTRIBUTES] |= URES_ATT_USES_POOL_BUNDLE; + indexes[URES_INDEX_POOL_CHECKSUM] = fUsePoolBundle->fChecksum; + } + } + // formatVersion 3 (ICU 56): + // share string values via pool bundle strings + indexes[URES_INDEX_LENGTH] |= fPoolStringIndexLimit << 8; // bits 23..0 -> 31..8 + indexes[URES_INDEX_ATTRIBUTES] |= (fPoolStringIndexLimit >> 12) & 0xf000; // bits 27..24 -> 15..12 + indexes[URES_INDEX_ATTRIBUTES] |= fPoolStringIndex16Limit << 16; + + /* write the indexes[] */ + udata_writeBlock(mem, indexes, fIndexLength*4); + + /* write the table key strings */ + udata_writeBlock(mem, fKeys+fKeysBottom, + fKeysTop-fKeysBottom); + + /* write the v2 UTF-16 strings, URES_TABLE16 and URES_ARRAY16 */ + udata_writeBlock(mem, f16BitUnits.getBuffer(), f16BitUnits.length()*2); + + /* write all of the bundle contents: the root item and its children */ + byteOffset = fKeysTop + f16BitUnits.length() * 2; + fRoot->write(mem, &byteOffset); + assert(byteOffset == top); + + size = udata_finish(mem, &errorCode); + if(top != size) { + fprintf(stderr, "genrb error: wrote %u bytes but counted %u\n", + (int)size, (int)top); + errorCode = U_INTERNAL_PROGRAM_ERROR; + } +} + +/* Opening Functions */ + +TableResource* table_open(struct SRBRoot *bundle, const char *tag, const struct UString* comment, UErrorCode *status) { + LocalPointer<TableResource> res(new TableResource(bundle, tag, comment, *status), *status); + return U_SUCCESS(*status) ? 
res.orphan() : nullptr; +} + +ArrayResource* array_open(struct SRBRoot *bundle, const char *tag, const struct UString* comment, UErrorCode *status) { + LocalPointer<ArrayResource> res(new ArrayResource(bundle, tag, comment, *status), *status); + return U_SUCCESS(*status) ? res.orphan() : nullptr; +} + +struct SResource *string_open(struct SRBRoot *bundle, const char *tag, const char16_t *value, int32_t len, const struct UString* comment, UErrorCode *status) { + LocalPointer<SResource> res( + new StringResource(bundle, tag, value, len, comment, *status), *status); + return U_SUCCESS(*status) ? res.orphan() : nullptr; +} + +struct SResource *alias_open(struct SRBRoot *bundle, const char *tag, char16_t *value, int32_t len, const struct UString* comment, UErrorCode *status) { + LocalPointer<SResource> res( + new AliasResource(bundle, tag, value, len, comment, *status), *status); + return U_SUCCESS(*status) ? res.orphan() : nullptr; +} + +IntVectorResource *intvector_open(struct SRBRoot *bundle, const char *tag, const struct UString* comment, UErrorCode *status) { + LocalPointer<IntVectorResource> res( + new IntVectorResource(bundle, tag, comment, *status), *status); + return U_SUCCESS(*status) ? res.orphan() : nullptr; +} + +struct SResource *int_open(struct SRBRoot *bundle, const char *tag, int32_t value, const struct UString* comment, UErrorCode *status) { + LocalPointer<SResource> res(new IntResource(bundle, tag, value, comment, *status), *status); + return U_SUCCESS(*status) ? res.orphan() : nullptr; +} + +struct SResource *bin_open(struct SRBRoot *bundle, const char *tag, uint32_t length, uint8_t *data, const char* fileName, const struct UString* comment, UErrorCode *status) { + LocalPointer<SResource> res( + new BinaryResource(bundle, tag, length, data, fileName, comment, *status), *status); + return U_SUCCESS(*status) ? 
res.orphan() : nullptr; +} + +SRBRoot::SRBRoot(const UString *comment, UBool isPoolBundle, UErrorCode &errorCode) + : fRoot(nullptr), fLocale(nullptr), fIndexLength(0), fMaxTableLength(0), fNoFallback(false), + fStringsForm(STRINGS_UTF16_V1), fIsPoolBundle(isPoolBundle), + fKeys(nullptr), fKeyMap(nullptr), + fKeysBottom(0), fKeysTop(0), fKeysCapacity(0), + fKeysCount(0), fLocalKeyLimit(0), + f16BitUnits(), f16BitStringsLength(0), + fUsePoolBundle(&kNoPoolBundle), + fPoolStringIndexLimit(0), fPoolStringIndex16Limit(0), fLocalStringIndexLimit(0), + fWritePoolBundle(nullptr) { + if (U_FAILURE(errorCode)) { + return; + } + + if (gFormatVersion > 1) { + // f16BitUnits must start with a zero for empty resources. + // We might be able to omit it if there are no empty 16-bit resources. + f16BitUnits.append((char16_t)0); + } + + fKeys = (char *) uprv_malloc(sizeof(char) * KEY_SPACE_SIZE); + if (isPoolBundle) { + fRoot = new PseudoListResource(this, errorCode); + } else { + fRoot = new TableResource(this, nullptr, comment, errorCode); + } + if (fKeys == nullptr || fRoot == nullptr || U_FAILURE(errorCode)) { + if (U_SUCCESS(errorCode)) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + } + return; + } + + fKeysCapacity = KEY_SPACE_SIZE; + /* formatVersion 1.1 and up: start fKeysTop after the root item and indexes[] */ + if (gUsePoolBundle || isPoolBundle) { + fIndexLength = URES_INDEX_POOL_CHECKSUM + 1; + } else if (gFormatVersion >= 2) { + fIndexLength = URES_INDEX_16BIT_TOP + 1; + } else /* formatVersion 1 */ { + fIndexLength = URES_INDEX_ATTRIBUTES + 1; + } + fKeysBottom = (1 /* root */ + fIndexLength) * 4; + uprv_memset(fKeys, 0, fKeysBottom); + fKeysTop = fKeysBottom; + + if (gFormatVersion == 1) { + fStringsForm = STRINGS_UTF16_V1; + } else { + fStringsForm = STRINGS_UTF16_V2; + } +} + +/* Closing Functions */ + +void res_close(struct SResource *res) { + delete res; +} + +SRBRoot::~SRBRoot() { + delete fRoot; + uprv_free(fLocale); + uprv_free(fKeys); + uprv_free(fKeyMap); +} 
+ +/* Misc Functions */ + +void SRBRoot::setLocale(char16_t *locale, UErrorCode &errorCode) { + if(U_FAILURE(errorCode)) { + return; + } + + uprv_free(fLocale); + fLocale = (char*) uprv_malloc(sizeof(char) * (u_strlen(locale)+1)); + if(fLocale == nullptr) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return; + } + + u_UCharsToChars(locale, fLocale, u_strlen(locale)+1); +} + +const char * +SRBRoot::getKeyString(int32_t key) const { + if (key < 0) { + return fUsePoolBundle->fKeys + (key & 0x7fffffff); + } else { + return fKeys + key; + } +} + +const char * +SResource::getKeyString(const SRBRoot *bundle) const { + if (fKey == -1) { + return nullptr; + } + return bundle->getKeyString(fKey); +} + +const char * +SRBRoot::getKeyBytes(int32_t *pLength) const { + *pLength = fKeysTop - fKeysBottom; + return fKeys + fKeysBottom; +} + +int32_t +SRBRoot::addKeyBytes(const char *keyBytes, int32_t length, UErrorCode &errorCode) { + int32_t keypos; + + // It is not legal to add new key bytes after compactKeys is run! 
+ U_ASSERT(fKeyMap == nullptr); + + if (U_FAILURE(errorCode)) { + return -1; + } + if (length < 0 || (keyBytes == nullptr && length != 0)) { + errorCode = U_ILLEGAL_ARGUMENT_ERROR; + return -1; + } + if (length == 0) { + return fKeysTop; + } + + keypos = fKeysTop; + fKeysTop += length; + if (fKeysTop >= fKeysCapacity) { + /* overflow - resize the keys buffer */ + fKeysCapacity += KEY_SPACE_SIZE; + fKeys = static_cast<char *>(uprv_realloc(fKeys, fKeysCapacity)); + if(fKeys == nullptr) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return -1; + } + } + + uprv_memcpy(fKeys + keypos, keyBytes, length); + + return keypos; +} + +int32_t +SRBRoot::addTag(const char *tag, UErrorCode &errorCode) { + int32_t keypos; + + if (U_FAILURE(errorCode)) { + return -1; + } + + if (tag == nullptr) { + /* no error: the root table and array items have no keys */ + return -1; + } + + keypos = addKeyBytes(tag, (int32_t)(uprv_strlen(tag) + 1), errorCode); + if (U_SUCCESS(errorCode)) { + ++fKeysCount; + } + return keypos; +} + +static int32_t +compareInt32(int32_t lPos, int32_t rPos) { + /* + * Compare possibly-negative key offsets. Don't just return lPos - rPos + * because that is prone to negative-integer underflows. 
+ */ + if (lPos < rPos) { + return -1; + } else if (lPos > rPos) { + return 1; + } else { + return 0; + } +} + +static int32_t U_CALLCONV +compareKeySuffixes(const void *context, const void *l, const void *r) { + const struct SRBRoot *bundle=(const struct SRBRoot *)context; + int32_t lPos = ((const KeyMapEntry *)l)->oldpos; + int32_t rPos = ((const KeyMapEntry *)r)->oldpos; + const char *lStart = bundle->getKeyString(lPos); + const char *lLimit = lStart; + const char *rStart = bundle->getKeyString(rPos); + const char *rLimit = rStart; + int32_t diff; + while (*lLimit != 0) { ++lLimit; } + while (*rLimit != 0) { ++rLimit; } + /* compare keys in reverse character order */ + while (lStart < lLimit && rStart < rLimit) { + diff = (int32_t)(uint8_t)*--lLimit - (int32_t)(uint8_t)*--rLimit; + if (diff != 0) { + return diff; + } + } + /* sort equal suffixes by descending key length */ + diff = (int32_t)(rLimit - rStart) - (int32_t)(lLimit - lStart); + if (diff != 0) { + return diff; + } + /* Sort pool bundle keys first (negative oldpos), and otherwise keys in parsing order. */ + return compareInt32(lPos, rPos); +} + +static int32_t U_CALLCONV +compareKeyNewpos(const void * /*context*/, const void *l, const void *r) { + return compareInt32(((const KeyMapEntry *)l)->newpos, ((const KeyMapEntry *)r)->newpos); +} + +static int32_t U_CALLCONV +compareKeyOldpos(const void * /*context*/, const void *l, const void *r) { + return compareInt32(((const KeyMapEntry *)l)->oldpos, ((const KeyMapEntry *)r)->oldpos); +} + +void SResource::collectKeys(std::function<void(int32_t)> collector) const { + collector(fKey); +} + +void ContainerResource::collectKeys(std::function<void(int32_t)> collector) const { + collector(fKey); + for (SResource* curr = fFirst; curr != nullptr; curr = curr->fNext) { + curr->collectKeys(collector); + } +} + +void +SRBRoot::compactKeys(UErrorCode &errorCode) { + KeyMapEntry *map; + char *keys; + int32_t i; + + // Except for pool bundles, keys might not be used. 
+ // Do not add unused keys to the final bundle. + std::set<int32_t> keysInUse; + if (!fIsPoolBundle) { + fRoot->collectKeys([&keysInUse](int32_t key) { + if (key >= 0) { + keysInUse.insert(key); + } + }); + fKeysCount = static_cast<int32_t>(keysInUse.size()); + } + + int32_t keysCount = fUsePoolBundle->fKeysCount + fKeysCount; + if (U_FAILURE(errorCode) || fKeyMap != nullptr) { + return; + } + map = (KeyMapEntry *)uprv_malloc(keysCount * sizeof(KeyMapEntry)); + if (map == nullptr) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return; + } + keys = (char *)fUsePoolBundle->fKeys; + for (i = 0; i < fUsePoolBundle->fKeysCount; ++i) { + map[i].oldpos = + (int32_t)(keys - fUsePoolBundle->fKeys) | 0x80000000; /* negative oldpos */ + map[i].newpos = 0; + while (*keys != 0) { ++keys; } /* skip the key */ + ++keys; /* skip the NUL */ + } + keys = fKeys + fKeysBottom; + while (i < keysCount) { + int32_t keyOffset = static_cast<int32_t>(keys - fKeys); + if (!fIsPoolBundle && keysInUse.count(keyOffset) == 0) { + // Mark the unused key as deleted + while (*keys != 0) { *keys++ = 1; } + *keys++ = 1; + } else { + map[i].oldpos = keyOffset; + map[i].newpos = 0; + while (*keys != 0) { ++keys; } /* skip the key */ + ++keys; /* skip the NUL */ + i++; + } + } + if (keys != fKeys + fKeysTop) { + // Throw away any unused keys from the end + fKeysTop = static_cast<int32_t>(keys - fKeys); + } + /* Sort the keys so that each one is immediately followed by all of its suffixes. */ + uprv_sortArray(map, keysCount, (int32_t)sizeof(KeyMapEntry), + compareKeySuffixes, this, false, &errorCode); + /* + * Make suffixes point into earlier, longer strings that contain them + * and mark the old, now unused suffix bytes as deleted. + */ + if (U_SUCCESS(errorCode)) { + keys = fKeys; + for (i = 0; i < keysCount;) { + /* + * This key is not a suffix of the previous one; + * keep this one and delete the following ones that are + * suffixes of this one. 
+ */ + const char *key; + const char *keyLimit; + int32_t j = i + 1; + map[i].newpos = map[i].oldpos; + if (j < keysCount && map[j].oldpos < 0) { + /* Key string from the pool bundle, do not delete. */ + i = j; + continue; + } + key = getKeyString(map[i].oldpos); + for (keyLimit = key; *keyLimit != 0; ++keyLimit) {} + for (; j < keysCount && map[j].oldpos >= 0; ++j) { + const char *k; + char *suffix; + const char *suffixLimit; + int32_t offset; + suffix = keys + map[j].oldpos; + for (suffixLimit = suffix; *suffixLimit != 0; ++suffixLimit) {} + offset = static_cast<int32_t>((keyLimit - key) - (suffixLimit - suffix)); + if (offset < 0) { + break; /* suffix cannot be longer than the original */ + } + /* Is it a suffix of the earlier, longer key? */ + for (k = keyLimit; suffix < suffixLimit && *--k == *--suffixLimit;) {} + if (suffix == suffixLimit && *k == *suffixLimit) { + map[j].newpos = map[i].oldpos + offset; /* yes, point to the earlier key */ + // Mark the suffix as deleted + while (*suffix != 0) { *suffix++ = 1; } + *suffix = 1; + } else { + break; /* not a suffix, restart from here */ + } + } + i = j; + } + /* + * Re-sort by newpos, then modify the key characters array in-place + * to squeeze out unused bytes, and readjust the newpos offsets. 
         */
        uprv_sortArray(map, keysCount, (int32_t)sizeof(KeyMapEntry),
                       compareKeyNewpos, nullptr, false, &errorCode);
        if (U_SUCCESS(errorCode)) {
            int32_t oldpos, newpos, limit;
            oldpos = newpos = fKeysBottom;
            limit = fKeysTop;
            /* skip key offsets that point into the pool bundle rather than this new bundle */
            for (i = 0; i < keysCount && map[i].newpos < 0; ++i) {}
            if (i < keysCount) {
                while (oldpos < limit) {
                    if (keys[oldpos] == 1) {
                        ++oldpos;  /* skip unused bytes */
                    } else {
                        /* adjust the new offsets for keys starting here */
                        while (i < keysCount && map[i].newpos == oldpos) {
                            map[i++].newpos = newpos;
                        }
                        /* move the key characters to their new position */
                        keys[newpos++] = keys[oldpos++];
                    }
                }
                U_ASSERT(i == keysCount);
            }
            fKeysTop = newpos;
            /* Re-sort once more, by old offsets for binary searching. */
            uprv_sortArray(map, keysCount, (int32_t)sizeof(KeyMapEntry),
                           compareKeyOldpos, nullptr, false, &errorCode);
            if (U_SUCCESS(errorCode)) {
                /* key size reduction by limit - newpos */
                fKeyMap = map;
                map = nullptr;
            }
        }
    }
    uprv_free(map);
}

/**
 * uprv_sortArray() comparator for an array of StringResource pointers.
 *
 * Strings are compared back to front (last code unit first). If one string
 * is exhausted before a difference is found, it is a suffix of the other and
 * the longer string sorts first, so that every string is immediately followed
 * by its suffixes.
 *
 * @param l pointer to the left array element (a StringResource *)
 * @param r pointer to the right array element (a StringResource *)
 * @return negative, zero or positive, per the usual comparator contract
 */
static int32_t U_CALLCONV
compareStringSuffixes(const void * /*context*/, const void *l, const void *r) {
    const StringResource *left = *((const StringResource **)l);
    const StringResource *right = *((const StringResource **)r);
    const char16_t *lStart = left->getBuffer();
    const char16_t *lLimit = lStart + left->length();
    const char16_t *rStart = right->getBuffer();
    const char16_t *rLimit = rStart + right->length();
    int32_t diff;
    /* compare keys in reverse character order */
    while (lStart < lLimit && rStart < rLimit) {
        diff = (int32_t)*--lLimit - (int32_t)*--rLimit;
        if (diff != 0) {
            return diff;
        }
    }
    /* sort equal suffixes by descending string length */
    return right->length() - left->length();
}

/**
 * uprv_sortArray() comparator for an array of StringResource pointers.
 *
 * Ordering criteria, in priority order:
 *  1. strings without fSame (non-suffixes) before strings with fSame set,
 *  2. ascending string length,
 *  3. descending fNumUnitsSaved,
 *  4. fString.compare() as the final tie-breaker.
 *
 * @param l pointer to the left array element (a StringResource *)
 * @param r pointer to the right array element (a StringResource *)
 * @return negative, zero or positive, per the usual comparator contract
 */
static int32_t U_CALLCONV
compareStringLengths(const void * /*context*/, const void *l, const void *r) {
    const StringResource *left = *((const StringResource **)l);
    const StringResource *right = *((const StringResource **)r);
    int32_t diff;
    /* Make "is suffix of another string" compare greater than a non-suffix. */
    diff = (int)(left->fSame != nullptr) - (int)(right->fSame != nullptr);
    if (diff != 0) {
        return diff;
    }
    /* sort by ascending string length */
    diff = left->length() - right->length();
    if (diff != 0) {
        return diff;
    }
    // sort by descending size reduction
    diff = right->fNumUnitsSaved - left->fNumUnitsSaved;
    if (diff != 0) {
        return diff;
    }
    // sort lexically
    return left->fString.compare(right->fString);
}

/**
 * Appends this string to dest in the 16-bit string-v2 layout and records
 * where it was written.
 *
 * fRes is set to a URES_STRING_V2 resource word whose offset is
 * base + dest.length() as of the call, and fWritten is set to true.
 * Depending on fNumCharsForLength, 0 to 3 length-prefix units are appended
 * first, then the string contents, then a terminating 0 unit.
 *
 * @param base offset added to the current length of dest to form the
 *             resource word
 * @param dest 16-bit unit buffer that receives the encoded string
 */
void
StringResource::writeUTF16v2(int32_t base, UnicodeString &dest) {
    int32_t len = length();
    fRes = URES_MAKE_RESOURCE(URES_STRING_V2, base + dest.length());
    fWritten = true;
    switch(fNumCharsForLength) {
    case 0:
        break;
    case 1:
        dest.append((char16_t)(0xdc00 + len));
        break;
    case 2:
        dest.append((char16_t)(0xdfef + (len >> 16)));
        dest.append((char16_t)len);
        break;
    case 3:
        dest.append((char16_t)0xdfff);
        dest.append((char16_t)(len >> 16));
        dest.append((char16_t)len);
        break;
    default:
        break;  /* will not occur */
    }
    dest.append(fString);
    dest.append((char16_t)0);
}

/**
 * Writes the unique strings of this bundle into f16BitUnits, sharing storage
 * between a string and the strings that are suffixes of it.
 *
 * Steps:
 *  1. Copy the StringResource pointers from stringSet into an array.
 *  2. Sort so that each string is followed by its suffixes, link every
 *     suffix that needs no explicit length prefix to its containing string
 *     (fSame / fSuffixOffset) and accumulate fNumUnitsSaved.
 *  3. Re-sort by length with suffixes last.
 *  4. For a pool bundle, write only strings that save enough units;
 *     otherwise write all non-suffix strings, optionally register them with
 *     fWritePoolBundle, and finally compute the resource words of suffixes.
 *
 * @param stringSet set whose keys are the StringResource objects to process
 * @param errorCode in/out ICU error code; nothing is done if already a failure
 */
void
SRBRoot::compactStringsV2(UHashtable *stringSet, UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) {
        return;
    }
    // Store the StringResource pointers in an array for
    // easy sorting and processing.
    // We enumerate a set of strings, so there are no duplicates.
    int32_t count = uhash_count(stringSet);
    LocalArray<StringResource *> array(new StringResource *[count], errorCode);
    if (U_FAILURE(errorCode)) {
        return;
    }
    for (int32_t pos = UHASH_FIRST, i = 0; i < count; ++i) {
        array[i] = (StringResource *)uhash_nextElement(stringSet, &pos)->key.pointer;
    }
    /* Sort the strings so that each one is immediately followed by all of its suffixes. */
    uprv_sortArray(array.getAlias(), count, (int32_t)sizeof(struct SResource **),
                   compareStringSuffixes, nullptr, false, &errorCode);
    if (U_FAILURE(errorCode)) {
        return;
    }
    /*
     * Make suffixes point into earlier, longer strings that contain them.
     * Temporarily use fSame and fSuffixOffset for suffix strings to
     * refer to the remaining ones.
     */
    for (int32_t i = 0; i < count;) {
        /*
         * This string is not a suffix of the previous one;
         * write this one and subsume the following ones that are
         * suffixes of this one.
         */
        StringResource *res = array[i];
        res->fNumUnitsSaved = (res->fNumCopies - 1) * res->get16BitStringsLength();
        // Whole duplicates of pool strings are already accounted for in fPoolStringIndexLimit,
        // see StringResource::handlePreflightStrings().
        int32_t j;
        for (j = i + 1; j < count; ++j) {
            StringResource *suffixRes = array[j];
            /* Is it a suffix of the earlier, longer string? */
            if (res->fString.endsWith(suffixRes->fString)) {
                assert(res->length() != suffixRes->length());  // Set strings are unique.
                if (suffixRes->fWritten) {
                    // Pool string, skip.
                } else if (suffixRes->fNumCharsForLength == 0) {
                    /* yes, point to the earlier string */
                    suffixRes->fSame = res;
                    suffixRes->fSuffixOffset = res->length() - suffixRes->length();
                    if (res->fWritten) {
                        // Suffix-share res which is a pool string.
                        // Compute the resource word and collect the maximum.
                        suffixRes->fRes =
                                res->fRes + res->fNumCharsForLength + suffixRes->fSuffixOffset;
                        int32_t poolStringIndex = (int32_t)RES_GET_OFFSET(suffixRes->fRes);
                        if (poolStringIndex >= fPoolStringIndexLimit) {
                            fPoolStringIndexLimit = poolStringIndex + 1;
                        }
                        suffixRes->fWritten = true;
                    }
                    res->fNumUnitsSaved += suffixRes->fNumCopies * suffixRes->get16BitStringsLength();
                } else {
                    /* write the suffix by itself if we need explicit length */
                }
            } else {
                break;  /* not a suffix, restart from here */
            }
        }
        i = j;
    }
    /*
     * Re-sort the strings by ascending length (except suffixes last)
     * to optimize for URES_TABLE16 and URES_ARRAY16:
     * Keep as many as possible within reach of 16-bit offsets.
     */
    uprv_sortArray(array.getAlias(), count, (int32_t)sizeof(struct SResource **),
                   compareStringLengths, nullptr, false, &errorCode);
    if (U_FAILURE(errorCode)) {
        return;
    }
    if (fIsPoolBundle) {
        // Write strings that are sufficiently shared.
        // Avoid writing other strings.
        int32_t numStringsWritten = 0;
        int32_t numUnitsSaved = 0;
        int32_t numUnitsNotSaved = 0;
        for (int32_t i = 0; i < count; ++i) {
            StringResource *res = array[i];
            // Maximum pool string index when suffix-sharing the last character.
            int32_t maxStringIndex =
                    f16BitUnits.length() + res->fNumCharsForLength + res->length() - 1;
            if (res->fNumUnitsSaved >= GENRB_MIN_16BIT_UNITS_SAVED_FOR_POOL_STRING &&
                    maxStringIndex < RES_MAX_OFFSET) {
                res->writeUTF16v2(0, f16BitUnits);
                ++numStringsWritten;
                numUnitsSaved += res->fNumUnitsSaved;
            } else {
                // Not worth pooling: mark as written with an empty string
                // resource word so that it is not emitted.
                numUnitsNotSaved += res->fNumUnitsSaved;
                res->fRes = URES_MAKE_EMPTY_RESOURCE(URES_STRING);
                res->fWritten = true;
            }
        }
        if (f16BitUnits.isBogus()) {
            errorCode = U_MEMORY_ALLOCATION_ERROR;
        }
        if (getShowWarning()) {  // not quiet
            printf("number of shared strings: %d\n", (int)numStringsWritten);
            printf("16-bit units for strings: %6d = %6d bytes\n",
                   (int)f16BitUnits.length(), (int)f16BitUnits.length() * 2);
            printf("16-bit units saved: %6d = %6d bytes\n",
                   (int)numUnitsSaved, (int)numUnitsSaved * 2);
            printf("16-bit units not saved: %6d = %6d bytes\n",
                   (int)numUnitsNotSaved, (int)numUnitsNotSaved * 2);
        }
    } else {
        assert(fPoolStringIndexLimit <= fUsePoolBundle->fStringIndexLimit);
        /* Write the non-suffix strings. */
        int32_t i;
        // After the length sort, all non-suffix strings (fSame == nullptr)
        // come first; this loop stops at the first suffix string.
        for (i = 0; i < count && array[i]->fSame == nullptr; ++i) {
            StringResource *res = array[i];
            if (!res->fWritten) {
                int32_t localStringIndex = f16BitUnits.length();
                if (localStringIndex >= fLocalStringIndexLimit) {
                    fLocalStringIndexLimit = localStringIndex + 1;
                }
                res->writeUTF16v2(fPoolStringIndexLimit, f16BitUnits);
            }
        }
        if (f16BitUnits.isBogus()) {
            errorCode = U_MEMORY_ALLOCATION_ERROR;
            return;
        }
        if (fWritePoolBundle != nullptr && gFormatVersion >= 3) {
            PseudoListResource *poolStrings =
                    static_cast<PseudoListResource *>(fWritePoolBundle->fRoot);
            // Restarts at 0 and again leaves i at the first suffix string,
            // which is where the loop below continues.
            for (i = 0; i < count && array[i]->fSame == nullptr; ++i) {
                assert(!array[i]->fString.isEmpty());
                StringResource *poolString =
                        new StringResource(fWritePoolBundle, array[i]->fString, errorCode);
                if (poolString == nullptr) {
                    errorCode = U_MEMORY_ALLOCATION_ERROR;
                    break;
                }
                poolStrings->add(poolString);
            }
        }
        /* Write the suffix strings. Make each point to the real string. */
        for (; i < count; ++i) {
            StringResource *res = array[i];
            if (res->fWritten) {
                continue;
            }
            StringResource *same = res->fSame;
            assert(res->length() != same->length());  // Set strings are unique.
            res->fRes = same->fRes + same->fNumCharsForLength + res->fSuffixOffset;
            int32_t localStringIndex = (int32_t)RES_GET_OFFSET(res->fRes) - fPoolStringIndexLimit;
            // Suffixes of pool strings have been set already.
            assert(localStringIndex >= 0);
            if (localStringIndex >= fLocalStringIndexLimit) {
                fLocalStringIndexLimit = localStringIndex + 1;
            }
            res->fWritten = true;
        }
    }
    // +1 to account for the initial zero in f16BitUnits
    assert(f16BitUnits.length() <= (f16BitStringsLength + 1));
}

/**
 * Default filter hook: does nothing.
 * Resource types that can be filtered override this (see TableResource).
 */
void SResource::applyFilter(
        const PathFilter& /*filter*/,
        ResKeyPath& /*path*/,
        const SRBRoot* /*bundle*/) {
    // Only a few resource types (tables) are capable of being filtered.
}

/**
 * Applies the filter to every child of this table.
 *
 * For each child, its key is pushed onto path and the filter is consulted:
 *  - INCLUDE: the child and its whole subtree are kept as they are.
 *  - EXCLUDE: the child is unlinked from the list, fCount is decremented
 *    and the child is deleted.
 *  - PARTIAL: the child's own applyFilter() is called with the extended path.
 * The key is popped from path again before moving to the next child.
 *
 * @param filter decides inclusion for a given path
 * @param path   path of this table; extended temporarily for each child
 * @param bundle used to resolve child key strings and for verbose output
 */
void TableResource::applyFilter(
        const PathFilter& filter,
        ResKeyPath& path,
        const SRBRoot* bundle) {
    SResource* prev = nullptr;
    SResource* curr = fFirst;
    for (; curr != nullptr;) {
        path.push(curr->getKeyString(bundle));
        auto inclusion = filter.match(path);
        if (inclusion == PathFilter::EInclusion::INCLUDE) {
            // Include whole subtree
            // no-op
            if (isVerbose()) {
                std::cout << "genrb subtree: " << bundle->fLocale << ": INCLUDE: " << path << std::endl;
            }
        } else if (inclusion == PathFilter::EInclusion::EXCLUDE) {
            // Reject the whole subtree
            // Remove it from the linked list
            if (isVerbose()) {
                std::cout << "genrb subtree: " << bundle->fLocale << ": DELETE: " << path << std::endl;
            }
            if (prev == nullptr) {
                fFirst = curr->fNext;
            } else {
                prev->fNext = curr->fNext;
            }
            fCount--;
            delete curr;
            // Step back to the predecessor (nullptr if the head was removed)
            // so that the shared advance code below lands on the element
            // that followed the deleted one.
            curr = prev;
        } else {
            U_ASSERT(inclusion == PathFilter::EInclusion::PARTIAL);
            // Recurse into the child
            curr->applyFilter(filter, path, bundle);
        }
        path.pop();

        prev = curr;
        if (curr == nullptr) {
            // The head was just deleted: restart from the new head.
            curr = fFirst;
        } else {
            curr = curr->fNext;
        }
    }
}
diff --git a/intl/icu/source/tools/genrb/reslist.h b/intl/icu/source/tools/genrb/reslist.h
new file mode 100644
index 0000000000..17797bc36c
--- /dev/null
+++ b/intl/icu/source/tools/genrb/reslist.h
@@ -0,0 +1,446 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
*   Copyright (C) 2000-2015, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*
* File reslist.h
*
* Modification History:
*
*   Date        Name        Description
*   02/21/00    weiv        Creation.
*******************************************************************************
*/

#ifndef RESLIST_H
#define RESLIST_H

#define KEY_SPACE_SIZE 65536
#define RESLIST_INT_VECTOR_INIT_SIZE 2048

#include <functional>

#include "unicode/utypes.h"
#include "unicode/unistr.h"
#include "unicode/ures.h"
#include "unicode/ustring.h"
#include "cmemory.h"
#include "cstring.h"
#include "uhash.h"
#include "unewdata.h"
#include "uresdata.h"
#include "ustr.h"

U_CDECL_BEGIN

class PathFilter;
class PseudoListResource;
class ResKeyPath;

/*
 * Data of a resource file held in memory; SRBRoot::fUsePoolBundle points to
 * one of these. All members start out null/zero and the destructor calls
 * close().
 * NOTE(review): how the members are populated is not visible in this header.
 */
struct ResFile {
    ResFile()
            : fBytes(nullptr), fIndexes(nullptr),
              fKeys(nullptr), fKeysLength(0), fKeysCount(0),
              fStrings(nullptr), fStringIndexLimit(0),
              fChecksum(0) {}
    ~ResFile() { close(); }

    void close();

    uint8_t *fBytes;
    const int32_t *fIndexes;
    const char *fKeys;
    int32_t fKeysLength;
    int32_t fKeysCount;

    PseudoListResource *fStrings;
    int32_t fStringIndexLimit;

    int32_t fChecksum;
};

struct SResource;

/*
 * Maps the offset of one key before compaction (oldpos) to its offset
 * afterwards (newpos); SRBRoot::fKeyMap is an array of these.
 */
typedef struct KeyMapEntry {
    int32_t oldpos, newpos;
} KeyMapEntry;

/* Resource bundle root table */
struct SRBRoot {
    SRBRoot(const UString *comment, UBool isPoolBundle, UErrorCode &errorCode);
    ~SRBRoot();

    void write(const char *outputDir, const char *outputPkg,
               char *writtenFilename, int writtenFilenameLen, UErrorCode &errorCode);

    void setLocale(char16_t *locale, UErrorCode &errorCode);
    int32_t addTag(const char *tag, UErrorCode &errorCode);

    const char *getKeyString(int32_t key) const;
    const char *getKeyBytes(int32_t *pLength) const;

    int32_t addKeyBytes(const char *keyBytes, int32_t length, UErrorCode &errorCode);

    void compactKeys(UErrorCode &errorCode);

    int32_t makeRes16(uint32_t resWord) const;
    int32_t mapKey(int32_t oldpos) const;

private:
    void compactStringsV2(UHashtable *stringSet, UErrorCode &errorCode);

public:
    // TODO: private

    SResource *fRoot;  // Normally a TableResource.
    char *fLocale;
    int32_t fIndexLength;
    int32_t fMaxTableLength;
    UBool fNoFallback; /* see URES_ATT_NO_FALLBACK */
    int8_t fStringsForm; /* default STRINGS_UTF16_V1 */
    UBool fIsPoolBundle;

    char *fKeys;
    KeyMapEntry *fKeyMap;
    int32_t fKeysBottom, fKeysTop;
    int32_t fKeysCapacity;
    int32_t fKeysCount;
    int32_t fLocalKeyLimit; /* key offset < limit fits into URES_TABLE */

    icu::UnicodeString f16BitUnits;
    int32_t f16BitStringsLength;

    const ResFile *fUsePoolBundle;
    int32_t fPoolStringIndexLimit;
    int32_t fPoolStringIndex16Limit;
    int32_t fLocalStringIndexLimit;
    SRBRoot *fWritePoolBundle;
};

/* write a java resource file */
// TODO: C++ify
void bundle_write_java(struct SRBRoot *bundle, const char *outputDir, const char* outputEnc, char *writtenFilename,
                       int writtenFilenameLen, const char* packageName, const char* bundleName, UErrorCode *status);

/* write a xml resource file */
// TODO: C++ify
void bundle_write_xml(struct SRBRoot *bundle, const char *outputDir,const char* outputEnc, const char* rbname,
                  char *writtenFilename, int writtenFilenameLen, const char* language, const char* package, UErrorCode *status);

/* Various resource types */

/*
 * Return a unique pointer to a dummy object,
 * for use in non-error cases when no resource is to be added to the bundle.
 * (nullptr is used in error cases.)
 */
struct SResource* res_none();

class ArrayResource;
class TableResource;
class IntVectorResource;

TableResource *table_open(struct SRBRoot *bundle, const char *tag, const struct UString* comment, UErrorCode *status);

ArrayResource *array_open(struct SRBRoot *bundle, const char *tag, const struct UString* comment, UErrorCode *status);

struct SResource *string_open(struct SRBRoot *bundle, const char *tag, const char16_t *value, int32_t len, const struct UString* comment, UErrorCode *status);

struct SResource *alias_open(struct SRBRoot *bundle, const char *tag, char16_t *value, int32_t len, const struct UString* comment, UErrorCode *status);

IntVectorResource *intvector_open(struct SRBRoot *bundle, const char *tag, const struct UString* comment, UErrorCode *status);

struct SResource *int_open(struct SRBRoot *bundle, const char *tag, int32_t value, const struct UString* comment, UErrorCode *status);

struct SResource *bin_open(struct SRBRoot *bundle, const char *tag, uint32_t length, uint8_t *data, const char* fileName, const struct UString* comment, UErrorCode *status);

/* Resource place holder */

struct SResource {
    SResource();
    SResource(SRBRoot *bundle, const char *tag, int8_t type, const UString* comment,
              UErrorCode &errorCode);
    virtual ~SResource();

    UBool isTable() const { return fType == URES_TABLE; }
    UBool isString() const { return fType == URES_STRING; }

    const char *getKeyString(const SRBRoot *bundle) const;

    /**
     * Preflights strings.
     * Finds duplicates and counts the total number of string code units
     * so that they can be written first to the 16-bit array,
     * for minimal string and container storage.
     *
     * We walk the final parse tree, rather than collecting this information while building it,
     * so that we need not deal with changes to the parse tree (especially removing resources).
     */
    void preflightStrings(SRBRoot *bundle, UHashtable *stringSet, UErrorCode &errorCode);
    virtual void handlePreflightStrings(SRBRoot *bundle, UHashtable *stringSet, UErrorCode &errorCode);

    /**
     * Writes resource values into f16BitUnits
     * and determines the resource item word, if possible.
     */
    void write16(SRBRoot *bundle);
    virtual void handleWrite16(SRBRoot *bundle);

    /**
     * Calculates ("preflights") and advances the *byteOffset
     * by the size of the resource's data in the binary file and
     * determines the resource item word.
     *
     * Most handlePreWrite() functions may add any number of bytes, but preWrite()
     * will always pad it to a multiple of 4.
     * The resource item type may be a related subtype of the fType.
     *
     * The preWrite() and write() functions start and end at the same
     * byteOffset values.
     * Prewriting allows bundle.write() to determine the root resource item word,
     * before actually writing the bundle contents to the file,
     * which is necessary because the root item is stored at the beginning.
     */
    void preWrite(uint32_t *byteOffset);
    virtual void handlePreWrite(uint32_t *byteOffset);

    /**
     * Writes the resource's data to mem and updates the byteOffset
     * in parallel.
     */
    void write(UNewDataMemory *mem, uint32_t *byteOffset);
    virtual void handleWrite(UNewDataMemory *mem, uint32_t *byteOffset);

    /**
     * Applies the given filter with the given base path to this resource.
     * Removes child resources rejected by the filter recursively.
     *
     * @param bundle Needed in order to access the key for this and child resources.
     */
    virtual void applyFilter(const PathFilter& filter, ResKeyPath& path, const SRBRoot* bundle);

    /**
     * Calls the given function for every key ID present in this tree.
     */
    virtual void collectKeys(std::function<void(int32_t)> collector) const;

    int8_t   fType;     /* nominal type: fRes (when != 0xffffffff) may use subtype */
    UBool    fWritten;  /* res_write() can exit early */
    uint32_t fRes;      /* resource item word; RES_BOGUS=0xffffffff if not known yet */
    int32_t  fRes16;    /* Res16 version of fRes for Table, Table16, Array16; -1 if it does not fit. */
    int32_t  fKey;      /* Index into bundle->fKeys; -1 if no key. */
    int32_t  fKey16;    /* Key16 version of fKey for Table & Table16; -1 if no key or it does not fit. */
    int      line;      /* used internally to report duplicate keys in tables */
    SResource *fNext;   /* This is for internal chaining while building */
    struct UString fComment;
};

/**
 * Common base of the resources that hold child resources:
 * a child count (fCount) and the head of a list of children (fFirst)
 * that is chained through SResource::fNext.
 */
class ContainerResource : public SResource {
public:
    ContainerResource(SRBRoot *bundle, const char *tag, int8_t type,
                      const UString* comment, UErrorCode &errorCode)
            : SResource(bundle, tag, type, comment, errorCode),
              fCount(0), fFirst(nullptr) {}
    virtual ~ContainerResource();

    void handlePreflightStrings(SRBRoot *bundle, UHashtable *stringSet, UErrorCode &errorCode) override;

    void collectKeys(std::function<void(int32_t)> collector) const override;

protected:
    void writeAllRes16(SRBRoot *bundle);
    void preWriteAllRes(uint32_t *byteOffset);
    void writeAllRes(UNewDataMemory *mem, uint32_t *byteOffset);
    void writeAllRes32(UNewDataMemory *mem, uint32_t *byteOffset);

public:
    // TODO: private with getter?
    uint32_t fCount;
    SResource *fFirst;
};

/**
 * Container of type URES_TABLE. It is the only resource type declared here
 * that overrides applyFilter().
 */
class TableResource : public ContainerResource {
public:
    TableResource(SRBRoot *bundle, const char *tag,
                  const UString* comment, UErrorCode &errorCode)
            : ContainerResource(bundle, tag, URES_TABLE, comment, errorCode),
              fTableType(URES_TABLE), fRoot(bundle) {}
    virtual ~TableResource();

    void add(SResource *res, int linenumber, UErrorCode &errorCode);

    void handleWrite16(SRBRoot *bundle) override;
    void handlePreWrite(uint32_t *byteOffset) override;
    void handleWrite(UNewDataMemory *mem, uint32_t *byteOffset) override;

    void applyFilter(const PathFilter& filter, ResKeyPath& path, const SRBRoot* bundle) override;

    int8_t fTableType;  // determined by table_write16() for table_preWrite() & table_write()
    SRBRoot *fRoot;
};

/**
 * Container of type URES_ARRAY. In addition to the inherited list head it
 * keeps a pointer named fLast (initially nullptr).
 */
class ArrayResource : public ContainerResource {
public:
    ArrayResource(SRBRoot *bundle, const char *tag,
                  const UString* comment, UErrorCode &errorCode)
            : ContainerResource(bundle, tag, URES_ARRAY, comment, errorCode),
              fLast(nullptr) {}
    virtual ~ArrayResource();

    void add(SResource *res);

    virtual void handleWrite16(SRBRoot *bundle) override;
    virtual void handlePreWrite(uint32_t *byteOffset) override;
    virtual void handleWrite(UNewDataMemory *mem, uint32_t *byteOffset) override;

    SResource *fLast;
};

/**
 * List of resources for a pool bundle.
 * Writes an empty table resource, rather than a container structure.
 */
class PseudoListResource : public ContainerResource {
public:
    PseudoListResource(SRBRoot *bundle, UErrorCode &errorCode)
            : ContainerResource(bundle, nullptr, URES_TABLE, nullptr, errorCode) {}
    virtual ~PseudoListResource();

    void add(SResource *res);

    virtual void handleWrite16(SRBRoot *bundle) override;
};

/**
 * Common base of the resources whose value is a UnicodeString (fString);
 * StringResource and AliasResource derive from it.
 */
class StringBaseResource : public SResource {
public:
    StringBaseResource(SRBRoot *bundle, const char *tag, int8_t type,
                       const char16_t *value, int32_t len,
                       const UString* comment, UErrorCode &errorCode);
    StringBaseResource(SRBRoot *bundle, int8_t type,
                       const icu::UnicodeString &value, UErrorCode &errorCode);
    StringBaseResource(int8_t type, const char16_t *value, int32_t len, UErrorCode &errorCode);
    virtual ~StringBaseResource();

    const char16_t *getBuffer() const { return icu::toUCharPtr(fString.getBuffer()); }
    int32_t length() const { return fString.length(); }

    virtual void handlePreWrite(uint32_t *byteOffset) override;
    virtual void handleWrite(UNewDataMemory *mem, uint32_t *byteOffset) override;

    // TODO: private with getter?
    icu::UnicodeString fString;
};

/**
 * String resource (URES_STRING) with the bookkeeping used for duplicate and
 * suffix sharing when strings are written in the 16-bit string-v2 form.
 */
class StringResource : public StringBaseResource {
public:
    StringResource(SRBRoot *bundle, const char *tag, const char16_t *value, int32_t len,
                   const UString* comment, UErrorCode &errorCode)
            : StringBaseResource(bundle, tag, URES_STRING, value, len, comment, errorCode),
              fSame(nullptr), fSuffixOffset(0),
              fNumCopies(0), fNumUnitsSaved(0), fNumCharsForLength(0) {}
    StringResource(SRBRoot *bundle, const icu::UnicodeString &value, UErrorCode &errorCode)
            : StringBaseResource(bundle, URES_STRING, value, errorCode),
              fSame(nullptr), fSuffixOffset(0),
              fNumCopies(0), fNumUnitsSaved(0), fNumCharsForLength(0) {}
    // Creates a string that counts as already written (fWritten = true),
    // with its resource word preset from poolStringIndex.
    StringResource(int32_t poolStringIndex, int8_t numCharsForLength,
                   const char16_t *value, int32_t length,
                   UErrorCode &errorCode)
            : StringBaseResource(URES_STRING, value, length, errorCode),
              fSame(nullptr), fSuffixOffset(0),
              fNumCopies(0), fNumUnitsSaved(0), fNumCharsForLength(numCharsForLength) {
        // v3 pool string encoded as string-v2 with low offset
        fRes = URES_MAKE_RESOURCE(URES_STRING_V2, poolStringIndex);
        fWritten = true;
    }
    virtual ~StringResource();

    // Number of 16-bit units this string occupies in string-v2 form:
    // length-prefix units + contents + terminator.
    int32_t get16BitStringsLength() const {
        return fNumCharsForLength + length() + 1;  // +1 for the NUL
    }

    virtual void handlePreflightStrings(SRBRoot *bundle, UHashtable *stringSet, UErrorCode &errorCode) override;
    virtual void handleWrite16(SRBRoot *bundle) override;

    void writeUTF16v2(int32_t base, icu::UnicodeString &dest);

    StringResource *fSame;  // used for duplicates
    int32_t fSuffixOffset;  // this string is a suffix of fSame at this offset
    int32_t fNumCopies;     // number of equal strings represented by one stringSet element
    int32_t fNumUnitsSaved;  // from not writing duplicates and suffixes
    int8_t fNumCharsForLength;
};

/** String-valued resource of type URES_ALIAS. */
class AliasResource : public StringBaseResource {
public:
    AliasResource(SRBRoot *bundle, const char *tag, const char16_t *value, int32_t len,
                  const UString* comment, UErrorCode &errorCode)
            : StringBaseResource(bundle, tag, URES_ALIAS, value, len, comment, errorCode) {}
    virtual ~AliasResource();
};

/** Resource holding a single int32_t value (fValue). */
class IntResource : public SResource {
public:
    IntResource(SRBRoot *bundle, const char *tag, int32_t value,
                const UString* comment, UErrorCode &errorCode);
    virtual ~IntResource();

    // TODO: private with getter?
    int32_t fValue;
};

/**
 * Resource holding a vector of 32-bit values; values are appended with add().
 * NOTE(review): fCount / fSize look like used length and capacity of fArray —
 * confirm against the implementation.
 */
class IntVectorResource : public SResource {
public:
    IntVectorResource(SRBRoot *bundle, const char *tag,
                      const UString* comment, UErrorCode &errorCode);
    virtual ~IntVectorResource();

    void add(int32_t value, UErrorCode &errorCode);

    virtual void handlePreWrite(uint32_t *byteOffset) override;
    virtual void handleWrite(UNewDataMemory *mem, uint32_t *byteOffset) override;

    // TODO: UVector32
    size_t fCount;
    size_t fSize;
    uint32_t *fArray;
};

/** Resource holding a block of binary data (fData, fLength bytes). */
class BinaryResource : public SResource {
public:
    BinaryResource(SRBRoot *bundle, const char *tag,
                   uint32_t length, uint8_t *data, const char* fileName,
                   const UString* comment, UErrorCode &errorCode);
    virtual ~BinaryResource();

    virtual void handlePreWrite(uint32_t *byteOffset) override;
    virtual void handleWrite(UNewDataMemory *mem, uint32_t *byteOffset) override;

    // TODO: CharString?
    uint32_t fLength;
    uint8_t *fData;
    // TODO: CharString
    char* fFileName;  // file name for binary or import binary tags if any
};

// TODO: use LocalPointer or delete
void res_close(struct SResource *res);

void setIncludeCopyright(UBool val);
UBool getIncludeCopyright();

void setFormatVersion(int32_t formatVersion);

int32_t getFormatVersion();

void setUsePoolBundle(UBool use);

/* in wrtxml.cpp */
uint32_t computeCRC(const char *ptr, uint32_t len, uint32_t lastcrc);

U_CDECL_END
#endif /* #ifndef RESLIST_H */
diff --git a/intl/icu/source/tools/genrb/rle.c b/intl/icu/source/tools/genrb/rle.c
new file mode 100644
index 0000000000..f737c45491
--- /dev/null
+++ b/intl/icu/source/tools/genrb/rle.c
@@ -0,0 +1,408 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
*   Copyright (C) 2000-2003, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*
* File rle.c
*
* Modification History:
*
*   Date        Name        Description
*   01/11/02    Ram         Creation.
*******************************************************************************
*/
#include <stdbool.h>
#include "rle.h"
/**
 * The ESCAPE character is used during run-length encoding.  It signals
 * a run of identical chars.
 */
static const uint16_t ESCAPE = 0xA5A5;

/**
 * The ESCAPE_BYTE character is used during run-length encoding.  It signals
 * a run of identical bytes.
 */
static const uint8_t ESCAPE_BYTE = (uint8_t)0xA5;

/**
 * Append a byte to the given StringBuffer, packing two bytes into each
 * character.  The state parameter maintains intermediary data between
 * calls.
+ * @param state A two-element array, with state[0] == 0 if this is the + * first byte of a pair, or state[0] != 0 if this is the second byte + * of a pair, in which case state[1] is the first byte. + */ +static uint16_t* +appendEncodedByte(uint16_t* buffer, uint16_t* buffLimit, uint8_t value, uint8_t state[],UErrorCode* status) { + if(!status || U_FAILURE(*status)){ + return NULL; + } + if (state[0] != 0) { + uint16_t c = (uint16_t) ((state[1] << 8) | (((int32_t) value) & 0xFF)); + if(buffer < buffLimit){ + *buffer++ = c; + }else{ + *status = U_BUFFER_OVERFLOW_ERROR; + } + state[0] = 0; + return buffer; + } + else { + state[0] = 1; + state[1] = value; + return buffer; + } +} +/** + * Encode a run, possibly a degenerate run (of < 4 values). + * @param length The length of the run; must be > 0 && <= 0xFF. + */ +static uint16_t* +encodeRunByte(uint16_t* buffer,uint16_t* bufLimit, uint8_t value, int32_t length, uint8_t state[], UErrorCode* status) { + if(!status || U_FAILURE(*status)){ + return NULL; + } + if (length < 4) { + int32_t j=0; + for (; j<length; ++j) { + if (value == ESCAPE_BYTE) { + buffer = appendEncodedByte(buffer,bufLimit, ESCAPE_BYTE, state,status); + } + buffer = appendEncodedByte(buffer,bufLimit, value, state, status); + } + } + else { + if (length == ESCAPE_BYTE) { + if (value == ESCAPE_BYTE){ + buffer = appendEncodedByte(buffer, bufLimit,ESCAPE_BYTE, state,status); + } + buffer = appendEncodedByte(buffer,bufLimit, value, state, status); + --length; + } + buffer = appendEncodedByte(buffer,bufLimit, ESCAPE_BYTE, state,status); + buffer = appendEncodedByte(buffer,bufLimit, (char)length, state, status); + buffer = appendEncodedByte(buffer,bufLimit, value, state, status); /* Don't need to escape this value*/ + } + return buffer; +} + +#define APPEND( buffer, bufLimit, value, num, status) UPRV_BLOCK_MACRO_BEGIN { \ + if(buffer<bufLimit){ \ + *buffer++=(value); \ + }else{ \ + *status = U_BUFFER_OVERFLOW_ERROR; \ + } \ + num++; \ +} UPRV_BLOCK_MACRO_END + 
+/** + * Encode a run, possibly a degenerate run (of < 4 values). + * @param length The length of the run; must be > 0 && <= 0xFFFF. + */ +static uint16_t* +encodeRunShort(uint16_t* buffer,uint16_t* bufLimit, uint16_t value, int32_t length,UErrorCode* status) { + int32_t num=0; + if (length < 4) { + int j=0; + for (; j<length; ++j) { + if (value == (int32_t) ESCAPE){ + APPEND(buffer,bufLimit,ESCAPE, num, status); + + } + APPEND(buffer,bufLimit,value,num, status); + } + } + else { + if (length == (int32_t) ESCAPE) { + if (value == (int32_t) ESCAPE){ + APPEND(buffer,bufLimit,ESCAPE,num,status); + + } + APPEND(buffer,bufLimit,value,num,status); + --length; + } + APPEND(buffer,bufLimit,ESCAPE,num,status); + APPEND(buffer,bufLimit,(uint16_t) length, num,status); + APPEND(buffer,bufLimit,(uint16_t)value, num, status); /* Don't need to escape this value */ + } + return buffer; +} + +/** + * Construct a string representing a char array. Use run-length encoding. + * A character represents itself, unless it is the ESCAPE character. Then + * the following notations are possible: + * ESCAPE ESCAPE ESCAPE literal + * ESCAPE n c n instances of character c + * Since an encoded run occupies 3 characters, we only encode runs of 4 or + * more characters. Thus we have n > 0 and n != ESCAPE and n <= 0xFFFF. + * If we encounter a run where n == ESCAPE, we represent this as: + * c ESCAPE n-1 c + * The ESCAPE value is chosen so as not to collide with commonly + * seen values. 
+ */ +int32_t +usArrayToRLEString(const uint16_t* src,int32_t srcLen,uint16_t* buffer, int32_t bufLen,UErrorCode* status) { + uint16_t* bufLimit = buffer+bufLen; + uint16_t* saveBuffer = buffer; + if(buffer < bufLimit){ + *buffer++ = (uint16_t)(srcLen>>16); + if(buffer<bufLimit){ + uint16_t runValue = src[0]; + int32_t runLength = 1; + int i=1; + *buffer++ = (uint16_t) srcLen; + + for (; i<srcLen; ++i) { + uint16_t s = src[i]; + if (s == runValue && runLength < 0xFFFF){ + ++runLength; + }else { + buffer = encodeRunShort(buffer,bufLimit, (uint16_t)runValue, runLength,status); + runValue = s; + runLength = 1; + } + } + buffer= encodeRunShort(buffer,bufLimit,(uint16_t)runValue, runLength,status); + }else{ + *status = U_BUFFER_OVERFLOW_ERROR; + } + }else{ + *status = U_BUFFER_OVERFLOW_ERROR; + } + return (int32_t)(buffer - saveBuffer); +} + +/** + * Construct a string representing a byte array. Use run-length encoding. + * Two bytes are packed into a single char, with a single extra zero byte at + * the end if needed. A byte represents itself, unless it is the + * ESCAPE_BYTE. Then the following notations are possible: + * ESCAPE_BYTE ESCAPE_BYTE ESCAPE_BYTE literal + * ESCAPE_BYTE n b n instances of byte b + * Since an encoded run occupies 3 bytes, we only encode runs of 4 or + * more bytes. Thus we have n > 0 and n != ESCAPE_BYTE and n <= 0xFF. + * If we encounter a run where n == ESCAPE_BYTE, we represent this as: + * b ESCAPE_BYTE n-1 b + * The ESCAPE_BYTE value is chosen so as not to collide with commonly + * seen values. 
+ */ +int32_t +byteArrayToRLEString(const uint8_t* src,int32_t srcLen, uint16_t* buffer,int32_t bufLen, UErrorCode* status) { + const uint16_t* saveBuf = buffer; + uint16_t* bufLimit = buffer+bufLen; + if(buffer < bufLimit){ + *buffer++ = ((uint16_t) (srcLen >> 16)); + + if(buffer<bufLimit){ + uint8_t runValue = src[0]; + int runLength = 1; + uint8_t state[2]= {0}; + int i=1; + *buffer++=((uint16_t) srcLen); + for (; i<srcLen; ++i) { + uint8_t b = src[i]; + if (b == runValue && runLength < 0xFF){ + ++runLength; + } + else { + buffer = encodeRunByte(buffer, bufLimit,runValue, runLength, state,status); + runValue = b; + runLength = 1; + } + } + buffer = encodeRunByte(buffer,bufLimit, runValue, runLength, state, status); + + /* We must save the final byte, if there is one, by padding + * an extra zero. + */ + if (state[0] != 0) { + buffer = appendEncodedByte(buffer,bufLimit, 0, state ,status); + } + }else{ + *status = U_BUFFER_OVERFLOW_ERROR; + } + }else{ + *status = U_BUFFER_OVERFLOW_ERROR; + } + return (int32_t) (buffer - saveBuf); +} + + +/** + * Construct an array of shorts from a run-length encoded string. 
+ */ +int32_t +rleStringToUCharArray(uint16_t* src, int32_t srcLen, uint16_t* target, int32_t tgtLen, UErrorCode* status) { + int32_t length = 0; + int32_t ai = 0; + int i=2; + + if(!status || U_FAILURE(*status)){ + return 0; + } + /* the source is null terminated */ + if(srcLen == -1){ + srcLen = u_strlen(src); + } + if(srcLen <= 2){ + return 2; + } + length = (((int32_t) src[0]) << 16) | ((int32_t) src[1]); + + if(target == NULL){ + return length; + } + if(tgtLen < length){ + *status = U_BUFFER_OVERFLOW_ERROR; + return length; + } + + for (; i<srcLen; ++i) { + uint16_t c = src[i]; + if (c == ESCAPE) { + c = src[++i]; + if (c == ESCAPE) { + target[ai++] = c; + } else { + int32_t runLength = (int32_t) c; + uint16_t runValue = src[++i]; + int j=0; + for (; j<runLength; ++j) { + target[ai++] = runValue; + } + } + } + else { + target[ai++] = c; + } + } + + if (ai != length){ + *status = U_INTERNAL_PROGRAM_ERROR; + } + + return length; +} + +/** + * Construct an array of bytes from a run-length encoded string. + */ +int32_t +rleStringToByteArray(uint16_t* src, int32_t srcLen, uint8_t* target, int32_t tgtLen, UErrorCode* status) { + + int32_t length = 0; + UBool nextChar = true; + uint16_t c = 0; + int32_t node = 0; + int32_t runLength = 0; + int32_t i = 2; + int32_t ai=0; + + if(!status || U_FAILURE(*status)){ + return 0; + } + /* the source is null terminated */ + if(srcLen == -1){ + srcLen = u_strlen(src); + } + if(srcLen <= 2){ + return 2; + } + length = (((int32_t) src[0]) << 16) | ((int32_t) src[1]); + + if(target == NULL){ + return length; + } + if(tgtLen < length){ + *status = U_BUFFER_OVERFLOW_ERROR; + return length; + } + + for (; ai<tgtLen; ) { + /* This part of the loop places the next byte into the local + * variable 'b' each time through the loop. It keeps the + * current character in 'c' and uses the boolean 'nextChar' + * to see if we've taken both bytes out of 'c' yet. 
+ */ + uint8_t b; + if (nextChar) { + c = src[i++]; + b = (uint8_t) (c >> 8); + nextChar = false; + } + else { + b = (uint8_t) (c & 0xFF); + nextChar = true; + } + + /* This part of the loop is a tiny state machine which handles + * the parsing of the run-length encoding. This would be simpler + * if we could look ahead, but we can't, so we use 'node' to + * move between three nodes in the state machine. + */ + switch (node) { + case 0: + /* Normal idle node */ + if (b == ESCAPE_BYTE) { + node = 1; + } + else { + target[ai++] = b; + } + break; + case 1: + /* We have seen one ESCAPE_BYTE; we expect either a second + * one, or a run length and value. + */ + if (b == ESCAPE_BYTE) { + target[ai++] = ESCAPE_BYTE; + node = 0; + } + else { + runLength = b; + node = 2; + } + break; + case 2: + { + int j=0; + /* We have seen an ESCAPE_BYTE and length byte. We interpret + * the next byte as the value to be repeated. + */ + for (; j<runLength; ++j){ + if(ai<tgtLen){ + target[ai++] = b; + }else{ + *status = U_BUFFER_OVERFLOW_ERROR; + return ai; + } + } + node = 0; + break; + } + } + } + + if (node != 0){ + *status = U_INTERNAL_PROGRAM_ERROR; + /*("Bad run-length encoded byte array")*/ + return 0; + } + + + if (i != srcLen){ + /*("Excess data in RLE byte array string");*/ + *status = U_INTERNAL_PROGRAM_ERROR; + return ai; + } + + return ai; +} + diff --git a/intl/icu/source/tools/genrb/rle.h b/intl/icu/source/tools/genrb/rle.h new file mode 100644 index 0000000000..2684bbe6b2 --- /dev/null +++ b/intl/icu/source/tools/genrb/rle.h @@ -0,0 +1,74 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2000, International Business Machines +* Corporation and others. All Rights Reserved. 
+* +******************************************************************************* +* +* File writejava.c +* +* Modification History: +* +* Date Name Description +* 01/11/02 Ram Creation. +******************************************************************************* +*/ + +#ifndef RLE_H +#define RLE_H 1 + +#include "unicode/utypes.h" +#include "unicode/ustring.h" + +U_CDECL_BEGIN +/** + * Construct a string representing a byte array. Use run-length encoding. + * Two bytes are packed into a single char, with a single extra zero byte at + * the end if needed. A byte represents itself, unless it is the + * ESCAPE_BYTE. Then the following notations are possible: + * ESCAPE_BYTE ESCAPE_BYTE ESCAPE_BYTE literal + * ESCAPE_BYTE n b n instances of byte b + * Since an encoded run occupies 3 bytes, we only encode runs of 4 or + * more bytes. Thus we have n > 0 and n != ESCAPE_BYTE and n <= 0xFF. + * If we encounter a run where n == ESCAPE_BYTE, we represent this as: + * b ESCAPE_BYTE n-1 b + * The ESCAPE_BYTE value is chosen so as not to collide with commonly + * seen values. + */ +int32_t +byteArrayToRLEString(const uint8_t* src,int32_t srcLen, uint16_t* buffer,int32_t bufLen, UErrorCode* status); + + +/** + * Construct a string representing a char array. Use run-length encoding. + * A character represents itself, unless it is the ESCAPE character. Then + * the following notations are possible: + * ESCAPE ESCAPE ESCAPE literal + * ESCAPE n c n instances of character c + * Since an encoded run occupies 3 characters, we only encode runs of 4 or + * more characters. Thus we have n > 0 and n != ESCAPE and n <= 0xFFFF. + * If we encounter a run where n == ESCAPE, we represent this as: + * c ESCAPE n-1 c + * The ESCAPE value is chosen so as not to collide with commonly + * seen values. 
+ */ +int32_t +usArrayToRLEString(const uint16_t* src,int32_t srcLen,uint16_t* buffer, int32_t bufLen,UErrorCode* status); + +/** + * Construct an array of bytes from a run-length encoded string. + */ +int32_t +rleStringToByteArray(uint16_t* src, int32_t srcLen, uint8_t* target, int32_t tgtLen, UErrorCode* status); +/** + * Construct an array of shorts from a run-length encoded string. + */ +int32_t +rleStringToUCharArray(uint16_t* src, int32_t srcLen, uint16_t* target, int32_t tgtLen, UErrorCode* status); + +U_CDECL_END + +#endif diff --git a/intl/icu/source/tools/genrb/sources.txt b/intl/icu/source/tools/genrb/sources.txt new file mode 100644 index 0000000000..0128e2094f --- /dev/null +++ b/intl/icu/source/tools/genrb/sources.txt @@ -0,0 +1,12 @@ +errmsg.c +filterrb.cpp +genrb.cpp +parse.cpp +prscmnts.cpp +rbutil.c +read.c +reslist.cpp +rle.c +ustr.c +wrtjava.cpp +wrtxml.cpp diff --git a/intl/icu/source/tools/genrb/ustr.c b/intl/icu/source/tools/genrb/ustr.c new file mode 100644 index 0000000000..15f76a80ca --- /dev/null +++ b/intl/icu/source/tools/genrb/ustr.c @@ -0,0 +1,219 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 1998-2012, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* +* File ustr.c +* +* Modification History: +* +* Date Name Description +* 05/28/99 stephen Creation. +******************************************************************************* +*/ + +#include "ustr.h" +#include "cmemory.h" +#include "cstring.h" +#include "unicode/ustring.h" +#include "unicode/putil.h" +#include "unicode/utf16.h" + +/* Protos */ +static void ustr_resize(struct UString *s, int32_t len, UErrorCode *status); + +/* Macros */ +#define ALLOCATION(minSize) (minSize < 0x80 ? 
0x80 : (2 * minSize + 0x80) & ~(0x80 - 1)) + +U_CFUNC void +ustr_init(struct UString *s) +{ + s->fChars = 0; + s->fLength = s->fCapacity = 0; +} + +U_CFUNC void +ustr_initChars(struct UString *s, const char* source, int32_t length, UErrorCode *status) +{ + int i = 0; + if (U_FAILURE(*status)) return; + s->fChars = 0; + s->fLength = s->fCapacity = 0; + if (length == -1) { + length = (int32_t)uprv_strlen(source); + } + if(s->fCapacity < length) { + ustr_resize(s, ALLOCATION(length), status); + if(U_FAILURE(*status)) return; + } + for (; i < length; i++) + { + UChar charToAppend; + u_charsToUChars(source+i, &charToAppend, 1); + ustr_ucat(s, charToAppend, status); + /* +#if U_CHARSET_FAMILY==U_ASCII_FAMILY + ustr_ucat(s, (UChar)(uint8_t)(source[i]), status); +#elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY + ustr_ucat(s, (UChar)asciiFromEbcdic[(uint8_t)(*cs++)], status); +#else +# error U_CHARSET_FAMILY is not valid +#endif + */ + } +} + +U_CFUNC void +ustr_deinit(struct UString *s) +{ + if (s) { + uprv_free(s->fChars); + s->fChars = 0; + s->fLength = s->fCapacity = 0; + } +} + +U_CFUNC void +ustr_cpy(struct UString *dst, + const struct UString *src, + UErrorCode *status) +{ + if(U_FAILURE(*status) || dst == src) + return; + + if(dst->fCapacity < src->fLength) { + ustr_resize(dst, ALLOCATION(src->fLength), status); + if(U_FAILURE(*status)) + return; + } + if(src->fChars == NULL || dst->fChars == NULL){ + return; + } + u_memcpy(dst->fChars, src->fChars, src->fLength); + dst->fLength = src->fLength; + dst->fChars[dst->fLength] = 0x0000; +} + +U_CFUNC void +ustr_setlen(struct UString *s, + int32_t len, + UErrorCode *status) +{ + if(U_FAILURE(*status)) + return; + + if(s->fCapacity < (len + 1)) { + ustr_resize(s, ALLOCATION(len), status); + if(U_FAILURE(*status)) + return; + } + + s->fLength = len; + s->fChars[len] = 0x0000; +} + +U_CFUNC void +ustr_cat(struct UString *dst, + const struct UString *src, + UErrorCode *status) +{ + ustr_ncat(dst, src, src->fLength, status); +} + 
+U_CFUNC void +ustr_ncat(struct UString *dst, + const struct UString *src, + int32_t n, + UErrorCode *status) +{ + if(U_FAILURE(*status) || dst == src) + return; + + if(dst->fCapacity < (dst->fLength + n)) { + ustr_resize(dst, ALLOCATION(dst->fLength + n), status); + if(U_FAILURE(*status)) + return; + } + + uprv_memcpy(dst->fChars + dst->fLength, src->fChars, + sizeof(UChar) * n); + dst->fLength += src->fLength; + dst->fChars[dst->fLength] = 0x0000; +} + +U_CFUNC void +ustr_ucat(struct UString *dst, + UChar c, + UErrorCode *status) +{ + if(U_FAILURE(*status)) + return; + + if(dst->fCapacity < (dst->fLength + 1)) { + ustr_resize(dst, ALLOCATION(dst->fLength + 1), status); + if(U_FAILURE(*status)) + return; + } + + uprv_memcpy(dst->fChars + dst->fLength, &c, + sizeof(UChar) * 1); + dst->fLength += 1; + dst->fChars[dst->fLength] = 0x0000; +} +U_CFUNC void +ustr_u32cat(struct UString *dst, UChar32 c, UErrorCode *status){ + if(c > 0x10FFFF){ + *status = U_ILLEGAL_CHAR_FOUND; + return; + } + if(c >0xFFFF){ + ustr_ucat(dst, U16_LEAD(c), status); + ustr_ucat(dst, U16_TRAIL(c), status); + }else{ + ustr_ucat(dst, (UChar) c, status); + } +} +U_CFUNC void +ustr_uscat(struct UString *dst, + const UChar* src,int len, + UErrorCode *status) +{ + if(U_FAILURE(*status)) + return; + + if(dst->fCapacity < (dst->fLength + len)) { + ustr_resize(dst, ALLOCATION(dst->fLength + len), status); + if(U_FAILURE(*status)) + return; + } + + uprv_memcpy(dst->fChars + dst->fLength, src, + sizeof(UChar) * len); + dst->fLength += len; + dst->fChars[dst->fLength] = 0x0000; +} + +/* Destroys data in the string */ +static void +ustr_resize(struct UString *s, + int32_t len, + UErrorCode *status) +{ + if(U_FAILURE(*status)) + return; + + /* +1 for trailing 0x0000 */ + s->fChars = (UChar*) uprv_realloc(s->fChars, sizeof(UChar) * (len + 1)); + if(s->fChars == 0) { + *status = U_MEMORY_ALLOCATION_ERROR; + s->fLength = s->fCapacity = 0; + return; + } + + s->fCapacity = len; +} diff --git 
a/intl/icu/source/tools/genrb/ustr.h b/intl/icu/source/tools/genrb/ustr.h new file mode 100644 index 0000000000..8a69e9d4d5 --- /dev/null +++ b/intl/icu/source/tools/genrb/ustr.h @@ -0,0 +1,81 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 1998-2012, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* +* File ustr.h +* +* Modification History: +* +* Date Name Description +* 05/28/99 stephen Creation. +******************************************************************************* +*/ + +#ifndef USTR_H +#define USTR_H 1 + +#include "unicode/utypes.h" + +#define U_APPEND_CHAR32(c,target,len) UPRV_BLOCK_MACRO_BEGIN { \ + if (c <= 0xffff) \ + { \ + *(target)++ = (UChar) c; \ + len=1; \ + } \ + else \ + { \ + target[0] = U16_LEAD(c); \ + target[1] = U16_TRAIL(c); \ + len=2; \ + target +=2; \ + } \ +} UPRV_BLOCK_MACRO_END + +#define U_APPEND_CHAR32_ONLY(c,target) UPRV_BLOCK_MACRO_BEGIN { \ + if (c <= 0xffff) \ + { \ + *(target)++ = (UChar) c; \ + } \ + else \ + { \ + target[0] = U16_LEAD(c); \ + target[1] = U16_TRAIL(c); \ + target +=2; \ + } \ +} UPRV_BLOCK_MACRO_END + +/* A C representation of a string "object" (to avoid realloc all the time) */ +struct UString { + UChar *fChars; + int32_t fLength; + int32_t fCapacity; +}; + +U_CFUNC void ustr_init(struct UString *s); + +U_CFUNC void +ustr_initChars(struct UString *s, const char* source, int32_t length, UErrorCode *status); + +U_CFUNC void ustr_deinit(struct UString *s); + +U_CFUNC void ustr_setlen(struct UString *s, int32_t len, UErrorCode *status); + +U_CFUNC void ustr_cpy(struct UString *dst, const struct UString *src, + UErrorCode *status); + +U_CFUNC void ustr_cat(struct UString *dst, const struct UString *src, + UErrorCode 
*status); + +U_CFUNC void ustr_ncat(struct UString *dst, const struct UString *src, + int32_t n, UErrorCode *status); + +U_CFUNC void ustr_ucat(struct UString *dst, UChar c, UErrorCode *status); +U_CFUNC void ustr_u32cat(struct UString *dst, UChar32 c, UErrorCode *status); +U_CFUNC void ustr_uscat(struct UString *dst, const UChar* src,int len,UErrorCode *status); +#endif diff --git a/intl/icu/source/tools/genrb/wrtjava.cpp b/intl/icu/source/tools/genrb/wrtjava.cpp new file mode 100644 index 0000000000..cb04b5a44a --- /dev/null +++ b/intl/icu/source/tools/genrb/wrtjava.cpp @@ -0,0 +1,701 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2000-2015, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* +* File wrtjava.cpp +* +* Modification History: +* +* Date Name Description +* 01/11/02 Ram Creation. +* 02/12/08 Spieth Fix errant 'new Object[][]{' insertion +* 02/19/08 Spieth Removed ICUListResourceBundle dependency +******************************************************************************* +*/ + +#include <assert.h> +#include "unicode/unistr.h" +#include "reslist.h" +#include "unewdata.h" +#include "unicode/ures.h" +#include "errmsg.h" +#include "filestrm.h" +#include "cstring.h" +#include "unicode/ucnv.h" +#include "genrb.h" +#include "rle.h" +#include "uhash.h" +#include "uresimp.h" +#include "unicode/ustring.h" +#include "unicode/utf8.h" + +void res_write_java(struct SResource *res,UErrorCode *status); + + +static const char copyRight[] = + "/* \n" + " *******************************************************************************\n" + " *\n" + " * Copyright (C) International Business Machines\n" + " * Corporation and others. 
All Rights Reserved.\n" + " *\n" + " *******************************************************************************\n" + " * $" "Source: $ \n" + " * $" "Date: $ \n" + " * $" "Revision: $ \n" + " *******************************************************************************\n" + " */\n\n"; +static const char warningMsg[] = + "/*********************************************************************\n" + "######################################################################\n" + "\n" + " WARNING: This file is generated by genrb Version " GENRB_VERSION ".\n" + " If you edit this file, please make sure that, the source\n" + " of this file (XXXX.txt in LocaleElements_XXXX.java)\n" + " is also edited.\n" + "######################################################################\n" + " *********************************************************************\n" + " */\n\n"; +static const char* openBrace="{\n"; +static const char* closeClass=" };\n" + "}\n"; + +static const char* javaClass = "import java.util.ListResourceBundle;\n\n" + "public class "; + +static const char* javaClass1= " extends ListResourceBundle {\n\n" + " /**\n" + " * Overrides ListResourceBundle \n" + " */\n" + " public final Object[][] getContents() { \n" + " return contents;\n" + " }\n\n" + " private static Object[][] contents = {\n"; +/*static const char* javaClassICU= " extends ListResourceBundle {\n\n" + " public %s () {\n" + " super.contents = data;\n" + " }\n" + " static final Object[][] data = new Object[][] { \n";*/ +static int tabCount = 3; + +static FileStream* out=nullptr; +static struct SRBRoot* srBundle ; +/*static const char* outDir = nullptr;*/ + +static const char* bName=nullptr; +static const char* pName=nullptr; + +static void write_tabs(FileStream* os){ + int i=0; + for(;i<=tabCount;i++){ + T_FileStream_write(os," ",4); + } +} + +#define ZERO 0x30 + +static const char* enc =""; +static UConverter* conv = nullptr; + +static int32_t +uCharsToChars(char *target, int32_t targetLen, const 
char16_t *source, int32_t sourceLen, UErrorCode *status) { + int i=0, j=0; + char str[30]={'\0'}; + while(i<sourceLen){ + if (source[i] == '\n') { + if (j + 2 < targetLen) { + uprv_strcat(target, "\\n"); + } + j += 2; + }else if(source[i]==0x0D){ + if(j+2<targetLen){ + uprv_strcat(target,"\\f"); + } + j+=2; + }else if(source[i] == '"'){ + if(source[i-1]=='\''){ + if(j+2<targetLen){ + uprv_strcat(target,"\\"); + target[j+1]= (char)source[i]; + } + j+=2; + }else if(source[i-1]!='\\'){ + + if(j+2<targetLen){ + uprv_strcat(target,"\\"); + target[j+1]= (char)source[i]; + } + j+=2; + }else if(source[i-1]=='\\'){ + target[j++]= (char)source[i]; + } + }else if(source[i]=='\\'){ + if(i+1<sourceLen){ + switch(source[i+1]){ + case ',': + case '!': + case '?': + case '#': + case '.': + case '%': + case '&': + case ':': + case ';': + if(j+2<targetLen){ + uprv_strcat(target,"\\\\"); + } + j+=2; + break; + case '"': + case '\'': + if(j+3<targetLen){ + uprv_strcat(target,"\\\\\\"); + } + j+=3; + break; + default : + if(j<targetLen){ + target[j]=(char)source[i]; + } + j++; + break; + } + }else{ + if(j<targetLen){ + uprv_strcat(target,"\\\\"); + } + j+=2; + } + }else if(source[i]>=0x20 && source[i]<0x7F/*ASCII*/){ + if(j<targetLen){ + target[j] = (char) source[i]; + } + j++; + }else{ + if(*enc =='\0' || source[i]==0x0000){ + uprv_strcpy(str,"\\u"); + itostr(str+2,source[i],16,4); + if(j+6<targetLen){ + uprv_strcat(target,str); + } + j+=6; + }else{ + char dest[30] = {0}; + int retVal=ucnv_fromUChars(conv,dest,30,source+i,1,status); + if(U_FAILURE(*status)){ + return 0; + } + if(j+retVal<targetLen){ + uprv_strcat(target,dest); + } + j+=retVal; + } + } + i++; + } + return j; +} + + +static uint32_t +strrch(const char* source,uint32_t sourceLen,char find){ + const char* tSourceEnd =source + (sourceLen-1); + while(tSourceEnd>= source){ + if(*tSourceEnd==find){ + return (uint32_t)(tSourceEnd-source); + } + tSourceEnd--; + } + return (uint32_t)(tSourceEnd-source); +} + +static int32_t 
getColumnCount(int32_t len){ + int32_t columnCount = 80; + int32_t maxLines = 3000; + int32_t adjustedLen = len*5; /* assume that every codepoint is represented in \uXXXX format*/ + /* + * calculate the number of lines that + * may be required if column count is 80 + */ + if (maxLines < (adjustedLen / columnCount) ){ + columnCount = adjustedLen / maxLines; + } + return columnCount; +} +static void +str_write_java(const char16_t *src, int32_t srcLen, UBool printEndLine, UErrorCode *status) { + + uint32_t length = srcLen*8; + uint32_t bufLen = 0; + uint32_t columnCount; + char* buf = (char*) malloc(sizeof(char)*length); + + if(buf == nullptr) { + *status = U_MEMORY_ALLOCATION_ERROR; + return; + } + + columnCount = getColumnCount(srcLen); + memset(buf,0,length); + + bufLen = uCharsToChars(buf,length,src,srcLen,status); + // buflen accounts for extra bytes added due to multi byte encoding of + // non ASCII characters + if(printEndLine) + write_tabs(out); + + if(U_FAILURE(*status)){ + uprv_free(buf); + return; + } + + if(bufLen+(tabCount*4) > columnCount ){ + uint32_t len = 0; + char* current = buf; + uint32_t add; + while(len < bufLen){ + add = columnCount-(tabCount*4)-5/* for ", +\n */; + current = buf +len; + if (add < (bufLen-len)) { + uint32_t idx = strrch(current,add,'\\'); + if (idx > add) { + idx = add; + } else { + int32_t num =idx-1; + uint32_t seqLen; + while(num>0){ + if(current[num]=='\\'){ + num--; + }else{ + break; + } + } + if ((idx-num)%2==0) { + idx--; + } + seqLen = (current[idx+1]=='u') ? 
6 : 2; + if ((add-idx) < seqLen) { + add = idx + seqLen; + } + } + } + T_FileStream_write(out,"\"",1); + uint32_t byteIndex = 0; + uint32_t trailBytes = 0; + if(len+add<bufLen){ + // check the trail bytes to be added to the output line + while (byteIndex < add) { + if (U8_IS_LEAD(*(current + byteIndex))) { + trailBytes = U8_COUNT_TRAIL_BYTES(*(current + byteIndex)); + add += trailBytes; + } + byteIndex++; + } + T_FileStream_write(out,current,add); + if (len + add < bufLen) { + T_FileStream_write(out,"\" +\n",4); + write_tabs(out); + } + }else{ + T_FileStream_write(out,current,bufLen-len); + } + len+=add; + } + }else{ + T_FileStream_write(out,"\"",1); + T_FileStream_write(out, buf,bufLen); + } + if(printEndLine){ + T_FileStream_write(out,"\",\n",3); + }else{ + T_FileStream_write(out,"\"",1); + } + uprv_free(buf); +} + +/* Writing Functions */ +static void +string_write_java(const StringResource *res,UErrorCode *status) { + (void)res->getKeyString(srBundle); + + str_write_java(res->getBuffer(), res->length(), true, status); +} + +static void +array_write_java(const ArrayResource *res, UErrorCode *status) { + + uint32_t i = 0; + const char* arr ="new String[] { \n"; + struct SResource *current = nullptr; + UBool allStrings = true; + + if (U_FAILURE(*status)) { + return; + } + + if (res->fCount > 0) { + + current = res->fFirst; + i = 0; + while(current != nullptr){ + if(!current->isString()){ + allStrings = false; + break; + } + current= current->fNext; + } + + current = res->fFirst; + if(allStrings==false){ + const char* object = "new Object[]{\n"; + write_tabs(out); + T_FileStream_write(out, object, (int32_t)uprv_strlen(object)); + tabCount++; + }else{ + write_tabs(out); + T_FileStream_write(out, arr, (int32_t)uprv_strlen(arr)); + tabCount++; + } + while (current != nullptr) { + /*if(current->isString()){ + write_tabs(out); + }*/ + res_write_java(current, status); + if(U_FAILURE(*status)){ + return; + } + i++; + current = current->fNext; + } + 
T_FileStream_write(out,"\n",1); + + tabCount--; + write_tabs(out); + T_FileStream_write(out,"},\n",3); + + } else { + write_tabs(out); + T_FileStream_write(out,arr,(int32_t)uprv_strlen(arr)); + write_tabs(out); + T_FileStream_write(out,"},\n",3); + } +} + +static void +intvector_write_java(const IntVectorResource *res, UErrorCode * /*status*/) { + uint32_t i = 0; + const char* intArr = "new int[] {\n"; + /* const char* intC = "new Integer("; */ + const char* stringArr = "new String[]{\n"; + const char *resname = res->getKeyString(srBundle); + char buf[100]; + int len =0; + buf[0]=0; + write_tabs(out); + + if(resname != nullptr && uprv_strcmp(resname,"DateTimeElements")==0){ + T_FileStream_write(out, stringArr, (int32_t)uprv_strlen(stringArr)); + tabCount++; + for(i = 0; i<res->fCount; i++) { + write_tabs(out); + len=itostr(buf,res->fArray[i],10,0); + T_FileStream_write(out,"\"",1); + T_FileStream_write(out,buf,len); + T_FileStream_write(out,"\",",2); + T_FileStream_write(out,"\n",1); + } + }else{ + T_FileStream_write(out, intArr, (int32_t)uprv_strlen(intArr)); + tabCount++; + for(i = 0; i<res->fCount; i++) { + write_tabs(out); + /* T_FileStream_write(out, intC, (int32_t)uprv_strlen(intC)); */ + len=itostr(buf,res->fArray[i],10,0); + T_FileStream_write(out,buf,len); + /* T_FileStream_write(out,"),",2); */ + /* T_FileStream_write(out,"\n",1); */ + T_FileStream_write(out,",\n",2); + } + } + tabCount--; + write_tabs(out); + T_FileStream_write(out,"},\n",3); +} + +static void +int_write_java(const IntResource *res, UErrorCode * /*status*/) { + const char* intC = "new Integer("; + char buf[100]; + int len =0; + buf[0]=0; + + /* write the binary data */ + write_tabs(out); + T_FileStream_write(out, intC, (int32_t)uprv_strlen(intC)); + len=itostr(buf, res->fValue, 10, 0); + T_FileStream_write(out,buf,len); + T_FileStream_write(out,"),\n",3 ); + +} + +static void +bytes_write_java(const BinaryResource *res, UErrorCode * /*status*/) { + const char* type = "new byte[] {"; + 
const char* byteDecl = "%i, "; + char byteBuffer[100] = { 0 }; + uint8_t* byteArray = nullptr; + int byteIterator = 0; + int32_t srcLen=res->fLength; + if(srcLen>0 ) + { + byteArray = res->fData; + + write_tabs(out); + T_FileStream_write(out, type, (int32_t)uprv_strlen(type)); + T_FileStream_write(out, "\n", 1); + tabCount++; + + for (;byteIterator<srcLen;byteIterator++) + { + if (byteIterator%16 == 0) + { + write_tabs(out); + } + + if (byteArray[byteIterator] < 128) + { + snprintf(byteBuffer, sizeof(byteBuffer), byteDecl, byteArray[byteIterator]); + } + else + { + snprintf(byteBuffer, sizeof(byteBuffer), byteDecl, (byteArray[byteIterator]-256)); + } + + T_FileStream_write(out, byteBuffer, (int32_t)uprv_strlen(byteBuffer)); + + if (byteIterator%16 == 15) + { + T_FileStream_write(out, "\n", 1); + } + + } + + if (((byteIterator-1)%16) != 15) + { + T_FileStream_write(out, "\n", 1); + } + + tabCount--; + write_tabs(out); + T_FileStream_write(out, "},\n", 3); + + } + else + { + /* Empty array */ + write_tabs(out); + T_FileStream_write(out,type,(int32_t)uprv_strlen(type)); + T_FileStream_write(out,"},\n",3); + } + +} + +static UBool start = true; + +static void +table_write_java(const TableResource *res, UErrorCode *status) { + uint32_t i = 0; + struct SResource *current = nullptr; + const char* obj = "new Object[][]{\n"; + + if (U_FAILURE(*status)) { + return ; + } + + if (res->fCount > 0) { + if(start==false){ + write_tabs(out); + T_FileStream_write(out, obj, (int32_t)uprv_strlen(obj)); + tabCount++; + } + start = false; + current = res->fFirst; + i = 0; + + + while (current != nullptr) { + const char *currentKeyString = current->getKeyString(srBundle); + + assert(i < res->fCount); + write_tabs(out); + + T_FileStream_write(out, openBrace, 2); + + + tabCount++; + + write_tabs(out); + if(currentKeyString != nullptr) { + T_FileStream_write(out, "\"", 1); + T_FileStream_write(out, currentKeyString, + (int32_t)uprv_strlen(currentKeyString)); + T_FileStream_write(out, 
"\",\n", 2); + + T_FileStream_write(out, "\n", 1); + } + res_write_java(current, status); + if(U_FAILURE(*status)){ + return; + } + i++; + current = current->fNext; + tabCount--; + write_tabs(out); + T_FileStream_write(out, "},\n", 3); + } + if(tabCount>4){ + tabCount--; + write_tabs(out); + T_FileStream_write(out, "},\n", 3); + } + + } else { + write_tabs(out); + T_FileStream_write(out,obj,(int32_t)uprv_strlen(obj)); + + write_tabs(out); + T_FileStream_write(out,"},\n",3); + + } + +} + +void +res_write_java(struct SResource *res,UErrorCode *status) { + + if (U_FAILURE(*status)) { + return ; + } + + if (res != nullptr) { + switch (res->fType) { + case URES_STRING: + string_write_java (static_cast<const StringResource *>(res), status); + return; + case URES_ALIAS: + printf("Encountered unsupported resource type %d of alias\n", res->fType); + *status = U_UNSUPPORTED_ERROR; + return; + case URES_INT_VECTOR: + intvector_write_java (static_cast<const IntVectorResource *>(res), status); + return; + case URES_BINARY: + bytes_write_java (static_cast<const BinaryResource *>(res), status); + return; + case URES_INT: + int_write_java (static_cast<const IntResource *>(res), status); + return; + case URES_ARRAY: + array_write_java (static_cast<const ArrayResource *>(res), status); + return; + case URES_TABLE: + table_write_java (static_cast<const TableResource *>(res), status); + return; + default: + break; + } + } + + *status = U_INTERNAL_PROGRAM_ERROR; +} + +void +bundle_write_java(struct SRBRoot *bundle, const char *outputDir,const char* outputEnc, + char *writtenFilename, int writtenFilenameLen, + const char* packageName, const char* bundleName, + UErrorCode *status) { + + char fileName[256] = {'\0'}; + char className[256]={'\0'}; + /*char constructor[1000] = { 0 };*/ + /*UBool j1 =false;*/ + /*outDir = outputDir;*/ + + start = true; /* Reset the start indicator*/ + + bName = (bundleName==nullptr) ? "LocaleElements" : bundleName; + pName = (packageName==nullptr)? 
"com.ibm.icu.impl.data" : packageName; + + uprv_strcpy(className, bName); + srBundle = bundle; + if(uprv_strcmp(srBundle->fLocale,"root")!=0){ + uprv_strcat(className,"_"); + uprv_strcat(className,srBundle->fLocale); + } + if(outputDir){ + uprv_strcpy(fileName, outputDir); + if(outputDir[uprv_strlen(outputDir)-1] !=U_FILE_SEP_CHAR){ + uprv_strcat(fileName,U_FILE_SEP_STRING); + } + uprv_strcat(fileName,className); + uprv_strcat(fileName,".java"); + }else{ + uprv_strcat(fileName,className); + uprv_strcat(fileName,".java"); + } + + if (writtenFilename) { + uprv_strncpy(writtenFilename, fileName, writtenFilenameLen); + } + + if (U_FAILURE(*status)) { + return; + } + + out= T_FileStream_open(fileName,"w"); + + if(out==nullptr){ + *status = U_FILE_ACCESS_ERROR; + return; + } + if(getIncludeCopyright()){ + T_FileStream_write(out, copyRight, (int32_t)uprv_strlen(copyRight)); + T_FileStream_write(out, warningMsg, (int32_t)uprv_strlen(warningMsg)); + } + T_FileStream_write(out,"package ",(int32_t)uprv_strlen("package ")); + T_FileStream_write(out,pName,(int32_t)uprv_strlen(pName)); + T_FileStream_write(out,";\n\n",3); + T_FileStream_write(out, javaClass, (int32_t)uprv_strlen(javaClass)); + T_FileStream_write(out, className, (int32_t)uprv_strlen(className)); + T_FileStream_write(out, javaClass1, (int32_t)uprv_strlen(javaClass1)); + + /* if(j1){ + T_FileStream_write(out, javaClass1, (int32_t)uprv_strlen(javaClass1)); + }else{ + sprintf(constructor,javaClassICU,className); + T_FileStream_write(out, constructor, (int32_t)uprv_strlen(constructor)); + } + */ + + if(outputEnc && *outputEnc!='\0'){ + /* store the output encoding */ + enc = outputEnc; + conv=ucnv_open(enc,status); + if(U_FAILURE(*status)){ + return; + } + } + res_write_java(bundle->fRoot, status); + + T_FileStream_write(out, closeClass, (int32_t)uprv_strlen(closeClass)); + + T_FileStream_close(out); + + ucnv_close(conv); +} diff --git a/intl/icu/source/tools/genrb/wrtxml.cpp b/intl/icu/source/tools/genrb/wrtxml.cpp 
new file mode 100644 index 0000000000..16f67fabca --- /dev/null +++ b/intl/icu/source/tools/genrb/wrtxml.cpp @@ -0,0 +1,1213 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2002-2015, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* +* File wrtxml.cpp +* +* Modification History: +* +* Date Name Description +* 10/01/02 Ram Creation. +* 02/07/08 Spieth Correct XLIFF generation on EBCDIC platform +* +******************************************************************************* +*/ + +// Safer use of UnicodeString. +#ifndef UNISTR_FROM_CHAR_EXPLICIT +# define UNISTR_FROM_CHAR_EXPLICIT explicit +#endif + +// Less important, but still a good idea. +#ifndef UNISTR_FROM_STRING_EXPLICIT +# define UNISTR_FROM_STRING_EXPLICIT explicit +#endif + +#include "reslist.h" +#include "unewdata.h" +#include "unicode/ures.h" +#include "errmsg.h" +#include "filestrm.h" +#include "cstring.h" +#include "unicode/ucnv.h" +#include "genrb.h" +#include "rle.h" +#include "uhash.h" +#include "uresimp.h" +#include "unicode/ustring.h" +#include "unicode/uchar.h" +#include "ustr.h" +#include "prscmnts.h" +#include "unicode/unistr.h" +#include "unicode/utf8.h" +#include "unicode/utf16.h" +#include <time.h> + +U_NAMESPACE_USE + +static int tabCount = 0; + +static FileStream* out=nullptr; +static struct SRBRoot* srBundle ; +static const char* outDir = nullptr; +static const char* enc =""; +static UConverter* conv = nullptr; + +const char* const* ISOLanguages; +const char* const* ISOCountries; +const char* textExt = ".txt"; +const char* xliffExt = ".xlf"; + +static int32_t write_utf8_file(FileStream* fileStream, UnicodeString outString) +{ + UErrorCode status = U_ZERO_ERROR; + int32_t len = 0; + + // preflight to 
get the destination buffer size + u_strToUTF8(nullptr, + 0, + &len, + toUCharPtr(outString.getBuffer()), + outString.length(), + &status); + + // allocate the buffer + char* dest = (char*)uprv_malloc(len); + status = U_ZERO_ERROR; + + // convert the data + u_strToUTF8(dest, + len, + &len, + toUCharPtr(outString.getBuffer()), + outString.length(), + &status); + + // write data to out file + int32_t ret = T_FileStream_write(fileStream, dest, len); + uprv_free(dest); + return (ret); +} + +/*write indentation for formatting*/ +static void write_tabs(FileStream* os){ + int i=0; + for(;i<=tabCount;i++){ + write_utf8_file(os,UnicodeString(" ")); + } +} + +/*get ID for each element. ID is globally unique.*/ +static char* getID(const char* id, const char* curKey, char* result) { + if(curKey == nullptr) { + result = (char *)uprv_malloc(sizeof(char)*uprv_strlen(id) + 1); + uprv_memset(result, 0, sizeof(char)*uprv_strlen(id) + 1); + uprv_strcpy(result, id); + } else { + result = (char *)uprv_malloc(sizeof(char)*(uprv_strlen(id) + 1 + uprv_strlen(curKey)) + 1); + uprv_memset(result, 0, sizeof(char)*(uprv_strlen(id) + 1 + uprv_strlen(curKey)) + 1); + if(id[0]!='\0'){ + uprv_strcpy(result, id); + uprv_strcat(result, "_"); + } + uprv_strcat(result, curKey); + } + return result; +} + +/*compute CRC for binary code*/ +/* The code is from http://www.theorem.com/java/CRC32.java + * Calculates the CRC32 - 32 bit Cyclical Redundancy Check + * <P> This check is used in numerous systems to verify the integrity + * of information. It's also used as a hashing function. Unlike a regular + * checksum, it's sensitive to the order of the characters. + * It produces a 32 bit + * + * @author Michael Lecuyer (mjl@theorem.com) + * @version 1.1 August 11, 1998 + */ + +/* ICU is not endian portable, because ICU data generated on big endian machines can be + * ported to big endian machines but not to little endian machines and vice versa. 
/*
 * Fold `len` bytes starting at `ptr` into a running CRC-32 value.
 *
 * `lastcrc` is the value to continue from and the updated value is
 * returned, so calls can be chained over consecutive buffers. No final
 * inversion is applied. The 256-entry lookup table for the reflected
 * polynomial is rebuilt on every call.
 */
uint32_t computeCRC(const char *ptr, uint32_t len, uint32_t lastcrc){
#define CRC32_POLYNOMIAL 0xEDB88320

    uint32_t table[256];

    /* build crc table */
    for (uint32_t value = 0; value < 256; ++value) {
        uint32_t entry = value;
        for (int bit = 0; bit < 8; ++bit) {
            const uint32_t shifted = entry >> 1;
            entry = (entry & 1u) ? (shifted ^ CRC32_POLYNOMIAL) : shifted;
        }
        table[value] = entry;
    }

    uint32_t running = lastcrc;
    for (uint32_t remaining = len; remaining != 0; --remaining, ++ptr) {
        /* Only the low eight bits of the byte take part in the table lookup. */
        const uint32_t byte = static_cast<uint8_t>(*ptr);
        running = (running >> 8) ^ table[(running ^ byte) & 0xFF];
    }
    return running;
}
Error: %s\n", localeID, u_errorName(status)); + exit(status); + } + strnrepchr(canon, canonLen, '_', '-'); + return canon; +} + +static const char* xmlHeader = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n"; +#if 0 +static const char* bundleStart = "<xliff version = \"1.2\" " + "xmlns='urn:oasis:names:tc:xliff:document:1.2' " + "xmlns:xsi='http://www.w3.org/2001/XMLSchema-instance' " + "xsi:schemaLocation='urn:oasis:names:tc:xliff:document:1.2 xliff-core-1.2-transitional.xsd'>\n"; +#else +static const char* bundleStart = "<xliff version = \"1.1\" " + "xmlns='urn:oasis:names:tc:xliff:document:1.1' " + "xmlns:xsi='http://www.w3.org/2001/XMLSchema-instance' " + "xsi:schemaLocation='urn:oasis:names:tc:xliff:document:1.1 http://www.oasis-open.org/committees/xliff/documents/xliff-core-1.1.xsd'>\n"; +#endif +static const char* bundleEnd = "</xliff>\n"; + +void res_write_xml(struct SResource *res, const char* id, const char* language, UBool isTopLevel, UErrorCode *status); + +static char* convertAndEscape(char** pDest, int32_t destCap, int32_t* destLength, + const char16_t* src, int32_t srcLen, UErrorCode* status){ + int32_t srcIndex=0; + char* dest=nullptr; + char* temp=nullptr; + int32_t destLen=0; + UChar32 c = 0; + + if(status==nullptr || U_FAILURE(*status) || pDest==nullptr || srcLen==0 || src == nullptr){ + return nullptr; + } + dest =*pDest; + if(dest==nullptr || destCap <=0){ + destCap = srcLen * 8; + dest = (char*) uprv_malloc(sizeof(char) * destCap); + if(dest==nullptr){ + *status=U_MEMORY_ALLOCATION_ERROR; + return nullptr; + } + } + + dest[0]=0; + + while(srcIndex<srcLen){ + U16_NEXT(src, srcIndex, srcLen, c); + + if (U16_IS_LEAD(c) || U16_IS_TRAIL(c)) { + *status = U_ILLEGAL_CHAR_FOUND; + fprintf(stderr, "Illegal Surrogate! 
\n"); + uprv_free(dest); + return nullptr; + } + + if((destLen+U8_LENGTH(c)) < destCap){ + + /* ASCII Range */ + if(c <=0x007F){ + switch(c) { + case '\x26': + uprv_strcpy(dest+( destLen),"\x26\x61\x6d\x70\x3b"); /* &*/ + destLen+=(int32_t)uprv_strlen("\x26\x61\x6d\x70\x3b"); + break; + case '\x3c': + uprv_strcpy(dest+(destLen),"\x26\x6c\x74\x3b"); /* <*/ + destLen+=(int32_t)uprv_strlen("\x26\x6c\x74\x3b"); + break; + case '\x3e': + uprv_strcpy(dest+(destLen),"\x26\x67\x74\x3b"); /* >*/ + destLen+=(int32_t)uprv_strlen("\x26\x67\x74\x3b"); + break; + case '\x22': + uprv_strcpy(dest+(destLen),"\x26\x71\x75\x6f\x74\x3b"); /* "*/ + destLen+=(int32_t)uprv_strlen("\x26\x71\x75\x6f\x74\x3b"); + break; + case '\x27': + uprv_strcpy(dest+(destLen),"\x26\x61\x70\x6f\x73\x3b"); /* ' */ + destLen+=(int32_t)uprv_strlen("\x26\x61\x70\x6f\x73\x3b"); + break; + + /* Disallow C0 controls except TAB, CR, LF*/ + case 0x00: + case 0x01: + case 0x02: + case 0x03: + case 0x04: + case 0x05: + case 0x06: + case 0x07: + case 0x08: + /*case 0x09:*/ + /*case 0x0A: */ + case 0x0B: + case 0x0C: + /*case 0x0D:*/ + case 0x0E: + case 0x0F: + case 0x10: + case 0x11: + case 0x12: + case 0x13: + case 0x14: + case 0x15: + case 0x16: + case 0x17: + case 0x18: + case 0x19: + case 0x1A: + case 0x1B: + case 0x1C: + case 0x1D: + case 0x1E: + case 0x1F: + *status = U_ILLEGAL_CHAR_FOUND; + fprintf(stderr, "Illegal Character \\u%04X!\n",(int)c); + uprv_free(dest); + return nullptr; + default: + dest[destLen++]=(char)c; + } + }else{ + UBool isError = false; + U8_APPEND((unsigned char*)dest,destLen,destCap,c,isError); + if(isError){ + *status = U_ILLEGAL_CHAR_FOUND; + fprintf(stderr, "Illegal Character \\U%08X!\n",(int)c); + uprv_free(dest); + return nullptr; + } + } + }else{ + destCap += destLen; + + temp = (char*) uprv_malloc(sizeof(char)*destCap); + if(temp==nullptr){ + *status=U_MEMORY_ALLOCATION_ERROR; + uprv_free(dest); + return nullptr; + } + uprv_memmove(temp,dest,destLen); + destLen=0; + 
uprv_free(dest); + dest=temp; + temp=nullptr; + } + + } + *destLength = destLen; + return dest; +} + +#define ASTERISK 0x002A +#define SPACE 0x0020 +#define CR 0x000A +#define LF 0x000D +#define AT_SIGN 0x0040 + +#if UCONFIG_NO_REGULAR_EXPRESSIONS==0 +static void +trim(char **src, int32_t *len){ + + char *s = nullptr; + int32_t i = 0; + if(src == nullptr || *src == nullptr){ + return; + } + s = *src; + /* trim from the end */ + for( i=(*len-1); i>= 0; i--){ + switch(s[i]){ + case ASTERISK: + case SPACE: + case CR: + case LF: + s[i] = 0; + continue; + default: + break; + } + break; + + } + *len = i+1; +} + +static void +print(char16_t* src, int32_t srcLen,const char *tagStart,const char *tagEnd, UErrorCode *status){ + int32_t bufCapacity = srcLen*4; + char *buf = nullptr; + int32_t bufLen = 0; + + if(U_FAILURE(*status)){ + return; + } + + buf = (char*) (uprv_malloc(bufCapacity)); + if(buf==0){ + fprintf(stderr, "Could not allocate memory!!"); + exit(U_MEMORY_ALLOCATION_ERROR); + } + buf = convertAndEscape(&buf, bufCapacity, &bufLen, src, srcLen,status); + if(U_SUCCESS(*status)){ + trim(&buf,&bufLen); + write_utf8_file(out,UnicodeString(tagStart)); + write_utf8_file(out,UnicodeString(buf, bufLen, "UTF-8")); + write_utf8_file(out,UnicodeString(tagEnd)); + write_utf8_file(out,UnicodeString("\n")); + + } +} +#endif + +static void +printNoteElements(const UString *src, UErrorCode *status){ + +#if UCONFIG_NO_REGULAR_EXPRESSIONS==0 /* donot compile when no RegularExpressions are available */ + + int32_t capacity = 0; + char16_t* note = nullptr; + int32_t noteLen = 0; + int32_t count = 0,i; + + if(src == nullptr){ + return; + } + + capacity = src->fLength; + note = (char16_t*) uprv_malloc(U_SIZEOF_UCHAR * capacity); + + count = getCount(src->fChars,src->fLength, UPC_NOTE, status); + if(U_FAILURE(*status)){ + uprv_free(note); + return; + } + for(i=0; i < count; i++){ + noteLen = getAt(src->fChars,src->fLength, ¬e, capacity, i, UPC_NOTE, status); + if(U_FAILURE(*status)){ + 
uprv_free(note); + return; + } + if(noteLen > 0){ + write_tabs(out); + print(note, noteLen,"<note>", "</note>", status); + } + } + uprv_free(note); +#else + + fprintf(stderr, "Warning: Could not output comments to XLIFF file. ICU has been built without RegularExpression support.\n"); + +#endif /* UCONFIG_NO_REGULAR_EXPRESSIONS */ + +} + +static void printAttribute(const char *name, const char *value, int32_t /*len*/) +{ + write_utf8_file(out, UnicodeString(" ")); + write_utf8_file(out, UnicodeString(name)); + write_utf8_file(out, UnicodeString(" = \"")); + write_utf8_file(out, UnicodeString(value)); + write_utf8_file(out, UnicodeString("\"")); +} + +#if UCONFIG_NO_REGULAR_EXPRESSIONS==0 /* donot compile when no RegularExpressions are available */ +static void printAttribute(const char *name, const UnicodeString value, int32_t /*len*/) +{ + write_utf8_file(out, UnicodeString(" ")); + write_utf8_file(out, UnicodeString(name)); + write_utf8_file(out, UnicodeString(" = \"")); + write_utf8_file(out, value); + write_utf8_file(out, UnicodeString("\"")); +} +#endif + +static void +printComments(struct UString *src, const char *resName, UBool printTranslate, UErrorCode *status){ + +#if UCONFIG_NO_REGULAR_EXPRESSIONS==0 /* donot compile when no RegularExpressions are available */ + + if(status==nullptr || U_FAILURE(*status)){ + return; + } + + int32_t capacity = src->fLength + 1; + char* buf = nullptr; + int32_t bufLen = 0; + char16_t* desc = (char16_t*) uprv_malloc(U_SIZEOF_UCHAR * capacity); + char16_t* trans = (char16_t*) uprv_malloc(U_SIZEOF_UCHAR * capacity); + + int32_t descLen = 0, transLen=0; + if(desc==nullptr || trans==nullptr){ + *status = U_MEMORY_ALLOCATION_ERROR; + uprv_free(desc); + uprv_free(trans); + return; + } + // TODO: make src const, stop modifying it in-place, make printContainer() take const resource, etc. 
+ src->fLength = removeCmtText(src->fChars, src->fLength, status); + descLen = getDescription(src->fChars,src->fLength, &desc, capacity, status); + transLen = getTranslate(src->fChars,src->fLength, &trans, capacity, status); + + /* first print translate attribute */ + if(transLen > 0){ + if(printTranslate){ + /* print translate attribute */ + buf = convertAndEscape(&buf, 0, &bufLen, trans, transLen, status); + if(U_SUCCESS(*status)){ + printAttribute("translate", UnicodeString(buf, bufLen, "UTF-8"), bufLen); + write_utf8_file(out,UnicodeString(">\n")); + } + }else if(getShowWarning()){ + fprintf(stderr, "Warning: Translate attribute for resource %s cannot be set. XLIFF prohibits it.\n", resName); + /* no translate attribute .. just close the tag */ + write_utf8_file(out,UnicodeString(">\n")); + } + }else{ + /* no translate attribute .. just close the tag */ + write_utf8_file(out,UnicodeString(">\n")); + } + + if(descLen > 0){ + write_tabs(out); + print(desc, descLen, "<!--", "-->", status); + } + + uprv_free(desc); + uprv_free(trans); +#else + + fprintf(stderr, "Warning: Could not output comments to XLIFF file. 
ICU has been built without RegularExpression support.\n"); + +#endif /* UCONFIG_NO_REGULAR_EXPRESSIONS */ + +} + +/* + * Print out a containing element, like: + * <trans-unit id = "blah" resname = "blah" restype = "x-id-alias" translate = "no"> + * <group id "calendar_gregorian" resname = "gregorian" restype = "x-icu-array"> + */ +static char *printContainer(SResource *res, const char *container, const char *restype, const char *mimetype, const char *id, UErrorCode *status) +{ + const char *resname = nullptr; + char *sid = nullptr; + + write_tabs(out); + + resname = res->getKeyString(srBundle); + if (resname != nullptr && *resname != 0) { + sid = getID(id, resname, sid); + } else { + sid = getID(id, nullptr, sid); + } + + write_utf8_file(out, UnicodeString("<")); + write_utf8_file(out, UnicodeString(container)); + printAttribute("id", sid, (int32_t) uprv_strlen(sid)); + + if (resname != nullptr) { + printAttribute("resname", resname, (int32_t) uprv_strlen(resname)); + } + + if (mimetype != nullptr) { + printAttribute("mime-type", mimetype, (int32_t) uprv_strlen(mimetype)); + } + + if (restype != nullptr) { + printAttribute("restype", restype, (int32_t) uprv_strlen(restype)); + } + + tabCount += 1; + if (res->fComment.fLength > 0) { + /* printComments will print the closing ">\n" */ + printComments(&res->fComment, resname, true, status); + } else { + write_utf8_file(out, UnicodeString(">\n")); + } + + return sid; +} + +/* Writing Functions */ + +static const char *trans_unit = "trans-unit"; +static const char *close_trans_unit = "</trans-unit>\n"; +static const char *source = "<source>"; +static const char *close_source = "</source>\n"; +static const char *group = "group"; +static const char *close_group = "</group>\n"; + +static const char *bin_unit = "bin-unit"; +static const char *close_bin_unit = "</bin-unit>\n"; +static const char *bin_source = "<bin-source>\n"; +static const char *close_bin_source = "</bin-source>\n"; +static const char *external_file = 
"<external-file"; +/*static const char *close_external_file = "</external-file>\n";*/ +static const char *internal_file = "<internal-file"; +static const char *close_internal_file = "</internal-file>\n"; + +static const char *application_mimetype = "application"; /* add "/octet-stream"? */ + +static const char *alias_restype = "x-icu-alias"; +static const char *array_restype = "x-icu-array"; +static const char *binary_restype = "x-icu-binary"; +static const char *integer_restype = "x-icu-integer"; +static const char *intvector_restype = "x-icu-intvector"; +static const char *table_restype = "x-icu-table"; + +static void +string_write_xml(StringResource *res, const char* id, const char* /*language*/, UErrorCode *status) { + + char *sid = nullptr; + char* buf = nullptr; + int32_t bufLen = 0; + + if(status==nullptr || U_FAILURE(*status)){ + return; + } + + sid = printContainer(res, trans_unit, nullptr, nullptr, id, status); + + write_tabs(out); + + write_utf8_file(out, UnicodeString(source)); + + buf = convertAndEscape(&buf, 0, &bufLen, res->getBuffer(), res->length(), status); + + if (U_FAILURE(*status)) { + return; + } + + write_utf8_file(out, UnicodeString(buf, bufLen, "UTF-8")); + write_utf8_file(out, UnicodeString(close_source)); + + printNoteElements(&res->fComment, status); + + tabCount -= 1; + write_tabs(out); + + write_utf8_file(out, UnicodeString(close_trans_unit)); + + uprv_free(buf); + uprv_free(sid); +} + +static void +alias_write_xml(AliasResource *res, const char* id, const char* /*language*/, UErrorCode *status) { + char *sid = nullptr; + char* buf = nullptr; + int32_t bufLen=0; + + sid = printContainer(res, trans_unit, alias_restype, nullptr, id, status); + + write_tabs(out); + + write_utf8_file(out, UnicodeString(source)); + + buf = convertAndEscape(&buf, 0, &bufLen, res->getBuffer(), res->length(), status); + + if(U_FAILURE(*status)){ + return; + } + write_utf8_file(out, UnicodeString(buf, bufLen, "UTF-8")); + write_utf8_file(out, 
UnicodeString(close_source)); + + printNoteElements(&res->fComment, status); + + tabCount -= 1; + write_tabs(out); + + write_utf8_file(out, UnicodeString(close_trans_unit)); + + uprv_free(buf); + uprv_free(sid); +} + +static void +array_write_xml(ArrayResource *res, const char* id, const char* language, UErrorCode *status) { + char* sid = nullptr; + int index = 0; + + struct SResource *current = nullptr; + + sid = printContainer(res, group, array_restype, nullptr, id, status); + + current = res->fFirst; + + while (current != nullptr) { + char c[256] = {0}; + char* subId = nullptr; + + itostr(c, index, 10, 0); + index += 1; + subId = getID(sid, c, subId); + + res_write_xml(current, subId, language, false, status); + uprv_free(subId); + subId = nullptr; + + if(U_FAILURE(*status)){ + return; + } + + current = current->fNext; + } + + tabCount -= 1; + write_tabs(out); + write_utf8_file(out, UnicodeString(close_group)); + + uprv_free(sid); +} + +static void +intvector_write_xml(IntVectorResource *res, const char* id, const char* /*language*/, UErrorCode *status) { + char* sid = nullptr; + char* ivd = nullptr; + uint32_t i=0; + uint32_t len=0; + char buf[256] = {'0'}; + + sid = printContainer(res, group, intvector_restype, nullptr, id, status); + + for(i = 0; i < res->fCount; i += 1) { + char c[256] = {0}; + + itostr(c, i, 10, 0); + ivd = getID(sid, c, ivd); + len = itostr(buf, res->fArray[i], 10, 0); + + write_tabs(out); + write_utf8_file(out, UnicodeString("<")); + write_utf8_file(out, UnicodeString(trans_unit)); + + printAttribute("id", ivd, (int32_t)uprv_strlen(ivd)); + printAttribute("restype", integer_restype, (int32_t) strlen(integer_restype)); + + write_utf8_file(out, UnicodeString(">\n")); + + tabCount += 1; + write_tabs(out); + write_utf8_file(out, UnicodeString(source)); + + write_utf8_file(out, UnicodeString(buf, len)); + + write_utf8_file(out, UnicodeString(close_source)); + tabCount -= 1; + write_tabs(out); + write_utf8_file(out, 
UnicodeString(close_trans_unit)); + + uprv_free(ivd); + ivd = nullptr; + } + + tabCount -= 1; + write_tabs(out); + + write_utf8_file(out, UnicodeString(close_group)); + uprv_free(sid); + sid = nullptr; +} + +static void +int_write_xml(IntResource *res, const char* id, const char* /*language*/, UErrorCode *status) { + char* sid = nullptr; + char buf[256] = {0}; + uint32_t len = 0; + + sid = printContainer(res, trans_unit, integer_restype, nullptr, id, status); + + write_tabs(out); + + write_utf8_file(out, UnicodeString(source)); + + len = itostr(buf, res->fValue, 10, 0); + write_utf8_file(out, UnicodeString(buf, len)); + + write_utf8_file(out, UnicodeString(close_source)); + + printNoteElements(&res->fComment, status); + + tabCount -= 1; + write_tabs(out); + + write_utf8_file(out, UnicodeString(close_trans_unit)); + + uprv_free(sid); + sid = nullptr; +} + +static void +bin_write_xml(BinaryResource *res, const char* id, const char* /*language*/, UErrorCode *status) { + const char* m_type = application_mimetype; + char* sid = nullptr; + uint32_t crc = 0xFFFFFFFF; + + char fileName[1024] ={0}; + int32_t tLen = ( outDir == nullptr) ? 0 :(int32_t)uprv_strlen(outDir); + char* fn = (char*) uprv_malloc(sizeof(char) * (tLen+1024 + + (res->fFileName !=nullptr ? 
+ uprv_strlen(res->fFileName) :0))); + const char* ext = nullptr; + + char* f = nullptr; + + fn[0]=0; + + if(res->fFileName != nullptr){ + uprv_strcpy(fileName, res->fFileName); + f = uprv_strrchr(fileName, '\\'); + + if (f != nullptr) { + f++; + } else { + f = fileName; + } + + ext = uprv_strrchr(fileName, '.'); + + if (ext == nullptr) { + fprintf(stderr, "Error: %s is an unknown binary filename type.\n", fileName); + exit(U_ILLEGAL_ARGUMENT_ERROR); + } + + if(uprv_strcmp(ext, ".jpg")==0 || uprv_strcmp(ext, ".jpeg")==0 || uprv_strcmp(ext, ".gif")==0 ){ + m_type = "image"; + } else if(uprv_strcmp(ext, ".wav")==0 || uprv_strcmp(ext, ".au")==0 ){ + m_type = "audio"; + } else if(uprv_strcmp(ext, ".avi")==0 || uprv_strcmp(ext, ".mpg")==0 || uprv_strcmp(ext, ".mpeg")==0){ + m_type = "video"; + } else if(uprv_strcmp(ext, ".txt")==0 || uprv_strcmp(ext, ".text")==0){ + m_type = "text"; + } + + sid = printContainer(res, bin_unit, binary_restype, m_type, id, status); + + write_tabs(out); + + write_utf8_file(out, UnicodeString(bin_source)); + + tabCount+= 1; + write_tabs(out); + + write_utf8_file(out, UnicodeString(external_file)); + printAttribute("href", f, (int32_t)uprv_strlen(f)); + write_utf8_file(out, UnicodeString("/>\n")); + tabCount -= 1; + write_tabs(out); + + write_utf8_file(out, UnicodeString(close_bin_source)); + + printNoteElements(&res->fComment, status); + tabCount -= 1; + write_tabs(out); + write_utf8_file(out, UnicodeString(close_bin_unit)); + } else { + char temp[256] = {0}; + uint32_t i = 0; + int32_t len=0; + + sid = printContainer(res, bin_unit, binary_restype, m_type, id, status); + + write_tabs(out); + write_utf8_file(out, UnicodeString(bin_source)); + + tabCount += 1; + write_tabs(out); + + write_utf8_file(out, UnicodeString(internal_file)); + printAttribute("form", application_mimetype, (int32_t) uprv_strlen(application_mimetype)); + + while(i <res->fLength){ + len = itostr(temp, res->fData[i], 16, 2); + crc = computeCRC(temp, len, crc); + i++; + } + 
+ len = itostr(temp, crc, 10, 0); + printAttribute("crc", temp, len); + + write_utf8_file(out, UnicodeString(">")); + + i = 0; + while(i <res->fLength){ + len = itostr(temp, res->fData[i], 16, 2); + write_utf8_file(out, UnicodeString(temp)); + i += 1; + } + + write_utf8_file(out, UnicodeString(close_internal_file)); + + tabCount -= 2; + write_tabs(out); + + write_utf8_file(out, UnicodeString(close_bin_source)); + printNoteElements(&res->fComment, status); + + tabCount -= 1; + write_tabs(out); + write_utf8_file(out, UnicodeString(close_bin_unit)); + + uprv_free(sid); + sid = nullptr; + } + + uprv_free(fn); +} + + + +static void +table_write_xml(TableResource *res, const char* id, const char* language, UBool isTopLevel, UErrorCode *status) { + + struct SResource *current = nullptr; + char* sid = nullptr; + + if (U_FAILURE(*status)) { + return ; + } + + sid = printContainer(res, group, table_restype, nullptr, id, status); + + if(isTopLevel) { + sid[0] = '\0'; + } + + current = res->fFirst; + + while (current != nullptr) { + res_write_xml(current, sid, language, false, status); + + if(U_FAILURE(*status)){ + return; + } + + current = current->fNext; + } + + tabCount -= 1; + write_tabs(out); + + write_utf8_file(out, UnicodeString(close_group)); + + uprv_free(sid); + sid = nullptr; +} + +void +res_write_xml(struct SResource *res, const char* id, const char* language, UBool isTopLevel, UErrorCode *status) { + + if (U_FAILURE(*status)) { + return ; + } + + if (res != nullptr) { + switch (res->fType) { + case URES_STRING: + string_write_xml (static_cast<StringResource *>(res), id, language, status); + return; + + case URES_ALIAS: + alias_write_xml (static_cast<AliasResource *>(res), id, language, status); + return; + + case URES_INT_VECTOR: + intvector_write_xml (static_cast<IntVectorResource *>(res), id, language, status); + return; + + case URES_BINARY: + bin_write_xml (static_cast<BinaryResource *>(res), id, language, status); + return; + + case URES_INT: + int_write_xml 
(static_cast<IntResource *>(res), id, language, status); + return; + + case URES_ARRAY: + array_write_xml (static_cast<ArrayResource *>(res), id, language, status); + return; + + case URES_TABLE: + table_write_xml (static_cast<TableResource *>(res), id, language, isTopLevel, status); + return; + + default: + break; + } + } + + *status = U_INTERNAL_PROGRAM_ERROR; +} + +void +bundle_write_xml(struct SRBRoot *bundle, const char *outputDir,const char* outputEnc, const char* filename, + char *writtenFilename, int writtenFilenameLen, + const char* language, const char* outFileName, UErrorCode *status) { + + char* xmlfileName = nullptr; + char* outputFileName = nullptr; + char* originalFileName = nullptr; + const char* fileStart = "<file xml:space = \"preserve\" source-language = \""; + const char* file1 = "\" datatype = \"x-icu-resource-bundle\" "; + const char* file2 = "original = \""; + const char* file4 = "\" date = \""; + const char* fileEnd = "</file>\n"; + const char* headerStart = "<header>\n"; + const char* headerEnd = "</header>\n"; + const char* bodyStart = "<body>\n"; + const char* bodyEnd = "</body>\n"; + + const char *tool_start = "<tool"; + const char *tool_id = "genrb-" GENRB_VERSION "-icu-" U_ICU_VERSION; + const char *tool_name = "genrb"; + + char* temp = nullptr; + char* lang = nullptr; + const char* pos = nullptr; + int32_t first, index; + time_t currTime; + char timeBuf[128]; + + outDir = outputDir; + + srBundle = bundle; + + pos = uprv_strrchr(filename, '\\'); + if(pos != nullptr) { + first = (int32_t)(pos - filename + 1); + } else { + first = 0; + } + index = (int32_t)(uprv_strlen(filename) - uprv_strlen(textExt) - first); + originalFileName = (char *)uprv_malloc(sizeof(char)*index+1); + uprv_memset(originalFileName, 0, sizeof(char)*index+1); + uprv_strncpy(originalFileName, filename + first, index); + + if(uprv_strcmp(originalFileName, srBundle->fLocale) != 0) { + fprintf(stdout, "Warning: The file name is not same as the resource name!\n"); + } + 
+ temp = originalFileName; + originalFileName = (char *)uprv_malloc(sizeof(char)* (uprv_strlen(temp)+uprv_strlen(textExt)) + 1); + uprv_memset(originalFileName, 0, sizeof(char)* (uprv_strlen(temp)+uprv_strlen(textExt)) + 1); + uprv_strcat(originalFileName, temp); + uprv_strcat(originalFileName, textExt); + uprv_free(temp); + temp = nullptr; + + + if (language == nullptr) { +/* lang = parseFilename(filename, lang); + if (lang == nullptr) {*/ + /* now check if locale name is valid or not + * this is to cater for situation where + * pegasusServer.txt contains + * + * en{ + * .. + * } + */ + lang = parseFilename(srBundle->fLocale, lang); + /* + * Neither the file name nor the table name inside the + * txt file contain a valid country and language codes + * throw an error. + * pegasusServer.txt contains + * + * testelements{ + * .... + * } + */ + if(lang==nullptr){ + fprintf(stderr, "Error: The file name and table name do not contain a valid language code. Please use -l option to specify it.\n"); + exit(U_ILLEGAL_ARGUMENT_ERROR); + } + /* }*/ + } else { + lang = (char *)uprv_malloc(sizeof(char)*uprv_strlen(language) +1); + uprv_memset(lang, 0, sizeof(char)*uprv_strlen(language) +1); + uprv_strcpy(lang, language); + } + + if(outFileName) { + outputFileName = (char *)uprv_malloc(sizeof(char)*uprv_strlen(outFileName) + 1); + uprv_memset(outputFileName, 0, sizeof(char)*uprv_strlen(outFileName) + 1); + uprv_strcpy(outputFileName,outFileName); + } else { + outputFileName = (char *)uprv_malloc(sizeof(char)*uprv_strlen(srBundle->fLocale) + 1); + uprv_memset(outputFileName, 0, sizeof(char)*uprv_strlen(srBundle->fLocale) + 1); + uprv_strcpy(outputFileName,srBundle->fLocale); + } + + if(outputDir) { + xmlfileName = (char *)uprv_malloc(sizeof(char)*(uprv_strlen(outputDir) + uprv_strlen(outputFileName) + uprv_strlen(xliffExt) + 1) +1); + uprv_memset(xmlfileName, 0, sizeof(char)*(uprv_strlen(outputDir)+ uprv_strlen(outputFileName) + uprv_strlen(xliffExt) + 1) +1); + } else { + 
xmlfileName = (char *)uprv_malloc(sizeof(char)*(uprv_strlen(outputFileName) + uprv_strlen(xliffExt)) +1); + uprv_memset(xmlfileName, 0, sizeof(char)*(uprv_strlen(outputFileName) + uprv_strlen(xliffExt)) +1); + } + + if(outputDir){ + uprv_strcpy(xmlfileName, outputDir); + if(outputDir[uprv_strlen(outputDir)-1] !=U_FILE_SEP_CHAR){ + uprv_strcat(xmlfileName,U_FILE_SEP_STRING); + } + } + uprv_strcat(xmlfileName,outputFileName); + uprv_strcat(xmlfileName,xliffExt); + + if (writtenFilename) { + uprv_strncpy(writtenFilename, xmlfileName, writtenFilenameLen); + } + + if (U_FAILURE(*status)) { + goto cleanup_bundle_write_xml; + } + + out= T_FileStream_open(xmlfileName,"w"); + + if(out==nullptr){ + *status = U_FILE_ACCESS_ERROR; + goto cleanup_bundle_write_xml; + } + write_utf8_file(out, UnicodeString(xmlHeader)); + + if(outputEnc && *outputEnc!='\0'){ + /* store the output encoding */ + enc = outputEnc; + conv=ucnv_open(enc,status); + if(U_FAILURE(*status)){ + goto cleanup_bundle_write_xml; + } + } + write_utf8_file(out, UnicodeString(bundleStart)); + write_tabs(out); + write_utf8_file(out, UnicodeString(fileStart)); + /* check if lang and language are the same */ + if(language != nullptr && uprv_strcmp(lang, srBundle->fLocale)!=0){ + fprintf(stderr,"Warning: The top level tag in the resource and language specified are not the same. 
Please check the input.\n"); + } + write_utf8_file(out, UnicodeString(lang)); + write_utf8_file(out, UnicodeString(file1)); + write_utf8_file(out, UnicodeString(file2)); + write_utf8_file(out, UnicodeString(originalFileName)); + write_utf8_file(out, UnicodeString(file4)); + + time(&currTime); + strftime(timeBuf, sizeof(timeBuf), "%Y-%m-%dT%H:%M:%SZ", gmtime(&currTime)); + write_utf8_file(out, UnicodeString(timeBuf)); + write_utf8_file(out, UnicodeString("\">\n")); + + tabCount += 1; + write_tabs(out); + write_utf8_file(out, UnicodeString(headerStart)); + + tabCount += 1; + write_tabs(out); + + write_utf8_file(out, UnicodeString(tool_start)); + printAttribute("tool-id", tool_id, (int32_t) uprv_strlen(tool_id)); + printAttribute("tool-name", tool_name, (int32_t) uprv_strlen(tool_name)); + write_utf8_file(out, UnicodeString("/>\n")); + + tabCount -= 1; + write_tabs(out); + + write_utf8_file(out, UnicodeString(headerEnd)); + + write_tabs(out); + tabCount += 1; + + write_utf8_file(out, UnicodeString(bodyStart)); + + + res_write_xml(bundle->fRoot, bundle->fLocale, lang, true, status); + + tabCount -= 1; + write_tabs(out); + + write_utf8_file(out, UnicodeString(bodyEnd)); + tabCount--; + write_tabs(out); + write_utf8_file(out, UnicodeString(fileEnd)); + tabCount--; + write_tabs(out); + write_utf8_file(out, UnicodeString(bundleEnd)); + T_FileStream_close(out); + + ucnv_close(conv); + +cleanup_bundle_write_xml: + uprv_free(originalFileName); + uprv_free(lang); + if(xmlfileName != nullptr) { + uprv_free(xmlfileName); + } + if(outputFileName != nullptr){ + uprv_free(outputFileName); + } +} |