summaryrefslogtreecommitdiffstats
path: root/intl/icu/source/tools/gensprep
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-28 14:29:10 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-28 14:29:10 +0000
commit2aa4a82499d4becd2284cdb482213d541b8804dd (patch)
treeb80bf8bf13c3766139fbacc530efd0dd9d54394c /intl/icu/source/tools/gensprep
parentInitial commit. (diff)
downloadfirefox-2aa4a82499d4becd2284cdb482213d541b8804dd.tar.xz
firefox-2aa4a82499d4becd2284cdb482213d541b8804dd.zip
Adding upstream version 86.0.1.upstream/86.0.1upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'intl/icu/source/tools/gensprep')
-rw-r--r--intl/icu/source/tools/gensprep/Makefile.in96
-rwxr-xr-xintl/icu/source/tools/gensprep/filterRFC3454.pl678
-rw-r--r--intl/icu/source/tools/gensprep/gensprep.8.in104
-rw-r--r--intl/icu/source/tools/gensprep/gensprep.c458
-rw-r--r--intl/icu/source/tools/gensprep/gensprep.h83
-rw-r--r--intl/icu/source/tools/gensprep/gensprep.vcxproj100
-rw-r--r--intl/icu/source/tools/gensprep/gensprep.vcxproj.filters30
-rw-r--r--intl/icu/source/tools/gensprep/store.c651
8 files changed, 2200 insertions, 0 deletions
diff --git a/intl/icu/source/tools/gensprep/Makefile.in b/intl/icu/source/tools/gensprep/Makefile.in
new file mode 100644
index 0000000000..3ed8cd123b
--- /dev/null
+++ b/intl/icu/source/tools/gensprep/Makefile.in
@@ -0,0 +1,96 @@
+## Makefile.in for ICU - tools/gensprep
+## Copyright (C) 2016 and later: Unicode, Inc. and others.
+## License & terms of use: http://www.unicode.org/copyright.html
+## Copyright (c) 2001-2011, International Business Machines Corporation and
+## others. All Rights Reserved.
+## Steven R. Loomis/Markus W. Scherer
+
+## Source directory information
+## (the @...@ tokens are placeholders; the Makefile rule further down shows
+## this file being regenerated from Makefile.in by config.status)
+srcdir = @srcdir@
+top_srcdir = @top_srcdir@
+
+## Relative path from this build directory to the top of the build tree
+top_builddir = ../..
+
+## Variables used below but not defined in this file (BINDIR, EXEEXT, RMV,
+## INSTALL, ...) presumably come from this include -- confirm in icudefs.mk
+include $(top_builddir)/icudefs.mk
+
+## Build directory information
+subdir = tools/gensprep
+
+## Base name shared by the executable and its man page
+TARGET_STUB_NAME = gensprep
+
+## Man page section; also used as the man page file extension
+SECTION = 8
+
+MAN_FILES = $(TARGET_STUB_NAME).$(SECTION)
+
+
+## Extra files to remove for 'make clean'
+CLEANFILES = *~ $(DEPS) $(MAN_FILES)
+
+## Target information
+TARGET = $(BINDIR)/$(TARGET_STUB_NAME)$(EXEEXT)
+
+CPPFLAGS += -I$(top_srcdir)/common -I$(srcdir)/../toolutil
+LIBS = $(LIBICUTOOLUTIL) $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS) $(LIB_M)
+
+OBJECTS = gensprep.o store.o
+
+## One .d dependency file per object; pulled in by the -include at the end of this file
+DEPS = $(OBJECTS:.o=.d)
+
+## List of phony targets
+.PHONY : all all-local install install-local clean clean-local \
+distclean distclean-local dist dist-local check check-local install-man
+
+## Clear suffix list
+.SUFFIXES :
+
+## List of standard targets
+## (each public target simply forwards to its *-local counterpart)
+all: all-local
+install: install-local
+clean: clean-local
+distclean : distclean-local
+dist: dist-local
+check: all check-local
+
+## Build the tool and its man page
+all-local: $(TARGET) $(MAN_FILES)
+
+## Install the executable into $(sbindir), honouring $(DESTDIR)
+install-local: all-local install-man
+ $(MKINSTALLDIRS) $(DESTDIR)$(sbindir)
+ $(INSTALL) $(TARGET) $(DESTDIR)$(sbindir)
+
+## Install the man page(s); install-man is phony, so $? names every prerequisite
+install-man: $(MAN_FILES)
+ $(MKINSTALLDIRS) $(DESTDIR)$(mandir)/man$(SECTION)
+ $(INSTALL_DATA) $? $(DESTDIR)$(mandir)/man$(SECTION)
+
+## Nothing to do for dist in this directory
+dist-local:
+
+## Remove generated files; the test guards against an empty CLEANFILES list
+clean-local:
+ test -z "$(CLEANFILES)" || $(RMV) $(CLEANFILES)
+ $(RMV) $(TARGET) $(OBJECTS)
+
+## distclean additionally removes the generated Makefile
+distclean-local: clean-local
+ $(RMV) Makefile
+
+## No tests here; check only makes sure everything builds
+check-local: all-local
+
+## Regenerate this Makefile when Makefile.in or config.status changes;
+## CONFIG_FILES limits config.status to this one output file
+Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
+ cd $(top_builddir) \
+ && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status
+
+## Link the tool from its object files
+$(TARGET) : $(OBJECTS)
+ $(LINK.cc) $(OUTOPT)$@ $^ $(LIBS)
+ $(POST_BUILD_STEP)
+
+
+## Generate the man page from its .in template, again through config.status
+%.$(SECTION): $(srcdir)/%.$(SECTION).in
+ cd $(top_builddir) \
+ && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status
+
+
+## Include the dependency files when no goal was given, or when at least one
+## requested goal does not end in "clean" (patsubst deletes the *clean goals)
+ifeq (,$(MAKECMDGOALS))
+-include $(DEPS)
+else
+ifneq ($(patsubst %clean,,$(MAKECMDGOALS)),)
+-include $(DEPS)
+endif
+endif
+
diff --git a/intl/icu/source/tools/gensprep/filterRFC3454.pl b/intl/icu/source/tools/gensprep/filterRFC3454.pl
new file mode 100755
index 0000000000..321b03512c
--- /dev/null
+++ b/intl/icu/source/tools/gensprep/filterRFC3454.pl
@@ -0,0 +1,678 @@
+#!/usr/bin/perl
+# Copyright (C) 2016 and later: Unicode, Inc. and others.
+# License & terms of use: http://www.unicode.org/copyright.html
+# Copyright (c) 2001-2015 International Business Machines
+# Corporation and others. All Rights Reserved.
+
+####################################################################################
+# filterRFC3454.pl:
+# This tool filters the RFC-3454 txt file for StringPrep tables and creates a table
+# to be used in NamePrepProfile
+#
+# Author: Ram Viswanadha
+#
+####################################################################################
+
+use File::Find;
+use File::Basename;
+use IO::File;
+use Cwd;
+use File::Copy;
+use Getopt::Long;
+use File::Path;
+use File::Copy;
+use Time::localtime;
+
+# printf template for the ICU copyright banner; %d receives the current year
+$icu_copyright = "#####################################################################\n# Copyright (c) %d, International Business Machines Corporation and\n# others. All Rights Reserved.\n#####################################################################\n\n";
+# banner attributing the generated tables to RFC 3454
+$copyright = "###################\n# This file was generated from RFC 3454 (http://www.ietf.org/rfc/rfc3454.txt)\n# Copyright (C) The Internet Society (2002). All Rights Reserved. \n###################\n\n";
+# banner recording the command line; @ARGV is interpolated right here, before main() parses the options
+$warning = "###################\n# WARNING: This table is generated by filterRFC3454.pl tool with\n# options: @ARGV \n###################\n\n";
+# run the program
+main();
+
+#---------------------------------------------------------------------
+# The main program
+#
+# Parses the command line, writes the banners and directives to a fresh
+# destination file, then scans the RFC text for "Start Table" lines and
+# hands each selected table to the matching create*() routine. The
+# profile-specific tables (--iscsi, --xmpp-node, --sasl, --ldap) are
+# appended after the scan. All variables here are package globals.
+
+sub main(){
+ GetOptions(
+ "--sourcedir=s" => \$sourceDir,
+ "--destdir=s" => \$destDir,
+ "--src-filename=s" => \$srcFileName,
+ "--dest-filename=s" => \$destFileName,
+ "--A1" => \$a1,
+ "--B1" => \$b1,
+ "--B2" => \$b2,
+ "--B3" => \$b3,
+ "--C11" => \$c11,
+ "--C12" => \$c12,
+ "--C21" => \$c21,
+ "--C22" => \$c22,
+ "--C3" => \$c3,
+ "--C4" => \$c4,
+ "--C5" => \$c5,
+ "--C6" => \$c6,
+ "--C7" => \$c7,
+ "--C8" => \$c8,
+ "--C9" => \$c9,
+ "--iscsi" => \$writeISCSIProhibitedExtra,
+ "--xmpp-node" => \$writeXMPPNodeProhibitedExtra,
+ "--sasl" => \$writeSASLMap,
+ "--ldap" => \$writeLDAPMap,
+ "--normalize" => \$norm,
+ "--check-bidi" => \$checkBidi,
+ );
+ # all four path options are mandatory; usage() does not return
+ usage() unless defined $sourceDir;
+ usage() unless defined $destDir;
+ usage() unless defined $srcFileName;
+ usage() unless defined $destFileName;
+
+ $infile = $sourceDir."/".$srcFileName;
+ $inFH = IO::File->new($infile,"r")
+ or die "could not open the file $infile for reading: $! \n";
+ $outfile = $destDir."/".$destFileName;
+
+ # start from a fresh file: every writer (here and in the create*() subs)
+ # opens the destination in append mode
+ unlink($outfile);
+ $outFH = IO::File->new($outfile,"a")
+ or die "could not open the file $outfile for writing: $! \n";
+
+ printf $outFH $icu_copyright, localtime->year()+1900;
+ print $outFH $copyright;
+ print $outFH $warning;
+
+ # optional directives, written ahead of the tables
+ if(defined $norm) {
+ print $outFH "\@normalize;;\n";
+ }
+ if(defined $checkBidi) {
+ print $outFH "\@check-bidi;;\n";
+ }
+ print $outFH "\n";
+ close($outFH);
+
+ # NOTE(review): this check runs after the header has already been written,
+ # so a partial destination file is left behind when it fires
+ if(defined $b2 && defined $b3){
+ die "ERROR: --B2 and --B3 are both specified\!\n";
+ }
+
+ # Scan for table start markers. Each create*() call reads on from $inFH
+ # up to the table's "End Table" line (see readPrint).
+ # NOTE: the '.' in the patterns below is an unescaped regex wildcard.
+ while(defined ($line=<$inFH>)){
+ next unless $line=~ /Start\sTable/;
+ # NOTE(review): Table A.1 is emitted whenever it is found; $a1 (--A1)
+ # is not consulted here
+ if($line =~ /A.1/){
+ createUnassignedTable($inFH,$outfile);
+ }
+ if($line =~ /B.1/ && defined $b1){
+ createMapToNothing($inFH,$outfile);
+ }
+ if($line =~ /B.2/ && defined $b2){
+ createCaseMapNorm($inFH,$outfile);
+ }
+ if($line =~ /B.3/ && defined $b3){
+ createCaseMapNoNorm($inFH,$outfile);
+ }
+ # the C tables share one writer; the start line is passed along so it
+ # can be turned into the table's heading comment
+ if($line =~ /C.1.1/ && defined $c11 ){
+ createProhibitedTable($inFH,$outfile,$line);
+ }
+ if($line =~ /C.1.2/ && defined $c12 ){
+ createProhibitedTable($inFH,$outfile,$line);
+ }
+ if($line =~ /C.2.1/ && defined $c21 ){
+ createProhibitedTable($inFH,$outfile,$line);
+ }
+ if($line =~ /C.2.2/ && defined $c22 ){
+ createProhibitedTable($inFH,$outfile,$line);
+ }
+ if($line =~ /C.3/ && defined $c3 ){
+ createProhibitedTable($inFH,$outfile,$line);
+ }
+ if($line =~ /C.4/ && defined $c4 ){
+ createProhibitedTable($inFH,$outfile,$line);
+ }
+ if($line =~ /C.5/ && defined $c5 ){
+ createProhibitedTable($inFH,$outfile,$line);
+ }
+ if($line =~ /C.6/ && defined $c6 ){
+ createProhibitedTable($inFH,$outfile,$line);
+ }
+ if($line =~ /C.7/ && defined $c7 ){
+ createProhibitedTable($inFH,$outfile,$line);
+ }
+ if($line =~ /C.8/ && defined $c8 ){
+ createProhibitedTable($inFH,$outfile,$line);
+ }
+ if($line =~ /C.9/ && defined $c9 ){
+ createProhibitedTable($inFH,$outfile,$line);
+ }
+ }
+ # profile-specific tables are hard-coded in their writers and appended
+ # after everything taken from the RFC text
+ if( defined $writeISCSIProhibitedExtra){
+ create_iSCSIExtraProhibitedTable($inFH, $outfile);
+ }
+ if( defined $writeXMPPNodeProhibitedExtra){
+ create_XMPPNodeExtraProhibitedTable($inFH, $outfile);
+ }
+ if( defined $writeSASLMap){
+ create_SASLMapTable($inFH, $outfile);
+ }
+ if( defined $writeLDAPMap){
+ create_LDAPMapTable($inFH, $outfile);
+ }
+ close($inFH);
+}
+
+#-----------------------------------------------------------------------
+sub readPrint{
+ # Copy one RFC table from $inFH to $outFH, rewriting each entry for the
+ # selected table type and finishing with a "Total code points" line.
+ # $inFH - input handle; reading continues from its current position
+ # $outFH - output handle the rewritten lines are printed to
+ # $comment - heading printed before the table
+ # $table - table selector matched below: "A", "B.1", "B.2", "B.3" or "C"
+ # Returns when an "End Table" line is read (or silently at end of input).
+ # The four parameters are dynamically scoped via local(); $count, $line,
+ # $code, $noise, $start, $end etc. are package globals.
+ local ($inFH, $outFH,$comment, $table) = @_;
+ $count = 0;
+ print $outFH $comment."\n";
+ while(defined ($line = <$inFH>)){
+ next if $line =~ /Hoffman\s\&\sBlanchet/; # ignore heading
+ next if $line =~ /RFC\s3454/; # ignore heading
+ next if $line =~ /\f/; # ignore form feed
+ next if $line eq "\n"; # ignore blank lines
+ # break if "End Table" is found
+ if( $line =~ /End\sTable/){
+ print $outFH "\n# Total code points $count\n\n";
+ return;
+ }
+ # $print is not assigned anywhere in the visible code -- presumably a
+ # debugging switch that echoes the raw line to STDOUT
+ if($print==1){
+ print $line;
+ }
+ # turn the first '-' into the '..' range separator, then drop the
+ # leading indentation
+ $line =~ s/-/../;
+ $line =~ s/^\s+//;
+ # make sure there is at least one ';' so the splits below find a field
+ if($line =~ /\;/){
+ }else{
+ $line =~ s/$/;/;
+ }
+ if($table =~ /A/ ){
+ # keep only the code point (or range) and tag it
+ ($code, $noise) = split /;/ , $line;
+ $line = $code."; ; UNASSIGNED\n";
+ }elsif ( $table =~ /B\.1/ ){
+ $line =~ s/Map to nothing/MAP/;
+ }elsif ( $table =~ /B\.[23]/ ){
+ $line =~ s/Case map/MAP/;
+ $line =~ s/Additional folding/MAP/;
+ }elsif ( $table =~ /C/ ) {
+ # keep only the code point (or range) and tag it
+ ($code, $noise) = split /;/ , $line;
+ $line = $code."; ; PROHIBITED\n";
+ }
+ # count the code points: a range contributes every value from start
+ # to end inclusive (as converted by atoi), anything else counts once
+ if($line =~ /\.\./){
+ ($code, $noise) = split /;/ , $line;
+ ($startStr, $endStr ) = split /\.\./, $code;
+ $start = atoi($startStr);
+ $end = atoi($endStr);
+ #print $start." ".$end."\n";
+ while($start <= $end){
+ $count++;
+ $start++;
+ }
+ }else{
+ $count++;
+ }
+ print $outFH $line;
+ }
+}
+#-----------------------------------------------------------------------
+sub atoi {
+    # Convert a string of hexadecimal digits (e.g. "00AD") to its numeric
+    # value. Characters that are not hex digits (stray whitespace and the
+    # like) are ignored; an empty or undefined argument yields 0.
+    #
+    # The earlier digit-by-digit loop added each character numerically, so
+    # the letters A-F counted as 0 and the sizes of ranges such as
+    # 00A0..00FF were computed incorrectly.
+    my ($hexString) = @_;
+    $hexString = "" unless defined $hexString;
+    $hexString =~ s/[^0-9A-Fa-f]//g;
+    return hex($hexString);
+}
+#-----------------------------------------------------------------------
+sub createUnassignedTable{
+    # Append RFC 3454 Table A.1 to $outfile; readPrint() copies the entries
+    # from the current position of $inFH and tags them UNASSIGNED.
+    $inFH = shift;
+    $outfile = shift;
+    unless ($outFH = IO::File->new($outfile, "a")) {
+        die "could not open the file $outfile for writing: $! \n";
+    }
+    $comment = "# This table contains code points from Table A.1 from RFC 3454\n";
+    readPrint($inFH, $outFH, $comment, "A");
+    close($outFH);
+}
+#-----------------------------------------------------------------------
+sub createMapToNothing{
+    # Append RFC 3454 Table B.1 to $outfile; readPrint() copies the entries
+    # from the current position of $inFH, rewriting "Map to nothing" to MAP.
+    $inFH = shift;
+    $outfile = shift;
+    unless ($outFH = IO::File->new($outfile, "a")) {
+        die "could not open the file $outfile for writing: $! \n";
+    }
+    $comment = "# This table contains code points from Table B.1 from RFC 3454\n";
+    readPrint($inFH, $outFH, $comment, "B.1");
+    close($outFH);
+}
+#-----------------------------------------------------------------------
+sub createCaseMapNorm{
+    # Append RFC 3454 Table B.2 to $outfile. The heading repeats the global
+    # $warning banner ahead of the table description.
+    $inFH = shift;
+    $outfile = shift;
+    unless ($outFH = IO::File->new($outfile, "a")) {
+        die "could not open the file $outfile for writing: $! \n";
+    }
+    $comment = $warning . "# This table contains code points from Table B.2 from RFC 3454\n";
+    readPrint($inFH, $outFH, $comment, "B.2");
+    close($outFH);
+}
+#-----------------------------------------------------------------------
+sub createCaseMapNoNorm{
+    # Append RFC 3454 Table B.3 to $outfile. The heading repeats the global
+    # $warning banner ahead of the table description.
+    $inFH = shift;
+    $outfile = shift;
+    unless ($outFH = IO::File->new($outfile, "a")) {
+        die "could not open the file $outfile for writing: $! \n";
+    }
+    $comment = $warning . "# This table contains code points from Table B.3 from RFC 3454\n";
+    readPrint($inFH, $outFH, $comment, "B.3");
+    close($outFH);
+}
+#-----------------------------------------------------------------------
+sub createProhibitedTable{
+    # Append one of the RFC 3454 C tables to $outfile. The third argument is
+    # the table's "Start Table" line; with the word "Start" and all dashes
+    # removed it becomes the heading comment. readPrint() tags every entry
+    # PROHIBITED.
+    $inFH = shift;
+    $outfile = shift;
+    $line = shift;
+    $line =~ s/Start//;
+    $line =~ tr/-//d;
+    $comment = "# code points from " . $line;
+
+    unless ($outFH = IO::File->new($outfile, "a")) {
+        die "could not open the file $outfile for writing: $! \n";
+    }
+    readPrint($inFH, $outFH, $comment, "C");
+    close($outFH);
+}
+
+#-----------------------------------------------------------------------
+sub create_iSCSIExtraProhibitedTable{
+    # Append the additional prohibited code points of the iSCSI profile
+    # (rfc3722.txt) to $outfile. The table is hard-coded; $inFH is accepted
+    # but never read.
+    ($inFH, $outfile, $line) = @_;
+    $comment = "# Additional prohibitions from iSCSI profile (rfc3722.txt)\n\n";
+
+    unless ($outFH = IO::File->new($outfile, "a")) {
+        die "could not open the file $outfile for writing: $! \n";
+    }
+    # single code points and ranges, in output order
+    my @prohibited = ("0021..002C", "002F", "003B..0040",
+                      "005B..0060", "007B..007E", "3002");
+    print $outFH $comment;
+    print $outFH "$_; ; PROHIBITED\n" foreach @prohibited;
+    print $outFH "\n# Total code points 30\n";
+    close($outFH);
+}
+#-----------------------------------------------------------------------
+sub create_XMPPNodeExtraProhibitedTable{
+    # Append the additional prohibited code points of the XMPP Nodeprep
+    # profile (rfc3920.txt) to $outfile. The table is hard-coded; $inFH is
+    # accepted but never read.
+    ($inFH, $outfile, $line) = @_;
+    $comment = "# Additional prohibitions from XMPP Nodeprep profile (rfc3920.txt)\n\n";
+
+    unless ($outFH = IO::File->new($outfile, "a")) {
+        die "could not open the file $outfile for writing: $! \n";
+    }
+    # code points in output order
+    my @prohibited = ("0022", "0026", "0027", "002F",
+                      "003A", "003C", "003E", "0040");
+    print $outFH $comment;
+    print $outFH "$_; ; PROHIBITED\n" foreach @prohibited;
+    print $outFH "\n# Total code points 8\n";
+    close($outFH);
+}
+#-----------------------------------------------------------------------
+sub create_SASLMapTable{
+    # Append the mapping table of the SASL profile (rfc4013.txt) to $outfile.
+    # The table is hard-coded; $inFH is accepted but never read. Code points
+    # are printed as upper-case hex, at least four digits wide.
+    ($inFH, $outfile, $line) = @_;
+    $comment = "# Map table for SASL profile (rfc4013.txt)\n\n";
+
+    unless ($outFH = IO::File->new($outfile, "a")) {
+        die "could not open the file $outfile for writing: $! \n";
+    }
+    print $outFH $comment;
+
+    # non-ASCII space characters [C.1.2] (and U+200B) to SPACE
+    my @toSpace = (0x00A0, 0x1680, 0x2000..0x200B, 0x202F, 0x205F, 0x3000);
+    printf $outFH "%04X; 0020; MAP\n", $_ foreach @toSpace;
+
+    # commonly mapped to nothing characters except U+200B to nothing
+    my @toNothing = (0x00AD, 0x034F, 0x1806, 0x180B..0x180D, 0x200C, 0x200D,
+                     0x2060, 0xFE00..0xFE0F, 0xFEFF);
+    printf $outFH "%04X; ; MAP\n", $_ foreach @toNothing;
+
+    print $outFH "\n# Total code points 43\n";
+    close($outFH);
+}
+#-----------------------------------------------------------------------
+sub create_LDAPMapTable{
+    # Append the mapping table of the LDAP profile (rfc4518.txt) to $outfile.
+    # The table is hard-coded; $inFH is accepted but never read. Each section
+    # below is a mapping target followed by its code points; an empty target
+    # means "map to nothing". Sections and code points are written in the
+    # order listed, as upper-case hex at least four digits wide.
+    ($inFH, $outfile, $line) = @_;
+    $comment = "# Map table for LDAP profile (rfc4518.txt)\n\n";
+
+    unless ($outFH = IO::File->new($outfile, "a")) {
+        die "could not open the file $outfile for writing: $! \n";
+    }
+    print $outFH $comment;
+
+    my @sections = (
+        # SOFT HYPHEN (U+00AD) and MONGOLIAN TODO SOFT HYPHEN (U+1806) code
+        # points are mapped to nothing. COMBINING GRAPHEME JOINER (U+034F) and
+        # VARIATION SELECTORs (U+180B-180D, FE00-FE0F) code points are also
+        # mapped to nothing. The OBJECT REPLACEMENT CHARACTER (U+FFFC) is
+        # mapped to nothing.
+        [ "", 0x00AD, 0x034F, 0x1806, 0x180B..0x180D, 0xFE00..0xFE0F, 0xFFFC ],
+
+        # CHARACTER TABULATION (U+0009), LINE FEED (LF) (U+000A), LINE
+        # TABULATION (U+000B), FORM FEED (FF) (U+000C), CARRIAGE RETURN (CR)
+        # (U+000D), and NEXT LINE (NEL) (U+0085) are mapped to SPACE (U+0020).
+        [ "0020", 0x0009..0x000D, 0x0085 ],
+
+        # All other control code (e.g., Cc) points or code points with a
+        # control function (e.g., Cf) are mapped to nothing: U+0000-0008,
+        # 000E-001F, 007F-0084, 0086-009F, 06DD, 070F, 180E, 200C-200F,
+        # 202A-202E, 2060-2063, 206A-206F, FEFF, FFF9-FFFB, 1D173-1D17A,
+        # E0001, E0020-E007F.
+        [ "", 0x0000..0x0008, 0x000E..0x001F, 0x007F..0x0084, 0x0086..0x009F,
+              0x06DD, 0x070F, 0x180E, 0x200C..0x200F, 0x202A..0x202E,
+              0x2060..0x2063, 0x206A..0x206F, 0xFEFF, 0xFFF9..0xFFFB,
+              0x1D173..0x1D17A, 0xE0001, 0xE0020..0xE007F ],
+
+        # ZERO WIDTH SPACE (U+200B) is mapped to nothing.
+        [ "", 0x200B ],
+
+        # All other code points with Separator (space, line, or paragraph)
+        # property (e.g., Zs, Zl, or Zp) are mapped to SPACE (U+0020):
+        # 00A0, 1680, 2000-200A, 2028-2029, 202F, 205F, 3000.
+        [ "0020", 0x00A0, 0x1680, 0x2000..0x200A, 0x2028, 0x2029,
+                  0x202F, 0x205F, 0x3000 ],
+    );
+    foreach my $section (@sections) {
+        my ($mapping, @codePoints) = @$section;
+        printf $outFH "%04X; %s; MAP\n", $_, $mapping foreach @codePoints;
+    }
+
+    print $outFH "\n# Total code points 238\n";
+    close($outFH);
+}
+#-----------------------------------------------------------------------
+sub usage {
+ # Print the command-line help and terminate the script.
+ # In this script usage() is only called when one of the four required
+ # options is missing, so the exit status is non-zero to let calling
+ # scripts and build steps detect the failed invocation.
+ print << "END";
+Usage:
+filterRFC3454.pl
+Options:
+ --sourcedir=<directory>
+ --destdir=<directory>
+ --src-filename=<name of RFC file>
+ --dest-filename=<name of destination file>
+ --A1 Generate data for table A.1
+ --B1 Generate data for table B.1
+ --B2 Generate data for table B.2
+ --B3 Generate data for table B.3
+ --C11 Generate data for table C.1.1
+ --C12 Generate data for table C.1.2
+ --C21 Generate data for table C.2.1
+ --C22 Generate data for table C.2.2
+ --C3 Generate data for table C.3
+ --C4 Generate data for table C.4
+ --C5 Generate data for table C.5
+ --C6 Generate data for table C.6
+ --C7 Generate data for table C.7
+ --C8 Generate data for table C.8
+ --C9 Generate data for table C.9
+ --iscsi Generate data for iSCSI extra prohibited table
+ --xmpp-node Generate data for XMPP extra prohibited table
+ --sasl Generate data for SASL map table
+ --ldap Generate data for LDAP map table
+ --normalize Embed the normalization directive in the output file
+ --check-bidi Embed the check bidi directive in the output file
+
+Note, --B2 and --B3 are mutually exclusive.
+
+e.g.: filterRFC3454.pl --sourcedir=. --destdir=./output --src-filename=rfc3454.txt --dest-filename=NamePrepProfile.txt --A1 --B1 --B2 --C12 --C22 --C3 --C4 --C5 --C6 --C7 --C8 --C9 --normalize --check-bidi
+
+filterRFC3454.pl filters the RFC file and creates String prep table files.
+The RFC text can be downloaded from ftp://ftp.rfc-editor.org/in-notes/rfc3454.txt
+
+END
+ exit(1);
+}
+
+
diff --git a/intl/icu/source/tools/gensprep/gensprep.8.in b/intl/icu/source/tools/gensprep/gensprep.8.in
new file mode 100644
index 0000000000..e1e9fb32e2
--- /dev/null
+++ b/intl/icu/source/tools/gensprep/gensprep.8.in
@@ -0,0 +1,104 @@
+.\" Hey, Emacs! This is -*-nroff-*- you know...
+.\"
+.\" gensprep.8: manual page for the gensprep utility
+.\"
+.\" Copyright (C) 2016 and later: Unicode, Inc. and others.
+.\" License & terms of use: http://www.unicode.org/copyright.html
+.\" Copyright (C) 2003 IBM, Inc. and others.
+.\"
+.TH gensprep 8 "18 March 2003" "ICU MANPAGE" "ICU @VERSION@ Manual"
+.SH NAME
+.B gensprep
+\- compile StringPrep data from files filtered by filterRFC3454.pl
+.SH SYNOPSIS
+.B gensprep
+[
+.BR "\-h\fP, \fB\-?\fP, \fB\-\-help"
+]
+[
+.BR "\-v\fP, \fB\-\-verbose"
+]
+[
+.BI "\-c\fP, \fB\-\-copyright"
+]
+[
+.BI "\-s\fP, \fB\-\-sourcedir" " source"
+]
+[
+.BI "\-d\fP, \fB\-\-destdir" " destination"
+]
+.SH DESCRIPTION
+.B gensprep
+reads filtered RFC 3454 files and compiles their
+information into a binary form.
+The resulting file,
+.BR <name>.icu ,
+can then be read directly by ICU, or used by
+.BR pkgdata (8)
+for incorporation into a larger archive or library.
+.LP
+The files read by
+.B gensprep
+are described in the
+.B FILES
+section.
+.SH OPTIONS
+.TP
+.BR "\-h\fP, \fB\-?\fP, \fB\-\-help"
+Print help about usage and exit.
+.TP
+.BR "\-v\fP, \fB\-\-verbose"
+Display extra informative messages during execution.
+.TP
+.BI "\-c\fP, \fB\-\-copyright"
+Include a copyright notice into the binary data.
+.TP
+.BI "\-s\fP, \fB\-\-sourcedir" " source"
+Set the source directory to
+.IR source .
+The default source directory is specified by the environment variable
+.BR ICU_DATA .
+.TP
+.BI "\-d\fP, \fB\-\-destdir" " destination"
+Set the destination directory to
+.IR destination .
+The default destination directory is specified by the environment variable
+.BR ICU_DATA .
+.SH ENVIRONMENT
+.TP 10
+.B ICU_DATA
+Specifies the directory containing ICU data. Defaults to
+.BR @thepkgicudatadir@/@PACKAGE@/@VERSION@/ .
+Some tools in ICU depend on the presence of the trailing slash. It is thus
+important to make sure that it is present if
+.B ICU_DATA
+is set.
+.SH FILES
+The following files are read by
+.B gensprep
+and are looked for in the
+.I source
+/misc for rfc3454_*.txt files and in
+.I source
+/unidata for NormalizationCorrections.txt.
+.TP 20
+.B rfc3454_A_1.txt
+Contains the list of unassigned codepoints in Unicode version 3.2.0.\|.\|..
+.TP
+.B rfc3454_B_1.txt
+Contains the list of code points that are commonly mapped to nothing.\|.\|..
+.TP
+.B rfc3454_B_2.txt
+Contains the list of mappings for casefolding of code points when Normalization form NFKC is specified.\|.\|..
+.TP
+.B rfc3454_C_X.txt
+Contains the list of code points that are prohibited for IDNA.
+.TP
+.B NormalizationCorrections.txt
+Contains the list of code points whose normalization has changed since Unicode Version 3.2.0.
+.SH VERSION
+@VERSION@
+.SH COPYRIGHT
+Copyright (C) 2000-2002 IBM, Inc. and others.
+.SH SEE ALSO
+.BR pkgdata (8)
diff --git a/intl/icu/source/tools/gensprep/gensprep.c b/intl/icu/source/tools/gensprep/gensprep.c
new file mode 100644
index 0000000000..a78a5f3e56
--- /dev/null
+++ b/intl/icu/source/tools/gensprep/gensprep.c
@@ -0,0 +1,458 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/*
+*******************************************************************************
+*
+* Copyright (C) 2003-2016, International Business Machines
+* Corporation and others. All Rights Reserved.
+*
+*******************************************************************************
+* file name: gensprep.c
+* encoding: UTF-8
+* tab size: 8 (not used)
+* indentation:4
+*
+* created on: 2003-02-06
+* created by: Ram Viswanadha
+*
+* This program reads the Profile.txt files,
+* parses them, and extracts the data for StringPrep profile.
+* It then preprocesses it and writes a binary file for efficient use
+* in various StringPrep conversion processes.
+*/
+
+#define USPREP_TYPE_NAMES_ARRAY 1
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "cmemory.h"
+#include "cstring.h"
+#include "unewdata.h"
+#include "uoptions.h"
+#include "uparse.h"
+#include "sprpimpl.h"
+
+#include "unicode/uclean.h"
+#include "unicode/udata.h"
+#include "unicode/utypes.h"
+#include "unicode/putil.h"
+
+
+U_CDECL_BEGIN
+#include "gensprep.h"
+U_CDECL_END
+
+/*
+ * Global flags, declared extern in gensprep.h.
+ * main() overwrites both from the -v and -c command-line options.
+ */
+UBool beVerbose=FALSE, haveCopyright=TRUE;
+
+/* File name appended to the directory given with -n or -m. */
+#define NORM_CORRECTIONS_FILE_NAME "NormalizationCorrections.txt"
+
+/*
+ * Directives recognized after a leading '@' in field 0 of a profile line.
+ * Each *_LEN must equal the length of its string; the parser uses it as the
+ * uprv_strncmp() count.
+ */
+#define NORMALIZE_DIRECTIVE "normalize"
+#define NORMALIZE_DIRECTIVE_LEN 9
+#define CHECK_BIDI_DIRECTIVE "check-bidi"
+#define CHECK_BIDI_DIRECTIVE_LEN 10
+
+/* prototypes --------------------------------------------------------------- */
+
+static void
+parseMappings(const char *filename, UBool reportError, UErrorCode *pErrorCode);
+
+static void
+parseNormalizationCorrections(const char *filename, UErrorCode *pErrorCode);
+
+
+/* -------------------------------------------------------------------------- */
+
+/*
+ * Command-line option table handed to u_parseArgs().
+ * The entries are addressed through the enum that follows, so the order of
+ * this array and of the enum must be kept in sync.
+ */
+static UOption options[]={
+ UOPTION_HELP_H,
+ UOPTION_HELP_QUESTION_MARK,
+ UOPTION_VERBOSE,
+ UOPTION_COPYRIGHT,
+ UOPTION_DESTDIR,
+ UOPTION_SOURCEDIR,
+ UOPTION_ICUDATADIR,
+ UOPTION_BUNDLE_NAME,
+ { "normalization", NULL, NULL, NULL, 'n', UOPT_REQUIRES_ARG, 0 },
+ { "norm-correction", NULL, NULL, NULL, 'm', UOPT_REQUIRES_ARG, 0 },
+ { "check-bidi", NULL, NULL, NULL, 'k', UOPT_NO_ARG, 0},
+ { "unicode", NULL, NULL, NULL, 'u', UOPT_REQUIRES_ARG, 0 },
+};
+
+/* Indexes into options[]; one enumerator per array entry, in the same order. */
+enum{
+ HELP,
+ HELP_QUESTION_MARK,
+ VERBOSE,
+ COPYRIGHT,
+ DESTDIR,
+ SOURCEDIR,
+ ICUDATADIR,
+ BUNDLE_NAME,
+ NORMALIZE, /* -n: also turned on by an @normalize directive in the input file */
+ NORM_CORRECTION_DIR, /* -m: main() does not preset its value */
+ CHECK_BIDI, /* -k: also turned on by an @check-bidi directive in the input file */
+ UNICODE_VERSION
+};
+
+/*
+ * Print the usage text to stderr.
+ *
+ * argc  value returned by u_parseArgs(); negative when an argument was bad
+ * argv  argument vector; argv[0] is printed as the program name
+ *
+ * Returns U_ILLEGAL_ARGUMENT_ERROR when argc is negative, U_ZERO_ERROR
+ * otherwise.  The result is meant to be used as the process exit status.
+ */
+static int printHelp(int argc, char* argv[]){
+    /*
+     * Broken into chunks because the C89 standard says the minimum
+     * required supported string length is 509 bytes.
+     */
+    fprintf(stderr,
+        "Usage: %s [-options] [file_name]\n"
+        "\n"
+        "Read the files specified and\n"
+        "create a binary file [package-name]_[bundle-name]." DATA_TYPE " with the StringPrep profile data\n"
+        "\n",
+        argv[0]);
+    fprintf(stderr,
+        "Options:\n"
+        "\t-h or -? or --help print this usage text\n"
+        "\t-v or --verbose verbose output\n"
+        "\t-c or --copyright include a copyright notice\n");
+    fprintf(stderr,
+        "\t-d or --destdir destination directory, followed by the path\n"
+        "\t-s or --sourcedir source directory of ICU data, followed by the path\n"
+        "\t-b or --bundle-name generate the output data file with the name specified\n"
+        "\t-i or --icudatadir directory for locating any needed intermediate data files,\n"
+        "\t followed by path, defaults to %s\n",
+        u_getDataDirectory());
+    /* The long option is registered as "normalization" in options[]. */
+    fprintf(stderr,
+        "\t-n or --normalization turn on the option for normalization and include mappings\n"
+        "\t from NormalizationCorrections.txt from the given path,\n"
+        "\t e.g: /test/icu/source/data/unidata\n");
+    fprintf(stderr,
+        "\t-m or --norm-correction use NormalizationCorrections.txt from the given path\n"
+        "\t when the input file contains a normalization directive.\n"
+        "\t unlike -n/--normalization, this option does not force the\n"
+        "\t normalization.\n");
+    fprintf(stderr,
+        "\t-k or --check-bidi turn on the option for checking for BiDi in the profile\n"
+        "\t-u or --unicode version of Unicode to be used with this profile followed by the version\n"
+        );
+    return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
+}
+
+
+/*
+ * Program entry point.
+ *
+ * Parses the command line, reads the profile file named by the first
+ * non-option argument (relative to the source directory), optionally reads
+ * NormalizationCorrections.txt, and writes the binary profile through
+ * generateData().
+ *
+ * Returns a UErrorCode value as the process exit status.
+ * NOTE(review): when the input file or the -u option is missing, the value
+ * returned is whatever printHelp() yields for a non-negative argc, i.e.
+ * U_ZERO_ERROR.
+ */
+extern int
+main(int argc, char* argv[]) {
+#if !UCONFIG_NO_IDNA
+    char* filename = NULL;
+#endif
+    const char *srcDir=NULL, *destDir=NULL, *icuUniDataDir=NULL;
+    const char *bundleName=NULL, *inputFileName = NULL;
+    char *basename=NULL;
+    int32_t sprepOptions = 0;
+
+    UErrorCode errorCode=U_ZERO_ERROR;
+
+    U_MAIN_INIT_ARGS(argc, argv);
+
+    /* preset then read command line options */
+    options[DESTDIR].value=u_getDataDirectory();
+    options[SOURCEDIR].value="";
+    options[UNICODE_VERSION].value="0"; /* don't assume the unicode version */
+    options[BUNDLE_NAME].value = DATA_NAME;
+    options[NORMALIZE].value = "";
+
+    argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options);
+
+    /* error handling, printing usage message */
+    if(argc<0) {
+        fprintf(stderr,
+            "error in command line argument \"%s\"\n",
+            argv[-argc]);
+    }
+    if(argc<0 || options[HELP].doesOccur || options[HELP_QUESTION_MARK].doesOccur) {
+        return printHelp(argc, argv);
+    }
+
+    /* get the options values */
+    beVerbose=options[VERBOSE].doesOccur;
+    haveCopyright=options[COPYRIGHT].doesOccur;
+    srcDir=options[SOURCEDIR].value;
+    destDir=options[DESTDIR].value;
+    bundleName = options[BUNDLE_NAME].value;
+    if(options[NORMALIZE].doesOccur) {
+        icuUniDataDir = options[NORMALIZE].value;
+    } else {
+        /* stays NULL when -m was not given; checked before use below */
+        icuUniDataDir = options[NORM_CORRECTION_DIR].value;
+    }
+
+    if(argc<2) {
+        /* print the help message */
+        return printHelp(argc, argv);
+    } else {
+        inputFileName = argv[1];
+    }
+    if(!options[UNICODE_VERSION].doesOccur){
+        return printHelp(argc, argv);
+    }
+    if(options[ICUDATADIR].doesOccur) {
+        u_setDataDirectory(options[ICUDATADIR].value);
+    }
+#if UCONFIG_NO_IDNA
+
+    fprintf(stderr,
+        "gensprep writes dummy " U_ICUDATA_NAME "_" DATA_NAME "." DATA_TYPE
+        " because UCONFIG_NO_IDNA is set, \n"
+        "see icu/source/common/unicode/uconfig.h\n");
+    generateData(destDir, bundleName);
+
+#else
+
+    setUnicodeVersion(options[UNICODE_VERSION].value);
+    /*
+     * One buffer is reused for both paths built below.  The extra 40 bytes
+     * cover "./", the separators, NORM_CORRECTIONS_FILE_NAME and the NUL.
+     */
+    filename = (char* ) uprv_malloc(uprv_strlen(srcDir) + uprv_strlen(inputFileName) + (icuUniDataDir == NULL ? 0 : uprv_strlen(icuUniDataDir)) + 40);
+    if(filename == NULL) {
+        fprintf(stderr, "gensprep: out of memory\n");
+        return U_MEMORY_ALLOCATION_ERROR;
+    }
+
+    /* prepare the filename beginning with the source dir */
+    if(uprv_strchr(srcDir,U_FILE_SEP_CHAR) == NULL && uprv_strchr(srcDir,U_FILE_ALT_SEP_CHAR) == NULL){
+        filename[0] = '.';
+        filename[1] = U_FILE_SEP_CHAR;
+        uprv_strcpy(filename+2,srcDir);
+    }else{
+        uprv_strcpy(filename, srcDir);
+    }
+
+    basename=filename+uprv_strlen(filename);
+    if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) {
+        *basename++=U_FILE_SEP_CHAR;
+    }
+
+    /* initialize */
+    init();
+
+    /* process the file */
+    uprv_strcpy(basename,inputFileName);
+    parseMappings(filename,FALSE, &errorCode);
+    if(U_FAILURE(errorCode)) {
+        fprintf(stderr, "Could not open file %s for reading. Error: %s \n", filename, u_errorName(errorCode));
+        uprv_free(filename);
+        return errorCode;
+    }
+
+    if(options[NORMALIZE].doesOccur){ /* this option might be set by @normalize;; in the source file */
+        /*
+         * The directive can turn normalization on although neither -n nor -m
+         * supplied a directory; fail cleanly instead of copying from NULL.
+         */
+        if(icuUniDataDir == NULL) {
+            fprintf(stderr,
+                "gensprep: normalization was requested but no directory for "
+                NORM_CORRECTIONS_FILE_NAME " was given (use -n or -m)\n");
+            uprv_free(filename);
+            return U_ILLEGAL_ARGUMENT_ERROR;
+        }
+
+        /* set up directory for NormalizationCorrections.txt */
+        uprv_strcpy(filename,icuUniDataDir);
+        basename=filename+uprv_strlen(filename);
+        /* append exactly one separator, and only when the directory lacks it */
+        if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) {
+            *basename++=U_FILE_SEP_CHAR;
+        }
+        uprv_strcpy(basename,NORM_CORRECTIONS_FILE_NAME);
+
+        parseNormalizationCorrections(filename,&errorCode);
+        if(U_FAILURE(errorCode)){
+            fprintf(stderr,"Could not open file %s for reading \n", filename);
+            uprv_free(filename);
+            return errorCode;
+        }
+        sprepOptions |= _SPREP_NORMALIZATION_ON;
+    }
+
+    if(options[CHECK_BIDI].doesOccur){ /* this option might be set by @check-bidi;; in the source file */
+        sprepOptions |= _SPREP_CHECK_BIDI_ON;
+    }
+
+    setOptions(sprepOptions);
+
+    /* process parsed data */
+    if(U_SUCCESS(errorCode)) {
+        /* write the data file */
+        generateData(destDir, bundleName);
+
+        cleanUpData();
+    }
+
+    uprv_free(filename);
+
+    u_cleanup();
+
+#endif
+
+    return errorCode;
+}
+
+#if !UCONFIG_NO_IDNA
+
+/*
+ * Line callback for u_parseDelimitedFile() on NormalizationCorrections.txt.
+ *
+ * fields[0]  code point (hex)
+ * fields[1]  original (erroneous) decomposition, parsed as code points
+ * fields[3]  Unicode version string of the correction
+ *
+ * The mapping code point -> original decomposition is stored only when the
+ * version in field 3 is newer than 3.2 (major/minor comparison).  The
+ * version of every line is passed to setUnicodeVersionNC(), so the value of
+ * the last line read is the one that remains.
+ *
+ * Any parse error terminates the program with the error code as exit status.
+ */
+static void U_CALLCONV
+normalizationCorrectionsLineFn(void *context,
+                               char *fields[][2], int32_t fieldCount,
+                               UErrorCode *pErrorCode) {
+    (void)context; // suppress compiler warnings about unused variable
+    (void)fieldCount; // suppress compiler warnings about unused variable
+    uint32_t mapping[40];
+    char *end, *s;
+    uint32_t code;
+    int32_t length;
+    UVersionInfo version;
+    UVersionInfo thisVersion;
+
+    /* get the character code, field 0 */
+    code=(uint32_t)uprv_strtoul(fields[0][0], &end, 16);
+    /*
+     * strtoul() does not touch *pErrorCode, so validate through the end
+     * pointer: at least one digit must be consumed and only whitespace may
+     * remain before the end of the field.
+     */
+    if(end<=fields[0][0] || u_skipWhitespace(end)!=fields[0][1]) {
+        *pErrorCode=U_PARSE_ERROR;
+    }
+    if(U_FAILURE(*pErrorCode)) {
+        fprintf(stderr, "gensprep: error parsing NormalizationCorrections.txt mapping at %s\n", fields[0][0]);
+        exit(*pErrorCode);
+    }
+    /* Original (erroneous) decomposition */
+    s = fields[1][0];
+
+    /* parse the mapping string */
+    length=u_parseCodePoints(s, mapping, UPRV_LENGTHOF(mapping), pErrorCode);
+
+    /* ignore corrected decomposition */
+
+    u_versionFromString(version,fields[3][0] );
+    u_versionFromString(thisVersion, "3.2.0");
+
+    if(U_FAILURE(*pErrorCode)) {
+        fprintf(stderr, "gensprep error parsing NormalizationCorrections.txt of U+%04lx - %s\n",
+                (long)code, u_errorName(*pErrorCode));
+        exit(*pErrorCode);
+    }
+
+    /* store the mapping */
+    if( version[0] > thisVersion[0] ||
+        ((version[0]==thisVersion[0]) && (version[1] > thisVersion[1]))
+    ){
+        storeMapping(code,mapping, length, USPREP_MAP, pErrorCode);
+    }
+    setUnicodeVersionNC(version);
+}
+
+/*
+ * Parse the 4-field, ';'-delimited NormalizationCorrections.txt file.
+ *
+ * Does nothing when pErrorCode is NULL or already holds a failure.
+ * A failure other than U_FILE_ACCESS_ERROR terminates the program;
+ * U_FILE_ACCESS_ERROR is left in *pErrorCode for the caller.
+ */
+static void
+parseNormalizationCorrections(const char *filename, UErrorCode *pErrorCode) {
+    char *fields[4][2];
+
+    if(pErrorCode!=NULL && U_SUCCESS(*pErrorCode)) {
+        u_parseDelimitedFile(filename, ';', fields, 4, normalizationCorrectionsLineFn, NULL, pErrorCode);
+
+        /* fprintf(stdout,"Number of code points that have NormalizationCorrections mapping with length >1 : %i\n",len); */
+
+        if(*pErrorCode!=U_FILE_ACCESS_ERROR && U_FAILURE(*pErrorCode)) {
+            fprintf(stderr, "gensprep error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode));
+            exit(*pErrorCode);
+        }
+    }
+}
+
+/*
+ * Line callback for u_parseDelimitedFile() on a StringPrep profile file.
+ *
+ * context    the profile file name, used only in error messages
+ * fields[0]  a code point, a code point range, or an '@' directive
+ * fields[1]  mapping target code points (used for map entries only)
+ * fields[2]  type name, matched by substring against usprepTypeNames[]
+ *
+ * "@normalize" and "@check-bidi" directives switch on the corresponding
+ * entry in options[] and end processing of the line.
+ * A range that cannot be parsed leaves the error in *pErrorCode and returns;
+ * every other failure terminates the program.
+ */
+static void U_CALLCONV
+strprepProfileLineFn(void *context,
+                     char *fields[][2], int32_t fieldCount,
+                     UErrorCode *pErrorCode) {
+    (void)fieldCount; // suppress compiler warnings about unused variable
+    uint32_t targets[40];
+    uint32_t first=0, last=0;
+    UStringPrepType rangeType = USPREP_UNASSIGNED;
+    UBool isRange = FALSE;
+    const char *profileName = (const char*) context;
+    const char *kind;
+    const char *cursor = u_skipWhitespace(fields[0][0]);
+
+    if (*cursor == '@') {
+        /* special directive */
+        int32_t directiveLength;
+
+        ++cursor;
+        directiveLength = (int32_t)(fields[0][1] - cursor);
+        if (directiveLength >= NORMALIZE_DIRECTIVE_LEN
+            && uprv_strncmp(cursor, NORMALIZE_DIRECTIVE, NORMALIZE_DIRECTIVE_LEN) == 0) {
+            options[NORMALIZE].doesOccur = TRUE;
+            return;
+        }
+        if (directiveLength >= CHECK_BIDI_DIRECTIVE_LEN
+            && uprv_strncmp(cursor, CHECK_BIDI_DIRECTIVE, CHECK_BIDI_DIRECTIVE_LEN) == 0) {
+            options[CHECK_BIDI].doesOccur = TRUE;
+            return;
+        }
+        /* unknown directive: report it, then fall through to the type checks */
+        fprintf(stderr, "gensprep error parsing a directive %s.", fields[0][0]);
+    }
+
+    kind = fields[2][0];
+
+    /* unassigned and prohibited entries are both stored as ranges */
+    if(uprv_strstr(kind, usprepTypeNames[USPREP_UNASSIGNED])!=NULL){
+        isRange = TRUE;
+        rangeType = USPREP_UNASSIGNED;
+    }else if(uprv_strstr(kind, usprepTypeNames[USPREP_PROHIBITED])!=NULL){
+        isRange = TRUE;
+        rangeType = USPREP_PROHIBITED;
+    }
+
+    if(isRange){
+        u_parseCodePointRange(cursor, &first, &last, pErrorCode);
+        if(U_FAILURE(*pErrorCode)){
+            fprintf(stderr, "Could not parse code point range. Error: %s\n",u_errorName(*pErrorCode));
+            return;
+        }
+
+        /* store the range */
+        storeRange(first, last, rangeType, pErrorCode);
+
+    }else if(uprv_strstr(kind, usprepTypeNames[USPREP_MAP])!=NULL){
+        char *stop;
+        uint32_t source;
+        int32_t targetCount;
+
+        /* field 0 must be exactly one hex code point */
+        source=(uint32_t)uprv_strtoul(cursor, &stop, 16);
+        if(stop<=cursor || stop!=fields[0][1]) {
+            fprintf(stderr, "gensprep: syntax error in field 0 at %s\n", fields[0][0]);
+            *pErrorCode=U_PARSE_ERROR;
+            exit(U_PARSE_ERROR);
+        }
+
+        /* field 1 holds the code points that the source maps to */
+        targetCount=u_parseCodePoints(fields[1][0], targets, sizeof(targets)/4, pErrorCode);
+
+        /* store the mapping */
+        storeMapping(source, targets, targetCount, USPREP_MAP, pErrorCode);
+
+    }else{
+        *pErrorCode = U_INVALID_FORMAT_ERROR;
+    }
+
+    if(U_FAILURE(*pErrorCode)) {
+        fprintf(stderr, "gensprep error parsing %s line %s at %s. Error: %s\n",profileName,
+                fields[0][0],fields[2][0],u_errorName(*pErrorCode));
+        exit(*pErrorCode);
+    }
+
+}
+
+/*
+ * Parse a 3-field, ';'-delimited StringPrep profile file.
+ *
+ * filename     path of the profile; also passed to the line callback as context
+ * reportError  when TRUE, a U_FILE_ACCESS_ERROR is fatal as well
+ * pErrorCode   in/out error code; nothing happens if it is NULL or a failure
+ *
+ * Fatal failures print a message and terminate the program.
+ */
+static void
+parseMappings(const char *filename, UBool reportError, UErrorCode *pErrorCode) {
+    char *fields[3][2];
+    UBool fatal;
+
+    if(pErrorCode!=NULL && U_SUCCESS(*pErrorCode)) {
+        u_parseDelimitedFile(filename, ';', fields, 3, strprepProfileLineFn, (void*)filename, pErrorCode);
+
+        /*fprintf(stdout,"Number of code points that have mappings with length >1 : %i\n",len);*/
+
+        fatal = (UBool)(U_FAILURE(*pErrorCode) && (reportError || *pErrorCode!=U_FILE_ACCESS_ERROR));
+        if(fatal) {
+            fprintf(stderr, "gensprep error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode));
+            exit(*pErrorCode);
+        }
+    }
+}
+
+
+#endif /* #if !UCONFIG_NO_IDNA */
+
+/*
+ * Hey, Emacs, please set the following:
+ *
+ * Local Variables:
+ * indent-tabs-mode: nil
+ * End:
+ *
+ */
diff --git a/intl/icu/source/tools/gensprep/gensprep.h b/intl/icu/source/tools/gensprep/gensprep.h
new file mode 100644
index 0000000000..a2e5e61f9a
--- /dev/null
+++ b/intl/icu/source/tools/gensprep/gensprep.h
@@ -0,0 +1,83 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/*
+*******************************************************************************
+*
+* Copyright (C) 1999-2006, International Business Machines
+* Corporation and others. All Rights Reserved.
+*
+*******************************************************************************
+* file name: gensprep.h
+* encoding: UTF-8
+* tab size: 8 (not used)
+* indentation:4
+*
+* created on: 2003-02-06
+* created by: Ram Viswanadha
+*/
+
+#ifndef __GENIDN_H__
+#define __GENIDN_H__
+
+#include "unicode/utypes.h"
+#include "sprpimpl.h"
+
+/* file definitions */
+#define DATA_NAME "sprep"
+#define DATA_TYPE "spp"
+
+/*
+ * data structure that holds the IDN properties for one or more
+ * code point(s) at build time
+ */
+
+
+/* global flags */
+/* Defined in gensprep.c and set there from the -v and -c command-line options. */
+extern UBool beVerbose, haveCopyright;
+
+/* prototypes */
+
+/* Parse the version string v and record it as the data version of the output. */
+extern void
+setUnicodeVersion(const char *v);
+
+/* Record the Unicode version taken from a NormalizationCorrections.txt line. */
+extern void
+setUnicodeVersionNC(UVersionInfo version);
+
+/* Allocate and open the build-time trie; terminates the program on failure. */
+extern void
+init(void);
+
+#if !UCONFIG_NO_IDNA
+/*
+ * Called by the gensprep.c line parsers with one source code point and its
+ * `length` target code points.
+ */
+extern void
+storeMapping(uint32_t codepoint, uint32_t* mapping,int32_t length, UStringPrepType type, UErrorCode* status);
+/* Called by the gensprep.c profile parser for unassigned and prohibited ranges. */
+extern void
+storeRange(uint32_t start, uint32_t end, UStringPrepType type,UErrorCode* status);
+#endif
+
+/* Called by main() to write the data file, with the destination directory and bundle name. */
+extern void
+generateData(const char *dataDir, const char* bundleName);
+
+/* Store the profile option bits (e.g. _SPREP_NORMALIZATION_ON) for the output. */
+extern void
+setOptions(int32_t options);
+
+/* Called by main() after generateData(); presumably releases build-time data - TODO confirm. */
+extern void
+cleanUpData(void);
+
+/*
+extern void
+storeIDN(uint32_t code, IDN *idn);
+
+extern void
+processData(void);
+
+
+*/
+#endif
+
+/*
+ * Hey, Emacs, please set the following:
+ *
+ * Local Variables:
+ * indent-tabs-mode: nil
+ * End:
+ *
+ */
diff --git a/intl/icu/source/tools/gensprep/gensprep.vcxproj b/intl/icu/source/tools/gensprep/gensprep.vcxproj
new file mode 100644
index 0000000000..c771a4162a
--- /dev/null
+++ b/intl/icu/source/tools/gensprep/gensprep.vcxproj
@@ -0,0 +1,100 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <PropertyGroup Label="Globals">
+ <ProjectGuid>{631C23CE-6C1D-4875-88F0-85E0A42B36EA}</ProjectGuid>
+ </PropertyGroup>
+ <PropertyGroup Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseOfMfc>false</UseOfMfc>
+ <CharacterSet>MultiByte</CharacterSet>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+ <!-- The following import will include the 'default' configuration options for VS projects. -->
+ <Import Project="..\..\allinone\Build.Windows.ProjectConfiguration.props" />
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+ <ImportGroup Label="ExtensionSettings">
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ <Import Project="$(VCTargetsPath)Microsoft.CPP.UpgradeFromVC71.props" />
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ <Import Project="$(VCTargetsPath)Microsoft.CPP.UpgradeFromVC71.props" />
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ <Import Project="$(VCTargetsPath)Microsoft.CPP.UpgradeFromVC71.props" />
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ <Import Project="$(VCTargetsPath)Microsoft.CPP.UpgradeFromVC71.props" />
+ </ImportGroup>
+ <PropertyGroup Label="UserMacros" />
+ <PropertyGroup>
+ <_ProjectFileVersion>10.0.30319.1</_ProjectFileVersion>
+ <OutDir>.\$(Platform)\$(Configuration)\</OutDir>
+ <IntDir>.\$(Platform)\$(Configuration)\</IntDir>
+ <!-- The ICU projects use "Win32" to mean "x86", so we need to special case it. -->
+ <OutDir Condition="'$(Platform)'=='Win32'">.\x86\$(Configuration)\</OutDir>
+ <IntDir Condition="'$(Platform)'=='Win32'">.\x86\$(Configuration)\</IntDir>
+ <!-- Disable Incremental Linking for Release builds as it prevents Link-time Code Generation -->
+ <LinkIncremental Condition="'$(Configuration)'=='Debug'">true</LinkIncremental>
+ <LinkIncremental Condition="'$(Configuration)'=='Release'">false</LinkIncremental>
+ </PropertyGroup>
+ <!-- Options that are common to *all* configurations -->
+ <ItemDefinitionGroup>
+ <Midl>
+ <TypeLibraryName>$(OutDir)\gensprep.tlb</TypeLibraryName>
+ </Midl>
+ <ClCompile>
+ <WarningLevel>Level3</WarningLevel>
+ <CompileAs>Default</CompileAs>
+ <DisableLanguageExtensions>false</DisableLanguageExtensions>
+ <AdditionalIncludeDirectories>..\..\common;..\toolutil;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+ <PrecompiledHeaderOutputFile>$(OutDir)\gensprep.pch</PrecompiledHeaderOutputFile>
+ <AssemblerListingLocation>$(OutDir)/</AssemblerListingLocation>
+ <ObjectFileName>$(OutDir)/</ObjectFileName>
+ <ProgramDataBaseFileName>$(OutDir)\gensprep.pdb</ProgramDataBaseFileName>
+ </ClCompile>
+ <Link>
+ <SubSystem>Console</SubSystem>
+ <OutputFile>$(OutDir)\gensprep.exe</OutputFile>
+ <AdditionalLibraryDirectories>..\..\..\$(IcuLibOutputDir);%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+ </Link>
+ <CustomBuildStep>
+ <Command>copy "$(TargetPath)" ..\..\..\$(IcuBinOutputDir)</Command>
+ <Outputs>..\..\..\$(IcuBinOutputDir)\$(TargetFileName);%(Outputs)</Outputs>
+ </CustomBuildStep>
+ </ItemDefinitionGroup>
+ <!-- Options that are common to all 'Debug' project configurations -->
+ <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
+ <ClCompile>
+ <BrowseInformation>true</BrowseInformation>
+ <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
+ </ClCompile>
+ <Link>
+ <AdditionalDependencies>icuucd.lib;icutud.lib;%(AdditionalDependencies)</AdditionalDependencies>
+ </Link>
+ </ItemDefinitionGroup>
+ <!-- Options that are common to all 'Release' project configurations -->
+ <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
+ <ClCompile>
+ <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
+ <FunctionLevelLinking>true</FunctionLevelLinking>
+ </ClCompile>
+ <Link>
+ <AdditionalDependencies>icuuc.lib;icutu.lib;%(AdditionalDependencies)</AdditionalDependencies>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemGroup>
+ <ClCompile Include="gensprep.c" />
+ <ClCompile Include="store.c" />
+ </ItemGroup>
+ <ItemGroup>
+ <ClInclude Include="gensprep.h" />
+ </ItemGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+ <ImportGroup Label="ExtensionTargets">
+ </ImportGroup>
+</Project> \ No newline at end of file
diff --git a/intl/icu/source/tools/gensprep/gensprep.vcxproj.filters b/intl/icu/source/tools/gensprep/gensprep.vcxproj.filters
new file mode 100644
index 0000000000..2791b3aa6a
--- /dev/null
+++ b/intl/icu/source/tools/gensprep/gensprep.vcxproj.filters
@@ -0,0 +1,30 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup>
+ <Filter Include="Source Files">
+ <UniqueIdentifier>{bb521e6b-d70a-4efd-9399-408729059da6}</UniqueIdentifier>
+ <Extensions>cpp;c;cxx;rc;def;r;odl;idl;hpj;bat</Extensions>
+ </Filter>
+ <Filter Include="Header Files">
+ <UniqueIdentifier>{837c7f4e-341d-4455-aa1e-f6ff7a03b065}</UniqueIdentifier>
+ <Extensions>h;hpp;hxx;hm;inl</Extensions>
+ </Filter>
+ <Filter Include="Resource Files">
+ <UniqueIdentifier>{a80f327a-7fb8-4737-8bd9-0f4b26c2c344}</UniqueIdentifier>
+ <Extensions>ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe</Extensions>
+ </Filter>
+ </ItemGroup>
+ <ItemGroup>
+ <ClCompile Include="gensprep.c">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ <ClCompile Include="store.c">
+ <Filter>Source Files</Filter>
+ </ClCompile>
+ </ItemGroup>
+ <ItemGroup>
+ <ClInclude Include="gensprep.h">
+ <Filter>Header Files</Filter>
+ </ClInclude>
+ </ItemGroup>
+</Project> \ No newline at end of file
diff --git a/intl/icu/source/tools/gensprep/store.c b/intl/icu/source/tools/gensprep/store.c
new file mode 100644
index 0000000000..4b00d1b796
--- /dev/null
+++ b/intl/icu/source/tools/gensprep/store.c
@@ -0,0 +1,651 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+/*
+*******************************************************************************
+*
+* Copyright (C) 1999-2014, International Business Machines
+* Corporation and others. All Rights Reserved.
+*
+*******************************************************************************
+* file name: store.c
+* encoding: UTF-8
+* tab size: 8 (not used)
+* indentation:4
+*
+* created on: 2003-02-06
+* created by: Ram Viswanadha
+*
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "unicode/utypes.h"
+#include "cmemory.h"
+#include "cstring.h"
+#include "filestrm.h"
+#include "unicode/udata.h"
+#include "unicode/utf16.h"
+#include "utrie.h"
+#include "unewdata.h"
+#include "gensprep.h"
+#include "uhash.h"
+
+
+#define DO_DEBUG_OUT 0
+
+
+/*
+ * StringPrep profile file format ------------------------------------
+ *
+ * The file format prepared and written here contains a 16-bit trie and a mapping table.
+ *
+ * Before the data contents described below, there are the headers required by
+ * the udata API for loading ICU data. Especially, a UDataInfo structure
+ * precedes the actual data. It contains platform properties values and the
+ * file format version.
+ *
+ * The following is a description of format version 2.
+ *
+ * Data contents:
+ *
+ * The contents is a parsed, binary form of RFC3454 and possibly
+ * NormalizationCorrections.txt depending on the options specified on the profile.
+ *
+ * Any Unicode code point from 0 to 0x10ffff can be looked up to get
+ * the trie-word, if any, for that code point. This means that the input
+ * to the lookup are 21-bit unsigned integers, with not all of the
+ * 21-bit range used.
+ *
+ * *.spp files customarily begin with a UDataInfo structure, see udata.h and .c.
+ * After that there are the following structures:
+ *
+ * int32_t indexes[_SPREP_INDEX_TOP]; -- _SPREP_INDEX_TOP=16, see enum in sprpimpl.h file
+ *
+ * UTrie stringPrepTrie; -- size in bytes=indexes[_SPREP_INDEX_TRIE_SIZE]
+ *
+ * uint16_t mappingTable[]; -- Contains the sequence of code units that the code point maps to
+ * size in bytes = indexes[_SPREP_INDEX_MAPPING_DATA_SIZE]
+ *
+ * The indexes array contains the following values:
+ * indexes[_SPREP_INDEX_TRIE_SIZE] -- The size of the StringPrep trie in bytes
+ * indexes[_SPREP_INDEX_MAPPING_DATA_SIZE] -- The size of the mappingTable in bytes
+ * indexes[_SPREP_NORM_CORRECTNS_LAST_UNI_VERSION] -- The index of Unicode version of last entry in NormalizationCorrections.txt
+ * indexes[_SPREP_ONE_UCHAR_MAPPING_INDEX_START] -- The starting index of 1 UChar mapping index in the mapping table
+ * indexes[_SPREP_TWO_UCHARS_MAPPING_INDEX_START] -- The starting index of 2 UChars mapping index in the mapping table
+ * indexes[_SPREP_THREE_UCHARS_MAPPING_INDEX_START] -- The starting index of 3 UChars mapping index in the mapping table
+ * indexes[_SPREP_FOUR_UCHARS_MAPPING_INDEX_START] -- The starting index of 4 UChars mapping index in the mapping table
+ * indexes[_SPREP_OPTIONS] -- Bit set of options to turn on in the profile, e.g: USPREP_NORMALIZATION_ON, USPREP_CHECK_BIDI_ON
+ *
+ *
+ * StringPrep Trie :
+ *
+ * The StringPrep trie is a 16-bit trie that contains data for the profile.
+ * Each code point is associated with a value (trie-word) in the trie.
+ *
+ * - structure of data words from the trie
+ *
+ * i) A value greater than or equal to _SPREP_TYPE_THRESHOLD (0xFFF0)
+ * represents the type associated with the code point
+ * if(trieWord >= _SPREP_TYPE_THRESHOLD){
+ * type = trieWord - 0xFFF0;
+ * }
+ * The type can be :
+ * USPREP_UNASSIGNED
+ * USPREP_PROHIBITED
+ * USPREP_DELETE
+ *
+ * ii) A value less than _SPREP_TYPE_THRESHOLD means the type is USPREP_MAP and
+ * contains distribution described below
+ *
+ * 0 - ON : The code point is prohibited (USPREP_PROHIBITED). This is to allow for codepoint that are both prohibited and mapped.
+ * 1 - ON : The value in the next 14 bits is an index into the mapping table
+ * OFF: The value in the next 14 bits is a delta value from the code point
+ * 2..15 - Contains data as described by bit 1. If all bits are set
+ * (value = _SPREP_MAX_INDEX_VALUE) then the type is USPREP_DELETE
+ *
+ *
+ * Mapping Table:
+ * The data in mapping table is sorted according to the length of the mapping sequence.
+ * If the type of the code point is USPREP_MAP and value in trie word is an index, the index
+ * is compared with start indexes of sequence length start to figure out the length according to
+ * the following algorithm:
+ *
+ * if( index >= indexes[_SPREP_ONE_UCHAR_MAPPING_INDEX_START] &&
+ * index < indexes[_SPREP_TWO_UCHARS_MAPPING_INDEX_START]){
+ * length = 1;
+ * }else if(index >= indexes[_SPREP_TWO_UCHARS_MAPPING_INDEX_START] &&
+ * index < indexes[_SPREP_THREE_UCHARS_MAPPING_INDEX_START]){
+ * length = 2;
+ * }else if(index >= indexes[_SPREP_THREE_UCHARS_MAPPING_INDEX_START] &&
+ * index < indexes[_SPREP_FOUR_UCHARS_MAPPING_INDEX_START]){
+ * length = 3;
+ * }else{
+ * // The first position in the mapping table contains the length
+ * // of the sequence
+ * length = mappingTable[index++];
+ *
+ * }
+ *
+ */
+
+/* file data ---------------------------------------------------------------- */
+/* indexes[] value names */
+
+#if UCONFIG_NO_IDNA
+
+/* dummy UDataInfo cf. udata.h */
+static UDataInfo dataInfo = {
+ sizeof(UDataInfo),
+ 0,
+
+ U_IS_BIG_ENDIAN,
+ U_CHARSET_FAMILY,
+ U_SIZEOF_UCHAR,
+ 0,
+
+ { 0, 0, 0, 0 }, /* dummy dataFormat */
+ { 0, 0, 0, 0 }, /* dummy formatVersion */
+ { 0, 0, 0, 0 } /* dummy dataVersion */
+};
+
+#else
+
+/* Header values of the output file, addressed by the _SPREP_* index names. */
+static int32_t indexes[_SPREP_INDEX_TOP]={ 0 };
+
+/* Mapping table; allocated in storeMappingData() with mappingDataCapacity 16-bit units. */
+static uint16_t* mappingData= NULL;
+static int32_t mappingDataCapacity = 0; /* we skip the first index in mapping data */
+static int16_t currentIndex = 0; /* next free position in mappingData; this value is what gets encoded into the trie word */
+static int32_t maxLength = 0; /* maximum length of mapping string */
+
+
+/* UDataInfo cf. udata.h */
+static UDataInfo dataInfo={
+ sizeof(UDataInfo),
+ 0,
+
+ U_IS_BIG_ENDIAN,
+ U_CHARSET_FAMILY,
+ U_SIZEOF_UCHAR,
+ 0,
+
+ { 0x53, 0x50, 0x52, 0x50 }, /* dataFormat="SPRP" */
+ { 3, 2, UTRIE_SHIFT, UTRIE_INDEX_SHIFT }, /* formatVersion */
+ { 3, 2, 0, 0 } /* dataVersion (Unicode version) */
+};
+/*
+ * Record the Unicode version of the profile.
+ * v is a version string; its parsed form is copied into dataInfo.dataVersion.
+ */
+void
+setUnicodeVersion(const char *v) {
+    UVersionInfo parsed;
+
+    u_versionFromString(parsed, v);
+    uprv_memcpy(dataInfo.dataVersion, parsed, 4);
+}
+
+/*
+ * Pack the four version fields into one 32-bit value, major field in the
+ * top byte, and store it in indexes[_SPREP_NORM_CORRECTNS_LAST_UNI_VERSION].
+ */
+void
+setUnicodeVersionNC(UVersionInfo version){
+    /*
+     * Widen each field before shifting: a uint8_t is promoted to int, and
+     * shifting a value of 128 or more left by 24 would overflow a signed int.
+     */
+    uint32_t univer = ((uint32_t)version[0] << 24) |
+                      ((uint32_t)version[1] << 16) |
+                      ((uint32_t)version[2] << 8)  |
+                       (uint32_t)version[3];
+    indexes[_SPREP_NORM_CORRECTNS_LAST_UNI_VERSION] = (int32_t)univer;
+}
+static UNewTrie *sprepTrie;
+
+#define MAX_DATA_LENGTH 11500
+
+
+#define SPREP_DELTA_RANGE_POSITIVE_LIMIT 8191
+#define SPREP_DELTA_RANGE_NEGATIVE_LIMIT -8192
+
+
+/*
+ * Allocate and open the build-time trie (sprepTrie).
+ * Terminates the program with U_MEMORY_ALLOCATION_ERROR when either the
+ * allocation or utrie_open() fails.
+ */
+extern void
+init(void) {
+
+    sprepTrie = (UNewTrie *)uprv_calloc(1, sizeof(UNewTrie));
+    /* without this check sprepTrie would stay NULL for every later utrie_* call */
+    if(sprepTrie == NULL) {
+        fprintf(stderr, "error: failed to allocate memory for the trie\n");
+        exit(U_MEMORY_ALLOCATION_ERROR);
+    }
+
+    /* initialize the trie */
+    if(NULL==utrie_open(sprepTrie, NULL, MAX_DATA_LENGTH, 0, 0, FALSE)) {
+        fprintf(stderr, "error: failed to initialize tries\n");
+        exit(U_MEMORY_ALLOCATION_ERROR);
+    }
+}
+
+static UHashtable* hashTable = NULL;
+
+
+/* Value stored in hashTable for one code point (the key is the code point itself). */
+typedef struct ValueStruct {
+ UChar* mapping; /* mapping code units; freed by valueDeleter() */
+ int16_t length; /* number of UChars in mapping */
+ UStringPrepType type;
+} ValueStruct;
+
+/*
+ * Hashtable value deleter: releases a ValueStruct together with the
+ * mapping buffer it points to.
+ */
+static void U_CALLCONV valueDeleter(void* obj){
+    ValueStruct* entry = (ValueStruct*) obj;
+    UChar* units = entry->mapping;
+
+    uprv_free(units);
+    uprv_free(entry);
+}
+
+/* Hashtable key hasher: the integer key is used directly as its hash code. */
+static int32_t U_CALLCONV hashEntry(const UHashTok parm) {
+    int32_t hashCode = parm.integer;
+    return hashCode;
+}
+
+/*
+ * Hashtable key comparator: returns TRUE when the two integer keys are equal.
+ *
+ * A key comparator is an equality test.  The previous form returned TRUE
+ * for unequal keys, so an existing key was never recognized and storing the
+ * same code point twice created a second entry instead of replacing the
+ * first.
+ */
+static UBool U_CALLCONV compareEntries(const UHashTok p1, const UHashTok p2) {
+    return (UBool)(p1.integer == p2.integer);
+}
+
+
+/*
+ * Copy every mapping held in hashTable into mappingData and write the
+ * matching index word for its code point into sprepTrie.
+ *
+ * The hashtable is scanned once per mapping length, starting at length 1,
+ * so the table ends up grouped by length.  The start position of each group
+ * is recorded in indexes[]; entries longer than _SPREP_MAX_INDEX_TOP_LENGTH
+ * are preceded by their length in the table.
+ *
+ * Does nothing when no mapping was stored (hashTable is NULL).  Terminates
+ * the program on any inconsistency.
+ *
+ * NOTE(review): an entry whose length is smaller than 1 would never match
+ * mappingLength, so the outer loop would not terminate - presumably such
+ * entries are never inserted; confirm against storeMapping().
+ */
+static void
+storeMappingData(){
+
+ int32_t pos = UHASH_FIRST;
+ const UHashElement* element = NULL;
+ ValueStruct* value = NULL;
+ int32_t codepoint = 0;
+ int32_t elementCount = 0;
+ int32_t writtenElementCount = 0;
+ int32_t mappingLength = 1; /* minimum mapping length */
+ int32_t oldMappingLength = 0;
+ uint16_t trieWord =0;
+ int32_t limitIndex = 0;
+
+ if (hashTable == NULL) {
+ return;
+ }
+ elementCount = uhash_count(hashTable);
+
+ /*initialize the mapping data */
+ /* NOTE(review): the result of this allocation is not checked for NULL */
+ mappingData = (uint16_t*) uprv_calloc(mappingDataCapacity, U_SIZEOF_UCHAR);
+
+ /* one full pass over the hashtable per mapping length, until every entry is written */
+ while(writtenElementCount < elementCount){
+
+ while( (element = uhash_nextElement(hashTable, &pos))!=NULL){
+
+ codepoint = element->key.integer;
+ value = (ValueStruct*)element->value.pointer;
+
+ /* store the start of indexes */
+ /* true only for the first element visited in each pass */
+ if(oldMappingLength != mappingLength){
+ /* Assume that index[] is used according to the enums defined */
+ if(oldMappingLength <=_SPREP_MAX_INDEX_TOP_LENGTH){
+ indexes[_SPREP_NORM_CORRECTNS_LAST_UNI_VERSION+mappingLength] = currentIndex;
+ }
+ /* remember where the entries that carry an explicit length begin */
+ if(oldMappingLength <= _SPREP_MAX_INDEX_TOP_LENGTH &&
+ mappingLength == _SPREP_MAX_INDEX_TOP_LENGTH +1){
+
+ limitIndex = currentIndex;
+
+ }
+ oldMappingLength = mappingLength;
+ }
+
+ /* only entries of the length handled by this pass are written */
+ if(value->length == mappingLength){
+ uint32_t savedTrieWord = 0;
+ /* the table index occupies the bits above the two flag bits */
+ trieWord = currentIndex << 2;
+ /* turn on the 2nd bit to signal that the following bits contain an index */
+ trieWord += 0x02;
+
+ if(trieWord > _SPREP_TYPE_THRESHOLD){
+ fprintf(stderr,"trieWord cannot contain value greater than 0x%04X.\n",_SPREP_TYPE_THRESHOLD);
+ exit(U_ILLEGAL_CHAR_FOUND);
+ }
+ /* figure out if the code point has type already stored */
+ savedTrieWord= utrie_get32(sprepTrie,codepoint,NULL);
+ if(savedTrieWord!=0){
+ if((savedTrieWord- _SPREP_TYPE_THRESHOLD) == USPREP_PROHIBITED){
+ /* turn on the first bit in trie word */
+ trieWord += 0x01;
+ }else{
+ /*
+ * the codepoint has value something other than prohibited
+ * and a mapping .. error!
+ */
+ fprintf(stderr,"Type for codepoint \\U%08X already set!.\n", (int)codepoint);
+ exit(U_ILLEGAL_ARGUMENT_ERROR);
+ }
+ }
+
+ /* now set the value in the trie */
+ if(!utrie_set32(sprepTrie,codepoint,trieWord)){
+ fprintf(stderr,"Could not set the value for code point.\n");
+ exit(U_ILLEGAL_ARGUMENT_ERROR);
+ }
+
+ /* written the trie word for the codepoint... increment the count*/
+ writtenElementCount++;
+
+ /* sanity check are we exceeding the max number allowed */
+ if(currentIndex+value->length+1 > _SPREP_MAX_INDEX_VALUE){
+ fprintf(stderr, "Too many entries in the mapping table %i. Maximum allowed is %i\n",
+ currentIndex+value->length, _SPREP_MAX_INDEX_VALUE);
+ exit(U_INDEX_OUTOFBOUNDS_ERROR);
+ }
+
+ /* copy the mapping data */
+ /* write the length */
+ if(mappingLength > _SPREP_MAX_INDEX_TOP_LENGTH ){
+ /* the cast here is safe since we do not expect the length to be > 65535 */
+ mappingData[currentIndex++] = (uint16_t) mappingLength;
+ }
+ /* copy the contents to mappingData array */
+ u_memmove(mappingData+currentIndex, value->mapping, value->length);
+ currentIndex += value->length;
+ /* NOTE(review): this capacity check runs after the copy above has already been made */
+ if (currentIndex > mappingDataCapacity) {
+ /* If this happens there is a bug in the computation of the mapping data size in storeMapping() */
+ fprintf(stderr, "gensprep, fatal error at %s, %d. Aborting.\n", __FILE__, __LINE__);
+ exit(U_INTERNAL_PROGRAM_ERROR);
+ }
+ }
+ }
+ /* next pass handles the next longer mappings; restart the iteration (presumably -1 == UHASH_FIRST - confirm) */
+ mappingLength++;
+ pos = -1;
+ }
+ /* set the last length for range check */
+ if(mappingLength <= _SPREP_MAX_INDEX_TOP_LENGTH){
+ indexes[_SPREP_NORM_CORRECTNS_LAST_UNI_VERSION+mappingLength] = currentIndex+1;
+ }else{
+ indexes[_SPREP_FOUR_UCHARS_MAPPING_INDEX_START] = limitIndex;
+ }
+
+}
+
+/* Records the given options value in the _SPREP_OPTIONS slot of indexes[]. */
+extern void
+setOptions(int32_t options) {
+    indexes[_SPREP_OPTIONS] = options;
+}
+/*
+ * Registers the mapping of one code point.
+ *
+ * Depending on the UTF-16 length of the mapping one of three things happens:
+ *  - length 0: the trie word is set to (_SPREP_MAX_INDEX_VALUE << 2);
+ *  - length 1 with a small enough difference between the code point and its
+ *    mapping: the difference (delta) is stored directly in the trie word;
+ *  - otherwise: the UTF-16 string is put into hashTable, keyed by the code
+ *    point, and written out later by storeMappingData().
+ *
+ * @param codepoint the code point being mapped
+ * @param mapping   the code points it maps to
+ * @param length    number of code points in mapping
+ * @param type      type recorded with the stored mapping
+ * @param status    ICU error code; a failure reported while opening or
+ *                  filling the hash table terminates the program
+ */
+extern void
+storeMapping(uint32_t codepoint, uint32_t* mapping,int32_t length,
+             UStringPrepType type, UErrorCode* status){
+
+
+    UChar* map = NULL;
+    /* 32-bit counters: they are compared against the 32-bit length */
+    int32_t i, j;
+    int16_t adjustedLen=0;
+    uint16_t trieWord = 0;
+    ValueStruct *value = NULL;
+    uint32_t savedTrieWord = 0;
+
+    /* initialize the hashtable */
+    if(hashTable==NULL){
+        hashTable = uhash_open(hashEntry, compareEntries, NULL, status);
+        if(U_FAILURE(*status) || hashTable == NULL){
+            /* do not hand a table that failed to open to uhash_setValueDeleter() */
+            fprintf(stderr, "Failed to open the hashtable. Error: %s\n", u_errorName(*status));
+            exit(U_FAILURE(*status) ? *status : U_MEMORY_ALLOCATION_ERROR);
+        }
+        uhash_setValueDeleter(hashTable, valueDeleter);
+    }
+
+    /* figure out if the code point has type already stored */
+    savedTrieWord= utrie_get32(sprepTrie,codepoint,NULL);
+    if(savedTrieWord!=0){
+        if((savedTrieWord- _SPREP_TYPE_THRESHOLD) == USPREP_PROHIBITED){
+            /*
+             * turn on the first bit in trie word
+             * NOTE(review): trieWord is reassigned on the length-0 and delta
+             * paths below and is not read on the hash table path, so this
+             * bit does not reach the trie from this function - confirm
+             * whether that is intended.
+             */
+            trieWord += 0x01;
+        }else{
+            /*
+             * the codepoint has value something other than prohibited
+             * and a mapping .. error!
+             */
+            fprintf(stderr,"Type for codepoint \\U%08X already set!.\n", (int)codepoint);
+            exit(U_ILLEGAL_ARGUMENT_ERROR);
+        }
+    }
+
+    /* figure out the real length */
+    for(i=0; i<length; i++){
+        adjustedLen += U16_LENGTH(mapping[i]);
+    }
+
+    if(adjustedLen == 0){
+        trieWord = (uint16_t)(_SPREP_MAX_INDEX_VALUE << 2);
+        /* make sure that the value of trieWord is less than the threshold */
+        if(trieWord < _SPREP_TYPE_THRESHOLD){
+            /* now set the value in the trie */
+            if(!utrie_set32(sprepTrie,codepoint,trieWord)){
+                fprintf(stderr,"Could not set the value for code point.\n");
+                exit(U_ILLEGAL_ARGUMENT_ERROR);
+            }
+            /* value is set so just return */
+            return;
+        }else{
+            fprintf(stderr,"trieWord cannot contain value greater than threshold 0x%04X.\n",_SPREP_TYPE_THRESHOLD);
+            exit(U_ILLEGAL_CHAR_FOUND);
+        }
+    }
+
+    if(adjustedLen == 1){
+        /*
+         * calculate the delta in full 32-bit precision: narrowing either
+         * operand or the result to 16 bits before the range check would let
+         * a difference that is far out of range wrap around into the
+         * accepted range and be stored as a wrong delta
+         */
+        int32_t delta = (int32_t)codepoint - (int32_t)mapping[0];
+        if(delta >= SPREP_DELTA_RANGE_NEGATIVE_LIMIT && delta <= SPREP_DELTA_RANGE_POSITIVE_LIMIT){
+
+            trieWord = (uint16_t)delta;
+            trieWord <<= 2;
+
+
+            /* make sure that the second bit is OFF */
+            if((trieWord & 0x02) != 0 ){
+                fprintf(stderr,"The second bit in the trie word is not zero while storing a delta.\n");
+                exit(U_INTERNAL_PROGRAM_ERROR);
+            }
+            /* make sure that the value of trieWord is less than the threshold */
+            if(trieWord < _SPREP_TYPE_THRESHOLD){
+                /* now set the value in the trie */
+                if(!utrie_set32(sprepTrie,codepoint,trieWord)){
+                    fprintf(stderr,"Could not set the value for code point.\n");
+                    exit(U_ILLEGAL_ARGUMENT_ERROR);
+                }
+                /* value is set so just return */
+                return;
+            }
+        }
+        /*
+         * if the delta is not in the given range or if the trieWord is larger than the threshold
+         * just fall through for storing the mapping in the mapping table
+         */
+    }
+
+    map = (UChar*) uprv_calloc(adjustedLen + 1, U_SIZEOF_UCHAR);
+    if(map == NULL){
+        fprintf(stderr, "Could not allocate memory for the mapping of codepoint \\U%08X.\n", (int)codepoint);
+        exit(U_MEMORY_ALLOCATION_ERROR);
+    }
+
+    /* convert the mapping from code points to UTF-16 code units */
+    for (i=0, j=0; i<length; i++) {
+        U16_APPEND_UNSAFE(map, j, mapping[i]);
+    }
+
+    value = (ValueStruct*) uprv_malloc(sizeof(ValueStruct));
+    if(value == NULL){
+        fprintf(stderr, "Could not allocate memory for the mapping of codepoint \\U%08X.\n", (int)codepoint);
+        exit(U_MEMORY_ALLOCATION_ERROR);
+    }
+    value->mapping = map;
+    value->type = type;
+    value->length = adjustedLen;
+    if(value->length > _SPREP_MAX_INDEX_TOP_LENGTH){
+        /* long mappings are written with a leading length code unit */
+        mappingDataCapacity++;
+    }
+    if(maxLength < value->length){
+        maxLength = value->length;
+    }
+    uhash_iput(hashTable,codepoint,value,status);
+    mappingDataCapacity += adjustedLen;
+
+    if(U_FAILURE(*status)){
+        fprintf(stderr, "Failed to put entries into the hastable. Error: %s\n", u_errorName(*status));
+        exit(*status);
+    }
+}
+
+
+/*
+ * Stores the type of the code points start..end in the trie.
+ *
+ * The trie word of a type is _SPREP_TYPE_THRESHOLD + type. A range of more
+ * than one code point is written with a single utrie_setRange32() call. For
+ * a single code point the word already present in the trie is examined
+ * first:
+ *  - a word below the threshold (a mapping) combined with USPREP_PROHIBITED
+ *    keeps the mapping and gets its lowest bit turned on;
+ *  - any other word that differs from the new one is an error;
+ *  - an empty or identical word is simply (re)written.
+ *
+ * Every failure terminates the program. status is not used.
+ */
+extern void
+storeRange(uint32_t start, uint32_t end, UStringPrepType type, UErrorCode* status){
+    uint16_t typeWord;
+    uint32_t existingWord;
+
+    (void)status; // suppress compiler warnings about unused variable
+
+    if((int)(_SPREP_TYPE_THRESHOLD + type) > 0xFFFF){
+        fprintf(stderr,"trieWord cannot contain value greater than 0xFFFF.\n");
+        exit(U_ILLEGAL_CHAR_FOUND);
+    }
+    /* the top 4 bits contain the value */
+    typeWord = (uint16_t)(_SPREP_TYPE_THRESHOLD + type);
+
+    /* a real range: one call covers it, end+1 is handed over as the limit */
+    if(start != end){
+        if(!utrie_setRange32(sprepTrie, start, end+1, typeWord, FALSE)){
+            fprintf(stderr,"Value for certain codepoint already set.\n");
+            exit(U_ILLEGAL_CHAR_FOUND);
+        }
+        return;
+    }
+
+    /* a single code point: look at what the trie already holds for it */
+    existingWord = utrie_get32(sprepTrie, start, NULL);
+    if(existingWord>0){
+        if(existingWord < _SPREP_TYPE_THRESHOLD && type == USPREP_PROHIBITED){
+            /*
+             * A mapping is stored in the trie word and the only other
+             * type a mapped code point may carry is USPREP_PROHIBITED:
+             * flag it by turning on the 0th bit of the stored word.
+             * The downcast is safe since only 16 bit values are saved.
+             */
+            typeWord = (uint16_t)(existingWord + 0x01);
+
+            /* the flagged word must still be below the threshold */
+            if(typeWord >= _SPREP_TYPE_THRESHOLD){
+                fprintf(stderr,"trieWord cannot contain value greater than threshold 0x%04X.\n",_SPREP_TYPE_THRESHOLD);
+                exit(U_ILLEGAL_CHAR_FOUND);
+            }
+            if(!utrie_set32(sprepTrie,start,typeWord)){
+                fprintf(stderr,"Could not set the value for code point.\n");
+                exit(U_ILLEGAL_ARGUMENT_ERROR);
+            }
+            /* value is set so just return */
+            return;
+        }
+        if(existingWord != typeWord){
+            fprintf(stderr,"Value for codepoint \\U%08X already set!.\n", (int)start);
+            exit(U_ILLEGAL_ARGUMENT_ERROR);
+        }
+        /* same word already stored: writing it again below is harmless */
+    }
+
+    if(!utrie_set32(sprepTrie,start,typeWord)){
+        fprintf(stderr,"Could not set the value for code point \\U%08X.\n", (int)start);
+        exit(U_ILLEGAL_ARGUMENT_ERROR);
+    }
+}
+
+/*
+ * Folding callback passed to utrie_serialize().
+ * Looks at the 0x400 code points beginning at start and returns offset
+ * (16 bits) as soon as one of them has a non-zero trie value; returns 0 when
+ * all of them are zero.
+ */
+static uint32_t U_CALLCONV
+getFoldedValue(UNewTrie *trie, UChar32 start, int32_t offset) {
+    const UChar32 limit = start + 0x400;
+    UBool inBlockZero;
+
+    while(start < limit) {
+        const uint32_t value = utrie_get32(trie, start, &inBlockZero);
+
+        if(inBlockZero) {
+            /* nothing stored in this data block: skip a block length ahead */
+            start += UTRIE_DATA_BLOCK_LENGTH;
+            continue;
+        }
+        if(value != 0) {
+            return (uint32_t)offset;
+        }
+        ++start;
+    }
+
+    return 0;
+}
+
+#endif /* #if !UCONFIG_NO_IDNA */
+
+/*
+ * Writes the collected data to the output file.
+ *
+ * With IDNA support compiled in, the pending mappings are first written to
+ * the mapping array by storeMappingData() and the trie is serialized. The
+ * file then receives, in this order, the indexes array, the serialized trie
+ * and the mapping data. The number of bytes reported by udata_finish() must
+ * match the size calculated here. Every failure terminates the program.
+ *
+ * @param dataDir    directory passed to udata_create()
+ * @param bundleName name passed to udata_create()
+ */
+extern void
+generateData(const char *dataDir, const char* bundleName) {
+    static uint8_t sprepTrieBlock[100000];
+
+    UNewDataMemory *pData;
+    UErrorCode errorCode=U_ZERO_ERROR;
+    int32_t size, dataLength;
+    char* fileName = (char*) uprv_malloc(uprv_strlen(bundleName) +100);
+
+#if UCONFIG_NO_IDNA
+
+    size=0;
+
+#else
+
+    int32_t sprepTrieSize;
+
+    /* sort and add mapping data */
+    storeMappingData();
+
+    sprepTrieSize=utrie_serialize(sprepTrie, sprepTrieBlock, sizeof(sprepTrieBlock), getFoldedValue, TRUE, &errorCode);
+    if(U_FAILURE(errorCode)) {
+        fprintf(stderr, "error: utrie_serialize(sprep trie) failed, %s\n", u_errorName(errorCode));
+        exit(errorCode);
+    }
+
+    size = sprepTrieSize + mappingDataCapacity*U_SIZEOF_UCHAR + sizeof(indexes);
+    if(beVerbose) {
+        printf("size of sprep trie %5u bytes\n", (int)sprepTrieSize);
+        printf("size of " U_ICUDATA_NAME "_%s." DATA_TYPE " contents: %ld bytes\n", bundleName,(long)size);
+        printf("size of mapping data array %5u bytes\n",(int)mappingDataCapacity * U_SIZEOF_UCHAR);
+        printf("Number of code units in mappingData (currentIndex) are: %i \n", currentIndex);
+        printf("Maximum length of the mapping string is : %i \n", (int)maxLength);
+    }
+
+#endif
+
+    /* the name buffer is written to right below: make sure it exists */
+    if(fileName == NULL) {
+        fprintf(stderr, "gensprep: unable to allocate memory for the output file name\n");
+        exit(U_MEMORY_ALLOCATION_ERROR);
+    }
+    fileName[0]=0;
+    uprv_strcat(fileName,bundleName);
+    /* write the data */
+    pData=udata_create(dataDir, DATA_TYPE, fileName, &dataInfo,
+                       haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
+    if(U_FAILURE(errorCode)) {
+        fprintf(stderr, "gensprep: unable to create the output file, error %d\n", errorCode);
+        exit(errorCode);
+    }
+
+#if !UCONFIG_NO_IDNA
+
+    indexes[_SPREP_INDEX_TRIE_SIZE]=sprepTrieSize;
+    indexes[_SPREP_INDEX_MAPPING_DATA_SIZE]=mappingDataCapacity*U_SIZEOF_UCHAR;
+
+    udata_writeBlock(pData, indexes, sizeof(indexes));
+    udata_writeBlock(pData, sprepTrieBlock, sprepTrieSize);
+    udata_writeBlock(pData, mappingData, indexes[_SPREP_INDEX_MAPPING_DATA_SIZE]);
+
+
+#endif
+
+    /* finish up */
+    dataLength=udata_finish(pData, &errorCode);
+    if(U_FAILURE(errorCode)) {
+        fprintf(stderr, "gensprep: error %d writing the output file\n", errorCode);
+        exit(errorCode);
+    }
+
+    if(dataLength!=size) {
+        fprintf(stderr, "gensprep error: data length %ld != calculated size %ld\n",
+                (long)dataLength, (long)size);
+        exit(U_INTERNAL_PROGRAM_ERROR);
+    }
+
+#if !UCONFIG_NO_IDNA
+    /* done with writing the data .. close the hashtable */
+    if (hashTable != NULL) {
+        uhash_close(hashTable);
+        /* do not keep a pointer to the closed table around */
+        hashTable = NULL;
+    }
+#endif
+
+    uprv_free(fileName);
+}
+
+#if !UCONFIG_NO_IDNA
+
+/*
+ * Releases the file-level build data: closes the trie, frees the sprepTrie
+ * object itself and frees the mappingData array.
+ */
+extern void
+cleanUpData(void) {
+    /* the trie: close it first, then release the object it lived in */
+    utrie_close(sprepTrie);
+    uprv_free(sprepTrie);
+
+    /* the array filled by storeMappingData() */
+    uprv_free(mappingData);
+}
+
+#endif /* #if !UCONFIG_NO_IDNA */
+
+/*
+ * Hey, Emacs, please set the following:
+ *
+ * Local Variables:
+ * indent-tabs-mode: nil
+ * End:
+ *
+ */