diff options
Diffstat (limited to 'intl/icu/source/tools/gensprep')
-rw-r--r-- | intl/icu/source/tools/gensprep/Makefile.in | 97 | ||||
-rwxr-xr-x | intl/icu/source/tools/gensprep/filterRFC3454.pl | 678 | ||||
-rw-r--r-- | intl/icu/source/tools/gensprep/gensprep.8.in | 104 | ||||
-rw-r--r-- | intl/icu/source/tools/gensprep/gensprep.c | 460 | ||||
-rw-r--r-- | intl/icu/source/tools/gensprep/gensprep.h | 83 | ||||
-rw-r--r-- | intl/icu/source/tools/gensprep/gensprep.vcxproj | 84 | ||||
-rw-r--r-- | intl/icu/source/tools/gensprep/gensprep.vcxproj.filters | 30 | ||||
-rw-r--r-- | intl/icu/source/tools/gensprep/sources.txt | 2 | ||||
-rw-r--r-- | intl/icu/source/tools/gensprep/store.c | 653 |
9 files changed, 2191 insertions, 0 deletions
diff --git a/intl/icu/source/tools/gensprep/Makefile.in b/intl/icu/source/tools/gensprep/Makefile.in new file mode 100644 index 0000000000..7f475aeb56 --- /dev/null +++ b/intl/icu/source/tools/gensprep/Makefile.in @@ -0,0 +1,97 @@ +## Makefile.in for ICU - tools/gensprep +## Copyright (C) 2016 and later: Unicode, Inc. and others. +## License & terms of use: http://www.unicode.org/copyright.html +## Copyright (c) 2001-2011, International Business Machines Corporation and +## others. All Rights Reserved. +## Steven R. Loomis/Markus W. Scherer + +## Source directory information +srcdir = @srcdir@ +top_srcdir = @top_srcdir@ + +top_builddir = ../.. + +include $(top_builddir)/icudefs.mk + +## Build directory information +subdir = tools/gensprep + +TARGET_STUB_NAME = gensprep + +SECTION = 8 + +MAN_FILES = $(TARGET_STUB_NAME).$(SECTION) + + +## Extra files to remove for 'make clean' +CLEANFILES = *~ $(DEPS) $(MAN_FILES) + +## Target information +TARGET = $(BINDIR)/$(TARGET_STUB_NAME)$(EXEEXT) + +CPPFLAGS += -I$(top_srcdir)/common -I$(srcdir)/../toolutil +LIBS = $(LIBICUTOOLUTIL) $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS) $(LIB_M) + +SOURCES = $(shell cat $(srcdir)/sources.txt) +OBJECTS = $(SOURCES:.c=.o) + +DEPS = $(OBJECTS:.o=.d) + +## List of phony targets +.PHONY : all all-local install install-local clean clean-local \ +distclean distclean-local dist dist-local check check-local install-man + +## Clear suffix list +.SUFFIXES : + +## List of standard targets +all: all-local +install: install-local +clean: clean-local +distclean : distclean-local +dist: dist-local +check: all check-local + +all-local: $(TARGET) $(MAN_FILES) + +install-local: all-local install-man + $(MKINSTALLDIRS) $(DESTDIR)$(sbindir) + $(INSTALL) $(TARGET) $(DESTDIR)$(sbindir) + +install-man: $(MAN_FILES) + $(MKINSTALLDIRS) $(DESTDIR)$(mandir)/man$(SECTION) + $(INSTALL_DATA) $? 
$(DESTDIR)$(mandir)/man$(SECTION) + +dist-local: + +clean-local: + test -z "$(CLEANFILES)" || $(RMV) $(CLEANFILES) + $(RMV) $(TARGET) $(OBJECTS) + +distclean-local: clean-local + $(RMV) Makefile + +check-local: all-local + +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + cd $(top_builddir) \ + && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status + +$(TARGET) : $(OBJECTS) + $(LINK.cc) $(OUTOPT)$@ $^ $(LIBS) + $(POST_BUILD_STEP) + + +%.$(SECTION): $(srcdir)/%.$(SECTION).in + cd $(top_builddir) \ + && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status + + +ifeq (,$(MAKECMDGOALS)) +-include $(DEPS) +else +ifneq ($(patsubst %clean,,$(MAKECMDGOALS)),) +-include $(DEPS) +endif +endif + diff --git a/intl/icu/source/tools/gensprep/filterRFC3454.pl b/intl/icu/source/tools/gensprep/filterRFC3454.pl new file mode 100755 index 0000000000..321b03512c --- /dev/null +++ b/intl/icu/source/tools/gensprep/filterRFC3454.pl @@ -0,0 +1,678 @@ +#!/usr/bin/perl +# Copyright (C) 2016 and later: Unicode, Inc. and others. +# License & terms of use: http://www.unicode.org/copyright.html +# Copyright (c) 2001-2015 International Business Machines +# Corporation and others. All Rights Reserved. + +#################################################################################### +# filterRFC3454.pl: +# This tool filters the RFC-3454 txt file for StringPrep tables and creates a table +# to be used in NamePrepProfile +# +# Author: Ram Viswanadha +# +#################################################################################### + +use File::Find; +use File::Basename; +use IO::File; +use Cwd; +use File::Copy; +use Getopt::Long; +use File::Path; +use File::Copy; +use Time::localtime; + +$icu_copyright = "#####################################################################\n# Copyright (c) %d, International Business Machines Corporation and\n# others. 
All Rights Reserved.\n#####################################################################\n\n"; +$copyright = "###################\n# This file was generated from RFC 3454 (http://www.ietf.org/rfc/rfc3454.txt)\n# Copyright (C) The Internet Society (2002). All Rights Reserved. \n###################\n\n"; +$warning = "###################\n# WARNING: This table is generated by filterRFC3454.pl tool with\n# options: @ARGV \n###################\n\n"; +#run the program) +main(); + +#--------------------------------------------------------------------- +# The main program + +sub main(){ + GetOptions( + "--sourcedir=s" => \$sourceDir, + "--destdir=s" => \$destDir, + "--src-filename=s" => \$srcFileName, + "--dest-filename=s" => \$destFileName, + "--A1" => \$a1, + "--B1" => \$b1, + "--B2" => \$b2, + "--B3" => \$b3, + "--C11" => \$c11, + "--C12" => \$c12, + "--C21" => \$c21, + "--C22" => \$c22, + "--C3" => \$c3, + "--C4" => \$c4, + "--C5" => \$c5, + "--C6" => \$c6, + "--C7" => \$c7, + "--C8" => \$c8, + "--C9" => \$c9, + "--iscsi" => \$writeISCSIProhibitedExtra, + "--xmpp-node" => \$writeXMPPNodeProhibitedExtra, + "--sasl" => \$writeSASLMap, + "--ldap" => \$writeLDAPMap, + "--normalize" => \$norm, + "--check-bidi" => \$checkBidi, + ); + usage() unless defined $sourceDir; + usage() unless defined $destDir; + usage() unless defined $srcFileName; + usage() unless defined $destFileName; + + $infile = $sourceDir."/".$srcFileName; + $inFH = IO::File->new($infile,"r") + or die "could not open the file $infile for reading: $! \n"; + $outfile = $destDir."/".$destFileName; + + unlink($outfile); + $outFH = IO::File->new($outfile,"a") + or die "could not open the file $outfile for writing: $! 
\n"; + + printf $outFH $icu_copyright, localtime->year()+1900; + print $outFH $copyright; + print $outFH $warning; + + if(defined $norm) { + print $outFH "\@normalize;;\n"; + } + if(defined $checkBidi) { + print $outFH "\@check-bidi;;\n"; + } + print $outFH "\n"; + close($outFH); + + if(defined $b2 && defined $b3){ + die "ERROR: --B2 and --B3 are both specified\!\n"; + } + + while(defined ($line=<$inFH>)){ + next unless $line=~ /Start\sTable/; + if($line =~ /A.1/){ + createUnassignedTable($inFH,$outfile); + } + if($line =~ /B.1/ && defined $b1){ + createMapToNothing($inFH,$outfile); + } + if($line =~ /B.2/ && defined $b2){ + createCaseMapNorm($inFH,$outfile); + } + if($line =~ /B.3/ && defined $b3){ + createCaseMapNoNorm($inFH,$outfile); + } + if($line =~ /C.1.1/ && defined $c11 ){ + createProhibitedTable($inFH,$outfile,$line); + } + if($line =~ /C.1.2/ && defined $c12 ){ + createProhibitedTable($inFH,$outfile,$line); + } + if($line =~ /C.2.1/ && defined $c21 ){ + createProhibitedTable($inFH,$outfile,$line); + } + if($line =~ /C.2.2/ && defined $c22 ){ + createProhibitedTable($inFH,$outfile,$line); + } + if($line =~ /C.3/ && defined $c3 ){ + createProhibitedTable($inFH,$outfile,$line); + } + if($line =~ /C.4/ && defined $c4 ){ + createProhibitedTable($inFH,$outfile,$line); + } + if($line =~ /C.5/ && defined $c5 ){ + createProhibitedTable($inFH,$outfile,$line); + } + if($line =~ /C.6/ && defined $c6 ){ + createProhibitedTable($inFH,$outfile,$line); + } + if($line =~ /C.7/ && defined $c7 ){ + createProhibitedTable($inFH,$outfile,$line); + } + if($line =~ /C.8/ && defined $c8 ){ + createProhibitedTable($inFH,$outfile,$line); + } + if($line =~ /C.9/ && defined $c9 ){ + createProhibitedTable($inFH,$outfile,$line); + } + } + if( defined $writeISCSIProhibitedExtra){ + create_iSCSIExtraProhibitedTable($inFH, $outfile); + } + if( defined $writeXMPPNodeProhibitedExtra){ + create_XMPPNodeExtraProhibitedTable($inFH, $outfile); + } + if( defined $writeSASLMap){ + 
create_SASLMapTable($inFH, $outfile); + } + if( defined $writeLDAPMap){ + create_LDAPMapTable($inFH, $outfile); + } + close($inFH); +} + +#----------------------------------------------------------------------- +sub readPrint{ + local ($inFH, $outFH,$comment, $table) = @_; + $count = 0; + print $outFH $comment."\n"; + while(defined ($line = <$inFH>)){ + next if $line =~ /Hoffman\s\&\sBlanchet/; # ignore heading + next if $line =~ /RFC\s3454/; # ignore heading + next if $line =~ /\f/; # ignore form feed + next if $line eq "\n"; # ignore blank lines + # break if "End Table" is found + if( $line =~ /End\sTable/){ + print $outFH "\n# Total code points $count\n\n"; + return; + } + if($print==1){ + print $line; + } + $line =~ s/-/../; + $line =~ s/^\s+//; + if($line =~ /\;/){ + }else{ + $line =~ s/$/;/; + } + if($table =~ /A/ ){ + ($code, $noise) = split /;/ , $line; + $line = $code."; ; UNASSIGNED\n"; + }elsif ( $table =~ /B\.1/ ){ + $line =~ s/Map to nothing/MAP/; + }elsif ( $table =~ /B\.[23]/ ){ + $line =~ s/Case map/MAP/; + $line =~ s/Additional folding/MAP/; + }elsif ( $table =~ /C/ ) { + ($code, $noise) = split /;/ , $line; + $line = $code."; ; PROHIBITED\n"; + } + if($line =~ /\.\./){ + ($code, $noise) = split /;/ , $line; + ($startStr, $endStr ) = split /\.\./, $code; + $start = atoi($startStr); + $end = atoi($endStr); + #print $start." ".$end."\n"; + while($start <= $end){ + $count++; + $start++; + } + }else{ + $count++; + } + print $outFH $line; + } +} +#----------------------------------------------------------------------- +sub atoi { # convert a string of hexadecimal digits (e.g. "024F") to its numeric value + my $t = 0; + foreach my $d (split(//, shift())) { + $t = $t * 16 + hex($d); # hex() maps A-F to 10..15; a bare letter would numify to 0 + } + return $t; +} +#----------------------------------------------------------------------- +sub createUnassignedTable{ + ($inFH,$outfile) = @_; + $outFH = IO::File->new($outfile,"a") + or die "could not open the file $outfile for writing: $! 
\n"; + $comment = "# This table contains code points from Table A.1 from RFC 3454\n"; + readPrint($inFH,$outFH, $comment, "A"); + close($outFH); +} +#----------------------------------------------------------------------- +sub createMapToNothing{ + ($inFH,$outfile) = @_; + $outFH = IO::File->new($outfile,"a") + or die "could not open the file $outfile for writing: $! \n"; + $comment = "# This table contains code points from Table B.1 from RFC 3454\n"; + readPrint($inFH,$outFH,$comment, "B.1"); + close($outFH); +} +#----------------------------------------------------------------------- +sub createCaseMapNorm{ + ($inFH,$outfile) = @_; + $outFH = IO::File->new($outfile,"a") + or die "could not open the file $outfile for writing: $! \n"; + $comment = $warning."# This table contains code points from Table B.2 from RFC 3454\n"; + readPrint($inFH,$outFH,$comment, "B.2"); + close($outFH); +} +#----------------------------------------------------------------------- +sub createCaseMapNoNorm{ + ($inFH,$outfile) = @_; + $outFH = IO::File->new($outfile,"a") + or die "could not open the file $outfile for writing: $! \n"; + $comment = $warning."# This table contains code points from Table B.3 from RFC 3454\n"; + readPrint($inFH,$outFH,$comment, "B.3"); + close($outFH); +} +#----------------------------------------------------------------------- +sub createProhibitedTable{ + ($inFH,$outfile,$line) = @_; + $line =~ s/Start//; + $line =~ s/-//g; + $comment = "# code points from $line"; + + $outFH = IO::File->new($outfile, "a") + or die "could not open the file $outfile for writing: $! \n"; + readPrint($inFH,$outFH,$comment, "C"); + close($outFH); +} + +#----------------------------------------------------------------------- +sub create_iSCSIExtraProhibitedTable{ + ($inFH,$outfile,$line) = @_; + $comment ="# Additional prohibitions from iSCSI profile (rfc3722.txt)\n\n"; + + $outFH = IO::File->new($outfile, "a") + or die "could not open the file $outfile for writing: $! 
\n"; + print $outFH $comment; + print $outFH "0021..002C; ; PROHIBITED\n"; + print $outFH "002F; ; PROHIBITED\n"; + print $outFH "003B..0040; ; PROHIBITED\n"; + print $outFH "005B..0060; ; PROHIBITED\n"; + print $outFH "007B..007E; ; PROHIBITED\n"; + print $outFH "3002; ; PROHIBITED\n"; + print $outFH "\n# Total code points 30\n"; + close($outFH); +} +#----------------------------------------------------------------------- +sub create_XMPPNodeExtraProhibitedTable{ + ($inFH,$outfile,$line) = @_; + $comment ="# Additional prohibitions from XMPP Nodeprep profile (rfc3920.txt)\n\n"; + + $outFH = IO::File->new($outfile, "a") + or die "could not open the file $outfile for writing: $! \n"; + print $outFH $comment; + print $outFH "0022; ; PROHIBITED\n"; + print $outFH "0026; ; PROHIBITED\n"; + print $outFH "0027; ; PROHIBITED\n"; + print $outFH "002F; ; PROHIBITED\n"; + print $outFH "003A; ; PROHIBITED\n"; + print $outFH "003C; ; PROHIBITED\n"; + print $outFH "003E; ; PROHIBITED\n"; + print $outFH "0040; ; PROHIBITED\n"; + print $outFH "\n# Total code points 8\n"; + close($outFH); +} +#----------------------------------------------------------------------- +sub create_SASLMapTable{ + ($inFH,$outfile,$line) = @_; + $comment ="# Map table for SASL profile (rfc4013.txt)\n\n"; + + $outFH = IO::File->new($outfile, "a") + or die "could not open the file $outfile for writing: $! 
\n"; + print $outFH $comment; + # non-ASCII space characters [C.1.2] to SPACE + print $outFH "00A0; 0020; MAP\n"; + print $outFH "1680; 0020; MAP\n"; + print $outFH "2000; 0020; MAP\n"; + print $outFH "2001; 0020; MAP\n"; + print $outFH "2002; 0020; MAP\n"; + print $outFH "2003; 0020; MAP\n"; + print $outFH "2004; 0020; MAP\n"; + print $outFH "2005; 0020; MAP\n"; + print $outFH "2006; 0020; MAP\n"; + print $outFH "2007; 0020; MAP\n"; + print $outFH "2008; 0020; MAP\n"; + print $outFH "2009; 0020; MAP\n"; + print $outFH "200A; 0020; MAP\n"; + print $outFH "200B; 0020; MAP\n"; + print $outFH "202F; 0020; MAP\n"; + print $outFH "205F; 0020; MAP\n"; + print $outFH "3000; 0020; MAP\n"; + + # commonly mapped to nothing characters except U+200B to nothing + print $outFH "00AD; ; MAP\n"; + print $outFH "034F; ; MAP\n"; + print $outFH "1806; ; MAP\n"; + print $outFH "180B; ; MAP\n"; + print $outFH "180C; ; MAP\n"; + print $outFH "180D; ; MAP\n"; + print $outFH "200C; ; MAP\n"; + print $outFH "200D; ; MAP\n"; + print $outFH "2060; ; MAP\n"; + print $outFH "FE00; ; MAP\n"; + print $outFH "FE01; ; MAP\n"; + print $outFH "FE02; ; MAP\n"; + print $outFH "FE03; ; MAP\n"; + print $outFH "FE04; ; MAP\n"; + print $outFH "FE05; ; MAP\n"; + print $outFH "FE06; ; MAP\n"; + print $outFH "FE07; ; MAP\n"; + print $outFH "FE08; ; MAP\n"; + print $outFH "FE09; ; MAP\n"; + print $outFH "FE0A; ; MAP\n"; + print $outFH "FE0B; ; MAP\n"; + print $outFH "FE0C; ; MAP\n"; + print $outFH "FE0D; ; MAP\n"; + print $outFH "FE0E; ; MAP\n"; + print $outFH "FE0F; ; MAP\n"; + print $outFH "FEFF; ; MAP\n"; + print $outFH "\n# Total code points 43\n"; + close($outFH); +} +#----------------------------------------------------------------------- +sub create_LDAPMapTable{ + ($inFH,$outfile,$line) = @_; + $comment ="# Map table for LDAP profile (rfc4518.txt)\n\n"; + + $outFH = IO::File->new($outfile, "a") + or die "could not open the file $outfile for writing: $! 
\n"; + print $outFH $comment; + + # SOFT HYPHEN (U+00AD) and MONGOLIAN TODO SOFT HYPHEN (U+1806) code + # points are mapped to nothing. COMBINING GRAPHEME JOINER (U+034F) and + # VARIATION SELECTORs (U+180B-180D, FE00-FE0F) code points are also + # mapped to nothing. The OBJECT REPLACEMENT CHARACTER (U+FFFC) is + # mapped to nothing. + + print $outFH "00AD; ; MAP\n"; + print $outFH "034F; ; MAP\n"; + print $outFH "1806; ; MAP\n"; + print $outFH "180B; ; MAP\n"; + print $outFH "180C; ; MAP\n"; + print $outFH "180D; ; MAP\n"; + print $outFH "FE00; ; MAP\n"; + print $outFH "FE01; ; MAP\n"; + print $outFH "FE02; ; MAP\n"; + print $outFH "FE03; ; MAP\n"; + print $outFH "FE04; ; MAP\n"; + print $outFH "FE05; ; MAP\n"; + print $outFH "FE06; ; MAP\n"; + print $outFH "FE07; ; MAP\n"; + print $outFH "FE08; ; MAP\n"; + print $outFH "FE09; ; MAP\n"; + print $outFH "FE0A; ; MAP\n"; + print $outFH "FE0B; ; MAP\n"; + print $outFH "FE0C; ; MAP\n"; + print $outFH "FE0D; ; MAP\n"; + print $outFH "FE0E; ; MAP\n"; + print $outFH "FE0F; ; MAP\n"; + print $outFH "FFFC; ; MAP\n"; + +# CHARACTER TABULATION (U+0009), LINE FEED (LF) (U+000A), LINE +# TABULATION (U+000B), FORM FEED (FF) (U+000C), CARRIAGE RETURN (CR) +# (U+000D), and NEXT LINE (NEL) (U+0085) are mapped to SPACE (U+0020). + + print $outFH "0009; 0020; MAP\n"; + print $outFH "000A; 0020; MAP\n"; + print $outFH "000B; 0020; MAP\n"; + print $outFH "000C; 0020; MAP\n"; + print $outFH "000D; 0020; MAP\n"; + print $outFH "0085; 0020; MAP\n"; + + # All other control code (e.g., Cc) points or code points with a + # control function (e.g., Cf) are mapped to nothing. The following is + # a complete list of these code points: U+0000-0008, 000E-001F, 007F- + # 0084, 0086-009F, 06DD, 070F, 180E, 200C-200F, 202A-202E, 2060-2063, + # 206A-206F, FEFF, FFF9-FFFB, 1D173-1D17A, E0001, E0020-E007F. 
+ + print $outFH "0000; ; MAP\n"; + print $outFH "0001; ; MAP\n"; + print $outFH "0002; ; MAP\n"; + print $outFH "0003; ; MAP\n"; + print $outFH "0004; ; MAP\n"; + print $outFH "0005; ; MAP\n"; + print $outFH "0006; ; MAP\n"; + print $outFH "0007; ; MAP\n"; + print $outFH "0008; ; MAP\n"; + print $outFH "000E; ; MAP\n"; + print $outFH "000F; ; MAP\n"; + print $outFH "0010; ; MAP\n"; + print $outFH "0011; ; MAP\n"; + print $outFH "0012; ; MAP\n"; + print $outFH "0013; ; MAP\n"; + print $outFH "0014; ; MAP\n"; + print $outFH "0015; ; MAP\n"; + print $outFH "0016; ; MAP\n"; + print $outFH "0017; ; MAP\n"; + print $outFH "0018; ; MAP\n"; + print $outFH "0019; ; MAP\n"; + print $outFH "001A; ; MAP\n"; + print $outFH "001B; ; MAP\n"; + print $outFH "001C; ; MAP\n"; + print $outFH "001D; ; MAP\n"; + print $outFH "001E; ; MAP\n"; + print $outFH "001F; ; MAP\n"; + print $outFH "007F; ; MAP\n"; + print $outFH "0080; ; MAP\n"; + print $outFH "0081; ; MAP\n"; + print $outFH "0082; ; MAP\n"; + print $outFH "0083; ; MAP\n"; + print $outFH "0084; ; MAP\n"; + print $outFH "0086; ; MAP\n"; + print $outFH "0087; ; MAP\n"; + print $outFH "0088; ; MAP\n"; + print $outFH "0089; ; MAP\n"; + print $outFH "008A; ; MAP\n"; + print $outFH "008B; ; MAP\n"; + print $outFH "008C; ; MAP\n"; + print $outFH "008D; ; MAP\n"; + print $outFH "008E; ; MAP\n"; + print $outFH "008F; ; MAP\n"; + print $outFH "0090; ; MAP\n"; + print $outFH "0091; ; MAP\n"; + print $outFH "0092; ; MAP\n"; + print $outFH "0093; ; MAP\n"; + print $outFH "0094; ; MAP\n"; + print $outFH "0095; ; MAP\n"; + print $outFH "0096; ; MAP\n"; + print $outFH "0097; ; MAP\n"; + print $outFH "0098; ; MAP\n"; + print $outFH "0099; ; MAP\n"; + print $outFH "009A; ; MAP\n"; + print $outFH "009B; ; MAP\n"; + print $outFH "009C; ; MAP\n"; + print $outFH "009D; ; MAP\n"; + print $outFH "009E; ; MAP\n"; + print $outFH "009F; ; MAP\n"; + print $outFH "06DD; ; MAP\n"; + print $outFH "070F; ; MAP\n"; + print $outFH "180E; ; MAP\n"; + print 
$outFH "200C; ; MAP\n"; + print $outFH "200D; ; MAP\n"; + print $outFH "200E; ; MAP\n"; + print $outFH "200F; ; MAP\n"; + print $outFH "202A; ; MAP\n"; + print $outFH "202B; ; MAP\n"; + print $outFH "202C; ; MAP\n"; + print $outFH "202D; ; MAP\n"; + print $outFH "202E; ; MAP\n"; + print $outFH "2060; ; MAP\n"; + print $outFH "2061; ; MAP\n"; + print $outFH "2062; ; MAP\n"; + print $outFH "2063; ; MAP\n"; + print $outFH "206A; ; MAP\n"; + print $outFH "206B; ; MAP\n"; + print $outFH "206C; ; MAP\n"; + print $outFH "206D; ; MAP\n"; + print $outFH "206E; ; MAP\n"; + print $outFH "206F; ; MAP\n"; + print $outFH "FEFF; ; MAP\n"; + print $outFH "FFF9; ; MAP\n"; + print $outFH "FFFA; ; MAP\n"; + print $outFH "FFFB; ; MAP\n"; + print $outFH "1D173; ; MAP\n"; + print $outFH "1D174; ; MAP\n"; + print $outFH "1D175; ; MAP\n"; + print $outFH "1D176; ; MAP\n"; + print $outFH "1D177; ; MAP\n"; + print $outFH "1D178; ; MAP\n"; + print $outFH "1D179; ; MAP\n"; + print $outFH "1D17A; ; MAP\n"; + print $outFH "E0001; ; MAP\n"; + print $outFH "E0020; ; MAP\n"; + print $outFH "E0021; ; MAP\n"; + print $outFH "E0022; ; MAP\n"; + print $outFH "E0023; ; MAP\n"; + print $outFH "E0024; ; MAP\n"; + print $outFH "E0025; ; MAP\n"; + print $outFH "E0026; ; MAP\n"; + print $outFH "E0027; ; MAP\n"; + print $outFH "E0028; ; MAP\n"; + print $outFH "E0029; ; MAP\n"; + print $outFH "E002A; ; MAP\n"; + print $outFH "E002B; ; MAP\n"; + print $outFH "E002C; ; MAP\n"; + print $outFH "E002D; ; MAP\n"; + print $outFH "E002E; ; MAP\n"; + print $outFH "E002F; ; MAP\n"; + print $outFH "E0030; ; MAP\n"; + print $outFH "E0031; ; MAP\n"; + print $outFH "E0032; ; MAP\n"; + print $outFH "E0033; ; MAP\n"; + print $outFH "E0034; ; MAP\n"; + print $outFH "E0035; ; MAP\n"; + print $outFH "E0036; ; MAP\n"; + print $outFH "E0037; ; MAP\n"; + print $outFH "E0038; ; MAP\n"; + print $outFH "E0039; ; MAP\n"; + print $outFH "E003A; ; MAP\n"; + print $outFH "E003B; ; MAP\n"; + print $outFH "E003C; ; MAP\n"; + print $outFH 
"E003D; ; MAP\n"; + print $outFH "E003E; ; MAP\n"; + print $outFH "E003F; ; MAP\n"; + print $outFH "E0040; ; MAP\n"; + print $outFH "E0041; ; MAP\n"; + print $outFH "E0042; ; MAP\n"; + print $outFH "E0043; ; MAP\n"; + print $outFH "E0044; ; MAP\n"; + print $outFH "E0045; ; MAP\n"; + print $outFH "E0046; ; MAP\n"; + print $outFH "E0047; ; MAP\n"; + print $outFH "E0048; ; MAP\n"; + print $outFH "E0049; ; MAP\n"; + print $outFH "E004A; ; MAP\n"; + print $outFH "E004B; ; MAP\n"; + print $outFH "E004C; ; MAP\n"; + print $outFH "E004D; ; MAP\n"; + print $outFH "E004E; ; MAP\n"; + print $outFH "E004F; ; MAP\n"; + print $outFH "E0050; ; MAP\n"; + print $outFH "E0051; ; MAP\n"; + print $outFH "E0052; ; MAP\n"; + print $outFH "E0053; ; MAP\n"; + print $outFH "E0054; ; MAP\n"; + print $outFH "E0055; ; MAP\n"; + print $outFH "E0056; ; MAP\n"; + print $outFH "E0057; ; MAP\n"; + print $outFH "E0058; ; MAP\n"; + print $outFH "E0059; ; MAP\n"; + print $outFH "E005A; ; MAP\n"; + print $outFH "E005B; ; MAP\n"; + print $outFH "E005C; ; MAP\n"; + print $outFH "E005D; ; MAP\n"; + print $outFH "E005E; ; MAP\n"; + print $outFH "E005F; ; MAP\n"; + print $outFH "E0060; ; MAP\n"; + print $outFH "E0061; ; MAP\n"; + print $outFH "E0062; ; MAP\n"; + print $outFH "E0063; ; MAP\n"; + print $outFH "E0064; ; MAP\n"; + print $outFH "E0065; ; MAP\n"; + print $outFH "E0066; ; MAP\n"; + print $outFH "E0067; ; MAP\n"; + print $outFH "E0068; ; MAP\n"; + print $outFH "E0069; ; MAP\n"; + print $outFH "E006A; ; MAP\n"; + print $outFH "E006B; ; MAP\n"; + print $outFH "E006C; ; MAP\n"; + print $outFH "E006D; ; MAP\n"; + print $outFH "E006E; ; MAP\n"; + print $outFH "E006F; ; MAP\n"; + print $outFH "E0070; ; MAP\n"; + print $outFH "E0071; ; MAP\n"; + print $outFH "E0072; ; MAP\n"; + print $outFH "E0073; ; MAP\n"; + print $outFH "E0074; ; MAP\n"; + print $outFH "E0075; ; MAP\n"; + print $outFH "E0076; ; MAP\n"; + print $outFH "E0077; ; MAP\n"; + print $outFH "E0078; ; MAP\n"; + print $outFH "E0079; ; MAP\n"; + 
print $outFH "E007A; ; MAP\n"; + print $outFH "E007B; ; MAP\n"; + print $outFH "E007C; ; MAP\n"; + print $outFH "E007D; ; MAP\n"; + print $outFH "E007E; ; MAP\n"; + print $outFH "E007F; ; MAP\n"; + + # ZERO WIDTH SPACE (U+200B) is mapped to nothing. All other code + # points with Separator (space, line, or paragraph) property (e.g., Zs, + # Zl, or Zp) are mapped to SPACE (U+0020). The following is a complete + # list of these code points: U+0020, 00A0, 1680, 2000-200A, 2028-2029, + # 202F, 205F, 3000. + + print $outFH "200B; ; MAP\n"; + print $outFH "00A0; 0020; MAP\n"; + print $outFH "1680; 0020; MAP\n"; + print $outFH "2000; 0020; MAP\n"; + print $outFH "2001; 0020; MAP\n"; + print $outFH "2002; 0020; MAP\n"; + print $outFH "2003; 0020; MAP\n"; + print $outFH "2004; 0020; MAP\n"; + print $outFH "2005; 0020; MAP\n"; + print $outFH "2006; 0020; MAP\n"; + print $outFH "2007; 0020; MAP\n"; + print $outFH "2008; 0020; MAP\n"; + print $outFH "2009; 0020; MAP\n"; + print $outFH "200A; 0020; MAP\n"; + print $outFH "2028; 0020; MAP\n"; + print $outFH "2029; 0020; MAP\n"; + print $outFH "202F; 0020; MAP\n"; + print $outFH "205F; 0020; MAP\n"; + print $outFH "3000; 0020; MAP\n"; + + print $outFH "\n# Total code points 238\n"; + close($outFH); +} +#----------------------------------------------------------------------- +sub usage { + print << "END"; +Usage: +filterRFC3454.pl +Options: + --sourcedir=<directory> + --destdir=<directory> + --src-filename=<name of RFC file> + --dest-filename=<name of destination file> + --A1 Generate data for table A.1 + --B1 Generate data for table B.1 + --B2 Generate data for table B.2 + --B3 Generate data for table B.3 + --C11 Generate data for table C.1.1 + --C12 Generate data for table C.1.2 + --C21 Generate data for table C.2.1 + --C22 Generate data for table C.2.2 + --C3 Generate data for table C.3 + --C4 Generate data for table C.4 + --C5 Generate data for table C.5 + --C6 Generate data for table C.6 + --C7 Generate data for table C.7 + 
--C8 Generate data for table C.8 + --C9 Generate data for table C.9 + --iscsi Generate data for iSCSI extra prohibited table + --xmpp-node Generate data for XMPP extra prohibited table + --sasl Generate data for SASL map table + --ldap Generate data for LDAP map table + --normalize Embed the normalization directive in the output file + --check-bidi Embed the check bidi directove in the output file + +Note, --B2 and --B3 are mutually exclusive. + +e.g.: filterRFC3454.pl --sourcedir=. --destdir=./output --src-filename=rfc3454.txt --dest-filename=NamePrepProfile.txt --A1 --B1 --B2 --C12 --C22 --C3 --C4 --C5 --C6 --C7 --C8 --C9 --normalize --check-bidi + +filterRFC3454.pl filters the RFC file and creates String prep table files. +The RFC text can be downloaded from ftp://ftp.rfc-editor.org/in-notes/rfc3454.txt + +END + exit(0); +} + + diff --git a/intl/icu/source/tools/gensprep/gensprep.8.in b/intl/icu/source/tools/gensprep/gensprep.8.in new file mode 100644 index 0000000000..e1e9fb32e2 --- /dev/null +++ b/intl/icu/source/tools/gensprep/gensprep.8.in @@ -0,0 +1,104 @@ +.\" Hey, Emacs! This is -*-nroff-*- you know... +.\" +.\" gensprep.8: manual page for the gensprep utility +.\" +.\" Copyright (C) 2016 and later: Unicode, Inc. and others. +.\" License & terms of use: http://www.unicode.org/copyright.html +.\" Copyright (C) 2003 IBM, Inc. and others. +.\" +.TH gensprep 8 "18 March 2003" "ICU MANPAGE" "ICU @VERSION@ Manual" +.SH NAME +.B gensprep +\- compile StringPrep data from files filtered by filterRFC3454.pl +.SH SYNOPSIS +.B gensprep +[ +.BR "\-h\fP, \fB\-?\fP, \fB\-\-help" +] +[ +.BR "\-v\fP, \fB\-\-verbose" +] +[ +.BI "\-c\fP, \fB\-\-copyright" +] +[ +.BI "\-s\fP, \fB\-\-sourcedir" " source" +] +[ +.BI "\-d\fP, \fB\-\-destdir" " destination" +] +.SH DESCRIPTION +.B gensprep +reads filtered RFC 3454 files and compiles their +information into a binary form. 
+The resulting file, +.BR <name>.icu , +can then be read directly by ICU, or used by +.BR pkgdata (8) +for incorporation into a larger archive or library. +.LP +The files read by +.B gensprep +are described in the +.B FILES +section. +.SH OPTIONS +.TP +.BR "\-h\fP, \fB\-?\fP, \fB\-\-help" +Print help about usage and exit. +.TP +.BR "\-v\fP, \fB\-\-verbose" +Display extra informative messages during execution. +.TP +.BI "\-c\fP, \fB\-\-copyright" +Include a copyright notice into the binary data. +.TP +.BI "\-s\fP, \fB\-\-sourcedir" " source" +Set the source directory to +.IR source . +The default source directory is specified by the environment variable +.BR ICU_DATA . +.TP +.BI "\-d\fP, \fB\-\-destdir" " destination" +Set the destination directory to +.IR destination . +The default destination directory is specified by the environment variable +.BR ICU_DATA . +.SH ENVIRONMENT +.TP 10 +.B ICU_DATA +Specifies the directory containing ICU data. Defaults to +.BR @thepkgicudatadir@/@PACKAGE@/@VERSION@/ . +Some tools in ICU depend on the presence of the trailing slash. It is thus +important to make sure that it is present if +.B ICU_DATA +is set. +.SH FILES +The following files are read by +.B gensprep +and are looked for in the +.I source +/misc for rfc3454_*.txt files and in +.I source +/unidata for NormalizationCorrections.txt. +.TP 20 +.B rfc3454_A_1.txt +Contains the list of unassigned codepoints in Unicode version 3.2.0.\|.\|.. +.TP +.B rfc3454_B_1.txt +Contains the list of code points that are commonly mapped to nothing.\|.\|.. +.TP +.B rfc3454_B_2.txt +Contains the list of mappings for casefolding of code points when Normalization form NFKC is specified.\|.\|.. +.TP +.B rfc3454_C_X.txt +Contains the list of code points that are prohibited for IDNA. +.TP +.B NormalizationCorrections.txt +Contains the list of code points whose normalization has changed since Unicode Version 3.2.0. +.SH VERSION +@VERSION@ +.SH COPYRIGHT +Copyright (C) 2000-2002 IBM, Inc. and others. 
+.SH SEE ALSO +.BR pkgdata (8) diff --git a/intl/icu/source/tools/gensprep/gensprep.c b/intl/icu/source/tools/gensprep/gensprep.c new file mode 100644 index 0000000000..10b0e45390 --- /dev/null +++ b/intl/icu/source/tools/gensprep/gensprep.c @@ -0,0 +1,460 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 2003-2016, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: gensprep.c +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2003-02-06 +* created by: Ram Viswanadha +* +* This program reads the Profile.txt files, +* parses them, and extracts the data for StringPrep profile. +* It then preprocesses it and writes a binary file for efficient use +* in various StringPrep conversion processes. 
+*/ + +#define USPREP_TYPE_NAMES_ARRAY 1 + +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> + +#include "cmemory.h" +#include "cstring.h" +#include "toolutil.h" +#include "unewdata.h" +#include "uoptions.h" +#include "uparse.h" +#include "sprpimpl.h" + +#include "unicode/uclean.h" +#include "unicode/udata.h" +#include "unicode/utypes.h" +#include "unicode/putil.h" + + +U_CDECL_BEGIN +#include "gensprep.h" +U_CDECL_END + +UBool beVerbose=false, haveCopyright=true; + +#define NORM_CORRECTIONS_FILE_NAME "NormalizationCorrections.txt" + +#define NORMALIZE_DIRECTIVE "normalize" +#define NORMALIZE_DIRECTIVE_LEN 9 +#define CHECK_BIDI_DIRECTIVE "check-bidi" +#define CHECK_BIDI_DIRECTIVE_LEN 10 + +/* prototypes --------------------------------------------------------------- */ + +static void +parseMappings(const char *filename, UBool reportError, UErrorCode *pErrorCode); + +static void +parseNormalizationCorrections(const char *filename, UErrorCode *pErrorCode); + + +/* -------------------------------------------------------------------------- */ + +static UOption options[]={ + UOPTION_HELP_H, + UOPTION_HELP_QUESTION_MARK, + UOPTION_VERBOSE, + UOPTION_COPYRIGHT, + UOPTION_DESTDIR, + UOPTION_SOURCEDIR, + UOPTION_ICUDATADIR, + UOPTION_BUNDLE_NAME, + { "normalization", NULL, NULL, NULL, 'n', UOPT_REQUIRES_ARG, 0 }, + { "norm-correction", NULL, NULL, NULL, 'm', UOPT_REQUIRES_ARG, 0 }, + { "check-bidi", NULL, NULL, NULL, 'k', UOPT_NO_ARG, 0}, + { "unicode", NULL, NULL, NULL, 'u', UOPT_REQUIRES_ARG, 0 }, +}; + +enum{ + HELP, + HELP_QUESTION_MARK, + VERBOSE, + COPYRIGHT, + DESTDIR, + SOURCEDIR, + ICUDATADIR, + BUNDLE_NAME, + NORMALIZE, + NORM_CORRECTION_DIR, + CHECK_BIDI, + UNICODE_VERSION +}; + +static int printHelp(int argc, char* argv[]){ + /* + * Broken into chucks because the C89 standard says the minimum + * required supported string length is 509 bytes. 
+ */ + fprintf(stderr, + "Usage: %s [-options] [file_name]\n" + "\n" + "Read the files specified and\n" + "create a binary file [package-name]_[bundle-name]." DATA_TYPE " with the StringPrep profile data\n" + "\n", + argv[0]); + fprintf(stderr, + "Options:\n" + "\t-h or -? or --help print this usage text\n" + "\t-v or --verbose verbose output\n" + "\t-c or --copyright include a copyright notice\n"); + fprintf(stderr, + "\t-d or --destdir destination directory, followed by the path\n" + "\t-s or --sourcedir source directory of ICU data, followed by the path\n" + "\t-b or --bundle-name generate the output data file with the name specified\n" + "\t-i or --icudatadir directory for locating any needed intermediate data files,\n" + "\t followed by path, defaults to %s\n", + u_getDataDirectory()); + fprintf(stderr, + "\t-n or --normalize turn on the option for normalization and include mappings\n" + "\t from NormalizationCorrections.txt from the given path,\n" + "\t e.g: /test/icu/source/data/unidata\n"); + fprintf(stderr, + "\t-m or --norm-correction use NormalizationCorrections.txt from the given path\n" + "\t when the input file contains a normalization directive.\n" + "\t unlike -n/--normalize, this option does not force the\n" + "\t normalization.\n"); + fprintf(stderr, + "\t-k or --check-bidi turn on the option for checking for BiDi in the profile\n" + "\t-u or --unicode version of Unicode to be used with this profile followed by the version\n" + ); + return argc<0 ? 
U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; +} + + +extern int +main(int argc, char* argv[]) { +#if !UCONFIG_NO_IDNA + char* filename = NULL; +#endif + const char *srcDir=NULL, *destDir=NULL, *icuUniDataDir=NULL; + const char *bundleName=NULL, *inputFileName = NULL; + char *basename=NULL; + int32_t sprepOptions = 0; + + UErrorCode errorCode=U_ZERO_ERROR; + + U_MAIN_INIT_ARGS(argc, argv); + + /* preset then read command line options */ + options[DESTDIR].value=u_getDataDirectory(); + options[SOURCEDIR].value=""; + options[UNICODE_VERSION].value="0"; /* don't assume the unicode version */ + options[BUNDLE_NAME].value = DATA_NAME; + options[NORMALIZE].value = ""; + + argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options); + + /* error handling, printing usage message */ + if(argc<0) { + fprintf(stderr, + "error in command line argument \"%s\"\n", + argv[-argc]); + } + if(argc<0 || options[HELP].doesOccur || options[HELP_QUESTION_MARK].doesOccur) { + return printHelp(argc, argv); + + } + + /* get the options values */ + beVerbose=options[VERBOSE].doesOccur; + haveCopyright=options[COPYRIGHT].doesOccur; + srcDir=options[SOURCEDIR].value; + destDir=options[DESTDIR].value; + bundleName = options[BUNDLE_NAME].value; + if(options[NORMALIZE].doesOccur) { + icuUniDataDir = options[NORMALIZE].value; + } else { + icuUniDataDir = options[NORM_CORRECTION_DIR].value; + } + + if(argc<2) { + /* print the help message */ + return printHelp(argc, argv); + } else { + inputFileName = argv[1]; + } + if(!options[UNICODE_VERSION].doesOccur){ + return printHelp(argc, argv); + } + if(options[ICUDATADIR].doesOccur) { + u_setDataDirectory(options[ICUDATADIR].value); + } +#if UCONFIG_NO_IDNA + + fprintf(stderr, + "gensprep writes dummy " U_ICUDATA_NAME "_" DATA_NAME "." 
DATA_TYPE + " because UCONFIG_NO_IDNA is set, \n" + "see icu/source/common/unicode/uconfig.h\n"); + generateData(destDir, bundleName); + +#else + + setUnicodeVersion(options[UNICODE_VERSION].value); + filename = (char* ) uprv_malloc(uprv_strlen(srcDir) + uprv_strlen(inputFileName) + (icuUniDataDir == NULL ? 0 : uprv_strlen(icuUniDataDir)) + 40); /* hopefully this should be enough */ + + /* prepare the filename beginning with the source dir */ + if(uprv_strchr(srcDir,U_FILE_SEP_CHAR) == NULL && uprv_strchr(srcDir,U_FILE_ALT_SEP_CHAR) == NULL){ + filename[0] = '.'; + filename[1] = U_FILE_SEP_CHAR; + uprv_strcpy(filename+2,srcDir); + }else{ + uprv_strcpy(filename, srcDir); + } + + basename=filename+uprv_strlen(filename); + if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) { + *basename++=U_FILE_SEP_CHAR; + } + + /* initialize */ + init(); + + /* process the file */ + uprv_strcpy(basename,inputFileName); + parseMappings(filename,false, &errorCode); + if(U_FAILURE(errorCode)) { + fprintf(stderr, "Could not open file %s for reading. 
Error: %s \n", filename, u_errorName(errorCode)); + return errorCode; + } + + if(options[NORMALIZE].doesOccur){ /* this option might be set by @normalize;; in the source file */ + /* set up directory for NormalizationCorrections.txt */ + uprv_strcpy(filename,icuUniDataDir); + basename=filename+uprv_strlen(filename); + if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) { + *basename++=U_FILE_SEP_CHAR; + } + + *basename++=U_FILE_SEP_CHAR; + uprv_strcpy(basename,NORM_CORRECTIONS_FILE_NAME); + + parseNormalizationCorrections(filename,&errorCode); + if(U_FAILURE(errorCode)){ + fprintf(stderr,"Could not open file %s for reading \n", filename); + return errorCode; + } + sprepOptions |= _SPREP_NORMALIZATION_ON; + } + + if(options[CHECK_BIDI].doesOccur){ /* this option might be set by @check-bidi;; in the source file */ + sprepOptions |= _SPREP_CHECK_BIDI_ON; + } + + setOptions(sprepOptions); + + /* process parsed data */ + if(U_SUCCESS(errorCode)) { + /* write the data file */ + generateData(destDir, bundleName); + + cleanUpData(); + } + + uprv_free(filename); + + u_cleanup(); + +#endif + + return errorCode; +} + +#if !UCONFIG_NO_IDNA + +static void U_CALLCONV +normalizationCorrectionsLineFn(void *context, + char *fields[][2], int32_t fieldCount, + UErrorCode *pErrorCode) { + (void)context; // suppress compiler warnings about unused variable + (void)fieldCount; // suppress compiler warnings about unused variable + uint32_t mapping[40]; + char *end, *s; + uint32_t code; + int32_t length; + UVersionInfo version; + UVersionInfo thisVersion; + + /* get the character code, field 0 */ + code=(uint32_t)uprv_strtoul(fields[0][0], &end, 16); + if(U_FAILURE(*pErrorCode)) { + fprintf(stderr, "gensprep: error parsing NormalizationCorrections.txt mapping at %s\n", fields[0][0]); + exit(*pErrorCode); + } + /* Original (erroneous) decomposition */ + s = fields[1][0]; + + /* parse the mapping string */ + length=u_parseCodePoints(s, mapping, sizeof(mapping)/4, pErrorCode); + + /* 
ignore corrected decomposition */ + + u_versionFromString(version,fields[3][0] ); + u_versionFromString(thisVersion, "3.2.0"); + + + + if(U_FAILURE(*pErrorCode)) { + fprintf(stderr, "gensprep error parsing NormalizationCorrections.txt of U+%04lx - %s\n", + (long)code, u_errorName(*pErrorCode)); + exit(*pErrorCode); + } + + /* store the mapping */ + if( version[0] > thisVersion[0] || + ((version[0]==thisVersion[0]) && (version[1] > thisVersion[1])) + ){ + storeMapping(code,mapping, length, USPREP_MAP, pErrorCode); + } + setUnicodeVersionNC(version); +} + +static void +parseNormalizationCorrections(const char *filename, UErrorCode *pErrorCode) { + char *fields[4][2]; + + if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { + return; + } + + u_parseDelimitedFile(filename, ';', fields, 4, normalizationCorrectionsLineFn, NULL, pErrorCode); + + /* fprintf(stdout,"Number of code points that have NormalizationCorrections mapping with length >1 : %i\n",len); */ + + if(U_FAILURE(*pErrorCode) && ( *pErrorCode!=U_FILE_ACCESS_ERROR)) { + fprintf(stderr, "gensprep error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode)); + exit(*pErrorCode); + } +} + +static void U_CALLCONV +strprepProfileLineFn(void *context, + char *fields[][2], int32_t fieldCount, + UErrorCode *pErrorCode) { + (void)fieldCount; // suppress compiler warnings about unused variable + uint32_t mapping[40]; + char *end, *map; + uint32_t code; + int32_t length; + /*UBool* mapWithNorm = (UBool*) context;*/ + const char* typeName; + uint32_t rangeStart=0,rangeEnd =0; + const char* filename = (const char*) context; + const char *s; + + s = u_skipWhitespace(fields[0][0]); + if (*s == '@') { + /* special directive */ + s++; + length = (int32_t)(fields[0][1] - s); + if (length >= NORMALIZE_DIRECTIVE_LEN + && uprv_strncmp(s, NORMALIZE_DIRECTIVE, NORMALIZE_DIRECTIVE_LEN) == 0) { + options[NORMALIZE].doesOccur = true; + return; + } + else if (length >= CHECK_BIDI_DIRECTIVE_LEN + && uprv_strncmp(s, 
CHECK_BIDI_DIRECTIVE, CHECK_BIDI_DIRECTIVE_LEN) == 0) { + options[CHECK_BIDI].doesOccur = true; + return; + } + else { + fprintf(stderr, "gensprep error parsing a directive %s.", fields[0][0]); + } + } + + typeName = fields[2][0]; + map = fields[1][0]; + + if(uprv_strstr(typeName, usprepTypeNames[USPREP_UNASSIGNED])!=NULL){ + + u_parseCodePointRange(s, &rangeStart,&rangeEnd, pErrorCode); + if(U_FAILURE(*pErrorCode)){ + fprintf(stderr, "Could not parse code point range. Error: %s\n",u_errorName(*pErrorCode)); + return; + } + + /* store the range */ + storeRange(rangeStart,rangeEnd,USPREP_UNASSIGNED, pErrorCode); + + }else if(uprv_strstr(typeName, usprepTypeNames[USPREP_PROHIBITED])!=NULL){ + + u_parseCodePointRange(s, &rangeStart,&rangeEnd, pErrorCode); + if(U_FAILURE(*pErrorCode)){ + fprintf(stderr, "Could not parse code point range. Error: %s\n",u_errorName(*pErrorCode)); + return; + } + + /* store the range */ + storeRange(rangeStart,rangeEnd,USPREP_PROHIBITED, pErrorCode); + + }else if(uprv_strstr(typeName, usprepTypeNames[USPREP_MAP])!=NULL){ + + /* get the character code, field 0 */ + code=(uint32_t)uprv_strtoul(s, &end, 16); + if(end<=s || end!=fields[0][1]) { + fprintf(stderr, "gensprep: syntax error in field 0 at %s\n", fields[0][0]); + *pErrorCode=U_PARSE_ERROR; + exit(U_PARSE_ERROR); + } + + /* parse the mapping string */ + length=u_parseCodePoints(map, mapping, sizeof(mapping)/4, pErrorCode); + + /* store the mapping */ + storeMapping(code,mapping, length,USPREP_MAP, pErrorCode); + + }else{ + *pErrorCode = U_INVALID_FORMAT_ERROR; + } + + if(U_FAILURE(*pErrorCode)) { + fprintf(stderr, "gensprep error parsing %s line %s at %s. 
Error: %s\n",filename, + fields[0][0],fields[2][0],u_errorName(*pErrorCode)); + exit(*pErrorCode); + } + +} + +static void +parseMappings(const char *filename, UBool reportError, UErrorCode *pErrorCode) { + char *fields[3][2]; + + if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { + return; + } + + u_parseDelimitedFile(filename, ';', fields, 3, strprepProfileLineFn, (void*)filename, pErrorCode); + + /*fprintf(stdout,"Number of code points that have mappings with length >1 : %i\n",len);*/ + + if(U_FAILURE(*pErrorCode) && (reportError || *pErrorCode!=U_FILE_ACCESS_ERROR)) { + fprintf(stderr, "gensprep error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode)); + exit(*pErrorCode); + } +} + + +#endif /* #if !UCONFIG_NO_IDNA */ + +/* + * Hey, Emacs, please set the following: + * + * Local Variables: + * indent-tabs-mode: nil + * End: + * + */ diff --git a/intl/icu/source/tools/gensprep/gensprep.h b/intl/icu/source/tools/gensprep/gensprep.h new file mode 100644 index 0000000000..a2e5e61f9a --- /dev/null +++ b/intl/icu/source/tools/gensprep/gensprep.h @@ -0,0 +1,83 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 1999-2006, International Business Machines +* Corporation and others. All Rights Reserved. 
+* +******************************************************************************* +* file name: gensprep.h +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2003-02-06 +* created by: Ram Viswanadha +*/ + +#ifndef __GENIDN_H__ +#define __GENIDN_H__ + +#include "unicode/utypes.h" +#include "sprpimpl.h" + +/* file definitions */ +#define DATA_NAME "sprep" +#define DATA_TYPE "spp" + +/* + * data structure that holds the IDN properties for one or more + * code point(s) at build time + */ + + +/* global flags */ +extern UBool beVerbose, haveCopyright; + +/* prototypes */ + +extern void +setUnicodeVersion(const char *v); + +extern void +setUnicodeVersionNC(UVersionInfo version); + +extern void +init(void); + +#if !UCONFIG_NO_IDNA +extern void +storeMapping(uint32_t codepoint, uint32_t* mapping,int32_t length, UStringPrepType type, UErrorCode* status); +extern void +storeRange(uint32_t start, uint32_t end, UStringPrepType type,UErrorCode* status); +#endif + +extern void +generateData(const char *dataDir, const char* bundleName); + +extern void +setOptions(int32_t options); + +extern void +cleanUpData(void); + +/* +extern void +storeIDN(uint32_t code, IDN *idn); + +extern void +processData(void); + + +*/ +#endif + +/* + * Hey, Emacs, please set the following: + * + * Local Variables: + * indent-tabs-mode: nil + * End: + * + */ diff --git a/intl/icu/source/tools/gensprep/gensprep.vcxproj b/intl/icu/source/tools/gensprep/gensprep.vcxproj new file mode 100644 index 0000000000..c6f7bbd861 --- /dev/null +++ b/intl/icu/source/tools/gensprep/gensprep.vcxproj @@ -0,0 +1,84 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <PropertyGroup Label="Globals"> + <ProjectGuid>{631C23CE-6C1D-4875-88F0-85E0A42B36EA}</ProjectGuid> + </PropertyGroup> + <PropertyGroup Label="Configuration"> + <ConfigurationType>Application</ConfigurationType> + 
<UseOfMfc>false</UseOfMfc> + <CharacterSet>MultiByte</CharacterSet> + </PropertyGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" /> + <!-- The following import will include the 'default' configuration options for VS projects. --> + <Import Project="..\..\allinone\Build.Windows.ProjectConfiguration.props" /> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> + <ImportGroup Label="ExtensionSettings"> + </ImportGroup> + <PropertyGroup Label="UserMacros" /> + <PropertyGroup> + <_ProjectFileVersion>10.0.30319.1</_ProjectFileVersion> + <OutDir>.\$(Platform)\$(Configuration)\</OutDir> + <IntDir>.\$(Platform)\$(Configuration)\</IntDir> + <!-- The ICU projects use "Win32" to mean "x86", so we need to special case it. --> + <OutDir Condition="'$(Platform)'=='Win32'">.\x86\$(Configuration)\</OutDir> + <IntDir Condition="'$(Platform)'=='Win32'">.\x86\$(Configuration)\</IntDir> + <!-- Disable Incremental Linking for Release builds as it prevents Link-time Code Generation --> + <LinkIncremental Condition="'$(Configuration)'=='Debug'">true</LinkIncremental> + <LinkIncremental Condition="'$(Configuration)'=='Release'">false</LinkIncremental> + </PropertyGroup> + <!-- Options that are common to *all* configurations --> + <ItemDefinitionGroup> + <Midl> + <TypeLibraryName>$(OutDir)\gensprep.tlb</TypeLibraryName> + </Midl> + <ClCompile> + <WarningLevel>Level3</WarningLevel> + <CompileAs>Default</CompileAs> + <DisableLanguageExtensions>false</DisableLanguageExtensions> + <AdditionalIncludeDirectories>..\..\common;..\toolutil;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> + <PrecompiledHeaderOutputFile>$(OutDir)\gensprep.pch</PrecompiledHeaderOutputFile> + <AssemblerListingLocation>$(OutDir)/</AssemblerListingLocation> + <ObjectFileName>$(OutDir)/</ObjectFileName> + <ProgramDataBaseFileName>$(OutDir)\gensprep.pdb</ProgramDataBaseFileName> + </ClCompile> + <Link> + <SubSystem>Console</SubSystem> + 
<OutputFile>$(OutDir)\gensprep.exe</OutputFile> + <AdditionalLibraryDirectories>..\..\..\$(IcuLibOutputDir);%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories> + </Link> + <CustomBuildStep> + <Command>copy "$(TargetPath)" ..\..\..\$(IcuBinOutputDir)</Command> + <Outputs>..\..\..\$(IcuBinOutputDir)\$(TargetFileName);%(Outputs)</Outputs> + </CustomBuildStep> + </ItemDefinitionGroup> + <!-- Options that are common to all 'Debug' project configurations --> + <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'"> + <ClCompile> + <BrowseInformation>true</BrowseInformation> + <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary> + </ClCompile> + <Link> + <AdditionalDependencies>icuucd.lib;icutud.lib;%(AdditionalDependencies)</AdditionalDependencies> + </Link> + </ItemDefinitionGroup> + <!-- Options that are common to all 'Release' project configurations --> + <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'"> + <ClCompile> + <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary> + <FunctionLevelLinking>true</FunctionLevelLinking> + </ClCompile> + <Link> + <AdditionalDependencies>icuuc.lib;icutu.lib;%(AdditionalDependencies)</AdditionalDependencies> + </Link> + </ItemDefinitionGroup> + <ItemGroup> + <ClCompile Include="gensprep.c" /> + <ClCompile Include="store.c" /> + </ItemGroup> + <ItemGroup> + <ClInclude Include="gensprep.h" /> + </ItemGroup> + <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> + <ImportGroup Label="ExtensionTargets"> + </ImportGroup> +</Project>
\ No newline at end of file diff --git a/intl/icu/source/tools/gensprep/gensprep.vcxproj.filters b/intl/icu/source/tools/gensprep/gensprep.vcxproj.filters new file mode 100644 index 0000000000..2791b3aa6a --- /dev/null +++ b/intl/icu/source/tools/gensprep/gensprep.vcxproj.filters @@ -0,0 +1,30 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <ItemGroup> + <Filter Include="Source Files"> + <UniqueIdentifier>{bb521e6b-d70a-4efd-9399-408729059da6}</UniqueIdentifier> + <Extensions>cpp;c;cxx;rc;def;r;odl;idl;hpj;bat</Extensions> + </Filter> + <Filter Include="Header Files"> + <UniqueIdentifier>{837c7f4e-341d-4455-aa1e-f6ff7a03b065}</UniqueIdentifier> + <Extensions>h;hpp;hxx;hm;inl</Extensions> + </Filter> + <Filter Include="Resource Files"> + <UniqueIdentifier>{a80f327a-7fb8-4737-8bd9-0f4b26c2c344}</UniqueIdentifier> + <Extensions>ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe</Extensions> + </Filter> + </ItemGroup> + <ItemGroup> + <ClCompile Include="gensprep.c"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="store.c"> + <Filter>Source Files</Filter> + </ClCompile> + </ItemGroup> + <ItemGroup> + <ClInclude Include="gensprep.h"> + <Filter>Header Files</Filter> + </ClInclude> + </ItemGroup> +</Project>
\ No newline at end of file diff --git a/intl/icu/source/tools/gensprep/sources.txt b/intl/icu/source/tools/gensprep/sources.txt new file mode 100644 index 0000000000..c369456cb3 --- /dev/null +++ b/intl/icu/source/tools/gensprep/sources.txt @@ -0,0 +1,2 @@ +gensprep.c +store.c diff --git a/intl/icu/source/tools/gensprep/store.c b/intl/icu/source/tools/gensprep/store.c new file mode 100644 index 0000000000..c3712febb4 --- /dev/null +++ b/intl/icu/source/tools/gensprep/store.c @@ -0,0 +1,653 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +******************************************************************************* +* +* Copyright (C) 1999-2014, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: store.c +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2003-02-06 +* created by: Ram Viswanadha +* +*/ + +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include "unicode/utypes.h" +#include "cmemory.h" +#include "cstring.h" +#include "filestrm.h" +#include "toolutil.h" +#include "unicode/udata.h" +#include "unicode/utf16.h" +#include "utrie.h" +#include "unewdata.h" +#include "gensprep.h" +#include "uhash.h" + + +#define DO_DEBUG_OUT 0 + + +/* + * StringPrep profile file format ------------------------------------ + * + * The file format prepared and written here contains a 16-bit trie and a mapping table. + * + * Before the data contents described below, there are the headers required by + * the udata API for loading ICU data. Especially, a UDataInfo structure + * precedes the actual data. It contains platform properties values and the + * file format version. + * + * The following is a description of format version 2. 
+ * + * Data contents: + * + * The content is a parsed, binary form of RFC3454 and possibly + * NormalizationCorrections.txt depending on the options specified in the profile. + * + * Any Unicode code point from 0 to 0x10ffff can be looked up to get + * the trie-word, if any, for that code point. This means that the inputs + * to the lookup are 21-bit unsigned integers, with not all of the + * 21-bit range used. + * + * *.spp files customarily begin with a UDataInfo structure, see udata.h and .c. + * After that there are the following structures: + * + * int32_t indexes[_SPREP_INDEX_TOP]; -- _SPREP_INDEX_TOP=16, see enum in sprpimpl.h file + * + * UTrie stringPrepTrie; -- size in bytes=indexes[_SPREP_INDEX_TRIE_SIZE] + * + * uint16_t mappingTable[]; -- Contains the sequence of code units that the code point maps to + * size in bytes = indexes[_SPREP_INDEX_MAPPING_DATA_SIZE] + * + * The indexes array contains the following values: + * indexes[_SPREP_INDEX_TRIE_SIZE] -- The size of the StringPrep trie in bytes + * indexes[_SPREP_INDEX_MAPPING_DATA_SIZE] -- The size of the mappingTable in bytes + * indexes[_SPREP_NORM_CORRECTNS_LAST_UNI_VERSION] -- The Unicode version of the last entry in NormalizationCorrections.txt, packed one byte per version field + * indexes[_SPREP_ONE_UCHAR_MAPPING_INDEX_START] -- The starting index of 1 UChar mapping index in the mapping table + * indexes[_SPREP_TWO_UCHARS_MAPPING_INDEX_START] -- The starting index of 2 UChars mapping index in the mapping table + * indexes[_SPREP_THREE_UCHARS_MAPPING_INDEX_START] -- The starting index of 3 UChars mapping index in the mapping table + * indexes[_SPREP_FOUR_UCHARS_MAPPING_INDEX_START] -- The starting index of 4 UChars mapping index in the mapping table + * indexes[_SPREP_OPTIONS] -- Bit set of options to turn on in the profile, e.g: USPREP_NORMALIZATION_ON, USPREP_CHECK_BIDI_ON + * + * + * StringPrep Trie : + * + * The StringPrep trie is a 16-bit trie that contains data for the profile. 
+ * Each code point is associated with a value (trie-word) in the trie. + * + * - structure of data words from the trie + * + * i) A value greater than or equal to _SPREP_TYPE_THRESHOLD (0xFFF0) + * represents the type associated with the code point + * if(trieWord >= _SPREP_TYPE_THRESHOLD){ + * type = trieWord - 0xFFF0; + * } + * The type can be : + * USPREP_UNASSIGNED + * USPREP_PROHIBITED + * USPREP_DELETE + * + * ii) A value less than _SPREP_TYPE_THRESHOLD means the type is USPREP_MAP and + * contains the bit distribution described below + * + * 0 - ON : The code point is prohibited (USPREP_PROHIBITED). This is to allow for code points that are both prohibited and mapped. + * 1 - ON : The value in the next 14 bits is an index into the mapping table + * OFF: The value in the next 14 bits is a delta value from the code point + * 2..15 - Contains data as described by bit 1. If all bits are set + * (value = _SPREP_MAX_INDEX_VALUE) then the type is USPREP_DELETE + * + * + * Mapping Table: + * The data in the mapping table is sorted according to the length of the mapping sequence. 
+ * If the type of the code point is USPREP_MAP and value in trie word is an index, the index + * is compared with start indexes of sequence length start to figure out the length according to + * the following algorithm: + * + * if( index >= indexes[_SPREP_ONE_UCHAR_MAPPING_INDEX_START] && + * index < indexes[_SPREP_TWO_UCHARS_MAPPING_INDEX_START]){ + * length = 1; + * }else if(index >= indexes[_SPREP_TWO_UCHARS_MAPPING_INDEX_START] && + * index < indexes[_SPREP_THREE_UCHARS_MAPPING_INDEX_START]){ + * length = 2; + * }else if(index >= indexes[_SPREP_THREE_UCHARS_MAPPING_INDEX_START] && + * index < indexes[_SPREP_FOUR_UCHARS_MAPPING_INDEX_START]){ + * length = 3; + * }else{ + * // The first position in the mapping table contains the length + * // of the sequence + * length = mappingTable[index++]; + * + * } + * + */ + +/* file data ---------------------------------------------------------------- */ +/* indexes[] value names */ + +#if UCONFIG_NO_IDNA + +/* dummy UDataInfo cf. udata.h */ +static UDataInfo dataInfo = { + sizeof(UDataInfo), + 0, + + U_IS_BIG_ENDIAN, + U_CHARSET_FAMILY, + U_SIZEOF_UCHAR, + 0, + + { 0, 0, 0, 0 }, /* dummy dataFormat */ + { 0, 0, 0, 0 }, /* dummy formatVersion */ + { 0, 0, 0, 0 } /* dummy dataVersion */ +}; + +#else + +static int32_t indexes[_SPREP_INDEX_TOP]={ 0 }; + +static uint16_t* mappingData= NULL; +static int32_t mappingDataCapacity = 0; /* we skip the first index in mapping data */ +static int16_t currentIndex = 0; /* the current index into the data trie */ +static int32_t maxLength = 0; /* maximum length of mapping string */ + + +/* UDataInfo cf. 
udata.h */ +static UDataInfo dataInfo={ + sizeof(UDataInfo), + 0, + + U_IS_BIG_ENDIAN, + U_CHARSET_FAMILY, + U_SIZEOF_UCHAR, + 0, + + { 0x53, 0x50, 0x52, 0x50 }, /* dataFormat="SPRP" */ + { 3, 2, UTRIE_SHIFT, UTRIE_INDEX_SHIFT }, /* formatVersion */ + { 3, 2, 0, 0 } /* dataVersion (Unicode version) */ +}; +void +setUnicodeVersion(const char *v) { + UVersionInfo version; + u_versionFromString(version, v); + uprv_memcpy(dataInfo.dataVersion, version, 4); +} + +void +setUnicodeVersionNC(UVersionInfo version){ + uint32_t univer = version[0] << 24; + univer += version[1] << 16; + univer += version[2] << 8; + univer += version[3]; + indexes[_SPREP_NORM_CORRECTNS_LAST_UNI_VERSION] = univer; +} +static UNewTrie *sprepTrie; + +#define MAX_DATA_LENGTH 11500 + + +#define SPREP_DELTA_RANGE_POSITIVE_LIMIT 8191 +#define SPREP_DELTA_RANGE_NEGATIVE_LIMIT -8192 + + +extern void +init() { + + sprepTrie = (UNewTrie *)uprv_calloc(1, sizeof(UNewTrie)); + + /* initialize the two tries */ + if(NULL==utrie_open(sprepTrie, NULL, MAX_DATA_LENGTH, 0, 0, false)) { + fprintf(stderr, "error: failed to initialize tries\n"); + exit(U_MEMORY_ALLOCATION_ERROR); + } +} + +static UHashtable* hashTable = NULL; + + +typedef struct ValueStruct { + UChar* mapping; + int16_t length; + UStringPrepType type; +} ValueStruct; + +/* Callback for deleting the value from the hashtable */ +static void U_CALLCONV valueDeleter(void* obj){ + ValueStruct* value = (ValueStruct*) obj; + uprv_free(value->mapping); + uprv_free(value); +} + +/* Callback for hashing the entry */ +static int32_t U_CALLCONV hashEntry(const UHashTok parm) { + return parm.integer; +} + +/* Callback for comparing two entries */ +static UBool U_CALLCONV compareEntries(const UHashTok p1, const UHashTok p2) { + return (UBool)(p1.integer != p2.integer); +} + + +static void +storeMappingData(void){ + + int32_t pos = UHASH_FIRST; + const UHashElement* element = NULL; + ValueStruct* value = NULL; + int32_t codepoint = 0; + int32_t elementCount = 0; + 
int32_t writtenElementCount = 0; + int32_t mappingLength = 1; /* minimum mapping length */ + int32_t oldMappingLength = 0; + uint16_t trieWord =0; + int32_t limitIndex = 0; + + if (hashTable == NULL) { + return; + } + elementCount = uhash_count(hashTable); + + /*initialize the mapping data */ + mappingData = (uint16_t*) uprv_calloc(mappingDataCapacity, U_SIZEOF_UCHAR); + + while(writtenElementCount < elementCount){ + + while( (element = uhash_nextElement(hashTable, &pos))!=NULL){ + + codepoint = element->key.integer; + value = (ValueStruct*)element->value.pointer; + + /* store the start of indexes */ + if(oldMappingLength != mappingLength){ + /* Assume that index[] is used according to the enums defined */ + if(oldMappingLength <=_SPREP_MAX_INDEX_TOP_LENGTH){ + indexes[_SPREP_NORM_CORRECTNS_LAST_UNI_VERSION+mappingLength] = currentIndex; + } + if(oldMappingLength <= _SPREP_MAX_INDEX_TOP_LENGTH && + mappingLength == _SPREP_MAX_INDEX_TOP_LENGTH +1){ + + limitIndex = currentIndex; + + } + oldMappingLength = mappingLength; + } + + if(value->length == mappingLength){ + uint32_t savedTrieWord = 0; + trieWord = currentIndex << 2; + /* turn on the 2nd bit to signal that the following bits contain an index */ + trieWord += 0x02; + + if(trieWord > _SPREP_TYPE_THRESHOLD){ + fprintf(stderr,"trieWord cannot contain value greater than 0x%04X.\n",_SPREP_TYPE_THRESHOLD); + exit(U_ILLEGAL_CHAR_FOUND); + } + /* figure out if the code point has type already stored */ + savedTrieWord= utrie_get32(sprepTrie,codepoint,NULL); + if(savedTrieWord!=0){ + if((savedTrieWord- _SPREP_TYPE_THRESHOLD) == USPREP_PROHIBITED){ + /* turn on the first bit in trie word */ + trieWord += 0x01; + }else{ + /* + * the codepoint has value something other than prohibited + * and a mapping .. error! 
+ */ + fprintf(stderr,"Type for codepoint \\U%08X already set!.\n", (int)codepoint); + exit(U_ILLEGAL_ARGUMENT_ERROR); + } + } + + /* now set the value in the trie */ + if(!utrie_set32(sprepTrie,codepoint,trieWord)){ + fprintf(stderr,"Could not set the value for code point.\n"); + exit(U_ILLEGAL_ARGUMENT_ERROR); + } + + /* written the trie word for the codepoint... increment the count*/ + writtenElementCount++; + + /* sanity check are we exceeding the max number allowed */ + if(currentIndex+value->length+1 > _SPREP_MAX_INDEX_VALUE){ + fprintf(stderr, "Too many entries in the mapping table %i. Maximum allowed is %i\n", + currentIndex+value->length, _SPREP_MAX_INDEX_VALUE); + exit(U_INDEX_OUTOFBOUNDS_ERROR); + } + + /* copy the mapping data */ + /* write the length */ + if(mappingLength > _SPREP_MAX_INDEX_TOP_LENGTH ){ + /* the cast here is safe since we donot expect the length to be > 65535 */ + mappingData[currentIndex++] = (uint16_t) mappingLength; + } + /* copy the contents to mappindData array */ + u_memmove(mappingData+currentIndex, value->mapping, value->length); + currentIndex += value->length; + if (currentIndex > mappingDataCapacity) { + /* If this happens there is a bug in the computation of the mapping data size in storeMapping() */ + fprintf(stderr, "gensprep, fatal error at %s, %d. 
Aborting.\n", __FILE__, __LINE__); + exit(U_INTERNAL_PROGRAM_ERROR); + } + } + } + mappingLength++; + pos = -1; + } + /* set the last length for range check */ + if(mappingLength <= _SPREP_MAX_INDEX_TOP_LENGTH){ + indexes[_SPREP_NORM_CORRECTNS_LAST_UNI_VERSION+mappingLength] = currentIndex+1; + }else{ + indexes[_SPREP_FOUR_UCHARS_MAPPING_INDEX_START] = limitIndex; + } + +} + +extern void setOptions(int32_t options){ + indexes[_SPREP_OPTIONS] = options; +} +extern void +storeMapping(uint32_t codepoint, uint32_t* mapping,int32_t length, + UStringPrepType type, UErrorCode* status){ + + + UChar* map = NULL; + int16_t adjustedLen=0, i, j; + uint16_t trieWord = 0; + ValueStruct *value = NULL; + uint32_t savedTrieWord = 0; + + /* initialize the hashtable */ + if(hashTable==NULL){ + hashTable = uhash_open(hashEntry, compareEntries, NULL, status); + uhash_setValueDeleter(hashTable, valueDeleter); + } + + /* figure out if the code point has type already stored */ + savedTrieWord= utrie_get32(sprepTrie,codepoint,NULL); + if(savedTrieWord!=0){ + if((savedTrieWord- _SPREP_TYPE_THRESHOLD) == USPREP_PROHIBITED){ + /* turn on the first bit in trie word */ + trieWord += 0x01; + }else{ + /* + * the codepoint has value something other than prohibited + * and a mapping .. error! 
+ */ + fprintf(stderr,"Type for codepoint \\U%08X already set!.\n", (int)codepoint); + exit(U_ILLEGAL_ARGUMENT_ERROR); + } + } + + /* figure out the real length */ + for(i=0; i<length; i++){ + adjustedLen += U16_LENGTH(mapping[i]); + } + + if(adjustedLen == 0){ + trieWord = (uint16_t)(_SPREP_MAX_INDEX_VALUE << 2); + /* make sure that the value of trieWord is less than the threshold */ + if(trieWord < _SPREP_TYPE_THRESHOLD){ + /* now set the value in the trie */ + if(!utrie_set32(sprepTrie,codepoint,trieWord)){ + fprintf(stderr,"Could not set the value for code point.\n"); + exit(U_ILLEGAL_ARGUMENT_ERROR); + } + /* value is set so just return */ + return; + }else{ + fprintf(stderr,"trieWord cannot contain value greater than threshold 0x%04X.\n",_SPREP_TYPE_THRESHOLD); + exit(U_ILLEGAL_CHAR_FOUND); + } + } + + if(adjustedLen == 1){ + /* calculate the delta */ + int16_t delta = (int16_t)((int32_t)codepoint - (int16_t) mapping[0]); + if(delta >= SPREP_DELTA_RANGE_NEGATIVE_LIMIT && delta <= SPREP_DELTA_RANGE_POSITIVE_LIMIT){ + + trieWord = delta; + trieWord <<= 2; + + + /* make sure that the second bit is OFF */ + if((trieWord & 0x02) != 0 ){ + fprintf(stderr,"The second bit in the trie word is not zero while storing a delta.\n"); + exit(U_INTERNAL_PROGRAM_ERROR); + } + /* make sure that the value of trieWord is less than the threshold */ + if(trieWord < _SPREP_TYPE_THRESHOLD){ + /* now set the value in the trie */ + if(!utrie_set32(sprepTrie,codepoint,trieWord)){ + fprintf(stderr,"Could not set the value for code point.\n"); + exit(U_ILLEGAL_ARGUMENT_ERROR); + } + /* value is set so just return */ + return; + } + } + /* + * if the delta is not in the given range or if the trieWord is larger than the threshold + * just fall through for storing the mapping in the mapping table + */ + } + + map = (UChar*) uprv_calloc(adjustedLen + 1, U_SIZEOF_UCHAR); + + for (i=0, j=0; i<length; i++) { + U16_APPEND_UNSAFE(map, j, mapping[i]); + } + + value = (ValueStruct*) 
uprv_malloc(sizeof(ValueStruct)); + value->mapping = map; + value->type = type; + value->length = adjustedLen; + if(value->length > _SPREP_MAX_INDEX_TOP_LENGTH){ + mappingDataCapacity++; + } + if(maxLength < value->length){ + maxLength = value->length; + } + uhash_iput(hashTable,codepoint,value,status); + mappingDataCapacity += adjustedLen; + + if(U_FAILURE(*status)){ + fprintf(stderr, "Failed to put entries into the hash table. Error: %s\n", u_errorName(*status)); + exit(*status); + } +} + + +extern void +storeRange(uint32_t start, uint32_t end, UStringPrepType type, UErrorCode* status){ + (void)status; // suppress compiler warnings about unused variable + uint16_t trieWord = 0; + + if((int)(_SPREP_TYPE_THRESHOLD + type) > 0xFFFF){ + fprintf(stderr,"trieWord cannot contain value greater than 0xFFFF.\n"); + exit(U_ILLEGAL_CHAR_FOUND); + } + trieWord = (_SPREP_TYPE_THRESHOLD + type); /* the top 4 bits contain the value */ + if(start == end){ + uint32_t savedTrieWord = utrie_get32(sprepTrie, start, NULL); + if(savedTrieWord>0){ + if(savedTrieWord < _SPREP_TYPE_THRESHOLD && type == USPREP_PROHIBITED){ + /* + * A mapping is stored in the trie word + * and the only other possible type that a + * code point can have is USPREP_PROHIBITED + * + */ + + /* turn on the 0th bit in the savedTrieWord */ + savedTrieWord += 0x01; + + /* the downcast is safe since we only save 16 bit values */ + trieWord = (uint16_t)savedTrieWord; + + /* make sure that the value of trieWord is less than the threshold */ + if(trieWord < _SPREP_TYPE_THRESHOLD){ + /* now set the value in the trie */ + if(!utrie_set32(sprepTrie,start,trieWord)){ + fprintf(stderr,"Could not set the value for code point.\n"); + exit(U_ILLEGAL_ARGUMENT_ERROR); + } + /* value is set so just return */ + return; + }else{ + fprintf(stderr,"trieWord cannot contain value greater than threshold 0x%04X.\n",_SPREP_TYPE_THRESHOLD); + exit(U_ILLEGAL_CHAR_FOUND); + } + + }else if(savedTrieWord != trieWord){ + fprintf(stderr,"Value 
for codepoint \\U%08X already set!.\n", (int)start); + exit(U_ILLEGAL_ARGUMENT_ERROR); + } + /* if savedTrieWord == trieWord .. fall through and set the value */ + } + if(!utrie_set32(sprepTrie,start,trieWord)){ + fprintf(stderr,"Could not set the value for code point \\U%08X.\n", (int)start); + exit(U_ILLEGAL_ARGUMENT_ERROR); + } + }else{ + if(!utrie_setRange32(sprepTrie, start, end+1, trieWord, false)){ + fprintf(stderr,"Value for certain codepoint already set.\n"); + exit(U_ILLEGAL_CHAR_FOUND); + } + } + +} + +/* folding value: just store the offset (16 bits) if there is any non-0 entry */ +static uint32_t U_CALLCONV +getFoldedValue(UNewTrie *trie, UChar32 start, int32_t offset) { + uint32_t value; + UChar32 limit=0; + UBool inBlockZero; + + limit=start+0x400; + while(start<limit) { + value=utrie_get32(trie, start, &inBlockZero); + if(inBlockZero) { + start+=UTRIE_DATA_BLOCK_LENGTH; + } else if(value!=0) { + return (uint32_t)offset; + } else { + ++start; + } + } + return 0; + +} + +#endif /* #if !UCONFIG_NO_IDNA */ + +extern void +generateData(const char *dataDir, const char* bundleName) { + static uint8_t sprepTrieBlock[100000]; + + UNewDataMemory *pData; + UErrorCode errorCode=U_ZERO_ERROR; + int32_t size, dataLength; + char* fileName = (char*) uprv_malloc(uprv_strlen(bundleName) +100); + +#if UCONFIG_NO_IDNA + + size=0; + +#else + + int32_t sprepTrieSize; + + /* sort and add mapping data */ + storeMappingData(); + + sprepTrieSize=utrie_serialize(sprepTrie, sprepTrieBlock, sizeof(sprepTrieBlock), getFoldedValue, true, &errorCode); + if(U_FAILURE(errorCode)) { + fprintf(stderr, "error: utrie_serialize(sprep trie) failed, %s\n", u_errorName(errorCode)); + exit(errorCode); + } + + size = sprepTrieSize + mappingDataCapacity*U_SIZEOF_UCHAR + sizeof(indexes); + if(beVerbose) { + printf("size of sprep trie %5u bytes\n", (int)sprepTrieSize); + printf("size of " U_ICUDATA_NAME "_%s." 
DATA_TYPE " contents: %ld bytes\n", bundleName,(long)size); + printf("size of mapping data array %5u bytes\n",(int)mappingDataCapacity * U_SIZEOF_UCHAR); + printf("Number of code units in mappingData (currentIndex) are: %i \n", currentIndex); + printf("Maximum length of the mapping string is : %i \n", (int)maxLength); + } + +#endif + + fileName[0]=0; + uprv_strcat(fileName,bundleName); + /* write the data */ + pData=udata_create(dataDir, DATA_TYPE, fileName, &dataInfo, + haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode); + if(U_FAILURE(errorCode)) { + fprintf(stderr, "gensprep: unable to create the output file, error %d\n", errorCode); + exit(errorCode); + } + +#if !UCONFIG_NO_IDNA + + indexes[_SPREP_INDEX_TRIE_SIZE]=sprepTrieSize; + indexes[_SPREP_INDEX_MAPPING_DATA_SIZE]=mappingDataCapacity*U_SIZEOF_UCHAR; + + udata_writeBlock(pData, indexes, sizeof(indexes)); + udata_writeBlock(pData, sprepTrieBlock, sprepTrieSize); + udata_writeBlock(pData, mappingData, indexes[_SPREP_INDEX_MAPPING_DATA_SIZE]); + + +#endif + + /* finish up */ + dataLength=udata_finish(pData, &errorCode); + if(U_FAILURE(errorCode)) { + fprintf(stderr, "gensprep: error %d writing the output file\n", errorCode); + exit(errorCode); + } + + if(dataLength!=size) { + fprintf(stderr, "gensprep error: data length %ld != calculated size %ld\n", + (long)dataLength, (long)size); + exit(U_INTERNAL_PROGRAM_ERROR); + } + +#if !UCONFIG_NO_IDNA + /* done with writing the data .. close the hashtable */ + if (hashTable != NULL) { + uhash_close(hashTable); + } +#endif + + uprv_free(fileName); +} + +#if !UCONFIG_NO_IDNA + +extern void +cleanUpData(void) { + uprv_free(mappingData); + utrie_close(sprepTrie); + uprv_free(sprepTrie); +} + +#endif /* #if !UCONFIG_NO_IDNA */ + +/* + * Hey, Emacs, please set the following: + * + * Local Variables: + * indent-tabs-mode: nil + * End: + * + */ |