Diffstat (limited to 'src/common/unicode')
-rw-r--r--   src/common/unicode/.gitignore                                 9
-rw-r--r--   src/common/unicode/Makefile                                  72
-rw-r--r--   src/common/unicode/README                                    28
-rw-r--r--   src/common/unicode/generate-norm_test_table.pl              106
-rw-r--r--   src/common/unicode/generate-unicode_combining_table.pl       51
-rw-r--r--   src/common/unicode/generate-unicode_east_asian_fw_table.pl   76
-rw-r--r--   src/common/unicode/generate-unicode_norm_table.pl           406
-rw-r--r--   src/common/unicode/generate-unicode_normprops_table.pl      125
-rw-r--r--   src/common/unicode/norm_test.c                               86
9 files changed, 959 insertions, 0 deletions
diff --git a/src/common/unicode/.gitignore b/src/common/unicode/.gitignore
new file mode 100644
index 0000000..46243f7
--- /dev/null
+++ b/src/common/unicode/.gitignore
@@ -0,0 +1,9 @@
+/norm_test
+/norm_test_table.h
+
+# Downloaded files
+/CompositionExclusions.txt
+/DerivedNormalizationProps.txt
+/EastAsianWidth.txt
+/NormalizationTest.txt
+/UnicodeData.txt
diff --git a/src/common/unicode/Makefile b/src/common/unicode/Makefile
new file mode 100644
index 0000000..60e01e7
--- /dev/null
+++ b/src/common/unicode/Makefile
@@ -0,0 +1,72 @@
+#-------------------------------------------------------------------------
+#
+# Makefile
+#     Makefile for src/common/unicode
+#
+# IDENTIFICATION
+#     src/common/unicode/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/common/unicode
+top_builddir = ../../..
+include $(top_builddir)/src/Makefile.global
+
+override CPPFLAGS := -DFRONTEND -I. $(CPPFLAGS)
+LIBS += $(PTHREAD_LIBS)
+
+# By default, do nothing.
+all:
+
+update-unicode: unicode_norm_table.h unicode_combining_table.h unicode_east_asian_fw_table.h unicode_normprops_table.h unicode_norm_hashfunc.h
+	mv $^ $(top_srcdir)/src/include/common/
+	$(MAKE) normalization-check
+
+# These files are part of the Unicode Character Database. Download
+# them on demand. The dependency on Makefile.global is for
+# UNICODE_VERSION.
+UnicodeData.txt EastAsianWidth.txt DerivedNormalizationProps.txt CompositionExclusions.txt NormalizationTest.txt: $(top_builddir)/src/Makefile.global
+	$(DOWNLOAD) https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/$(@F)
+
+# Generation of conversion tables used for string normalization with
+# UTF-8 strings.
+unicode_norm_hashfunc.h: unicode_norm_table.h
+
+unicode_norm_table.h: generate-unicode_norm_table.pl UnicodeData.txt CompositionExclusions.txt
+	$(PERL) $<
+
+unicode_combining_table.h: generate-unicode_combining_table.pl UnicodeData.txt
+	$(PERL) $^ >$@
+
+unicode_east_asian_fw_table.h: generate-unicode_east_asian_fw_table.pl EastAsianWidth.txt
+	$(PERL) $^ >$@
+
+unicode_normprops_table.h: generate-unicode_normprops_table.pl DerivedNormalizationProps.txt
+	$(PERL) $^ >$@
+
+# Test suite
+normalization-check: norm_test
+	./norm_test
+
+norm_test: norm_test.o ../unicode_norm.o | submake-common
+
+norm_test.o: norm_test_table.h
+
+.PHONY: submake-common
+
+submake-common:
+	$(MAKE) -C .. all
+
+norm_test_table.h: generate-norm_test_table.pl NormalizationTest.txt
+	$(PERL) $^ $@
+
+.PHONY: normalization-check
+
+
+clean:
+	rm -f $(OBJS) norm_test norm_test.o
+
+distclean: clean
+	rm -f UnicodeData.txt EastAsianWidth.txt CompositionExclusions.txt NormalizationTest.txt norm_test_table.h unicode_norm_table.h
+
+maintainer-clean: distclean
diff --git a/src/common/unicode/README b/src/common/unicode/README
new file mode 100644
index 0000000..56956f6
--- /dev/null
+++ b/src/common/unicode/README
@@ -0,0 +1,28 @@
+This directory contains tools to generate the tables in
+src/include/common/unicode_norm_table.h, used for Unicode normalization.
+The generated .h files are included in the source tree, so these tools are
+normally not needed to build PostgreSQL; they are only needed to re-generate
+the .h files from the Unicode data files, e.g. for a new version of Unicode.
+
+Generating unicode_norm_table.h
+-------------------------------
+
+Run
+
+    make update-unicode
+
+from the top level of the source tree and commit the result.
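+
+Roughly, the update-unicode target downloads the Unicode Character Database
+files for the UNICODE_VERSION named in src/Makefile.global, runs the
+generator scripts in this directory, moves the generated headers to
+src/include/common/, and runs the test suite. A manual equivalent for just
+the normalization table (a sketch; run it in src/common/unicode after the
+downloads have succeeded) is:
+
+    make UnicodeData.txt CompositionExclusions.txt
+    perl generate-unicode_norm_table.pl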
+
+Tests
+-----
+
+The Unicode consortium publishes a comprehensive test suite for the
+normalization algorithm, in a file called NormalizationTest.txt. This
+directory also contains a Perl script and some C code to run our
+normalization code against all the test strings in NormalizationTest.txt.
+To download NormalizationTest.txt and run the tests:
+
+    make normalization-check
+
+This is also run as part of the update-unicode target.
diff --git a/src/common/unicode/generate-norm_test_table.pl b/src/common/unicode/generate-norm_test_table.pl
new file mode 100644
index 0000000..838f552
--- /dev/null
+++ b/src/common/unicode/generate-norm_test_table.pl
@@ -0,0 +1,106 @@
+#!/usr/bin/perl
+#
+# Read the Unicode consortium's normalization test suite,
+# NormalizationTest.txt, and generate a C array from it for norm_test.c.
+#
+# NormalizationTest.txt is part of the Unicode Character Database.
+#
+# Copyright (c) 2000-2022, PostgreSQL Global Development Group
+
+use strict;
+use warnings;
+
+use File::Basename;
+
+die "Usage: $0 INPUT_FILE OUTPUT_FILE\n" if @ARGV != 2;
+my $input_file  = $ARGV[0];
+my $output_file = $ARGV[1];
+my $output_base = basename($output_file);
+
+# Open the input and output files.
+open my $INPUT, '<', $input_file
+  or die "Could not open input file $input_file: $!\n";
+open my $OUTPUT, '>', $output_file
+  or die "Could not open output file $output_file: $!\n";
+
+# Print the header of the output file.
+print $OUTPUT <<HEADER;
+/*-------------------------------------------------------------------------
+ *
+ * norm_test_table.h
+ *    Test strings for Unicode normalization.
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/common/unicode/norm_test_table.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+/*
+ * File auto-generated by src/common/unicode/generate-norm_test_table.pl, do
+ * not edit. There is deliberately not an #ifndef PG_NORM_TEST_TABLE_H
+ * here.
+ */
+
+typedef struct
+{
+    int        linenum;
+    pg_wchar   input[50];
+    pg_wchar   output[4][50];
+} pg_unicode_test;
+
+/* test table */
+HEADER
+print $OUTPUT
+  "static const pg_unicode_test UnicodeNormalizationTests[] =\n{\n";
+
+# Helper routine to convert a space-separated list of Unicode codepoints to
+# a hexadecimal list format suitable for outputting in a C array.
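+#
+# For example (an illustrative call, not a line from the data file):
+#     codepoint_string_to_hex("0044 0307")
+# returns
+#     "0x0044, 0x0307, 0"
+# i.e. each codepoint becomes a hex literal, and a trailing 0 terminates
+# the C array.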
+sub codepoint_string_to_hex
+{
+    my $codepoint_string = shift;
+
+    my $result;
+
+    foreach my $cp (split(' ', $codepoint_string))
+    {
+        $result .= "0x$cp, ";
+    }
+    $result .= '0';    # null-terminate the array
+    return $result;
+}
+
+# Process the input file line by line.
+my $linenum = 0;
+while (my $line = <$INPUT>)
+{
+    $linenum = $linenum + 1;
+
+    next if ($line =~ /^\s*#/);    # ignore comments
+    next if ($line =~ /^@/);       # ignore @Part0-like headers
+
+    # Split the line and get the fields we need:
+    #
+    # source; NFC; NFD; NFKC; NFKD
+    my ($source, $nfc, $nfd, $nfkc, $nfkd) = split(';', $line);
+
+    my $source_utf8 = codepoint_string_to_hex($source);
+    my $nfc_utf8    = codepoint_string_to_hex($nfc);
+    my $nfd_utf8    = codepoint_string_to_hex($nfd);
+    my $nfkc_utf8   = codepoint_string_to_hex($nfkc);
+    my $nfkd_utf8   = codepoint_string_to_hex($nfkd);
+
+    print $OUTPUT
+      "\t{ $linenum, { $source_utf8 }, { { $nfc_utf8 }, { $nfd_utf8 }, { $nfkc_utf8 }, { $nfkd_utf8 } } },\n";
+}
+
+# Output the terminator entry.
+print $OUTPUT "\t{ 0, { 0 }, { { 0 }, { 0 }, { 0 }, { 0 } } }";
+print $OUTPUT "\n};\n";
+
+close $OUTPUT;
+close $INPUT;
diff --git a/src/common/unicode/generate-unicode_combining_table.pl b/src/common/unicode/generate-unicode_combining_table.pl
new file mode 100644
index 0000000..8177c20
--- /dev/null
+++ b/src/common/unicode/generate-unicode_combining_table.pl
@@ -0,0 +1,51 @@
+#!/usr/bin/perl
+#
+# Generate a sorted list of non-overlapping intervals of non-spacing
+# characters, using Unicode data files as input. Pass UnicodeData.txt
+# as argument. The output is on stdout.
+#
+# Copyright (c) 2019-2022, PostgreSQL Global Development Group
+
+use strict;
+use warnings;
+
+my $range_start = undef;
+my $codepoint;
+my $prev_codepoint;
+
+print
+  "/* generated by src/common/unicode/generate-unicode_combining_table.pl, do not edit */\n\n";
+
+print "static const struct mbinterval combining[] = {\n";
+
+foreach my $line (<ARGV>)
+{
+    chomp $line;
+    my @fields = split ';', $line;
+    $codepoint = hex $fields[0];
+
+    if ($fields[2] eq 'Me' || $fields[2] eq 'Mn')
+    {
+        # Combining character: remember it as the start of a range,
+        # unless a range is already open.
+        if (!defined($range_start))
+        {
+            $range_start = $codepoint;
+        }
+    }
+    else
+    {
+        # Not a combining character: print out the previous range, if any.
+        if (defined($range_start))
+        {
+            printf "\t{0x%04X, 0x%04X},\n", $range_start, $prev_codepoint;
+            $range_start = undef;
+        }
+    }
+}
+continue
+{
+    $prev_codepoint = $codepoint;
+}
+
+print "};\n";
diff --git a/src/common/unicode/generate-unicode_east_asian_fw_table.pl b/src/common/unicode/generate-unicode_east_asian_fw_table.pl
new file mode 100644
index 0000000..9d03684
--- /dev/null
+++ b/src/common/unicode/generate-unicode_east_asian_fw_table.pl
@@ -0,0 +1,76 @@
+#!/usr/bin/perl
+#
+# Generate a sorted list of non-overlapping intervals of East Asian Wide (W)
+# and East Asian Fullwidth (F) characters, using Unicode data files as input.
+# Pass EastAsianWidth.txt as argument. The output is on stdout.
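+#
+# For example (an illustrative input line; the trailing comment is stripped
+# before parsing):
+#     1100..115F;W    # Lo    [96] HANGUL CHOSEONG KIYEOK..HANGUL CHOSEONG FILLER
+# contributes an interval that is emitted, possibly merged with adjacent
+# W/F ranges, as:
+#     {0x1100, 0x115F},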
+#
+# Copyright (c) 2019-2022, PostgreSQL Global Development Group
+
+use strict;
+use warnings;
+
+my $range_start = undef;
+my ($first, $last);
+my $prev_last;
+
+print
+  "/* generated by src/common/unicode/generate-unicode_east_asian_fw_table.pl, do not edit */\n\n";
+
+print "static const struct mbinterval east_asian_fw[] = {\n";
+
+foreach my $line (<ARGV>)
+{
+    chomp $line;
+    $line =~ s/\s*#.*$//;
+    next if $line eq '';
+    my ($codepoint, $width) = split ';', $line;
+
+    if ($codepoint =~ /\.\./)
+    {
+        ($first, $last) = split /\.\./, $codepoint;
+    }
+    else
+    {
+        $first = $last = $codepoint;
+    }
+
+    ($first, $last) = map(hex, ($first, $last));
+
+    if ($width eq 'F' || $width eq 'W')
+    {
+        # fullwidth/wide characters
+        if (!defined($range_start))
+        {
+            # start a new range if one hasn't been started yet
+            $range_start = $first;
+        }
+        elsif ($first != $prev_last + 1)
+        {
+            # ranges aren't contiguous; emit the previous one and start anew
+            printf "\t{0x%04X, 0x%04X},\n", $range_start, $prev_last;
+            $range_start = $first;
+        }
+    }
+    else
+    {
+        # not wide characters; print out the previous range, if any
+        if (defined($range_start))
+        {
+            printf "\t{0x%04X, 0x%04X},\n", $range_start, $prev_last;
+            $range_start = undef;
+        }
+    }
+}
+continue
+{
+    $prev_last = $last;
+}
+
+# don't forget any range at the very end of the database (though there is
+# none as of Unicode 13.0)
+if (defined($range_start))
+{
+    printf "\t{0x%04X, 0x%04X},\n", $range_start, $prev_last;
+}
+
+print "};\n";
diff --git a/src/common/unicode/generate-unicode_norm_table.pl b/src/common/unicode/generate-unicode_norm_table.pl
new file mode 100644
index 0000000..e442345
--- /dev/null
+++ b/src/common/unicode/generate-unicode_norm_table.pl
@@ -0,0 +1,406 @@
+#!/usr/bin/perl
+#
+# Generate a composition table and its lookup utilities, using Unicode data
+# files as input.
+#
+# Input: UnicodeData.txt and CompositionExclusions.txt
+# Output: unicode_norm_table.h and unicode_norm_hashfunc.h
+#
+# Copyright (c) 2000-2022, PostgreSQL Global Development Group
+
+use strict;
+use warnings;
+
+use FindBin;
+use lib "$FindBin::RealBin/../../tools/";
+use PerfectHash;
+
+my $output_table_file = "unicode_norm_table.h";
+my $output_func_file  = "unicode_norm_hashfunc.h";
+
+my $FH;
+
+# Read the list of codes that should be excluded from re-composition.
+my @composition_exclusion_codes = ();
+open($FH, '<', "CompositionExclusions.txt")
+  or die "Could not open CompositionExclusions.txt: $!\n";
+while (my $line = <$FH>)
+{
+    if ($line =~ /^([[:xdigit:]]+)/)
+    {
+        push @composition_exclusion_codes, $1;
+    }
+}
+close $FH;
+
+# Read entries from UnicodeData.txt into a list and a hash table. We need
+# three fields from each row: the codepoint, the canonical combining class,
+# and the character decomposition mapping.
+my @characters     = ();
+my %character_hash = ();
+open($FH, '<', "UnicodeData.txt")
+  or die "Could not open UnicodeData.txt: $!\n";
+while (my $line = <$FH>)
+{
+    # Split the line and get the fields we need:
+    # - Unicode code value
+    # - Canonical Combining Class
+    # - Character Decomposition Mapping
+    my @elts   = split(';', $line);
+    my $code   = $elts[0];
+    my $class  = $elts[3];
+    my $decomp = $elts[5];
+
+    # Skip codepoints above U+10FFFF. They cannot be represented in 4 bytes
+    # in UTF-8, and PostgreSQL doesn't support UTF-8 characters longer than
+    # 4 bytes. (This is just pro forma, as there aren't any such entries in
+    # the data file, currently.)
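+    # (U+10FFFF is the highest codepoint representable in four UTF-8
+    # bytes: 0xF4 0x8F 0xBF 0xBF.)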
+    next if hex($code) > 0x10FFFF;
+
+    # Skip characters with no decompositions and a class of 0, to reduce the
+    # table size.
+    next if $class eq '0' && $decomp eq '';
+
+    my %char_entry = (code => $code, class => $class, decomp => $decomp);
+    push(@characters, \%char_entry);
+    $character_hash{$code} = \%char_entry;
+}
+close $FH;
+
+my $num_characters = scalar @characters;
+
+# Start writing out the output files
+open my $OT, '>', $output_table_file
+  or die "Could not open output file $output_table_file: $!\n";
+open my $OF, '>', $output_func_file
+  or die "Could not open output file $output_func_file: $!\n";
+
+print $OT <<HEADER;
+/*-------------------------------------------------------------------------
+ *
+ * unicode_norm_table.h
+ *    Composition table used for Unicode normalization
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/common/unicode_norm_table.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+/*
+ * File auto-generated by src/common/unicode/generate-unicode_norm_table.pl,
+ * do not edit. There is deliberately not an #ifndef PG_UNICODE_NORM_TABLE_H
+ * here.
+ */
+typedef struct
+{
+    uint32    codepoint;        /* Unicode codepoint */
+    uint8     comb_class;       /* combining class of character */
+    uint8     dec_size_flags;   /* size and flags of decomposition code list */
+    uint16    dec_index;        /* index into UnicodeDecomp_codepoints, or the
+                                 * decomposition itself if DECOMP_INLINE */
+} pg_unicode_decomposition;
+
+#define DECOMP_NO_COMPOSE  0x80    /* don't use for re-composition */
+#define DECOMP_INLINE      0x40    /* decomposition is stored inline in
+                                    * dec_index */
+#define DECOMP_COMPAT      0x20    /* compatibility mapping */
+
+#define DECOMPOSITION_SIZE(x) ((x)->dec_size_flags & 0x1F)
+#define DECOMPOSITION_NO_COMPOSE(x) (((x)->dec_size_flags & (DECOMP_NO_COMPOSE | DECOMP_COMPAT)) != 0)
+#define DECOMPOSITION_IS_INLINE(x) (((x)->dec_size_flags & DECOMP_INLINE) != 0)
+#define DECOMPOSITION_IS_COMPAT(x) (((x)->dec_size_flags & DECOMP_COMPAT) != 0)
+
+/* Table of Unicode codepoints and their decompositions */
+static const pg_unicode_decomposition UnicodeDecompMain[$num_characters] =
+{
+HEADER
+
+print $OF <<HEADER;
+/*-------------------------------------------------------------------------
+ *
+ * unicode_norm_hashfunc.h
+ *    Perfect hash functions used for Unicode normalization
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/common/unicode_norm_hashfunc.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+/*
+ * File auto-generated by src/common/unicode/generate-unicode_norm_table.pl,
+ * do not edit. There is deliberately not an #ifndef
+ * PG_UNICODE_NORM_HASHFUNC_H here.
+ */
+
+#include "common/unicode_norm_table.h"
+
+/* Typedef for perfect hash functions */
+typedef int (*cp_hash_func) (const void *key);
+
+/* Information for lookups with perfect hash functions */
+typedef struct
+{
+    const pg_unicode_decomposition *decomps;
+    cp_hash_func hash;
+    int        num_decomps;
+} pg_unicode_decompinfo;
+
+typedef struct
+{
+    const uint16 *inverse_lookup;
+    cp_hash_func hash;
+    int        num_recomps;
+} pg_unicode_recompinfo;
+
+HEADER
+
+my $decomp_index  = 0;
+my $decomp_string = "";
+my @dec_cp_packed;
+my $main_index = 0;
+my @rec_info;
+
+my $last_code = $characters[-1]->{code};
+foreach my $char (@characters)
+{
+    my $code   = $char->{code};
+    my $class  = $char->{class};
+    my $decomp = $char->{decomp};
+
+    # Save the codepoint bytes as a string in network order.
+    push @dec_cp_packed, pack('N', hex($char->{code}));
+
+    # The character decomposition mapping field in UnicodeData.txt is a list
+    # of Unicode codepoints, separated by spaces. It can be prefixed with a
+    # so-called compatibility formatting tag, like "<compat>" or "<font>".
+    # Entries with compatibility formatting tags should not be used for
+    # re-composing characters during normalization, so flag them in the
+    # table. (The specific tag doesn't matter, only whether there is a tag
+    # or not.)
+    my $compat = 0;
+    if ($decomp =~ /\<.*\>/)
+    {
+        $compat = 1;
+        $decomp =~ s/\<[^][]*\>//g;
+    }
+    my @decomp_elts = split(" ", $decomp);
+
+    # Decomposition size
+    my $decomp_size = scalar(@decomp_elts);
+    die "decomposition of U+$code is too long"
+      if $decomp_size > 0x1F;    # would overrun the 5-bit size bitmask
+
+    my $first_decomp = shift @decomp_elts;
+
+    my $flags   = "";
+    my $comment = "";
+
+    if ($compat)
+    {
+        $flags .= " | DECOMP_COMPAT";
+    }
+
+    if ($decomp_size == 2)
+    {
+        # Should this be used for recomposition?
+        if (   $character_hash{$first_decomp}
+            && $character_hash{$first_decomp}->{class} != 0)
+        {
+            $flags .= " | DECOMP_NO_COMPOSE";
+            $comment = "non-starter decomposition";
+        }
+        else
+        {
+            foreach my $lcode (@composition_exclusion_codes)
+            {
+                if ($lcode eq $code)
+                {
+                    $flags .= " | DECOMP_NO_COMPOSE";
+                    $comment = "in exclusion list";
+                    last;
+                }
+            }
+        }
+
+        # Save info for recomposable codepoints.
+        # Note that this MUST match the macro DECOMPOSITION_NO_COMPOSE in
+        # the C header above! See also the inverse lookup in
+        # recompose_code() found in src/common/unicode_norm.c.
+        if (!($flags =~ /DECOMP_COMPAT/ || $flags =~ /DECOMP_NO_COMPOSE/))
+        {
+            push @rec_info,
+              {
+                code       => $code,
+                main_index => $main_index,
+                first      => $first_decomp,
+                second     => $decomp_elts[0]
+              };
+        }
+    }
+
+    if ($decomp_size == 0)
+    {
+        print $OT "\t{0x$code, $class, 0$flags, 0}";
+    }
+    elsif ($decomp_size == 1 && length($first_decomp) <= 4)
+    {
+        # The decomposition consists of a single codepoint, and it fits
+        # in a uint16, so we can store it "inline" in the main table.
+        $flags .= " | DECOMP_INLINE";
+        print $OT "\t{0x$code, $class, 1$flags, 0x$first_decomp}";
+    }
+    else
+    {
+        print $OT "\t{0x$code, $class, $decomp_size$flags, $decomp_index}";
+
+        # Now save the decomposition into a dedicated area that will be
+        # written afterwards. First build the entry dedicated to a
+        # sub-table with the code and decomposition.
+        $decomp_string .= ",\n" if ($decomp_string ne "");
+
+        $decomp_string .= "\t /* $decomp_index */ 0x$first_decomp";
+        foreach (@decomp_elts)
+        {
+            $decomp_string .= ", 0x$_";
+        }
+
+        $decomp_index = $decomp_index + $decomp_size;
+    }
+
+    # Print a comma after all items except the last one.
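+    # For example, emitted rows look like this (illustrative; the
+    # decomposition index in the last field varies with the Unicode
+    # version):
+    #     {0x00C5, 0, 2, 123},
+    #     {0x00B5, 0, 1 | DECOMP_COMPAT | DECOMP_INLINE, 0x03BC},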
+    print $OT "," unless ($code eq $last_code);
+
+    print $OT "\t/* $comment */" if ($comment ne "");
+    print $OT "\n";
+
+    $main_index++;
+}
+print $OT "\n};\n\n";
+
+# Print the array of decomposed codes.
+print $OT <<HEADER;
+/* codepoints array */
+static const uint32 UnicodeDecomp_codepoints[$decomp_index] =
+{
+$decomp_string
+};
+HEADER
+
+# Emit the definition of the decomposition hash function.
+my $dec_funcname = 'Decomp_hash_func';
+my $dec_func     = PerfectHash::generate_hash_function(\@dec_cp_packed,
+    $dec_funcname, fixed_key_length => 4);
+print $OF "/* Perfect hash function for decomposition */\n";
+print $OF "static $dec_func\n";
+
+# Emit the structure that wraps the hash lookup information into
+# one variable.
+print $OF <<HEADER;
+/* Hash lookup information for decomposition */
+static const pg_unicode_decompinfo UnicodeDecompInfo =
+{
+    UnicodeDecompMain,
+    $dec_funcname,
+    $num_characters
+};
+
+HEADER
+
+# Find the lowest codepoint that decomposes to each recomposable code
+# pair and create a mapping to it.
+my $recomp_string = "";
+my @rec_cp_packed;
+my %seenit;
+my $firstentry = 1;
+foreach my $rec (sort recomp_sort @rec_info)
+{
+    # The hash key is formed by concatenating the bytes of the two
+    # codepoints. See also recompose_code() in common/unicode_norm.c.
+    my $hashkey = (hex($rec->{first}) << 32) | hex($rec->{second});
+
+    # We are only interested in the lowest codepoint that decomposes to
+    # the given code pair.
+    next if $seenit{$hashkey};
+
+    # Save the hash key bytes in network order.
+    push @rec_cp_packed, pack('Q>', $hashkey);
+
+    # Append an inverse lookup element.
+    $recomp_string .= ",\n" if !$firstentry;
+    $recomp_string .= sprintf "\t/* U+%s+%s -> U+%s */ %s",
+      $rec->{first},
+      $rec->{second},
+      $rec->{code},
+      $rec->{main_index};
+
+    $seenit{$hashkey} = 1;
+    $firstentry = 0;
+}
+
+# Emit the inverse lookup array containing indexes into UnicodeDecompMain.
+my $num_recomps = scalar @rec_cp_packed;
+print $OF <<HEADER;
+/* Inverse lookup array -- contains indexes into UnicodeDecompMain[] */
+static const uint16 RecompInverseLookup[$num_recomps] =
+{
+$recomp_string
+};

+HEADER
+
+# Emit the definition of the recomposition hash function.
+my $rec_funcname = 'Recomp_hash_func';
+my $rec_func =
+  PerfectHash::generate_hash_function(\@rec_cp_packed, $rec_funcname,
+    fixed_key_length => 8);
+print $OF "/* Perfect hash function for recomposition */\n";
+print $OF "static $rec_func\n";
+
+# Emit the structure that wraps the hash lookup information into
+# one variable.
+print $OF <<HEADER;
+/* Hash lookup information for recomposition */
+static const pg_unicode_recompinfo UnicodeRecompInfo =
+{
+    RecompInverseLookup,
+    $rec_funcname,
+    $num_recomps
+};
+HEADER
+
+close $OT;
+close $OF;
+
+sub recomp_sort
+{
+    my $a1 = hex($a->{first});
+    my $b1 = hex($b->{first});
+
+    my $a2 = hex($a->{second});
+    my $b2 = hex($b->{second});
+
+    # First sort by the first codepoint.
+    return -1 if $a1 < $b1;
+    return 1 if $a1 > $b1;
+
+    # Then sort by the second codepoint.
+    return -1 if $a2 < $b2;
+    return 1 if $a2 > $b2;
+
+    # Finally, sort by the codepoint that decomposes into the first and
+    # second ones.
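+    # This last tiebreak, together with %seenit in the caller, ensures
+    # that when two codepoints decompose to the same pair, the lowest
+    # codepoint is the one that is kept.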
+    my $acode = hex($a->{code});
+    my $bcode = hex($b->{code});
+
+    return -1 if $acode < $bcode;
+    return 1 if $acode > $bcode;
+
+    die "found duplicate entries of recomposable code pairs";
+}
diff --git a/src/common/unicode/generate-unicode_normprops_table.pl b/src/common/unicode/generate-unicode_normprops_table.pl
new file mode 100644
index 0000000..08e41b3
--- /dev/null
+++ b/src/common/unicode/generate-unicode_normprops_table.pl
@@ -0,0 +1,125 @@
+#!/usr/bin/perl
+#
+# Generate a table of Unicode normalization "quick check" properties
+# (see UAX #15). Pass DerivedNormalizationProps.txt as argument. The
+# output is on stdout.
+#
+# Copyright (c) 2020-2022, PostgreSQL Global Development Group
+
+use strict;
+use warnings;
+
+use FindBin;
+use lib "$FindBin::RealBin/../../tools/";
+use PerfectHash;
+
+my %data;
+
+print
+  "/* generated by src/common/unicode/generate-unicode_normprops_table.pl, do not edit */\n\n";
+
+print <<EOS;
+#include "common/unicode_norm.h"
+
+/*
+ * Normalization quick check entry for a codepoint. We use a bit field
+ * here to save space.
+ */
+typedef struct
+{
+    unsigned int codepoint:21;
+    signed int   quickcheck:4;    /* really UnicodeNormalizationQC */
+} pg_unicode_normprops;
+
+/* Typedef for the hash function on the quick check table */
+typedef int (*qc_hash_func) (const void *key);
+
+/* Information for quick check lookup with perfect hash function */
+typedef struct
+{
+    const pg_unicode_normprops *normprops;
+    qc_hash_func hash;
+    int        num_normprops;
+} pg_unicode_norminfo;
+EOS
+
+foreach my $line (<ARGV>)
+{
+    chomp $line;
+    $line =~ s/\s*#.*$//;
+    next if $line eq '';
+    my ($codepoint, $prop, $value) = split /\s*;\s*/, $line;
+    next if $prop !~ /_QC/;
+
+    my ($first, $last);
+    if ($codepoint =~ /\.\./)
+    {
+        ($first, $last) = split /\.\./, $codepoint;
+    }
+    else
+    {
+        $first = $last = $codepoint;
+    }
+
+    foreach my $cp (hex($first) .. hex($last))
+    {
+        $data{$prop}{$cp} = $value;
+    }
+}
+
+# We create a separate array for each normalization form rather than,
+# say, a two-dimensional array, because such an array would be very
+# sparse and would create unnecessary overhead, especially for the NFC
+# lookup.
+foreach my $prop (sort keys %data)
+{
+    # Don't build the tables for the "D" forms, because they are too
+    # big. See also unicode_is_normalized_quickcheck().
+    next if $prop eq "NFD_QC" || $prop eq "NFKD_QC";
+
+    print "\n";
+    print
+      "static const pg_unicode_normprops UnicodeNormProps_${prop}[] = {\n";
+
+    my %subdata = %{ $data{$prop} };
+    my @cp_packed;
+    foreach my $cp (sort { $a <=> $b } keys %subdata)
+    {
+        my $qc;
+        if ($subdata{$cp} eq 'N')
+        {
+            $qc = 'UNICODE_NORM_QC_NO';
+        }
+        elsif ($subdata{$cp} eq 'M')
+        {
+            $qc = 'UNICODE_NORM_QC_MAYBE';
+        }
+        else
+        {
+            die "unexpected quick check value '$subdata{$cp}'";
+        }
+        printf "\t{0x%04X, %s},\n", $cp, $qc;
+
+        # Save the bytes as a string in network order.
+        push @cp_packed, pack('N', $cp);
+    }
+
+    print "};\n";
+
+    # Emit the definition of the perfect hash function.
+    my $funcname = $prop . '_hash_func';
+    my $f = PerfectHash::generate_hash_function(\@cp_packed, $funcname,
+        fixed_key_length => 4);
+    printf "\n/* Perfect hash function for %s */", $prop;
+    print "\nstatic $f\n";
+
+    # Emit the structure that wraps the hash lookup information into
+    # one variable.
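+    # For NFC_QC, for instance, this emits something like (illustrative;
+    # the entry count depends on the Unicode version):
+    #
+    #     /* Hash lookup information for NFC_QC */
+    #     static const pg_unicode_norminfo UnicodeNormInfo_NFC_QC = {
+    #         UnicodeNormProps_NFC_QC,
+    #         NFC_QC_hash_func,
+    #         1234
+    #     };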
+    printf "/* Hash lookup information for %s */", $prop;
+    printf "\nstatic const pg_unicode_norminfo ";
+    printf "UnicodeNormInfo_%s = {\n", $prop;
+    printf "\tUnicodeNormProps_%s,\n", $prop;
+    printf "\t%s,\n", $funcname;
+    printf "\t%d\n", scalar @cp_packed;
+    printf "};\n";
+}
diff --git a/src/common/unicode/norm_test.c b/src/common/unicode/norm_test.c
new file mode 100644
index 0000000..0e244ad
--- /dev/null
+++ b/src/common/unicode/norm_test.c
@@ -0,0 +1,86 @@
+/*-------------------------------------------------------------------------
+ * norm_test.c
+ *    Program to test Unicode normalization functions.
+ *
+ * Portions Copyright (c) 2017-2022, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *    src/common/unicode/norm_test.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres_fe.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "common/unicode_norm.h"
+
+#include "norm_test_table.h"
+
+static char *
+print_wchar_str(const pg_wchar *s)
+{
+#define BUF_DIGITS 50
+    static char buf[BUF_DIGITS * 11 + 1];
+    int         i;
+    char       *p;
+
+    i = 0;
+    p = buf;
+    while (*s && i < BUF_DIGITS)
+    {
+        p += sprintf(p, "U+%04X ", *s);
+        i++;
+        s++;
+    }
+    *p = '\0';
+
+    return buf;
+}
+
+static int
+pg_wcscmp(const pg_wchar *s1, const pg_wchar *s2)
+{
+    for (;;)
+    {
+        if (*s1 < *s2)
+            return -1;
+        if (*s1 > *s2)
+            return 1;
+        if (*s1 == 0)
+            return 0;
+        s1++;
+        s2++;
+    }
+}
+
+int
+main(int argc, char **argv)
+{
+    const pg_unicode_test *test;
+
+    for (test = UnicodeNormalizationTests; test->input[0] != 0; test++)
+    {
+        for (int form = 0; form < 4; form++)
+        {
+            pg_wchar   *result;
+
+            result = unicode_normalize(form, test->input);
+
+            if (pg_wcscmp(test->output[form], result) != 0)
+            {
+                printf("FAILURE (NormalizationTest.txt line %d form %d):\n",
+                       test->linenum, form);
+                printf("input: %s\n", print_wchar_str(test->input));
+                printf("expected: %s\n", print_wchar_str(test->output[form]));
+                printf("got: %s\n", print_wchar_str(result));
+                printf("\n");
+                exit(1);
+            }
+        }
+    }
+
+    printf("All tests successful!\n");
+    exit(0);
+}