Diffstat:
 src/common/unicode/.gitignore                              |   9
 src/common/unicode/Makefile                                |  72
 src/common/unicode/README                                  |  28
 src/common/unicode/generate-norm_test_table.pl             | 106
 src/common/unicode/generate-unicode_combining_table.pl     |  51
 src/common/unicode/generate-unicode_east_asian_fw_table.pl |  76
 src/common/unicode/generate-unicode_norm_table.pl          | 406
 src/common/unicode/generate-unicode_normprops_table.pl     | 125
 src/common/unicode/norm_test.c                             |  86
 src/common/unicode_norm.c                                  | 634
 10 files changed, 1593 insertions(+), 0 deletions(-)
diff --git a/src/common/unicode/.gitignore b/src/common/unicode/.gitignore
new file mode 100644
index 0000000..46243f7
--- /dev/null
+++ b/src/common/unicode/.gitignore
@@ -0,0 +1,9 @@
+/norm_test
+/norm_test_table.h
+
+# Downloaded files
+/CompositionExclusions.txt
+/DerivedNormalizationProps.txt
+/EastAsianWidth.txt
+/NormalizationTest.txt
+/UnicodeData.txt
diff --git a/src/common/unicode/Makefile b/src/common/unicode/Makefile
new file mode 100644
index 0000000..60e01e7
--- /dev/null
+++ b/src/common/unicode/Makefile
@@ -0,0 +1,72 @@
+#-------------------------------------------------------------------------
+#
+# Makefile
+#	  Makefile for src/common/unicode
+#
+# IDENTIFICATION
+#	  src/common/unicode/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/common/unicode
+top_builddir = ../../..
+include $(top_builddir)/src/Makefile.global
+
+override CPPFLAGS := -DFRONTEND -I. $(CPPFLAGS)
+LIBS += $(PTHREAD_LIBS)
+
+# By default, do nothing.
+all:
+
+update-unicode: unicode_norm_table.h unicode_combining_table.h unicode_east_asian_fw_table.h unicode_normprops_table.h unicode_norm_hashfunc.h
+	mv $^ $(top_srcdir)/src/include/common/
+	$(MAKE) normalization-check
+
+# These files are part of the Unicode Character Database. Download
+# them on demand. The dependency on Makefile.global is for
+# UNICODE_VERSION.
+UnicodeData.txt EastAsianWidth.txt DerivedNormalizationProps.txt CompositionExclusions.txt NormalizationTest.txt: $(top_builddir)/src/Makefile.global
+	$(DOWNLOAD) https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/$(@F)
+
+# Generation of conversion tables used for string normalization with
+# UTF-8 strings.
+unicode_norm_hashfunc.h: unicode_norm_table.h
+
+unicode_norm_table.h: generate-unicode_norm_table.pl UnicodeData.txt CompositionExclusions.txt
+	$(PERL) $<
+
+unicode_combining_table.h: generate-unicode_combining_table.pl UnicodeData.txt
+	$(PERL) $^ >$@
+
+unicode_east_asian_fw_table.h: generate-unicode_east_asian_fw_table.pl EastAsianWidth.txt
+	$(PERL) $^ >$@
+
+unicode_normprops_table.h: generate-unicode_normprops_table.pl DerivedNormalizationProps.txt
+	$(PERL) $^ >$@
+
+# Test suite
+normalization-check: norm_test
+	./norm_test
+
+norm_test: norm_test.o ../unicode_norm.o | submake-common
+
+norm_test.o: norm_test_table.h
+
+.PHONY: submake-common
+
+submake-common:
+	$(MAKE) -C .. all
+
+norm_test_table.h: generate-norm_test_table.pl NormalizationTest.txt
+	$(PERL) $^ $@
+
+.PHONY: normalization-check
+
+
+clean:
+	rm -f $(OBJS) norm_test norm_test.o
+
+distclean: clean
+	rm -f UnicodeData.txt EastAsianWidth.txt CompositionExclusions.txt NormalizationTest.txt norm_test_table.h unicode_norm_table.h
+
+maintainer-clean: distclean
diff --git a/src/common/unicode/README b/src/common/unicode/README
new file mode 100644
index 0000000..56956f6
--- /dev/null
+++ b/src/common/unicode/README
@@ -0,0 +1,28 @@
+This directory contains tools to generate the tables in
+src/include/common/unicode_norm_table.h, used for Unicode normalization. The
+generated .h file is included in the source tree, so these are normally not
+needed to build PostgreSQL, only if you need to re-generate the .h file
+from the Unicode data files for some reason, e.g. to update to a new version
+of Unicode.
+
+Generating unicode_norm_table.h
+-------------------------------
+
+Run
+
+    make update-unicode
+
+from the top level of the source tree and commit the result.
+
+Tests
+-----
+
+The Unicode consortium publishes a comprehensive test suite for the
+normalization algorithm, in a file called NormalizationTest.txt. This
+directory also contains a perl script and some C code, to run our
+normalization code with all the test strings in NormalizationTest.txt.
+To download NormalizationTest.txt and run the tests:
+
+    make normalization-check
+
+This is also run as part of the update-unicode target.
diff --git a/src/common/unicode/generate-norm_test_table.pl b/src/common/unicode/generate-norm_test_table.pl
new file mode 100644
index 0000000..838f552
--- /dev/null
+++ b/src/common/unicode/generate-norm_test_table.pl
@@ -0,0 +1,106 @@
+#!/usr/bin/perl
+#
+# Read Unicode consortium's normalization test suite, NormalizationTest.txt,
+# and generate a C array from it, for norm_test.c.
+#
+# NormalizationTest.txt is part of the Unicode Character Database.
+#
+# Copyright (c) 2000-2022, PostgreSQL Global Development Group

+use strict;
+use warnings;
+
+use File::Basename;
+
+die "Usage: $0 INPUT_FILE OUTPUT_FILE\n" if @ARGV != 2;
+my $input_file  = $ARGV[0];
+my $output_file = $ARGV[1];
+my $output_base = basename($output_file);
+
+# Open the input and output files
+open my $INPUT, '<', $input_file
+  or die "Could not open input file $input_file: $!";
+open my $OUTPUT, '>', $output_file
+  or die "Could not open output file $output_file: $!\n";
+
+# Print header of output file.
+print $OUTPUT <<HEADER;
+/*-------------------------------------------------------------------------
+ *
+ * norm_test_table.h
+ *	  Test strings for Unicode normalization.
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/common/unicode/norm_test_table.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+/*
+ * File auto-generated by src/common/unicode/generate-norm_test_table.pl, do
+ * not edit. There is deliberately not an #ifndef PG_NORM_TEST_TABLE_H
+ * here.
+ */
+
+typedef struct
+{
+	int			linenum;
+	pg_wchar	input[50];
+	pg_wchar	output[4][50];
+} pg_unicode_test;
+
+/* test table */
+HEADER
+print $OUTPUT
+  "static const pg_unicode_test UnicodeNormalizationTests[] =\n{\n";
+
+# Helper routine to convert a space-separated list of Unicode characters to
+# hexadecimal list format, suitable for outputting in a C array.
+sub codepoint_string_to_hex +{ + my $codepoint_string = shift; + + my $result; + + foreach (split(' ', $codepoint_string)) + { + my $cp = $_; + my $utf8 = "0x$cp, "; + $result .= $utf8; + } + $result .= '0'; # null-terminated the array + return $result; +} + +# Process the input file line by line +my $linenum = 0; +while (my $line = <$INPUT>) +{ + $linenum = $linenum + 1; + if ($line =~ /^\s*#/) { next; } # ignore comments + + if ($line =~ /^@/) { next; } # ignore @Part0 like headers + + # Split the line wanted and get the fields needed: + # + # source; NFC; NFD; NFKC; NFKD + my ($source, $nfc, $nfd, $nfkc, $nfkd) = split(';', $line); + + my $source_utf8 = codepoint_string_to_hex($source); + my $nfc_utf8 = codepoint_string_to_hex($nfc); + my $nfd_utf8 = codepoint_string_to_hex($nfd); + my $nfkc_utf8 = codepoint_string_to_hex($nfkc); + my $nfkd_utf8 = codepoint_string_to_hex($nfkd); + + print $OUTPUT + "\t{ $linenum, { $source_utf8 }, { { $nfc_utf8 }, { $nfd_utf8 }, { $nfkc_utf8 }, { $nfkd_utf8 } } },\n"; +} + +# Output terminator entry +print $OUTPUT "\t{ 0, { 0 }, { { 0 }, { 0 }, { 0 }, { 0 } } }"; +print $OUTPUT "\n};\n"; + +close $OUTPUT; +close $INPUT; diff --git a/src/common/unicode/generate-unicode_combining_table.pl b/src/common/unicode/generate-unicode_combining_table.pl new file mode 100644 index 0000000..8177c20 --- /dev/null +++ b/src/common/unicode/generate-unicode_combining_table.pl @@ -0,0 +1,51 @@ +#!/usr/bin/perl +# +# Generate sorted list of non-overlapping intervals of non-spacing +# characters, using Unicode data files as input. Pass UnicodeData.txt +# as argument. The output is on stdout. +# +# Copyright (c) 2019-2022, PostgreSQL Global Development Group + +use strict; +use warnings; + +my $range_start = undef; +my $codepoint; +my $prev_codepoint; +my $count = 0; + +print + "/* generated by src/common/unicode/generate-unicode_combining_table.pl, do not edit */\n\n"; + +print "static const struct mbinterval combining[] = {\n"; + +foreach my $line (<ARGV>) +{ + chomp $line; + my @fields = split ';', $line; + $codepoint = hex $fields[0]; + + if ($fields[2] eq 'Me' || $fields[2] eq 'Mn') + { + # combining character, save for start of range + if (!defined($range_start)) + { + $range_start = $codepoint; + } + } + else + { + # not a combining character, print out previous range if any + if (defined($range_start)) + { + printf "\t{0x%04X, 0x%04X},\n", $range_start, $prev_codepoint; + $range_start = undef; + } + } +} +continue +{ + $prev_codepoint = $codepoint; +} + +print "};\n"; diff --git a/src/common/unicode/generate-unicode_east_asian_fw_table.pl b/src/common/unicode/generate-unicode_east_asian_fw_table.pl new file mode 100644 index 0000000..9d03684 --- /dev/null +++ b/src/common/unicode/generate-unicode_east_asian_fw_table.pl @@ -0,0 +1,76 @@ +#!/usr/bin/perl +# +# Generate a sorted list of non-overlapping intervals of East Asian Wide (W) +# and East Asian Fullwidth (F) characters, using Unicode data files as input. +# Pass EastAsianWidth.txt as argument. The output is on stdout. 
+# +# Copyright (c) 2019-2022, PostgreSQL Global Development Group + +use strict; +use warnings; + +my $range_start = undef; +my ($first, $last); +my $prev_last; + +print + "/* generated by src/common/unicode/generate-unicode_east_asian_fw_table.pl, do not edit */\n\n"; + +print "static const struct mbinterval east_asian_fw[] = {\n"; + +foreach my $line (<ARGV>) +{ + chomp $line; + $line =~ s/\s*#.*$//; + next if $line eq ''; + my ($codepoint, $width) = split ';', $line; + + if ($codepoint =~ /\.\./) + { + ($first, $last) = split /\.\./, $codepoint; + } + else + { + $first = $last = $codepoint; + } + + ($first, $last) = map(hex, ($first, $last)); + + if ($width eq 'F' || $width eq 'W') + { + # fullwidth/wide characters + if (!defined($range_start)) + { + # save for start of range if one hasn't been started yet + $range_start = $first; + } + elsif ($first != $prev_last + 1) + { + # ranges aren't contiguous; emit the last and start a new one + printf "\t{0x%04X, 0x%04X},\n", $range_start, $prev_last; + $range_start = $first; + } + } + else + { + # not wide characters, print out previous range if any + if (defined($range_start)) + { + printf "\t{0x%04X, 0x%04X},\n", $range_start, $prev_last; + $range_start = undef; + } + } +} +continue +{ + $prev_last = $last; +} + +# don't forget any ranges at the very end of the database (though there are none +# as of Unicode 13.0) +if (defined($range_start)) +{ + printf "\t{0x%04X, 0x%04X},\n", $range_start, $prev_last; +} + +print "};\n"; diff --git a/src/common/unicode/generate-unicode_norm_table.pl b/src/common/unicode/generate-unicode_norm_table.pl new file mode 100644 index 0000000..e442345 --- /dev/null +++ b/src/common/unicode/generate-unicode_norm_table.pl @@ -0,0 +1,406 @@ +#!/usr/bin/perl +# +# Generate a composition table and its lookup utilities, using Unicode data +# files as input. +# +# Input: UnicodeData.txt and CompositionExclusions.txt +# Output: unicode_norm_table.h and unicode_norm_hashfunc.h +# +# Copyright (c) 2000-2022, PostgreSQL Global Development Group + +use strict; +use warnings; + +use FindBin; +use lib "$FindBin::RealBin/../../tools/"; +use PerfectHash; + +my $output_table_file = "unicode_norm_table.h"; +my $output_func_file = "unicode_norm_hashfunc.h"; + +my $FH; + +# Read list of codes that should be excluded from re-composition. +my @composition_exclusion_codes = (); +open($FH, '<', "CompositionExclusions.txt") + or die "Could not open CompositionExclusions.txt: $!."; +while (my $line = <$FH>) +{ + if ($line =~ /^([[:xdigit:]]+)/) + { + push @composition_exclusion_codes, $1; + } +} +close $FH; + +# Read entries from UnicodeData.txt into a list, and a hash table. We need +# three fields from each row: the codepoint, canonical combining class, +# and character decomposition mapping +my @characters = (); +my %character_hash = (); +open($FH, '<', "UnicodeData.txt") + or die "Could not open UnicodeData.txt: $!."; +while (my $line = <$FH>) +{ + + # Split the line wanted and get the fields needed: + # - Unicode code value + # - Canonical Combining Class + # - Character Decomposition Mapping + my @elts = split(';', $line); + my $code = $elts[0]; + my $class = $elts[3]; + my $decomp = $elts[5]; + + # Skip codepoints above U+10FFFF. They cannot be represented in 4 bytes + # in UTF-8, and PostgreSQL doesn't support UTF-8 characters longer than + # 4 bytes. (This is just pro forma, as there aren't any such entries in + # the data file, currently.) 
+ next if hex($code) > 0x10FFFF; + + # Skip characters with no decompositions and a class of 0, to reduce the + # table size. + next if $class eq '0' && $decomp eq ''; + + my %char_entry = (code => $code, class => $class, decomp => $decomp); + push(@characters, \%char_entry); + $character_hash{$code} = \%char_entry; +} +close $FH; + +my $num_characters = scalar @characters; + +# Start writing out the output files +open my $OT, '>', $output_table_file + or die "Could not open output file $output_table_file: $!\n"; +open my $OF, '>', $output_func_file + or die "Could not open output file $output_func_file: $!\n"; + +print $OT <<HEADER; +/*------------------------------------------------------------------------- + * + * unicode_norm_table.h + * Composition table used for Unicode normalization + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/common/unicode_norm_table.h + * + *------------------------------------------------------------------------- + */ + +/* + * File auto-generated by src/common/unicode/generate-unicode_norm_table.pl, + * do not edit. There is deliberately not an #ifndef PG_UNICODE_NORM_TABLE_H + * here. + */ +typedef struct +{ + uint32 codepoint; /* Unicode codepoint */ + uint8 comb_class; /* combining class of character */ + uint8 dec_size_flags; /* size and flags of decomposition code list */ + uint16 dec_index; /* index into UnicodeDecomp_codepoints, or the + * decomposition itself if DECOMP_INLINE */ +} pg_unicode_decomposition; + +#define DECOMP_NO_COMPOSE 0x80 /* don't use for re-composition */ +#define DECOMP_INLINE 0x40 /* decomposition is stored inline in + * dec_index */ +#define DECOMP_COMPAT 0x20 /* compatibility mapping */ + +#define DECOMPOSITION_SIZE(x) ((x)->dec_size_flags & 0x1F) +#define DECOMPOSITION_NO_COMPOSE(x) (((x)->dec_size_flags & (DECOMP_NO_COMPOSE | DECOMP_COMPAT)) != 0) +#define DECOMPOSITION_IS_INLINE(x) (((x)->dec_size_flags & DECOMP_INLINE) != 0) +#define DECOMPOSITION_IS_COMPAT(x) (((x)->dec_size_flags & DECOMP_COMPAT) != 0) + +/* Table of Unicode codepoints and their decompositions */ +static const pg_unicode_decomposition UnicodeDecompMain[$num_characters] = +{ +HEADER + +print $OF <<HEADER; +/*------------------------------------------------------------------------- + * + * unicode_norm_hashfunc.h + * Perfect hash functions used for Unicode normalization + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/common/unicode_norm_hashfunc.h + * + *------------------------------------------------------------------------- + */ + +/* + * File auto-generated by src/common/unicode/generate-unicode_norm_table.pl, + * do not edit. There is deliberately not an #ifndef PG_UNICODE_NORM_HASHFUNC_H + * here. 
+ */ + +#include "common/unicode_norm_table.h" + +/* Typedef for perfect hash functions */ +typedef int (*cp_hash_func) (const void *key); + +/* Information for lookups with perfect hash functions */ +typedef struct +{ + const pg_unicode_decomposition *decomps; + cp_hash_func hash; + int num_decomps; +} pg_unicode_decompinfo; + +typedef struct +{ + const uint16 *inverse_lookup; + cp_hash_func hash; + int num_recomps; +} pg_unicode_recompinfo; + +HEADER + +my $decomp_index = 0; +my $decomp_string = ""; +my @dec_cp_packed; +my $main_index = 0; +my @rec_info; + +my $last_code = $characters[-1]->{code}; +foreach my $char (@characters) +{ + my $code = $char->{code}; + my $class = $char->{class}; + my $decomp = $char->{decomp}; + + # Save the code point bytes as a string in network order. + push @dec_cp_packed, pack('N', hex($char->{code})); + + # The character decomposition mapping field in UnicodeData.txt is a list + # of unicode codepoints, separated by space. But it can be prefixed with + # so-called compatibility formatting tag, like "<compat>", or "<font>". + # The entries with compatibility formatting tags should not be used for + # re-composing characters during normalization, so flag them in the table. + # (The tag doesn't matter, only whether there is a tag or not) + my $compat = 0; + if ($decomp =~ /\<.*\>/) + { + $compat = 1; + $decomp =~ s/\<[^][]*\>//g; + } + my @decomp_elts = split(" ", $decomp); + + # Decomposition size + # Print size of decomposition + my $decomp_size = scalar(@decomp_elts); + die if $decomp_size > 0x1F; # to not overrun bitmask + + my $first_decomp = shift @decomp_elts; + + my $flags = ""; + my $comment = ""; + + if ($compat) + { + $flags .= " | DECOMP_COMPAT"; + } + + if ($decomp_size == 2) + { + # Should this be used for recomposition? + if ( $character_hash{$first_decomp} + && $character_hash{$first_decomp}->{class} != 0) + { + $flags .= " | DECOMP_NO_COMPOSE"; + $comment = "non-starter decomposition"; + } + else + { + foreach my $lcode (@composition_exclusion_codes) + { + if ($lcode eq $code) + { + $flags .= " | DECOMP_NO_COMPOSE"; + $comment = "in exclusion list"; + last; + } + } + } + + # Save info for recomposeable codepoints. + # Note that this MUST match the macro DECOMPOSITION_NO_COMPOSE in C + # above! See also the inverse lookup in recompose_code() found in + # src/common/unicode_norm.c. + if (!($flags =~ /DECOMP_COMPAT/ || $flags =~ /DECOMP_NO_COMPOSE/)) + { + push @rec_info, + { + code => $code, + main_index => $main_index, + first => $first_decomp, + second => $decomp_elts[0] + }; + } + } + + if ($decomp_size == 0) + { + print $OT "\t{0x$code, $class, 0$flags, 0}"; + } + elsif ($decomp_size == 1 && length($first_decomp) <= 4) + { + + # The decomposition consists of a single codepoint, and it fits + # in a uint16, so we can store it "inline" in the main table. + $flags .= " | DECOMP_INLINE"; + print $OT "\t{0x$code, $class, 1$flags, 0x$first_decomp}"; + } + else + { + print $OT "\t{0x$code, $class, $decomp_size$flags, $decomp_index}"; + + # Now save the decompositions into a dedicated area that will + # be written afterwards. First build the entry dedicated to + # a sub-table with the code and decomposition. + $decomp_string .= ",\n" if ($decomp_string ne ""); + + $decomp_string .= "\t /* $decomp_index */ 0x$first_decomp"; + foreach (@decomp_elts) + { + $decomp_string .= ", 0x$_"; + } + + $decomp_index = $decomp_index + $decomp_size; + } + + # Print a comma after all items except the last one. 
+ print $OT "," unless ($code eq $last_code); + + print $OT "\t/* $comment */" if ($comment ne ""); + print $OT "\n"; + + $main_index++; +} +print $OT "\n};\n\n"; + +# Print the array of decomposed codes. +print $OT <<HEADER; +/* codepoints array */ +static const uint32 UnicodeDecomp_codepoints[$decomp_index] = +{ +$decomp_string +}; +HEADER + +# Emit the definition of the decomp hash function. +my $dec_funcname = 'Decomp_hash_func'; +my $dec_func = PerfectHash::generate_hash_function(\@dec_cp_packed, + $dec_funcname, fixed_key_length => 4); +print $OF "/* Perfect hash function for decomposition */\n"; +print $OF "static $dec_func\n"; + +# Emit the structure that wraps the hash lookup information into +# one variable. +print $OF <<HEADER; +/* Hash lookup information for decomposition */ +static const pg_unicode_decompinfo UnicodeDecompInfo = +{ + UnicodeDecompMain, + $dec_funcname, + $num_characters +}; + +HEADER + +# Find the lowest codepoint that decomposes to each recomposeable +# code pair and create a mapping to it. +my $recomp_string = ""; +my @rec_cp_packed; +my %seenit; +my $firstentry = 1; +foreach my $rec (sort recomp_sort @rec_info) +{ + # The hash key is formed by concatenating the bytes of the two + # codepoints. See also recompose_code() in common/unicode_norm.c. + my $hashkey = (hex($rec->{first}) << 32) | hex($rec->{second}); + + # We are only interested in the lowest code point that decomposes + # to the given code pair. + next if $seenit{$hashkey}; + + # Save the hash key bytes in network order + push @rec_cp_packed, pack('Q>', $hashkey); + + # Append inverse lookup element + $recomp_string .= ",\n" if !$firstentry; + $recomp_string .= sprintf "\t/* U+%s+%s -> U+%s */ %s", + $rec->{first}, + $rec->{second}, + $rec->{code}, + $rec->{main_index}; + + $seenit{$hashkey} = 1; + $firstentry = 0; +} + +# Emit the inverse lookup array containing indexes into UnicodeDecompMain. +my $num_recomps = scalar @rec_cp_packed; +print $OF <<HEADER; +/* Inverse lookup array -- contains indexes into UnicodeDecompMain[] */ +static const uint16 RecompInverseLookup[$num_recomps] = +{ +$recomp_string +}; + +HEADER + +# Emit the definition of the recomposition hash function. +my $rec_funcname = 'Recomp_hash_func'; +my $rec_func = + PerfectHash::generate_hash_function(\@rec_cp_packed, $rec_funcname, + fixed_key_length => 8); +print $OF "/* Perfect hash function for recomposition */\n"; +print $OF "static $rec_func\n"; + +# Emit the structure that wraps the hash lookup information into +# one variable. +print $OF <<HEADER; +/* Hash lookup information for recomposition */ +static const pg_unicode_recompinfo UnicodeRecompInfo = +{ + RecompInverseLookup, + $rec_funcname, + $num_recomps +}; +HEADER + +close $OT; +close $OF; + +sub recomp_sort +{ + my $a1 = hex($a->{first}); + my $b1 = hex($b->{first}); + + my $a2 = hex($a->{second}); + my $b2 = hex($b->{second}); + + # First sort by the first code point + return -1 if $a1 < $b1; + return 1 if $a1 > $b1; + + # Then sort by the second code point + return -1 if $a2 < $b2; + return 1 if $a2 > $b2; + + # Finally sort by the code point that decomposes into first and + # second ones. 
+ my $acode = hex($a->{code}); + my $bcode = hex($b->{code}); + + return -1 if $acode < $bcode; + return 1 if $acode > $bcode; + + die "found duplicate entries of recomposeable code pairs"; +} diff --git a/src/common/unicode/generate-unicode_normprops_table.pl b/src/common/unicode/generate-unicode_normprops_table.pl new file mode 100644 index 0000000..08e41b3 --- /dev/null +++ b/src/common/unicode/generate-unicode_normprops_table.pl @@ -0,0 +1,125 @@ +#!/usr/bin/perl +# +# Generate table of Unicode normalization "quick check" properties +# (see UAX #15). Pass DerivedNormalizationProps.txt as argument. The +# output is on stdout. +# +# Copyright (c) 2020-2022, PostgreSQL Global Development Group + +use strict; +use warnings; + +use FindBin; +use lib "$FindBin::RealBin/../../tools/"; +use PerfectHash; + +my %data; + +print + "/* generated by src/common/unicode/generate-unicode_normprops_table.pl, do not edit */\n\n"; + +print <<EOS; +#include "common/unicode_norm.h" + +/* + * Normalization quick check entry for codepoint. We use a bit field + * here to save space. + */ +typedef struct +{ + unsigned int codepoint:21; + signed int quickcheck:4; /* really UnicodeNormalizationQC */ +} pg_unicode_normprops; + +/* Typedef for hash function on quick check table */ +typedef int (*qc_hash_func) (const void *key); + +/* Information for quick check lookup with perfect hash function */ +typedef struct +{ + const pg_unicode_normprops *normprops; + qc_hash_func hash; + int num_normprops; +} pg_unicode_norminfo; +EOS + +foreach my $line (<ARGV>) +{ + chomp $line; + $line =~ s/\s*#.*$//; + next if $line eq ''; + my ($codepoint, $prop, $value) = split /\s*;\s*/, $line; + next if $prop !~ /_QC/; + + my ($first, $last); + if ($codepoint =~ /\.\./) + { + ($first, $last) = split /\.\./, $codepoint; + } + else + { + $first = $last = $codepoint; + } + + foreach my $cp (hex($first) .. hex($last)) + { + $data{$prop}{$cp} = $value; + } +} + +# We create a separate array for each normalization form rather than, +# say, a two-dimensional array, because that array would be very +# sparse and would create unnecessary overhead especially for the NFC +# lookup. +foreach my $prop (sort keys %data) +{ + # Don't build the tables for the "D" forms because they are too + # big. See also unicode_is_normalized_quickcheck(). + next if $prop eq "NFD_QC" || $prop eq "NFKD_QC"; + + print "\n"; + print + "static const pg_unicode_normprops UnicodeNormProps_${prop}[] = {\n"; + + my %subdata = %{ $data{$prop} }; + my @cp_packed; + foreach my $cp (sort { $a <=> $b } keys %subdata) + { + my $qc; + if ($subdata{$cp} eq 'N') + { + $qc = 'UNICODE_NORM_QC_NO'; + } + elsif ($subdata{$cp} eq 'M') + { + $qc = 'UNICODE_NORM_QC_MAYBE'; + } + else + { + die; + } + printf "\t{0x%04X, %s},\n", $cp, $qc; + + # Save the bytes as a string in network order. + push @cp_packed, pack('N', $cp); + } + + print "};\n"; + + # Emit the definition of the perfect hash function. + my $funcname = $prop . '_hash_func'; + my $f = PerfectHash::generate_hash_function(\@cp_packed, $funcname, + fixed_key_length => 4); + printf "\n/* Perfect hash function for %s */", $prop; + print "\nstatic $f\n"; + + # Emit the structure that wraps the hash lookup information into + # one variable. 
+ printf "/* Hash lookup information for %s */", $prop; + printf "\nstatic const pg_unicode_norminfo "; + printf "UnicodeNormInfo_%s = {\n", $prop; + printf "\tUnicodeNormProps_%s,\n", $prop; + printf "\t%s,\n", $funcname; + printf "\t%d\n", scalar @cp_packed; + printf "};\n"; +} diff --git a/src/common/unicode/norm_test.c b/src/common/unicode/norm_test.c new file mode 100644 index 0000000..0e244ad --- /dev/null +++ b/src/common/unicode/norm_test.c @@ -0,0 +1,86 @@ +/*------------------------------------------------------------------------- + * norm_test.c + * Program to test Unicode normalization functions. + * + * Portions Copyright (c) 2017-2022, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/common/unicode/norm_test.c + * + *------------------------------------------------------------------------- + */ +#include "postgres_fe.h" + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "common/unicode_norm.h" + +#include "norm_test_table.h" + +static char * +print_wchar_str(const pg_wchar *s) +{ +#define BUF_DIGITS 50 + static char buf[BUF_DIGITS * 11 + 1]; + int i; + char *p; + + i = 0; + p = buf; + while (*s && i < BUF_DIGITS) + { + p += sprintf(p, "U+%04X ", *s); + i++; + s++; + } + *p = '\0'; + + return buf; +} + +static int +pg_wcscmp(const pg_wchar *s1, const pg_wchar *s2) +{ + for (;;) + { + if (*s1 < *s2) + return -1; + if (*s1 > *s2) + return 1; + if (*s1 == 0) + return 0; + s1++; + s2++; + } +} + +int +main(int argc, char **argv) +{ + const pg_unicode_test *test; + + for (test = UnicodeNormalizationTests; test->input[0] != 0; test++) + { + for (int form = 0; form < 4; form++) + { + pg_wchar *result; + + result = unicode_normalize(form, test->input); + + if (pg_wcscmp(test->output[form], result) != 0) + { + printf("FAILURE (NormalizationTest.txt line %d form %d):\n", test->linenum, form); + printf("input: %s\n", print_wchar_str(test->input)); + printf("expected: %s\n", print_wchar_str(test->output[form])); + printf("got: %s\n", print_wchar_str(result)); + printf("\n"); + exit(1); + } + } + } + + printf("All tests successful!\n"); + exit(0); +} diff --git a/src/common/unicode_norm.c b/src/common/unicode_norm.c new file mode 100644 index 0000000..8f1402e --- /dev/null +++ b/src/common/unicode_norm.c @@ -0,0 +1,634 @@ +/*------------------------------------------------------------------------- + * unicode_norm.c + * Normalize a Unicode string + * + * This implements Unicode normalization, per the documentation at + * https://www.unicode.org/reports/tr15/. 
+ * + * Portions Copyright (c) 2017-2022, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/common/unicode_norm.c + * + *------------------------------------------------------------------------- + */ +#ifndef FRONTEND +#include "postgres.h" +#else +#include "postgres_fe.h" +#endif + +#include "common/unicode_norm.h" +#ifndef FRONTEND +#include "common/unicode_norm_hashfunc.h" +#include "common/unicode_normprops_table.h" +#include "port/pg_bswap.h" +#else +#include "common/unicode_norm_table.h" +#endif + +#ifndef FRONTEND +#define ALLOC(size) palloc(size) +#define FREE(size) pfree(size) +#else +#define ALLOC(size) malloc(size) +#define FREE(size) free(size) +#endif + +/* Constants for calculations with Hangul characters */ +#define SBASE 0xAC00 /* U+AC00 */ +#define LBASE 0x1100 /* U+1100 */ +#define VBASE 0x1161 /* U+1161 */ +#define TBASE 0x11A7 /* U+11A7 */ +#define LCOUNT 19 +#define VCOUNT 21 +#define TCOUNT 28 +#define NCOUNT VCOUNT * TCOUNT +#define SCOUNT LCOUNT * NCOUNT + +#ifdef FRONTEND +/* comparison routine for bsearch() of decomposition lookup table. */ +static int +conv_compare(const void *p1, const void *p2) +{ + uint32 v1, + v2; + + v1 = *(const uint32 *) p1; + v2 = ((const pg_unicode_decomposition *) p2)->codepoint; + return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1); +} + +#endif + +/* + * get_code_entry + * + * Get the entry corresponding to code in the decomposition lookup table. + * The backend version of this code uses a perfect hash function for the + * lookup, while the frontend version uses a binary search. + */ +static const pg_unicode_decomposition * +get_code_entry(pg_wchar code) +{ +#ifndef FRONTEND + int h; + uint32 hashkey; + pg_unicode_decompinfo decompinfo = UnicodeDecompInfo; + + /* + * Compute the hash function. The hash key is the codepoint with the bytes + * in network order. + */ + hashkey = pg_hton32(code); + h = decompinfo.hash(&hashkey); + + /* An out-of-range result implies no match */ + if (h < 0 || h >= decompinfo.num_decomps) + return NULL; + + /* + * Since it's a perfect hash, we need only match to the specific codepoint + * it identifies. + */ + if (code != decompinfo.decomps[h].codepoint) + return NULL; + + /* Success! */ + return &decompinfo.decomps[h]; +#else + return bsearch(&(code), + UnicodeDecompMain, + lengthof(UnicodeDecompMain), + sizeof(pg_unicode_decomposition), + conv_compare); +#endif +} + +/* + * Get the combining class of the given codepoint. + */ +static uint8 +get_canonical_class(pg_wchar code) +{ + const pg_unicode_decomposition *entry = get_code_entry(code); + + /* + * If no entries are found, the character used is either an Hangul + * character or a character with a class of 0 and no decompositions. + */ + if (!entry) + return 0; + else + return entry->comb_class; +} + +/* + * Given a decomposition entry looked up earlier, get the decomposed + * characters. + * + * Note: the returned pointer can point to statically allocated buffer, and + * is only valid until next call to this function! + */ +static const pg_wchar * +get_code_decomposition(const pg_unicode_decomposition *entry, int *dec_size) +{ + static pg_wchar x; + + if (DECOMPOSITION_IS_INLINE(entry)) + { + Assert(DECOMPOSITION_SIZE(entry) == 1); + x = (pg_wchar) entry->dec_index; + *dec_size = 1; + return &x; + } + else + { + *dec_size = DECOMPOSITION_SIZE(entry); + return &UnicodeDecomp_codepoints[entry->dec_index]; + } +} + +/* + * Calculate how many characters a given character will decompose to. 
+ * + * This needs to recurse, if the character decomposes into characters that + * are, in turn, decomposable. + */ +static int +get_decomposed_size(pg_wchar code, bool compat) +{ + const pg_unicode_decomposition *entry; + int size = 0; + int i; + const uint32 *decomp; + int dec_size; + + /* + * Fast path for Hangul characters not stored in tables to save memory as + * decomposition is algorithmic. See + * https://www.unicode.org/reports/tr15/tr15-18.html, annex 10 for details + * on the matter. + */ + if (code >= SBASE && code < SBASE + SCOUNT) + { + uint32 tindex, + sindex; + + sindex = code - SBASE; + tindex = sindex % TCOUNT; + + if (tindex != 0) + return 3; + return 2; + } + + entry = get_code_entry(code); + + /* + * Just count current code if no other decompositions. A NULL entry is + * equivalent to a character with class 0 and no decompositions. + */ + if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 || + (!compat && DECOMPOSITION_IS_COMPAT(entry))) + return 1; + + /* + * If this entry has other decomposition codes look at them as well. First + * get its decomposition in the list of tables available. + */ + decomp = get_code_decomposition(entry, &dec_size); + for (i = 0; i < dec_size; i++) + { + uint32 lcode = decomp[i]; + + size += get_decomposed_size(lcode, compat); + } + + return size; +} + +/* + * Recompose a set of characters. For hangul characters, the calculation + * is algorithmic. For others, an inverse lookup at the decomposition + * table is necessary. Returns true if a recomposition can be done, and + * false otherwise. + */ +static bool +recompose_code(uint32 start, uint32 code, uint32 *result) +{ + /* + * Handle Hangul characters algorithmically, per the Unicode spec. + * + * Check if two current characters are L and V. + */ + if (start >= LBASE && start < LBASE + LCOUNT && + code >= VBASE && code < VBASE + VCOUNT) + { + /* make syllable of form LV */ + uint32 lindex = start - LBASE; + uint32 vindex = code - VBASE; + + *result = SBASE + (lindex * VCOUNT + vindex) * TCOUNT; + return true; + } + /* Check if two current characters are LV and T */ + else if (start >= SBASE && start < (SBASE + SCOUNT) && + ((start - SBASE) % TCOUNT) == 0 && + code >= TBASE && code < (TBASE + TCOUNT)) + { + /* make syllable of form LVT */ + uint32 tindex = code - TBASE; + + *result = start + tindex; + return true; + } + else + { + const pg_unicode_decomposition *entry; + + /* + * Do an inverse lookup of the decomposition tables to see if anything + * matches. The comparison just needs to be a perfect match on the + * sub-table of size two, because the start character has already been + * recomposed partially. This lookup uses a perfect hash function for + * the backend code. + */ +#ifndef FRONTEND + + int h, + inv_lookup_index; + uint64 hashkey; + pg_unicode_recompinfo recompinfo = UnicodeRecompInfo; + + /* + * Compute the hash function. The hash key is formed by concatenating + * bytes of the two codepoints in network order. See also + * src/common/unicode/generate-unicode_norm_table.pl. 
+ */ + hashkey = pg_hton64(((uint64) start << 32) | (uint64) code); + h = recompinfo.hash(&hashkey); + + /* An out-of-range result implies no match */ + if (h < 0 || h >= recompinfo.num_recomps) + return false; + + inv_lookup_index = recompinfo.inverse_lookup[h]; + entry = &UnicodeDecompMain[inv_lookup_index]; + + if (start == UnicodeDecomp_codepoints[entry->dec_index] && + code == UnicodeDecomp_codepoints[entry->dec_index + 1]) + { + *result = entry->codepoint; + return true; + } + +#else + + int i; + + for (i = 0; i < lengthof(UnicodeDecompMain); i++) + { + entry = &UnicodeDecompMain[i]; + + if (DECOMPOSITION_SIZE(entry) != 2) + continue; + + if (DECOMPOSITION_NO_COMPOSE(entry)) + continue; + + if (start == UnicodeDecomp_codepoints[entry->dec_index] && + code == UnicodeDecomp_codepoints[entry->dec_index + 1]) + { + *result = entry->codepoint; + return true; + } + } +#endif /* !FRONTEND */ + } + + return false; +} + +/* + * Decompose the given code into the array given by caller. The + * decomposition begins at the position given by caller, saving one + * lookup on the decomposition table. The current position needs to be + * updated here to let the caller know from where to continue filling + * in the array result. + */ +static void +decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current) +{ + const pg_unicode_decomposition *entry; + int i; + const uint32 *decomp; + int dec_size; + + /* + * Fast path for Hangul characters not stored in tables to save memory as + * decomposition is algorithmic. See + * https://www.unicode.org/reports/tr15/tr15-18.html, annex 10 for details + * on the matter. + */ + if (code >= SBASE && code < SBASE + SCOUNT) + { + uint32 l, + v, + tindex, + sindex; + pg_wchar *res = *result; + + sindex = code - SBASE; + l = LBASE + sindex / (VCOUNT * TCOUNT); + v = VBASE + (sindex % (VCOUNT * TCOUNT)) / TCOUNT; + tindex = sindex % TCOUNT; + + res[*current] = l; + (*current)++; + res[*current] = v; + (*current)++; + + if (tindex != 0) + { + res[*current] = TBASE + tindex; + (*current)++; + } + + return; + } + + entry = get_code_entry(code); + + /* + * Just fill in with the current decomposition if there are no + * decomposition codes to recurse to. A NULL entry is equivalent to a + * character with class 0 and no decompositions, so just leave also in + * this case. + */ + if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 || + (!compat && DECOMPOSITION_IS_COMPAT(entry))) + { + pg_wchar *res = *result; + + res[*current] = code; + (*current)++; + return; + } + + /* + * If this entry has other decomposition codes look at them as well. + */ + decomp = get_code_decomposition(entry, &dec_size); + for (i = 0; i < dec_size; i++) + { + pg_wchar lcode = (pg_wchar) decomp[i]; + + /* Leave if no more decompositions */ + decompose_code(lcode, compat, result, current); + } +} + +/* + * unicode_normalize - Normalize a Unicode string to the specified form. + * + * The input is a 0-terminated array of codepoints. + * + * In frontend, returns a 0-terminated array of codepoints, allocated with + * malloc. Or NULL if we run out of memory. In backend, the returned + * string is palloc'd instead, and OOM is reported with ereport(). 
+ */ +pg_wchar * +unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input) +{ + bool compat = (form == UNICODE_NFKC || form == UNICODE_NFKD); + bool recompose = (form == UNICODE_NFC || form == UNICODE_NFKC); + pg_wchar *decomp_chars; + pg_wchar *recomp_chars; + int decomp_size, + current_size; + int count; + const pg_wchar *p; + + /* variables for recomposition */ + int last_class; + int starter_pos; + int target_pos; + uint32 starter_ch; + + /* First, do character decomposition */ + + /* + * Calculate how many characters long the decomposed version will be. + */ + decomp_size = 0; + for (p = input; *p; p++) + decomp_size += get_decomposed_size(*p, compat); + + decomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar)); + if (decomp_chars == NULL) + return NULL; + + /* + * Now fill in each entry recursively. This needs a second pass on the + * decomposition table. + */ + current_size = 0; + for (p = input; *p; p++) + decompose_code(*p, compat, &decomp_chars, ¤t_size); + decomp_chars[decomp_size] = '\0'; + Assert(decomp_size == current_size); + + /* Leave if there is nothing to decompose */ + if (decomp_size == 0) + return decomp_chars; + + /* + * Now apply canonical ordering. + */ + for (count = 1; count < decomp_size; count++) + { + pg_wchar prev = decomp_chars[count - 1]; + pg_wchar next = decomp_chars[count]; + pg_wchar tmp; + const uint8 prevClass = get_canonical_class(prev); + const uint8 nextClass = get_canonical_class(next); + + /* + * Per Unicode (https://www.unicode.org/reports/tr15/tr15-18.html) + * annex 4, a sequence of two adjacent characters in a string is an + * exchangeable pair if the combining class (from the Unicode + * Character Database) for the first character is greater than the + * combining class for the second, and the second is not a starter. A + * character is a starter if its combining class is 0. + */ + if (prevClass == 0 || nextClass == 0) + continue; + + if (prevClass <= nextClass) + continue; + + /* exchange can happen */ + tmp = decomp_chars[count - 1]; + decomp_chars[count - 1] = decomp_chars[count]; + decomp_chars[count] = tmp; + + /* backtrack to check again */ + if (count > 1) + count -= 2; + } + + if (!recompose) + return decomp_chars; + + /* + * The last phase of NFC and NFKC is the recomposition of the reordered + * Unicode string using combining classes. The recomposed string cannot be + * longer than the decomposed one, so make the allocation of the output + * string based on that assumption. 
+	 */
+	recomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
+	if (!recomp_chars)
+	{
+		FREE(decomp_chars);
+		return NULL;
+	}
+
+	last_class = -1;			/* this eliminates a special check */
+	starter_pos = 0;
+	target_pos = 1;
+	starter_ch = recomp_chars[0] = decomp_chars[0];
+
+	for (count = 1; count < decomp_size; count++)
+	{
+		pg_wchar	ch = decomp_chars[count];
+		int			ch_class = get_canonical_class(ch);
+		pg_wchar	composite;
+
+		if (last_class < ch_class &&
+			recompose_code(starter_ch, ch, &composite))
+		{
+			recomp_chars[starter_pos] = composite;
+			starter_ch = composite;
+		}
+		else if (ch_class == 0)
+		{
+			starter_pos = target_pos;
+			starter_ch = ch;
+			last_class = -1;
+			recomp_chars[target_pos++] = ch;
+		}
+		else
+		{
+			last_class = ch_class;
+			recomp_chars[target_pos++] = ch;
+		}
+	}
+	recomp_chars[target_pos] = (pg_wchar) '\0';
+
+	FREE(decomp_chars);
+
+	return recomp_chars;
+}
+
+/*
+ * Normalization "quick check" algorithm; see
+ * <http://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms>
+ */
+
+/* We only need this in the backend. */
+#ifndef FRONTEND
+
+static const pg_unicode_normprops *
+qc_hash_lookup(pg_wchar ch, const pg_unicode_norminfo *norminfo)
+{
+	int			h;
+	uint32		hashkey;
+
+	/*
+	 * Compute the hash function. The hash key is the codepoint with the
+	 * bytes in network order.
+	 */
+	hashkey = pg_hton32(ch);
+	h = norminfo->hash(&hashkey);
+
+	/* An out-of-range result implies no match */
+	if (h < 0 || h >= norminfo->num_normprops)
+		return NULL;
+
+	/*
+	 * Since it's a perfect hash, we need only match to the specific
+	 * codepoint it identifies.
+	 */
+	if (ch != norminfo->normprops[h].codepoint)
+		return NULL;
+
+	/* Success! */
+	return &norminfo->normprops[h];
+}
+
+/*
+ * Look up the normalization quick check character property
+ */
+static UnicodeNormalizationQC
+qc_is_allowed(UnicodeNormalizationForm form, pg_wchar ch)
+{
+	const pg_unicode_normprops *found = NULL;
+
+	switch (form)
+	{
+		case UNICODE_NFC:
+			found = qc_hash_lookup(ch, &UnicodeNormInfo_NFC_QC);
+			break;
+		case UNICODE_NFKC:
+			found = qc_hash_lookup(ch, &UnicodeNormInfo_NFKC_QC);
+			break;
+		default:
+			Assert(false);
+			break;
+	}
+
+	if (found)
+		return found->quickcheck;
+	else
+		return UNICODE_NORM_QC_YES;
+}
+
+UnicodeNormalizationQC
+unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const pg_wchar *input)
+{
+	uint8		lastCanonicalClass = 0;
+	UnicodeNormalizationQC result = UNICODE_NORM_QC_YES;
+
+	/*
+	 * For the "D" forms, we don't run the quickcheck. We don't include the
+	 * lookup tables for those because they are huge, checking for these
+	 * particular forms is less common, and running the slow path is faster
+	 * for the "D" forms than the "C" forms because you don't need to
+	 * recompose, which is slow.
+	 */
+	if (form == UNICODE_NFD || form == UNICODE_NFKD)
+		return UNICODE_NORM_QC_MAYBE;
+
+	for (const pg_wchar *p = input; *p; p++)
+	{
+		pg_wchar	ch = *p;
+		uint8		canonicalClass;
+		UnicodeNormalizationQC check;
+
+		canonicalClass = get_canonical_class(ch);
+		if (lastCanonicalClass > canonicalClass && canonicalClass != 0)
+			return UNICODE_NORM_QC_NO;
+
+		check = qc_is_allowed(form, ch);
+		if (check == UNICODE_NORM_QC_NO)
+			return UNICODE_NORM_QC_NO;
+		else if (check == UNICODE_NORM_QC_MAYBE)
+			result = UNICODE_NORM_QC_MAYBE;
+
+		lastCanonicalClass = canonicalClass;
+	}
+	return result;
+}
+
+#endif							/* !FRONTEND */
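
Usage sketches (not part of the commit)
---------------------------------------

The Hangul fast paths in unicode_norm.c above are algorithmic rather than
table-driven. The following standalone sketch applies the same
SBASE/LBASE/VBASE/TBASE arithmetic to decompose one precomposed syllable;
the constants are copied from the patch, everything else is illustrative.

/*
 * Sketch: algorithmic Hangul decomposition, as in decompose_code() above.
 * Constants are copied from unicode_norm.c; see UAX #15, annex 10.
 */
#include <stdio.h>
#include <stdint.h>

#define SBASE  0xAC00
#define LBASE  0x1100
#define VBASE  0x1161
#define TBASE  0x11A7
#define VCOUNT 21
#define TCOUNT 28

int
main(void)
{
	uint32_t	code = 0xAC01;	/* HANGUL SYLLABLE GAG */
	uint32_t	sindex = code - SBASE;
	uint32_t	l = LBASE + sindex / (VCOUNT * TCOUNT);
	uint32_t	v = VBASE + (sindex % (VCOUNT * TCOUNT)) / TCOUNT;
	uint32_t	tindex = sindex % TCOUNT;

	printf("U+%04X -> U+%04X U+%04X", (unsigned) code, (unsigned) l, (unsigned) v);
	if (tindex != 0)			/* the trailing consonant is optional */
		printf(" U+%04X", (unsigned) (TBASE + tindex));
	printf("\n");				/* prints: U+AC01 -> U+1100 U+1161 U+11A8 */
	return 0;
}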
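
A minimal frontend caller of unicode_normalize(), the entry point added by
this patch, might look like the sketch below. It assumes a frontend build
(where out-of-memory is reported by returning NULL, per the function's
comment) and a program linked with src/common; the pair U+0041 U+030A
should recompose to U+00C5 under NFC.

/*
 * Sketch: calling unicode_normalize() from frontend code.  Assumes a
 * frontend build linked with the src/common library.
 */
#include "postgres_fe.h"

#include <stdio.h>
#include <stdlib.h>

#include "common/unicode_norm.h"

int
main(void)
{
	/* LATIN CAPITAL LETTER A followed by COMBINING RING ABOVE */
	const pg_wchar input[] = {0x0041, 0x030A, 0};
	pg_wchar   *output = unicode_normalize(UNICODE_NFC, input);

	if (output == NULL)			/* frontend reports OOM with NULL */
		return 1;

	for (const pg_wchar *p = output; *p; p++)
		printf("U+%04X ", (unsigned) *p);	/* expected: U+00C5 */
	printf("\n");

	free(output);
	return 0;
}

Requesting UNICODE_NFD instead would return the input pair unchanged, which
is an easy way to sanity-check both directions.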