diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 12:19:15 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 12:19:15 +0000 |
commit | 6eb9c5a5657d1fe77b55cc261450f3538d35a94d (patch) | |
tree | 657d8194422a5daccecfd42d654b8a245ef7b4c8 /src/common/unicode | |
parent | Initial commit. (diff) | |
download | postgresql-13-6eb9c5a5657d1fe77b55cc261450f3538d35a94d.tar.xz postgresql-13-6eb9c5a5657d1fe77b55cc261450f3538d35a94d.zip |
Adding upstream version 13.4.upstream/13.4upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/common/unicode')
-rw-r--r-- | src/common/unicode/.gitignore | 8 | ||||
-rw-r--r-- | src/common/unicode/Makefile | 67 | ||||
-rw-r--r-- | src/common/unicode/README | 28 | ||||
-rw-r--r-- | src/common/unicode/generate-norm_test_table.pl | 106 | ||||
-rw-r--r-- | src/common/unicode/generate-unicode_combining_table.pl | 53 | ||||
-rw-r--r-- | src/common/unicode/generate-unicode_norm_table.pl | 234 | ||||
-rw-r--r-- | src/common/unicode/generate-unicode_normprops_table.pl | 88 | ||||
-rw-r--r-- | src/common/unicode/norm_test.c | 86 |
8 files changed, 670 insertions, 0 deletions
diff --git a/src/common/unicode/.gitignore b/src/common/unicode/.gitignore new file mode 100644 index 0000000..512862e --- /dev/null +++ b/src/common/unicode/.gitignore @@ -0,0 +1,8 @@ +/norm_test +/norm_test_table.h + +# Downloaded files +/CompositionExclusions.txt +/DerivedNormalizationProps.txt +/NormalizationTest.txt +/UnicodeData.txt diff --git a/src/common/unicode/Makefile b/src/common/unicode/Makefile new file mode 100644 index 0000000..93a9d16 --- /dev/null +++ b/src/common/unicode/Makefile @@ -0,0 +1,67 @@ +#------------------------------------------------------------------------- +# +# Makefile +# Makefile for src/common/unicode +# +# IDENTIFICATION +# src/common/unicode/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/common/unicode +top_builddir = ../../.. +include $(top_builddir)/src/Makefile.global + +override CPPFLAGS := -DFRONTEND $(CPPFLAGS) +LIBS += $(PTHREAD_LIBS) + +# By default, do nothing. +all: + +update-unicode: unicode_norm_table.h unicode_combining_table.h unicode_normprops_table.h + mv $^ ../../../src/include/common/ + $(MAKE) normalization-check + +# These files are part of the Unicode Character Database. Download +# them on demand. The dependency on Makefile.global is for +# UNICODE_VERSION. +UnicodeData.txt DerivedNormalizationProps.txt CompositionExclusions.txt NormalizationTest.txt: $(top_builddir)/src/Makefile.global + $(DOWNLOAD) https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/$(@F) + +# Generation of conversion tables used for string normalization with +# UTF-8 strings. +unicode_norm_table.h: generate-unicode_norm_table.pl UnicodeData.txt CompositionExclusions.txt + $(PERL) generate-unicode_norm_table.pl + +unicode_combining_table.h: generate-unicode_combining_table.pl UnicodeData.txt + $(PERL) $^ >$@ + +unicode_normprops_table.h: generate-unicode_normprops_table.pl DerivedNormalizationProps.txt + $(PERL) $^ >$@ + +# Test suite +normalization-check: norm_test + ./norm_test + +norm_test: norm_test.o ../unicode_norm.o | submake-common + +norm_test.o: norm_test_table.h + +.PHONY: submake-common + +submake-common: + $(MAKE) -C .. all + +norm_test_table.h: generate-norm_test_table.pl NormalizationTest.txt + perl generate-norm_test_table.pl NormalizationTest.txt $@ + +.PHONY: normalization-check + + +clean: + rm -f $(OBJS) norm_test norm_test.o + +distclean: clean + rm -f UnicodeData.txt CompositionExclusions.txt NormalizationTest.txt norm_test_table.h unicode_norm_table.h + +maintainer-clean: distclean diff --git a/src/common/unicode/README b/src/common/unicode/README new file mode 100644 index 0000000..56956f6 --- /dev/null +++ b/src/common/unicode/README @@ -0,0 +1,28 @@ +This directory contains tools to generate the tables in +src/include/common/unicode_norm.h, used for Unicode normalization. The +generated .h file is included in the source tree, so these are normally not +needed to build PostgreSQL, only if you need to re-generate the .h file +from the Unicode data files for some reason, e.g. to update to a new version +of Unicode. + +Generating unicode_norm_table.h +------------------------------- + +Run + + make update-unicode + +from the top level of the source tree and commit the result. + +Tests +----- + +The Unicode consortium publishes a comprehensive test suite for the +normalization algorithm, in a file called NormalizationTest.txt. This +directory also contains a perl script and some C code, to run our +normalization code with all the test strings in NormalizationTest.txt. +To download NormalizationTest.txt and run the tests: + + make normalization-check + +This is also run as part of the update-unicode target. diff --git a/src/common/unicode/generate-norm_test_table.pl b/src/common/unicode/generate-norm_test_table.pl new file mode 100644 index 0000000..acc6796 --- /dev/null +++ b/src/common/unicode/generate-norm_test_table.pl @@ -0,0 +1,106 @@ +#!/usr/bin/perl +# +# Read Unicode consortium's normalization test suite, NormalizationTest.txt, +# and generate a C array from it, for norm_test.c. +# +# NormalizationTest.txt is part of the Unicode Character Database. +# +# Copyright (c) 2000-2020, PostgreSQL Global Development Group + +use strict; +use warnings; + +use File::Basename; + +die "Usage: $0 INPUT_FILE OUTPUT_FILE\n" if @ARGV != 2; +my $input_file = $ARGV[0]; +my $output_file = $ARGV[1]; +my $output_base = basename($output_file); + +# Open the input and output files +open my $INPUT, '<', $input_file + or die "Could not open input file $input_file: $!"; +open my $OUTPUT, '>', $output_file + or die "Could not open output file $output_file: $!\n"; + +# Print header of output file. +print $OUTPUT <<HEADER; +/*------------------------------------------------------------------------- + * + * norm_test_table.h + * Test strings for Unicode normalization. + * + * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/common/unicode/norm_test_table.h + * + *------------------------------------------------------------------------- + */ + +/* + * File auto-generated by src/common/unicode/generate-norm_test_table.pl, do + * not edit. There is deliberately not an #ifndef PG_NORM_TEST_TABLE_H + * here. + */ + +typedef struct +{ + int linenum; + pg_wchar input[50]; + pg_wchar output[4][50]; +} pg_unicode_test; + +/* test table */ +HEADER +print $OUTPUT + "static const pg_unicode_test UnicodeNormalizationTests[] =\n{\n"; + +# Helper routine to convert a space-separated list of Unicode characters to +# hexadecimal list format, suitable for outputting in a C array. +sub codepoint_string_to_hex +{ + my $codepoint_string = shift; + + my $result; + + foreach (split(' ', $codepoint_string)) + { + my $cp = $_; + my $utf8 = "0x$cp, "; + $result .= $utf8; + } + $result .= '0'; # null-terminated the array + return $result; +} + +# Process the input file line by line +my $linenum = 0; +while (my $line = <$INPUT>) +{ + $linenum = $linenum + 1; + if ($line =~ /^\s*#/) { next; } # ignore comments + + if ($line =~ /^@/) { next; } # ignore @Part0 like headers + + # Split the line wanted and get the fields needed: + # + # source; NFC; NFD; NFKC; NFKD + my ($source, $nfc, $nfd, $nfkc, $nfkd) = split(';', $line); + + my $source_utf8 = codepoint_string_to_hex($source); + my $nfc_utf8 = codepoint_string_to_hex($nfc); + my $nfd_utf8 = codepoint_string_to_hex($nfd); + my $nfkc_utf8 = codepoint_string_to_hex($nfkc); + my $nfkd_utf8 = codepoint_string_to_hex($nfkd); + + print $OUTPUT + "\t{ $linenum, { $source_utf8 }, { { $nfc_utf8 }, { $nfd_utf8 }, { $nfkc_utf8 }, { $nfkd_utf8 } } },\n"; +} + +# Output terminator entry +print $OUTPUT "\t{ 0, { 0 }, { { 0 }, { 0 }, { 0 }, { 0 } } }"; +print $OUTPUT "\n};\n"; + +close $OUTPUT; +close $INPUT; diff --git a/src/common/unicode/generate-unicode_combining_table.pl b/src/common/unicode/generate-unicode_combining_table.pl new file mode 100644 index 0000000..c0fc3cc --- /dev/null +++ b/src/common/unicode/generate-unicode_combining_table.pl @@ -0,0 +1,53 @@ +#!/usr/bin/perl +# +# Generate sorted list of non-overlapping intervals of non-spacing +# characters, using Unicode data files as input. Pass UnicodeData.txt +# as argument. The output is on stdout. +# +# Copyright (c) 2019, PostgreSQL Global Development Group + +use strict; +use warnings; + +my $range_start = undef; +my $codepoint; +my $prev_codepoint; +my $count = 0; + +print + "/* generated by src/common/unicode/generate-unicode_combining_table.pl, do not edit */\n\n"; + +print "static const struct mbinterval combining[] = {\n"; + +foreach my $line (<ARGV>) +{ + chomp $line; + my @fields = split ';', $line; + $codepoint = hex $fields[0]; + + next if $codepoint > 0xFFFF; + + if ($fields[2] eq 'Me' || $fields[2] eq 'Mn') + { + # combining character, save for start of range + if (!defined($range_start)) + { + $range_start = $codepoint; + } + } + else + { + # not a combining character, print out previous range if any + if (defined($range_start)) + { + printf "\t{0x%04X, 0x%04X},\n", $range_start, $prev_codepoint; + $range_start = undef; + } + } +} +continue +{ + $prev_codepoint = $codepoint; +} + +print "};\n"; diff --git a/src/common/unicode/generate-unicode_norm_table.pl b/src/common/unicode/generate-unicode_norm_table.pl new file mode 100644 index 0000000..7ce15e1 --- /dev/null +++ b/src/common/unicode/generate-unicode_norm_table.pl @@ -0,0 +1,234 @@ +#!/usr/bin/perl +# +# Generate a composition table, using Unicode data files as input +# +# Input: UnicodeData.txt and CompositionExclusions.txt +# Output: unicode_norm_table.h +# +# Copyright (c) 2000-2020, PostgreSQL Global Development Group + +use strict; +use warnings; + +my $output_file = "unicode_norm_table.h"; + +my $FH; + +# Read list of codes that should be excluded from re-composition. +my @composition_exclusion_codes = (); +open($FH, '<', "CompositionExclusions.txt") + or die "Could not open CompositionExclusions.txt: $!."; +while (my $line = <$FH>) +{ + if ($line =~ /^([[:xdigit:]]+)/) + { + push @composition_exclusion_codes, $1; + } +} +close $FH; + +# Read entries from UnicodeData.txt into a list, and a hash table. We need +# three fields from each row: the codepoint, canonical combining class, +# and character decomposition mapping +my @characters = (); +my %character_hash = (); +open($FH, '<', "UnicodeData.txt") + or die "Could not open UnicodeData.txt: $!."; +while (my $line = <$FH>) +{ + + # Split the line wanted and get the fields needed: + # - Unicode code value + # - Canonical Combining Class + # - Character Decomposition Mapping + my @elts = split(';', $line); + my $code = $elts[0]; + my $class = $elts[3]; + my $decomp = $elts[5]; + + # Skip codepoints above U+10FFFF. They cannot be represented in 4 bytes + # in UTF-8, and PostgreSQL doesn't support UTF-8 characters longer than + # 4 bytes. (This is just pro forma, as there aren't any such entries in + # the data file, currently.) + next if hex($code) > 0x10FFFF; + + # Skip characters with no decompositions and a class of 0, to reduce the + # table size. + next if $class eq '0' && $decomp eq ''; + + my %char_entry = (code => $code, class => $class, decomp => $decomp); + push(@characters, \%char_entry); + $character_hash{$code} = \%char_entry; +} +close $FH; + +my $num_characters = scalar @characters; + +# Start writing out the output file +open my $OUTPUT, '>', $output_file + or die "Could not open output file $output_file: $!\n"; + +print $OUTPUT <<HEADER; +/*------------------------------------------------------------------------- + * + * unicode_norm_table.h + * Composition table used for Unicode normalization + * + * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/common/unicode_norm_table.h + * + *------------------------------------------------------------------------- + */ + +/* + * File auto-generated by src/common/unicode/generate-unicode_norm_table.pl, + * do not edit. There is deliberately not an #ifndef PG_UNICODE_NORM_TABLE_H + * here. + */ +typedef struct +{ + uint32 codepoint; /* Unicode codepoint */ + uint8 comb_class; /* combining class of character */ + uint8 dec_size_flags; /* size and flags of decomposition code list */ + uint16 dec_index; /* index into UnicodeDecomp_codepoints, or the + * decomposition itself if DECOMP_INLINE */ +} pg_unicode_decomposition; + +#define DECOMP_NO_COMPOSE 0x80 /* don't use for re-composition */ +#define DECOMP_INLINE 0x40 /* decomposition is stored inline in + * dec_index */ +#define DECOMP_COMPAT 0x20 /* compatibility mapping */ + +#define DECOMPOSITION_SIZE(x) ((x)->dec_size_flags & 0x1F) +#define DECOMPOSITION_NO_COMPOSE(x) (((x)->dec_size_flags & (DECOMP_NO_COMPOSE | DECOMP_COMPAT)) != 0) +#define DECOMPOSITION_IS_INLINE(x) (((x)->dec_size_flags & DECOMP_INLINE) != 0) +#define DECOMPOSITION_IS_COMPAT(x) (((x)->dec_size_flags & DECOMP_COMPAT) != 0) + +/* Table of Unicode codepoints and their decompositions */ +static const pg_unicode_decomposition UnicodeDecompMain[$num_characters] = +{ +HEADER + +my $decomp_index = 0; +my $decomp_string = ""; + +my $last_code = $characters[-1]->{code}; +foreach my $char (@characters) +{ + my $code = $char->{code}; + my $class = $char->{class}; + my $decomp = $char->{decomp}; + + # The character decomposition mapping field in UnicodeData.txt is a list + # of unicode codepoints, separated by space. But it can be prefixed with + # so-called compatibility formatting tag, like "<compat>", or "<font>". + # The entries with compatibility formatting tags should not be used for + # re-composing characters during normalization, so flag them in the table. + # (The tag doesn't matter, only whether there is a tag or not) + my $compat = 0; + if ($decomp =~ /\<.*\>/) + { + $compat = 1; + $decomp =~ s/\<[^][]*\>//g; + } + my @decomp_elts = split(" ", $decomp); + + # Decomposition size + # Print size of decomposition + my $decomp_size = scalar(@decomp_elts); + die if $decomp_size > 0x1F; # to not overrun bitmask + + my $first_decomp = shift @decomp_elts; + + my $flags = ""; + my $comment = ""; + + if ($compat) + { + $flags .= " | DECOMP_COMPAT"; + } + + if ($decomp_size == 2) + { + # Should this be used for recomposition? + if ( $character_hash{$first_decomp} + && $character_hash{$first_decomp}->{class} != 0) + { + $flags .= " | DECOMP_NO_COMPOSE"; + $comment = "non-starter decomposition"; + } + else + { + foreach my $lcode (@composition_exclusion_codes) + { + if ($lcode eq $char->{code}) + { + $flags .= " | DECOMP_NO_COMPOSE"; + $comment = "in exclusion list"; + last; + } + } + } + } + + if ($decomp_size == 0) + { + print $OUTPUT "\t{0x$code, $class, 0$flags, 0}"; + } + elsif ($decomp_size == 1 && length($first_decomp) <= 4) + { + + # The decomposition consists of a single codepoint, and it fits + # in a uint16, so we can store it "inline" in the main table. + $flags .= " | DECOMP_INLINE"; + print $OUTPUT "\t{0x$code, $class, 1$flags, 0x$first_decomp}"; + } + else + { + print $OUTPUT + "\t{0x$code, $class, $decomp_size$flags, $decomp_index}"; + + # Now save the decompositions into a dedicated area that will + # be written afterwards. First build the entry dedicated to + # a sub-table with the code and decomposition. + $decomp_string .= ",\n" if ($decomp_string ne ""); + + $decomp_string .= "\t /* $decomp_index */ 0x$first_decomp"; + foreach (@decomp_elts) + { + $decomp_string .= ", 0x$_"; + } + + $decomp_index = $decomp_index + $decomp_size; + } + + # Print a comma after all items except the last one. + print $OUTPUT "," unless ($code eq $last_code); + if ($comment ne "") + { + + # If the line is wide already, indent the comment with one tab, + # otherwise with two. This is to make the output match the way + # pgindent would mangle it. (This is quite hacky. To do this + # properly, we should actually track how long the line is so far, + # but this works for now.) + print $OUTPUT "\t" if ($decomp_index < 10); + + print $OUTPUT "\t/* $comment */" if ($comment ne ""); + } + print $OUTPUT "\n"; +} +print $OUTPUT "\n};\n\n"; + +# Print the array of decomposed codes. +print $OUTPUT <<HEADER; +/* codepoints array */ +static const uint32 UnicodeDecomp_codepoints[$decomp_index] = +{ +$decomp_string +}; +HEADER + +close $OUTPUT; diff --git a/src/common/unicode/generate-unicode_normprops_table.pl b/src/common/unicode/generate-unicode_normprops_table.pl new file mode 100644 index 0000000..e8e5097 --- /dev/null +++ b/src/common/unicode/generate-unicode_normprops_table.pl @@ -0,0 +1,88 @@ +#!/usr/bin/perl +# +# Generate table of Unicode normalization "quick check" properties +# (see UAX #15). Pass DerivedNormalizationProps.txt as argument. The +# output is on stdout. +# +# Copyright (c) 2020, PostgreSQL Global Development Group + +use strict; +use warnings; + +my %data; + +print + "/* generated by src/common/unicode/generate-unicode_normprops_table.pl, do not edit */\n\n"; + +print <<EOS; +#include "common/unicode_norm.h" + +/* + * We use a bit field here to save space. + */ +typedef struct +{ + unsigned int codepoint:21; + signed int quickcheck:4; /* really UnicodeNormalizationQC */ +} pg_unicode_normprops; +EOS + +foreach my $line (<ARGV>) +{ + chomp $line; + $line =~ s/\s*#.*$//; + next if $line eq ''; + my ($codepoint, $prop, $value) = split /\s*;\s*/, $line; + next if $prop !~ /_QC/; + + my ($first, $last); + if ($codepoint =~ /\.\./) + { + ($first, $last) = split /\.\./, $codepoint; + } + else + { + $first = $last = $codepoint; + } + + foreach my $cp (hex($first) .. hex($last)) + { + $data{$prop}{$cp} = $value; + } +} + +# We create a separate array for each normalization form rather than, +# say, a two-dimensional array, because that array would be very +# sparse and would create unnecessary overhead especially for the NFC +# lookup. +foreach my $prop (sort keys %data) +{ + # Don't build the tables for the "D" forms because they are too + # big. See also unicode_is_normalized_quickcheck(). + next if $prop eq "NFD_QC" || $prop eq "NFKD_QC"; + + print "\n"; + print + "static const pg_unicode_normprops UnicodeNormProps_${prop}[] = {\n"; + + my %subdata = %{ $data{$prop} }; + foreach my $cp (sort { $a <=> $b } keys %subdata) + { + my $qc; + if ($subdata{$cp} eq 'N') + { + $qc = 'UNICODE_NORM_QC_NO'; + } + elsif ($subdata{$cp} eq 'M') + { + $qc = 'UNICODE_NORM_QC_MAYBE'; + } + else + { + die; + } + printf "\t{0x%04X, %s},\n", $cp, $qc; + } + + print "};\n"; +} diff --git a/src/common/unicode/norm_test.c b/src/common/unicode/norm_test.c new file mode 100644 index 0000000..dde5d24 --- /dev/null +++ b/src/common/unicode/norm_test.c @@ -0,0 +1,86 @@ +/*------------------------------------------------------------------------- + * norm_test.c + * Program to test Unicode normalization functions. + * + * Portions Copyright (c) 2017-2020, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/common/unicode/norm_test.c + * + *------------------------------------------------------------------------- + */ +#include "postgres_fe.h" + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "common/unicode_norm.h" + +#include "norm_test_table.h" + +static char * +print_wchar_str(const pg_wchar *s) +{ +#define BUF_DIGITS 50 + static char buf[BUF_DIGITS * 11 + 1]; + int i; + char *p; + + i = 0; + p = buf; + while (*s && i < BUF_DIGITS) + { + p += sprintf(p, "U+%04X ", *s); + i++; + s++; + } + *p = '\0'; + + return buf; +} + +static int +pg_wcscmp(const pg_wchar *s1, const pg_wchar *s2) +{ + for (;;) + { + if (*s1 < *s2) + return -1; + if (*s1 > *s2) + return 1; + if (*s1 == 0) + return 0; + s1++; + s2++; + } +} + +int +main(int argc, char **argv) +{ + const pg_unicode_test *test; + + for (test = UnicodeNormalizationTests; test->input[0] != 0; test++) + { + for (int form = 0; form < 4; form++) + { + pg_wchar *result; + + result = unicode_normalize(form, test->input); + + if (pg_wcscmp(test->output[form], result) != 0) + { + printf("FAILURE (NormalizationTest.txt line %d form %d):\n", test->linenum, form); + printf("input: %s\n", print_wchar_str(test->input)); + printf("expected: %s\n", print_wchar_str(test->output[form])); + printf("got: %s\n", print_wchar_str(result)); + printf("\n"); + exit(1); + } + } + } + + printf("All tests successful!\n"); + exit(0); +} |