author    Daniel Baumann <daniel.baumann@progress-linux.org>    2024-05-04 12:17:33 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org>    2024-05-04 12:17:33 +0000
commit    5e45211a64149b3c659b90ff2de6fa982a5a93ed (patch)
tree      739caf8c461053357daa9f162bef34516c7bf452 /src/common/unicode
parent    Initial commit. (diff)
Adding upstream version 15.5. (upstream/15.5)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/common/unicode')
-rw-r--r--  src/common/unicode/.gitignore                               |   9
-rw-r--r--  src/common/unicode/Makefile                                 |  72
-rw-r--r--  src/common/unicode/README                                   |  28
-rw-r--r--  src/common/unicode/generate-norm_test_table.pl              | 106
-rw-r--r--  src/common/unicode/generate-unicode_combining_table.pl      |  51
-rw-r--r--  src/common/unicode/generate-unicode_east_asian_fw_table.pl  |  76
-rw-r--r--  src/common/unicode/generate-unicode_norm_table.pl           | 406
-rw-r--r--  src/common/unicode/generate-unicode_normprops_table.pl      | 125
-rw-r--r--  src/common/unicode/norm_test.c                              |  86
9 files changed, 959 insertions, 0 deletions
diff --git a/src/common/unicode/.gitignore b/src/common/unicode/.gitignore
new file mode 100644
index 0000000..46243f7
--- /dev/null
+++ b/src/common/unicode/.gitignore
@@ -0,0 +1,9 @@
+/norm_test
+/norm_test_table.h
+
+# Downloaded files
+/CompositionExclusions.txt
+/DerivedNormalizationProps.txt
+/EastAsianWidth.txt
+/NormalizationTest.txt
+/UnicodeData.txt
diff --git a/src/common/unicode/Makefile b/src/common/unicode/Makefile
new file mode 100644
index 0000000..60e01e7
--- /dev/null
+++ b/src/common/unicode/Makefile
@@ -0,0 +1,72 @@
+#-------------------------------------------------------------------------
+#
+# Makefile
+# Makefile for src/common/unicode
+#
+# IDENTIFICATION
+# src/common/unicode/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/common/unicode
+top_builddir = ../../..
+include $(top_builddir)/src/Makefile.global
+
+override CPPFLAGS := -DFRONTEND -I. $(CPPFLAGS)
+LIBS += $(PTHREAD_LIBS)
+
+# By default, do nothing.
+all:
+
+update-unicode: unicode_norm_table.h unicode_combining_table.h unicode_east_asian_fw_table.h unicode_normprops_table.h unicode_norm_hashfunc.h
+ mv $^ $(top_srcdir)/src/include/common/
+ $(MAKE) normalization-check
+
+# These files are part of the Unicode Character Database. Download
+# them on demand. The dependency on Makefile.global is for
+# UNICODE_VERSION.
+UnicodeData.txt EastAsianWidth.txt DerivedNormalizationProps.txt CompositionExclusions.txt NormalizationTest.txt: $(top_builddir)/src/Makefile.global
+ $(DOWNLOAD) https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/$(@F)
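+# ($(DOWNLOAD) comes from src/Makefile.global; depending on what configure
+# found, it typically expands to a wget or curl invocation that saves the
+# file under the target name.)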
+
+# Generation of conversion tables used for string normalization with
+# UTF-8 strings.
+unicode_norm_hashfunc.h: unicode_norm_table.h
+
+unicode_norm_table.h: generate-unicode_norm_table.pl UnicodeData.txt CompositionExclusions.txt
+ $(PERL) $<
+
+unicode_combining_table.h: generate-unicode_combining_table.pl UnicodeData.txt
+ $(PERL) $^ >$@
+
+unicode_east_asian_fw_table.h: generate-unicode_east_asian_fw_table.pl EastAsianWidth.txt
+ $(PERL) $^ >$@
+
+unicode_normprops_table.h: generate-unicode_normprops_table.pl DerivedNormalizationProps.txt
+ $(PERL) $^ >$@
+
+# Test suite
+normalization-check: norm_test
+ ./norm_test
+
+norm_test: norm_test.o ../unicode_norm.o | submake-common
+
+norm_test.o: norm_test_table.h
+
+.PHONY: submake-common
+
+submake-common:
+ $(MAKE) -C .. all
+
+norm_test_table.h: generate-norm_test_table.pl NormalizationTest.txt
+	$(PERL) $^ $@
+
+.PHONY: normalization-check
+
+
+clean:
+ rm -f $(OBJS) norm_test norm_test.o
+
+distclean: clean
+	rm -f UnicodeData.txt EastAsianWidth.txt DerivedNormalizationProps.txt CompositionExclusions.txt NormalizationTest.txt norm_test_table.h unicode_norm_table.h
+
+maintainer-clean: distclean
diff --git a/src/common/unicode/README b/src/common/unicode/README
new file mode 100644
index 0000000..56956f6
--- /dev/null
+++ b/src/common/unicode/README
@@ -0,0 +1,28 @@
+This directory contains tools to generate the tables in
+src/include/common/unicode_norm_table.h and the other generated
+unicode_*.h headers, used for Unicode normalization. The generated .h
+files are included in the source tree, so these tools are normally not
+needed to build PostgreSQL; they are only needed to re-generate the .h
+files from the Unicode data files, e.g. to update to a new version of
+Unicode.
+
+Generating unicode_norm_table.h
+-------------------------------
+
+Run
+
+ make update-unicode
+
+from the top level of the source tree and commit the result.
+
+Tests
+-----
+
+The Unicode Consortium publishes a comprehensive test suite for the
+normalization algorithm in a file called NormalizationTest.txt. This
+directory also contains a Perl script and some C code that run our
+normalization code against all the test strings in NormalizationTest.txt.
+To download NormalizationTest.txt and run the tests:
+
+ make normalization-check
+
+This is also run as part of the update-unicode target.
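+
+A successful run ends with norm_test printing:
+
+	All tests successful!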
diff --git a/src/common/unicode/generate-norm_test_table.pl b/src/common/unicode/generate-norm_test_table.pl
new file mode 100644
index 0000000..838f552
--- /dev/null
+++ b/src/common/unicode/generate-norm_test_table.pl
@@ -0,0 +1,106 @@
+#!/usr/bin/perl
+#
+# Read Unicode consortium's normalization test suite, NormalizationTest.txt,
+# and generate a C array from it, for norm_test.c.
+#
+# NormalizationTest.txt is part of the Unicode Character Database.
+#
+# Copyright (c) 2000-2022, PostgreSQL Global Development Group
+
+use strict;
+use warnings;
+
+use File::Basename;
+
+die "Usage: $0 INPUT_FILE OUTPUT_FILE\n" if @ARGV != 2;
+my $input_file = $ARGV[0];
+my $output_file = $ARGV[1];
+my $output_base = basename($output_file);
+
+# Open the input and output files
+open my $INPUT, '<', $input_file
+ or die "Could not open input file $input_file: $!";
+open my $OUTPUT, '>', $output_file
+ or die "Could not open output file $output_file: $!\n";
+
+# Print header of output file.
+print $OUTPUT <<HEADER;
+/*-------------------------------------------------------------------------
+ *
+ * norm_test_table.h
+ * Test strings for Unicode normalization.
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/common/unicode/norm_test_table.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+/*
+ * File auto-generated by src/common/unicode/generate-norm_test_table.pl, do
+ * not edit. There is deliberately not an #ifndef PG_NORM_TEST_TABLE_H
+ * here.
+ */
+
+typedef struct
+{
+ int linenum;
+ pg_wchar input[50];
+ pg_wchar output[4][50];
+} pg_unicode_test;
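+
+/*
+ * For each entry, output[0..3] hold the expected NFC, NFD, NFKC and NFKD
+ * forms, in the order the generator writes them below; norm_test.c indexes
+ * output[] with its "form" loop variable.
+ */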
+
+/* test table */
+HEADER
+print $OUTPUT
+ "static const pg_unicode_test UnicodeNormalizationTests[] =\n{\n";
+
+# Helper routine to convert a space-separated list of Unicode characters to
+# hexadecimal list format, suitable for outputting in a C array.
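+# For example, the codepoint string "0044 0307" becomes "0x0044, 0x0307, 0";
+# the trailing 0 zero-terminates the C array.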
+sub codepoint_string_to_hex
+{
+ my $codepoint_string = shift;
+
+ my $result;
+
+ foreach (split(' ', $codepoint_string))
+ {
+ my $cp = $_;
+		$result .= "0x$cp, ";
+ }
+	$result .= '0'; # null-terminate the array
+ return $result;
+}
+
+# Process the input file line by line
+my $linenum = 0;
+while (my $line = <$INPUT>)
+{
+ $linenum = $linenum + 1;
+ if ($line =~ /^\s*#/) { next; } # ignore comments
+
+	if ($line =~ /^@/) { next; } # ignore @Part0-style section headers
+
+	# Split the line and pick out the fields we need:
+ #
+ # source; NFC; NFD; NFKC; NFKD
+ my ($source, $nfc, $nfd, $nfkc, $nfkd) = split(';', $line);
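+
+	# A typical data line (here for U+1E0A, LATIN CAPITAL LETTER D WITH
+	# DOT ABOVE) looks like:
+	#
+	#   1E0A;1E0A;0044 0307;1E0A;0044 0307; # ...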
+
+	my $source_hex = codepoint_string_to_hex($source);
+	my $nfc_hex = codepoint_string_to_hex($nfc);
+	my $nfd_hex = codepoint_string_to_hex($nfd);
+	my $nfkc_hex = codepoint_string_to_hex($nfkc);
+	my $nfkd_hex = codepoint_string_to_hex($nfkd);
+
+	print $OUTPUT
+	  "\t{ $linenum, { $source_hex }, { { $nfc_hex }, { $nfd_hex }, { $nfkc_hex }, { $nfkd_hex } } },\n";
+}
+
+# Output terminator entry
+print $OUTPUT "\t{ 0, { 0 }, { { 0 }, { 0 }, { 0 }, { 0 } } }";
+print $OUTPUT "\n};\n";
+
+close $OUTPUT;
+close $INPUT;
diff --git a/src/common/unicode/generate-unicode_combining_table.pl b/src/common/unicode/generate-unicode_combining_table.pl
new file mode 100644
index 0000000..8177c20
--- /dev/null
+++ b/src/common/unicode/generate-unicode_combining_table.pl
@@ -0,0 +1,51 @@
+#!/usr/bin/perl
+#
+# Generate sorted list of non-overlapping intervals of non-spacing
+# characters, using Unicode data files as input. Pass UnicodeData.txt
+# as argument. The output is on stdout.
+#
+# Copyright (c) 2019-2022, PostgreSQL Global Development Group
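+#
+# For example, the Mn characters U+0300..U+036F (the combining diacritical
+# marks) are contiguous in UnicodeData.txt, so they come out as the single
+# interval {0x0300, 0x036F}.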
+
+use strict;
+use warnings;
+
+my $range_start = undef;
+my $codepoint;
+my $prev_codepoint;
+my $count = 0;
+
+print
+ "/* generated by src/common/unicode/generate-unicode_combining_table.pl, do not edit */\n\n";
+
+print "static const struct mbinterval combining[] = {\n";
+
+foreach my $line (<ARGV>)
+{
+ chomp $line;
+ my @fields = split ';', $line;
+ $codepoint = hex $fields[0];
+
+ if ($fields[2] eq 'Me' || $fields[2] eq 'Mn')
+ {
+ # combining character, save for start of range
+ if (!defined($range_start))
+ {
+ $range_start = $codepoint;
+ }
+ }
+ else
+ {
+ # not a combining character, print out previous range if any
+ if (defined($range_start))
+ {
+ printf "\t{0x%04X, 0x%04X},\n", $range_start, $prev_codepoint;
+ $range_start = undef;
+ }
+ }
+}
+continue
+{
+ $prev_codepoint = $codepoint;
+}
+
+print "};\n";
diff --git a/src/common/unicode/generate-unicode_east_asian_fw_table.pl b/src/common/unicode/generate-unicode_east_asian_fw_table.pl
new file mode 100644
index 0000000..9d03684
--- /dev/null
+++ b/src/common/unicode/generate-unicode_east_asian_fw_table.pl
@@ -0,0 +1,76 @@
+#!/usr/bin/perl
+#
+# Generate a sorted list of non-overlapping intervals of East Asian Wide (W)
+# and East Asian Fullwidth (F) characters, using Unicode data files as input.
+# Pass EastAsianWidth.txt as argument. The output is on stdout.
+#
+# Copyright (c) 2019-2022, PostgreSQL Global Development Group
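+#
+# Entries in EastAsianWidth.txt look like "1100..115F;W" (a range) or
+# "3000;F" (a single codepoint); contiguous wide/fullwidth ranges are
+# merged into one interval below.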
+
+use strict;
+use warnings;
+
+my $range_start = undef;
+my ($first, $last);
+my $prev_last;
+
+print
+ "/* generated by src/common/unicode/generate-unicode_east_asian_fw_table.pl, do not edit */\n\n";
+
+print "static const struct mbinterval east_asian_fw[] = {\n";
+
+foreach my $line (<ARGV>)
+{
+ chomp $line;
+ $line =~ s/\s*#.*$//;
+ next if $line eq '';
+ my ($codepoint, $width) = split ';', $line;
+
+ if ($codepoint =~ /\.\./)
+ {
+ ($first, $last) = split /\.\./, $codepoint;
+ }
+ else
+ {
+ $first = $last = $codepoint;
+ }
+
+ ($first, $last) = map(hex, ($first, $last));
+
+ if ($width eq 'F' || $width eq 'W')
+ {
+ # fullwidth/wide characters
+ if (!defined($range_start))
+ {
+ # save for start of range if one hasn't been started yet
+ $range_start = $first;
+ }
+ elsif ($first != $prev_last + 1)
+ {
+ # ranges aren't contiguous; emit the last and start a new one
+ printf "\t{0x%04X, 0x%04X},\n", $range_start, $prev_last;
+ $range_start = $first;
+ }
+ }
+ else
+ {
+ # not wide characters, print out previous range if any
+ if (defined($range_start))
+ {
+ printf "\t{0x%04X, 0x%04X},\n", $range_start, $prev_last;
+ $range_start = undef;
+ }
+ }
+}
+continue
+{
+ $prev_last = $last;
+}
+
+# don't forget any ranges at the very end of the database (though there are none
+# as of Unicode 13.0)
+if (defined($range_start))
+{
+ printf "\t{0x%04X, 0x%04X},\n", $range_start, $prev_last;
+}
+
+print "};\n";
diff --git a/src/common/unicode/generate-unicode_norm_table.pl b/src/common/unicode/generate-unicode_norm_table.pl
new file mode 100644
index 0000000..e442345
--- /dev/null
+++ b/src/common/unicode/generate-unicode_norm_table.pl
@@ -0,0 +1,406 @@
+#!/usr/bin/perl
+#
+# Generate a composition table and its lookup utilities, using Unicode data
+# files as input.
+#
+# Input: UnicodeData.txt and CompositionExclusions.txt
+# Output: unicode_norm_table.h and unicode_norm_hashfunc.h
+#
+# Copyright (c) 2000-2022, PostgreSQL Global Development Group
+
+use strict;
+use warnings;
+
+use FindBin;
+use lib "$FindBin::RealBin/../../tools/";
+use PerfectHash;
+
+my $output_table_file = "unicode_norm_table.h";
+my $output_func_file = "unicode_norm_hashfunc.h";
+
+my $FH;
+
+# Read list of codes that should be excluded from re-composition.
+my @composition_exclusion_codes = ();
+open($FH, '<', "CompositionExclusions.txt")
+ or die "Could not open CompositionExclusions.txt: $!.";
+while (my $line = <$FH>)
+{
+ if ($line =~ /^([[:xdigit:]]+)/)
+ {
+ push @composition_exclusion_codes, $1;
+ }
+}
+close $FH;
+
+# Read entries from UnicodeData.txt into a list and a hash table. We need
+# three fields from each row: the codepoint, the canonical combining class,
+# and the character decomposition mapping.
+my @characters = ();
+my %character_hash = ();
+open($FH, '<', "UnicodeData.txt")
+ or die "Could not open UnicodeData.txt: $!.";
+while (my $line = <$FH>)
+{
+
+	# Split the line and pick out the fields we need:
+ # - Unicode code value
+ # - Canonical Combining Class
+ # - Character Decomposition Mapping
+ my @elts = split(';', $line);
+ my $code = $elts[0];
+ my $class = $elts[3];
+ my $decomp = $elts[5];
+
+ # Skip codepoints above U+10FFFF. They cannot be represented in 4 bytes
+ # in UTF-8, and PostgreSQL doesn't support UTF-8 characters longer than
+ # 4 bytes. (This is just pro forma, as there aren't any such entries in
+ # the data file, currently.)
+ next if hex($code) > 0x10FFFF;
+
+ # Skip characters with no decompositions and a class of 0, to reduce the
+ # table size.
+ next if $class eq '0' && $decomp eq '';
+
+ my %char_entry = (code => $code, class => $class, decomp => $decomp);
+ push(@characters, \%char_entry);
+ $character_hash{$code} = \%char_entry;
+}
+close $FH;
+
+my $num_characters = scalar @characters;
+
+# Start writing out the output files
+open my $OT, '>', $output_table_file
+ or die "Could not open output file $output_table_file: $!\n";
+open my $OF, '>', $output_func_file
+ or die "Could not open output file $output_func_file: $!\n";
+
+print $OT <<HEADER;
+/*-------------------------------------------------------------------------
+ *
+ * unicode_norm_table.h
+ * Composition table used for Unicode normalization
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/common/unicode_norm_table.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+/*
+ * File auto-generated by src/common/unicode/generate-unicode_norm_table.pl,
+ * do not edit. There is deliberately not an #ifndef PG_UNICODE_NORM_TABLE_H
+ * here.
+ */
+typedef struct
+{
+ uint32 codepoint; /* Unicode codepoint */
+ uint8 comb_class; /* combining class of character */
+ uint8 dec_size_flags; /* size and flags of decomposition code list */
+ uint16 dec_index; /* index into UnicodeDecomp_codepoints, or the
+ * decomposition itself if DECOMP_INLINE */
+} pg_unicode_decomposition;
+
+#define DECOMP_NO_COMPOSE 0x80 /* don't use for re-composition */
+#define DECOMP_INLINE 0x40 /* decomposition is stored inline in
+ * dec_index */
+#define DECOMP_COMPAT 0x20 /* compatibility mapping */
+
+#define DECOMPOSITION_SIZE(x) ((x)->dec_size_flags & 0x1F)
+#define DECOMPOSITION_NO_COMPOSE(x) (((x)->dec_size_flags & (DECOMP_NO_COMPOSE | DECOMP_COMPAT)) != 0)
+#define DECOMPOSITION_IS_INLINE(x) (((x)->dec_size_flags & DECOMP_INLINE) != 0)
+#define DECOMPOSITION_IS_COMPAT(x) (((x)->dec_size_flags & DECOMP_COMPAT) != 0)
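+
+/*
+ * dec_size_flags packs the decomposition size into its low 5 bits (hence
+ * the 0x1F mask above) and the DECOMP_* flags into its high 3 bits; a
+ * two-codepoint compatibility mapping, for example, is stored as
+ * (2 | DECOMP_COMPAT).
+ */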
+
+/* Table of Unicode codepoints and their decompositions */
+static const pg_unicode_decomposition UnicodeDecompMain[$num_characters] =
+{
+HEADER
+
+print $OF <<HEADER;
+/*-------------------------------------------------------------------------
+ *
+ * unicode_norm_hashfunc.h
+ * Perfect hash functions used for Unicode normalization
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/common/unicode_norm_hashfunc.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+/*
+ * File auto-generated by src/common/unicode/generate-unicode_norm_table.pl,
+ * do not edit. There is deliberately not an #ifndef PG_UNICODE_NORM_HASHFUNC_H
+ * here.
+ */
+
+#include "common/unicode_norm_table.h"
+
+/* Typedef for perfect hash functions */
+typedef int (*cp_hash_func) (const void *key);
+
+/* Information for lookups with perfect hash functions */
+typedef struct
+{
+ const pg_unicode_decomposition *decomps;
+ cp_hash_func hash;
+ int num_decomps;
+} pg_unicode_decompinfo;
+
+typedef struct
+{
+ const uint16 *inverse_lookup;
+ cp_hash_func hash;
+ int num_recomps;
+} pg_unicode_recompinfo;
+
+HEADER
+
+my $decomp_index = 0;
+my $decomp_string = "";
+my @dec_cp_packed;
+my $main_index = 0;
+my @rec_info;
+
+my $last_code = $characters[-1]->{code};
+foreach my $char (@characters)
+{
+ my $code = $char->{code};
+ my $class = $char->{class};
+ my $decomp = $char->{decomp};
+
+	# Save the codepoint bytes as a packed string in network byte order.
+	push @dec_cp_packed, pack('N', hex($code));
+
+ # The character decomposition mapping field in UnicodeData.txt is a list
+	# of Unicode codepoints, separated by spaces. It can be prefixed with a
+	# so-called compatibility formatting tag, like "<compat>" or "<font>".
+ # The entries with compatibility formatting tags should not be used for
+ # re-composing characters during normalization, so flag them in the table.
+ # (The tag doesn't matter, only whether there is a tag or not)
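+	# For example, U+FB01 (LATIN SMALL LIGATURE FI) maps to
+	# "<compat> 0066 0069", whereas U+00C0 decomposes canonically to
+	# "0041 0300" with no tag.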
+ my $compat = 0;
+ if ($decomp =~ /\<.*\>/)
+ {
+ $compat = 1;
+ $decomp =~ s/\<[^][]*\>//g;
+ }
+ my @decomp_elts = split(" ", $decomp);
+
+	# Decomposition size, which must fit in the low 5 bits of
+	# dec_size_flags.
+ my $decomp_size = scalar(@decomp_elts);
+	die "decomposition of U+$code is too long" if $decomp_size > 0x1F;
+
+ my $first_decomp = shift @decomp_elts;
+
+ my $flags = "";
+ my $comment = "";
+
+ if ($compat)
+ {
+ $flags .= " | DECOMP_COMPAT";
+ }
+
+ if ($decomp_size == 2)
+ {
+ # Should this be used for recomposition?
+ if ( $character_hash{$first_decomp}
+ && $character_hash{$first_decomp}->{class} != 0)
+ {
+ $flags .= " | DECOMP_NO_COMPOSE";
+ $comment = "non-starter decomposition";
+ }
+ else
+ {
+ foreach my $lcode (@composition_exclusion_codes)
+ {
+ if ($lcode eq $code)
+ {
+ $flags .= " | DECOMP_NO_COMPOSE";
+ $comment = "in exclusion list";
+ last;
+ }
+ }
+ }
+
+ # Save info for recomposeable codepoints.
+ # Note that this MUST match the macro DECOMPOSITION_NO_COMPOSE in C
+ # above! See also the inverse lookup in recompose_code() found in
+ # src/common/unicode_norm.c.
+ if (!($flags =~ /DECOMP_COMPAT/ || $flags =~ /DECOMP_NO_COMPOSE/))
+ {
+ push @rec_info,
+ {
+ code => $code,
+ main_index => $main_index,
+ first => $first_decomp,
+ second => $decomp_elts[0]
+ };
+ }
+ }
+
+ if ($decomp_size == 0)
+ {
+ print $OT "\t{0x$code, $class, 0$flags, 0}";
+ }
+ elsif ($decomp_size == 1 && length($first_decomp) <= 4)
+ {
+
+ # The decomposition consists of a single codepoint, and it fits
+ # in a uint16, so we can store it "inline" in the main table.
+ $flags .= " | DECOMP_INLINE";
+ print $OT "\t{0x$code, $class, 1$flags, 0x$first_decomp}";
+ }
+ else
+ {
+ print $OT "\t{0x$code, $class, $decomp_size$flags, $decomp_index}";
+
+		# Save the decomposition in a separate sub-table, to be written
+		# out afterwards; the main table entry stores only an index into it.
+ $decomp_string .= ",\n" if ($decomp_string ne "");
+
+ $decomp_string .= "\t /* $decomp_index */ 0x$first_decomp";
+ foreach (@decomp_elts)
+ {
+ $decomp_string .= ", 0x$_";
+ }
+
+ $decomp_index = $decomp_index + $decomp_size;
+ }
+
+ # Print a comma after all items except the last one.
+ print $OT "," unless ($code eq $last_code);
+
+ print $OT "\t/* $comment */" if ($comment ne "");
+ print $OT "\n";
+
+ $main_index++;
+}
+print $OT "\n};\n\n";
+
+# Print the array of decomposed codes.
+print $OT <<HEADER;
+/* codepoints array */
+static const uint32 UnicodeDecomp_codepoints[$decomp_index] =
+{
+$decomp_string
+};
+HEADER
+
+# Emit the definition of the decomp hash function.
+my $dec_funcname = 'Decomp_hash_func';
+my $dec_func = PerfectHash::generate_hash_function(\@dec_cp_packed,
+ $dec_funcname, fixed_key_length => 4);
+print $OF "/* Perfect hash function for decomposition */\n";
+print $OF "static $dec_func\n";
+
+# Emit the structure that wraps the hash lookup information into
+# one variable.
+print $OF <<HEADER;
+/* Hash lookup information for decomposition */
+static const pg_unicode_decompinfo UnicodeDecompInfo =
+{
+ UnicodeDecompMain,
+ $dec_funcname,
+ $num_characters
+};
+
+HEADER
+
+# Find the lowest codepoint that decomposes to each recomposeable
+# code pair and create a mapping to it.
+my $recomp_string = "";
+my @rec_cp_packed;
+my %seenit;
+my $firstentry = 1;
+foreach my $rec (sort recomp_sort @rec_info)
+{
+ # The hash key is formed by concatenating the bytes of the two
+ # codepoints. See also recompose_code() in common/unicode_norm.c.
+ my $hashkey = (hex($rec->{first}) << 32) | hex($rec->{second});
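+	# For example, the canonical pair (0041, 0300), which recomposes to
+	# U+00C0, yields the 64-bit key 0x0000004100000300.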
+
+ # We are only interested in the lowest code point that decomposes
+ # to the given code pair.
+ next if $seenit{$hashkey};
+
+ # Save the hash key bytes in network order
+ push @rec_cp_packed, pack('Q>', $hashkey);
+
+ # Append inverse lookup element
+ $recomp_string .= ",\n" if !$firstentry;
+ $recomp_string .= sprintf "\t/* U+%s+%s -> U+%s */ %s",
+ $rec->{first},
+ $rec->{second},
+ $rec->{code},
+ $rec->{main_index};
+
+ $seenit{$hashkey} = 1;
+ $firstentry = 0;
+}
+
+# Emit the inverse lookup array containing indexes into UnicodeDecompMain.
+my $num_recomps = scalar @rec_cp_packed;
+print $OF <<HEADER;
+/* Inverse lookup array -- contains indexes into UnicodeDecompMain[] */
+static const uint16 RecompInverseLookup[$num_recomps] =
+{
+$recomp_string
+};
+
+HEADER
+
+# Emit the definition of the recomposition hash function.
+my $rec_funcname = 'Recomp_hash_func';
+my $rec_func =
+ PerfectHash::generate_hash_function(\@rec_cp_packed, $rec_funcname,
+ fixed_key_length => 8);
+print $OF "/* Perfect hash function for recomposition */\n";
+print $OF "static $rec_func\n";
+
+# Emit the structure that wraps the hash lookup information into
+# one variable.
+print $OF <<HEADER;
+/* Hash lookup information for recomposition */
+static const pg_unicode_recompinfo UnicodeRecompInfo =
+{
+ RecompInverseLookup,
+ $rec_funcname,
+ $num_recomps
+};
+HEADER
+
+close $OT;
+close $OF;
+
+sub recomp_sort
+{
+ my $a1 = hex($a->{first});
+ my $b1 = hex($b->{first});
+
+ my $a2 = hex($a->{second});
+ my $b2 = hex($b->{second});
+
+ # First sort by the first code point
+ return -1 if $a1 < $b1;
+ return 1 if $a1 > $b1;
+
+ # Then sort by the second code point
+ return -1 if $a2 < $b2;
+ return 1 if $a2 > $b2;
+
+ # Finally sort by the code point that decomposes into first and
+ # second ones.
+ my $acode = hex($a->{code});
+ my $bcode = hex($b->{code});
+
+ return -1 if $acode < $bcode;
+ return 1 if $acode > $bcode;
+
+ die "found duplicate entries of recomposeable code pairs";
+}
diff --git a/src/common/unicode/generate-unicode_normprops_table.pl b/src/common/unicode/generate-unicode_normprops_table.pl
new file mode 100644
index 0000000..08e41b3
--- /dev/null
+++ b/src/common/unicode/generate-unicode_normprops_table.pl
@@ -0,0 +1,125 @@
+#!/usr/bin/perl
+#
+# Generate table of Unicode normalization "quick check" properties
+# (see UAX #15). Pass DerivedNormalizationProps.txt as argument. The
+# output is on stdout.
+#
+# Copyright (c) 2020-2022, PostgreSQL Global Development Group
+
+use strict;
+use warnings;
+
+use FindBin;
+use lib "$FindBin::RealBin/../../tools/";
+use PerfectHash;
+
+my %data;
+
+print
+ "/* generated by src/common/unicode/generate-unicode_normprops_table.pl, do not edit */\n\n";
+
+print <<EOS;
+#include "common/unicode_norm.h"
+
+/*
+ * Normalization quick check entry for codepoint. We use a bit field
+ * here to save space.
+ */
+typedef struct
+{
+ unsigned int codepoint:21;
+ signed int quickcheck:4; /* really UnicodeNormalizationQC */
+} pg_unicode_normprops;
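+
+/*
+ * 21 bits are enough for any codepoint up to U+10FFFF, and 4 signed bits
+ * cover the UnicodeNormalizationQC values, so an entry typically packs
+ * into a single 32-bit word.
+ */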
+
+/* Typedef for hash function on quick check table */
+typedef int (*qc_hash_func) (const void *key);
+
+/* Information for quick check lookup with perfect hash function */
+typedef struct
+{
+ const pg_unicode_normprops *normprops;
+ qc_hash_func hash;
+ int num_normprops;
+} pg_unicode_norminfo;
+EOS
+
+foreach my $line (<ARGV>)
+{
+ chomp $line;
+ $line =~ s/\s*#.*$//;
+ next if $line eq '';
+ my ($codepoint, $prop, $value) = split /\s*;\s*/, $line;
+ next if $prop !~ /_QC/;
+
+ my ($first, $last);
+ if ($codepoint =~ /\.\./)
+ {
+ ($first, $last) = split /\.\./, $codepoint;
+ }
+ else
+ {
+ $first = $last = $codepoint;
+ }
+
+ foreach my $cp (hex($first) .. hex($last))
+ {
+ $data{$prop}{$cp} = $value;
+ }
+}
+
+# We create a separate array for each normalization form rather than,
+# say, a two-dimensional array, because that array would be very
+# sparse and would create unnecessary overhead especially for the NFC
+# lookup.
+foreach my $prop (sort keys %data)
+{
+ # Don't build the tables for the "D" forms because they are too
+ # big. See also unicode_is_normalized_quickcheck().
+ next if $prop eq "NFD_QC" || $prop eq "NFKD_QC";
+
+ print "\n";
+ print
+ "static const pg_unicode_normprops UnicodeNormProps_${prop}[] = {\n";
+
+ my %subdata = %{ $data{$prop} };
+ my @cp_packed;
+ foreach my $cp (sort { $a <=> $b } keys %subdata)
+ {
+ my $qc;
+ if ($subdata{$cp} eq 'N')
+ {
+ $qc = 'UNICODE_NORM_QC_NO';
+ }
+ elsif ($subdata{$cp} eq 'M')
+ {
+ $qc = 'UNICODE_NORM_QC_MAYBE';
+ }
+ else
+ {
+			die "unexpected quick check value '$subdata{$cp}'";
+ }
+ printf "\t{0x%04X, %s},\n", $cp, $qc;
+
+ # Save the bytes as a string in network order.
+ push @cp_packed, pack('N', $cp);
+ }
+
+ print "};\n";
+
+ # Emit the definition of the perfect hash function.
+ my $funcname = $prop . '_hash_func';
+ my $f = PerfectHash::generate_hash_function(\@cp_packed, $funcname,
+ fixed_key_length => 4);
+ printf "\n/* Perfect hash function for %s */", $prop;
+ print "\nstatic $f\n";
+
+ # Emit the structure that wraps the hash lookup information into
+ # one variable.
+ printf "/* Hash lookup information for %s */", $prop;
+ printf "\nstatic const pg_unicode_norminfo ";
+ printf "UnicodeNormInfo_%s = {\n", $prop;
+ printf "\tUnicodeNormProps_%s,\n", $prop;
+ printf "\t%s,\n", $funcname;
+ printf "\t%d\n", scalar @cp_packed;
+ printf "};\n";
+}
diff --git a/src/common/unicode/norm_test.c b/src/common/unicode/norm_test.c
new file mode 100644
index 0000000..0e244ad
--- /dev/null
+++ b/src/common/unicode/norm_test.c
@@ -0,0 +1,86 @@
+/*-------------------------------------------------------------------------
+ * norm_test.c
+ * Program to test Unicode normalization functions.
+ *
+ * Portions Copyright (c) 2017-2022, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/common/unicode/norm_test.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres_fe.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "common/unicode_norm.h"
+
+#include "norm_test_table.h"
+
+static char *
+print_wchar_str(const pg_wchar *s)
+{
+#define BUF_DIGITS 50
+ static char buf[BUF_DIGITS * 11 + 1];
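+	/* 11 bytes per entry: "U+XXXXXXXX " for a worst-case 32-bit value */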
+ int i;
+ char *p;
+
+ i = 0;
+ p = buf;
+ while (*s && i < BUF_DIGITS)
+ {
+ p += sprintf(p, "U+%04X ", *s);
+ i++;
+ s++;
+ }
+ *p = '\0';
+
+ return buf;
+}
+
+static int
+pg_wcscmp(const pg_wchar *s1, const pg_wchar *s2)
+{
+ for (;;)
+ {
+ if (*s1 < *s2)
+ return -1;
+ if (*s1 > *s2)
+ return 1;
+ if (*s1 == 0)
+ return 0;
+ s1++;
+ s2++;
+ }
+}
+
+int
+main(int argc, char **argv)
+{
+ const pg_unicode_test *test;
+
+ for (test = UnicodeNormalizationTests; test->input[0] != 0; test++)
+ {
+ for (int form = 0; form < 4; form++)
+ {
+ pg_wchar *result;
+
+ result = unicode_normalize(form, test->input);
+
+ if (pg_wcscmp(test->output[form], result) != 0)
+ {
+ printf("FAILURE (NormalizationTest.txt line %d form %d):\n", test->linenum, form);
+ printf("input: %s\n", print_wchar_str(test->input));
+ printf("expected: %s\n", print_wchar_str(test->output[form]));
+ printf("got: %s\n", print_wchar_str(result));
+ printf("\n");
+ exit(1);
+ }
+ }
+ }
+
+ printf("All tests successful!\n");
+ exit(0);
+}