1 files changed, 125 insertions, 0 deletions
diff --git a/src/common/unicode/generate-unicode_normprops_table.pl b/src/common/unicode/generate-unicode_normprops_table.pl
new file mode 100644
index 0000000..c7795d1
--- /dev/null
+++ b/src/common/unicode/generate-unicode_normprops_table.pl
@@ -0,0 +1,125 @@
+#!/usr/bin/perl
+#
+# Generate table of Unicode normalization "quick check" properties
+# (see UAX #15).  Pass DerivedNormalizationProps.txt as argument.  The
+# output is on stdout.
+#
+# Copyright (c) 2020-2021, PostgreSQL Global Development Group
+
+use strict;
+use warnings;
+
+use FindBin;
+use lib "$FindBin::RealBin/../../tools/";
+use PerfectHash;
+
+my %data;
+
+print
+  "/* generated by src/common/unicode/generate-unicode_normprops_table.pl, do not edit */\n\n";
+
+print <<EOS;
+#include "common/unicode_norm.h"
+
+/*
+ * Normalization quick check entry for codepoint.  We use a bit field
+ * here to save space.
+ */
+typedef struct
+{
+	unsigned int codepoint:21;
+	signed int	quickcheck:4;	/* really UnicodeNormalizationQC */
+} pg_unicode_normprops;
+
+/* Typedef for hash function on quick check table */
+typedef int (*qc_hash_func) (const void *key);
+
+/* Information for quick check lookup with perfect hash function */
+typedef struct
+{
+	const pg_unicode_normprops *normprops;
+	qc_hash_func	hash;
+	int		num_normprops;
+} pg_unicode_norminfo;
+EOS
+
+foreach my $line (<ARGV>)
+{
+	chomp $line;
+	$line =~ s/\s*#.*$//;
+	next if $line eq '';
+	my ($codepoint, $prop, $value) = split /\s*;\s*/, $line;
+	next if $prop !~ /_QC/;
+
+	my ($first, $last);
+	if ($codepoint =~ /\.\./)
+	{
+		($first, $last) = split /\.\./, $codepoint;
+	}
+	else
+	{
+		$first = $last = $codepoint;
+	}
+
+	foreach my $cp (hex($first) .. hex($last))
+	{
+		$data{$prop}{$cp} = $value;
+	}
+}
+
+# We create a separate array for each normalization form rather than,
+# say, a two-dimensional array, because that array would be very
+# sparse and would create unnecessary overhead especially for the NFC
+# lookup.
+foreach my $prop (sort keys %data)
+{
+	# Don't build the tables for the "D" forms because they are too
+	# big.  See also unicode_is_normalized_quickcheck().
+	next if $prop eq "NFD_QC" || $prop eq "NFKD_QC";
+
+	print "\n";
+	print
+	  "static const pg_unicode_normprops UnicodeNormProps_${prop}[] = {\n";
+
+	my %subdata = %{ $data{$prop} };
+	my @cp_packed;
+	foreach my $cp (sort { $a <=> $b } keys %subdata)
+	{
+		my $qc;
+		if ($subdata{$cp} eq 'N')
+		{
+			$qc = 'UNICODE_NORM_QC_NO';
+		}
+		elsif ($subdata{$cp} eq 'M')
+		{
+			$qc = 'UNICODE_NORM_QC_MAYBE';
+		}
+		else
+		{
+			die;
+		}
+		printf "\t{0x%04X, %s},\n", $cp, $qc;
+
+		# Save the bytes as a string in network order.
+		push @cp_packed, pack('N', $cp);
+	}
+
+	print "};\n";
+
+	# Emit the definition of the perfect hash function.
+	my $funcname = $prop . '_hash_func';
+	my $f        = PerfectHash::generate_hash_function(\@cp_packed, $funcname,
+		fixed_key_length => 4);
+	printf "\n/* Perfect hash function for %s */", $prop;
+	print "\nstatic $f\n";
+
+	# Emit the structure that wraps the hash lookup information into
+	# one variable.
+	printf "/* Hash lookup information for %s */", $prop;
+	printf "\nstatic const pg_unicode_norminfo ";
+	printf "UnicodeNormInfo_%s = {\n", $prop;
+	printf "\tUnicodeNormProps_%s,\n", $prop;
+	printf "\t%s,\n",                  $funcname;
+	printf "\t%d\n",                   scalar @cp_packed;
+	printf "};\n";
+}