summaryrefslogtreecommitdiffstats
path: root/src/common/unicode/generate-unicode_normprops_table.pl
diff options
context:
space:
mode:
Diffstat (limited to 'src/common/unicode/generate-unicode_normprops_table.pl')
-rw-r--r--src/common/unicode/generate-unicode_normprops_table.pl125
1 files changed, 125 insertions, 0 deletions
diff --git a/src/common/unicode/generate-unicode_normprops_table.pl b/src/common/unicode/generate-unicode_normprops_table.pl
new file mode 100644
index 0000000..c7795d1
--- /dev/null
+++ b/src/common/unicode/generate-unicode_normprops_table.pl
@@ -0,0 +1,125 @@
+#!/usr/bin/perl
+#
+# Generate table of Unicode normalization "quick check" properties
+# (see UAX #15). Pass DerivedNormalizationProps.txt as argument. The
+# output is on stdout.
+#
+# Copyright (c) 2020-2021, PostgreSQL Global Development Group
+
+use strict;
+use warnings;
+
+use FindBin;
+use lib "$FindBin::RealBin/../../tools/";
+use PerfectHash;
+
+my %data;
+
+print
+ "/* generated by src/common/unicode/generate-unicode_normprops_table.pl, do not edit */\n\n";
+
+print <<EOS;
+#include "common/unicode_norm.h"
+
+/*
+ * Normalization quick check entry for codepoint. We use a bit field
+ * here to save space.
+ */
+typedef struct
+{
+ unsigned int codepoint:21;
+ signed int quickcheck:4; /* really UnicodeNormalizationQC */
+} pg_unicode_normprops;
+
+/* Typedef for hash function on quick check table */
+typedef int (*qc_hash_func) (const void *key);
+
+/* Information for quick check lookup with perfect hash function */
+typedef struct
+{
+ const pg_unicode_normprops *normprops;
+ qc_hash_func hash;
+ int num_normprops;
+} pg_unicode_norminfo;
+EOS
+
+foreach my $line (<ARGV>)
+{
+ chomp $line;
+ $line =~ s/\s*#.*$//;
+ next if $line eq '';
+ my ($codepoint, $prop, $value) = split /\s*;\s*/, $line;
+ next if $prop !~ /_QC/;
+
+ my ($first, $last);
+ if ($codepoint =~ /\.\./)
+ {
+ ($first, $last) = split /\.\./, $codepoint;
+ }
+ else
+ {
+ $first = $last = $codepoint;
+ }
+
+ foreach my $cp (hex($first) .. hex($last))
+ {
+ $data{$prop}{$cp} = $value;
+ }
+}
+
+# We create a separate array for each normalization form rather than,
+# say, a two-dimensional array, because that array would be very
+# sparse and would create unnecessary overhead especially for the NFC
+# lookup.
+foreach my $prop (sort keys %data)
+{
+ # Don't build the tables for the "D" forms because they are too
+ # big. See also unicode_is_normalized_quickcheck().
+ next if $prop eq "NFD_QC" || $prop eq "NFKD_QC";
+
+ print "\n";
+ print
+ "static const pg_unicode_normprops UnicodeNormProps_${prop}[] = {\n";
+
+ my %subdata = %{ $data{$prop} };
+ my @cp_packed;
+ foreach my $cp (sort { $a <=> $b } keys %subdata)
+ {
+ my $qc;
+ if ($subdata{$cp} eq 'N')
+ {
+ $qc = 'UNICODE_NORM_QC_NO';
+ }
+ elsif ($subdata{$cp} eq 'M')
+ {
+ $qc = 'UNICODE_NORM_QC_MAYBE';
+ }
+ else
+ {
+ die;
+ }
+ printf "\t{0x%04X, %s},\n", $cp, $qc;
+
+ # Save the bytes as a string in network order.
+ push @cp_packed, pack('N', $cp);
+ }
+
+ print "};\n";
+
+ # Emit the definition of the perfect hash function.
+ my $funcname = $prop . '_hash_func';
+ my $f = PerfectHash::generate_hash_function(\@cp_packed, $funcname,
+ fixed_key_length => 4);
+ printf "\n/* Perfect hash function for %s */", $prop;
+ print "\nstatic $f\n";
+
+ # Emit the structure that wraps the hash lookup information into
+ # one variable.
+ printf "/* Hash lookup information for %s */", $prop;
+ printf "\nstatic const pg_unicode_norminfo ";
+ printf "UnicodeNormInfo_%s = {\n", $prop;
+ printf "\tUnicodeNormProps_%s,\n", $prop;
+ printf "\t%s,\n", $funcname;
+ printf "\t%d\n", scalar @cp_packed;
+ printf "};\n";
+}