diff options
Diffstat (limited to 'src/common/unicode/generate-unicode_normprops_table.pl')
-rw-r--r-- | src/common/unicode/generate-unicode_normprops_table.pl | 125 |
1 files changed, 125 insertions, 0 deletions
#!/usr/bin/perl
#
# Generate table of Unicode normalization "quick check" properties
# (see UAX #15). Pass DerivedNormalizationProps.txt as argument. The
# output is on stdout.
#
# Copyright (c) 2020-2021, PostgreSQL Global Development Group

use strict;
use warnings;

use FindBin;
use lib "$FindBin::RealBin/../../tools/";
use PerfectHash;

# Quick check values collected from the input, stored as
# $data{property name}{codepoint number} = value string.
my %data;

print
  "/* generated by src/common/unicode/generate-unicode_normprops_table.pl, do not edit */\n\n";

# Fixed C preamble of the generated file: the per-codepoint entry type
# and the structure that bundles a table with its hash function.
print <<EOS;
#include "common/unicode_norm.h"

/*
 * Normalization quick check entry for codepoint. We use a bit field
 * here to save space.
 */
typedef struct
{
	unsigned int codepoint:21;
	signed int quickcheck:4; /* really UnicodeNormalizationQC */
} pg_unicode_normprops;

/* Typedef for hash function on quick check table */
typedef int (*qc_hash_func) (const void *key);

/* Information for quick check lookup with perfect hash function */
typedef struct
{
	const pg_unicode_normprops *normprops;
	qc_hash_func hash;
	int num_normprops;
} pg_unicode_norminfo;
EOS

# Read the input one line at a time instead of pulling the whole file
# into a list first.
while (my $line = <ARGV>)
{
	chomp $line;

	# Strip the trailing comment, then any leftover trailing whitespace
	# (such as a stray carriage return), so that blank and comment-only
	# lines reduce to the empty string and are skipped.
	$line =~ s/\s*#.*$//;
	$line =~ s/\s+$//;
	next if $line eq '';

	# Fields are separated by semicolons: codepoint or codepoint range,
	# property name, and (for the properties we want) a value.
	my ($codepoint, $prop, $value) = split /\s*;\s*/, $line;

	# Only the *_QC properties are of interest. A line without a
	# property field is ignored rather than triggering an
	# uninitialized-value warning in the match.
	next if !defined $prop || $prop !~ /_QC/;

	# A codepoint field is either a single hex value or "first..last".
	my ($first, $last);
	if ($codepoint =~ /\.\./)
	{
		($first, $last) = split /\.\./, $codepoint;
	}
	else
	{
		$first = $last = $codepoint;
	}

	# Record the value for every codepoint in the (inclusive) range.
	foreach my $cp (hex($first) .. hex($last))
	{
		$data{$prop}{$cp} = $value;
	}
}

# We create a separate array for each normalization form rather than,
# say, a two-dimensional array, because that array would be very
# sparse and would create unnecessary overhead especially for the NFC
# lookup.
foreach my $prop (sort keys %data)
{
	# Don't build the tables for the "D" forms because they are too
	# big. See also unicode_is_normalized_quickcheck().
	next if $prop eq "NFD_QC" || $prop eq "NFKD_QC";

	# Value letters accepted from the input and the C enum names that
	# are written into the table for them.
	my %qc_name_for = (
		N => 'UNICODE_NORM_QC_NO',
		M => 'UNICODE_NORM_QC_MAYBE');

	print "\n";
	print
	  "static const pg_unicode_normprops UnicodeNormProps_${prop}[] = {\n";

	# Walk the per-property hash through its reference; there is no
	# need to copy it.
	my $subdata = $data{$prop};
	my @cp_packed;
	foreach my $cp (sort { $a <=> $b } keys %{$subdata})
	{
		my $value = $subdata->{$cp};
		my $qc = defined $value ? $qc_name_for{$value} : undef;

		# Anything other than N or M is unexpected here. Report what
		# was found instead of dying without a message.
		defined $qc
		  or die sprintf(
			"unexpected %s value \"%s\" for codepoint U+%04X",
			$prop, defined $value ? $value : '(undef)', $cp);

		printf "\t{0x%04X, %s},\n", $cp, $qc;

		# Save the bytes as a string in network order.
		push @cp_packed, pack('N', $cp);
	}

	print "};\n";

	# Emit the definition of the perfect hash function. The keys are
	# the packed codepoints collected above, in table order.
	my $funcname = $prop . '_hash_func';
	my $f = PerfectHash::generate_hash_function(\@cp_packed, $funcname,
		fixed_key_length => 4);
	printf "\n/* Perfect hash function for %s */", $prop;
	print "\nstatic $f\n";

	# Emit the structure that wraps the hash lookup information into
	# one variable: the table, the hash function, and the entry count.
	printf "/* Hash lookup information for %s */", $prop;
	print "\nstatic const pg_unicode_norminfo ";
	printf "UnicodeNormInfo_%s = {\n", $prop;
	printf "\tUnicodeNormProps_%s,\n", $prop;
	printf "\t%s,\n", $funcname;
	printf "\t%d\n", scalar @cp_packed;
	print "};\n";
}