#!/usr/bin/perl # # Generate table of Unicode normalization "quick check" properties # (see UAX #15). Pass DerivedNormalizationProps.txt as argument. The # output is on stdout. # # Copyright (c) 2020-2022, PostgreSQL Global Development Group use strict; use warnings; use FindBin; use lib "$FindBin::RealBin/../../tools/"; use PerfectHash; my %data; print "/* generated by src/common/unicode/generate-unicode_normprops_table.pl, do not edit */\n\n"; print <) { chomp $line; $line =~ s/\s*#.*$//; next if $line eq ''; my ($codepoint, $prop, $value) = split /\s*;\s*/, $line; next if $prop !~ /_QC/; my ($first, $last); if ($codepoint =~ /\.\./) { ($first, $last) = split /\.\./, $codepoint; } else { $first = $last = $codepoint; } foreach my $cp (hex($first) .. hex($last)) { $data{$prop}{$cp} = $value; } } # We create a separate array for each normalization form rather than, # say, a two-dimensional array, because that array would be very # sparse and would create unnecessary overhead especially for the NFC # lookup. foreach my $prop (sort keys %data) { # Don't build the tables for the "D" forms because they are too # big. See also unicode_is_normalized_quickcheck(). next if $prop eq "NFD_QC" || $prop eq "NFKD_QC"; print "\n"; print "static const pg_unicode_normprops UnicodeNormProps_${prop}[] = {\n"; my %subdata = %{ $data{$prop} }; my @cp_packed; foreach my $cp (sort { $a <=> $b } keys %subdata) { my $qc; if ($subdata{$cp} eq 'N') { $qc = 'UNICODE_NORM_QC_NO'; } elsif ($subdata{$cp} eq 'M') { $qc = 'UNICODE_NORM_QC_MAYBE'; } else { die; } printf "\t{0x%04X, %s},\n", $cp, $qc; # Save the bytes as a string in network order. push @cp_packed, pack('N', $cp); } print "};\n"; # Emit the definition of the perfect hash function. my $funcname = $prop . '_hash_func'; my $f = PerfectHash::generate_hash_function(\@cp_packed, $funcname, fixed_key_length => 4); printf "\n/* Perfect hash function for %s */", $prop; print "\nstatic $f\n"; # Emit the structure that wraps the hash lookup information into # one variable. printf "/* Hash lookup information for %s */", $prop; printf "\nstatic const pg_unicode_norminfo "; printf "UnicodeNormInfo_%s = {\n", $prop; printf "\tUnicodeNormProps_%s,\n", $prop; printf "\t%s,\n", $funcname; printf "\t%d\n", scalar @cp_packed; printf "};\n"; }