1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
|
#!/usr/bin/perl
#
# Generate table of Unicode normalization "quick check" properties
# (see UAX #15). Pass DerivedNormalizationProps.txt as argument. The
# output is on stdout.
#
# Copyright (c) 2020-2021, PostgreSQL Global Development Group
use strict;
use warnings;
use FindBin;
use lib "$FindBin::RealBin/../../tools/";
use PerfectHash;
# Quick check values collected from the input file, stored as
# $data{property name}{code point} = value.
my %data;

# Marker identifying the output as generated code.
my $generated_banner =
  "/* generated by src/common/unicode/generate-unicode_normprops_table.pl, do not edit */\n\n";

# Fixed C declarations that the generated tables rely on.  The text has
# nothing to interpolate, so it is quoted literally.
my $c_preamble = <<'EOS';
#include "common/unicode_norm.h"
/*
* Normalization quick check entry for codepoint. We use a bit field
* here to save space.
*/
typedef struct
{
unsigned int codepoint:21;
signed int quickcheck:4; /* really UnicodeNormalizationQC */
} pg_unicode_normprops;
/* Typedef for hash function on quick check table */
typedef int (*qc_hash_func) (const void *key);
/* Information for quick check lookup with perfect hash function */
typedef struct
{
const pg_unicode_normprops *normprops;
qc_hash_func hash;
int num_normprops;
} pg_unicode_norminfo;
EOS

# Write the banner first, then the preamble, to stdout.
print $generated_banner, $c_preamble;
# Read the input (the files named on the command line, or stdin) one line
# at a time and collect every *_QC property into %data, expanding code
# point ranges so that each code point gets its own entry.  Using while()
# rather than foreach() avoids evaluating <ARGV> in list context, which
# would load the whole input into memory before the first iteration.
while (my $line = <ARGV>)
{
	chomp $line;

	# Strip a trailing comment together with the whitespace before it,
	# then skip lines that have nothing left.
	$line =~ s/\s*#.*$//;
	next if $line eq '';

	# Fields are separated by semicolons: code point(s), property, value.
	my ($codepoint, $prop, $value) = split /\s*;\s*/, $line;

	# Only the quick check properties are of interest.  A line without a
	# property field is skipped explicitly so that it does not trigger an
	# "uninitialized value" warning in the pattern match.
	next if !defined $prop || $prop !~ /_QC/;

	# The first field is either a single hexadecimal code point or a
	# "first..last" range.
	my ($first, $last);
	if ($codepoint =~ /\.\./)
	{
		($first, $last) = split /\.\./, $codepoint;
	}
	else
	{
		$first = $last = $codepoint;
	}

	# Record the value for every code point in the (inclusive) range.
	foreach my $cp (hex($first) .. hex($last))
	{
		$data{$prop}{$cp} = $value;
	}
}
# We create a separate array for each normalization form rather than,
# say, a two-dimensional array, because that array would be very
# sparse and would create unnecessary overhead especially for the NFC
# lookup.

# C symbols written to the table for each quick check value we expect to
# find in the input.  Any other value is treated as an error below.
my %qc_symbol_for = (
	N => 'UNICODE_NORM_QC_NO',
	M => 'UNICODE_NORM_QC_MAYBE');

foreach my $prop (sort keys %data)
{
	# Don't build the tables for the "D" forms because they are too
	# big. See also unicode_is_normalized_quickcheck().
	next if $prop eq "NFD_QC" || $prop eq "NFKD_QC";

	print "\n";
	print
	  "static const pg_unicode_normprops UnicodeNormProps_${prop}[] = {\n";

	# Work through a reference instead of copying the per-property hash.
	my $value_of = $data{$prop};

	# Keys handed to the perfect hash generator, in table order.
	my @cp_packed;

	# Emit one table row per code point, in ascending numeric order.
	foreach my $cp (sort { $a <=> $b } keys %$value_of)
	{
		my $value = $value_of->{$cp};
		my $qc = defined $value ? $qc_symbol_for{$value} : undef;

		# Fail with enough context to locate the offending input line.
		if (!defined $qc)
		{
			die sprintf(
				"unexpected %s value %s for code point U+%04X",
				$prop, (defined $value ? "\"$value\"" : 'undef'), $cp);
		}

		printf "\t{0x%04X, %s},\n", $cp, $qc;

		# Save the bytes as a string in network order.
		push @cp_packed, pack('N', $cp);
	}
	print "};\n";

	# Emit the definition of the perfect hash function.
	my $funcname = $prop . '_hash_func';
	my $f = PerfectHash::generate_hash_function(\@cp_packed, $funcname,
		fixed_key_length => 4);
	printf "\n/* Perfect hash function for %s */", $prop;
	print "\nstatic $f\n";

	# Emit the structure that wraps the hash lookup information into
	# one variable: the table, the hash function, and the entry count.
	printf "/* Hash lookup information for %s */", $prop;
	print "\nstatic const pg_unicode_norminfo ";
	printf "UnicodeNormInfo_%s = {\n", $prop;
	printf "\tUnicodeNormProps_%s,\n", $prop;
	printf "\t%s,\n", $funcname;
	printf "\t%d\n", scalar @cp_packed;
	print "};\n";
}
|