src/common/unicode/generate-unicode_normprops_table.pl


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125

#!/usr/bin/perl
#
# Generate table of Unicode normalization "quick check" properties
# (see UAX #15).  Pass DerivedNormalizationProps.txt as argument.  The
# output is on stdout.
#
# Copyright (c) 2020-2022, PostgreSQL Global Development Group

use strict;
use warnings;

use FindBin;
use lib "$FindBin::RealBin/../../tools/";
use PerfectHash;

my %data;

# Fixed preamble of the generated C file: a "do not edit" banner naming
# this script, followed by the type declarations that the tables emitted
# further down rely on.  Nothing in here depends on the input data.
my $banner =
  "/* generated by src/common/unicode/generate-unicode_normprops_table.pl, do not edit */\n\n";

# The declarations contain nothing to interpolate, so a literal
# (single-quoted) here-document is used and copied to stdout verbatim.
my $declarations = <<'EOS';
#include "common/unicode_norm.h"

/*
 * Normalization quick check entry for codepoint.  We use a bit field
 * here to save space.
 */
typedef struct
{
	unsigned int codepoint:21;
	signed int	quickcheck:4;	/* really UnicodeNormalizationQC */
} pg_unicode_normprops;

/* Typedef for hash function on quick check table */
typedef int (*qc_hash_func) (const void *key);

/* Information for quick check lookup with perfect hash function */
typedef struct
{
	const pg_unicode_normprops *normprops;
	qc_hash_func	hash;
	int		num_normprops;
} pg_unicode_norminfo;
EOS

print $banner . $declarations;

# Read the input (the files named on the command line, or stdin if there
# are none) and collect every "quick check" property into %data, laid
# out as $data{PROPERTY}{CODEPOINT} = VALUE, where CODEPOINT is the
# numeric value of the code point and ranges are expanded to one entry
# per code point.
#
# The input is consumed one line at a time; evaluating <ARGV> in list
# context would read all of it into memory before the first iteration.
while (my $line = <ARGV>)
{
	chomp $line;

	# Drop comments, then any trailing whitespace.  The latter also
	# removes the "\r" that chomp leaves behind when the file has CRLF
	# line endings, so that such blank lines are recognized as empty.
	$line =~ s/\s*#.*$//;
	$line =~ s/\s+$//;
	next if $line eq '';

	# Fields are separated by semicolons: code point (or range),
	# property name, and an optional value.
	my ($codepoint, $prop, $value) = split /\s*;\s*/, $line;

	# Only the *_QC properties are of interest.  A line without any
	# property field is skipped quietly instead of triggering an
	# "uninitialized value" warning in the pattern match.
	next if !defined $prop || $prop !~ /_QC/;

	# A code point field is either a single hex number or a range
	# written as FIRST..LAST.
	my ($first, $last);
	if ($codepoint =~ /\.\./)
	{
		($first, $last) = split /\.\./, $codepoint;
	}
	else
	{
		$first = $last = $codepoint;
	}

	foreach my $cp (hex($first) .. hex($last))
	{
		$data{$prop}{$cp} = $value;
	}
}

# We create a separate array for each normalization form rather than,
# say, a two-dimensional array, because that array would be very
# sparse and would create unnecessary overhead especially for the NFC
# lookup.
#
# For each property that is kept, three C definitions are written to
# stdout: the UnicodeNormProps_<prop> array (sorted by code point), a
# perfect hash function over the code points, and a
# UnicodeNormInfo_<prop> struct tying the two together.

# Maps the quick check values accepted in the input to the C symbols
# written into the generated table.
my %qc_symbol_for = (
	'N' => 'UNICODE_NORM_QC_NO',
	'M' => 'UNICODE_NORM_QC_MAYBE');

foreach my $prop (sort keys %data)
{
	# Don't build the tables for the "D" forms because they are too
	# big.  See also unicode_is_normalized_quickcheck().
	next if $prop eq "NFD_QC" || $prop eq "NFKD_QC";

	print "\n";
	print
	  "static const pg_unicode_normprops UnicodeNormProps_${prop}[] = {\n";

	# Work through a reference; there is no need to copy the whole
	# per-property hash.
	my $subdata = $data{$prop};
	my @cp_packed;
	foreach my $cp (sort { $a <=> $b } keys %{$subdata})
	{
		my $value = $subdata->{$cp};
		my $qc;
		$qc = $qc_symbol_for{$value} if defined $value;

		# Anything other than N or M means the input is not what this
		# script expects; say what was found instead of dying silently.
		if (!defined $qc)
		{
			die sprintf(
				"unexpected %s value \"%s\" for code point U+%04X",
				$prop, (defined $value ? $value : '(undefined)'), $cp);
		}

		printf "\t{0x%04X, %s},\n", $cp, $qc;

		# Save the bytes as a string in network order.
		push @cp_packed, pack('N', $cp);
	}

	print "};\n";

	# Emit the definition of the perfect hash function.
	my $funcname = $prop . '_hash_func';
	my $f        = PerfectHash::generate_hash_function(\@cp_packed, $funcname,
		fixed_key_length => 4);
	printf "\n/* Perfect hash function for %s */", $prop;
	print "\nstatic $f\n";

	# Emit the structure that wraps the hash lookup information into
	# one variable.
	printf "/* Hash lookup information for %s */", $prop;
	print "\nstatic const pg_unicode_norminfo ";
	printf "UnicodeNormInfo_%s = {\n", $prop;
	printf "\tUnicodeNormProps_%s,\n", $prop;
	printf "\t%s,\n",                  $funcname;
	printf "\t%d\n",                   scalar @cp_packed;
	print "};\n";
}