summaryrefslogtreecommitdiffstats
path: root/src/common/unicode/generate-unicode_normprops_table.pl
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-04 12:19:15 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-04 12:19:15 +0000
commit6eb9c5a5657d1fe77b55cc261450f3538d35a94d (patch)
tree657d8194422a5daccecfd42d654b8a245ef7b4c8 /src/common/unicode/generate-unicode_normprops_table.pl
parentInitial commit. (diff)
downloadpostgresql-13-upstream.tar.xz
postgresql-13-upstream.zip
Adding upstream version 13.4.upstream/13.4upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/common/unicode/generate-unicode_normprops_table.pl')
-rw-r--r--src/common/unicode/generate-unicode_normprops_table.pl88
1 files changed, 88 insertions, 0 deletions
diff --git a/src/common/unicode/generate-unicode_normprops_table.pl b/src/common/unicode/generate-unicode_normprops_table.pl
new file mode 100644
index 0000000..e8e5097
--- /dev/null
+++ b/src/common/unicode/generate-unicode_normprops_table.pl
@@ -0,0 +1,88 @@
+#!/usr/bin/perl
+#
+# Generate table of Unicode normalization "quick check" properties
+# (see UAX #15). Pass DerivedNormalizationProps.txt as argument. The
+# output is on stdout.
+#
+# Copyright (c) 2020, PostgreSQL Global Development Group
+
+use strict;
+use warnings;
+
+my %data;
+
+print
+ "/* generated by src/common/unicode/generate-unicode_normprops_table.pl, do not edit */\n\n";
+
+print <<EOS;
+#include "common/unicode_norm.h"
+
+/*
+ * We use a bit field here to save space.
+ */
+typedef struct
+{
+ unsigned int codepoint:21;
+ signed int quickcheck:4; /* really UnicodeNormalizationQC */
+} pg_unicode_normprops;
+EOS
+
+foreach my $line (<ARGV>)
+{
+ chomp $line;
+ $line =~ s/\s*#.*$//;
+ next if $line eq '';
+ my ($codepoint, $prop, $value) = split /\s*;\s*/, $line;
+ next if $prop !~ /_QC/;
+
+ my ($first, $last);
+ if ($codepoint =~ /\.\./)
+ {
+ ($first, $last) = split /\.\./, $codepoint;
+ }
+ else
+ {
+ $first = $last = $codepoint;
+ }
+
+ foreach my $cp (hex($first) .. hex($last))
+ {
+ $data{$prop}{$cp} = $value;
+ }
+}
+
+# We create a separate array for each normalization form rather than,
+# say, a two-dimensional array, because that array would be very
+# sparse and would create unnecessary overhead especially for the NFC
+# lookup.
+foreach my $prop (sort keys %data)
+{
+ # Don't build the tables for the "D" forms because they are too
+ # big. See also unicode_is_normalized_quickcheck().
+ next if $prop eq "NFD_QC" || $prop eq "NFKD_QC";
+
+ print "\n";
+ print
+ "static const pg_unicode_normprops UnicodeNormProps_${prop}[] = {\n";
+
+ my %subdata = %{ $data{$prop} };
+ foreach my $cp (sort { $a <=> $b } keys %subdata)
+ {
+ my $qc;
+ if ($subdata{$cp} eq 'N')
+ {
+ $qc = 'UNICODE_NORM_QC_NO';
+ }
+ elsif ($subdata{$cp} eq 'M')
+ {
+ $qc = 'UNICODE_NORM_QC_MAYBE';
+ }
+ else
+ {
+ die;
+ }
+ printf "\t{0x%04X, %s},\n", $cp, $qc;
+ }
+
+ print "};\n";
+}