Diffstat:
 -rw-r--r--  src/common/unicode/.gitignore                                9
 -rw-r--r--  src/common/unicode/Makefile                                 72
 -rw-r--r--  src/common/unicode/README                                   28
 -rw-r--r--  src/common/unicode/generate-norm_test_table.pl             106
 -rw-r--r--  src/common/unicode/generate-unicode_combining_table.pl      51
 -rw-r--r--  src/common/unicode/generate-unicode_east_asian_fw_table.pl  76
 -rw-r--r--  src/common/unicode/generate-unicode_norm_table.pl          406
 -rw-r--r--  src/common/unicode/generate-unicode_normprops_table.pl     125
 -rw-r--r--  src/common/unicode/norm_test.c                              86
 -rw-r--r--  src/common/unicode_norm.c                                  634
 10 files changed, 1593 insertions(+), 0 deletions(-)
diff --git a/src/common/unicode/.gitignore b/src/common/unicode/.gitignore
new file mode 100644
index 0000000..46243f7
--- /dev/null
+++ b/src/common/unicode/.gitignore
@@ -0,0 +1,9 @@
+/norm_test
+/norm_test_table.h
+
+# Downloaded files
+/CompositionExclusions.txt
+/DerivedNormalizationProps.txt
+/EastAsianWidth.txt
+/NormalizationTest.txt
+/UnicodeData.txt
diff --git a/src/common/unicode/Makefile b/src/common/unicode/Makefile
new file mode 100644
index 0000000..60e01e7
--- /dev/null
+++ b/src/common/unicode/Makefile
@@ -0,0 +1,72 @@
+#-------------------------------------------------------------------------
+#
+# Makefile
+# Makefile for src/common/unicode
+#
+# IDENTIFICATION
+# src/common/unicode/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/common/unicode
+top_builddir = ../../..
+include $(top_builddir)/src/Makefile.global
+
+override CPPFLAGS := -DFRONTEND -I. $(CPPFLAGS)
+LIBS += $(PTHREAD_LIBS)
+
+# By default, do nothing.
+all:
+
+update-unicode: unicode_norm_table.h unicode_combining_table.h unicode_east_asian_fw_table.h unicode_normprops_table.h unicode_norm_hashfunc.h
+ mv $^ $(top_srcdir)/src/include/common/
+ $(MAKE) normalization-check
+
+# These files are part of the Unicode Character Database. Download
+# them on demand. The dependency on Makefile.global is for
+# UNICODE_VERSION.
+UnicodeData.txt EastAsianWidth.txt DerivedNormalizationProps.txt CompositionExclusions.txt NormalizationTest.txt: $(top_builddir)/src/Makefile.global
+ $(DOWNLOAD) https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/$(@F)
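+# (For example, with UNICODE_VERSION set to 13.0.0 in Makefile.global,
+# this rule fetches https://www.unicode.org/Public/13.0.0/ucd/UnicodeData.txt
+# and so on; the actual version used is whatever Makefile.global defines.)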
+
+# Generation of conversion tables used for string normalization with
+# UTF-8 strings.
+unicode_norm_hashfunc.h: unicode_norm_table.h
+
+unicode_norm_table.h: generate-unicode_norm_table.pl UnicodeData.txt CompositionExclusions.txt
+ $(PERL) $<
+
+unicode_combining_table.h: generate-unicode_combining_table.pl UnicodeData.txt
+ $(PERL) $^ >$@
+
+unicode_east_asian_fw_table.h: generate-unicode_east_asian_fw_table.pl EastAsianWidth.txt
+ $(PERL) $^ >$@
+
+unicode_normprops_table.h: generate-unicode_normprops_table.pl DerivedNormalizationProps.txt
+ $(PERL) $^ >$@
+
+# Test suite
+normalization-check: norm_test
+ ./norm_test
+
+norm_test: norm_test.o ../unicode_norm.o | submake-common
+
+norm_test.o: norm_test_table.h
+
+.PHONY: submake-common
+
+submake-common:
+ $(MAKE) -C .. all
+
+norm_test_table.h: generate-norm_test_table.pl NormalizationTest.txt
+	$(PERL) $^ $@
+
+.PHONY: normalization-check
+
+clean:
+ rm -f $(OBJS) norm_test norm_test.o
+
+distclean: clean
+	rm -f UnicodeData.txt EastAsianWidth.txt DerivedNormalizationProps.txt CompositionExclusions.txt NormalizationTest.txt norm_test_table.h unicode_norm_table.h
+
+maintainer-clean: distclean
diff --git a/src/common/unicode/README b/src/common/unicode/README
new file mode 100644
index 0000000..56956f6
--- /dev/null
+++ b/src/common/unicode/README
@@ -0,0 +1,28 @@
+This directory contains tools to generate the tables in
+src/include/common/unicode_norm_table.h and the other generated
+unicode_*.h headers used for Unicode normalization. The generated .h
+files are included in the source tree, so these tools are normally not
+needed to build PostgreSQL; they are only needed to re-generate the .h
+files from the Unicode data files, e.g. when updating to a new version
+of Unicode.
+
+Generating unicode_norm_table.h
+-------------------------------
+
+Run
+
+ make update-unicode
+
+from the top level of the source tree and commit the result.
+
+Tests
+-----
+
+The Unicode Consortium publishes a comprehensive test suite for the
+normalization algorithm, in a file called NormalizationTest.txt. This
+directory also contains a Perl script and some C code that run our
+normalization code against all the test strings in NormalizationTest.txt.
+To download NormalizationTest.txt and run the tests:
+
+ make normalization-check
+
+This is also run as part of the update-unicode target.
diff --git a/src/common/unicode/generate-norm_test_table.pl b/src/common/unicode/generate-norm_test_table.pl
new file mode 100644
index 0000000..838f552
--- /dev/null
+++ b/src/common/unicode/generate-norm_test_table.pl
@@ -0,0 +1,106 @@
+#!/usr/bin/perl
+#
+# Read Unicode consortium's normalization test suite, NormalizationTest.txt,
+# and generate a C array from it, for norm_test.c.
+#
+# NormalizationTest.txt is part of the Unicode Character Database.
+#
+# Copyright (c) 2000-2022, PostgreSQL Global Development Group
+
+use strict;
+use warnings;
+
+use File::Basename;
+
+die "Usage: $0 INPUT_FILE OUTPUT_FILE\n" if @ARGV != 2;
+my $input_file = $ARGV[0];
+my $output_file = $ARGV[1];
+my $output_base = basename($output_file);
+
+# Open the input and output files
+open my $INPUT, '<', $input_file
+  or die "Could not open input file $input_file: $!\n";
+open my $OUTPUT, '>', $output_file
+ or die "Could not open output file $output_file: $!\n";
+
+# Print header of output file.
+print $OUTPUT <<HEADER;
+/*-------------------------------------------------------------------------
+ *
+ * norm_test_table.h
+ * Test strings for Unicode normalization.
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/common/unicode/norm_test_table.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+/*
+ * File auto-generated by src/common/unicode/generate-norm_test_table.pl, do
+ * not edit. There is deliberately not an #ifndef PG_NORM_TEST_TABLE_H
+ * here.
+ */
+
+typedef struct
+{
+ int linenum;
+ pg_wchar input[50];
+ pg_wchar output[4][50];
+} pg_unicode_test;
+
+/* test table */
+HEADER
+print $OUTPUT
+ "static const pg_unicode_test UnicodeNormalizationTests[] =\n{\n";
+
+# Helper routine to convert a space-separated list of Unicode code points
+# into a comma-separated list of C hex constants, suitable for emitting
+# into a C array initializer.
+sub codepoint_string_to_hex
+{
+	my $codepoint_string = shift;
+
+	my $result = '';
+
+	foreach my $cp (split(' ', $codepoint_string))
+	{
+		$result .= "0x$cp, ";
+	}
+	$result .= '0';    # null-terminate the array
+	return $result;
+}
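+
+# For example, codepoint_string_to_hex("0044 0307") returns the string
+# "0x0044, 0x0307, 0".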
+
+# Process the input file line by line
+my $linenum = 0;
+while (my $line = <$INPUT>)
+{
+ $linenum = $linenum + 1;
+	if ($line =~ /^\s*#/) { next; }    # ignore comments
+
+	if ($line =~ /^@/) { next; }    # ignore @Part0-style section headers
+
+	# Split the line and extract the fields we need:
+	#
+	# source; NFC; NFD; NFKC; NFKD
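+	#
+	# A typical line (with its trailing comment removed) looks like:
+	#   1E0A;1E0A;0044 0307;1E0A;0044 0307;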
+ my ($source, $nfc, $nfd, $nfkc, $nfkd) = split(';', $line);
+
+ my $source_utf8 = codepoint_string_to_hex($source);
+ my $nfc_utf8 = codepoint_string_to_hex($nfc);
+ my $nfd_utf8 = codepoint_string_to_hex($nfd);
+ my $nfkc_utf8 = codepoint_string_to_hex($nfkc);
+ my $nfkd_utf8 = codepoint_string_to_hex($nfkd);
+
+ print $OUTPUT
+ "\t{ $linenum, { $source_utf8 }, { { $nfc_utf8 }, { $nfd_utf8 }, { $nfkc_utf8 }, { $nfkd_utf8 } } },\n";
+}
+
+# Output terminator entry
+print $OUTPUT "\t{ 0, { 0 }, { { 0 }, { 0 }, { 0 }, { 0 } } }";
+print $OUTPUT "\n};\n";
+
+close $OUTPUT;
+close $INPUT;
diff --git a/src/common/unicode/generate-unicode_combining_table.pl b/src/common/unicode/generate-unicode_combining_table.pl
new file mode 100644
index 0000000..8177c20
--- /dev/null
+++ b/src/common/unicode/generate-unicode_combining_table.pl
@@ -0,0 +1,51 @@
+#!/usr/bin/perl
+#
+# Generate a sorted list of non-overlapping intervals of non-spacing
+# characters, using Unicode data files as input. Pass UnicodeData.txt
+# as argument. The output is on stdout.
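+#
+# Each input line is a semicolon-separated UnicodeData.txt record whose
+# third field is the general category, for example:
+#   0300;COMBINING GRAVE ACCENT;Mn;230;NSM;;;;;N;NON-SPACING GRAVE;;;;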
+#
+# Copyright (c) 2019-2022, PostgreSQL Global Development Group
+
+use strict;
+use warnings;
+
+my $range_start = undef;
+my $codepoint;
+my $prev_codepoint;
+my $count = 0;
+
+print
+ "/* generated by src/common/unicode/generate-unicode_combining_table.pl, do not edit */\n\n";
+
+print "static const struct mbinterval combining[] = {\n";
+
+foreach my $line (<ARGV>)
+{
+ chomp $line;
+ my @fields = split ';', $line;
+ $codepoint = hex $fields[0];
+
+ if ($fields[2] eq 'Me' || $fields[2] eq 'Mn')
+ {
+ # combining character, save for start of range
+ if (!defined($range_start))
+ {
+ $range_start = $codepoint;
+ }
+ }
+ else
+ {
+ # not a combining character, print out previous range if any
+ if (defined($range_start))
+ {
+ printf "\t{0x%04X, 0x%04X},\n", $range_start, $prev_codepoint;
+ $range_start = undef;
+ }
+ }
+}
+continue
+{
+ $prev_codepoint = $codepoint;
+}
+
+# Print out any final range left open at the end of the input (there is
+# none in current UnicodeData.txt versions, but guard against it, as
+# generate-unicode_east_asian_fw_table.pl does).
+if (defined($range_start))
+{
+	printf "\t{0x%04X, 0x%04X},\n", $range_start, $prev_codepoint;
+}
+
+print "};\n";
diff --git a/src/common/unicode/generate-unicode_east_asian_fw_table.pl b/src/common/unicode/generate-unicode_east_asian_fw_table.pl
new file mode 100644
index 0000000..9d03684
--- /dev/null
+++ b/src/common/unicode/generate-unicode_east_asian_fw_table.pl
@@ -0,0 +1,76 @@
+#!/usr/bin/perl
+#
+# Generate a sorted list of non-overlapping intervals of East Asian Wide (W)
+# and East Asian Fullwidth (F) characters, using Unicode data files as input.
+# Pass EastAsianWidth.txt as argument. The output is on stdout.
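+#
+# Relevant lines in EastAsianWidth.txt look like these (ranges use "..",
+# and trailing "#" comments are stripped below):
+#   20A9;H
+#   1100..115F;W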
+#
+# Copyright (c) 2019-2022, PostgreSQL Global Development Group
+
+use strict;
+use warnings;
+
+my $range_start = undef;
+my ($first, $last);
+my $prev_last;
+
+print
+ "/* generated by src/common/unicode/generate-unicode_east_asian_fw_table.pl, do not edit */\n\n";
+
+print "static const struct mbinterval east_asian_fw[] = {\n";
+
+foreach my $line (<ARGV>)
+{
+ chomp $line;
+ $line =~ s/\s*#.*$//;
+ next if $line eq '';
+ my ($codepoint, $width) = split ';', $line;
+
+ if ($codepoint =~ /\.\./)
+ {
+ ($first, $last) = split /\.\./, $codepoint;
+ }
+ else
+ {
+ $first = $last = $codepoint;
+ }
+
+ ($first, $last) = map(hex, ($first, $last));
+
+ if ($width eq 'F' || $width eq 'W')
+ {
+ # fullwidth/wide characters
+ if (!defined($range_start))
+ {
+ # save for start of range if one hasn't been started yet
+ $range_start = $first;
+ }
+ elsif ($first != $prev_last + 1)
+ {
+ # ranges aren't contiguous; emit the last and start a new one
+ printf "\t{0x%04X, 0x%04X},\n", $range_start, $prev_last;
+ $range_start = $first;
+ }
+ }
+ else
+ {
+ # not wide characters, print out previous range if any
+ if (defined($range_start))
+ {
+ printf "\t{0x%04X, 0x%04X},\n", $range_start, $prev_last;
+ $range_start = undef;
+ }
+ }
+}
+continue
+{
+ $prev_last = $last;
+}
+
+# don't forget any ranges at the very end of the database (though there are none
+# as of Unicode 13.0)
+if (defined($range_start))
+{
+ printf "\t{0x%04X, 0x%04X},\n", $range_start, $prev_last;
+}
+
+print "};\n";
diff --git a/src/common/unicode/generate-unicode_norm_table.pl b/src/common/unicode/generate-unicode_norm_table.pl
new file mode 100644
index 0000000..e442345
--- /dev/null
+++ b/src/common/unicode/generate-unicode_norm_table.pl
@@ -0,0 +1,406 @@
+#!/usr/bin/perl
+#
+# Generate a composition table and its lookup utilities, using Unicode data
+# files as input.
+#
+# Input: UnicodeData.txt and CompositionExclusions.txt
+# Output: unicode_norm_table.h and unicode_norm_hashfunc.h
+#
+# Copyright (c) 2000-2022, PostgreSQL Global Development Group
+
+use strict;
+use warnings;
+
+use FindBin;
+use lib "$FindBin::RealBin/../../tools/";
+use PerfectHash;
+
+my $output_table_file = "unicode_norm_table.h";
+my $output_func_file = "unicode_norm_hashfunc.h";
+
+my $FH;
+
+# Read list of codes that should be excluded from re-composition.
+my @composition_exclusion_codes = ();
+open($FH, '<', "CompositionExclusions.txt")
+ or die "Could not open CompositionExclusions.txt: $!.";
+while (my $line = <$FH>)
+{
+ if ($line =~ /^([[:xdigit:]]+)/)
+ {
+ push @composition_exclusion_codes, $1;
+ }
+}
+close $FH;
+
+# Read entries from UnicodeData.txt into a list and a hash table. We need
+# three fields from each row: the codepoint, the canonical combining class,
+# and the character decomposition mapping.
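+#
+# For example, from the entry
+#   00C5;LATIN CAPITAL LETTER A WITH RING ABOVE;Lu;0;L;0041 030A;;;;N;LATIN CAPITAL LETTER A RING;;;00E5;
+# we keep code 00C5, class 0, and the decomposition "0041 030A".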
+my @characters = ();
+my %character_hash = ();
+open($FH, '<', "UnicodeData.txt")
+ or die "Could not open UnicodeData.txt: $!.";
+while (my $line = <$FH>)
+{
+	# Split the line and extract the fields we need:
+	# - Unicode code value
+	# - Canonical Combining Class
+	# - Character Decomposition Mapping
+ my @elts = split(';', $line);
+ my $code = $elts[0];
+ my $class = $elts[3];
+ my $decomp = $elts[5];
+
+ # Skip codepoints above U+10FFFF. They cannot be represented in 4 bytes
+ # in UTF-8, and PostgreSQL doesn't support UTF-8 characters longer than
+ # 4 bytes. (This is just pro forma, as there aren't any such entries in
+ # the data file, currently.)
+ next if hex($code) > 0x10FFFF;
+
+ # Skip characters with no decompositions and a class of 0, to reduce the
+ # table size.
+ next if $class eq '0' && $decomp eq '';
+
+ my %char_entry = (code => $code, class => $class, decomp => $decomp);
+ push(@characters, \%char_entry);
+ $character_hash{$code} = \%char_entry;
+}
+close $FH;
+
+my $num_characters = scalar @characters;
+
+# Start writing out the output files
+open my $OT, '>', $output_table_file
+ or die "Could not open output file $output_table_file: $!\n";
+open my $OF, '>', $output_func_file
+ or die "Could not open output file $output_func_file: $!\n";
+
+print $OT <<HEADER;
+/*-------------------------------------------------------------------------
+ *
+ * unicode_norm_table.h
+ * Composition table used for Unicode normalization
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/common/unicode_norm_table.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+/*
+ * File auto-generated by src/common/unicode/generate-unicode_norm_table.pl,
+ * do not edit. There is deliberately not an #ifndef PG_UNICODE_NORM_TABLE_H
+ * here.
+ */
+typedef struct
+{
+ uint32 codepoint; /* Unicode codepoint */
+ uint8 comb_class; /* combining class of character */
+ uint8 dec_size_flags; /* size and flags of decomposition code list */
+ uint16 dec_index; /* index into UnicodeDecomp_codepoints, or the
+ * decomposition itself if DECOMP_INLINE */
+} pg_unicode_decomposition;
+
+#define DECOMP_NO_COMPOSE 0x80 /* don't use for re-composition */
+#define DECOMP_INLINE 0x40 /* decomposition is stored inline in
+ * dec_index */
+#define DECOMP_COMPAT 0x20 /* compatibility mapping */
+
+#define DECOMPOSITION_SIZE(x) ((x)->dec_size_flags & 0x1F)
+#define DECOMPOSITION_NO_COMPOSE(x) (((x)->dec_size_flags & (DECOMP_NO_COMPOSE | DECOMP_COMPAT)) != 0)
+#define DECOMPOSITION_IS_INLINE(x) (((x)->dec_size_flags & DECOMP_INLINE) != 0)
+#define DECOMPOSITION_IS_COMPAT(x) (((x)->dec_size_flags & DECOMP_COMPAT) != 0)
+
+/* Table of Unicode codepoints and their decompositions */
+static const pg_unicode_decomposition UnicodeDecompMain[$num_characters] =
+{
+HEADER
+
+print $OF <<HEADER;
+/*-------------------------------------------------------------------------
+ *
+ * unicode_norm_hashfunc.h
+ * Perfect hash functions used for Unicode normalization
+ *
+ * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/common/unicode_norm_hashfunc.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+/*
+ * File auto-generated by src/common/unicode/generate-unicode_norm_table.pl,
+ * do not edit. There is deliberately not an #ifndef PG_UNICODE_NORM_HASHFUNC_H
+ * here.
+ */
+
+#include "common/unicode_norm_table.h"
+
+/* Typedef for perfect hash functions */
+typedef int (*cp_hash_func) (const void *key);
+
+/* Information for lookups with perfect hash functions */
+typedef struct
+{
+ const pg_unicode_decomposition *decomps;
+ cp_hash_func hash;
+ int num_decomps;
+} pg_unicode_decompinfo;
+
+typedef struct
+{
+ const uint16 *inverse_lookup;
+ cp_hash_func hash;
+ int num_recomps;
+} pg_unicode_recompinfo;
+
+HEADER
+
+my $decomp_index = 0;
+my $decomp_string = "";
+my @dec_cp_packed;
+my $main_index = 0;
+my @rec_info;
+
+my $last_code = $characters[-1]->{code};
+foreach my $char (@characters)
+{
+ my $code = $char->{code};
+ my $class = $char->{class};
+ my $decomp = $char->{decomp};
+
+ # Save the code point bytes as a string in network order.
+ push @dec_cp_packed, pack('N', hex($char->{code}));
+
+	# The character decomposition mapping field in UnicodeData.txt is a list
+	# of Unicode codepoints separated by spaces. It can be prefixed with a
+	# so-called compatibility formatting tag, such as "<compat>" or "<font>".
+	# Entries with compatibility formatting tags should not be used for
+	# re-composing characters during normalization, so flag them in the table.
+	# (Which tag it is doesn't matter, only whether a tag is present.)
+ my $compat = 0;
+ if ($decomp =~ /\<.*\>/)
+ {
+ $compat = 1;
+ $decomp =~ s/\<[^][]*\>//g;
+ }
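+	# (For example, U+FB00 LATIN SMALL LIGATURE FF is listed with the
+	# mapping "<compat> 0066 0066"; the tag is stripped here and the entry
+	# will be flagged DECOMP_COMPAT.)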
+ my @decomp_elts = split(" ", $decomp);
+
+	# Size of the decomposition; it must fit in the 5-bit size field of
+	# dec_size_flags.
+	my $decomp_size = scalar(@decomp_elts);
+	die "decomposition of U+$code is too long" if $decomp_size > 0x1F;
+
+ my $first_decomp = shift @decomp_elts;
+
+ my $flags = "";
+ my $comment = "";
+
+ if ($compat)
+ {
+ $flags .= " | DECOMP_COMPAT";
+ }
+
+ if ($decomp_size == 2)
+ {
+ # Should this be used for recomposition?
+ if ( $character_hash{$first_decomp}
+ && $character_hash{$first_decomp}->{class} != 0)
+ {
+ $flags .= " | DECOMP_NO_COMPOSE";
+ $comment = "non-starter decomposition";
+ }
+ else
+ {
+ foreach my $lcode (@composition_exclusion_codes)
+ {
+ if ($lcode eq $code)
+ {
+ $flags .= " | DECOMP_NO_COMPOSE";
+ $comment = "in exclusion list";
+ last;
+ }
+ }
+ }
+
+ # Save info for recomposeable codepoints.
+ # Note that this MUST match the macro DECOMPOSITION_NO_COMPOSE in C
+ # above! See also the inverse lookup in recompose_code() found in
+ # src/common/unicode_norm.c.
+ if (!($flags =~ /DECOMP_COMPAT/ || $flags =~ /DECOMP_NO_COMPOSE/))
+ {
+ push @rec_info,
+ {
+ code => $code,
+ main_index => $main_index,
+ first => $first_decomp,
+ second => $decomp_elts[0]
+ };
+ }
+ }
+
+ if ($decomp_size == 0)
+ {
+ print $OT "\t{0x$code, $class, 0$flags, 0}";
+ }
+ elsif ($decomp_size == 1 && length($first_decomp) <= 4)
+ {
+
+ # The decomposition consists of a single codepoint, and it fits
+ # in a uint16, so we can store it "inline" in the main table.
+ $flags .= " | DECOMP_INLINE";
+ print $OT "\t{0x$code, $class, 1$flags, 0x$first_decomp}";
+ }
+ else
+ {
+ print $OT "\t{0x$code, $class, $decomp_size$flags, $decomp_index}";
+
+ # Now save the decompositions into a dedicated area that will
+ # be written afterwards. First build the entry dedicated to
+ # a sub-table with the code and decomposition.
+ $decomp_string .= ",\n" if ($decomp_string ne "");
+
+ $decomp_string .= "\t /* $decomp_index */ 0x$first_decomp";
+ foreach (@decomp_elts)
+ {
+ $decomp_string .= ", 0x$_";
+ }
+
+ $decomp_index = $decomp_index + $decomp_size;
+ }
+
+ # Print a comma after all items except the last one.
+ print $OT "," unless ($code eq $last_code);
+
+ print $OT "\t/* $comment */" if ($comment ne "");
+ print $OT "\n";
+
+ $main_index++;
+}
+print $OT "\n};\n\n";
+
+# Print the array of decomposed codes.
+print $OT <<HEADER;
+/* codepoints array */
+static const uint32 UnicodeDecomp_codepoints[$decomp_index] =
+{
+$decomp_string
+};
+HEADER
+
+# Emit the definition of the decomp hash function.
+my $dec_funcname = 'Decomp_hash_func';
+my $dec_func = PerfectHash::generate_hash_function(\@dec_cp_packed,
+ $dec_funcname, fixed_key_length => 4);
+print $OF "/* Perfect hash function for decomposition */\n";
+print $OF "static $dec_func\n";
+
+# Emit the structure that wraps the hash lookup information into
+# one variable.
+print $OF <<HEADER;
+/* Hash lookup information for decomposition */
+static const pg_unicode_decompinfo UnicodeDecompInfo =
+{
+ UnicodeDecompMain,
+ $dec_funcname,
+ $num_characters
+};
+
+HEADER
+
+# Find the lowest codepoint that decomposes to each recomposeable
+# code pair and create a mapping to it.
+my $recomp_string = "";
+my @rec_cp_packed;
+my %seenit;
+my $firstentry = 1;
+foreach my $rec (sort recomp_sort @rec_info)
+{
+ # The hash key is formed by concatenating the bytes of the two
+ # codepoints. See also recompose_code() in common/unicode_norm.c.
+ my $hashkey = (hex($rec->{first}) << 32) | hex($rec->{second});
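+	# (For example, U+0041 followed by U+0300 recomposes to U+00C0; the
+	# hash key for that pair is 0x0000004100000300.)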
+
+ # We are only interested in the lowest code point that decomposes
+ # to the given code pair.
+ next if $seenit{$hashkey};
+
+ # Save the hash key bytes in network order
+ push @rec_cp_packed, pack('Q>', $hashkey);
+
+ # Append inverse lookup element
+ $recomp_string .= ",\n" if !$firstentry;
+ $recomp_string .= sprintf "\t/* U+%s+%s -> U+%s */ %s",
+ $rec->{first},
+ $rec->{second},
+ $rec->{code},
+ $rec->{main_index};
+
+ $seenit{$hashkey} = 1;
+ $firstentry = 0;
+}
+
+# Emit the inverse lookup array containing indexes into UnicodeDecompMain.
+my $num_recomps = scalar @rec_cp_packed;
+print $OF <<HEADER;
+/* Inverse lookup array -- contains indexes into UnicodeDecompMain[] */
+static const uint16 RecompInverseLookup[$num_recomps] =
+{
+$recomp_string
+};
+
+HEADER
+
+# Emit the definition of the recomposition hash function.
+my $rec_funcname = 'Recomp_hash_func';
+my $rec_func =
+ PerfectHash::generate_hash_function(\@rec_cp_packed, $rec_funcname,
+ fixed_key_length => 8);
+print $OF "/* Perfect hash function for recomposition */\n";
+print $OF "static $rec_func\n";
+
+# Emit the structure that wraps the hash lookup information into
+# one variable.
+print $OF <<HEADER;
+/* Hash lookup information for recomposition */
+static const pg_unicode_recompinfo UnicodeRecompInfo =
+{
+ RecompInverseLookup,
+ $rec_funcname,
+ $num_recomps
+};
+HEADER
+
+close $OT;
+close $OF;
+
+sub recomp_sort
+{
+ my $a1 = hex($a->{first});
+ my $b1 = hex($b->{first});
+
+ my $a2 = hex($a->{second});
+ my $b2 = hex($b->{second});
+
+ # First sort by the first code point
+ return -1 if $a1 < $b1;
+ return 1 if $a1 > $b1;
+
+ # Then sort by the second code point
+ return -1 if $a2 < $b2;
+ return 1 if $a2 > $b2;
+
+ # Finally sort by the code point that decomposes into first and
+ # second ones.
+ my $acode = hex($a->{code});
+ my $bcode = hex($b->{code});
+
+ return -1 if $acode < $bcode;
+ return 1 if $acode > $bcode;
+
+ die "found duplicate entries of recomposeable code pairs";
+}
diff --git a/src/common/unicode/generate-unicode_normprops_table.pl b/src/common/unicode/generate-unicode_normprops_table.pl
new file mode 100644
index 0000000..08e41b3
--- /dev/null
+++ b/src/common/unicode/generate-unicode_normprops_table.pl
@@ -0,0 +1,125 @@
+#!/usr/bin/perl
+#
+# Generate table of Unicode normalization "quick check" properties
+# (see UAX #15). Pass DerivedNormalizationProps.txt as argument. The
+# output is on stdout.
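+#
+# The relevant lines in DerivedNormalizationProps.txt look like this
+# (ranges use ".."; trailing "#" comments are stripped below):
+#   0340..0341    ; NFC_QC; N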
+#
+# Copyright (c) 2020-2022, PostgreSQL Global Development Group
+
+use strict;
+use warnings;
+
+use FindBin;
+use lib "$FindBin::RealBin/../../tools/";
+use PerfectHash;
+
+my %data;
+
+print
+ "/* generated by src/common/unicode/generate-unicode_normprops_table.pl, do not edit */\n\n";
+
+print <<EOS;
+#include "common/unicode_norm.h"
+
+/*
+ * Normalization quick check entry for codepoint. We use a bit field
+ * here to save space.
+ */
+typedef struct
+{
+ unsigned int codepoint:21;
+ signed int quickcheck:4; /* really UnicodeNormalizationQC */
+} pg_unicode_normprops;
+
+/* Typedef for hash function on quick check table */
+typedef int (*qc_hash_func) (const void *key);
+
+/* Information for quick check lookup with perfect hash function */
+typedef struct
+{
+ const pg_unicode_normprops *normprops;
+ qc_hash_func hash;
+ int num_normprops;
+} pg_unicode_norminfo;
+EOS
+
+foreach my $line (<ARGV>)
+{
+ chomp $line;
+ $line =~ s/\s*#.*$//;
+ next if $line eq '';
+ my ($codepoint, $prop, $value) = split /\s*;\s*/, $line;
+ next if $prop !~ /_QC/;
+
+ my ($first, $last);
+ if ($codepoint =~ /\.\./)
+ {
+ ($first, $last) = split /\.\./, $codepoint;
+ }
+ else
+ {
+ $first = $last = $codepoint;
+ }
+
+ foreach my $cp (hex($first) .. hex($last))
+ {
+ $data{$prop}{$cp} = $value;
+ }
+}
+
+# We create a separate array for each normalization form rather than,
+# say, a two-dimensional array, because that array would be very
+# sparse and would create unnecessary overhead especially for the NFC
+# lookup.
+foreach my $prop (sort keys %data)
+{
+ # Don't build the tables for the "D" forms because they are too
+ # big. See also unicode_is_normalized_quickcheck().
+ next if $prop eq "NFD_QC" || $prop eq "NFKD_QC";
+
+ print "\n";
+ print
+ "static const pg_unicode_normprops UnicodeNormProps_${prop}[] = {\n";
+
+ my %subdata = %{ $data{$prop} };
+ my @cp_packed;
+ foreach my $cp (sort { $a <=> $b } keys %subdata)
+ {
+ my $qc;
+ if ($subdata{$cp} eq 'N')
+ {
+ $qc = 'UNICODE_NORM_QC_NO';
+ }
+ elsif ($subdata{$cp} eq 'M')
+ {
+ $qc = 'UNICODE_NORM_QC_MAYBE';
+ }
+		else
+		{
+			die "unexpected quick check value '$subdata{$cp}'";
+		}
+ printf "\t{0x%04X, %s},\n", $cp, $qc;
+
+ # Save the bytes as a string in network order.
+ push @cp_packed, pack('N', $cp);
+ }
+
+ print "};\n";
+
+ # Emit the definition of the perfect hash function.
+ my $funcname = $prop . '_hash_func';
+ my $f = PerfectHash::generate_hash_function(\@cp_packed, $funcname,
+ fixed_key_length => 4);
+ printf "\n/* Perfect hash function for %s */", $prop;
+ print "\nstatic $f\n";
+
+ # Emit the structure that wraps the hash lookup information into
+ # one variable.
+ printf "/* Hash lookup information for %s */", $prop;
+ printf "\nstatic const pg_unicode_norminfo ";
+ printf "UnicodeNormInfo_%s = {\n", $prop;
+ printf "\tUnicodeNormProps_%s,\n", $prop;
+ printf "\t%s,\n", $funcname;
+ printf "\t%d\n", scalar @cp_packed;
+ printf "};\n";
+}
diff --git a/src/common/unicode/norm_test.c b/src/common/unicode/norm_test.c
new file mode 100644
index 0000000..0e244ad
--- /dev/null
+++ b/src/common/unicode/norm_test.c
@@ -0,0 +1,86 @@
+/*-------------------------------------------------------------------------
+ * norm_test.c
+ * Program to test Unicode normalization functions.
+ *
+ * Portions Copyright (c) 2017-2022, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/common/unicode/norm_test.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres_fe.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "common/unicode_norm.h"
+
+#include "norm_test_table.h"
+
+static char *
+print_wchar_str(const pg_wchar *s)
+{
+#define BUF_DIGITS 50
+ static char buf[BUF_DIGITS * 11 + 1];
+ int i;
+ char *p;
+
+ i = 0;
+ p = buf;
+ while (*s && i < BUF_DIGITS)
+ {
+ p += sprintf(p, "U+%04X ", *s);
+ i++;
+ s++;
+ }
+ *p = '\0';
+
+ return buf;
+}
+
+static int
+pg_wcscmp(const pg_wchar *s1, const pg_wchar *s2)
+{
+ for (;;)
+ {
+ if (*s1 < *s2)
+ return -1;
+ if (*s1 > *s2)
+ return 1;
+ if (*s1 == 0)
+ return 0;
+ s1++;
+ s2++;
+ }
+}
+
+int
+main(int argc, char **argv)
+{
+ const pg_unicode_test *test;
+
+ for (test = UnicodeNormalizationTests; test->input[0] != 0; test++)
+ {
+ for (int form = 0; form < 4; form++)
+ {
+ pg_wchar *result;
+
+ result = unicode_normalize(form, test->input);
+
+ if (pg_wcscmp(test->output[form], result) != 0)
+ {
+ printf("FAILURE (NormalizationTest.txt line %d form %d):\n", test->linenum, form);
+ printf("input: %s\n", print_wchar_str(test->input));
+ printf("expected: %s\n", print_wchar_str(test->output[form]));
+ printf("got: %s\n", print_wchar_str(result));
+ printf("\n");
+ exit(1);
+ }
+ }
+ }
+
+ printf("All tests successful!\n");
+ exit(0);
+}
diff --git a/src/common/unicode_norm.c b/src/common/unicode_norm.c
new file mode 100644
index 0000000..8f1402e
--- /dev/null
+++ b/src/common/unicode_norm.c
@@ -0,0 +1,634 @@
+/*-------------------------------------------------------------------------
+ * unicode_norm.c
+ * Normalize a Unicode string
+ *
+ * This implements Unicode normalization, per the documentation at
+ * https://www.unicode.org/reports/tr15/.
+ *
+ * Portions Copyright (c) 2017-2022, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/common/unicode_norm.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef FRONTEND
+#include "postgres.h"
+#else
+#include "postgres_fe.h"
+#endif
+
+#include "common/unicode_norm.h"
+#ifndef FRONTEND
+#include "common/unicode_norm_hashfunc.h"
+#include "common/unicode_normprops_table.h"
+#include "port/pg_bswap.h"
+#else
+#include "common/unicode_norm_table.h"
+#endif
+
+#ifndef FRONTEND
+#define ALLOC(size) palloc(size)
+#define FREE(ptr) pfree(ptr)
+#else
+#define ALLOC(size) malloc(size)
+#define FREE(ptr) free(ptr)
+#endif
+
+/* Constants for calculations with Hangul characters */
+#define SBASE 0xAC00 /* U+AC00 */
+#define LBASE 0x1100 /* U+1100 */
+#define VBASE 0x1161 /* U+1161 */
+#define TBASE 0x11A7 /* U+11A7 */
+#define LCOUNT 19
+#define VCOUNT 21
+#define TCOUNT 28
+#define NCOUNT (VCOUNT * TCOUNT)
+#define SCOUNT (LCOUNT * NCOUNT)
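+
+/*
+ * Worked example (cf. UAX #15): for U+D4DB, SIndex = 0xD4DB - SBASE = 10459,
+ * so L = LBASE + 10459 / NCOUNT = U+1111, V = VBASE + (10459 % NCOUNT) /
+ * TCOUNT = U+1171, and T = TBASE + 10459 % TCOUNT = U+11B6; U+D4DB therefore
+ * decomposes to <U+1111, U+1171, U+11B6>.
+ */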
+
+#ifdef FRONTEND
+/* comparison routine for bsearch() of decomposition lookup table. */
+static int
+conv_compare(const void *p1, const void *p2)
+{
+ uint32 v1,
+ v2;
+
+ v1 = *(const uint32 *) p1;
+ v2 = ((const pg_unicode_decomposition *) p2)->codepoint;
+ return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
+}
+
+#endif
+
+/*
+ * get_code_entry
+ *
+ * Get the entry corresponding to code in the decomposition lookup table.
+ * The backend version of this code uses a perfect hash function for the
+ * lookup, while the frontend version uses a binary search.
+ */
+static const pg_unicode_decomposition *
+get_code_entry(pg_wchar code)
+{
+#ifndef FRONTEND
+ int h;
+ uint32 hashkey;
+ pg_unicode_decompinfo decompinfo = UnicodeDecompInfo;
+
+ /*
+ * Compute the hash function. The hash key is the codepoint with the bytes
+ * in network order.
+ */
+ hashkey = pg_hton32(code);
+ h = decompinfo.hash(&hashkey);
+
+ /* An out-of-range result implies no match */
+ if (h < 0 || h >= decompinfo.num_decomps)
+ return NULL;
+
+ /*
+ * Since it's a perfect hash, we need only match to the specific codepoint
+ * it identifies.
+ */
+ if (code != decompinfo.decomps[h].codepoint)
+ return NULL;
+
+ /* Success! */
+ return &decompinfo.decomps[h];
+#else
+ return bsearch(&(code),
+ UnicodeDecompMain,
+ lengthof(UnicodeDecompMain),
+ sizeof(pg_unicode_decomposition),
+ conv_compare);
+#endif
+}
+
+/*
+ * Get the combining class of the given codepoint.
+ */
+static uint8
+get_canonical_class(pg_wchar code)
+{
+ const pg_unicode_decomposition *entry = get_code_entry(code);
+
+	/*
+	 * If no entry is found, the character is either a Hangul character or a
+	 * character with a class of 0 and no decompositions.
+	 */
+ if (!entry)
+ return 0;
+ else
+ return entry->comb_class;
+}
+
+/*
+ * Given a decomposition entry looked up earlier, get the decomposed
+ * characters.
+ *
+ * Note: the returned pointer may point to a statically allocated buffer,
+ * and is only valid until the next call to this function!
+ */
+static const pg_wchar *
+get_code_decomposition(const pg_unicode_decomposition *entry, int *dec_size)
+{
+ static pg_wchar x;
+
+ if (DECOMPOSITION_IS_INLINE(entry))
+ {
+ Assert(DECOMPOSITION_SIZE(entry) == 1);
+ x = (pg_wchar) entry->dec_index;
+ *dec_size = 1;
+ return &x;
+ }
+ else
+ {
+ *dec_size = DECOMPOSITION_SIZE(entry);
+ return &UnicodeDecomp_codepoints[entry->dec_index];
+ }
+}
+
+/*
+ * Calculate how many characters a given character will decompose to.
+ *
+ * This needs to recurse if the character decomposes into characters that
+ * are, in turn, decomposable.
+ */
+static int
+get_decomposed_size(pg_wchar code, bool compat)
+{
+ const pg_unicode_decomposition *entry;
+ int size = 0;
+ int i;
+ const uint32 *decomp;
+ int dec_size;
+
+ /*
+ * Fast path for Hangul characters not stored in tables to save memory as
+ * decomposition is algorithmic. See
+ * https://www.unicode.org/reports/tr15/tr15-18.html, annex 10 for details
+ * on the matter.
+ */
+ if (code >= SBASE && code < SBASE + SCOUNT)
+ {
+ uint32 tindex,
+ sindex;
+
+ sindex = code - SBASE;
+ tindex = sindex % TCOUNT;
+
+ if (tindex != 0)
+ return 3;
+ return 2;
+ }
+
+ entry = get_code_entry(code);
+
+	/*
+	 * Just count the current code if it has no decomposition. A NULL entry
+	 * is equivalent to a character with class 0 and no decompositions.
+	 */
+ if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 ||
+ (!compat && DECOMPOSITION_IS_COMPAT(entry)))
+ return 1;
+
+	/*
+	 * If this entry has other decomposition codes, look at them as well.
+	 * First get its decomposition from the lookup tables.
+	 */
+ decomp = get_code_decomposition(entry, &dec_size);
+ for (i = 0; i < dec_size; i++)
+ {
+ uint32 lcode = decomp[i];
+
+ size += get_decomposed_size(lcode, compat);
+ }
+
+ return size;
+}
+
+/*
+ * Recompose a set of characters. For Hangul characters, the calculation
+ * is algorithmic. For others, an inverse lookup in the decomposition
+ * table is necessary. Returns true if a recomposition can be done, and
+ * false otherwise.
+ */
+static bool
+recompose_code(uint32 start, uint32 code, uint32 *result)
+{
+ /*
+ * Handle Hangul characters algorithmically, per the Unicode spec.
+ *
+ * Check if two current characters are L and V.
+ */
+ if (start >= LBASE && start < LBASE + LCOUNT &&
+ code >= VBASE && code < VBASE + VCOUNT)
+ {
+ /* make syllable of form LV */
+ uint32 lindex = start - LBASE;
+ uint32 vindex = code - VBASE;
+
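+		/* e.g. L = U+1111 and V = U+1171 yield LV syllable U+D4CC */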
+ *result = SBASE + (lindex * VCOUNT + vindex) * TCOUNT;
+ return true;
+ }
+ /* Check if two current characters are LV and T */
+ else if (start >= SBASE && start < (SBASE + SCOUNT) &&
+ ((start - SBASE) % TCOUNT) == 0 &&
+ code >= TBASE && code < (TBASE + TCOUNT))
+ {
+ /* make syllable of form LVT */
+ uint32 tindex = code - TBASE;
+
+ *result = start + tindex;
+ return true;
+ }
+ else
+ {
+ const pg_unicode_decomposition *entry;
+
+ /*
+ * Do an inverse lookup of the decomposition tables to see if anything
+ * matches. The comparison just needs to be a perfect match on the
+ * sub-table of size two, because the start character has already been
+ * recomposed partially. This lookup uses a perfect hash function for
+ * the backend code.
+ */
+#ifndef FRONTEND
+
+ int h,
+ inv_lookup_index;
+ uint64 hashkey;
+ pg_unicode_recompinfo recompinfo = UnicodeRecompInfo;
+
+ /*
+ * Compute the hash function. The hash key is formed by concatenating
+ * bytes of the two codepoints in network order. See also
+ * src/common/unicode/generate-unicode_norm_table.pl.
+ */
+ hashkey = pg_hton64(((uint64) start << 32) | (uint64) code);
+ h = recompinfo.hash(&hashkey);
+
+ /* An out-of-range result implies no match */
+ if (h < 0 || h >= recompinfo.num_recomps)
+ return false;
+
+ inv_lookup_index = recompinfo.inverse_lookup[h];
+ entry = &UnicodeDecompMain[inv_lookup_index];
+
+ if (start == UnicodeDecomp_codepoints[entry->dec_index] &&
+ code == UnicodeDecomp_codepoints[entry->dec_index + 1])
+ {
+ *result = entry->codepoint;
+ return true;
+ }
+
+#else
+
+ int i;
+
+ for (i = 0; i < lengthof(UnicodeDecompMain); i++)
+ {
+ entry = &UnicodeDecompMain[i];
+
+ if (DECOMPOSITION_SIZE(entry) != 2)
+ continue;
+
+ if (DECOMPOSITION_NO_COMPOSE(entry))
+ continue;
+
+ if (start == UnicodeDecomp_codepoints[entry->dec_index] &&
+ code == UnicodeDecomp_codepoints[entry->dec_index + 1])
+ {
+ *result = entry->codepoint;
+ return true;
+ }
+ }
+#endif /* !FRONTEND */
+ }
+
+ return false;
+}
+
+/*
+ * Decompose the given code into the array given by caller. The
+ * decomposition begins at the position given by caller, saving one
+ * lookup on the decomposition table. The current position needs to be
+ * updated here to let the caller know from where to continue filling
+ * in the array result.
+ */
+static void
+decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current)
+{
+ const pg_unicode_decomposition *entry;
+ int i;
+ const uint32 *decomp;
+ int dec_size;
+
+ /*
+ * Fast path for Hangul characters not stored in tables to save memory as
+ * decomposition is algorithmic. See
+ * https://www.unicode.org/reports/tr15/tr15-18.html, annex 10 for details
+ * on the matter.
+ */
+ if (code >= SBASE && code < SBASE + SCOUNT)
+ {
+ uint32 l,
+ v,
+ tindex,
+ sindex;
+ pg_wchar *res = *result;
+
+ sindex = code - SBASE;
+ l = LBASE + sindex / (VCOUNT * TCOUNT);
+ v = VBASE + (sindex % (VCOUNT * TCOUNT)) / TCOUNT;
+ tindex = sindex % TCOUNT;
+
+ res[*current] = l;
+ (*current)++;
+ res[*current] = v;
+ (*current)++;
+
+ if (tindex != 0)
+ {
+ res[*current] = TBASE + tindex;
+ (*current)++;
+ }
+
+ return;
+ }
+
+ entry = get_code_entry(code);
+
+	/*
+	 * Just fill in the current code if there are no decomposition codes to
+	 * recurse to. A NULL entry is equivalent to a character with class 0
+	 * and no decompositions, so it is handled the same way.
+	 */
+ if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 ||
+ (!compat && DECOMPOSITION_IS_COMPAT(entry)))
+ {
+ pg_wchar *res = *result;
+
+ res[*current] = code;
+ (*current)++;
+ return;
+ }
+
+ /*
+ * If this entry has other decomposition codes, look at them as well.
+ */
+ decomp = get_code_decomposition(entry, &dec_size);
+ for (i = 0; i < dec_size; i++)
+ {
+ pg_wchar lcode = (pg_wchar) decomp[i];
+
+		/* Recurse; the recursion ends when a code has no further decomposition */
+ decompose_code(lcode, compat, result, current);
+ }
+}
+
+/*
+ * unicode_normalize - Normalize a Unicode string to the specified form.
+ *
+ * The input is a 0-terminated array of codepoints.
+ *
+ * In the frontend, returns a 0-terminated array of codepoints allocated
+ * with malloc, or NULL if we run out of memory. In the backend, the
+ * returned string is palloc'd instead, and OOM is reported with ereport().
+ */
+pg_wchar *
+unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
+{
+ bool compat = (form == UNICODE_NFKC || form == UNICODE_NFKD);
+ bool recompose = (form == UNICODE_NFC || form == UNICODE_NFKC);
+ pg_wchar *decomp_chars;
+ pg_wchar *recomp_chars;
+ int decomp_size,
+ current_size;
+ int count;
+ const pg_wchar *p;
+
+ /* variables for recomposition */
+ int last_class;
+ int starter_pos;
+ int target_pos;
+ uint32 starter_ch;
+
+ /* First, do character decomposition */
+
+ /*
+ * Calculate how many characters long the decomposed version will be.
+ */
+ decomp_size = 0;
+ for (p = input; *p; p++)
+ decomp_size += get_decomposed_size(*p, compat);
+
+ decomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
+ if (decomp_chars == NULL)
+ return NULL;
+
+ /*
+ * Now fill in each entry recursively. This needs a second pass on the
+ * decomposition table.
+ */
+ current_size = 0;
+ for (p = input; *p; p++)
+ decompose_code(*p, compat, &decomp_chars, &current_size);
+ decomp_chars[decomp_size] = '\0';
+ Assert(decomp_size == current_size);
+
+ /* Leave if there is nothing to decompose */
+ if (decomp_size == 0)
+ return decomp_chars;
+
+ /*
+ * Now apply canonical ordering.
+ */
+ for (count = 1; count < decomp_size; count++)
+ {
+ pg_wchar prev = decomp_chars[count - 1];
+ pg_wchar next = decomp_chars[count];
+ pg_wchar tmp;
+ const uint8 prevClass = get_canonical_class(prev);
+ const uint8 nextClass = get_canonical_class(next);
+
+ /*
+ * Per Unicode (https://www.unicode.org/reports/tr15/tr15-18.html)
+ * annex 4, a sequence of two adjacent characters in a string is an
+ * exchangeable pair if the combining class (from the Unicode
+ * Character Database) for the first character is greater than the
+ * combining class for the second, and the second is not a starter. A
+ * character is a starter if its combining class is 0.
+ */
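+		/*
+		 * For example, U+0301 (class 230) followed by U+0323 (class 220) is
+		 * an exchangeable pair, and is reordered to <U+0323, U+0301>.
+		 */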
+ if (prevClass == 0 || nextClass == 0)
+ continue;
+
+ if (prevClass <= nextClass)
+ continue;
+
+ /* exchange can happen */
+ tmp = decomp_chars[count - 1];
+ decomp_chars[count - 1] = decomp_chars[count];
+ decomp_chars[count] = tmp;
+
+ /* backtrack to check again */
+ if (count > 1)
+ count -= 2;
+ }
+
+ if (!recompose)
+ return decomp_chars;
+
+ /*
+ * The last phase of NFC and NFKC is the recomposition of the reordered
+ * Unicode string using combining classes. The recomposed string cannot be
+ * longer than the decomposed one, so make the allocation of the output
+ * string based on that assumption.
+ */
+ recomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
+ if (!recomp_chars)
+ {
+ FREE(decomp_chars);
+ return NULL;
+ }
+
+ last_class = -1; /* this eliminates a special check */
+ starter_pos = 0;
+ target_pos = 1;
+ starter_ch = recomp_chars[0] = decomp_chars[0];
+
+ for (count = 1; count < decomp_size; count++)
+ {
+ pg_wchar ch = decomp_chars[count];
+ int ch_class = get_canonical_class(ch);
+ pg_wchar composite;
+
+ if (last_class < ch_class &&
+ recompose_code(starter_ch, ch, &composite))
+ {
+ recomp_chars[starter_pos] = composite;
+ starter_ch = composite;
+ }
+ else if (ch_class == 0)
+ {
+ starter_pos = target_pos;
+ starter_ch = ch;
+ last_class = -1;
+ recomp_chars[target_pos++] = ch;
+ }
+ else
+ {
+ last_class = ch_class;
+ recomp_chars[target_pos++] = ch;
+ }
+ }
+ recomp_chars[target_pos] = (pg_wchar) '\0';
+
+ FREE(decomp_chars);
+
+ return recomp_chars;
+}
+
+/*
+ * Normalization "quick check" algorithm; see
+ * <https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms>
+ */
+
+/* We only need this in the backend. */
+#ifndef FRONTEND
+
+static const pg_unicode_normprops *
+qc_hash_lookup(pg_wchar ch, const pg_unicode_norminfo *norminfo)
+{
+ int h;
+ uint32 hashkey;
+
+ /*
+ * Compute the hash function. The hash key is the codepoint with the bytes
+ * in network order.
+ */
+ hashkey = pg_hton32(ch);
+ h = norminfo->hash(&hashkey);
+
+ /* An out-of-range result implies no match */
+ if (h < 0 || h >= norminfo->num_normprops)
+ return NULL;
+
+ /*
+ * Since it's a perfect hash, we need only match to the specific codepoint
+ * it identifies.
+ */
+ if (ch != norminfo->normprops[h].codepoint)
+ return NULL;
+
+ /* Success! */
+ return &norminfo->normprops[h];
+}
+
+/*
+ * Look up the normalization quick check character property
+ */
+static UnicodeNormalizationQC
+qc_is_allowed(UnicodeNormalizationForm form, pg_wchar ch)
+{
+ const pg_unicode_normprops *found = NULL;
+
+ switch (form)
+ {
+ case UNICODE_NFC:
+ found = qc_hash_lookup(ch, &UnicodeNormInfo_NFC_QC);
+ break;
+ case UNICODE_NFKC:
+ found = qc_hash_lookup(ch, &UnicodeNormInfo_NFKC_QC);
+ break;
+ default:
+ Assert(false);
+ break;
+ }
+
+ if (found)
+ return found->quickcheck;
+ else
+ return UNICODE_NORM_QC_YES;
+}
+
+UnicodeNormalizationQC
+unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const pg_wchar *input)
+{
+ uint8 lastCanonicalClass = 0;
+ UnicodeNormalizationQC result = UNICODE_NORM_QC_YES;
+
+ /*
+ * For the "D" forms, we don't run the quickcheck. We don't include the
+ * lookup tables for those because they are huge, checking for these
+ * particular forms is less common, and running the slow path is faster
+ * for the "D" forms than the "C" forms because you don't need to
+ * recompose, which is slow.
+ */
+ if (form == UNICODE_NFD || form == UNICODE_NFKD)
+ return UNICODE_NORM_QC_MAYBE;
+
+ for (const pg_wchar *p = input; *p; p++)
+ {
+ pg_wchar ch = *p;
+ uint8 canonicalClass;
+ UnicodeNormalizationQC check;
+
+ canonicalClass = get_canonical_class(ch);
+ if (lastCanonicalClass > canonicalClass && canonicalClass != 0)
+ return UNICODE_NORM_QC_NO;
+
+ check = qc_is_allowed(form, ch);
+ if (check == UNICODE_NORM_QC_NO)
+ return UNICODE_NORM_QC_NO;
+ else if (check == UNICODE_NORM_QC_MAYBE)
+ result = UNICODE_NORM_QC_MAYBE;
+
+ lastCanonicalClass = canonicalClass;
+ }
+ return result;
+}
+
+#endif /* !FRONTEND */