#!/usr/bin/perl
#
# Generate a composition table and its lookup utilities, using Unicode data
# files as input.
#
# Input: UnicodeData.txt and CompositionExclusions.txt
# Output: unicode_norm_table.h and unicode_norm_hashfunc.h
#
# Copyright (c) 2000-2021, PostgreSQL Global Development Group
#
# NOTE(review): the copy of this script under review had lost its line
# breaks and the bodies of its here-documents.  The statement structure
# below follows the surviving text; every span that had to be restored is
# marked with a NOTE(review) comment and should be compared against the
# headers this script is expected to generate before it is relied upon.

use strict;
use warnings;

use FindBin;
use lib "$FindBin::RealBin/../../tools/";
use PerfectHash;

my $output_table_file = "unicode_norm_table.h";
my $output_func_file  = "unicode_norm_hashfunc.h";

my $FH;

# Read the list of codes that should be excluded from re-composition.
# They are kept as hash keys so that membership can be tested directly
# instead of scanning a list for every candidate character.
my %composition_exclusion = ();
open($FH, '<', "CompositionExclusions.txt")
  or die "Could not open CompositionExclusions.txt: $!.";
while (my $line = <$FH>)
{
	if ($line =~ /^([[:xdigit:]]+)/)
	{
		$composition_exclusion{$1} = 1;
	}
}
close $FH;

# Read entries from UnicodeData.txt into a list, and a hash table. We need
# three fields from each row: the codepoint, canonical combining class,
# and character decomposition mapping
my @characters     = ();
my %character_hash = ();
open($FH, '<', "UnicodeData.txt")
  or die "Could not open UnicodeData.txt: $!.";
while (my $line = <$FH>)
{

	# Split the line and get the fields needed:
	# - Unicode code value
	# - Canonical Combining Class
	# - Character Decomposition Mapping
	my @elts   = split(';', $line);
	my $code   = $elts[0];
	my $class  = $elts[3];
	my $decomp = $elts[5];

	# Skip codepoints above U+10FFFF. They cannot be represented in 4 bytes
	# in UTF-8, and PostgreSQL doesn't support UTF-8 characters longer than
	# 4 bytes. (This is just pro forma, as there aren't any such entries in
	# the data file, currently.)
	next if hex($code) > 0x10FFFF;

	# Skip characters with no decompositions and a class of 0, to reduce the
	# table size.
	next if $class eq '0' && $decomp eq '';

	my %char_entry = (code => $code, class => $class, decomp => $decomp);
	push(@characters, \%char_entry);
	$character_hash{$code} = \%char_entry;
}
close $FH;

my $num_characters = scalar @characters;

# Start writing out the output files
open my $OT, '>', $output_table_file
  or die "Could not open output file $output_table_file: $!\n";
open my $OF, '>', $output_func_file
  or die "Could not open output file $output_func_file: $!\n";

# NOTE(review): everything in this here-document above the
# DECOMPOSITION_SIZE macro was missing from the reviewed copy and has been
# restored; verify it against the expected unicode_norm_table.h.
print $OT <<HEADER;
/*-------------------------------------------------------------------------
 *
 * unicode_norm_table.h
 *	  Composition table used for Unicode normalization
 *
 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/include/common/unicode_norm_table.h
 *
 *-------------------------------------------------------------------------
 */

/*
 * File auto-generated by src/common/unicode/generate-unicode_norm_table.pl,
 * do not edit. There is deliberately not an #ifndef PG_UNICODE_NORM_TABLE_H
 * here.
 */
typedef struct
{
	uint32		codepoint;		/* Unicode codepoint */
	uint8		comb_class;		/* combining class of character */
	uint8		dec_size_flags; /* size and flags of decomposition code list */
	uint16		dec_index;		/* index into UnicodeDecomp_codepoints, or the
								 * decomposition itself if DECOMP_INLINE */
} pg_unicode_decomposition;

#define DECOMP_NO_COMPOSE	0x80	/* don't use for re-composition */
#define DECOMP_INLINE		0x40	/* decomposition is stored inline in
									 * dec_index */
#define DECOMP_COMPAT		0x20	/* compatibility mapping */

#define DECOMPOSITION_SIZE(x) ((x)->dec_size_flags & 0x1F)
#define DECOMPOSITION_NO_COMPOSE(x) (((x)->dec_size_flags & (DECOMP_NO_COMPOSE | DECOMP_COMPAT)) != 0)
#define DECOMPOSITION_IS_INLINE(x) (((x)->dec_size_flags & DECOMP_INLINE) != 0)
#define DECOMPOSITION_IS_COMPAT(x) (((x)->dec_size_flags & DECOMP_COMPAT) != 0)

/* Table of Unicode codepoints and their decompositions */
static const pg_unicode_decomposition UnicodeDecompMain[$num_characters] =
{
HEADER

# NOTE(review): the whole body of this here-document was missing from the
# reviewed copy and has been restored; verify it against the expected
# unicode_norm_hashfunc.h.
print $OF <<HEADER;
/*-------------------------------------------------------------------------
 *
 * unicode_norm_hashfunc.h
 *	  Perfect hash functions used for Unicode normalization
 *
 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/include/common/unicode_norm_hashfunc.h
 *
 *-------------------------------------------------------------------------
 */

/*
 * File auto-generated by src/common/unicode/generate-unicode_norm_table.pl,
 * do not edit. There is deliberately not an #ifndef PG_UNICODE_NORM_HASHFUNC_H
 * here.
 */

#include "common/unicode_norm_table.h"

/* Typedef for perfect hash functions */
typedef int (*cp_hash_func) (const void *key);

/* Information for lookups with perfect hash functions */
typedef struct
{
	const pg_unicode_decomposition *decomps;
	cp_hash_func hash;
	int			num_decomps;
} pg_unicode_decompinfo;

typedef struct
{
	const uint16 *inverse_lookup;
	cp_hash_func hash;
	int			num_recomps;
} pg_unicode_recompinfo;

HEADER

# NOTE(review): these declarations were missing from the reviewed copy.
# They are restored from how the variables are used further down; the
# initial values are the natural ones but were not visible.
my $decomp_index  = 0;     # next free slot in UnicodeDecomp_codepoints
my $decomp_string = "";    # accumulated body of UnicodeDecomp_codepoints
my @dec_cp_packed;         # packed codepoints, keys of the decomp hash
my $main_index = 0;        # index of the current entry in UnicodeDecompMain
my @rec_info;              # recomposeable entries, for the inverse lookup

my $last_code = $characters[-1]->{code};
foreach my $char (@characters)
{
	my $code   = $char->{code};
	my $class  = $char->{class};
	my $decomp = $char->{decomp};

	# Save the code point bytes as a string in network order.
	push @dec_cp_packed, pack('N', hex($char->{code}));

	# The character decomposition mapping field in UnicodeData.txt is a list
	# of unicode codepoints, separated by space. But it can be prefixed with
	# so-called compatibility formatting tag, like "<compat>", or "<font>".
	# The entries with compatibility formatting tags should not be used for
	# re-composing characters during normalization, so flag them in the table.
	# (The tag doesn't matter, only whether there is a tag or not)
	my $compat = 0;
	if ($decomp =~ /<[^>]*>/)
	{
		$compat = 1;

		# Strip each tag individually; the pattern stops at the first '>'
		# so it cannot swallow codepoints that follow the tag.
		$decomp =~ s/<[^>]*>//g;
	}
	my @decomp_elts = split(" ", $decomp);

	# Decomposition size.  It shares a byte with the flag bits, so it must
	# fit in the low five bits.
	my $decomp_size = scalar(@decomp_elts);
	die "decomposition of U+$code has $decomp_size code points, "
	  . "which does not fit in the 5-bit size field\n"
	  if $decomp_size > 0x1F;

	my $first_decomp = shift @decomp_elts;

	my $flags   = "";
	my $comment = "";

	if ($compat)
	{
		$flags .= " | DECOMP_COMPAT";
	}

	if ($decomp_size == 2)
	{

		# Should this be used for recomposition?
		if (   $character_hash{$first_decomp}
			&& $character_hash{$first_decomp}->{class} != 0)
		{
			$flags .= " | DECOMP_NO_COMPOSE";
			$comment = "non-starter decomposition";
		}
		elsif ($composition_exclusion{$code})
		{
			$flags .= " | DECOMP_NO_COMPOSE";
			$comment = "in exclusion list";
		}

		# Save info for recomposeable codepoints.
		# Note that this MUST match the macro DECOMPOSITION_NO_COMPOSE in C
		# above!  See also the inverse lookup in recompose_code() found in
		# src/common/unicode_norm.c.
		if (!($flags =~ /DECOMP_COMPAT/ || $flags =~ /DECOMP_NO_COMPOSE/))
		{
			push @rec_info,
			  {
				code       => $code,
				main_index => $main_index,
				first      => $first_decomp,
				second     => $decomp_elts[0]
			  };
		}
	}

	if ($decomp_size == 0)
	{
		print $OT "\t{0x$code, $class, 0$flags, 0}";
	}
	elsif ($decomp_size == 1 && length($first_decomp) <= 4)
	{

		# The decomposition consists of a single codepoint, and it fits
		# in a uint16, so we can store it "inline" in the main table.
		$flags .= " | DECOMP_INLINE";
		print $OT "\t{0x$code, $class, 1$flags, 0x$first_decomp}";
	}
	else
	{
		print $OT
		  "\t{0x$code, $class, $decomp_size$flags, $decomp_index}";

		# Now save the decompositions into a dedicated area that will
		# be written afterwards.  First build the entry dedicated to
		# a sub-table with the code and decomposition.
		$decomp_string .= ",\n" if ($decomp_string ne "");

		$decomp_string .= "\t /* $decomp_index */ 0x$first_decomp";
		foreach (@decomp_elts)
		{
			$decomp_string .= ", 0x$_";
		}

		$decomp_index = $decomp_index + $decomp_size;
	}

	# Print a comma after all items except the last one.
	print $OT "," unless ($code eq $last_code);

	print $OT "\t/* $comment */" if ($comment ne "");
	print $OT "\n";

	$main_index++;
}
print $OT "\n};\n\n";

# Print the array of decomposed codes.
# NOTE(review): the body of this here-document was missing from the
# reviewed copy and has been restored.
print $OT <<HEADER;
/* codepoints array  */
static const uint32 UnicodeDecomp_codepoints[$decomp_index] =
{
$decomp_string
};
HEADER

# Emit the definition of the decomp hash function.
# NOTE(review): the function-name variable and the leading arguments of this
# call were missing from the reviewed copy; only "fixed_key_length => 4"
# survived.  Confirm the name against the C code that uses the header.
my $dec_funcname = 'Decomp_hash_func';
my $dec_func =
  PerfectHash::generate_hash_function(\@dec_cp_packed, $dec_funcname,
	fixed_key_length => 4);
print $OF "/* Perfect hash function for decomposition */\n";
print $OF "static $dec_func\n";

# Emit the structure that wraps the hash lookup information into
# one variable.
# NOTE(review): restored here-document body.
print $OF <<HEADER;
/* Hash lookup information for decomposition */
static const pg_unicode_decompinfo UnicodeDecompInfo =
{
	UnicodeDecompMain,
	$dec_funcname,
	$num_characters
};

HEADER

# Find the lowest codepoint that decomposes to each recomposeable
# code pair and create a mapping to it.
# NOTE(review): the declarations and the loop header below were missing from
# the reviewed copy; they are restored from the surviving loop body.
my $recomp_string = "";
my @rec_cp_packed;
my %seenit;
my $firstentry = 1;
foreach my $rec (sort recomp_sort @rec_info)
{

	# The hash key is formed by concatenating the bytes of the two
	# codepoints.  See also recompose_code() in common/unicode_norm.c.
	my $hashkey = (hex($rec->{first}) << 32) | hex($rec->{second});

	# We are only interested in the lowest code point that decomposes
	# to the given code pair.
	next if $seenit{$hashkey};

	# Save the hash key bytes in network order
	push @rec_cp_packed, pack('Q>', $hashkey);

	# Append inverse lookup element
	$recomp_string .= ",\n" if !$firstentry;
	$recomp_string .= sprintf "\t/* U+%s+%s -> U+%s */ %s",
	  $rec->{first},
	  $rec->{second},
	  $rec->{code},
	  $rec->{main_index};

	$seenit{$hashkey} = 1;
	$firstentry = 0;
}

# Emit the inverse lookup array containing indexes into UnicodeDecompMain.
my $num_recomps = scalar @rec_cp_packed;

# NOTE(review): restored here-document body.
print $OF <<HEADER;
/* Inverse lookup array -- contains indexes into UnicodeDecompMain[] */
static const uint16 RecompInverseLookup[$num_recomps] =
{
$recomp_string
};

HEADER

# Emit the definition of the recomposition hash function.
# NOTE(review): as for the decomposition hash, only "fixed_key_length => 8"
# survived in the reviewed copy; the rest of the call is restored.
my $rec_funcname = 'Recomp_hash_func';
my $rec_func =
  PerfectHash::generate_hash_function(\@rec_cp_packed, $rec_funcname,
	fixed_key_length => 8);
print $OF "/* Perfect hash function for recomposition */\n";
print $OF "static $rec_func\n";

# Emit the structure that wraps the hash lookup information into
# one variable.
# NOTE(review): restored here-document body.
print $OF <<HEADER;
/* Hash lookup information for recomposition */
static const pg_unicode_recompinfo UnicodeRecompInfo =
{
	RecompInverseLookup,
	$rec_funcname,
	$num_recomps
};
HEADER

# Buffered write errors only show up when the handle is closed, so check.
close $OT
  or die "Could not close output file $output_table_file: $!\n";
close $OF
  or die "Could not close output file $output_func_file: $!\n";

# Comparison routine for sort: orders recomposeable entries by their first
# decomposition codepoint, then by the second, then by the codepoint that
# decomposes into the pair.  Dies if two entries compare equal on all three.
# NOTE(review): the "sub" line was missing from the reviewed copy; the name
# is restored and matches its use in the sort call above.
sub recomp_sort
{
	my $a1 = hex($a->{first});
	my $b1 = hex($b->{first});

	my $a2 = hex($a->{second});
	my $b2 = hex($b->{second});

	# First sort by the first code point
	return -1 if $a1 < $b1;
	return 1  if $a1 > $b1;

	# Then sort by the second code point
	return -1 if $a2 < $b2;
	return 1  if $a2 > $b2;

	# Finally sort by the code point that decomposes into first and
	# second ones.
	my $acode = hex($a->{code});
	my $bcode = hex($b->{code});

	return -1 if $acode < $bcode;
	return 1  if $acode > $bcode;

	die "found duplicate entries of recomposeable code pairs";
}