diff options
Diffstat (limited to 'src/common/unicode/generate-norm_test_table.pl')
-rw-r--r-- | src/common/unicode/generate-norm_test_table.pl | 106 |
1 files changed, 106 insertions, 0 deletions
#!/usr/bin/perl
#
# Read Unicode consortium's normalization test suite, NormalizationTest.txt,
# and generate a C array from it, for norm_test.c.
#
# NormalizationTest.txt is part of the Unicode Character Database.
#
# Usage: generate-norm_test_table.pl INPUT_FILE OUTPUT_FILE
#
# Copyright (c) 2000-2022, PostgreSQL Global Development Group

use strict;
use warnings;

use File::Basename;

die "Usage: $0 INPUT_FILE OUTPUT_FILE\n" if @ARGV != 2;
my ($input_file, $output_file) = @ARGV;

# Open the input and output files.  Both messages end in a newline so that
# Perl reports them verbatim, without appending "at <script> line <n>".
open my $INPUT, '<', $input_file
  or die "Could not open input file $input_file: $!\n";
open my $OUTPUT, '>', $output_file
  or die "Could not open output file $output_file: $!\n";

# Print header of output file.  The heredoc is single-quoted because the
# text is emitted verbatim; nothing in it is meant to be interpolated.
print $OUTPUT <<'HEADER';
/*-------------------------------------------------------------------------
 *
 * norm_test_table.h
 *	  Test strings for Unicode normalization.
 *
 * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/common/unicode/norm_test_table.h
 *
 *-------------------------------------------------------------------------
 */

/*
 * File auto-generated by src/common/unicode/generate-norm_test_table.pl, do
 * not edit. There is deliberately not an #ifndef PG_NORM_TEST_TABLE_H
 * here.
 */

typedef struct
{
	int			linenum;
	pg_wchar	input[50];
	pg_wchar	output[4][50];
} pg_unicode_test;

/* test table */
HEADER
print $OUTPUT
  "static const pg_unicode_test UnicodeNormalizationTests[] =\n{\n";

# Helper routine to convert a space-separated list of Unicode characters to
# hexadecimal list format, suitable for outputting in a C array.
# Example: "0044 0307" becomes "0x0044, 0x0307, 0".  The trailing 0
# null-terminates the C array, so an empty field yields just "0".
sub codepoint_string_to_hex
{
	my $codepoint_string = shift;

	# split(' ', ...) skips leading whitespace and drops trailing empty
	# fields, so the padding around each field needs no trimming first.
	my @elements = map { "0x$_" } split(' ', $codepoint_string);
	push @elements, '0';    # null-terminate the array

	return join(', ', @elements);
}

# Process the input file line by line
my $linenum = 0;
while (my $line = <$INPUT>)
{
	# Count every physical line, including the ones skipped below, so the
	# number stored in the table is the line's position in the input file.
	$linenum++;

	next if $line =~ /^\s*#/;    # ignore comments
	next if $line =~ /^@/;       # ignore @Part0 like headers

	# Ignore blank lines; they carry no test case and would otherwise be
	# emitted as an entry whose arrays are all empty.
	next if $line !~ /\S/;

	# Split the line and pick out the fields we need:
	#
	# source; NFC; NFD; NFKC; NFKD
	#
	# Anything after the fifth field (a trailing comment) is discarded.
	my ($source, $nfc, $nfd, $nfkc, $nfkd) = split(';', $line);
	die "Malformed line $linenum in $input_file: expected 5 fields\n"
	  if !defined $nfkd;

	my $source_utf8 = codepoint_string_to_hex($source);
	my $nfc_utf8 = codepoint_string_to_hex($nfc);
	my $nfd_utf8 = codepoint_string_to_hex($nfd);
	my $nfkc_utf8 = codepoint_string_to_hex($nfkc);
	my $nfkd_utf8 = codepoint_string_to_hex($nfkd);

	print $OUTPUT
	  "\t{ $linenum, { $source_utf8 }, { { $nfc_utf8 }, { $nfd_utf8 }, { $nfkc_utf8 }, { $nfkd_utf8 } } },\n";
}

# Output terminator entry
print $OUTPUT "\t{ 0, { 0 }, { { 0 }, { 0 }, { 0 }, { 0 } } }";
print $OUTPUT "\n};\n";

# Buffered write errors (e.g. a full disk) only surface when the handle is
# closed, so a failure here means the generated file is incomplete.
close $OUTPUT
  or die "Could not close output file $output_file: $!\n";
close $INPUT;