#!/usr/bin/perl
#
# Read Unicode consortium's normalization test suite, NormalizationTest.txt,
# and generate a C array from it, for norm_test.c.
#
# NormalizationTest.txt is part of the Unicode Character Database.
#
# Usage: <this script> INPUT_FILE OUTPUT_FILE
#
# Each data line of the input carries five ';'-separated fields (source, NFC,
# NFD, NFKC, NFKD), each a space-separated list of hexadecimal code points.
# One C initializer row is written per data line, followed by an all-zero
# terminator row.
#
# Copyright (c) 2000-2020, PostgreSQL Global Development Group

use strict;
use warnings;

use File::Basename;

die "Usage: $0 INPUT_FILE OUTPUT_FILE\n" if @ARGV != 2;
my $input_file = $ARGV[0];
my $output_file = $ARGV[1];
my $output_base = basename($output_file);

# Open the input and output files
open my $INPUT, '<', $input_file
  or die "Could not open input file $input_file: $!";
open my $OUTPUT, '>', $output_file
  or die "Could not open output file $output_file: $!\n";

# Print header of output file.
#
# NOTE(review): the text of this header (and the helper routine and loop
# header below) was unreadable in the reviewed copy of this script and has
# been reconstructed.  The struct layout follows the rows printed by the
# main loop (a line number, one input array, four output arrays); the type
# name, the array name and the array sizes are assumptions -- confirm all
# of them against norm_test.c before committing.
print $OUTPUT <<HEADER;
/*-------------------------------------------------------------------------
 *
 * $output_base
 *	  Test strings for Unicode normalization.
 *
 * File auto-generated from NormalizationTest.txt, do not edit.
 *
 *-------------------------------------------------------------------------
 */

typedef struct
{
	int			linenum;
	pg_wchar	input[50];
	pg_wchar	output[4][50];
} pg_unicode_test;

/* test table */
HEADER
print $OUTPUT
  "static const pg_unicode_test UnicodeNormalizationTests[] =\n{\n";

# Helper routine to convert a space-separated list of Unicode code points
# (hexadecimal, without prefix) to a comma-separated list of C hexadecimal
# literals, terminated by a 0 so the C array is null-terminated.
#
# NOTE(review): reconstructed from its call sites -- confirm against the
# upstream definition.
sub codepoint_string_to_hex
{
	my $codepoint_string = shift;

	# split(' ', ...) discards leading/trailing whitespace, so an empty
	# field simply yields the bare terminator.
	my @codepoints = split(' ', $codepoint_string);

	return join('', map { "0x$_, " } @codepoints) . '0';
}

# Process the input file line by line
my $linenum = 0;
while (my $line = <$INPUT>)
{
	$linenum++;

	next if $line =~ /^\s*#/;     # ignore comments
	next if $line =~ /^@/;        # ignore @Part0 like headers

	# A blank line would otherwise produce a row of empty arrays, which is
	# indistinguishable from the terminator entry emitted at the end.
	next if $line =~ /^\s*$/;

	# Split the line and get the fields needed:
	#
	# source; NFC; NFD; NFKC; NFKD
	#
	# Anything after the fifth field (the trailing comment) is ignored.
	my ($source, $nfc, $nfd, $nfkc, $nfkd) = split(';', $line);

	die "$input_file:$linenum: expected at least 5 ';'-separated fields\n"
	  if !defined $nfkd;

	my $source_utf8 = codepoint_string_to_hex($source);
	my $nfc_utf8 = codepoint_string_to_hex($nfc);
	my $nfd_utf8 = codepoint_string_to_hex($nfd);
	my $nfkc_utf8 = codepoint_string_to_hex($nfkc);
	my $nfkd_utf8 = codepoint_string_to_hex($nfkd);

	print $OUTPUT
	  "\t{ $linenum, { $source_utf8 }, { { $nfc_utf8 }, { $nfd_utf8 }, { $nfkc_utf8 }, { $nfkd_utf8 } } },\n";
}

# Output terminator entry
print $OUTPUT "\t{ 0, { 0 }, { { 0 }, { 0 }, { 0 }, { 0 } } }";
print $OUTPUT "\n};\n";

# Buffered write errors (e.g. disk full) only surface at close time, so the
# output handle's close must be checked.
close $OUTPUT
  or die "Could not close output file $output_file: $!\n";
close $INPUT;