#!/usr/bin/perl
#
# Generate a sorted list of non-overlapping intervals of East Asian Wide (W)
# and East Asian Fullwidth (F) characters, using Unicode data files as input.
# Pass EastAsianWidth.txt as argument.  The output is on stdout.
#
# Each data line is expected to look like "XXXX;W" or "XXXX..YYYY;W",
# optionally followed by a "#" comment.  Adjacent W/F entries are merged
# into a single {first, last} interval in the emitted C array.
#
# Copyright (c) 2019-2022, PostgreSQL Global Development Group

use strict;
use warnings;

# First code point of the W/F interval currently being accumulated, or undef
# when no interval is open.
my $range_start = undef;

# Bounds of the entry on the current input line.
my ($first, $last);

# Upper bound of the most recently parsed entry; used both to detect gaps
# between consecutive W/F entries and as the end of the interval when it is
# emitted.
my $prev_last;

print
  "/* generated by src/common/unicode/generate-unicode_east_asian_fw_table.pl, do not edit */\n\n";

print "static const struct mbinterval east_asian_fw[] = {\n";

# Read the file(s) named on the command line (or stdin if none) one line at
# a time.
while (my $line = <ARGV>)
{
    chomp $line;

    # Strip trailing comments and surrounding whitespace; skip lines that
    # carry no data.
    $line =~ s/\s*#.*$//;
    $line =~ s/^\s+|\s+$//g;
    next if $line eq '';

    # Tolerate optional whitespace around the field separator so that padded
    # fields still compare equal to 'F'/'W' and parse cleanly with hex().
    my ($codepoint, $width) = split /\s*;\s*/, $line;

    if ($codepoint =~ /\.\./)
    {
        ($first, $last) = split /\.\./, $codepoint;
    }
    else
    {
        $first = $last = $codepoint;
    }

    ($first, $last) = map(hex, ($first, $last));

    if ($width eq 'F' || $width eq 'W')
    {
        # fullwidth/wide characters
        if (!defined($range_start))
        {
            # save for start of range if one hasn't been started yet
            $range_start = $first;
        }
        elsif ($first != $prev_last + 1)
        {
            # ranges aren't contiguous; emit the last and start a new one
            printf "\t{0x%04X, 0x%04X},\n", $range_start, $prev_last;
            $range_start = $first;
        }
    }
    else
    {
        # not wide characters, print out previous range if any
        if (defined($range_start))
        {
            printf "\t{0x%04X, 0x%04X},\n", $range_start, $prev_last;
            $range_start = undef;
        }
    }
}
continue
{
    # Runs after every iteration, including those ended early by "next",
    # so $prev_last always tracks the last parsed entry's upper bound.
    $prev_last = $last;
}

# don't forget any ranges at the very end of the database (though there are none
# as of Unicode 13.0)
if (defined($range_start))
{
    printf "\t{0x%04X, 0x%04X},\n", $range_start, $prev_last;
}

print "};\n";