src/common/unicode/generate-unicode_east_asian_fw_table.pl


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76

#!/usr/bin/perl
#
# Generate a sorted list of non-overlapping intervals of East Asian Wide (W)
# and East Asian Fullwidth (F) characters, using Unicode data files as input.
# Pass EastAsianWidth.txt as argument.  The output is on stdout.
#
# Copyright (c) 2019-2022, PostgreSQL Global Development Group

use strict;
use warnings;

# Start of the wide/fullwidth range currently being accumulated, or undef
# when no range is open.
my $range_start = undef;

# First and last codepoint of the current input line.  These are declared
# outside the loop so that $last is still visible in the continue block.
my ($first, $last);

# Last codepoint of the previously processed input line.
my $prev_last;

print
  "/* generated by src/common/unicode/generate-unicode_east_asian_fw_table.pl, do not edit */\n\n";

print "static const struct mbinterval east_asian_fw[] = {\n";

# Read the input one line at a time rather than slurping it into a list.
while (my $line = <ARGV>)
{
	chomp $line;

	# Strip trailing comments, then leading/trailing whitespace (this also
	# removes a stray CR from CRLF line endings), and skip empty lines.
	$line =~ s/\s*#.*$//;
	$line =~ s/^\s+|\s+$//g;
	next if $line eq '';

	# Data lines look like "XXXX;W" or "XXXX..YYYY;W".  Whitespace around
	# the field separator is not significant, so accept it; otherwise a
	# padded width field such as " W" would never match below and the
	# range would be silently lost.
	my ($codepoint, $width) = split /\s*;\s*/, $line;

	# Fail loudly on anything we don't understand instead of quietly
	# generating an incomplete table.
	die sprintf("could not parse line %d: \"%s\"\n", $., $line)
	  unless defined($width)
	  && $codepoint =~ /^[0-9A-Fa-f]+(?:\.\.[0-9A-Fa-f]+)?$/;

	if ($codepoint =~ /\.\./)
	{
		($first, $last) = split /\.\./, $codepoint;
	}
	else
	{
		$first = $last = $codepoint;
	}

	($first, $last) = map(hex, ($first, $last));

	if ($width eq 'F' || $width eq 'W')
	{
		# fullwidth/wide characters
		if (!defined($range_start))
		{
			# save for start of range if one hasn't been started yet
			$range_start = $first;
		}
		elsif ($first != $prev_last + 1)
		{
			# ranges aren't contiguous; emit the last and start a new one
			printf "\t{0x%04X, 0x%04X},\n", $range_start, $prev_last;
			$range_start = $first;
		}
	}
	else
	{
		# not wide characters, print out previous range if any
		if (defined($range_start))
		{
			printf "\t{0x%04X, 0x%04X},\n", $range_start, $prev_last;
			$range_start = undef;
		}
	}
}
continue
{
	# Runs for every line, including those skipped with "next"; on skipped
	# lines $last still holds the value from the most recent data line.
	$prev_last = $last;
}

# don't forget any ranges at the very end of the database (though there are none
# as of Unicode 13.0)
if (defined($range_start))
{
	printf "\t{0x%04X, 0x%04X},\n", $range_start, $prev_last;
}

print "};\n";