summaryrefslogtreecommitdiffstats
path: root/sal/textenc/generate/gb180302000.pl
diff options
context:
space:
mode:
Diffstat (limited to 'sal/textenc/generate/gb180302000.pl')
-rw-r--r--sal/textenc/generate/gb180302000.pl296
1 files changed, 296 insertions, 0 deletions
diff --git a/sal/textenc/generate/gb180302000.pl b/sal/textenc/generate/gb180302000.pl
new file mode 100644
index 0000000000..7543e7a697
--- /dev/null
+++ b/sal/textenc/generate/gb180302000.pl
@@ -0,0 +1,296 @@
+#!/usr/bin/env perl
+#
+# This file is part of the LibreOffice project.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+# This file incorporates work covered by the following license notice:
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed
+# with this work for additional information regarding copyright
+# ownership. The ASF licenses this file to you under the Apache
+# License, Version 2.0 (the "License"); you may not use this file
+# except in compliance with the License. You may obtain a copy of
+# the License at http://www.apache.org/licenses/LICENSE-2.0 .
+#
+
+# The following files must be available in a ./input subdir:
+
+# <http://oss.software.ibm.com/cvs/icu/~checkout~/charset/data/xml/
+# gb-18030-2000.xml?rev=1.4&content-type=text/plain>:
+# "modified version="3" date="2001-02-21""
+
+$id = "Gb180302000";
+
+sub printUtf32
+{
+ my $utf32 = $_[0];
+ return sprintf("U+%04X", $utf32);
+}
+
+sub printGb
+{
+ if (defined($_[2]))
+ {
+ return sprintf("%02X%02X%02X%02X", $_[0], $_[1], $_[2], $_[3]);
+ }
+ elsif (defined($_[1]))
+ {
+ return sprintf("%02X%02X", $_[0], $_[1]);
+ }
+ else
+ {
+ return sprintf("%02X", $_[0]);
+ }
+}
+
+$gb_map_2_count = 0;
+$gb_map_4_count = 0;
+$gb_map_4_ranges = 0;
+$gb_map_4_max = 0;
+$uni_map_count = 0;
+
+$range_count = 0;
+
+if (1)
+{
+ $filename = "gb-18030-2000.xml";
+ open IN, ("input/" . $filename) or die "Cannot read " . $filename;
+ while (<IN>)
+ {
+ if (/^[ \t]*<a +u=\"([0-9A-F]+)\" +b=\"([0-7][0-9A-F])\"\/>$/)
+ {
+ $utf32 = oct("0x" . $1);
+ $gb1 = oct("0x" . $2);
+ ($utf32 == $gb1)
+ or die "Bad " . printUtf32($utf32) . " to " . printGb($gb1);
+ }
+ elsif (/^[ \t]*<a +u=\"([0-9A-F]+)\" +b=\"([89A-F][0-9A-F]) ([4-789A-F][0-9A-F])\"\/>$/)
+ {
+ $utf32 = oct("0x" . $1);
+ $gb1 = oct("0x" . $2);
+ $gb2 = oct("0x" . $3);
+ $gb_code = ($gb1 - 0x81) * 190
+ + ($gb2 <= 0x7E ? $gb2 - 0x40 : $gb2 - 0x80 + 63);
+ !defined($gb_map_2[$gb_code])
+ or die "Redefined " . printGb($gb1, $gb2);
+ $gb_map_2[$gb_code] = $utf32;
+ ++$gb_map_2_count;
+
+ !defined($uni_map[$utf32]) or die "Double Unicode mapping";
+ $uni_map[$utf32] = $gb1 << 8 | $gb2;
+ ++$uni_map_count;
+ }
+ elsif (/^[ \t]*<a +u=\"([0-9A-F]+)\" +b=\"([89A-F][0-9A-F]) (3[0-9]) ([89A-F][0-9A-F]) (3[0-9])\"\/>$/)
+ {
+ $utf32 = oct("0x" . $1);
+ $gb1 = oct("0x" . $2);
+ $gb2 = oct("0x" . $3);
+ $gb3 = oct("0x" . $4);
+ $gb4 = oct("0x" . $5);
+ $gb_code = ($gb1 - 0x81) * 12600
+ + ($gb2 - 0x30) * 1260
+ + ($gb3 - 0x81) * 10
+ + ($gb4 - 0x30);
+ !defined($gb_map_4[$gb_code])
+ or die "Redefined " . printGb($gb1, $gb2, $gb3, $gb4);
+ $gb_map_4[$gb_code] = $utf32;
+ ++$gb_map_4_count;
+ $gb_map_4_max = $gb_code if ($gb_code > $gb_map_4_max);
+
+ !defined($uni_map[$utf32]) or die "Double Unicode mapping";
+ $uni_map[$utf32] = $gb1 << 24 | $gb2 << 16 | $gb3 << 8 | $gb4;
+ ++$uni_map_count;
+ }
+ elsif (/<a /)
+ {
+ die "Bad format";
+ }
+ elsif (/^[ \t]*<range +uFirst=\"([0-9A-F]+)\" +uLast=\"([0-9A-F]+)\" +bFirst=\"([89A-F][0-9A-F]) (3[0-9]) ([89A-F][0-9A-F]) (3[0-9])\" +bLast=\"([89A-F][0-9A-F]) (3[0-9]) ([89A-F][0-9A-F]) (3[0-9])\" +bMin=\"81 30 81 30\" +bMax=\"FE 39 FE 39\"\/>$/)
+ {
+ $utf32_first = oct("0x" . $1);
+ $utf32_last = oct("0x" . $2);
+ $gb1_first = oct("0x" . $3);
+ $gb2_first = oct("0x" . $4);
+ $gb3_first = oct("0x" . $5);
+ $gb4_first = oct("0x" . $6);
+ $gb1_last = oct("0x" . $7);
+ $gb2_last = oct("0x" . $8);
+ $gb3_last = oct("0x" . $9);
+ $gb4_last = oct("0x" . $10);
+ $linear_first
+ = ($gb1_first - 0x81) * 12600
+ + ($gb2_first - 0x30) * 1260
+ + ($gb3_first - 0x81) * 10
+ + ($gb4_first - 0x30);
+ $linear_last
+ = ($gb1_last - 0x81) * 12600
+ + ($gb2_last - 0x30) * 1260
+ + ($gb3_last - 0x81) * 10
+ + ($gb4_last - 0x30);
+ ($utf32_last - $utf32_first == $linear_last - $linear_first)
+ or die "Bad range";
+ if ($linear_first != 189000 || $linear_last != 1237575)
+ {
+ $range_uni_first[$range_count] = $utf32_first;
+ $range_uni_last[$range_count]
+ = ($utf32_last == 0xD7FF ? 0xDFFF : $utf32_last);
+ $range_linear_first[$range_count] = $linear_first;
+ $range_linear_last[$range_count] = $linear_last;
+ ++$range_count;
+ $gb_map_4_ranges += $linear_last - $linear_first + 1;
+ $gb_map_4_max = $linear_last
+ if ($linear_last > $gb_map_4_max);
+ }
+ }
+ elsif (/<range /)
+ {
+ die "Bad format";
+ }
+ }
+ close IN;
+}
+
+print "gb_map_2_count = ", $gb_map_2_count,
+ ", gb_map_4_count = ", $gb_map_4_count,
+ ", gb_map_4_ranges = ", $gb_map_4_ranges,
+ ", gb_map_4_max = ", $gb_map_4_max,
+ ", uni_map_count = ", $uni_map_count, "\n";
+($gb_map_2_count == 23940) or die "Bad gb_map_2_count != 23940";
+($gb_map_4_max == $gb_map_4_count + $gb_map_4_ranges - 1)
+ or die "Bad gb_map_4_max != gb_map_4_count + gb_map_4_ranges";
+($uni_map_count + $gb_map_4_ranges == 0x10000 - (0xE000 - 0xD800) - 0x80)
+ or die "Bad uni_map_count";
+
+$range_index = 0;
+$gb_nonrangedataindex[$range_index] = $gb_map_2_count;
+for ($gb_code = 0; $gb_code < $gb_map_4_max; ++$gb_code)
+{
+ if (defined($gb_map_4[$gb_code]))
+ {
+ $gb_map_2[$gb_map_2_count++] = $gb_map_4[$gb_code];
+ }
+ else
+ {
+ ($gb_code == $range_linear_first[$range_index]) or die "Bad input";
+ $gb_code = $range_linear_last[$range_index];
+ ++$range_index;
+ $gb_nonrangedataindex[$range_index] = $gb_map_2_count;
+ }
+}
+($range_index == $range_count) or die "Bad input";
+
+$filename = lc($id) . ".tab";
+open OUT, ("> " . $filename) or die "Cannot write " . $filename;
+
+{
+ $filename = lc($id). ".pl";
+ open IN, $filename or die "Cannot read ". $filename;
+ $first = 1;
+ while (<IN>)
+ {
+ if (/^\#!.*$/)
+ {
+ }
+ elsif (/^\#(\*.*)$/)
+ {
+ if ($first == 1)
+ {
+ print OUT "/", $1, "\n";
+ $first = 0;
+ }
+ else
+ {
+ print OUT " ", substr($1, 0, length($1) - 1), "/\n";
+ }
+ }
+ elsif (/^\# (.*)$/)
+ {
+ print OUT " *", $1, "\n";
+ }
+ elsif (/^\#(.*)$/)
+ {
+ print OUT " *", $1, "\n";
+ }
+ else
+ {
+ goto done;
+ }
+ }
+ done:
+}
+
+print OUT "\n",
+ "#include \"convertgb18030.h\"\n",
+ "\n",
+ "#include \"sal/types.h\"\n",
+ "\n";
+
+print OUT "static sal_Unicode const aImpl", $id, "ToUnicodeData[] = {\n ";
+for ($gb_code = 0; $gb_code < $gb_map_2_count; ++$gb_code)
+{
+ printf OUT "0x%04X,", $gb_map_2[$gb_code];
+ if ($gb_code % 8 == 7 && $gb_code != $gb_map_2_count - 1)
+ {
+ print OUT "\n ";
+ }
+}
+print OUT "\n};\n\n";
+
+print OUT "static ImplGb180302000ToUnicodeRange const\n aImpl",
+ $id,
+ "ToUnicodeRanges[] = {\n";
+for ($range_index = 0; $range_index < $range_count; ++$range_index)
+{
+ printf OUT " { %d, %d, %d, 0x%04X },\n",
+ $gb_nonrangedataindex[$range_index],
+ $range_linear_first[$range_index],
+ $range_linear_last[$range_index] + 1,
+ $range_uni_first[$range_index];
+}
+print OUT " { -1, 0, 0, 0 }\n};\n\n";
+
+print OUT "static sal_uInt32 const aImplUnicodeTo", $id, "Data[] = {\n ";
+$index = 0;
+$range_index = 0;
+$uni_nonrangedataindex[$range_index] = $index;
+for ($utf32 = 0x80; $utf32 <= 0xFFFF; ++$utf32)
+{
+ if (defined($uni_map[$utf32]))
+ {
+ if ($index > 0 && ($index - 1) % 6 == 5)
+ {
+ print OUT "\n ";
+ }
+ $bytes = $uni_map[$utf32];
+ printf OUT ($bytes <= 0xFFFF ? " 0x%04X," : "0x%08X,"), $bytes;
+ ++$index;
+ }
+ else
+ {
+ ($utf32 == $range_uni_first[$range_index]) or die "Bad input";
+ $utf32 = $range_uni_last[$range_index];
+ ++$range_index;
+ $uni_nonrangedataindex[$range_index] = $index;
+ }
+}
+($range_index == $range_count) or die "Bad input";
+print OUT "\n};\n\n";
+
+print OUT "static ImplUnicodeToGb180302000Range const\n aImplUnicodeTo",
+ $id,
+ "Ranges[] = {\n";
+for ($range_index = 0; $range_index < $range_count; ++$range_index)
+{
+ printf OUT " { %d, 0x%04X, 0x%04X, %d },\n",
+ $uni_nonrangedataindex[$range_index],
+ $range_uni_first[$range_index],
+ $range_uni_last[$range_index],
+ $range_linear_first[$range_index];
+}
+print OUT "};\n";
+
+close OUT;