#!/usr/bin/env perl
#
# This file is part of the LibreOffice project.
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
#
# This file incorporates work covered by the following license notice:
#
#   Licensed to the Apache Software Foundation (ASF) under one or more
#   contributor license agreements. See the NOTICE file distributed
#   with this work for additional information regarding copyright
#   ownership. The ASF licenses this file to you under the Apache
#   License, Version 2.0 (the "License"); you may not use this file
#   except in compliance with the License. You may obtain a copy of
#   the License at http://www.apache.org/licenses/LICENSE-2.0 .
#
# The following files must be available in a ./input subdir:
#
#   gb-18030-2000.xml
#     "modified version="3" date="2001-02-21""
#
# NOTE(review): the original reference (apparently a URL) for the input file
# is missing from this copy of the header; only the file name opened by the
# code below and the version note above survive. Restore it from upstream.
#
# Purpose: reads the GB 18030 mapping file input/gb-18030-2000.xml, collects
# the two-byte, four-byte and range mappings, and prints C source text (data
# arrays and range tables for both conversion directions) to the OUT handle.
#
# NOTE(review): this copy of the script is damaged. Both `while ()`
# conditions are empty, the line-matching regexes have lost their bodies, and
# a stretch of code is missing between the input loop and the output stage
# (see the notes at those places). Code tokens are kept exactly as found;
# string literals may also have lost repeated spaces -- confirm all of this
# against the upstream file before running.

# Identifier inserted into the names of the generated C arrays; its
# lower-cased form plus ".pl" names the script file read further below.
$id = "Gb180302000";

# Formats the first argument as "U+" followed by at least four upper-case
# hexadecimal digits.
sub printUtf32 {
    my $utf32 = $_[0];
    return sprintf("U+%04X", $utf32);
}

# Formats a GB byte sequence as upper-case hex, two digits per byte.
# Four bytes are printed when the third argument is defined, two bytes when
# only the second is defined, otherwise just the first byte.
sub printGb {
    if (defined($_[2])) {
        return sprintf("%02X%02X%02X%02X", $_[0], $_[1], $_[2], $_[3]);
    } elsif (defined($_[1])) {
        return sprintf("%02X%02X", $_[0], $_[1]);
    } else {
        return sprintf("%02X", $_[0]);
    }
}

# Running totals maintained while the mapping file is read.
$gb_map_2_count = 0;    # number of two-byte mappings stored in @gb_map_2
$gb_map_4_count = 0;    # number of four-byte mappings stored in @gb_map_4
$gb_map_4_ranges = 0;   # total number of codes covered by recorded ranges
$gb_map_4_max = 0;      # largest four-byte linear index seen so far
$uni_map_count = 0;     # number of entries stored in @uni_map
$range_count = 0;       # number of recorded ranges

if (1) {
    $filename = "gb-18030-2000.xml";
    open IN, ("input/" . $filename) or die "Cannot read " . $filename;
    # NOTE(review): the loop condition is empty in this copy. The regex tests
    # on the implicit $_ below suggest a line read from IN belongs here --
    # confirm against upstream.
    while () {
        # NOTE(review): in the first three patterns only `^[ \t]*` and `$`
        # remain, and in the fourth only `$`, although every branch reads
        # capture groups. The pattern bodies appear to have been stripped.

        # Single-byte mapping: $1 is the code point, $2 the byte (both hex);
        # the two values are required to be equal.
        if (/^[ \t]*$/) {
            $utf32 = oct("0x" . $1);
            $gb1 = oct("0x" . $2);
            ($utf32 == $gb1) or die "Bad " . printUtf32($utf32) . " to " . printGb($gb1);
        }
        # Two-byte mapping: $1 is the code point, $2/$3 the lead/trail bytes.
        elsif (/^[ \t]*$/) {
            $utf32 = oct("0x" . $1);
            $gb1 = oct("0x" . $2);
            $gb2 = oct("0x" . $3);
            # Linear index: 190 slots per lead byte (counted from 0x81).
            # Trail bytes up to 0x7E are counted from 0x40 (slots 0..62),
            # larger ones from 0x80 starting at slot 63.
            $gb_code = ($gb1 - 0x81) * 190 + ($gb2 <= 0x7E ? $gb2 - 0x40 : $gb2 - 0x80 + 63);
            !defined($gb_map_2[$gb_code]) or die "Redefined " . printGb($gb1, $gb2);
            $gb_map_2[$gb_code] = $utf32;
            ++$gb_map_2_count;
            # Reverse map entry: the two bytes packed into one integer.
            !defined($uni_map[$utf32]) or die "Double Unicode mapping";
            $uni_map[$utf32] = $gb1 << 8 | $gb2;
            ++$uni_map_count;
        }
        # Four-byte mapping: $1 is the code point, $2..$5 the four bytes.
        elsif (/^[ \t]*$/) {
            $utf32 = oct("0x" . $1);
            $gb1 = oct("0x" . $2);
            $gb2 = oct("0x" . $3);
            $gb3 = oct("0x" . $4);
            $gb4 = oct("0x" . $5);
            # Linear index: bytes 1 and 3 are counted from 0x81, bytes 2 and
            # 4 from 0x30, weighted 12600 / 1260 / 10 / 1.
            $gb_code = ($gb1 - 0x81) * 12600 + ($gb2 - 0x30) * 1260 + ($gb3 - 0x81) * 10 + ($gb4 - 0x30);
            !defined($gb_map_4[$gb_code]) or die "Redefined " . printGb($gb1, $gb2, $gb3, $gb4);
            $gb_map_4[$gb_code] = $utf32;
            ++$gb_map_4_count;
            $gb_map_4_max = $gb_code if ($gb_code > $gb_map_4_max);
            # Reverse map entry: the four bytes packed into one integer.
            !defined($uni_map[$utf32]) or die "Double Unicode mapping";
            $uni_map[$utf32] = $gb1 << 24 | $gb2 << 16 | $gb3 << 8 | $gb4;
            ++$uni_map_count;
        }
        # Range mapping: $1/$2 are the first/last code point, $3..$6 the
        # first four-byte sequence, $7..$10 the last four-byte sequence.
        elsif (/$/) {
            $utf32_first = oct("0x" . $1);
            $utf32_last = oct("0x" . $2);
            $gb1_first = oct("0x" . $3);
            $gb2_first = oct("0x" . $4);
            $gb3_first = oct("0x" . $5);
            $gb4_first = oct("0x" . $6);
            $gb1_last = oct("0x" . $7);
            $gb2_last = oct("0x" . $8);
            $gb3_last = oct("0x" . $9);
            $gb4_last = oct("0x" . $10);
            # Same linearisation as for single four-byte mappings.
            $linear_first = ($gb1_first - 0x81) * 12600 + ($gb2_first - 0x30) * 1260 + ($gb3_first - 0x81) * 10 + ($gb4_first - 0x30);
            $linear_last = ($gb1_last - 0x81) * 12600 + ($gb2_last - 0x30) * 1260 + ($gb3_last - 0x81) * 10 + ($gb4_last - 0x30);
            # Both sides of the range must span the same number of codes.
            ($utf32_last - $utf32_first == $linear_last - $linear_first) or die "Bad range";
            # The range with linear indices 189000..1237575 (1048576 codes)
            # is not recorded.
            # NOTE(review): presumably the range for code points above
            # U+FFFF, handled elsewhere -- confirm.
            if ($linear_first != 189000 || $linear_last != 1237575) {
                $range_uni_first[$range_count] = $utf32_first;
                # A range ending at U+D7FF is recorded as ending at U+DFFF.
                # NOTE(review): presumably so that the Unicode-side loop
                # below also skips U+D800..U+DFFF -- confirm.
                $range_uni_last[$range_count] = ($utf32_last == 0xD7FF ? 0xDFFF : $utf32_last);
                $range_linear_first[$range_count] = $linear_first;
                $range_linear_last[$range_count] = $linear_last;
                ++$range_count;
                $gb_map_4_ranges += $linear_last - $linear_first + 1;
                $gb_map_4_max = $linear_last if ($linear_last > $gb_map_4_max);
            }
        }
        # NOTE(review): source text is missing inside the next statement. The
        # `elsif (/` pattern is cut off and runs straight into the tail of
        # what looks like the statement that opens OUT for writing. The lost
        # text would also have to contain the closing braces of the `while`
        # and `if (1)` blocks, and any assignment of @gb_nonrangedataindex,
        # which is read below but never set in the visible text. Restore this
        # section from upstream; do not guess.
        elsif (/ " . $filename) or die "Cannot write " . $filename;

# Copies the leading '#' comment block of the file lc($id) . ".pl" to OUT,
# rewritten as a C comment. NOTE(review): presumably that file is this script
# itself, so the licence header is carried into the generated output -- confirm.
{
    $filename = lc($id). ".pl";
    open IN, $filename or die "Cannot read ". $filename;
    $first = 1;
    # NOTE(review): empty loop condition again; a line read from IN is
    # presumably intended -- confirm.
    while () {
        if (/^\#!.*$/) {
            # A "#!" line produces no output.
        } elsif (/^\#(\*.*)$/) {
            # A line of the form "#*...": the first one is printed as "/"
            # plus the captured text; any later one is printed as a space,
            # the captured text without its last character, and "/".
            if ($first == 1) {
                print OUT "/", $1, "\n";
                $first = 0;
            } else {
                print OUT " ", substr($1, 0, length($1) - 1), "/\n";
            }
        } elsif (/^\# (.*)$/) {
            # "# text": printed as " *" followed by the text after "# ".
            print OUT " *", $1, "\n";
        } elsif (/^\#(.*)$/) {
            # Any other '#' line: printed as " *" followed by the rest.
            print OUT " *", $1, "\n";
        } else {
            # The first line that does not start with '#' ends the copy.
            goto done;
        }
    }
  done:
}

# Includes emitted at the top of the generated C text.
print OUT "\n", "#include \"convertgb18030.h\"\n", "\n", "#include \"sal/types.h\"\n", "\n";

# GB -> Unicode data array: the first $gb_map_2_count entries of @gb_map_2,
# each as four hex digits, with a line break after every eighth entry except
# after the final one.
print OUT "static sal_Unicode const aImpl", $id, "ToUnicodeData[] = {\n ";
for ($gb_code = 0; $gb_code < $gb_map_2_count; ++$gb_code) {
    printf OUT "0x%04X,", $gb_map_2[$gb_code];
    if ($gb_code % 8 == 7 && $gb_code != $gb_map_2_count - 1) {
        print OUT "\n ";
    }
}
print OUT "\n};\n\n";

# GB -> Unicode range table. Each row holds the @gb_nonrangedataindex entry,
# the first linear index, one past the last linear index, and the first code
# point of the range. A { -1, 0, 0, 0 } row terminates the table.
print OUT "static ImplGb180302000ToUnicodeRange const\n aImpl", $id, "ToUnicodeRanges[] = {\n";
for ($range_index = 0; $range_index < $range_count; ++$range_index) {
    printf OUT " { %d, %d, %d, 0x%04X },\n", $gb_nonrangedataindex[$range_index], $range_linear_first[$range_index], $range_linear_last[$range_index] + 1, $range_uni_first[$range_index];
}
print OUT " { -1, 0, 0, 0 }\n};\n\n";

# Unicode -> GB data array: walks U+0080..U+FFFF. A mapped code point prints
# its packed byte value; an unmapped one must be the start of the next
# recorded range, which is then skipped as a whole.
print OUT "static sal_uInt32 const aImplUnicodeTo", $id, "Data[] = {\n ";
$index = 0;          # number of data entries printed so far
$range_index = 0;    # next recorded range expected in the walk
$uni_nonrangedataindex[$range_index] = $index;
for ($utf32 = 0x80; $utf32 <= 0xFFFF; ++$utf32) {
    if (defined($uni_map[$utf32])) {
        # Line break before every entry that follows a multiple of six.
        if ($index > 0 && ($index - 1) % 6 == 5) {
            print OUT "\n ";
        }
        $bytes = $uni_map[$utf32];
        # Values up to 0xFFFF use the four-digit format, larger values the
        # eight-digit format.
        printf OUT ($bytes <= 0xFFFF ? " 0x%04X," : "0x%08X,"), $bytes;
        ++$index;
    } else {
        ($utf32 == $range_uni_first[$range_index]) or die "Bad input";
        # Jump to the end of the range; the loop increment then moves on to
        # the first code point after it.
        $utf32 = $range_uni_last[$range_index];
        ++$range_index;
        # Data index at which entries following this range continue.
        $uni_nonrangedataindex[$range_index] = $index;
    }
}
# Every recorded range must have been met exactly once during the walk.
($range_index == $range_count) or die "Bad input";
print OUT "\n};\n\n";

# Unicode -> GB range table. Each row holds the data index recorded when the
# range was reached, the first and last code point, and the first linear
# index of the range. No terminator row is written here.
print OUT "static ImplUnicodeToGb180302000Range const\n aImplUnicodeTo", $id, "Ranges[] = {\n";
for ($range_index = 0; $range_index < $range_count; ++$range_index) {
    printf OUT " { %d, 0x%04X, 0x%04X, %d },\n", $uni_nonrangedataindex[$range_index], $range_uni_first[$range_index], $range_uni_last[$range_index], $range_linear_first[$range_index];
}
print OUT "};\n";
close OUT;