diff options
Diffstat (limited to 'i18nlangtag/source/isolang/langid.pl')
-rwxr-xr-x | i18nlangtag/source/isolang/langid.pl | 471 |
1 files changed, 471 insertions, 0 deletions
diff --git a/i18nlangtag/source/isolang/langid.pl b/i18nlangtag/source/isolang/langid.pl new file mode 100755 index 000000000..061c69288 --- /dev/null +++ b/i18nlangtag/source/isolang/langid.pl @@ -0,0 +1,471 @@ +: # -*- perl -*- vim: ft=perl +eval 'exec perl -w -S $0 ${1+"$@"}' +if 0; +# +# This file is part of the LibreOffice project. +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. +# +# This file incorporates work covered by the following license notice: +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed +# with this work for additional information regarding copyright +# ownership. The ASF licenses this file to you under the Apache +# License, Version 2.0 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of +# the License at http://www.apache.org/licenses/LICENSE-2.0 . +# + +# See Usage() below or invoke without arguments for short instructions. +# For long instructions use the source, Luke ;-) + +use strict; + +sub Usage() +{ + print STDERR + "\n", + "langid - a hackish utility to lookup lang.h language defines and LangIDs,\n", + "isolang.cxx ISO639/ISO3166 mapping, locale data files, langtab.hrc language\n", + "listbox entries, langlist.mk, file_ooo.scp registry name, languages.pm and\n", + "msi-encodinglist.txt\n\n", + + "Usage: $0 [--single] {language string} | {LangID} | {primarylanguage sublanguage} | {language-country}\n\n", + + "A language string will be used as a generic string match in all searched files.\n", + "You may enclose the language string in word delimiters,\n", + "e.g. \\blanguage_german\\b for a specific match.\n", + "If the language string expression matches more than one define,\n", + "e.g. as in 'german', all matching defines will be processed.\n", + "If the language string does not match a define or an identifier in\n", + "langtab.hrc, a generic string match of the listbox entries will be tried.\n\n", + + "Numeric values of LangID,primarylanguage,sublanguage can be given\n", + "decimal, hexadecimal (leading 0x), octal (leading 0) or binary (leading 0b).\n", + "The exact language_define of an exact match will be used in remaining lookups.\n\n", + + "A language-country pair will lookup a xx-YY mapping from isolang.cxx,\n", + "for example: 'en-US' or 'de-' or '-CH',\n", + "xx and YY can be given case insensitive, will be lowered-uppered internally,\n", + "and xx and YY themselves may be regular expressions.\n", + "Also here a list of matches will be processed.\n\n", + + "If option --single is given, only the first match will be processed.\n\n"; +} + +my $SRC_ROOT = $ENV{"SRC_ROOT"}; +if (!defined($SRC_ROOT)) +{ + print "\nNeed \$SRC_ROOT, please set your LibreOffice environment!\n"; + Usage(); + exit 1; +} + +my $LANGUAGE_MASK_PRIMARY = 0x03ff; + +sub getPrimaryLanguage($) +{ + my($lcid) = @_; + return $lcid & $LANGUAGE_MASK_PRIMARY; +} + +sub getSubLanguage($) +{ + my($lcid) = @_; + return $lcid >> 10; +} + +sub makeLangID($$) +{ + my( $sub, $pri) = @_; + return ($sub << 10) | $pri; +} + + +# Note that a regex needs a duplicated pair of backslashes to produce a literal +# \\ like in \\\\* to search for zero or more \ backslashes. +# @addregex can be an optional "block to grep" definition +# (regex-to-start-block, regex-to-end-block, regex-to-find-in-block) +sub grepFile($$$$$@) +{ + my( $regex, $path, $module, $name, $printmsg, @addregex) = @_; + my @result; + my $found = 0; + my $areopen = 0; + my $arecloser = ''; + # Try module under current working directory first to catch local + # modifications. + my $file = "./$module/$name"; + if (!($found = open( IN, $file))) + { + # Then with the given path. + $file = "$path/$module/$name"; + if (!($found = open( IN, $file))) + { + print "No $file\n"; + } + } + if ($found) + { + $found = 0; + while (my $line = <IN>) + { + if ($line =~ /$regex/) + { + if (!$found) + { + $found = 1; + print "$file:\n"; + } + chomp( $line); + print "$line\n"; + push( @result, $line); + } + elsif (@addregex) + { + # By convention first element is opener, second element is closer. + if (!$areopen) + { + if ($line =~ /$addregex[0]/) + { + $areopen = 1; + $arecloser = $addregex[1]; + } + } + if ($areopen) + { + for (my $i = 2; $i < @addregex; ++$i) + { + if ($line =~ /$addregex[$i]/) + { + if (!$found) + { + $found = 1; + print "$file:\n"; + } + chomp( $line); + print "$line\n"; + push( @result, $line); + } + } + if ($line =~ /$arecloser/) + { + $areopen = 0; + } + } + } + } + close( IN); + } + if (!$found && $printmsg) { + print "Not found in $file\n"; + #print "Not found in $file for $regex @addregex\n"; + } + return @result; +} + + +sub main() +{ + my( $lcid, @parts, $grepdef, $options, $single); + $grepdef = 0; + $single = 0; + for ($options = 0; $options < @ARGV && $ARGV[$options] =~ /^--/; ++$options) + { + if ($ARGV[$options] eq '--single') { $single = 1; } + else { print "Unknown option: $ARGV[$options]\n"; } + } + if (@ARGV == 1 + $options) + { + # 0x hex, 0b bin, 0 oct + if ($ARGV[$options] =~ /^0/) { + $lcid = oct( $ARGV[0]); } + elsif ($ARGV[$options] =~ /^[0-9]/) { + $lcid = $ARGV[$options]; } + else + { + $grepdef = $ARGV[$options]; + $lcid = 0; + } + $parts[0] = getPrimaryLanguage( $lcid); + $parts[1] = getSubLanguage( $lcid); + } + elsif (@ARGV == 2 + $options) + { + for (my $i = $options; $i < 2 + $options; ++$i) + { + if ($ARGV[$i] =~ /^0/) { + $parts[$i] = oct( $ARGV[$i]); } + else { + $parts[$i] = $ARGV[$i]; } + } + $lcid = makeLangID( $parts[1], $parts[0]); + } + else + { + Usage(); + return 1; + } + my $modifier = "(?i)"; + my (@resultlist, @greplist, $result); + # If no string was given on the command line, but value(s) were, lookup the + # LangID value to obtain the define identifier. + if ($grepdef) + { + # #define LANGUAGE_AFRIKAANS LanguageType(0x0436) + @resultlist = grepFile( + $modifier . '^\s*#\s*define\s+[A-Z_]*' . $grepdef, + "$SRC_ROOT", "include", "i18nlangtag/lang.h", 1, ()); + } + else + { + printf( "LangID: 0x%04X (dec %d), primary: 0x%03x, sub 0x%02x\n", $lcid, + $lcid, $parts[0], $parts[1]); + my $buf = sprintf( "0x%04X", $lcid); + # #define LANGUAGE_AFRIKAANS LanguageType(0x0436) + @resultlist = grepFile( + '^\s*#\s*define\s+\w+\s+LanguageType\(' . $buf . '\)', + "$SRC_ROOT", "include", "i18nlangtag/lang.h", 1, ()); + } + for $result (@resultlist) + { + # #define LANGUAGE_AFRIKAANS LanguageType(0x0436) + if ($result =~ /^\s*#\s*define\s+(\w+)\s+LanguageType\((0x[0-9a-fA-F]+)\)/) + { + push( @greplist, '\b' . $1 . '\b'); + $modifier = ""; # complete identifier now case sensitive + if ($single) { + last; } + } + } + # If the string given is of the form xx-yy lookup a language,country pair + # to obtain the define identifier. xx and yy may themselves be regexps. + # xx- is a short form for 'xx-.*' and -yy a short form for '.*-yy' + # Note that -Latn for '.*-Latn' also works, accidentally. + if ($grepdef =~ /^(.*)-$/) { + $grepdef = $1 . "-.*"; } + if ($grepdef =~ /^-(.*)$/) { + $grepdef = ".*-" . $1; } + if ($grepdef =~ /^([^-]{2,3})-([^-]{2,2})$/) # catches also .*-.* + { + my $lang = $1; + my $coun = $2; + $lang = lc($lang); + $coun = uc($coun); + # { LANGUAGE_AFRIKAANS, "af", "ZA", 0 }, + @resultlist = grepFile( + '^\s*\{\s*\w+\s*,\s*"' . $lang . '"\s*,\s*"' . $coun . '"\s*,\s*\w+\s*\}\s*,', + "$SRC_ROOT", "i18nlangtag", "source/isolang/isolang.cxx", 1, ()); + for $result (@resultlist) + { + if ($result =~ /^\s*\{\s*(\w+)\s*,\s*"(\w+)"\s*,\s*"(\w+)?"\s*,\s*\w+\s*\}\s*,/) + { + push( @greplist, '\b' . $1 . '\b'); + $modifier = ""; # complete identifier now case sensitive + if ($single) { + last; } + } + } + $grepdef = 0; + } + # Same for lll-Ssss or lll-Ssss-CC language tag. + if ($grepdef =~ /^([^-]{2,3})-([^-]{4,4})$/ || $grepdef =~ /^([^-]{2,3})-([^-]{4,4})-([^-]{2,2})$/) + { + my $lang = $1; + my $scri = $2; + my $coun = $3; + if (!defined($coun)) { + $coun = ""; } + $lang = lc($lang); + $scri = ucfirst(lc($scri)); + $coun = uc($coun); + # { LANGUAGE_SERBIAN_LATIN_SERBIA, "sr-Latn", "RS", 0 }, + @resultlist = grepFile( + '^\s*\{\s*\w+\s*,\s*"' . $lang . '-' . $scri . '"\s*,\s*"' . $coun . '"\s*,\s*\w+\s*\}\s*,', + "$SRC_ROOT", "i18nlangtag", "source/isolang/isolang.cxx", 1, ()); + for $result (@resultlist) + { + if ($result =~ /^\s*\{\s*(\w+)\s*,\s*"(\w+-\w+)"\s*,\s*"(\w+)?"\s*,\s*\w+\s*\}\s*,/) + { + push( @greplist, '\b' . $1 . '\b'); + $modifier = ""; # complete identifier now case sensitive + if ($single) { + last; } + } + } + $grepdef = 0; + } + # And for any other language tag that MUST match case. + if ($grepdef =~ /^[^-]+-/) + { + # { LANGUAGE_CATALAN_VALENCIAN, "ca-ES-valencia", "ES", "ca-valencia" }, + @resultlist = grepFile( + '^\s*\{\s*\w+\s*,\s*"' . $grepdef . '"\s*,\s*"(\w*)"\s*,\s*"([^"]*)"\s*\}\s*,', + "$SRC_ROOT", "i18nlangtag", "source/isolang/isolang.cxx", 1, ()); + for $result (@resultlist) + { + if ($result =~ /^\s*\{\s*(\w+)\s*,\s*"([^"]+)"\s*,\s*"(\w*)"\s*,\s*"([^"]*)"\s*\}\s*,/) + { + push( @greplist, '\b' . $1 . '\b'); + $modifier = ""; # complete identifier now case sensitive + if ($single) { + last; } + } + } + $grepdef = 0; + } + if (!@greplist && $grepdef) { + push( @greplist, $grepdef); } + for $grepdef (@greplist) + { + print "\nUsing: " . $grepdef . "\n"; + + # Decimal LCID, was needed for Langpack.ulf but isn't used anymore, + # keep just in case we'd need it again. + # #define LANGUAGE_AFRIKAANS 0x0436 + @resultlist = grepFile( + $modifier . '^\s*#\s*define\s+[A-Z_]*' . $grepdef, + "$SRC_ROOT", "include", "i18nlangtag/lang.h", 1, ()); + my @lcidlist; + for $result (@resultlist) + { + # #define LANGUAGE_AFRIKAANS LanguageType(0x0436) + if ($result =~ /^\s*#\s*define\s+(\w+)\s+LanguageType\((0x[0-9a-fA-F]+)\)/) + { + push( @lcidlist, oct( $2)); + } + } + + my @allresultslist; + # { LANGUAGE_AFRIKAANS, "af", "ZA", 0 }, + @resultlist = grepFile( + $modifier . '^\s*\{\s*.*' . $grepdef . '.*\s*,\s*"(\w+)"\s*,\s*"(\w+)?"\s*,\s*\w+\s*\}\s*,', + "$SRC_ROOT", "i18nlangtag", "source/isolang/isolang.cxx", 0, ()); + push( @allresultslist, @resultlist); + # { LANGUAGE_SERBIAN_LATIN_SERBIA, "sr-Latn", "RS", 0 }, + @resultlist = grepFile( + $modifier . '^\s*\{\s*.*' . $grepdef . '.*\s*,\s*"(\w+-\w+)"\s*,\s*"(\w+)?"\s*,\s*\w+\s*}\s*,', + "$SRC_ROOT", "i18nlangtag", "source/isolang/isolang.cxx", 0, ()); + push( @allresultslist, @resultlist); + # { LANGUAGE_CATALAN_VALENCIAN, "ca-ES-valencia", "ES", "ca-valencia" }, + @resultlist = grepFile( + $modifier . '^\s*\{\s*.*' . $grepdef . '.*\s*,\s*"([^"]+)"\s*,\s*"(\w*)"\s*,\s*"([^"]*)"\s*\}\s*,', + "$SRC_ROOT", "i18nlangtag", "source/isolang/isolang.cxx", 0, ()); + push( @allresultslist, @resultlist); + + my @langtaggreplist; + for $result (@allresultslist) + { + my $loca; + # { LANGUAGE_AFRIKAANS, "af", "ZA", 0 }, + # { LANGUAGE_SERBIAN_LATIN_SERBIA, "sr-Latn", "RS", 0 }, + if ($result =~ /^\s*\{\s*(\w+)\s*,\s*"(\w+)"\s*,\s*"(\w+)?"\s*,\s*\w+\s*\}\s*,/ || + $result =~ /^\s*\{\s*(\w+)\s*,\s*"(\w+-\w+)"\s*,\s*"(\w+)?"\s*,\s*\w+\s*\}\s*,/) + { + my $lang = $2; + my $coun = $3; + if ($coun) + { + $loca = $lang . "_" . $coun; + push( @langtaggreplist, '\b' . $lang . '\b(-' . $coun . ')?'); + } + else + { + $loca = $lang; + push( @langtaggreplist, '\b' . $lang . '\b'); + } + } + # { LANGUAGE_CATALAN_VALENCIAN, "ca-ES-valencia", "ES", "ca-valencia" }, + if ($result =~ /^\s*\{\s*(\w+)\s*,\s*"([^"]+)"\s*,\s*"(\w*)"\s*,\s*"([^"]*)"\s*\}\s*,/) + { + $loca = $2; + my $lang = $4; + my $coun = $3; + if ($lang) + { + if ($coun) + { + push( @langtaggreplist, '\b' . $lang . '\b(-' . $coun . ')?'); + } + else + { + push( @langtaggreplist, '\b' . $lang . '\b'); + } + } + } + if ($loca) + { + $loca =~ s/-/_/g; + my $file = "$SRC_ROOT/i18npool/source/localedata/data/$loca.xml"; + my $found = open( LD, $file); + if ($found) + { + print "Found $file:\n"; + my $on = 0; + while (my $line = <LD>) + { + if ($line =~ /<(Language|Country|Variant)>/) { + $on = 1; } + if ($on) { + print $line; } + if ($line =~ /<\/(Language|Country|Variant)>/) { + $on = 0; } + } + close( LD); + } + else { + print "No $file\n"; } + } + } + + # Find any special treatment, may need inspection then. + # $grepdef already has \b word delimiters. + grepFile( + $modifier . $grepdef, + "$SRC_ROOT", "i18nlangtag", "source/isolang/mslangid.cxx", 1, ()); + + my $module = "svtools"; + my $name = "inc/langtab.hrc"; + # { NC_("STR_ARR_SVT_LANGUAGE_TABLE", "Afrikaans (South Africa)") , LANGUAGE_AFRIKAANS }, + # lookup define + @resultlist = grepFile( + $modifier . '^\s*\{\s*NC_\(\s*"[^"]*"\s*,\s*".*"\s*\)\s*,.*' . $grepdef . '.*\}', + "$SRC_ROOT", $module, $name, 1, ()); + # lookup string + if (!@resultlist) { + grepFile( + $modifier . '^\s*\{\s*NC_\(\s*"[^"]*"\s*,\s*".*' . $grepdef . '.*"\s*\)\s*,.*\}', + "$SRC_ROOT", $module, $name, 1, ()); } + + for my $langtag (@langtaggreplist) + { + # Name (xxx) = "/registry/spool/org/openoffice/Office/Common-ctl.xcu"; + grepFile( + '^\s*Name\s*\(' . $langtag . '\)\s*=', + "$SRC_ROOT", "scp2", "source/ooo/file_ooo.scp", 1, ()); + + # completelangiso=af ar as-IN ... zu + grepFile( + '^\s*completelangiso\s*=\s*(\s*([a-z]{2,3})(-[A-Z][A-Z])?)*' . $langtag . '', + "$SRC_ROOT", "solenv", "inc/langlist.mk", 1, + # Also grep the list of tags, one per line, \ backslash continued. + ('^\s*completelangiso\s*=', '^\s*$', '^\s*' . $langtag . '\s*\\\\*$')); + + # af 1252 1078 # Afrikaans + grepFile( + '^\s*' . $langtag . '', + "$SRC_ROOT", "l10ntools", "source/ulfconv/msi-encodinglist.txt", 1, ()); + + # 27:af:afrikaans + grepFile( + '^\d*:' . $langtag . '', + "$SRC_ROOT", "bin", "lo-xlate-lang", 1, ()); + } + } + return 0; +} + +main(); |