diff options
Diffstat (limited to 'layout/mathml/updateOperatorDictionary.pl')
-rwxr-xr-x | layout/mathml/updateOperatorDictionary.pl | 459 |
1 files changed, 459 insertions, 0 deletions
#!/usr/bin/perl
# -*- Mode: Perl; tab-width: 2; indent-tabs-mode: nil; -*-
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

# Maintenance helper for the MathML operator dictionary.
#
# Modes (first command-line argument):
#   download [url]   fetch unicode.xml and transform it with
#                    operatorDictionary.xsl into dictionary.xml
#   compare [file]   compare mathfont.properties with the transformed WG
#                    dictionary; writes differences.txt and
#                    new_dictionary.txt
#   check            validate mathfont.properties; writes syntax_errors.txt
#   clean            remove every file generated by the modes above

use strict;
use warnings;

use XML::LibXSLT;
use XML::LibXML;
use LWP::Simple;

# output files
my $FILE_UNICODE = "unicode.xml";
my $FILE_DICTIONARY = "dictionary.xml";
my $FILE_DIFFERENCES = "differences.txt";
my $FILE_NEW_DICTIONARY = "new_dictionary.txt";
my $FILE_SYNTAX_ERRORS = "syntax_errors.txt";

# our dictionary (property file)
my $MOZ_DICTIONARY = "mathfont.properties";

# dictionary provided by the W3C in "XML Entity Definitions for Characters"
my $WG_DICTIONARY_URL = "https://raw.githubusercontent.com/w3c/xml-entities/gh-pages/unicode.xml";

# XSL stylesheet to extract relevant data from the dictionary
my $DICTIONARY_XSL = "operatorDictionary.xsl";

# dictionary provided by the W3C transformed with operatorDictionary.xsl
my $WG_DICTIONARY = $FILE_DICTIONARY;

# Maximum number of command-line arguments (mode included) for each mode.
my %MAX_ARGS = (download => 2, compare => 2, check => 1, clean => 1);

my $mode = @ARGV ? $ARGV[0] : "";
if (!(exists($MAX_ARGS{$mode}) && @ARGV <= $MAX_ARGS{$mode})) {
  usage();
}

if ($mode eq "download") {
  if (@ARGV == 2) {
    $WG_DICTIONARY_URL = $ARGV[1];
  }
  print "Downloading $WG_DICTIONARY_URL...\n";
  # getstore returns the HTTP status code; do not try to transform a file
  # that was never (or only partially) written.
  my $status = getstore($WG_DICTIONARY_URL, $FILE_UNICODE);
  is_success($status) ||
    die ("Couldn't download $WG_DICTIONARY_URL (HTTP status $status)!");

  print "Converting $FILE_UNICODE into $FILE_DICTIONARY...\n";
  my $xslt = XML::LibXSLT->new();
  my $source = XML::LibXML->load_xml(location => $FILE_UNICODE);
  my $style_doc = XML::LibXML->load_xml(location => $DICTIONARY_XSL,
                                        no_cdata => 1);
  my $stylesheet = $xslt->parse_stylesheet($style_doc);
  my $results = $stylesheet->transform($source);
  open(my $file, '>', $FILE_DICTIONARY) ||
    die ("Couldn't open $FILE_DICTIONARY: $!");
  print $file $stylesheet->output_as_bytes($results);
  close($file) || die ("Couldn't write $FILE_DICTIONARY: $!");
  exit 0;
}

if ($mode eq "clean") {
  unlink($FILE_UNICODE,
         $FILE_DICTIONARY,
         $FILE_DIFFERENCES,
         $FILE_NEW_DICTIONARY,
         $FILE_SYNTAX_ERRORS);
  exit 0;
}

if ($mode eq "compare" && @ARGV == 2) {
  $WG_DICTIONARY = $ARGV[1];
}

################################################################################
# structure of the dictionary used by this script:
# - key: same as in mathfont.properties
# - table:
#    index | value
#      0   | description
#      1   | lspace
#      2   | rspace
#      4   | largeop
#      5   | movablelimits
#      6   | stretchy
#      7   | separator
#      8   | accent
#      9   | fence
#     10   | symmetric
#     13   | direction

# 1) build %moz_hash from $MOZ_DICTIONARY

my %moz_hash;

print "loading $MOZ_DICTIONARY...\n";
open(my $moz_file, '<', $MOZ_DICTIONARY) ||
  die ("Couldn't open $MOZ_DICTIONARY: $!");

print "building dictionary...\n";
while (my $line = <$moz_file>) {
  next unless ($line =~ m/^operator\./);

  # "key = properties # description"
  my ($key, $props, $description) =
    ($line =~ m/^([\w|\.|\\]*)\s=\s(.*)\s#\s(.*)$/);
  if (!defined($key)) {
    # Without this guard the captures of the previous match would be reused
    # and a bogus key would be stored.
    warn "warning: skipping malformed line $. of $MOZ_DICTIONARY\n";
    next;
  }

  my @value = ();
  $value[0] = $description;
  # The leading greedy .* keeps the last occurrence when a property is
  # repeated on a line.
  $value[1] = ($props =~ m/^.*lspace:(\d)/) ? $1 : "5";
  $value[2] = ($props =~ m/^.*rspace:(\d)/) ? $1 : "5";
  $value[4] = hasProperty($props, "largeop");
  $value[5] = hasProperty($props, "movablelimits");
  $value[6] = hasProperty($props, "stretchy");
  $value[7] = hasProperty($props, "separator");
  $value[8] = hasProperty($props, "accent");
  $value[9] = hasProperty($props, "fence");
  $value[10] = hasProperty($props, "symmetric");
  $value[13] = ($props =~ m/^.*direction:([a-z]*)/) ? $1 : "";

  $moz_hash{$key} = [ @value ];
}

close($moz_file);

################################################################################
# 2) If mode "check", verify validity of our operator dictionary and quit.
#    If mode "compare", go to step 3)

if ($mode eq "check") {
  print "checking operator dictionary...\n";
  open(my $file_syntax_errors, '>', $FILE_SYNTAX_ERRORS) ||
    die ("Couldn't open $FILE_SYNTAX_ERRORS: $!");

  my $nb_errors = 0;
  my $nb_warnings = 0;

  # check the validity of our private data
  # (keys are sorted so that the report is reproducible between runs)
  foreach my $key (sort(keys %moz_hash)) {
    my @moz = @{ $moz_hash{$key} };
    my $entry = generateEntry($key, @moz);
    # Must be initialized before the first check so that every failed check
    # causes the entry to be printed.
    my $valid = 1;

    if ($key =~ /\\u.+\\u.+\\u.+/) {
      $valid = 0;
      $nb_errors++;
      print $file_syntax_errors "error: \"$key\" has more than 2 characters\n";
    }

    if ($key =~ /\\u20D2\./ || $key =~ /\\u0338\./) {
      $valid = 0;
      $nb_errors++;
      print $file_syntax_errors "error: \"$key\" ends with character U+20D2 or U+0338\n";
    }

    if (!($moz[13] eq "" ||
          $moz[13] eq "horizontal" ||
          $moz[13] eq "vertical")) {
      $valid = 0;
      $nb_errors++;
      print $file_syntax_errors "error: invalid direction \"$moz[13]\"\n";
    }

    if ($moz[4] && !($moz[13] eq "vertical")) {
      $valid = 0;
      $nb_errors++;
      print $file_syntax_errors "error: operator is largeop but does not have vertical direction\n";
    }

    if (!$valid) {
      print $file_syntax_errors $entry;
      print $file_syntax_errors "\n";
    }
  }

  # check that all forms have the same direction.
  foreach my $key (sort(keys %moz_hash)) {
    # skip operators already handled (and removed) through another form
    next unless exists($moz_hash{$key});

    my ($base) = ($key =~ m/^([\w|\.|\\]*)\.(prefix|infix|postfix)$/);
    next unless defined($base);

    # keys of the forms that exist for this operator, in a fixed order
    my @form_keys = ();
    foreach my $form (qw(prefix infix postfix)) {
      my $form_key = "$base.$form";
      push(@form_keys, $form_key) if exists($moz_hash{$form_key});
    }

    # all existing forms agree iff only one distinct direction is seen
    my %directions = ();
    foreach my $form_key (@form_keys) {
      $directions{ $moz_hash{$form_key}[13] } = 1;
    }

    if (scalar(keys %directions) > 1) {
      $nb_errors++;
      print $file_syntax_errors
        "error: operator has a stretchy form, but all forms";
      print $file_syntax_errors
        " have not the same direction\n";
      foreach my $form_key (@form_keys) {
        print $file_syntax_errors
          generateEntry($form_key, @{ $moz_hash{$form_key} });
      }
      print $file_syntax_errors "\n";
    }

    # every form of this operator has now been checked
    delete @moz_hash{@form_keys};
  }

  close($file_syntax_errors);
  print "\n";
  if ($nb_errors > 0 || $nb_warnings > 0) {
    print "$nb_errors error(s) found\n";
    print "$nb_warnings warning(s) found\n";
    print "See output file $FILE_SYNTAX_ERRORS.\n\n";
  } else {
    print "No error found.\n\n";
  }

  exit 0;
}

################################################################################
# 3) build %wg_hash and @wg_keys from the page $WG_DICTIONARY

print "loading $WG_DICTIONARY...\n";
my $parser = XML::LibXML->new();
my $doc = $parser->parse_file($WG_DICTIONARY);

print "building dictionary...\n";
my %wg_hash;
my @wg_keys = ();  # keys in document order, each key listed once

foreach my $entry ($doc->findnodes('/root/entry')) {
  # 3.1) build the key
  my $unicode = attributeOr($entry, "unicode", "");

  # Skip non-BMP Arabic characters that are handled specially.
  # (String comparison: with == every "U..." value numifies to 0 and all
  # entries would be skipped.)
  next if ($unicode eq "U1EEF0" || $unicode eq "U1EEF1");

  my $key = "operator.";
  my $rest = "$unicode-";
  while ($rest =~ m/^U?0(\w*)-(.*)$/) {
    # Concatenate .\uNNNN
    $key = "$key\\u$1";
    $rest = $2;
  }

  $key = "$key." . attributeOr($entry, "form", "");

  # 3.2) build the array
  my $props = attributeOr($entry, "properties", "");
  my @value = ();
  $value[0] = lc(attributeOr($entry, "description", ""));
  $value[1] = attributeOr($entry, "lspace", "5");
  $value[2] = attributeOr($entry, "rspace", "5");
  $value[4] = hasProperty($props, "largeop");
  $value[5] = hasProperty($props, "movablelimits");
  $value[6] = hasProperty($props, "stretchy");
  $value[7] = hasProperty($props, "separator");
  $value[9] = hasProperty($props, "fence");
  $value[10] = hasProperty($props, "symmetric");

  # not stored in the WG dictionary
  $value[8] = "";  # accent
  $value[13] = ""; # direction

  # 3.3) save the key and value
  push(@wg_keys, $key) unless exists($wg_hash{$key});
  $wg_hash{$key} = [ @value ];
}

################################################################################
# 4) Compare the two dictionaries and output the result

print "comparing dictionaries...\n";
open(my $file_differences, '>', $FILE_DIFFERENCES) ||
  die ("Couldn't open $FILE_DIFFERENCES: $!");
open(my $file_new_dictionary, '>', $FILE_NEW_DICTIONARY) ||
  die ("Couldn't open $FILE_NEW_DICTIONARY: $!");

my $conflicting = 0; my $conflicting_stretching = 0;
my $new = 0; my $new_stretching = 0;
my $obsolete = 0; my $obsolete_stretching = 0;
my $unchanged = 0;

# 4.1) look to the entries of the WG dictionary
foreach my $key (@wg_keys) {
  my @wg = @{ $wg_hash{$key} };
  delete $wg_hash{$key};
  my $wg_value = generateCommon(@wg);

  if (exists($moz_hash{$key})) {
    # entry is in both dictionary
    my @moz = @{ $moz_hash{$key} };
    delete $moz_hash{$key};
    my $moz_value = generateCommon(@moz);
    if ($moz_value ne $wg_value) {
      # conflicting entry
      print $file_differences "[conflict]";
      $conflicting++;
      if ($moz[6] != $wg[6]) {
        print $file_differences "[stretching]";
        $conflicting_stretching++;
      }
      print $file_differences " - $key ($wg[0])\n";
      print $file_differences "-$moz_value\n+$wg_value\n\n";
    } else {
      # unchanged entry
      $unchanged++;
    }
    # WG data completed with our private data (accent, direction, comment)
    print $file_new_dictionary completeCommon($wg_value, $key, \@moz, \@wg);
  } else {
    # we don't have this entry in our dictionary yet
    print $file_differences "[new entry]";
    $new++;
    if ($wg[6]) {
      print $file_differences "[stretching]";
      $new_stretching++;
    }
    print $file_differences " - $key ($wg[0])\n";
    print $file_differences "-\n+$wg_value\n\n";
    print $file_new_dictionary completeCommon($wg_value, $key, [], \@wg);
  }
}

print $file_new_dictionary
  "\n# Entries below are not part of the official MathML dictionary\n\n";
# 4.2) look in our dictionary the remaining entries
foreach my $key (sort(keys %moz_hash)) {
  my @moz = @{ $moz_hash{$key} };
  my $moz_value = generateCommon(@moz);
  print $file_differences "[obsolete entry]";
  $obsolete++;
  if ($moz[6]) {
    print $file_differences "[stretching]";
    $obsolete_stretching++;
  }
  print $file_differences " - $key ($moz[0])\n";
  print $file_differences "-$moz_value\n+\n\n";
  # keep our accent, direction and comment for entries unknown to the WG
  print $file_new_dictionary completeCommon($moz_value, $key, \@moz, \@moz);
}

close($file_differences) || die ("Couldn't write $FILE_DIFFERENCES: $!");
close($file_new_dictionary) || die ("Couldn't write $FILE_NEW_DICTIONARY: $!");

print "\n";
print "- $obsolete obsolete entries ";
print "($obsolete_stretching of them are related to stretching)\n";
print "- $unchanged unchanged entries\n";
print "- $conflicting conflicting entries ";
print "($conflicting_stretching of them are related to stretching)\n";
print "- $new new entries ";
print "($new_stretching of them are related to stretching)\n";
print "\nSee output files $FILE_DIFFERENCES and $FILE_NEW_DICTIONARY.\n\n";
print "After having modified the dictionary, please run\n";
print "  ./updateOperatorDictionary.pl check\n\n";
exit 0;

################################################################################
sub usage {
  # display the accepted command syntax and quit
  print "usage:\n";
  print "  ./updateOperatorDictionary.pl download [unicode.xml]\n";
  print "  ./updateOperatorDictionary.pl compare [dictionary.xml]\n";
  print "  ./updateOperatorDictionary.pl check\n";
  print "  ./updateOperatorDictionary.pl clean\n";
  exit 0;
}

sub hasProperty {
  # helper returning 1 if $name occurs in the property string $props,
  # 0 otherwise (an undefined $props has no property)
  my ($props, $name) = @_;
  return (defined($props) && index($props, $name) >= 0) ? 1 : 0;
}

sub attributeOr {
  # helper returning attribute $name of the XML element $entry, or $default
  # when the attribute is missing or empty
  my ($entry, $name, $default) = @_;
  my $attribute = $entry->getAttribute($name);
  return (defined($attribute) && $attribute ne "") ? $attribute : $default;
}

sub generateCommon {
  # helper function to generate the string of data shared by both dictionaries
  # argument: a value table (flat list) as described above step 1
  my (@v) = @_;
  my $entry = "lspace:$v[1] rspace:$v[2]";
  if ($v[4]) { $entry = "$entry largeop"; }
  if ($v[5]) { $entry = "$entry movablelimits"; }
  if ($v[6]) { $entry = "$entry stretchy"; }
  if ($v[7]) { $entry = "$entry separator"; }
  if ($v[9]) { $entry = "$entry fence"; }
  if ($v[10]) { $entry = "$entry symmetric"; }
  return $entry;
}

sub completeCommon {
  # helper to add key and private data to generateCommon
  # arguments:
  #   $entry - string returned by generateCommon
  #   $key   - dictionary key
  #   $v_moz - reference to our value table (may be empty): provides accent,
  #            direction and, when non-empty, the comment
  #   $v_wg  - reference to the WG value table: provides the fallback comment
  # The tables are passed by reference; two arrays passed as flat lists
  # cannot be told apart by the callee.
  my ($entry, $key, $v_moz, $v_wg) = @_;

  $entry = "$key = $entry";

  if ($v_moz->[8]) { $entry = "$entry accent"; }
  if ($v_moz->[13]) { $entry = "$entry direction:$v_moz->[13]"; }

  if ($v_moz->[0]) {
    # keep our previous comment
    $entry = "$entry # $v_moz->[0]";
  } else {
    # otherwise use the description given by the WG
    my $description = defined($v_wg->[0]) ? $v_wg->[0] : "";
    $entry = "$entry # $description";
  }

  $entry = "$entry\n";
  return $entry;
}

sub generateEntry {
  # helper function to generate an entry of our operator dictionary
  # arguments: the key followed by our value table (flat list)
  my ($key, @moz) = @_;
  my $entry = generateCommon(@moz);
  return completeCommon($entry, $key, \@moz, \@moz);
}