summaryrefslogtreecommitdiffstats
path: root/po4a-gettextize
diff options
context:
space:
mode:
Diffstat (limited to 'po4a-gettextize')
-rwxr-xr-xpo4a-gettextize609
1 files changed, 609 insertions, 0 deletions
diff --git a/po4a-gettextize b/po4a-gettextize
new file mode 100755
index 0000000..0729c4e
--- /dev/null
+++ b/po4a-gettextize
@@ -0,0 +1,609 @@
+#! /usr/bin/env perl
+eval 'exec perl -S $0 ${1+"$@"}'
+ if $running_under_some_shell;
+
+# po4a-gettextize -- convert an original file to a PO file
+#
+# Copyright 2002-2023 by SPI, inc.
+#
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of GPL v2.0 or later (see COPYING).
+
+=encoding UTF-8
+
+=head1 NAME
+
+po4a-gettextize - convert an original file (and its translation) to a PO file
+
+=head1 SYNOPSIS
+
+B<po4a-gettextize> B<-f> I<fmt> B<-m> I<master.doc> B<-l> I<XX.doc> B<-p> I<XX.po>
+
+(I<XX.po> is the output, all others are inputs)
+
+=head1 DESCRIPTION
+
+po4a (PO for anything) eases the maintenance of documentation translation using
+the classical gettext tools. The main feature of po4a is that it decouples the
+translation of content from its document structure. Please refer to the page
+L<po4a(7)> for a gentle introduction to this project.
+
+The B<po4a-gettextize> script helps you converting your previously existing
+translations into a po4a-based workflow. This is only to be done once to salvage
+an existing translation while converting to po4a, not on a regular basis after
+the conversion of your project. This tedious process is explained in details in
+Section 'Converting a manual translation to po4a' below.
+
+You must provide both a master file (e.g., the source in English) and an
+existing translated file (e.g., a previous translation attempt without po4a). If
+you provide more than one master or translation files, they will be used in
+sequence, but it may be easier to gettextize each page or chapter separately and
+then use B<msgmerge> to merge all produced PO files. As you wish.
+
+If the master document has non-ASCII characters, the new generated PO file will
+be in UTF-8. If the master document is completely in ASCII, the generated
+PO will use the encoding of the translated input document.
+
+=head1 OPTIONS
+
+=over 4
+
+=item B<-f>, B<--format>
+
+Format of the documentation you want to handle. Use the B<--help-format>
+option to see the list of available formats.
+
+=item B<-m>, B<--master>
+
+File containing the master document to translate. You can use this option
+multiple times if you want to gettextize multiple documents.
+
+=item B<-M>, B<--master-charset>
+
+Charset of the file containing the document to translate.
+
+=item B<-l>, B<--localized>
+
+File containing the localized (translated) document. If you provided
+multiple master files, you may wish to provide multiple localized file by
+using this option more than once.
+
+=item B<-L>, B<--localized-charset>
+
+Charset of the file containing the localized document.
+
+=item B<-p>, B<--po>
+
+File where the message catalog should be written. If not given, the message
+catalog will be written to the standard output.
+
+=item B<-o>, B<--option>
+
+Extra option(s) to pass to the format plugin. See the documentation of each
+plugin for more information about the valid options and their meanings. For
+example, you could pass '-o tablecells' to the AsciiDoc parser, while the
+text parser would accept '-o tabs=split'.
+
+=item B<-h>, B<--help>
+
+Show a short help message.
+
+=item B<--help-format>
+
+List the documentation formats understood by po4a.
+
+=item B<-k> B<--keep-temps>
+
+Keep the temporary master and localized POT files built before merging.
+This can be useful to understand why these files get desynchronized, leading
+to gettextization problems.
+
+=item B<-V>, B<--version>
+
+Display the version of the script and exit.
+
+=item B<-v>, B<--verbose>
+
+Increase the verbosity of the program.
+
+=item B<-d>, B<--debug>
+
+Output some debugging information.
+
+=item B<--msgid-bugs-address> I<email@address>
+
+Set the report address for msgid bugs. By default, the created POT files
+have no Report-Msgid-Bugs-To fields.
+
+=item B<--copyright-holder> I<string>
+
+Set the copyright holder in the POT header. The default value is
+"Free Software Foundation, Inc."
+
+=item B<--package-name> I<string>
+
+Set the package name for the POT header. The default is "PACKAGE".
+
+=item B<--package-version> I<string>
+
+Set the package version for the POT header. The default is "VERSION".
+
+=back
+
+=head2 Converting a manual translation to po4a
+
+B<po4a-gettextize> synchronizes the master and localized files to extract their
+content into a PO file. The content of the master file gives the B<msgid> while
+the content of the localized file gives the B<msgstr>. This process is somewhat
+fragile: the Nth string of the translated file is supposed to be the translation
+of the Nth string in the original.
+
+Gettextization works best if you manage to retrieve the exact version of the
+original document that was used for translation. Even so, you may need to fiddle
+with both master and localized files to align their structure if it was changed
+by the original translator, so working on files' copies is advised.
+
+Internally, each po4a parser reports the syntactical type of each extracted
+strings. This is how desynchronization are detected during the gettextization.
+In the example depicted below, it is very unlikely that the 4th string in
+translation (of type 'chapter') is the translation of the 4th string in original
+(of type 'paragraph'). It is more likely that a new paragraph was added to the
+original, or that two original paragraphs were merged together in the
+translation.
+
+ Original Translation
+
+ chapter chapter
+ paragraph paragraph
+ paragraph paragraph
+ paragraph chapter
+ chapter paragraph
+ paragraph paragraph
+
+B<po4a-gettextize> will verbosely diagnose any structure desynchronization. When
+this happens, you should manually edit the files to add fake paragraphs or
+remove some content here and there until the structure of both files actually
+match. Some tricks are given below to salvage the most of the existing
+translation while doing so.
+
+If you are lucky enough to have a perfect match in the file structures out of
+the box, building a correct PO file is a matter of seconds. Otherwise, you will
+soon understand why this process has such an ugly name :) Even so,
+gettextization often remains faster than translating everything again. I
+gettextized the French translation of the whole Perl documentation in one day
+despite the I<many> synchronization issues. Given the amount of text (2MB of
+original text), restarting the translation without first salvaging the old
+translations would have required several months of work. In addition, this grunt
+work is the price to pay to get the comfort of po4a. Once converted, the
+synchronization between master documents and translations will always be fully
+automatic.
+
+After a successful gettextization, the produced documents should be manually
+checked for undetected disparities and silent errors, as explained below.
+
+=head3 Hints and tricks for the gettextization process
+
+The gettextization stops as soon as a desynchronization is detected. When this
+happens, you need to edit the files as much as needed to re-align the files'
+structures. B<po4a-gettextize> is rather verbose when things go wrong. It
+reports the strings that don't match, their positions in the text, and the type
+of each of them. Moreover, the PO file generated so far is dumped as
+F<gettextization.failed.po> for further inspection.
+
+Here are some tricks to help you in this tedious process and ensure that you
+salvage the most of the previous translation:
+
+=over
+
+=item
+
+Remove all extra content of the translations, such as the section giving credits
+to the translators. They should be added separately to B<po4a> as addenda (see
+L<po4a(7)>).
+
+=item
+
+When editing the files to align their structures, prefer editing the translation
+if possible. Indeed, if the changes to the original are too intrusive, the old
+and new versions will not be matched during the first po4a run after
+gettextization (see below). Any unmatched translation will be dumped anyway.
+That being said, you still want to edit the original document if it's too hard
+to get the gettextization to proceed otherwise, even if it means that one
+paragraph of the translation is dumped. The important thing is to get a first PO
+file to start with.
+
+=item
+
+Do not hesitate to kill any original content that would not exist in the
+translated version. This content will be automatically reintroduced afterward,
+when synchronizing the PO file with the document.
+
+=item
+
+You should probably inform the original author of any structural change in the
+translation that seems justified. Issues in the original document should
+reported to the author. Fixing them in your translation only fixes them for a
+part of the community. Plus, it is impossible to do so when using po4a ;) But
+you probably want to wait until the end of the conversion to B<po4a> before
+changing the original files.
+
+=item
+
+Sometimes, the paragraph content does match, but not their types. Fixing it is
+rather format-dependent. In POD and man, it often comes from the fact that one
+of them contains a line beginning with a white space while the other does not.
+In those formats, such paragraph cannot be wrapped and thus become a different
+type. Just remove the space and you are fine. It may also be a typo in the tag
+name in XML.
+
+Likewise, two paragraphs may get merged together in POD when the separating
+line contains some spaces, or when there is no empty line between the B<=item>
+line and the content of the item.
+
+=item
+
+Sometimes, the desynchronization message seems odd because the translation is
+attached to the wrong original paragraph. It is the sign of an undetected issue
+earlier in the process. Search for the actual desynchronization point by
+inspecting the file F<gettextization.failed.po> that was produced, and fix the
+problem where it really is.
+
+=item
+
+Other issues may come from duplicated strings in either the original or
+translation. Duplicated strings are merged in PO files, with two references.
+This constitutes a difficulty for the gettextization algorithm, that is a simple
+one to one pairing between the B<msgid>s of both the master and the localized
+files. It is however believed that recent versions of po4a deal properly with
+duplicated strings, so you should report any remaining issue that you may encounter.
+
+=back
+
+=head2 Reviewing files produced by B<po4a-gettextize>
+
+Any file produced by B<po4a-gettextize> should be manually reviewed, even when
+the script terminates successfully. You should skim over the PO file, ensuring
+that the B<msgid> and B<msgstr> actually match. It is not necessary to ensure
+that the translation is perfectly correct yet, as all entries are marked as
+fuzzy translations anyway. You only need to check for obvious matching issues
+because badly matched translations will be dumped in subsequent steps while you
+want to salvage them.
+
+Fortunately, this step does not require to master the target languages as you
+only want to recognize similar elements in each B<msgid> and its corresponding
+B<msgstr>. As a speaker of French, English, and some German myself, I can do
+this for all European languages at least, even if I cannot say one word of most
+of these languages. I sometimes manage to detect matching issues in non-Latin
+languages by looking at string length, phrase structures (does the amount of
+interrogation marks match?) and other clues, but I prefer when someone else can
+review those languages.
+
+If you detect a mismatch, edit the original and translation files as if
+B<po4a-gettextize> reported an error, and try again. Once you have a decent PO
+file for your previous translation, backup it until you get po4a working
+correctly.
+
+=head2 Running B<po4a> for the first time
+
+The easiest way to setup po4a is to write a B<po4a.conf> configuration file, and
+use the integrated B<po4a> program (B<po4a-updatepo> and B<po4a-translate> are
+deprecated). Please check the "CONFIGURATION FILE" Section in L<po4a(1)>
+documentation for more details.
+
+When B<po4a> runs for the first time, the current version of the master
+documents will be used to update the PO files containing the old translations
+that you salvaged through gettextization. This can take quite a long time,
+because many of the B<msgid>s of from the gettextization do not exactly match
+the elements of the POT file built from the recent master files. This forces
+gettext to search for the closest one using a costly string proximity algorithm.
+For example, the first run over the Perl documentation's French translation (5.5
+MB PO file) took about 48 hours (yes, two days) while the subsequent ones only
+take seconds.
+
+=head2 Moving your translations to production
+
+After this first run, the PO files are ready to be reviewed by translators. All
+entries were marked as fuzzy in the PO file by B<po4a-gettextization>, forcing
+their careful review before use. Translators should take each entry to verify
+that the salvaged translation actually match the current original text, update
+the translation on need, and remove the fuzzy markers.
+
+Once enough fuzzy markers are removed, B<po4a> will start generating the
+translation files on disk, and you're ready to move your translation workflow to
+production. Some projects find it useful to rely on weblate to coordinate
+between translators and maintainers, but that's beyond B<po4a>' scope.
+
+=head1 SEE ALSO
+
+L<po4a(1)>,
+L<po4a-normalize(1)>,
+L<po4a-translate(1)>,
+L<po4a-updatepo(1)>,
+L<po4a(7)>.
+
+=head1 AUTHORS
+
+ Denis Barbier <barbier@linuxfr.org>
+ Nicolas François <nicolas.francois@centraliens.net>
+ Martin Quinson (mquinson#debian.org)
+
+=head1 COPYRIGHT AND LICENSE
+
+Copyright 2002-2023 by SPI, inc.
+
+This program is free software; you may redistribute it and/or modify it
+under the terms of GPL v2.0 or later (see the COPYING file).
+
+=cut
+
+use 5.16.0;
+use strict;
+use warnings;
+
+use Getopt::Long qw(GetOptions);
+
+use Locale::Po4a::Chooser;
+use Locale::Po4a::TransTractor;
+use Locale::Po4a::Common;
+
+use Pod::Usage qw(pod2usage);
+
+our %debug = ( 'encoding' => 0, );
+
+Locale::Po4a::Common::textdomain('po4a');
+
+# This function produces one translated message catalog from two catalogs, an
+# original and a translation. This process is described in L<po4a(7)|po4a.7>,
+# section I<Gettextization: how does it work?>.
+
+sub gettextize {
+ my ( $poorig, $potrans ) = ( shift, shift );
+
+ my $pores = Locale::Po4a::Po->new();
+
+ my $please_fail = 0;
+ my $toobad = dgettext( "po4a",
+ "\nThe gettextization failed (once again). Don't give up, "
+ . "gettextizing is a subtle art, but this is only needed once "
+ . "to convert a project to the gorgeous luxus offered by po4a "
+ . "to translators."
+ . "\nPlease refer to the po4a(7) documentation, the section "
+ . "\"HOWTO convert a pre-existing translation to po4a?\" "
+ . "contains several hints to help you in your task" );
+
+ # Don't fail right now when the entry count does not match. Instead, give
+ # it a try so that the user can see where we fail (which is probably where
+ # the problem is).
+ if ( $poorig->count_entries_doc() > $potrans->count_entries_doc() ) {
+ warn wrap_mod(
+ "po4a gettextize",
+ dgettext(
+ "po4a",
+ "Original has more strings than the translation (%d>%d). "
+ . "Please fix it by editing the translated version to add "
+ . "some dummy entry."
+ ),
+ $poorig->count_entries_doc(),
+ $potrans->count_entries_doc()
+ );
+ $please_fail = 1;
+ } elsif ( $poorig->count_entries_doc() < $potrans->count_entries_doc() ) {
+ warn wrap_mod(
+ "po4a gettextize",
+ dgettext(
+ "po4a",
+ "Original has less strings than the translation (%d<%d). "
+ . "Please fix it by removing the extra entry from the "
+ . "translated file. You may need an addendum (cf po4a(7)) "
+ . "to reput the chunk in place after gettextization. A "
+ . "possible cause is that a text duplicated in the original "
+ . "is not translated the same way each time. Remove one of "
+ . "the translations, and you're fine."
+ ),
+ $poorig->count_entries_doc(),
+ $potrans->count_entries_doc()
+ );
+ $please_fail = 1;
+ }
+
+ for (
+ my ( $o, $t ) = ( 0, 0 ) ;
+ $o < $poorig->count_entries_doc() && $t < $potrans->count_entries_doc() ;
+ $o++, $t++
+ )
+ {
+ #
+ # Extract some informations
+
+ my ( $orig, $trans ) = ( $poorig->msgid_doc($o), $potrans->msgid_doc($t) );
+
+ # print STDERR "Matches [[$orig]]<<$trans>>\n";
+
+ my ( $reforig, $reftrans ) = ( $poorig->{po}{$orig}{'reference'}, $potrans->{po}{$trans}{'reference'} );
+ my ( $typeorig, $typetrans ) = ( $poorig->type_doc($o), $potrans->type_doc($t) );
+
+ #
+ # Make sure the type of both string exist
+ #
+ die wrap_mod( "po4a gettextize", "Internal error: type of original string number %s isn't provided", $o )
+ if ( $typeorig eq '' );
+
+ die wrap_mod( "po4a gettextize", "Internal error: type of translated string number %s isn't provided", $o )
+ if ( $typetrans eq '' );
+
+ #
+ # Make sure both type are the same
+ #
+ if ( $typeorig ne $typetrans ) {
+ $pores->write("gettextization.failed.po");
+ eval {
+ # Recode $trans into current charset, if possible
+ require I18N::Langinfo;
+ I18N::Langinfo->import(qw(langinfo CODESET));
+ my $codeset = langinfo( CODESET() );
+ Encode::from_to( $trans, $potrans->get_charset, $codeset );
+ };
+ die wrap_msg(
+ dgettext( "po4a",
+ "po4a gettextization: Structure disparity between "
+ . "original and translated files:\n"
+ . "msgid (at %s) is of type '%s' while\n"
+ . "msgstr (at %s) is of type '%s'.\n"
+ . "Original text: %s\n"
+ . "Translated text: %s\n"
+ . "(result so far dumped to gettextization.failed.po)" )
+ . "%s",
+ $reforig,
+ $typeorig,
+ $reftrans,
+ $typetrans,
+ $orig, $trans, $toobad
+ );
+ }
+
+ #
+ # Push the entry
+ #
+ my $flags;
+ if ( defined $poorig->{po}{$orig}{'flags'} ) {
+ $flags = $poorig->{po}{$orig}{'flags'} . " fuzzy";
+ } else {
+ $flags = "fuzzy";
+ }
+ $pores->push_raw(
+ 'msgid' => $orig,
+ 'msgstr' => $trans,
+ 'flags' => $flags,
+ 'type' => $typeorig,
+ 'reference' => $reforig,
+ 'conflict' => 1,
+ 'transref' => $potrans->{po}{$trans}{'reference'}
+ )
+ unless ( defined( $pores->{po}{$orig} )
+ and ( $pores->{po}{$orig}{'msgstr'} eq $trans ) )
+
+ # FIXME: maybe we should be smarter about what reference should be
+ # sent to push_raw.
+ }
+
+ # make sure we return a useful error message when entry count differ
+ die "$toobad\n" if $please_fail;
+
+ return $pores;
+}
+
+sub show_version {
+ Locale::Po4a::Common::show_version("po4a-gettextize");
+ exit 0;
+}
+
+my %opts = (
+ "verbose" => 0,
+ "debug" => 0,
+ "copyright-holder" => undef,
+ "msgid-bugs-address" => undef,
+ "package-name" => undef,
+ "package-version" => undef
+);
+
+my ($pofile) = ('-');
+my ( @masterfile, @locfile, $help_fmt, $help, $keep_temps, $type, @options );
+my ( $mastchar, $locchar );
+Getopt::Long::config( 'bundling', 'no_getopt_compat', 'no_auto_abbrev' );
+GetOptions(
+ 'help|h' => \$help,
+ 'help-format' => \$help_fmt,
+ 'keep-temps|k' => \$keep_temps,
+
+ 'master|m=s' => \@masterfile,
+ 'localized|l=s' => \@locfile,
+ 'po|p=s' => \$pofile,
+ 'format|f=s' => \$type,
+
+ 'master-charset|M=s' => \$mastchar,
+ 'localized-charset|L=s' => \$locchar,
+
+ 'option|o=s' => \@options,
+
+ 'copyright-holder=s' => \$opts{"copyright-holder"},
+ 'msgid-bugs-address=s' => \$opts{"msgid-bugs-address"},
+ 'package-name=s' => \$opts{"package-name"},
+ 'package-version=s' => \$opts{"package-version"},
+
+ 'verbose|v' => \$opts{"verbose"},
+ 'debug|d' => \$opts{"debug"},
+ 'version|V' => \&show_version
+) or pod2usage();
+
+# Argument check
+$help && pod2usage( -verbose => 1, -exitval => 0 );
+$help_fmt && Locale::Po4a::Chooser::list(0);
+pod2usage() if ( scalar @ARGV > 1 ) || ( scalar @masterfile < 1 );
+$locchar //= "UTF-8";
+
+foreach (@options) {
+ if (m/^([^=]*)=(.*)$/) {
+ $opts{$1} = "$2";
+ } else {
+ $opts{$_} = 1;
+ }
+}
+
+if ( scalar @locfile == 0 ) {
+ die wrap_msg(
+ gettext(
+ "You must provide the same amount of master files and localized files to synchronize them, "
+ . "as po4a-gettextize is intended to synchronize master files and previously existing translations. "
+ . "If just want to extract POT files of your master files, please use po4a-updatepo. "
+ . "Please note that the most convenient way of using po4a is to write a po4a.conf file and use the integrated po4a(1) program."
+ )
+ );
+}
+
+# Check file existence
+foreach my $file ( @masterfile, @locfile ) {
+ $file eq '-' || -e $file || die wrap_msg( gettext("File %s does not exist."), $file );
+}
+
+print wrap_msg(
+ gettext(
+ "po4a-gettextize is only useful to convert previously existing translations to a PO based workflow. "
+ . "Once you successfully converted your project to po4a, you should use the po4a(1) program to maintain it and update your translations."
+ )
+);
+
+# Declare the TransTractor parsers
+my ( $mastertt, $transtt ) = ( Locale::Po4a::Chooser::new( $type, %opts ), Locale::Po4a::Chooser::new( $type, %opts ) );
+
+# Parse master file forcing conversion to utf if it's not in ascii
+foreach my $file (@masterfile) {
+ $mastertt->read( $file, $file, $mastchar // '' );
+}
+$mastertt->parse;
+
+# Implementation note:
+# In practice, po4a-gettextize uses the po4a parsers on both the original and the
+# translation files to extract two PO files. A third PO file is built from them
+# taking strings from the second as translation of strings from the first.
+
+# Let's merge the two transtractor files
+
+foreach my $file (@locfile) {
+ $transtt->read( $file, $file, $locchar );
+}
+$transtt->parse;
+
+if ($keep_temps) {
+ $mastertt->getpoout()->write("po4atemp.master.po");
+ $transtt->getpoout()->write("po4atemp.localized.po");
+ print wrap_msg(
+ dgettext(
+ "po4a", "Temporary master and localized POT files dumped to po4atemp.master.po and po4atemp.localized.po"
+ )
+ );
+}
+my $mergedpo = gettextize( $mastertt->getpoout(), $transtt->getpoout() );
+
+$mergedpo->write($pofile);
+
+__END__