summaryrefslogtreecommitdiffstats
path: root/po4a-gettextize
blob: 0729c4ed43c0df29cdb29f082fbcf70b6db859f9 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
#! /usr/bin/env perl
eval 'exec perl -S $0 ${1+"$@"}'
  if $running_under_some_shell;

# po4a-gettextize -- convert an original file to a PO file
#
# Copyright 2002-2023 by SPI, inc.
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of GPL v2.0 or later (see COPYING).

=encoding UTF-8

=head1 NAME

po4a-gettextize - convert an original file (and its translation) to a PO file

=head1 SYNOPSIS

B<po4a-gettextize> B<-f> I<fmt> B<-m> I<master.doc> B<-l> I<XX.doc> B<-p> I<XX.po>

(I<XX.po> is the output, all others are inputs)

=head1 DESCRIPTION

po4a (PO for anything) eases the maintenance of documentation translation using
the classical gettext tools. The main feature of po4a is that it decouples the
translation of content from its document structure.  Please refer to the page
L<po4a(7)> for a gentle introduction to this project.

The B<po4a-gettextize> script helps you convert your previously existing
translations into a po4a-based workflow. This is only to be done once to salvage
an existing translation while converting to po4a, not on a regular basis after
the conversion of your project. This tedious process is explained in detail in
Section 'Converting a manual translation to po4a' below.

You must provide both a master file (e.g., the source in English) and an
existing translated file (e.g., a previous translation attempt without po4a). If
you provide more than one master or translation file, they will be used in
sequence, but it may be easier to gettextize each page or chapter separately and
then use B<msgmerge> to merge all produced PO files. As you wish.

If the master document has non-ASCII characters, the new generated PO file will
be in UTF-8. If the master document is completely in ASCII, the generated
PO will use the encoding of the translated input document.

=head1 OPTIONS

=over 4

=item B<-f>, B<--format>

Format of the documentation you want to handle. Use the B<--help-format>
option to see the list of available formats.

=item B<-m>, B<--master>

File containing the master document to translate. You can use this option
multiple times if you want to gettextize multiple documents.

=item B<-M>, B<--master-charset>

Charset of the file containing the document to translate.

=item B<-l>, B<--localized>

File containing the localized (translated) document. If you provided
multiple master files, you may wish to provide multiple localized files by
using this option more than once.

=item B<-L>, B<--localized-charset>

Charset of the file containing the localized document.

=item B<-p>, B<--po>

File where the message catalog should be written. If not given, the message
catalog will be written to the standard output.

=item B<-o>, B<--option>

Extra option(s) to pass to the format plugin. See the documentation of each
plugin for more information about the valid options and their meanings. For
example, you could pass '-o tablecells' to the AsciiDoc parser, while the
text parser would accept '-o tabs=split'.

=item B<-h>, B<--help>

Show a short help message.

=item B<--help-format>

List the documentation formats understood by po4a.

=item B<-k>, B<--keep-temps>

Keep the temporary master and localized POT files built before merging.
This can be useful to understand why these files get desynchronized, leading
to gettextization problems.

=item B<-V>, B<--version>

Display the version of the script and exit.

=item B<-v>, B<--verbose>

Increase the verbosity of the program.

=item B<-d>, B<--debug>

Output some debugging information.

=item B<--msgid-bugs-address> I<email@address>

Set the report address for msgid bugs. By default, the created POT files
have no Report-Msgid-Bugs-To fields.

=item B<--copyright-holder> I<string>

Set the copyright holder in the POT header. The default value is
"Free Software Foundation, Inc."

=item B<--package-name> I<string>

Set the package name for the POT header. The default is "PACKAGE".

=item B<--package-version> I<string>

Set the package version for the POT header. The default is "VERSION".

=back

=head2 Converting a manual translation to po4a

B<po4a-gettextize> synchronizes the master and localized files to extract their
content into a PO file. The content of the master file gives the B<msgid> while
the content of the localized file gives the B<msgstr>. This process is somewhat
fragile: the Nth string of the translated file is supposed to be the translation
of the Nth string in the original.

Gettextization works best if you manage to retrieve the exact version of the
original document that was used for translation. Even so, you may need to fiddle
with both master and localized files to align their structure if it was changed
by the original translator, so working on files' copies is advised.

Internally, each po4a parser reports the syntactical type of each extracted
string. This is how desynchronizations are detected during the gettextization.
In the example depicted below, it is very unlikely that the 4th string in
translation (of type 'chapter') is the translation of the 4th string in original
(of type 'paragraph'). It is more likely that a new paragraph was added to the
original, or that two original paragraphs were merged together in the
translation.

    Original         Translation

  chapter            chapter
    paragraph          paragraph
    paragraph          paragraph
    paragraph        chapter
  chapter              paragraph
    paragraph          paragraph

B<po4a-gettextize> will verbosely diagnose any structure desynchronization. When
this happens, you should manually edit the files to add fake paragraphs or
remove some content here and there until the structures of both files actually
match. Some tricks are given below to salvage most of the existing
translation while doing so.

If you are lucky enough to have a perfect match in the file structures out of
the box, building a correct PO file is a matter of seconds. Otherwise, you will
soon understand why this process has such an ugly name :) Even so,
gettextization often remains faster than translating everything again. I
gettextized the French translation of the whole Perl documentation in one day
despite the I<many> synchronization issues. Given the amount of text (2MB of
original text), restarting the translation without first salvaging the old
translations would have required several months of work. In addition, this grunt
work is the price to pay to get the comfort of po4a. Once converted, the
synchronization between master documents and translations will always be fully
automatic.

After a successful gettextization, the produced documents should be manually
checked for undetected disparities and silent errors, as explained below.

=head3 Hints and tricks for the gettextization process

The gettextization stops as soon as a desynchronization is detected. When this
happens, you need to edit the files as much as needed to re-align the files'
structures. B<po4a-gettextize> is rather verbose when things go wrong. It
reports the strings that don't match, their positions in the text, and the type
of each of them. Moreover, the PO file generated so far is dumped as
F<gettextization.failed.po> for further inspection.

Here are some tricks to help you in this tedious process and ensure that you
salvage the most of the previous translation:

=over

=item

Remove all extra content of the translations, such as the section giving credits
to the translators. They should be added separately to B<po4a> as addenda (see
L<po4a(7)>).

=item

When editing the files to align their structures, prefer editing the translation
if possible. Indeed, if the changes to the original are too intrusive, the old
and new versions will not be matched during the first po4a run after
gettextization (see below). Any unmatched translation will be dumped anyway.
That being said, you still want to edit the original document if it's too hard
to get the gettextization to proceed otherwise, even if it means that one
paragraph of the translation is dumped. The important thing is to get a first PO
file to start with.

=item

Do not hesitate to kill any original content that would not exist in the
translated version. This content will be automatically reintroduced afterward,
when synchronizing the PO file with the document.

=item

You should probably inform the original author of any structural change in the
translation that seems justified. Issues in the original document should be
reported to the author. Fixing them in your translation only fixes them for a
part of the community. Plus, it is impossible to do so when using po4a ;) But
you probably want to wait until the end of the conversion to B<po4a> before
changing the original files.

=item

Sometimes, the paragraph contents do match, but not their types. Fixing it is
rather format-dependent. In POD and man, it often comes from the fact that one
of them contains a line beginning with a white space while the other does not.
In those formats, such paragraphs cannot be wrapped and thus become a different
type. Just remove the space and you are fine. It may also be a typo in the tag
name in XML.

Likewise, two paragraphs may get merged together in POD when the separating
line contains some spaces, or when there is no empty line between the B<=item>
line and the content of the item.

=item

Sometimes, the desynchronization message seems odd because the translation is
attached to the wrong original paragraph. It is the sign of an undetected issue
earlier in the process. Search for the actual desynchronization point by
inspecting the file F<gettextization.failed.po> that was produced, and fix the
problem where it really is.

=item

Other issues may come from duplicated strings in either the original or
translation. Duplicated strings are merged in PO files, with two references.
This constitutes a difficulty for the gettextization algorithm, which is a simple
one-to-one pairing between the B<msgid>s of both the master and the localized
files. It is however believed that recent versions of po4a deal properly with
duplicated strings, so you should report any remaining issue that you may encounter.

=back

=head2 Reviewing files produced by B<po4a-gettextize>

Any file produced by B<po4a-gettextize> should be manually reviewed, even when
the script terminates successfully. You should skim over the PO file, ensuring
that the B<msgid> and B<msgstr> actually match. It is not necessary to ensure
that the translation is perfectly correct yet, as all entries are marked as
fuzzy translations anyway. You only need to check for obvious matching issues
because badly matched translations will be dumped in subsequent steps while you
want to salvage them.

Fortunately, this step does not require mastering the target languages, as you
only want to recognize similar elements in each B<msgid> and its corresponding
B<msgstr>. As a speaker of French, English, and some German myself, I can do
this for all European languages at least, even if I cannot say one word of most
of these languages. I sometimes manage to detect matching issues in non-Latin
languages by looking at string length, phrase structures (does the number of
question marks match?) and other clues, but I prefer when someone else can
review those languages.

If you detect a mismatch, edit the original and translation files as if
B<po4a-gettextize> reported an error, and try again. Once you have a decent PO
file for your previous translation, back it up until you get po4a working
correctly.

=head2 Running B<po4a> for the first time

The easiest way to set up po4a is to write a B<po4a.conf> configuration file, and
use the integrated B<po4a> program (B<po4a-updatepo> and B<po4a-translate> are
deprecated). Please check the "CONFIGURATION FILE" Section in L<po4a(1)>
documentation for more details.

When B<po4a> runs for the first time, the current version of the master
documents will be used to update the PO files containing the old translations
that you salvaged through gettextization. This can take quite a long time,
because many of the B<msgid>s from the gettextization do not exactly match
the elements of the POT file built from the recent master files. This forces
gettext to search for the closest one using a costly string proximity algorithm.
For example, the first run over the Perl documentation's French translation (5.5
MB PO file) took about 48 hours (yes, two days) while the subsequent ones only
take seconds.

=head2 Moving your translations to production

After this first run, the PO files are ready to be reviewed by translators. All
entries were marked as fuzzy in the PO file by B<po4a-gettextize>, forcing
their careful review before use. Translators should take each entry to verify
that the salvaged translation actually matches the current original text, update
the translation as needed, and remove the fuzzy markers.

Once enough fuzzy markers are removed, B<po4a> will start generating the
translation files on disk, and you're ready to move your translation workflow to
production. Some projects find it useful to rely on weblate to coordinate
between translators and maintainers, but that's beyond B<po4a>'s scope.

=head1 SEE ALSO

L<po4a(1)>,
L<po4a-normalize(1)>,
L<po4a-translate(1)>,
L<po4a-updatepo(1)>,
L<po4a(7)>.

=head1 AUTHORS

 Denis Barbier <barbier@linuxfr.org>
 Nicolas François <nicolas.francois@centraliens.net>
 Martin Quinson (mquinson#debian.org)

=head1 COPYRIGHT AND LICENSE

Copyright 2002-2023 by SPI, inc.

This program is free software; you may redistribute it and/or modify it
under the terms of GPL v2.0 or later (see the COPYING file).

=cut

use 5.16.0;
use strict;
use warnings;

use Getopt::Long qw(GetOptions);

use Locale::Po4a::Chooser;
use Locale::Po4a::TransTractor;
use Locale::Po4a::Common;

use Pod::Usage qw(pod2usage);

# Package-level debug switches; the only one declared here is 'encoding',
# and it starts disabled.
# NOTE(review): nothing in this script reads %debug -- presumably it is
# consulted by one of the po4a modules; confirm before removing.
our %debug = ( encoding => 0 );

# Hand the 'po4a' domain name to the po4a Common helper.
# NOTE(review): presumably this selects the gettext catalog used for the
# script's messages -- confirm against Locale::Po4a::Common.
Locale::Po4a::Common::textdomain('po4a');

# Build one message catalog out of two parallel catalogs: the entries of the
# original document supply the msgid, the entries of the translated document
# supply the msgstr, and they are paired strictly by position (entry N of the
# original with entry N of the translation).
#
# Arguments:
#   $poorig  - catalog extracted from the master document
#   $potrans - catalog extracted from the localized document
# Both are used through count_entries_doc(), msgid_doc(), type_doc() and their
# {po} hash keyed by msgid; get_charset() is also called on $potrans.
#
# Returns the newly built Locale::Po4a::Po catalog, in which every entry
# carries the "fuzzy" flag.
#
# Dies when:
#   - an entry of either catalog has no type (internal error);
#   - two paired entries have different types; the entries paired so far are
#     first written to gettextization.failed.po;
#   - the two catalogs do not hold the same number of entries. This is only
#     reported after the pairing loop has run, so that a type mismatch (which
#     is probably where the real problem is) gets reported first.
sub gettextize {
    my ( $poorig, $potrans ) = @_;

    my $pores = Locale::Po4a::Po->new();

    my $toobad = dgettext( "po4a",
            "\nThe gettextization failed (once again). Don't give up, "
          . "gettextizing is a subtle art, but this is only needed once "
          . "to convert a project to the gorgeous luxus offered by po4a "
          . "to translators."
          . "\nPlease refer to the po4a(7) documentation, the section "
          . "\"HOWTO convert a pre-existing translation to po4a?\" "
          . "contains several hints to help you in your task" );

    # Neither input catalog is pushed to in this function, so the entry
    # counts are read once up front.
    # NOTE(review): assumes count_entries_doc() is a plain accessor -- confirm.
    my $count_orig  = $poorig->count_entries_doc();
    my $count_trans = $potrans->count_entries_doc();

    # Don't fail right now when the entry count does not match. Instead, give
    # it a try so that the user can see where we fail (which is probably where
    # the problem is).
    my $please_fail = 0;
    if ( $count_orig > $count_trans ) {
        warn wrap_mod(
            "po4a gettextize",
            dgettext(
                "po4a",
                "Original has more strings than the translation (%d>%d). "
                  . "Please fix it by editing the translated version to add "
                  . "some dummy entry."
            ),
            $count_orig,
            $count_trans
        );
        $please_fail = 1;
    } elsif ( $count_orig < $count_trans ) {
        warn wrap_mod(
            "po4a gettextize",
            dgettext(
                "po4a",
                "Original has less strings than the translation (%d<%d). "
                  . "Please fix it by removing the extra entry from the "
                  . "translated file. You may need an addendum (cf po4a(7)) "
                  . "to reput the chunk in place after gettextization. A "
                  . "possible cause is that a text duplicated in the original "
                  . "is not translated the same way each time. Remove one of "
                  . "the translations, and you're fine."
            ),
            $count_orig,
            $count_trans
        );
        $please_fail = 1;
    }

    # Only the positions present in both catalogs can be paired.
    my $paired = $count_orig < $count_trans ? $count_orig : $count_trans;

    for my $idx ( 0 .. $paired - 1 ) {

        #
        # Extract the text, source reference and type of both entries
        #
        my ( $orig, $trans ) = ( $poorig->msgid_doc($idx), $potrans->msgid_doc($idx) );
        my ( $reforig, $reftrans ) = ( $poorig->{po}{$orig}{'reference'}, $potrans->{po}{$trans}{'reference'} );
        my ( $typeorig, $typetrans ) = ( $poorig->type_doc($idx), $potrans->type_doc($idx) );

        #
        # Make sure the type of both strings exists. An undefined type is
        # treated like an empty one instead of triggering an "uninitialized
        # value" warning first.
        #
        die wrap_mod( "po4a gettextize", "Internal error: type of original string number %s isn't provided", $idx )
          if ( !defined $typeorig or $typeorig eq '' );

        die wrap_mod( "po4a gettextize", "Internal error: type of translated string number %s isn't provided", $idx )
          if ( !defined $typetrans or $typetrans eq '' );

        #
        # Make sure both types are the same
        #
        if ( $typeorig ne $typetrans ) {

            # Save what was paired so far, for inspection by the user.
            $pores->write("gettextization.failed.po");

            # Best effort: recode $trans into the charset of the current
            # locale for the error message. Any failure in here is ignored
            # on purpose, and $trans is then reported as it is.
            eval {
                require Encode;
                require I18N::Langinfo;
                I18N::Langinfo->import(qw(langinfo CODESET));
                my $codeset = langinfo( CODESET() );
                Encode::from_to( $trans, $potrans->get_charset, $codeset );
            };
            die wrap_msg(
                dgettext( "po4a",
                        "po4a gettextization: Structure disparity between "
                      . "original and translated files:\n"
                      . "msgid (at %s) is of type '%s' while\n"
                      . "msgstr (at %s) is of type '%s'.\n"
                      . "Original text: %s\n"
                      . "Translated text: %s\n"
                      . "(result so far dumped to gettextization.failed.po)" )
                  . "%s",
                $reforig,
                $typeorig,
                $reftrans,
                $typetrans,
                $orig, $trans, $toobad
            );
        }

        #
        # Push the entry, unless this msgid is already in the result with
        # this very translation.
        #
        next
          if ( defined( $pores->{po}{$orig} )
            and ( $pores->{po}{$orig}{'msgstr'} eq $trans ) );

        # Keep the flags of the original entry and always add "fuzzy".
        my $origflags = $poorig->{po}{$orig}{'flags'};
        my $flags     = defined $origflags ? $origflags . " fuzzy" : "fuzzy";

        # FIXME: maybe we should be smarter about what reference should be
        #        sent to push_raw.
        $pores->push_raw(
            'msgid'     => $orig,
            'msgstr'    => $trans,
            'flags'     => $flags,
            'type'      => $typeorig,
            'reference' => $reforig,
            'conflict'  => 1,
            'transref'  => $reftrans
        );
    }

    # make sure we return a useful error message when entry count differ
    die "$toobad\n" if $please_fail;

    return $pores;
}

# Callback bound to --version/-V: delegates to the po4a Common helper with
# this script's name, then exits with status 0. Any arguments passed by the
# option parser are ignored.
sub show_version {
    my $script_name = "po4a-gettextize";
    Locale::Po4a::Common::show_version($script_name);
    exit 0;
}

# Settings handed to the format parsers when they are created. The four POT
# header fields stay undefined unless set on the command line; values given
# with -o are added to this hash later on.
my %opts = (
    "verbose"            => 0,
    "debug"              => 0,
    "copyright-holder"   => undef,
    "msgid-bugs-address" => undef,
    "package-name"       => undef,
    "package-version"    => undef
);

# Output catalog; '-' (the default) stands for the standard output.
my $pofile = '-';

# Values filled in by the command line parsing below.
my ( @masterfile, @locfile, $help_fmt, $help, $keep_temps, $type, @options );
my ( $mastchar, $locchar );

# Configure() is the current name of this routine; config() is only kept by
# Getopt::Long as a legacy alias.
Getopt::Long::Configure( 'bundling', 'no_getopt_compat', 'no_auto_abbrev' );
GetOptions(
    'help|h'       => \$help,
    'help-format'  => \$help_fmt,
    'keep-temps|k' => \$keep_temps,

    # Inputs, output and format; -m and -l may be repeated.
    'master|m=s'    => \@masterfile,
    'localized|l=s' => \@locfile,
    'po|p=s'        => \$pofile,
    'format|f=s'    => \$type,

    'master-charset|M=s'    => \$mastchar,
    'localized-charset|L=s' => \$locchar,

    # Raw "name" or "name=value" settings for the format parser.
    'option|o=s' => \@options,

    'copyright-holder=s'   => \$opts{"copyright-holder"},
    'msgid-bugs-address=s' => \$opts{"msgid-bugs-address"},
    'package-name=s'       => \$opts{"package-name"},
    'package-version=s'    => \$opts{"package-version"},

    'verbose|v' => \$opts{"verbose"},
    'debug|d'   => \$opts{"debug"},
    'version|V' => \&show_version
) or pod2usage();

# Argument check
pod2usage( -verbose => 1, -exitval => 0 ) if $help;
Locale::Po4a::Chooser::list(0) if $help_fmt;
pod2usage() if ( scalar @ARGV > 1 ) || ( scalar @masterfile < 1 );
$locchar //= "UTF-8";

# Fold the -o values into the parser settings: "name=value" stores the value,
# a bare "name" is stored as 1.
foreach my $option (@options) {
    if ( $option =~ m/^([^=]*)=(.*)$/ ) {
        $opts{$1} = "$2";
    } else {
        $opts{$option} = 1;
    }
}

# At least one localized file is mandatory.
if ( scalar @locfile == 0 ) {
    die wrap_msg(
        gettext(
                "You must provide the same amount of master files and localized files to synchronize them, "
              . "as po4a-gettextize is intended to synchronize master files and previously existing translations. "
              . "If just want to extract POT files of your master files, please use po4a-updatepo. "
              . "Please note that the most convenient way of using po4a is to write a po4a.conf file and use the integrated po4a(1) program."
        )
    );
}

# Check file existence ('-' is accepted without any check)
foreach my $file ( @masterfile, @locfile ) {
    next if $file eq '-';
    -e $file or die wrap_msg( gettext("File %s does not exist."), $file );
}

# When no PO file was requested, the catalog itself is written to the
# standard output. Informational notices must then go to the standard error,
# otherwise they end up in the middle of the emitted PO data. When a PO file
# is given, notices stay on the standard output as before.
my $notice_fh = $pofile eq '-' ? \*STDERR : \*STDOUT;

print {$notice_fh} wrap_msg(
    gettext(
            "po4a-gettextize is only useful to convert previously existing translations to a PO based workflow. "
          . "Once you successfully converted your project to po4a, you should use the po4a(1) program to maintain it and update your translations."
    )
);

# Implementation note:
# In practice, po4a-gettextize uses the po4a parsers on both the original and the
# translation files to extract two PO files. A third PO file is built from them
# taking strings from the second as translation of strings from the first.

# Declare the TransTractor parsers: same format and settings for both
my ( $mastertt, $transtt ) = ( Locale::Po4a::Chooser::new( $type, %opts ), Locale::Po4a::Chooser::new( $type, %opts ) );

# Read all master files into the first parser, with the charset given by -M
# (an empty string when the option was not used), then parse them
foreach my $file (@masterfile) {
    $mastertt->read( $file, $file, $mastchar // '' );
}
$mastertt->parse;

# Same for the localized files, with the charset given by -L (UTF-8 when the
# option was not used)
foreach my $file (@locfile) {
    $transtt->read( $file, $file, $locchar );
}
$transtt->parse;

# On request, keep a copy of both intermediate catalogs
if ($keep_temps) {
    $mastertt->getpoout()->write("po4atemp.master.po");
    $transtt->getpoout()->write("po4atemp.localized.po");
    print {$notice_fh} wrap_msg(
        dgettext(
            "po4a", "Temporary master and localized POT files dumped to po4atemp.master.po and po4atemp.localized.po"
        )
    );
}

# Let's merge the two transtractor catalogs and write the result
my $mergedpo = gettextize( $mastertt->getpoout(), $transtt->getpoout() );

$mergedpo->write($pofile);

__END__