summaryrefslogtreecommitdiffstats
path: root/lib/Locale/Po4a/TransTractor.pm
diff options
context:
space:
mode:
Diffstat (limited to 'lib/Locale/Po4a/TransTractor.pm')
-rw-r--r--lib/Locale/Po4a/TransTractor.pm1337
1 files changed, 1337 insertions, 0 deletions
diff --git a/lib/Locale/Po4a/TransTractor.pm b/lib/Locale/Po4a/TransTractor.pm
new file mode 100644
index 0000000..0bc5b8f
--- /dev/null
+++ b/lib/Locale/Po4a/TransTractor.pm
@@ -0,0 +1,1337 @@
+#!/usr/bin/perl -w
+
+require Exporter;
+
+package Locale::Po4a::TransTractor;
+use DynaLoader;
+
+use 5.16.0;
+use strict;
+use warnings;
+
+use subs qw(makespace);
+use vars qw($VERSION @ISA @EXPORT);
+$VERSION = "0.72";
+@ISA = qw(DynaLoader);
+@EXPORT = qw(new process translate
+ read write readpo writepo
+ getpoout setpoout get_in_charset get_out_charset handle_yaml);
+
+# Try to use a C extension if present.
+eval("bootstrap Locale::Po4a::TransTractor $VERSION");
+
+use Carp qw(croak confess);
+use Locale::Po4a::Po;
+use Locale::Po4a::Common;
+
+use File::Path; # mkdir before write
+use File::Spec;
+
+=encoding UTF-8
+
+=head1 NAME
+
+Locale::Po4a::TransTractor - generic trans(lator ex)tractor.
+
+=head1 DESCRIPTION
+
+The po4a (PO for anything) project goal is to ease translations (and more
+interestingly, the maintenance of translations) using gettext tools on
+areas where they were not expected like documentation.
+
+This class is the ancestor of every po4a parser used to parse a document, to
+search translatable strings, to extract them to a PO file and to replace them by
+their translation in the output document.
+
+More formally, it takes the following arguments as input:
+
+=over 2
+
+=item -
+
+a document to translate;
+
+=item -
+
+a PO file containing the translations to use.
+
+=back
+
+As output, it produces:
+
+=over 2
+
+=item -
+
+another PO file, resulting of the extraction of translatable strings from
+the input document;
+
+=item -
+
+a translated document, with the same structure than the one in input, but
+with all translatable strings replaced with the translations found in the
+PO file provided in input.
+
+=back
+
+Here is a graphical representation of this:
+
+ Input document --\ /---> Output document
+ \ / (translated)
+ +-> parse() function -----+
+ / \
+ Input PO --------/ \---> Output PO
+ (extracted)
+
+=head1 FUNCTIONS YOUR PARSER SHOULD OVERRIDE
+
+=over 4
+
+=item parse()
+
+This is where all the work takes place: the parsing of input documents, the
+generation of output, and the extraction of the translatable strings. This
+is pretty simple using the provided functions presented in the section
+B<INTERNAL FUNCTIONS> below. See also the B<SYNOPSIS>, which presents an
+example.
+
+This function is called by the process() function below, but if you choose
+to use the new() function, and to add content manually to your document,
+you will have to call this function yourself.
+
+=item docheader()
+
+This function returns the header we should add to the produced document,
+quoted properly to be a comment in the target language. See the section
+B<Educating developers about translations>, from L<po4a(7)|po4a.7>, for what
+it is good for.
+
+=back
+
+=cut
+
+sub docheader { }
+
+sub parse { }
+
+=head1 SYNOPSIS
+
+The following example parses a list of paragraphs beginning with "<p>". For the sake
+of simplicity, we assume that the document is well formatted, i.e. that '<p>'
+tags are the only tags present, and that this tag is at the very beginning
+of each paragraph.
+
+ sub parse {
+ my $self = shift;
+
+ PARAGRAPH: while (1) {
+ my ($paragraph,$pararef)=("","");
+ my $first=1;
+ my ($line,$lref)=$self->shiftline();
+ while (defined($line)) {
+ if ($line =~ m/<p>/ && !$first--; ) {
+ # Not the first time we see <p>.
+ # Reput the current line in input,
+ # and put the built paragraph to output
+ $self->unshiftline($line,$lref);
+
+ # Now that the document is formed, translate it:
+ # - Remove the leading tag
+ $paragraph =~ s/^<p>//s;
+
+ # - push to output the leading tag (untranslated) and the
+ # rest of the paragraph (translated)
+ $self->pushline( "<p>"
+ . $self->translate($paragraph,$pararef)
+ );
+
+ next PARAGRAPH;
+ } else {
+ # Append to the paragraph
+ $paragraph .= $line;
+ $pararef = $lref unless(length($pararef));
+ }
+
+ # Reinit the loop
+ ($line,$lref)=$self->shiftline();
+ }
+ # Did not get a defined line? End of input file.
+ return;
+ }
+ }
+
+Once you've implemented the parse function, you can use your document
+class, using the public interface presented in the next section.
+
+=head1 PUBLIC INTERFACE for scripts using your parser
+
+=head2 Constructor
+
+=over 4
+
+=item process(%)
+
+This function can do all you need to do with a po4a document in one
+invocation. Its arguments must be packed as a hash. ACTIONS:
+
+=over 3
+
+=item a.
+
+Reads all the PO files specified in po_in_name
+
+=item b.
+
+Reads all original documents specified in file_in_name
+
+=item c.
+
+Parses the document
+
+=item d.
+
+Reads and applies all the addenda specified
+
+=item e.
+
+Writes the translated document to file_out_name (if given)
+
+=item f.
+
+Writes the extracted PO file to po_out_name (if given)
+
+=back
+
+ARGUMENTS, beside the ones accepted by new() (with expected type):
+
+=over 4
+
+=item file_in_name (@)
+
+List of filenames where we should read the input document.
+
+=item file_in_charset ($)
+
+Charset used in the input document (if it isn't specified, use UTF-8).
+
+=item file_out_name ($)
+
+Filename where we should write the output document.
+
+=item file_out_charset ($)
+
+Charset used in the output document (if it isn't specified, use UTF-8).
+
+=item po_in_name (@)
+
+List of filenames where we should read the input PO files from, containing
+the translation which will be used to translate the document.
+
+=item po_out_name ($)
+
+Filename where we should write the output PO file, containing the strings
+extracted from the input document.
+
+=item addendum (@)
+
+List of filenames where we should read the addenda from.
+
+=item addendum_charset ($)
+
+Charset for the addenda.
+
+=back
+
+=item new(%)
+
+Create a new po4a document. Accepted options (in the hash passed as a parameter):
+
+=over 4
+
+=item verbose ($)
+
+Sets the verbosity.
+
+=item debug ($)
+
+Sets the debugging.
+
+=item wrapcol ($)
+
+The column at which we should wrap text in output document (default: 76).
+
+The negative value means not to wrap lines at all.
+
+=back
+
+Also it accepts next options for underlying Po-files: B<porefs>,
+B<copyright-holder>, B<msgid-bugs-address>, B<package-name>,
+B<package-version>, B<wrap-po>.
+
+=cut
+
+sub process {
+ my $self = shift;
+ ## Parameters are passed as an hash to avoid long and error-prone parameter lists
+ my %params = @_;
+
+ # Parameter checking
+ foreach ( keys %params ) {
+ confess "Unexpected parameter to process(): $_. Please report that bug."
+ unless ( $_ eq 'po_in_name'
+ || $_ eq 'po_out_name'
+ || $_ eq 'file_in_name'
+ || $_ eq 'file_in_charset'
+ || $_ eq 'file_out_name'
+ || $_ eq 'file_out_charset'
+ || $_ eq 'addendum'
+ || $_ eq 'addendum_charset'
+ || $_ eq 'srcdir'
+ || $_ eq 'destdir'
+ || $_ eq 'calldir' );
+ }
+
+ $self->{TT}{'file_in_charset'} = $params{'file_in_charset'} // 'UTF-8';
+ $self->{TT}{'file_out_charset'} = $params{'file_out_charset'} // 'UTF-8';
+ $self->{TT}{'addendum_charset'} = $params{'addendum_charset'};
+
+ our ( $destdir, $srcdir, $calldir ) = ( $params{'destdir'}, $params{'srcdir'}, $params{'calldir'} );
+
+ sub _input_file {
+ my $filename = $_[0];
+ return $filename if ( File::Spec->file_name_is_absolute($filename) );
+ foreach ( ( $destdir, $srcdir, $calldir ) ) {
+ next unless defined $_;
+ my $p = File::Spec->catfile( $_, $filename );
+ return $p if -e $p;
+ }
+ return $filename;
+ }
+
+ sub _output_file {
+ my $filename = $_[0];
+ return $filename if ( File::Spec->file_name_is_absolute($filename) );
+ foreach ( ( $destdir, $calldir ) ) {
+ next unless defined $_;
+ return File::Spec->catfile( $_, $filename ) if -d $_ and -w $_;
+ }
+ return $filename;
+ }
+
+ foreach my $file ( @{ $params{'po_in_name'} } ) {
+ my $infile = _input_file($file);
+ print STDERR wrap_mod( "po4a::transtractor::process", "Read PO file $infile" )
+ if $self->debug();
+ $self->readpo($infile);
+ }
+ foreach my $file ( @{ $params{'file_in_name'} } ) {
+ my $infile = _input_file($file);
+ print STDERR wrap_mod( "po4a::transtractor::process", "Read document $infile" )
+ if $self->debug();
+ $self->read( $infile, $file, $params{'file_in_charset'} );
+ }
+ print STDERR wrap_mod( "po4a::transtractor::process", "Call parse()" ) if $self->debug();
+ $self->parse();
+ print STDERR wrap_mod( "po4a::transtractor::process", "Done parse()" ) if $self->debug();
+ foreach my $file ( @{ $params{'addendum'} } ) {
+ my $infile = _input_file($file);
+ print STDERR wrap_mod( "po4a::transtractor::process", "Apply addendum $infile" )
+ if $self->debug();
+ $self->addendum($file) || die "An addendum failed\n";
+ }
+
+ if ( defined $params{'file_out_name'} ) {
+ my $outfile = _output_file( $params{'file_out_name'} );
+ print STDERR wrap_mod( "po4a::transtractor::process", "Write document $outfile" )
+ if $self->debug();
+ $self->write( $outfile, $self->{TT}{'file_out_charset'} );
+ }
+ if ( defined $params{'po_out_name'} ) {
+ my $outfile = _output_file( $params{'po_out_name'} );
+ print STDERR wrap_mod( "po4a::transtractor::process", "Write PO file $outfile" )
+ if $self->debug();
+ $self->writepo($outfile);
+ }
+ return $self;
+}
+
+sub new {
+ ## Determine if we were called via an object-ref or a classname
+ my $this = shift;
+ my $class = ref($this) || $this;
+ my $self = {};
+ my %options = @_;
+ ## Bless ourselves into the desired class and perform any initialization
+ bless $self, $class;
+
+ ## initialize the plugin
+ # prevent the plugin from croaking on the options intended for Po.pm or ourself
+ $self->{options}{'porefs'} = '';
+ $self->{options}{'copyright-holder'} = '';
+ $self->{options}{'msgid-bugs-address'} = '';
+ $self->{options}{'package-name'} = '';
+ $self->{options}{'package-version'} = '';
+ $self->{options}{'wrap-po'} = '';
+ $self->{options}{'wrapcol'} = '';
+
+ # let the plugin parse the options and such
+ $self->initialize(%options);
+
+ ## Create our private data
+ my %po_options;
+ $po_options{'porefs'} = $options{'porefs'};
+ $po_options{'copyright-holder'} = $options{'copyright-holder'};
+ $po_options{'msgid-bugs-address'} = $options{'msgid-bugs-address'};
+ $po_options{'package-name'} = $options{'package-name'};
+ $po_options{'package-version'} = $options{'package-version'};
+ $po_options{'wrap-po'} = $options{'wrap-po'};
+
+ # private data
+ $self->{TT} = ();
+ $self->{TT}{po_in} = Locale::Po4a::Po->new( \%po_options );
+ $self->{TT}{po_out} = Locale::Po4a::Po->new( \%po_options );
+
+ # Warning, $self->{TT}{doc_in} is an array of array:
+ # The document is split on lines, and for each array in array
+ # [0] is the line content, [1] is the reference $filename:$linenum
+ $self->{TT}{doc_in} = ();
+ $self->{TT}{doc_out} = ();
+ if ( defined $options{'verbose'} ) {
+ $self->{TT}{verbose} = $options{'verbose'};
+ }
+ if ( defined $options{'debug'} ) {
+ $self->{TT}{debug} = $options{'debug'};
+ }
+ if ( defined $options{'wrapcol'} ) {
+ if ( $options{'wrapcol'} < 0) {
+ $self->{TT}{wrapcol} = 'Inf';
+ } else {
+ $self->{TT}{wrapcol} = $options{'wrapcol'};
+ }
+ } else {
+ $self->{TT}{wrapcol} = 76;
+ }
+
+ return $self;
+}
+
+=back
+
+=head2 Manipulating document files
+
+=over 4
+
+=item read($$)
+
+Add another input document data at the end of the existing array
+C<< @{$self->{TT}{doc_in}} >>. The argument is the filename to read. If a second
+argument is provided, it is the filename to use in the references.
+
+This array C<< @{$self->{TT}{doc_in}} >> holds this input document data as an
+array of strings with alternating meanings.
+ * The string C<$textline> holding each line of the input text data.
+ * The string C<< $filename:$linenum >> holding its location and called as
+ "reference" (C<linenum> starts with 1).
+
+Please note that it does not parse anything. You should use the parse()
+function when you're done with packing input files into the document.
+
+=cut
+
+sub read() {
+ my $self = shift;
+ my $filename = shift or confess "Cannot write to a file without filename";
+ my $refname = shift or confess "Cannot write to a file without refname";
+ my $charset = shift || 'UTF-8';
+ my $linenum = 0;
+
+ use warnings FATAL => 'utf8';
+ use Encode qw(:fallbacks);
+ use PerlIO::encoding;
+ $PerlIO::encoding::fallback = FB_CROAK;
+
+ my $fh;
+ open( $fh, "<:encoding($charset)", $filename )
+ or croak wrap_msg( dgettext( "po4a", "Cannot read from %s: %s" ), $filename, $! );
+
+ # If we see a BOM while not in UTF-8, we want to croak. But this code is in an eval to deal with
+ # encoding issues. So save the BOM error until after the eval block
+ my $BOM_detected = 0;
+
+ eval {
+ while ( defined( my $textline = <$fh> ) ) {
+ $linenum++;
+ if ( $linenum == 1 && $textline =~ m/^\N{BOM}/ ) { # UTF-8 BOM detected
+ $BOM_detected = 1 if ( uc($charset) ne 'UTF-8' ); # Save the error message for after the eval{} bloc
+ $textline =~ s/^\N{BOM}//;
+ }
+ my $ref = "$refname:$linenum";
+ $textline =~ s/\r$//;
+ my @entry = ( $textline, $ref );
+ push @{ $self->{TT}{doc_in} }, @entry;
+ }
+ };
+ my $error = $@;
+ if ( length($error) ) {
+ chomp $error;
+ die wrap_msg( dgettext( "po4a", "Malformed encoding while reading from file %s with charset %s: %s" ),
+ $filename, $charset, $error );
+ }
+
+ # Croak if we need to
+ if ($BOM_detected) {
+ croak wrap_msg(
+ dgettext(
+ "po4a",
+ "The file %s starts with a BOM char indicating that its encoding is UTF-8, but you specified %s instead."
+ ),
+ $filename,
+ $charset
+ );
+ }
+
+ close $fh
+ or croak wrap_msg( dgettext( "po4a", "Cannot close %s after reading: %s" ), $filename, $! );
+}
+
+=item write($)
+
+Write the translated document to the given filename.
+
+This translated document data are provided by:
+ * C<< $self->docheader() >> holding the header text for the plugin, and
+ * C<< @{$self->{TT}{doc_out}} >> holding each line of the main translated text in the array.
+
+=cut
+
+sub write {
+ my $self = shift;
+ my $filename = shift or confess "Cannot write to a file without filename";
+ my $charset = shift || 'UTF-8';
+
+ use warnings FATAL => 'utf8';
+
+ my $fh;
+ if ( $filename eq '-' ) {
+ $fh = \*STDOUT;
+ } else {
+
+ # make sure the directory in which we should write the localized file exists
+ my $dir = $filename;
+ if ( $dir =~ m|/| ) {
+ $dir =~ s|/[^/]*$||;
+
+ File::Path::mkpath( $dir, 0, 0755 ) # Croaks on error
+ if ( length($dir) && !-e $dir );
+ }
+ open( $fh, ">:encoding($charset)", $filename )
+ or croak wrap_msg( dgettext( "po4a", "Cannot write to %s: %s" ), $filename, $! );
+ }
+
+ map { print $fh $_ } $self->docheader();
+ eval {
+ map { print $fh $_ } @{ $self->{TT}{doc_out} };
+
+ # we use the "eval {} or do {}" approach to deal with exceptions, cf https://perlmaven.com/fatal-errors-in-external-modules
+ # but we want it to fail only if there is an error. It seems to be some cases where "map" returns false even if there is no error.
+ # Thus this final 1 to evaluate to true in absence of error.
+ 1;
+ } or do {
+ my $error = $@ || 'Unknown failure';
+ chomp $error;
+ if ( $charset ne 'UTF-8' && $error =~ /^"\\x\{([^"}]*)\}"/ ) {
+
+ # Attempt to write the char that cannot be written. Very fragile code
+ binmode STDERR, ':encoding(UTF-8)';
+ my $char = chr( hex($1) );
+ die wrap_msg(
+ dgettext( "po4a", "Malformed encoding while writing char '%s' to file %s with charset %s: %s" ),
+ $char, $filename, $charset, $error );
+ } else {
+ die wrap_msg( dgettext( "po4a", "Malformed encoding while writing to file %s with charset %s: %s" ),
+ $filename, $charset, $error );
+ }
+ };
+
+ if ( $filename ne '-' ) {
+ close $fh or croak wrap_msg( dgettext( "po4a", "Cannot close %s after writing: %s" ), $filename, $! );
+ }
+
+}
+
+=back
+
+=head2 Manipulating PO files
+
+=over 4
+
+=item readpo($)
+
+Add the content of a file (which name is passed as argument) to the
+existing input PO. The old content is not discarded.
+
+=item writepo($)
+
+Write the extracted PO file to the given filename.
+
+=item stats()
+
+Returns some statistics about the translation done so far. Please note that
+it's not the same statistics than the one printed by msgfmt
+--statistic. Here, it's stats about recent usage of the PO file, while
+msgfmt reports the status of the file. It is a wrapper to the
+Locale::Po4a::Po::stats_get function applied to the input PO file. Example
+of use:
+
+ [normal use of the po4a document...]
+
+ ($percent,$hit,$queries) = $document->stats();
+ print "We found translations for $percent\% ($hit from $queries) of strings.\n";
+
+=back
+
+=cut
+
+sub getpoout {
+ return $_[0]->{TT}{po_out};
+}
+
+sub setpoout {
+ $_[0]->{TT}{po_out} = $_[1];
+}
+
+sub readpo {
+ $_[0]->{TT}{po_in}->read( $_[1] );
+}
+
+sub writepo {
+ $_[0]->{TT}{po_out}->write( $_[1] );
+}
+
+sub stats {
+ return $_[0]->{TT}{po_in}->stats_get();
+}
+
+=head2 Manipulating addenda
+
+=over 4
+
+=item addendum($)
+
+Please refer to L<po4a(7)|po4a.7> for more information on what addenda are,
+and how translators should write them. To apply an addendum to the translated
+document, simply pass its filename to this function and you are done ;)
+
+This function returns a non-null integer on error.
+
+=cut
+
+# Internal function to read the header.
+sub addendum_parse {
+ my $filename = shift;
+ my $charset = shift || 'UTF-8';
+ my $header;
+
+ my ( $errcode, $mode, $position, $boundary, $bmode, $content ) = ( 1, "", "", "", "", "" );
+
+ unless ( open( INS, "<:encoding($charset)", $filename ) ) {
+ warn wrap_msg( dgettext( "po4a", "Cannot read from %s: %s" ), $filename, $! );
+ goto END_PARSE_ADDFILE;
+ }
+
+ $PerlIO::encoding::fallback = FB_CROAK;
+ eval {
+ unless ( defined( $header = <INS> ) && $header ) {
+ warn wrap_msg( dgettext( "po4a", "Cannot read po4a header from %s." ), $filename );
+ goto END_PARSE_ADDFILE;
+ }
+ } or do {
+ my $error = $@ || 'Unknown failure';
+ chomp $error;
+ die wrap_msg( dgettext( "po4a", "Malformed encoding while reading from file %s with charset %s: %s" ),
+ $filename, $charset, $error );
+ };
+
+ unless ( $header =~ s/PO4A-HEADER://i ) {
+ warn wrap_msg( dgettext( "po4a", "First line of %s does not look like a po4a header." ), $filename );
+ goto END_PARSE_ADDFILE;
+ }
+ foreach my $part ( split( /;/, $header ) ) {
+ unless ( $part =~ m/^\s*([^=]*)=(.*)$/ ) {
+ warn wrap_msg( dgettext( "po4a", "Syntax error in po4a header of %s, near \"%s\"" ), $filename, $part );
+ goto END_PARSE_ADDFILE;
+ }
+ my ( $key, $value ) = ( $1, $2 );
+ $key = lc($key);
+ if ( $key eq 'mode' ) {
+ $mode = lc($value);
+ } elsif ( $key eq 'position' ) {
+ $position = $value;
+ } elsif ( $key eq 'endboundary' ) {
+ $boundary = $value;
+ $bmode = 'after';
+ } elsif ( $key eq 'beginboundary' ) {
+ $boundary = $value;
+ $bmode = 'before';
+ } else {
+ warn wrap_msg( dgettext( "po4a", "Invalid argument in the po4a header of %s: %s" ), $filename, $key );
+ goto END_PARSE_ADDFILE;
+ }
+ }
+
+ unless ( length($mode) ) {
+ warn wrap_msg( dgettext( "po4a", "The po4a header of %s does not define the mode." ), $filename );
+ goto END_PARSE_ADDFILE;
+ }
+ unless ( $mode eq "before" || $mode eq "after" || $mode eq "eof" ) {
+ warn wrap_msg(
+ dgettext(
+ "po4a",
+ "Mode invalid in the po4a header of %s: should be 'before', 'after' or 'eof'. Instead, it is '%s'."
+ ),
+ $filename,
+ $mode
+ );
+ goto END_PARSE_ADDFILE;
+ }
+
+ unless ( length($position) || $mode eq "eof" ) {
+ warn wrap_msg( dgettext( "po4a", "The po4a header of %s does not define the position." ), $filename );
+ goto END_PARSE_ADDFILE;
+ }
+ if ( $mode eq "after" && length($boundary) == 0 ) {
+ warn wrap_msg( dgettext( "po4a", "No ending boundary given in the po4a header, but mode=after." ) );
+ goto END_PARSE_ADDFILE;
+ }
+ if ( $mode eq "eof" && length($position) ) {
+ warn wrap_msg( dgettext( "po4a", "No position needed when mode=eof." ) );
+ goto END_PARSE_ADDFILE;
+ }
+ if ( $mode eq "eof" && length($boundary) ) {
+ warn wrap_msg( dgettext( "po4a", "No ending boundary needed when mode=eof." ) );
+ goto END_PARSE_ADDFILE;
+ }
+
+ eval {
+ while ( defined( my $line = <INS> ) ) {
+ $content .= $line;
+ }
+ };
+ my $error = $@;
+ if ( length($error) ) {
+ chomp $error;
+ die wrap_msg( dgettext( "po4a", "Malformed encoding while reading from file %s with charset %s: %s" ),
+ $filename, $charset, $error );
+ }
+ close INS;
+
+ $errcode = 0;
+ END_PARSE_ADDFILE:
+ return ( $errcode, $mode, $position, $boundary, $bmode, $content );
+}
+
+sub mychomp {
+ my ($str) = shift;
+ chomp($str);
+ return $str;
+}
+
+sub addendum {
+ my ( $self, $filename ) = @_;
+
+ print STDERR wrap_mod( "po4a::transtractor::addendum", "Apply addendum %s", $filename )
+ if $self->debug();
+ unless ($filename) {
+ warn wrap_msg( dgettext( "po4a", "Cannot apply addendum when not given the filename" ) );
+ return 0;
+ }
+ die wrap_msg( dgettext( "po4a", "Addendum %s does not exist." ), $filename )
+ unless -e $filename;
+
+ my ( $errcode, $mode, $position, $boundary, $bmode, $content ) =
+ addendum_parse( $filename, $self->{TT}{'addendum_charset'} );
+ return 0 if ($errcode);
+
+ # In order to make addendum more intuitive, each array item of
+ # @{$self->{TT}{doc_out}} must not have internal "\n". But previous parser
+ # code may put multiple internal "\n" to address things like placeholder
+ # tag handling. Let's normalize array content.
+ # Use internal "\n" as delimiter but keep it by using the lookbehind trick.
+ @{ $self->{TT}{doc_out} } = map { split /(?<=\n)/, $_ } @{ $self->{TT}{doc_out} };
+
+ # Bugs around addendum is hard to understand. So let's print involved data explicitly.
+ if ( $self->debug() ) {
+ print STDERR "Addendum position regex=$position\n";
+ print STDERR "Addendum mode=$mode\n";
+ if ( $mode eq "after" ) {
+ print STDERR "Addendum boundary regex=$boundary\n";
+ print STDERR "Addendum boundary mode=$bmode\n";
+ }
+ print STDERR "Addendum content (begin):\n";
+ print STDERR "$content";
+ print STDERR "Addendum content (end)\n";
+ print STDERR "Output items searched for the addendum insertion position:\n";
+ foreach my $item ( @{ $self->{TT}{doc_out} } ) {
+ print STDERR $item;
+ print STDERR "\n----- [ search item end marker with a preceding newline ] -----\n";
+ }
+ print STDERR "Start searching addendum insertion position...\n";
+ }
+
+ unless ( $mode eq 'eof' ) {
+ my $found = scalar grep { /$position/ } @{ $self->{TT}{doc_out} };
+ if ( $found == 0 ) {
+ warn wrap_msg( dgettext( "po4a", "No candidate position for the addendum %s." ), $filename );
+ return 0;
+ }
+ if ( $found > 1 ) {
+ warn wrap_msg( dgettext( "po4a", "More than one candidate position found for the addendum %s." ),
+ $filename );
+ return 0;
+ }
+ }
+
+ if ( $mode eq "eof" ) {
+ push @{ $self->{TT}{doc_out} }, $content;
+ } elsif ( $mode eq "before" ) {
+ if ( $self->verbose() > 1 || $self->debug() ) {
+ map {
+ print STDERR wrap_msg( dgettext( "po4a", "Addendum '%s' applied before this line: %s" ), $filename, $_ )
+ if (/$position/);
+ } @{ $self->{TT}{doc_out} };
+ }
+ @{ $self->{TT}{doc_out} } = map { /$position/ ? ( $content, $_ ) : $_ } @{ $self->{TT}{doc_out} };
+ } else {
+ my @newres = ();
+
+ do {
+ # make sure it doesn't whine on empty document
+ my $line = scalar @{ $self->{TT}{doc_out} } ? shift @{ $self->{TT}{doc_out} } : "";
+ push @newres, $line;
+ my $outline = mychomp($line);
+ $outline =~ s/^[ \t]*//;
+
+ if ( $line =~ m/$position/ ) {
+ while ( $line = shift @{ $self->{TT}{doc_out} } ) {
+ last if ( $line =~ /$boundary/ );
+ push @newres, $line;
+ }
+ if ( defined $line ) {
+ if ( $bmode eq 'before' ) {
+ print wrap_msg( dgettext( "po4a", "Addendum '%s' applied before this line: %s" ),
+ $filename, $outline )
+ if ( $self->verbose() > 1 || $self->debug() );
+ push @newres, $content;
+ push @newres, $line;
+ } else {
+ print wrap_msg( dgettext( "po4a", "Addendum '%s' applied after the line: %s." ),
+ $filename, $outline )
+ if ( $self->verbose() > 1 || $self->debug() );
+ push @newres, $line;
+ push @newres, $content;
+ }
+ } else {
+ print wrap_msg( dgettext( "po4a", "Addendum '%s' applied at the end of the file." ), $filename )
+ if ( $self->verbose() > 1 || $self->debug() );
+ push @newres, $content;
+ }
+ }
+ } while ( scalar @{ $self->{TT}{doc_out} } );
+ @{ $self->{TT}{doc_out} } = @newres;
+ }
+ print STDERR wrap_mod( "po4a::transtractor::addendum", "Done with addendum %s", $filename )
+ if $self->debug();
+ return 1;
+}
+
+=back
+
+=head1 INTERNAL FUNCTIONS used to write derivative parsers
+
+=head2 Getting input, providing output
+
+Four functions are provided to get input and return output. They are very
+similar to shift/unshift and push/pop of Perl.
+
+ * Perl shift returns the first array item and drop it from the array.
+ * Perl unshift prepends an item to the array as the first array item.
+ * Perl pop returns the last array item and drop it from the array.
+ * Perl push appends an item to the array as the last array item.
+
+The first pair is about input, while the second is about output. Mnemonic: in
+input, you are interested in the first line, what shift gives, and in output
+you want to add your result at the end, like push does.
+
+=over 4
+
+=item shiftline()
+
+This function returns the first line to be parsed and its corresponding
+reference (packed as an array) from the array C<< @{$self->{TT}{doc_in}} >> and
+drop these first 2 array items. Here, the reference is provided by a string
+C<< $filename:$linenum >>.
+
+=item unshiftline($$)
+
+Unshifts the last shifted line of the input document and its corresponding
+reference back to the head of C<< {$self->{TT}{doc_in}} >>.
+
+=item pushline($)
+
+Push a new line to the end of C<< {$self->{TT}{doc_out}} >>.
+
+=item popline()
+
+Pop the last pushed line from the end of C<< {$self->{TT}{doc_out}} >>.
+
+=back
+
+=cut
+
+sub shiftline {
+ my ( $line, $ref ) = ( shift @{ $_[0]->{TT}{doc_in} }, shift @{ $_[0]->{TT}{doc_in} } );
+ return ( $line, $ref );
+}
+
+sub unshiftline {
+ my $self = shift;
+ unshift @{ $self->{TT}{doc_in} }, @_;
+}
+
+sub pushline { push @{ $_[0]->{TT}{doc_out} }, $_[1] if defined $_[1]; }
+sub popline { return pop @{ $_[0]->{TT}{doc_out} }; }
+
+=head2 Marking strings as translatable
+
+One function is provided to handle the text which should be translated.
+
+=over 4
+
+=item translate($$$)
+
+Mandatory arguments:
+
+=over 2
+
+=item -
+
+A string to translate
+
+=item -
+
+The reference of this string (i.e. position in inputfile)
+
+=item -
+
+The type of this string (i.e. the textual description of its structural role;
+used in Locale::Po4a::Po::gettextization(); see also L<po4a(7)|po4a.7>,
+section B<Gettextization: how does it work?>)
+
+=back
+
+This function can also take some extra arguments. They must be organized as
+a hash. For example:
+
+ $self->translate("string","ref","type",
+ 'wrap' => 1);
+
+=over
+
+=item B<wrap>
+
+boolean indicating whether we can consider that whitespaces in string are
+not important. If yes, the function canonizes the string before looking for
+a translation or extracting it, and wraps the translation.
+
+=item B<wrapcol>
+
+the column at which we should wrap (default: the value of B<wrapcol> specified
+during creation of the TransTractor or 76).
+
+The negative value will be substracted from the default.
+
+=item B<comment>
+
+an extra comment to add to the entry.
+
+=back
+
+Actions:
+
+=over 2
+
+=item -
+
+Pushes the string, reference and type to po_out.
+
+=item -
+
+Returns the translation of the string (as found in po_in) so that the
+parser can build the doc_out.
+
+=item -
+
+Handles the charsets to recode the strings before sending them to
+po_out and before returning the translations.
+
+=back
+
+=back
+
+=cut
+
+sub translate {
+ my $self = shift;
+ my ( $string, $ref, $type ) = ( shift, shift, shift );
+ my (%options) = @_;
+
+ return "" unless length($string);
+
+ # my %validoption;
+ # map { $validoption{$_}=1 } (qw(wrap wrapcoll));
+ # foreach (keys %options) {
+ # Carp::confess "internal error: translate() called with unknown arg $_. Valid options: $validoption"
+ # unless $validoption{$_};
+ # }
+
+ if ( !defined $options{'wrapcol'} ) {
+ $options{'wrapcol'} = $self->{TT}{wrapcol}
+ } elsif ( $options{'wrapcol'} < 0 ) {
+ $options{'wrapcol'} = $self->{TT}{wrapcol} + $options{'wrapcol'};
+ }
+ my $transstring = $self->{TT}{po_in}->gettext(
+ $string,
+ 'wrap' => $options{'wrap'} || 0,
+ 'wrapcol' => $options{'wrapcol'}
+ );
+
+ # the comments provided by the modules are automatic comments from the PO point of view
+ $self->{TT}{po_out}->push(
+ 'msgid' => $string,
+ 'reference' => $ref,
+ 'type' => $type,
+ 'automatic' => $options{'comment'},
+ 'flags' => $options{'flags'},
+ 'wrap' => $options{'wrap'} || 0,
+ );
+
+ if ( $options{'wrap'} || 0 ) {
+ $transstring =~ s/( *)$//s;
+ my $trailing_spaces = $1 || "";
+ $transstring =~ s/(?<!\\) +$//gm;
+ $transstring .= $trailing_spaces;
+ }
+
+ return $transstring;
+}
+
+=head2 Misc functions
+
+=over 4
+
+=item verbose()
+
+Returns if the verbose option was passed during the creation of the
+TransTractor.
+
+=cut
+
+sub verbose {
+ if ( defined $_[1] ) {
+ $_[0]->{TT}{verbose} = $_[1];
+ } else {
+ return $_[0]->{TT}{verbose} || 0; # undef and 0 have the same meaning, but one generates warnings
+ }
+}
+
+=item debug()
+
+Returns if the debug option was passed during the creation of the
+TransTractor.
+
+=cut
+
+sub debug {
+ return $_[0]->{TT}{debug};
+}
+
+=item get_in_charset()
+
+This function return the charset that was provided as master charset
+
+=cut
+
+sub get_in_charset() {
+ return $_[0]->{TT}{'file_in_charset'};
+}
+
+=item get_out_charset()
+
+This function will return the charset that should be used in the output
+document (usually useful to substitute the input document's detected charset
+where it has been found).
+
+It will use the output charset specified in the command line. If it wasn't
+specified, it will use the input PO's charset, and if the input PO has the
+default "CHARSET", it will return the input document's charset, so that no
+encoding is performed.
+
+=cut
+
+sub get_out_charset {
+ my $self = shift;
+
+ # Prefer the value specified on the command line
+ return $self->{TT}{'file_out_charset'}
+ if length( $self->{TT}{'file_out_charset'} );
+
+ return $self->{TT}{po_in}->get_charset if $self->{TT}{po_in}->get_charset ne 'CHARSET';
+
+ return $self->{TT}{'file_in_charset'} if length( $self->{TT}{'file_in_charset'} );
+
+ return 'UTF-8';
+}
+
+# Push the translation of a Yaml document or Yaml Front-Matter header, parsed by YAML::Tiny in any case
+# $is_yfm is a boolean indicating whether we are dealing with a Front Matter (true value) or whole document (false value)
+sub handle_yaml {
+ my ( $self, $is_yfm, $blockref, $yamlarray, $yfm_keys, $yfm_skip_array, $yfm_paths ) = @_;
+
+ die "Empty YAML " . ( $is_yfm ? "Front Matter" : "document" ) unless ( length($yamlarray) > 0 );
+
+ my ( $indent, $ctx ) = ( 0, "" );
+ foreach my $cursor (@$yamlarray) {
+
+ # An empty document
+ if ( !defined $cursor ) {
+ $self->pushline("---\n");
+
+ # Do nothing
+
+ # A scalar document
+ } elsif ( !ref $cursor ) {
+ $self->pushline("---\n");
+ $self->pushline(
+ format_scalar(
+ $self->translate(
+ $cursor, $blockref,
+ "YAML " . ( $is_yfm ? "Front Matter " : "" ) . "(scalar)",
+ "wrap" => 0
+ )
+ )
+ );
+
+ # A list at the root
+ } elsif ( ref $cursor eq 'ARRAY' ) {
+ if (@$cursor) {
+ $self->pushline("---\n");
+ do_array( $self, $is_yfm, $blockref, $cursor, $indent, $ctx, $yfm_keys, $yfm_skip_array, $yfm_paths );
+ } else {
+ $self->pushline("---[]\n");
+ }
+
+ # A hash at the root
+ } elsif ( ref $cursor eq 'HASH' ) {
+ if (%$cursor) {
+ $self->pushline("---\n");
+ do_hash( $self, $is_yfm, $blockref, $cursor, $indent, $ctx, $yfm_keys, $yfm_skip_array, $yfm_paths );
+ } else {
+ $self->pushline("--- {}\n");
+ }
+
+ } else {
+ die( "Cannot serialize " . ref($cursor) );
+ }
+ }
+
+ # Escape the string to make it valid in YAML.
+ # This is very similar to YAML::Tiny::_dump_scalar but does not do the internal->UTF-8 decoding,
+ # as the translations that we feed into this function are already in UTF-8
+ sub format_scalar {
+ my $string = $_[0];
+ my $is_key = $_[1];
+
+ return '~' unless defined $string;
+ return "''" unless length $string;
+ if ( Scalar::Util::looks_like_number($string) ) {
+
+ # keys and values that have been used as strings get quoted
+ if ($is_key) {
+ return qq['$string'];
+ } else {
+ return $string;
+ }
+ }
+ if ( $string =~ /[\\\'\n]/ ) {
+ $string =~ s/\\/\\\\/g;
+ $string =~ s/"/\\"/g;
+ $string =~ s/\n/\\n/g;
+ return qq|"$string"|;
+ }
+ if ( $string =~ /(?:^[~!@#%&*|>?:,'"`{}\[\]]|^-+$|\s|:\z)/ ) {
+ return "'$string'";
+ }
+ return $string;
+ }
+
+ sub do_array {
+ my ( $self, $is_yfm, $blockref, $array, $indent, $ctx, $yfm_keys, $yfm_skip_array, $yfm_paths ) = @_;
+ foreach my $el (@$array) {
+ my $header = ( ' ' x $indent ) . '- ';
+ my $type = ref $el;
+ if ( !$type ) {
+ if ($yfm_skip_array) {
+ $self->pushline( $header . YAML::Tiny::_dump_scalar( "dummy", $el, 0 ) . "\n" );
+ } else {
+ $self->pushline(
+ $header
+ . format_scalar(
+ $self->translate(
+ $el, $blockref,
+ ( $is_yfm ? "Yaml Front Matter " : "" ) . "Array Element:$ctx", "wrap" => 0
+ )
+ )
+ . "\n"
+ );
+ }
+
+ } elsif ( $type eq 'ARRAY' ) {
+ if (@$el) {
+ $self->pushline( $header . "\n" );
+ do_array( $self, $is_yfm, $blockref, $el, $indent + 1,
+ $ctx, $yfm_keys, $yfm_skip_array, $yfm_paths );
+ } else {
+ $self->pushline( $header . " []\n" );
+ }
+
+ } elsif ( $type eq 'HASH' ) {
+ if ( keys %$el ) {
+ $self->pushline( $header . "\n" );
+ do_hash( $self, $is_yfm, $blockref, $el, $indent + 1, $ctx, $yfm_keys, $yfm_skip_array,
+ $yfm_paths );
+ } else {
+ $self->pushline( $header . " {}\n" );
+ }
+
+ } else {
+ die "YAML $type references not supported";
+ }
+ }
+ }
+
+ sub do_hash {
+ my ( $self, $is_yfm, $blockref, $hash, $indent, $ctx, $yfm_keys, $yfm_skip_array, $yfm_paths ) = @_;
+
+ foreach my $name ( sort keys %$hash ) {
+ my $el = $hash->{$name} // "";
+ my $header = ( ' ' x $indent ) . YAML::Tiny::_dump_scalar( "dummy", $name, 1 ) . ":";
+
+ unless ( length($el) > 0 ) { # empty element, as in "tags: " with nothing after the column
+ $self->pushline( $header . "\n" );
+ next;
+ }
+
+ my $type = ref $el;
+ if ( !$type ) {
+ my %keys = %{$yfm_keys};
+ my %paths = %{$yfm_paths};
+ my $path = "$ctx $name" =~ s/^\s+|\s+$//gr; # Need to trim the path, at least when there is no ctx yet
+
+ if ( ( $el eq 'false' ) or ( $el eq 'true' ) ) { # Do not translate nor quote booleans
+ $self->pushline("$header $el\n");
+ } elsif (
+ ( scalar %keys > 0 && exists $keys{$name} ) or # the key we need is provided
+ ( scalar %paths > 0 && exists $paths{$path} ) or # that path is provided
+ ( scalar %keys == 0 && scalar %paths == 0 ) # no key and no path provided
+ )
+ {
+ my $translation = $self->translate(
+ $el, $blockref,
+ ( $is_yfm ? "Yaml Front Matter " : "" ) . "Hash Value:$ctx $name",
+ "wrap" => 0
+ );
+
+ # add extra quotes to the parameter, as a protection to the extra chars that the translator could add
+ $self->pushline( $header . ' ' . format_scalar($translation) . "\n" );
+ } else {
+
+ # Work around a bug in YAML::Tiny that quotes numbers
+ # See https://github.com/Perl-Toolchain-Gang/YAML-Tiny#additional-perl-specific-notes
+ if ( Scalar::Util::looks_like_number($el) ) {
+ $self->pushline("$header $el\n");
+ } else {
+ $self->pushline( $header . ' ' . YAML::Tiny::_dump_scalar( "dummy", $el ) . "\n" );
+ }
+ }
+
+ } elsif ( $type eq 'ARRAY' ) {
+ if (@$el) {
+ $self->pushline( $header . "\n" );
+ do_array(
+ $self, $is_yfm, $blockref, $el, $indent + 1, "$ctx $name",
+ $yfm_keys, $yfm_skip_array, $yfm_paths
+ );
+ } else {
+ $self->pushline( $header . " []\n" );
+ }
+
+ } elsif ( $type eq 'HASH' ) {
+ if ( keys %$el ) {
+ $self->pushline( $header . "\n" );
+ do_hash(
+ $self, $is_yfm, $blockref, $el, $indent + 1, "$ctx $name",
+ $yfm_keys, $yfm_skip_array, $yfm_paths
+ );
+ } else {
+ $self->pushline( $header . " {}\n" );
+ }
+
+ } else {
+ die "YAML $type references not supported";
+ }
+ }
+ }
+}
+
+=back
+
+=head1 FUTURE DIRECTIONS
+
+One shortcoming of the current TransTractor is that it can't handle
+translated document containing all languages, like debconf templates, or
+.desktop files.
+
+To address this problem, the only interface changes needed are:
+
+=over 2
+
+=item -
+
+take a hash as po_in_name (a list per language)
+
+=item -
+
+add an argument to translate to indicate the target language
+
+=item -
+
+make a pushline_all function, which would make pushline of its content for
+all languages, using a map-like syntax:
+
+ $self->pushline_all({ "Description[".$langcode."]=".
+ $self->translate($line,$ref,$langcode)
+ });
+
+=back
+
+Will see if it's enough ;)
+
+=head1 AUTHORS
+
+ Denis Barbier <barbier@linuxfr.org>
+ Martin Quinson (mquinson#debian.org)
+ Jordi Vilalta <jvprat@gmail.com>
+
+=cut
+
+1;