diff options
Diffstat (limited to 'lib/Locale/Po4a/Text.pm')
-rw-r--r-- | lib/Locale/Po4a/Text.pm | 1072 |
1 files changed, 1072 insertions, 0 deletions
diff --git a/lib/Locale/Po4a/Text.pm b/lib/Locale/Po4a/Text.pm new file mode 100644 index 0000000..b6633c2 --- /dev/null +++ b/lib/Locale/Po4a/Text.pm @@ -0,0 +1,1072 @@ +#!/usr/bin/perl -w + +# Po4a::Text.pm +# +# extract and translate translatable strings from a text documents +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +# +######################################################################## + +=encoding UTF-8 + +=head1 NAME + +Locale::Po4a::Text - convert text documents from/to PO files + +=head1 DESCRIPTION + +The po4a (PO for anything) project goal is to ease translations (and more +interestingly, the maintenance of translations) using gettext tools on +areas where they were not expected like documentation. + +Locale::Po4a::Text is a module to help the translation of text documents into +other [human] languages. + +Paragraphs are split on empty lines (or lines containing only spaces or +tabulations). + +If a paragraph contains a line starting by a space (or tabulation), this +paragraph won't be rewrapped. + +=cut + +package Locale::Po4a::Text; + +use 5.16.0; +use strict; +use warnings; + +require Exporter; +use vars qw(@ISA @EXPORT); +@ISA = qw(Locale::Po4a::TransTractor); +@EXPORT = qw(); + +use Locale::Po4a::TransTractor; +use Locale::Po4a::Common; +use YAML::Tiny; +use Syntax::Keyword::Try; + +=head1 OPTIONS ACCEPTED BY THIS MODULE + +These are this module's particular options: + +=over + +=item B<keyvalue> + +Treat paragraphs that look like a colon-separated key-value pair as verbatim +(with the C<no-wrap> flag in the PO file). A key-value pair string is a string +like C<key: value>, containing one or more non-colon and non-space characters +followed by a colon followed by at least one non-space character before the +end of the line. + +=cut + +my $keyvalue = 0; + +=item B<nobullets> + +Deactivate the detection of bullets. + +By default, when a bullet is detected, the bullet paragraph is not considered +as a verbatim paragraph (with the C<no-wrap> flag in the PO file). Instead, the +corresponding paragraph is rewrapped in the translation. + +=cut + +my $bullets = 1; + +=item B<tabs=>I<mode> + +Specify how tabulations shall be handled. The I<mode> can be any of: + +=over + +=item B<split> + +Lines with tabulations introduce breaks in the current paragraph. + +=item B<verbatim> + +Paragraph containing tabulations will not be re-wrapped. + +=back + +By default, tabulations are considered as spaces. + +=cut + +my $tabs = ""; + +=item B<breaks=>I<regex> + +A regular expression matching lines which introduce breaks. +The regular expression will be anchored so that the whole line must match. + +=cut + +my $breaks; + +=item B<debianchangelog> + +Handle the header and footer of +released versions, which only contain non translatable information. + +=cut + +my $debianchangelog = 0; + +=item B<fortunes> + +Handle the fortunes format, which separate fortunes with a line which +consists in '%' or '%%', and use '%%' as the beginning of a comment. + +=cut + +my $fortunes = 0; + +=item B<markdown> + +Handle some special markup in Markdown-formatted texts. + +=cut + +my $markdown = 0; + +=item B<yfm_keys> (markdown-only) + +Comma-separated list of keys to process for translation in the YAML Front Matter +section. All other keys are skipped. Keys are matched with a case-sensitive +match. If B<yfm_paths> and B<yfm_keys> are used together, values are included if +they are matched by at least one of the options. Array values are always translated, +unless the B<yfm_skip_array> option is provided. + +=cut + +my %yfm_keys = (); + +=item B<yfm_lenient> (markdown only) + +Allow the YAML Front Matter parser to fail on malformated headers. This is +particularly helpful when your file starts with a horizontal ruler instead +of a YAML Front Matter, but you insist on using three dashes only for your +ruler. + +=cut + +my $yfm_lenient = 0; + +=item B<yfm_paths> (markdown only) + +=item B<yfm_paths> + +Comma-separated list of hash paths to process for extraction in the YAML +Front Matter section, all other paths are skipped. Paths are matched with a +case-sensitive match. If B<yfm_paths> and B<yfm_keys> are used together, +values are included if they are matched by at least one of the options. +Arrays values are always returned unless the B<yfm_skip_array> option is +provided. + +=cut + +my %yfm_paths = (); + +=item B<yfm_skip_array> (markdown-only) + +Do not translate array values in the YAML Front Matter section. + +=cut + +my $yfm_skip_array = 0; + +=item B<control>[B<=>I<field_list>] + +Handle Debian's control files. +A comma-separated list of fields to be translated can be provided. + +=cut + +my %control = (); + +=item B<neverwrap> + +Prevent po4a from wrapping any lines. This means that every content is handled verbatim, even simple paragraphs. + +=cut + +my $defaultwrap = 1; + +my $parse_func = \&parse_fallback; + +my @comments = (); + +=back + +=cut + +sub initialize { + my $self = shift; + my %options = @_; + + $self->{options}{'control'} = ""; + $self->{options}{'breaks'} = 1; + $self->{options}{'debianchangelog'} = 1; + $self->{options}{'debug'} = 1; + $self->{options}{'fortunes'} = 1; + $self->{options}{'markdown'} = 1; + $self->{options}{'yfm_keys'} = ''; + $self->{options}{'yfm_lenient'} = 0; + $self->{options}{'yfm_paths'} = ''; + $self->{options}{'yfm_skip_array'} = 0; + $self->{options}{'nobullets'} = 0; + $self->{options}{'keyvalue'} = 1; + $self->{options}{'tabs'} = 1; + $self->{options}{'verbose'} = 1; + $self->{options}{'neverwrap'} = 1; + + foreach my $opt ( keys %options ) { + die wrap_mod( "po4a::text", dgettext( "po4a", "Unknown option: %s" ), $opt ) + unless exists $self->{options}{$opt}; + $self->{options}{$opt} = $options{$opt}; + } + + $keyvalue = 1 if ( defined $options{'keyvalue'} ); + $bullets = 0 if ( defined $options{'nobullets'} ); + $tabs = $options{'tabs'} if ( defined $options{'tabs'} ); + $breaks = $options{'breaks'} if ( defined $options{'breaks'} ); + $defaultwrap = 0 if ( defined $options{'neverwrap'} ); + + $parse_func = \&parse_debianchangelog if ( defined $options{'debianchangelog'} ); + $parse_func = \&parse_fortunes if ( defined $options{'fortunes'} ); + + if ( defined $options{'markdown'} ) { + $parse_func = \&parse_markdown; + $markdown = 1; + map { + $_ =~ s/^\s+|\s+$//g; # Trim the keys before using them + $yfm_keys{$_} = 1 + } ( split( ',', $self->{options}{'yfm_keys'} ) ); + map { + $_ =~ s/^\s+|\s+$//g; # Trim the keys before using them + $yfm_paths{$_} = 1 + } ( split( ',', $self->{options}{'yfm_paths'} ) ); + + # map { print STDERR "key $_\n"; } (keys %yfm_keys); + $yfm_skip_array = $self->{options}{'yfm_skip_array'}; + $yfm_lenient = $self->{options}{'yfm_lenient'}; + } else { + foreach my $opt (qw(yfm_keys yfm_lenient yfm_skip_array)) { + die wrap_mod( "po4a::text", dgettext( "po4a", "Option %s is only valid when parsing markdown files." ), + $opt ) + if exists $options{$opt}; + } + } + + if ( defined $options{'control'} ) { + $parse_func = \&parse_control; + if ( $options{'control'} eq "1" ) { + $control{''} = 1; + } else { + foreach my $tag ( split( ',', $options{'control'} ) ) { + $control{$tag} = 1; + } + } + } +} + +sub parse_fallback { + my ( $self, $line, $ref, $paragraph, $wrapped_mode, $expect_header, $end_of_paragraph ) = @_; + if ( + ( $line =~ /^\s*$/ ) + or ( defined $breaks + and $line =~ m/^$breaks$/ ) + ) + { + # Break paragraphs on lines containing only spaces + do_paragraph( $self, $paragraph, $wrapped_mode ); + $paragraph = ""; + $wrapped_mode = $defaultwrap unless defined( $self->{verbatim} ); + $self->pushline( $line . "\n" ); + undef $self->{controlkey}; + } elsif ( $line =~ /^-- $/ ) { + + # Break paragraphs on email signature hint + do_paragraph( $self, $paragraph, $wrapped_mode ); + $paragraph = ""; + $wrapped_mode = $defaultwrap; + $self->pushline( $line . "\n" ); + } elsif ( $line =~ /^=+$/ + or $line =~ /^_+$/ + or $line =~ /^-+$/ ) + { + $wrapped_mode = 0; + $paragraph .= $line . "\n"; + do_paragraph( $self, $paragraph, $wrapped_mode ); + $paragraph = ""; + $wrapped_mode = $defaultwrap; + } elsif ( $tabs eq "split" and $line =~ m/\t/ and $paragraph !~ m/\t/s ) { + $wrapped_mode = 0; + do_paragraph( $self, $paragraph, $wrapped_mode ); + $paragraph = "$line\n"; + $wrapped_mode = 0; + } elsif ( $tabs eq "split" and $line !~ m/\t/ and $paragraph =~ m/\t/s ) { + do_paragraph( $self, $paragraph, $wrapped_mode ); + $paragraph = "$line\n"; + $wrapped_mode = $defaultwrap; + } else { + if ( $line =~ /^\s/ ) { + + # A line starting by a space indicates a non-wrap + # paragraph + $wrapped_mode = 0; + } + if ( + $markdown + and ( + $line =~ /\S $/ # explicit newline + or $line =~ /"""$/ + ) + ) + { # """ textblock inside macro begin + # Markdown markup needing separation _after_ this line + $end_of_paragraph = 1; + } else { + undef $self->{bullet}; + undef $self->{indent}; + } + + # TODO: comments + $paragraph .= $line . "\n"; + } + return ( $paragraph, $wrapped_mode, $expect_header, $end_of_paragraph ); +} + +sub parse_debianchangelog { + my ( $self, $line, $ref, $paragraph, $wrapped_mode, $expect_header, $end_of_paragraph ) = @_; + if ( + $expect_header + and $line =~ /^(\w[-+0-9a-z.]*)\ \(([^\(\) \t]+)\) # src, version + \s+([-+0-9a-z.]+); # distribution + \s*urgency\s*\=\s*(.*\S)\s*$/ix + ) + { # + do_paragraph( $self, $paragraph, $wrapped_mode ); + $paragraph = ""; + $self->pushline("$line\n"); + $expect_header = 0; + } elsif ( $line =~ + m/^ \-\- (.*) <(.*)> ((\w+\,\s*)?\d{1,2}\s+\w+\s+\d{4}\s+\d{1,2}:\d\d:\d\d\s+[-+]\d{4}(\s+\([^\\\(\)]+\)))$/ ) + { + # Found trailer + do_paragraph( $self, $paragraph, $wrapped_mode ); + $paragraph = ""; + $self->pushline("$line\n"); + $expect_header = 1; + } else { + return parse_fallback( $self, $line, $ref, $paragraph, $wrapped_mode, $expect_header, $end_of_paragraph ); + } + return ( $paragraph, $wrapped_mode, $expect_header, $end_of_paragraph ); +} + +sub parse_fortunes { + my ( $self, $line, $ref, $paragraph, $wrapped_mode, $expect_header, $end_of_paragraph ) = @_; + + # Always include paragraphs in no-wrap mode, + # because the formatting of the fortunes + # is usually hand-crafted and matters. + $wrapped_mode = 0; + + # Check if there are more lines in the file. + my $last_line_of_file = 0; + my ( $nextline, $nextref ) = $self->shiftline(); + if ( defined $nextline ) { + + # There is a next line, put it back. + $self->unshiftline( $nextline, $nextref ); + } else { + + # Nope, no more lines available. + $last_line_of_file = 1; + } + + # Is the line the end of a fortune or the last line of the file? + if ( $line =~ m/^%%?\s*$/ or $last_line_of_file ) { + + # Add the last line to the paragraph + if ($last_line_of_file) { + $paragraph .= $line; + } + + # Remove the last newline for the translation. + chomp($paragraph); + do_paragraph( $self, $paragraph, $wrapped_mode ); + $paragraph = ""; + + # Add the last newline again for the output. + $self->pushline("\n"); + + # Also add the separator line, if this is not the end of the file. + if ( !$last_line_of_file ) { + $self->pushline("$line\n"); + } + } else { + $paragraph .= $line . "\n"; + } + return ( $paragraph, $wrapped_mode, $expect_header, $end_of_paragraph ); +} + +sub parse_control { + my ( $self, $line, $ref, $paragraph, $wrapped_mode, $expect_header, $end_of_paragraph ) = @_; + if ( $line =~ m/^([^ :]*): *(.*)$/ ) { + warn wrap_mod( "po4a::text", dgettext( "po4a", "Unrecognized section: %s" ), $paragraph ) + unless $paragraph eq ""; + my $tag = $1; + my $val = $2; + my $t; + if ( $control{''} or $control{$tag} ) { + $t = $self->translate( + $val, $self->{ref}, + $tag . ( defined $self->{controlkey} ? ", " . $self->{controlkey} : "" ), + "wrap" => 0 + ); + } else { + $t = $val; + } + if ( not defined $self->{controlkey} ) { + $self->{controlkey} = "$tag: $val"; + } + $self->pushline("$tag: $t\n"); + $paragraph = ""; + $wrapped_mode = $defaultwrap; + $self->{bullet} = ""; + $self->{indent} = " "; + } elsif ( $line eq " ." ) { + do_paragraph( $self, $paragraph, $wrapped_mode, + "Long Description" . ( defined $self->{controlkey} ? ", " . $self->{controlkey} : "" ) ); + $paragraph = ""; + $self->pushline( $line . "\n" ); + $self->{bullet} = ""; + $self->{indent} = " "; + } elsif ( $line =~ m/^ Link: +(.*)$/ ) { + do_paragraph( $self, $paragraph, $wrapped_mode, + "Long Description" . ( defined $self->{controlkey} ? ", " . $self->{controlkey} : "" ) ); + my $link = $1; + my $t1 = $self->translate( "Link: ", $self->{ref}, "Link", "wrap" => 0 ); + my $t2 = $self->translate( + $link, $self->{ref}, + "Link" . ( defined $self->{controlkey} ? ", " . $self->{controlkey} : "" ), + "wrap" => 0 + ); + $self->pushline(" $t1$t2\n"); + $paragraph = ""; + } elsif ( defined $self->{indent} + and $line =~ m/^$self->{indent}\S/ ) + { + $paragraph .= $line . "\n"; + $self->{type} = "Long Description" . ( defined $self->{controlkey} ? ", " . $self->{controlkey} : "" ); + } else { + return parse_fallback( $self, $line, $ref, $paragraph, $wrapped_mode, $expect_header, $end_of_paragraph ); + } + return ( $paragraph, $wrapped_mode, $expect_header, $end_of_paragraph ); +} + +# Support pandoc's format of specifying bibliographic information. +# +# If the first line starts with a percent sign, the following +# is considered to be title, author, and date. +# +# If the information spans multiple lines, the following +# lines must be indented with space. +# If information is omitted, it's just a percent sign +# and a blank line. +# +# Examples with missing title resp. missing authors: +# +# % +# % Author +# +# % My title +# % +# % June 14, 2018 +sub parse_markdown_bibliographic_information { + my ( $self, $line, $ref ) = @_; + my ( $nextline, $nextref ); + + # The first match is always the title or an empty string (no title). + if ( $line =~ /^%(.*)$/ ) { + my $title = $1; + + # Remove leading and trailing whitespace + $title =~ s/^\s+|\s+$//g; + + # If there's some text, look for continuation lines + if ( length($title) ) { + ( $nextline, $nextref ) = $self->shiftline(); + while ( $nextline =~ /^\s+(.+)$/ ) { + $nextline = $1; + $nextline =~ s/^\s+|\s+$//g; + $title .= " " . $nextline; + ( $nextline, $nextref ) = $self->shiftline(); + } + + # Now the title should be complete, give it to translation. + my $t = $self->translate( $title, $ref, "Pandoc title block", "wrap" => $defaultwrap ); + $t = Locale::Po4a::Po::wrap($t); + my $first_line = 1; + foreach my $translated_line ( split /\n/, $t ) { + if ($first_line) { + $first_line = 0; + $self->pushline( "% " . $translated_line . "\n" ); + } else { + $self->pushline( " " . $translated_line . "\n" ); + } + } + } else { + + # Title has been empty, fetch the next line + # if that are the authors. + $self->pushline("%\n"); + ( $nextline, $nextref ) = $self->shiftline(); + } + + # The next line can contain the author or an empty string. + if ( $nextline =~ /^%(.*)$/ ) { + my $author_ref = $nextref; + my $authors = $1; + + # If there's some text, look for continuation lines + if ( length($authors) ) { + ( $nextline, $nextref ) = $self->shiftline(); + while ( $nextline =~ /^\s+(.+)$/ ) { + $nextline = $1; + $authors .= ";" . $nextline; + ( $nextline, $nextref ) = $self->shiftline(); + } + + # Now the authors should be complete, split them by semicolon + my $first_line = 1; + foreach my $author ( split /;/, $authors ) { + $author =~ s/^\s+|\s+$//g; + + # Skip empty authors + next unless length($author); + my $t = $self->translate( $author, $author_ref, "Pandoc title block" ); + if ($first_line) { + $first_line = 0; + $self->pushline( "% " . $t . "\n" ); + } else { + $self->pushline( " " . $t . "\n" ); + } + } + } else { + + # Authors has been empty, fetch the next line + # if that is the date. + $self->pushline("%\n"); + ( $nextline, $nextref ) = $self->shiftline(); + } + + # The next line can contain the date. + if ( $nextline =~ /^%(.*)$/ ) { + my $date = $1; + + # Remove leading and trailing whitespace + $date =~ s/^\s+|\s+$//g; + my $t = $self->translate( $date, $nextref, "Pandoc title block" ); + $self->pushline( "% " . $t . "\n" ); + + # Now we're done with the bibliographic information + return; + } + } + + # The line did not start with a percent sign, to stop + # parsing bibliographic information and return the + # line to the normal parsing. + $self->unshiftline( $nextline, $nextref ); + return; + } +} + +# Support YAML Front Matter in Markdown documents +# +# If the text starts with a YAML ---\n separator, the full text until +# the next YAML ---\n separator is considered YAML metadata. The ...\n +# "end of document" separator can be used at the end of the YAML +# block. +# +sub parse_markdown_yaml_front_matter { + my ( $self, $line, $blockref ) = @_; + my $yfm; + my @saved_ctn; + my ( $nextline, $nextref ) = $self->shiftline(); + push @saved_ctn, ( $nextline, $nextref ); + while ( defined($nextline) ) { + last if ( $nextline =~ /^(---|\.\.\.)$/ ); + $yfm .= $nextline; + ( $nextline, $nextref ) = $self->shiftline(); + if ( $nextline =~ /: [\[\{]/ ) { + die wrap_mod( + "po4a::text", + dgettext( + "po4a", + "Inline lists and dictionaries on a single line are not correctly handled the parser we use (YAML::Tiny): they are interpreted as regular strings. " + . "Please use multi-lines definitions instead. Offending line:\n %s" + ), + $nextline + ); + + } + push @saved_ctn, ( $nextline, $nextref ); + } + + my $yamlarray; # the parsed YFM content + my $yamlres; # containing the parse error, if any + try { + $yamlarray = YAML::Tiny->read_string($yfm); + } catch { + $yamlres = $@; + } + + if ( defined($yamlres) ) { + if ($yfm_lenient) { + $yamlres =~ s/ at .*$//; # Remove the error localisation in YAML::Tiny die message, if any (for our test) + warn wrap_mod( + "po4a::text", + dgettext( + "po4a", + "Proceeding even if the YAML Front Matter could not be parsed. Remove the 'yfm_lenient' option for a stricter behavior.\nIgnored error: %s" + ), + $yamlres + ); + my $len = ( scalar @saved_ctn ) - 1; + while ( $len >= 0 ) { + $self->unshiftline( $saved_ctn[ $len - 1 ], $saved_ctn[$len] ); + + # print STDERR "Unshift ".$saved_ctn[ $len - 1] ." | ". $saved_ctn[$len] ."\n"; + $len -= 2; + } + return 0; # Not a valid YAML + } else { + die wrap_mod( + "po4a::text", + dgettext( + "po4a", + "Could not get the YAML Front Matter from the file. If you did not intend to add a YAML front matter " + . "but an horizontal ruler, please use '----' instead, or pass the 'yfm_lenient' option.\nError: %s\nContent of the YFM: %s" + ), + $yamlres, $yfm + ); + } + } + + $self->handle_yaml( 1, $blockref, $yamlarray, \%yfm_keys, $yfm_skip_array, \%yfm_paths ); + $self->pushline("---\n"); + return 1; # Valid YAML +} + +sub parse_markdown { + my ( $self, $line, $ref, $paragraph, $wrapped_mode, $expect_header, $end_of_paragraph ) = @_; + if ($expect_header) { + + # It is only possible to find and parse the bibliographic + # information or the YAML Front Matter from the first line. + # Anyway, stop expecting header information for the next run. + $expect_header = 0; + if ( $line =~ /^%(.*)$/ ) { + parse_markdown_bibliographic_information( $self, $line, $ref ); + return ( $paragraph, $wrapped_mode, $expect_header, $end_of_paragraph ); + } elsif ( $line =~ /^---$/ ) { + if ( parse_markdown_yaml_front_matter( $self, $line, $ref ) ) { # successfully parsed + return ( $paragraph, $wrapped_mode, $expect_header, $end_of_paragraph ); + } + + # If it wasn't a YFM paragraph after all, stop expecting a header and keep going + } + } + if ( ( $line =~ m/^(={4,}|-{4,})$/ ) + and ( defined($paragraph) ) + and ( $paragraph =~ m/^[^\n]*\n$/s ) + and ( length($paragraph) == ( length($line) + 1 ) ) ) + { + # XXX: There can be any number of underlining according + # to the documentation. This detection, which avoid + # translating the formatting, is only supported if + # the underlining has the same size as the header text. + # Found title + $wrapped_mode = 0; + my $level = $line; + $level =~ s/^(.).*$/$1/; + + # Remove the trailing newline from the title + chomp($paragraph); + my $t = $self->translate( + $paragraph, $self->{ref}, "Title $level", + "wrap" => 0, + "flags" => "markdown-text" + ); + + # Add the newline again for the output + $self->pushline( $t . "\n" ); + $paragraph = ""; + $wrapped_mode = $defaultwrap; + $self->pushline( ( $level x length($t) ) . "\n" ); + } elsif ( $line =~ m/^(#{1,6})( +)(.*?)( +\1)?$/ ) { + my $titlelevel1 = $1; + my $titlespaces = $2; + my $title = $3; + my $titlelevel2 = $4 || ""; + + # Found one line title + do_paragraph( $self, $paragraph, $wrapped_mode ); + $wrapped_mode = 0; + $paragraph = ""; + my $t = $self->translate( + $title, $self->{ref}, "Title $titlelevel1", + "wrap" => 0, + "flags" => "markdown-text" + ); + $self->pushline( $titlelevel1 . $titlespaces . $t . $titlelevel2 . "\n" ); + $wrapped_mode = $defaultwrap; + } elsif ( $line =~ /^[ ]{0,3}([*_-])\s*(?:\1\s*){2,}$/ ) { + + # Horizontal rule + do_paragraph( $self, $paragraph, $wrapped_mode ); + $self->pushline( $line . "\n" ); + $paragraph = ""; + $end_of_paragraph = 1; + } elsif ( $line =~ /^([ ]{0,3})(\[[^\]]+\]:[ \t]?.+)$/ ) { + my $indentation = $1; + my $linkreference = $2; + + # Link reference + # TODO: support multiline link reference definition + # TODO: treat link title properly + # https://spec.commonmark.org/0.30/#link-reference-definitions + do_paragraph( $self, $paragraph, $wrapped_mode ); + $wrapped_mode = 0; + $paragraph = ""; + my $t = $self->translate( + $linkreference, $self->{ref}, "Link reference", + "wrap" => 0, + "flags" => "link-reference" + ); + $self->pushline( $indentation . $t . "\n" ); + $wrapped_mode = $defaultwrap; + } elsif ( $line =~ /^([ ]{0,3})(([~`])\3{2,})(\s*)([^`]*)\s*$/ ) { + my $fence_space_before = $1; + my $fence = $2; + my $fencechar = $3; + my $fence_space_between = $4; + my $info_string = $5; + + # fenced code block + my $type = "Fenced code block" . ( $info_string ? " ($info_string)" : "" ); + do_paragraph( $self, $paragraph, $wrapped_mode ); + $wrapped_mode = 0; + $paragraph = ""; + $self->pushline("$line\n"); + do_paragraph( $self, $paragraph, $wrapped_mode ); + $paragraph = ""; + my ( $nextline, $nextref ) = $self->shiftline(); + + while ( $nextline !~ /^\s{0,3}$fence$fencechar*\s*$/ ) { + $paragraph .= "$nextline"; + ( $nextline, $nextref ) = $self->shiftline(); + } + do_paragraph( $self, $paragraph, $wrapped_mode, $type ); + $self->pushline($nextline); + $paragraph = ""; + $end_of_paragraph = 1; + } elsif ( $line =~ /^([ ]{0,3})(([:])\3{2,})(\s*)([^`]*)\s*$/ ) { + my $fence_space_before = $1; + my $fence = $2; + my $fencechar = $3; + my $fence_space_between = $4; + my @info_string = ($5); + + # print STDERR "----------------\n"; + # print STDERR "line: $line\n"; + # print STDERR "fence: '$fence'; fencechar: '$fencechar'; info: '$info_string'\n"; + + # fenced div block (fenced with ::: where code blocks are fenced with ` or ~) + # https://pandoc.org/MANUAL.html#divs-and-spans + my $info = join( "|", map { chomp $_; $_ } @info_string ); + my $type = "Fenced div block" . ( $info ? " ($info)" : "" ); + do_paragraph( $self, $paragraph, $wrapped_mode ); + $wrapped_mode = 0; + $paragraph = ""; + $self->pushline("$line\n"); + do_paragraph( $self, $paragraph, $wrapped_mode ); + $paragraph = ""; + + my $lvl = 1; + while ( $lvl > 0 ) { + my ( $nextline, $nextref ) = $self->shiftline(); + die wrap_mod( + "po4a::text", + dgettext( + "po4a", "Malformed fenced div block: Block starting at %s not closed before the end of the file." + ), + $ref + ) unless ( defined($nextline) ); + + # print STDERR "within $lvl: $nextline"; + if ( $nextline =~ /^\s*:::+\s*$/ ) { + my $info = join( "|", map { chomp $_; $_ } @info_string ); + $type = "Fenced div block" . ( $info ? " ($info)" : "" ); + if ( $paragraph ne "" ) { + do_paragraph( $self, $paragraph, $wrapped_mode, $type ); + $paragraph = ""; + } + $self->pushline($nextline); + $lvl--; + while ( scalar @info_string > $lvl ) { + pop @info_string; + } + } elsif ( $nextline =~ /^([ ]{0,3})(([:])\3{2,})(\s*)([^`]*)\s*$/ ) { + if ( $paragraph ne "" ) { + do_paragraph( $self, $paragraph, $wrapped_mode, $type ); + $paragraph = ""; + } + $self->pushline($nextline); + push @info_string, $5; + $lvl++; + } else { + $paragraph .= $nextline; + } + } + $paragraph = ""; + $end_of_paragraph = 1; + + # print STDERR "Out now ------------\n"; + } elsif ( + $line =~ /^\s*\[\[\!\S+\s*$/ # macro begin + or $line =~ /^\s*"""\s*\]\]\s*$/ + ) + { # """ textblock inside macro end + # Avoid translating Markdown lines containing only markup + do_paragraph( $self, $paragraph, $wrapped_mode ); + $paragraph = ""; + $wrapped_mode = $defaultwrap; + $self->pushline("$line\n"); + } elsif ( $line =~ /^\s*\[\[\!\S[^\]]*\]\]\s*$/ ) { # sole macro + # Preserve some Markdown markup as a single line + do_paragraph( $self, $paragraph, $wrapped_mode ); + $paragraph = "$line\n"; + $wrapped_mode = 0; + $end_of_paragraph = 1; + } elsif ( $line =~ /^"""/ ) { # """ textblock inside macro end + # Markdown markup needing separation _before_ this line + do_paragraph( $self, $paragraph, $wrapped_mode ); + $paragraph = "$line\n"; + $wrapped_mode = $defaultwrap; + } else { + return parse_fallback( $self, $line, $ref, $paragraph, $wrapped_mode, $expect_header, $end_of_paragraph ); + } + return ( $paragraph, $wrapped_mode, $expect_header, $end_of_paragraph ); +} + +sub parse { + my $self = shift; + my ( $line, $ref ); + my $paragraph = ""; + my $wrapped_mode = $defaultwrap; + my $expect_header = 1; + my $end_of_paragraph = 0; + ( $line, $ref ) = $self->shiftline(); + my $file = $ref; + $file =~ s/:[0-9]+$// if defined($line); + + while ( defined($line) ) { + $ref =~ m/^(.*):[0-9]+$/; + if ( $1 ne $file ) { + $file = $1; + do_paragraph( $self, $paragraph, $wrapped_mode ); + $paragraph = ""; + $wrapped_mode = $defaultwrap; + $expect_header = 1; + } + + chomp($line); + $self->{ref} = "$ref"; + ( $paragraph, $wrapped_mode, $expect_header, $end_of_paragraph ) = + &$parse_func( $self, $line, $ref, $paragraph, $wrapped_mode, $expect_header, $end_of_paragraph ); + + # paragraphs starting by a bullet, or numbered + # or paragraphs with a line containing more than 3 consecutive spaces + # are considered as verbatim paragraphs + $wrapped_mode = 0 if ( $paragraph =~ m/^(\*|[0-9]+[.)] )/s + or $paragraph =~ m/[ \t][ \t][ \t]/s ); + + # Paragraphs starting with a table formating (GH extension) are also considered verbatim + $wrapped_mode = 0 if ( $paragraph =~ m/^\|/ ); + + $wrapped_mode = 0 if ( $tabs eq "verbatim" + and $paragraph =~ m/\t/s ); + + # Also consider keyvalue paragraphs verbatim, if requested + $wrapped_mode = 0 if ( $keyvalue == 1 + and $paragraph =~ m/^[^ :]+:.*[^\s].*$/s ); + if ($markdown) { + + # Some Markdown markup can (or might) not survive wrapping + $wrapped_mode = 0 + if ( + $paragraph =~ /^>/ms # blockquote + or $paragraph =~ /^( {8}|\t)/ms # monospaced + or $paragraph =~ /^\$(\S+[{}]\S*\s*)+/ms # Xapian macro + or $paragraph =~ /<(?![a-z]+[:@])/ms # maybe html (tags but not wiki <URI>) + or $paragraph =~ /^[^<]+>/ms # maybe html (tag with vertical space) + or $paragraph =~ /\S $/ms # explicit newline + or $paragraph =~ /\[\[\!\S[^\]]+$/ms # macro begin + ); + } + if ($end_of_paragraph) { + do_paragraph( $self, $paragraph, $wrapped_mode ); + $paragraph = ""; + $wrapped_mode = $defaultwrap; + $end_of_paragraph = 0; + } + ( $line, $ref ) = $self->shiftline(); + } + if ( length $paragraph ) { + do_paragraph( $self, $paragraph, $wrapped_mode ); + } +} + +sub do_paragraph { + my ( $self, $paragraph, $wrap ) = ( shift, shift, shift ); + my $type = shift || $self->{type} || "Plain text"; + my $flags = ""; + if ( $type eq "Plain text" and $markdown ) { + $flags = "markdown-text"; + } + + return if ( $paragraph eq "" ); + + $wrap = 0 unless $defaultwrap; + + # DEBUG + # $type .= " verbatim: '".($self->{verbatim}//"NONE")."' bullet: '$bullets' wrap: '$wrap' indent: '".($self->{indent}//"NONE")."' type: '".($self->{type}//"NONE")."'"; + # print STDERR "$type\n"; + + if ( $bullets and not defined $self->{verbatim} ) { + + # Detect bullets + # | * blah blah + # |<spaces> blah + # | ^-- aligned + # <empty line> + # + # The leading spaces are optional, and other bullets are supported: + # - blah o blah + blah + # 1. blah 1) blah (1) blah + TEST_BULLET: + if ( $paragraph =~ m/^(\s*)((?:[-*o+]|([0-9]+[.\)])|\([0-9]+\))\s+)([^\n]*\n)(.*)$/s ) { + my $para = $5; + my $bullet = $2; + my $indent1 = $1; + my $indent2 = "$1" . ( ' ' x length $bullet ); + my $text = $4; + while ( $para !~ m/^$indent2(?:[-*o+]|([0-9]+[.\)])|\([0-9]+\))\s+/ + and $para =~ s/^$indent2(\S[^\n]*\n)//s ) + { + $text .= $1; + } + + # TODO: detect if a line starts with the same bullet + if ( $text !~ m/\S[ \t][ \t][ \t]+\S/s ) { + my $bullet_regex = quotemeta( $indent1 . $bullet ); + $bullet_regex =~ s/[0-9]+/\\d\+/; + if ( $para eq '' + or $para =~ m/^(\s*)((?:[-*o+]|([0-9]+[.\)])|\([0-9]+\))\s+)([^\n]*\n)(.*)$/s + or $para =~ m/^$bullet_regex\S/s ) + { + my $trans = $self->translate( + $text, + $self->{ref}, + "Bullet: '$indent1$bullet'", + "flags" => "markdown-text", + "wrap" => $defaultwrap, + "wrapcol" => -( length $indent2 ) + ); + $trans =~ s/^/$indent1$bullet/s; + $trans =~ s/\n(.)/\n$indent2$1/sg; + $self->pushline( $trans . "\n" ); + if ( $para eq '' ) { + return; + } else { + + # Another bullet + $paragraph = $para; + goto TEST_BULLET; + } + } + } + } + } + + my $end = ""; + if ($wrap) { + $paragraph =~ s/^(.*?)(\n*)$/$1/s; + $end = $2 || ""; + } + my $t = $self->translate( + $paragraph, + $self->{ref}, + $type, + "comment" => join( "\n", @comments ), + "flags" => $flags, + "wrap" => $wrap + ); + @comments = (); + if ( defined $self->{bullet} ) { + my $bullet = $self->{bullet}; + my $indent1 = $self->{indent}; + my $indent2 = $indent1 . ( ' ' x length($bullet) ); + $t =~ s/^/$indent1$bullet/s; + $t =~ s/\n(.)/\n$indent2$1/sg; + } + $self->pushline( $t . $end ); +} + +1; + +=head1 STATUS OF THIS MODULE + +Tested successfully on simple text files and NEWS.Debian files. + +=head1 AUTHORS + + Nicolas François <nicolas.francois@centraliens.net> + +=head1 COPYRIGHT AND LICENSE + + Copyright © 2005-2008 Nicolas FRANÇOIS <nicolas.francois@centraliens.net>. + + Copyright © 2008-2009, 2018 Jonas Smedegaard <dr@jones.dk>. + Copyright © 2020 Martin Quinson <mquinson#debian.org>. + +This program is free software; you may redistribute it and/or modify it +under the terms of GPL v2.0 or later (see the COPYING file). + +=cut + +__END__ + +# LocalWords: Charset charset po UTF gettext msgid nostrip GPL |