#!/usr/bin/perl -w

# Po4a::Text.pm
#
# extract and translate translatable strings from text documents
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#
########################################################################

=encoding UTF-8

=head1 NAME

Locale::Po4a::Text - convert text documents from/to PO files

=head1 DESCRIPTION

The po4a (PO for anything) project goal is to ease translations (and more
interestingly, the maintenance of translations) using gettext tools on
areas where they were not expected like documentation.

Locale::Po4a::Text is a module to help the translation of text documents into
other [human] languages.

Paragraphs are split on empty lines (or lines containing only spaces or
tabulations).

If a paragraph contains a line starting by a space (or tabulation), this
paragraph won't be rewrapped.

=cut

package Locale::Po4a::Text;

use 5.16.0;
use strict;
use warnings;

require Exporter;
use vars qw(@ISA @EXPORT);
@ISA    = qw(Locale::Po4a::TransTractor);
@EXPORT = qw();

use Locale::Po4a::TransTractor;
use Locale::Po4a::Common;
use YAML::Tiny;
use Syntax::Keyword::Try;

=head1 OPTIONS ACCEPTED BY THIS MODULE

These are this module's particular options:

=over

=item B<keyvalue>

Treat paragraphs that look like a colon-separated key-value pair as verbatim
(with the C<no-wrap> flag in the PO file). A key-value pair string is a string
like C<key: value>, containing one or more non-colon and non-space characters
followed by a colon followed by at least one non-space character before the
end of the line.

=cut

my $keyvalue = 0;

=item B<nobullets>

Deactivate the detection of bullets.

By default, when a bullet is detected, the bullet paragraph is not considered
as a verbatim paragraph (with the C<no-wrap> flag in the PO file). Instead, the
corresponding paragraph is rewrapped in the translation.

=cut

my $bullets = 1;

=item B<tabs=>I<mode>

Specify how tabulations shall be handled. The I<mode> can be any of:

=over

=item B<split>

Lines with tabulations introduce breaks in the current paragraph.

=item B<verbatim>

Paragraph containing tabulations will not be re-wrapped.

=back

By default, tabulations are considered as spaces.

=cut

my $tabs = "";

=item B<breaks=>I<regex>

A regular expression matching lines which introduce breaks.
The regular expression will be anchored so that the whole line must match.

=cut

my $breaks;

=item B<debianchangelog>

Handle the header and footer of
released versions, which only contain non translatable information.

=cut

my $debianchangelog = 0;

=item B<fortunes>

Handle the fortunes format, which separate fortunes with a line which
consists in '%' or '%%', and use '%%' as the beginning of a comment.

=cut

my $fortunes = 0;

=item B<markdown>

Handle some special markup in Markdown-formatted texts.

=cut

my $markdown = 0;

=item B<yfm_keys> (markdown-only)

Comma-separated list of keys to process for translation in the YAML Front Matter
section. All other keys are skipped. Keys are matched with a case-sensitive
match. If B<yfm_paths> and B<yfm_keys> are used together, values are included if they are
matched by at least one of the options. Array values are always translated,
unless the B<yfm_skip_array> option is provided.

=cut

my %yfm_keys = ();

=item B<yfm_lenient> (markdown only)

Allow the YAML Front Matter parser to fail on malformed headers. This is
particularly helpful when your file starts with a horizontal ruler instead of a
YAML Front Matter, but you insist on using three dashes only for your ruler.

=cut

my $yfm_lenient = 0;

=item B<yfm_paths> (markdown only)

Comma-separated list of hash paths to process for extraction in the YAML
Front Matter section, all other paths are skipped. Paths are matched with a
case-sensitive match. If B<yfm_paths> and B<yfm_keys> are used together,
values are included if they are matched by at least one of the options.
Arrays values are always returned unless the B<yfm_skip_array> option is provided.

=cut

my %yfm_paths = ();

=item B<yfm_skip_array> (markdown-only)

Do not translate array values in the YAML Front Matter section.

=cut

my $yfm_skip_array = 0;

=item B<control>[B<=>I<field_list>]

Handle Debian's control files.
A comma-separated list of fields to be translated can be provided.

=cut

my %control = ();

=item B<neverwrap>

Prevent po4a from wrapping any lines. This means that every content is handled verbatim, even simple paragraphs.

=cut

my $defaultwrap = 1;

# Line parser currently in use; initialize() swaps it according to the options.
my $parse_func = \&parse_fallback;

# Comments attached to the next translated paragraph (see do_paragraph).
my @comments = ();

=back

=cut

# Record the options accepted by this module and configure the file-level
# parser state ($parse_func, $markdown, %control, ...) accordingly.
# Dies on an unknown option, or on a markdown-only option given without
# the 'markdown' option.
sub initialize {
    my $self    = shift;
    my %options = @_;

    # The values below only declare which option names are accepted;
    # they are overwritten by whatever the caller provided.
    $self->{options}{'control'}         = "";
    $self->{options}{'breaks'}          = 1;
    $self->{options}{'debianchangelog'} = 1;
    $self->{options}{'debug'}           = 1;
    $self->{options}{'fortunes'}        = 1;
    $self->{options}{'markdown'}        = 1;
    $self->{options}{'yfm_keys'}        = '';
    $self->{options}{'yfm_lenient'}     = 0;
    $self->{options}{'yfm_paths'}       = '';
    $self->{options}{'yfm_skip_array'}  = 0;
    $self->{options}{'nobullets'}       = 0;
    $self->{options}{'keyvalue'}        = 1;
    $self->{options}{'tabs'}            = 1;
    $self->{options}{'verbose'}         = 1;
    $self->{options}{'neverwrap'}       = 1;

    foreach my $opt ( keys %options ) {
        die wrap_mod( "po4a::text", dgettext( "po4a", "Unknown option: %s" ), $opt )
          unless exists $self->{options}{$opt};
        $self->{options}{$opt} = $options{$opt};
    }

    $keyvalue    = 1                  if ( defined $options{'keyvalue'} );
    $bullets     = 0                  if ( defined $options{'nobullets'} );
    $tabs        = $options{'tabs'}   if ( defined $options{'tabs'} );
    $breaks      = $options{'breaks'} if ( defined $options{'breaks'} );
    $defaultwrap = 0                  if ( defined $options{'neverwrap'} );

    $parse_func = \&parse_debianchangelog if ( defined $options{'debianchangelog'} );
    $parse_func = \&parse_fortunes        if ( defined $options{'fortunes'} );

    if ( defined $options{'markdown'} ) {
        $parse_func = \&parse_markdown;
        $markdown   = 1;

        # Trim the keys and paths before using them
        for my $key ( split( ',', $self->{options}{'yfm_keys'} ) ) {
            my $trimmed = $key =~ s/^\s+|\s+$//gr;
            $yfm_keys{$trimmed} = 1;
        }
        for my $path ( split( ',', $self->{options}{'yfm_paths'} ) ) {
            my $trimmed = $path =~ s/^\s+|\s+$//gr;
            $yfm_paths{$trimmed} = 1;
        }

        $yfm_skip_array = $self->{options}{'yfm_skip_array'};
        $yfm_lenient    = $self->{options}{'yfm_lenient'};
    } else {

        # Every yfm_* option is meaningless outside of markdown mode
        foreach my $opt (qw(yfm_keys yfm_lenient yfm_paths yfm_skip_array)) {
            die wrap_mod( "po4a::text", dgettext( "po4a", "Option %s is only valid when parsing markdown files." ),
                $opt )
              if exists $options{$opt};
        }
    }

    if ( defined $options{'control'} ) {
        $parse_func = \&parse_control;
        if ( $options{'control'} eq "1" ) {

            # No field list given: the empty key means "translate every field"
            $control{''} = 1;
        } else {
            foreach my $tag ( split( ',', $options{'control'} ) ) {
                $control{$tag} = 1;
            }
        }
    }
}

# Generic line handler, also used by the specialised parsers for any line
# they do not recognise. Takes the current line and the parsing state, and
# returns the updated ( $paragraph, $wrapped_mode, $expect_header,
# $end_of_paragraph ) state.
sub parse_fallback {
    my ( $self, $line, $ref, $paragraph, $wrapped_mode, $expect_header, $end_of_paragraph ) = @_;

    if (   ( $line =~ /^\s*$/ )
        or ( defined $breaks and $line =~ m/^$breaks$/ ) )
    {

        # Break paragraphs on lines containing only spaces
        do_paragraph( $self, $paragraph, $wrapped_mode );
        $paragraph    = "";
        $wrapped_mode = $defaultwrap unless defined( $self->{verbatim} );
        $self->pushline( $line . "\n" );
        undef $self->{controlkey};
    } elsif ( $line =~ /^-- $/ ) {

        # Break paragraphs on email signature hint
        do_paragraph( $self, $paragraph, $wrapped_mode );
        $paragraph    = "";
        $wrapped_mode = $defaultwrap;
        $self->pushline( $line . "\n" );
    } elsif ( $line =~ /^=+$/
        or $line =~ /^_+$/
        or $line =~ /^-+$/ )
    {

        # Underlining / ruler: it closes the paragraph, which must not be rewrapped
        $wrapped_mode = 0;
        $paragraph .= $line . "\n";
        do_paragraph( $self, $paragraph, $wrapped_mode );
        $paragraph    = "";
        $wrapped_mode = $defaultwrap;
    } elsif ( $tabs eq "split" and $line =~ m/\t/ and $paragraph !~ m/\t/s ) {

        # First line with a tabulation after lines without any
        $wrapped_mode = 0;
        do_paragraph( $self, $paragraph, $wrapped_mode );
        $paragraph    = "$line\n";
        $wrapped_mode = 0;
    } elsif ( $tabs eq "split" and $line !~ m/\t/ and $paragraph =~ m/\t/s ) {

        # First line without tabulation after lines containing some
        do_paragraph( $self, $paragraph, $wrapped_mode );
        $paragraph    = "$line\n";
        $wrapped_mode = $defaultwrap;
    } else {
        if ( $line =~ /^\s/ ) {

            # A line starting by a space indicates a non-wrap
            # paragraph
            $wrapped_mode = 0;
        }
        if (
            $markdown
            and (
                $line =~ /\S[ ]{2,}$/    # explicit newline (two or more trailing spaces)
                or $line =~ /"""$/       # """ textblock inside macro begin
            )
          )
        {

            # Markdown markup needing separation _after_ this line
            $end_of_paragraph = 1;
        } else {
            undef $self->{bullet};
            undef $self->{indent};
        }

        # TODO: comments
        $paragraph .= $line . "\n";
    }
    return ( $paragraph, $wrapped_mode, $expect_header, $end_of_paragraph );
}

# Line handler for Debian changelogs: the header and trailer lines of each
# entry are copied verbatim, everything else goes through parse_fallback.
sub parse_debianchangelog {
    my ( $self, $line, $ref, $paragraph, $wrapped_mode, $expect_header, $end_of_paragraph ) = @_;

    if (
        $expect_header
        and $line =~ /^(\w[-+0-9a-z.]*)\ \(([^\(\) \t]+)\)  # src, version
                       \s+([-+0-9a-z.]+);                 # distribution
                       \s*urgency\s*\=\s*(.*\S)\s*$/ix
      )
    {

        # Flush any pending text before the header so that it is not lost
        do_paragraph( $self, $paragraph, $wrapped_mode );
        $paragraph = "";
        $self->pushline("$line\n");
        $expect_header = 0;
    } elsif ( $line =~
        m/^ \-\- (.*) <(.*)>[ ]+((\w+\,\s*)?\d{1,2}\s+\w+\s+\d{4}\s+\d{1,2}:\d\d:\d\d\s+[-+]\d{4}(\s+\([^\\\(\)]+\))?)$/
      )
    {

        # Found trailer. The date is separated from the maintainer by
        # spaces, and the parenthesised trailing comment is optional.
        do_paragraph( $self, $paragraph, $wrapped_mode );
        $paragraph = "";
        $self->pushline("$line\n");
        $expect_header = 1;
    } else {
        return parse_fallback( $self, $line, $ref, $paragraph, $wrapped_mode, $expect_header, $end_of_paragraph );
    }
    return ( $paragraph, $wrapped_mode, $expect_header, $end_of_paragraph );
}

# Line handler for fortune files: each fortune (delimited by a '%' or '%%'
# line, or by the end of the file) is one non-wrapped paragraph.
sub parse_fortunes {
    my ( $self, $line, $ref, $paragraph, $wrapped_mode, $expect_header, $end_of_paragraph ) = @_;

    # Always include paragraphs in no-wrap mode,
    # because the formatting of the fortunes
    # is usually hand-crafted and matters.
    $wrapped_mode = 0;

    # Check if there are more lines in the file.
    my $last_line_of_file = 0;
    my ( $nextline, $nextref ) = $self->shiftline();

    if ( defined $nextline ) {

        # There is a next line, put it back.
        $self->unshiftline( $nextline, $nextref );
    } else {

        # Nope, no more lines available.
        $last_line_of_file = 1;
    }

    # Is the line the end of a fortune or the last line of the file?
    if ( $line =~ m/^%%?\s*$/ or $last_line_of_file ) {

        # Add the last line to the paragraph
        if ($last_line_of_file) {
            $paragraph .= $line;
        }

        # Remove the last newline for the translation.
        chomp($paragraph);
        do_paragraph( $self, $paragraph, $wrapped_mode );
        $paragraph = "";

        # Add the last newline again for the output.
        $self->pushline("\n");

        # Also add the separator line, if this is not the end of the file.
        if ( !$last_line_of_file ) {
            $self->pushline("$line\n");
        }
    } else {
        $paragraph .= $line . "\n";
    }
    return ( $paragraph, $wrapped_mode, $expect_header, $end_of_paragraph );
}

# Line handler for Debian control files: "Tag: value" fields are translated
# when selected through the 'control' option, and the indented continuation
# lines are gathered as "Long Description" paragraphs.
sub parse_control {
    my ( $self, $line, $ref, $paragraph, $wrapped_mode, $expect_header, $end_of_paragraph ) = @_;

    if ( $line =~ m/^([^ :]*): *(.*)$/ ) {

        # Save the captures before calling anything else
        my $tag = $1;
        my $val = $2;
        warn wrap_mod( "po4a::text", dgettext( "po4a", "Unrecognized section: %s" ), $paragraph )
          unless $paragraph eq "";
        my $t;
        if ( $control{''} or $control{$tag} ) {
            $t = $self->translate(
                $val,
                $self->{ref},
                $tag . ( defined $self->{controlkey} ? ", " . $self->{controlkey} : "" ),
                "wrap" => 0
            );
        } else {
            $t = $val;
        }

        # The first field seen since the last blank line identifies the stanza
        if ( not defined $self->{controlkey} ) {
            $self->{controlkey} = "$tag: $val";
        }
        $self->pushline("$tag: $t\n");
        $paragraph      = "";
        $wrapped_mode   = $defaultwrap;
        $self->{bullet} = "";
        $self->{indent} = " ";
    } elsif ( $line eq " ." ) {

        # Paragraph separator inside a long description
        do_paragraph( $self, $paragraph, $wrapped_mode,
            "Long Description" . ( defined $self->{controlkey} ? ", " . $self->{controlkey} : "" ) );
        $paragraph = "";
        $self->pushline( $line . "\n" );
        $self->{bullet} = "";
        $self->{indent} = " ";
    } elsif ( $line =~ m/^ Link: +(.*)$/ ) {
        my $link = $1;
        do_paragraph( $self, $paragraph, $wrapped_mode,
            "Long Description" . ( defined $self->{controlkey} ? ", " . $self->{controlkey} : "" ) );
        my $t1 = $self->translate( "Link: ", $self->{ref}, "Link", "wrap" => 0 );
        my $t2 = $self->translate(
            $link,
            $self->{ref},
            "Link" . ( defined $self->{controlkey} ? ", " . $self->{controlkey} : "" ),
            "wrap" => 0
        );
        $self->pushline(" $t1$t2\n");
        $paragraph = "";
    } elsif ( defined $self->{indent}
        and $line =~ m/^$self->{indent}\S/ )
    {

        # Continuation line of the current field
        $paragraph .= $line . "\n";
        $self->{type} = "Long Description" . ( defined $self->{controlkey} ? ", " . $self->{controlkey} : "" );
    } else {
        return parse_fallback( $self, $line, $ref, $paragraph, $wrapped_mode, $expect_header, $end_of_paragraph );
    }
    return ( $paragraph, $wrapped_mode, $expect_header, $end_of_paragraph );
}

# Support pandoc's format of specifying bibliographic information.
#
# If the first line starts with a percent sign, the following
# is considered to be title, author, and date.
#
# If the information spans multiple lines, the following
# lines must be indented with space.
# If information is omitted, it's just a percent sign
# and a blank line.
#
# Examples with missing title resp. missing authors:
#
# %
# % Author
#
# % My title
# %
# % June 14, 2018
sub parse_markdown_bibliographic_information {
    my ( $self, $line, $ref ) = @_;
    my ( $nextline, $nextref );

    # The first match is always the title or an empty string (no title).
    if ( $line =~ /^%(.*)$/ ) {
        my $title = $1;

        # Remove leading and trailing whitespace
        $title =~ s/^\s+|\s+$//g;

        # If there's some text, look for continuation lines
        if ( length($title) ) {
            ( $nextline, $nextref ) = $self->shiftline();

            # $nextline is undefined once the end of the file is reached
            while ( defined $nextline and $nextline =~ /^\s+(.+)$/ ) {
                $nextline = $1;
                $nextline =~ s/^\s+|\s+$//g;
                $title .= " " . $nextline;
                ( $nextline, $nextref ) = $self->shiftline();
            }

            # Now the title should be complete, give it to translation.
            my $t = $self->translate( $title, $ref, "Pandoc title block", "wrap" => $defaultwrap );
            $t = Locale::Po4a::Po::wrap($t);
            my $first_line = 1;
            foreach my $translated_line ( split /\n/, $t ) {
                if ($first_line) {
                    $first_line = 0;
                    $self->pushline( "% " . $translated_line . "\n" );
                } else {
                    $self->pushline( " " . $translated_line . "\n" );
                }
            }
        } else {

            # Title has been empty, fetch the next line
            # if that are the authors.
            $self->pushline("%\n");
            ( $nextline, $nextref ) = $self->shiftline();
        }

        # The next line can contain the author or an empty string.
        if ( defined $nextline and $nextline =~ /^%(.*)$/ ) {
            my $author_ref = $nextref;
            my $authors    = $1;

            # If there's some text, look for continuation lines
            if ( length($authors) ) {
                ( $nextline, $nextref ) = $self->shiftline();
                while ( defined $nextline and $nextline =~ /^\s+(.+)$/ ) {
                    $nextline = $1;
                    $authors .= ";" . $nextline;
                    ( $nextline, $nextref ) = $self->shiftline();
                }

                # Now the authors should be complete, split them by semicolon
                my $first_line = 1;
                foreach my $author ( split /;/, $authors ) {
                    $author =~ s/^\s+|\s+$//g;

                    # Skip empty authors
                    next unless length($author);
                    my $t = $self->translate( $author, $author_ref, "Pandoc title block" );
                    if ($first_line) {
                        $first_line = 0;
                        $self->pushline( "% " . $t . "\n" );
                    } else {
                        $self->pushline( " " . $t . "\n" );
                    }
                }
            } else {

                # Authors has been empty, fetch the next line
                # if that is the date.
                $self->pushline("%\n");
                ( $nextline, $nextref ) = $self->shiftline();
            }

            # The next line can contain the date.
            if ( defined $nextline and $nextline =~ /^%(.*)$/ ) {
                my $date = $1;

                # Remove leading and trailing whitespace
                $date =~ s/^\s+|\s+$//g;
                my $t = $self->translate( $date, $nextref, "Pandoc title block" );
                $self->pushline( "% " . $t . "\n" );

                # Now we're done with the bibliographic information
                return;
            }
        }

        # The line did not start with a percent sign, to stop
        # parsing bibliographic information and return the
        # line to the normal parsing. Nothing to give back when
        # the end of the file was reached.
        $self->unshiftline( $nextline, $nextref ) if defined $nextline;
        return;
    }
}

# Support YAML Front Matter in Markdown documents
#
# If the text starts with a YAML ---\n separator, the full text until
# the next YAML ---\n separator is considered YAML metadata. The ...\n
# "end of document" separator can be used at the end of the YAML
# block.
#
# Returns 1 when the front matter was parsed and handled, 0 when it could
# not be parsed and the 'yfm_lenient' option is set (the consumed lines are
# then given back to the input). Dies on a parse error otherwise.
sub parse_markdown_yaml_front_matter {
    my ( $self, $line, $blockref ) = @_;

    my $yfm;
    my @saved_ctn;    # consumed (line, ref) pairs, to restore them in lenient mode
    my ( $nextline, $nextref ) = $self->shiftline();
    push @saved_ctn, ( $nextline, $nextref ) if defined $nextline;
    while ( defined($nextline) ) {
        last if ( $nextline =~ /^(---|\.\.\.)$/ );
        $yfm .= $nextline;
        ( $nextline, $nextref ) = $self->shiftline();

        # End of file before the closing separator: nothing more to inspect
        next unless defined $nextline;
        if ( $nextline =~ /: [\[\{]/ ) {
            die wrap_mod(
                "po4a::text",
                dgettext(
                    "po4a",
                    "Inline lists and dictionaries on a single line are not correctly handled the parser we use (YAML::Tiny): they are interpreted as regular strings. "
                      . "Please use multi-lines definitions instead. Offending line:\n %s"
                ),
                $nextline
            );
        }
        push @saved_ctn, ( $nextline, $nextref );
    }

    my $yamlarray;    # the parsed YFM content
    my $yamlres;      # containing the parse error, if any
    try {
        $yamlarray = YAML::Tiny->read_string($yfm);
    } catch {
        $yamlres = $@;
    }

    if ( defined($yamlres) ) {
        if ($yfm_lenient) {
            $yamlres =~ s/ at .*$//;    # Remove the error localisation in YAML::Tiny die message, if any (for our test)
            warn wrap_mod(
                "po4a::text",
                dgettext(
                    "po4a",
                    "Proceeding even if the YAML Front Matter could not be parsed. Remove the 'yfm_lenient' option for a stricter behavior.\nIgnored error: %s"
                ),
                $yamlres
            );

            # Give the consumed lines back, last pair first
            my $len = ( scalar @saved_ctn ) - 1;
            while ( $len >= 0 ) {
                $self->unshiftline( $saved_ctn[ $len - 1 ], $saved_ctn[$len] );
                $len -= 2;
            }
            return 0;    # Not a valid YAML
        } else {
            die wrap_mod(
                "po4a::text",
                dgettext(
                    "po4a",
                    "Could not get the YAML Front Matter from the file. If you did not intend to add a YAML front matter "
                      . "but an horizontal ruler, please use '----' instead, or pass the 'yfm_lenient' option.\nError: %s\nContent of the YFM: %s"
                ),
                $yamlres,
                $yfm // ""
            );
        }
    }

    $self->handle_yaml( 1, $blockref, $yamlarray, \%yfm_keys, $yfm_skip_array, \%yfm_paths );
    $self->pushline("---\n");
    return 1;    # Valid YAML
}

# Line handler for Markdown: recognises the document header (pandoc title
# block or YAML Front Matter), titles, horizontal rules, link references,
# fenced code and div blocks and some wiki macros; any other line goes
# through parse_fallback.
sub parse_markdown {
    my ( $self, $line, $ref, $paragraph, $wrapped_mode, $expect_header, $end_of_paragraph ) = @_;

    if ($expect_header) {

        # It is only possible to find and parse the bibliographic
        # information or the YAML Front Matter from the first line.
        # Anyway, stop expecting header information for the next run.
        $expect_header = 0;
        if ( $line =~ /^%(.*)$/ ) {
            parse_markdown_bibliographic_information( $self, $line, $ref );
            return ( $paragraph, $wrapped_mode, $expect_header, $end_of_paragraph );
        } elsif ( $line =~ /^---$/ ) {
            if ( parse_markdown_yaml_front_matter( $self, $line, $ref ) ) {    # successfully parsed
                return ( $paragraph, $wrapped_mode, $expect_header, $end_of_paragraph );
            }

            # If it wasn't a YFM paragraph after all, stop expecting a header and keep going
        }
    }

    if (    ( $line =~ m/^(={4,}|-{4,})$/ )
        and ( defined($paragraph) )
        and ( $paragraph =~ m/^[^\n]*\n$/s )
        and ( length($paragraph) == ( length($line) + 1 ) ) )
    {

        # XXX: There can be any number of underlining according
        #      to the documentation. This detection, which avoid
        #      translating the formatting, is only supported if
        #      the underlining has the same size as the header text.
        # Found title
        $wrapped_mode = 0;
        my $level = $line;
        $level =~ s/^(.).*$/$1/;

        # Remove the trailing newline from the title
        chomp($paragraph);
        my $t = $self->translate(
            $paragraph,
            $self->{ref},
            "Title $level",
            "wrap"  => 0,
            "flags" => "markdown-text"
        );

        # Add the newline again for the output
        $self->pushline( $t . "\n" );
        $paragraph    = "";
        $wrapped_mode = $defaultwrap;
        $self->pushline( ( $level x length($t) ) . "\n" );
    } elsif ( $line =~ m/^(#{1,6})( +)(.*?)( +\1)?$/ ) {
        my $titlelevel1 = $1;
        my $titlespaces = $2;
        my $title       = $3;
        my $titlelevel2 = $4 || "";

        # Found one line title
        do_paragraph( $self, $paragraph, $wrapped_mode );
        $wrapped_mode = 0;
        $paragraph    = "";
        my $t = $self->translate(
            $title,
            $self->{ref},
            "Title $titlelevel1",
            "wrap"  => 0,
            "flags" => "markdown-text"
        );
        $self->pushline( $titlelevel1 . $titlespaces . $t . $titlelevel2 . "\n" );
        $wrapped_mode = $defaultwrap;
    } elsif ( $line =~ /^[ ]{0,3}([*_-])\s*(?:\1\s*){2,}$/ ) {

        # Horizontal rule
        do_paragraph( $self, $paragraph, $wrapped_mode );
        $self->pushline( $line . "\n" );
        $paragraph        = "";
        $end_of_paragraph = 1;
    } elsif ( $line =~ /^([ ]{0,3})(\[[^\]]+\]:[ \t]?.+)$/ ) {
        my $indentation   = $1;
        my $linkreference = $2;

        # Link reference
        # TODO: support multiline link reference definition
        # TODO: treat link title properly
        # https://spec.commonmark.org/0.30/#link-reference-definitions
        do_paragraph( $self, $paragraph, $wrapped_mode );
        $wrapped_mode = 0;
        $paragraph    = "";
        my $t = $self->translate(
            $linkreference,
            $self->{ref},
            "Link reference",
            "wrap"  => 0,
            "flags" => "link-reference"
        );
        $self->pushline( $indentation . $t . "\n" );
        $wrapped_mode = $defaultwrap;
    } elsif ( $line =~ /^([ ]{0,3})(([~`])\3{2,})(\s*)([^`]*)\s*$/ ) {
        my $fence       = $2;
        my $fencechar   = $3;
        my $info_string = $5;

        # fenced code block
        my $type = "Fenced code block" . ( $info_string ? " ($info_string)" : "" );
        do_paragraph( $self, $paragraph, $wrapped_mode );
        $wrapped_mode = 0;
        $paragraph    = "";
        $self->pushline("$line\n");

        # Gather the content up to the closing fence. A block left open is
        # closed by the end of the document; without the defined() test,
        # an unterminated block would loop forever on undefined lines.
        my ( $nextline, $nextref ) = $self->shiftline();
        while ( defined $nextline and $nextline !~ /^\s{0,3}$fence$fencechar*\s*$/ ) {
            $paragraph .= "$nextline";
            ( $nextline, $nextref ) = $self->shiftline();
        }
        do_paragraph( $self, $paragraph, $wrapped_mode, $type );
        $self->pushline($nextline) if defined $nextline;
        $paragraph        = "";
        $end_of_paragraph = 1;
    } elsif ( $line =~ /^([ ]{0,3})(([:])\3{2,})(\s*)([^`]*)\s*$/ ) {

        # One info string per nesting level of the currently open divs
        my @info_string = ($5);

        # fenced div block (fenced with ::: where code blocks are fenced with ` or ~)
        # https://pandoc.org/MANUAL.html#divs-and-spans
        my $info = join( "|", map { chomp $_; $_ } @info_string );
        my $type = "Fenced div block" . ( $info ? " ($info)" : "" );
        do_paragraph( $self, $paragraph, $wrapped_mode );
        $wrapped_mode = 0;
        $paragraph    = "";
        $self->pushline("$line\n");

        my $lvl = 1;
        while ( $lvl > 0 ) {
            my ( $nextline, $nextref ) = $self->shiftline();
            die wrap_mod(
                "po4a::text",
                dgettext(
                    "po4a",
                    "Malformed fenced div block: Block starting at %s not closed before the end of the file."
                ),
                $ref
            ) unless ( defined($nextline) );

            if ( $nextline =~ /^\s*:::+\s*$/ ) {

                # Closing fence of the innermost div
                my $info = join( "|", map { chomp $_; $_ } @info_string );
                $type = "Fenced div block" . ( $info ? " ($info)" : "" );
                if ( $paragraph ne "" ) {
                    do_paragraph( $self, $paragraph, $wrapped_mode, $type );
                    $paragraph = "";
                }
                $self->pushline($nextline);
                $lvl--;
                while ( scalar @info_string > $lvl ) {
                    pop @info_string;
                }
            } elsif ( $nextline =~ /^([ ]{0,3})(([:])\3{2,})(\s*)([^`]*)\s*$/ ) {

                # Opening fence of a nested div; save the capture before
                # calling anything else
                my $nested_info = $5;
                if ( $paragraph ne "" ) {
                    do_paragraph( $self, $paragraph, $wrapped_mode, $type );
                    $paragraph = "";
                }
                $self->pushline($nextline);
                push @info_string, $nested_info;
                $lvl++;
            } else {
                $paragraph .= $nextline;
            }
        }
        $paragraph        = "";
        $end_of_paragraph = 1;
    } elsif (
        $line =~ /^\s*\[\[\!\S+\s*$/           # macro begin
        or $line =~ /^\s*"""\s*\]\]\s*$/       # """ textblock inside macro end
      )
    {

        # Avoid translating Markdown lines containing only markup
        do_paragraph( $self, $paragraph, $wrapped_mode );
        $paragraph    = "";
        $wrapped_mode = $defaultwrap;
        $self->pushline("$line\n");
    } elsif ( $line =~ /^\s*\[\[\!\S[^\]]*\]\]\s*$/ ) {    # sole macro

        # Preserve some Markdown markup as a single line
        do_paragraph( $self, $paragraph, $wrapped_mode );
        $paragraph        = "$line\n";
        $wrapped_mode     = 0;
        $end_of_paragraph = 1;
    } elsif ( $line =~ /^"""/ ) {    # """ textblock inside macro end

        # Markdown markup needing separation _before_ this line
        do_paragraph( $self, $paragraph, $wrapped_mode );
        $paragraph    = "$line\n";
        $wrapped_mode = $defaultwrap;
    } else {
        return parse_fallback( $self, $line, $ref, $paragraph, $wrapped_mode, $expect_header, $end_of_paragraph );
    }
    return ( $paragraph, $wrapped_mode, $expect_header, $end_of_paragraph );
}

# Main loop: read the input line by line, hand each line to the selected
# line handler, and decide whether the paragraph being built may be wrapped.
sub parse {
    my $self = shift;
    my ( $line, $ref );
    my $paragraph        = "";
    my $wrapped_mode     = $defaultwrap;
    my $expect_header    = 1;
    my $end_of_paragraph = 0;
    ( $line, $ref ) = $self->shiftline();
    my $file = $ref;
    $file =~ s/:[0-9]+$// if defined($line);

    while ( defined($line) ) {

        # A reference of the form "file:line" naming another file means that
        # a new document starts: flush the paragraph and reset the state.
        if ( $ref =~ m/^(.*):[0-9]+$/ and $1 ne $file ) {
            $file = $1;
            do_paragraph( $self, $paragraph, $wrapped_mode );
            $paragraph     = "";
            $wrapped_mode  = $defaultwrap;
            $expect_header = 1;
        }

        chomp($line);
        $self->{ref} = "$ref";
        ( $paragraph, $wrapped_mode, $expect_header, $end_of_paragraph ) =
          $parse_func->( $self, $line, $ref, $paragraph, $wrapped_mode, $expect_header, $end_of_paragraph );

        # paragraphs starting by a bullet, or numbered
        # or paragraphs with a line containing more than 3 consecutive spaces
        # are considered as verbatim paragraphs
        $wrapped_mode = 0
          if ( $paragraph =~ m/^(\*|[0-9]+[.)] )/s
            or $paragraph =~ m/[ \t][ \t][ \t]/s );

        # Paragraphs starting with a table formatting (GH extension) are also considered verbatim
        $wrapped_mode = 0
          if ( $paragraph =~ m/^\|/ );

        $wrapped_mode = 0
          if ( $tabs eq "verbatim"
            and $paragraph =~ m/\t/s );

        # Also consider keyvalue paragraphs verbatim, if requested
        $wrapped_mode = 0
          if ( $keyvalue == 1
            and $paragraph =~ m/^[^ :]+:.*[^\s].*$/s );

        if ($markdown) {

            # Some Markdown markup can (or might) not survive wrapping
            $wrapped_mode = 0
              if (
                   $paragraph =~ /^>/ms                     # blockquote
                or $paragraph =~ /^( {8}|\t)/ms             # monospaced
                or $paragraph =~ /^\$(\S+[{}]\S*\s*)+/ms    # Xapian macro
                or $paragraph =~ /<(?![a-z]+[:@])/ms        # maybe html (tags but not wiki <URI>)
                or $paragraph =~ /^[^<]+>/ms                # maybe html (tag with vertical space)
                or $paragraph =~ /\S[ ]{2,}$/ms             # explicit newline
                or $paragraph =~ /\[\[\!\S[^\]]+$/ms        # macro begin
              );
        }

        if ($end_of_paragraph) {
            do_paragraph( $self, $paragraph, $wrapped_mode );
            $paragraph        = "";
            $wrapped_mode     = $defaultwrap;
            $end_of_paragraph = 0;
        }
        ( $line, $ref ) = $self->shiftline();
    }

    if ( length $paragraph ) {
        do_paragraph( $self, $paragraph, $wrapped_mode );
    }
}

# Translate one paragraph and push the result to the output.
#   $paragraph - the text to translate (nothing is done when empty)
#   $wrap      - whether the translation may be rewrapped
#   $type      - optional type recorded in the PO file; defaults to
#                $self->{type}, then to "Plain text"
# Bulleted items are detected and translated one by one, without the bullet.
sub do_paragraph {
    my ( $self, $paragraph, $wrap ) = ( shift, shift, shift );
    my $type  = shift || $self->{type} || "Plain text";
    my $flags = "";
    if ( $type eq "Plain text" and $markdown ) {
        $flags = "markdown-text";
    }

    return if ( $paragraph eq "" );

    # The 'neverwrap' option overrides everything
    $wrap = 0 unless $defaultwrap;

    if ( $bullets and not defined $self->{verbatim} ) {

        # Detect bullets
        # |    * blah blah
        # |      blah
        # |      ^-- aligned
        #
        # The leading spaces are optional, and other bullets are supported:
        # - blah         o blah         + blah
        # 1. blah       1) blah       (1) blah
      BULLET:
        while ( $paragraph =~ m/^(\s*)((?:[-*o+]|([0-9]+[.\)])|\([0-9]+\))\s+)([^\n]*\n)(.*)$/s ) {
            my $para    = $5;
            my $bullet  = $2;
            my $indent1 = $1;
            my $indent2 = "$1" . ( ' ' x length $bullet );
            my $text    = $4;

            # Gather the continuation lines aligned with the text of the item
            while ( $para !~ m/^$indent2(?:[-*o+]|([0-9]+[.\)])|\([0-9]+\))\s+/
                and $para =~ s/^$indent2(\S[^\n]*\n)//s )
            {
                $text .= $1;
            }

            # TODO: detect if a line starts with the same bullet
            # Text with runs of spaces looks like a table: not a bullet
            last BULLET if ( $text =~ m/\S[ \t][ \t][ \t]+\S/s );

            my $bullet_regex = quotemeta( $indent1 . $bullet );
            $bullet_regex =~ s/[0-9]+/\\d\+/;

            # The item is handled here only if nothing remains, or if what
            # remains starts with another bullet
            last BULLET
              unless ( $para eq ''
                or $para =~ m/^(\s*)((?:[-*o+]|([0-9]+[.\)])|\([0-9]+\))\s+)([^\n]*\n)(.*)$/s
                or $para =~ m/^$bullet_regex\S/s );

            my $trans = $self->translate(
                $text,
                $self->{ref},
                "Bullet: '$indent1$bullet'",
                "flags"   => "markdown-text",
                "wrap"    => $defaultwrap,
                "wrapcol" => -( length $indent2 )
            );
            $trans =~ s/^/$indent1$bullet/s;
            $trans =~ s/\n(.)/\n$indent2$1/sg;
            $self->pushline( $trans . "\n" );

            return if ( $para eq '' );

            # Another bullet
            $paragraph = $para;
        }
    }

    # When wrapping, the trailing newlines are kept out of the translation
    my $end = "";
    if ($wrap) {
        $paragraph =~ s/^(.*?)(\n*)$/$1/s;
        $end = $2 || "";
    }
    my $t = $self->translate(
        $paragraph,
        $self->{ref},
        $type,
        "comment" => join( "\n", @comments ),
        "flags"   => $flags,
        "wrap"    => $wrap
    );

    @comments = ();

    if ( defined $self->{bullet} ) {
        my $bullet  = $self->{bullet};
        my $indent1 = $self->{indent};
        my $indent2 = $indent1 . ( ' ' x length($bullet) );
        $t =~ s/^/$indent1$bullet/s;
        $t =~ s/\n(.)/\n$indent2$1/sg;
    }
    $self->pushline( $t . $end );
}

1;

=head1 STATUS OF THIS MODULE

Tested successfully on simple text files and NEWS.Debian files.

=head1 AUTHORS

 Nicolas François

=head1 COPYRIGHT AND LICENSE

 Copyright © 2005-2008 Nicolas FRANÇOIS.

 Copyright © 2008-2009, 2018 Jonas Smedegaard.

 Copyright © 2020 Martin Quinson.

This program is free software; you may redistribute it and/or modify it
under the terms of GPL v2.0 or later (see the COPYING file).

=cut

__END__

# LocalWords: Charset charset po UTF gettext msgid nostrip GPL