diff options
Diffstat (limited to 'lib/Lintian/Deb822.pm')
-rw-r--r-- | lib/Lintian/Deb822.pm | 692 |
1 files changed, 692 insertions, 0 deletions
diff --git a/lib/Lintian/Deb822.pm b/lib/Lintian/Deb822.pm new file mode 100644 index 0000000..c153415 --- /dev/null +++ b/lib/Lintian/Deb822.pm @@ -0,0 +1,692 @@ +# Copyright (C) 1998 Christian Schwarz +# Copyright (C) 2020 Felix Lechner +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, you can find it on the World Wide +# Web at https://www.gnu.org/copyleft/gpl.html, or write to the Free +# Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, +# MA 02110-1301, USA. + +package Lintian::Deb822; + +use v5.20; +use warnings; +use utf8; + +use Const::Fast; +use Path::Tiny; +use Syntax::Keyword::Try; +use Unicode::UTF8 qw(encode_utf8); + +use Lintian::Deb822::Constants qw(:constants); +use Lintian::Deb822::Section; + +const my $EMPTY => q{}; +const my $NUMBER_SIGN => q{#}; + +use Moo; +use namespace::clean; + +=encoding utf-8 + +=head1 NAME + +Lintian::Deb822 -- A deb822 control file + +=head1 SYNOPSIS + + use Lintian::Deb822; + +=head1 DESCRIPTION + +Represents a paragraph in a Deb822 control file. + +=head1 INSTANCE METHODS + +=over 4 + +=item sections + +Array of Deb822::Section objects in order of their original appearance. 

=item positions

Line positions

=cut

has sections => (is => 'rw', default => sub { [] });
has positions => (is => 'rw', default => sub { [] });

=item first_mention(NAME)

Returns the first non-empty value of the field NAME, looking at the
sections in their stored order. Returns the empty string when no
section has a non-empty value for NAME.

=cut

sub first_mention {
    my ($self, $name) = @_;

    for my $section (@{$self->sections}) {

        # empty when field not present
        my $value = $section->value($name);

        # test the length, not the truth: a literal "0" is a real value
        # and must not be skipped in favour of a later section
        return $value
          if length($value // $EMPTY);
    }

    return $EMPTY;
}

=item last_mention(NAME)

Returns the value of the field NAME from the last section that declares
it. Returns the empty string when no section declares NAME.

=cut

sub last_mention {
    my ($self, $name) = @_;

    my $latest;

    for my $section (@{$self->sections}) {

        # later declarations override earlier ones
        $latest = $section->value($name)
          if $section->declares($name);
    }

    return ($latest // $EMPTY);
}

=item read_file(PATH[, FLAGS])

Reads the file at PATH as UTF-8 and passes its contents, together with
FLAGS, to C<parse_string>. Returns whatever C<parse_string> returns.

=cut

sub read_file {
    my ($self, $path, $flags) = @_;

    my $contents = path($path)->slurp_utf8;

    return $self->parse_string($contents, $flags);
}

=item parse_string(STRING[, FLAGS])

Parses STRING as deb822 data. One L<Lintian::Deb822::Section> object is
appended to C<sections> for each paragraph found; the sections are not
cleared first. Returns the list of all sections held by the object.

=cut

sub parse_string {
    my ($self, $contents, $flags) = @_;

    my (@paragraphs, @positions);

    try {
        @paragraphs= parse_dpkg_control_string($contents, $flags,\@positions);

    } catch {
        # ignore syntax errors here
        # NOTE(review): none of the messages raised by
        # visit_dpkg_paragraph_string contain the words "syntax error",
        # so parser errors appear to be rethrown as well -- confirm
        # whether that is intended.
        die map { encode_utf8($_) } $@
          unless $@ =~ /syntax error/;
    }

    # paragraphs and their line-number maps are parallel lists
    for my $index (0 .. $#paragraphs) {

        my $section = Lintian::Deb822::Section->new;
        $section->verbatim($paragraphs[$index]);
        $section->positions($positions[$index]);

        push(@{$self->sections}, $section);
    }

    return @{$self->sections};
}

=back

=head1 FUNCTIONS

=head2 Debian control parsers

At first glance, this module appears to contain several debian control
parsers.  In practice, there is only one real parser
(C<visit_dpkg_paragraph_string>) - the rest are convenience functions
around it.

=over 4

=item read_dpkg_control(FILE[, FLAGS[, LINES]])

This is a convenience function to ease using
C<parse_dpkg_control_string> with paths to files (rather than strings).
The first argument
must be the path to a FILE, which should be read as a debian control
file.
If the file is empty, an empty list is returned.

Otherwise, this behaves like:

    open(my $fd, '<:utf8_strict', FILE) or die ...;
    my $contents = do { local $/ = undef; <$fd> };
    close($fd);
    return parse_dpkg_control_string($contents, FLAGS, LINES);

It can therefore fail with any of the messages that
C<parse_dpkg_control_string> produces.  It also dies, naming FILE and
the system error, when FILE cannot be opened.

=cut

sub read_dpkg_control {
    my ($file, $flags, $field_starts) = @_;

    # include $! so the caller learns why the open failed
    open(my $handle, '<:utf8_strict', $file)
      or die encode_utf8("Cannot open $file: $!");

    # slurp the entire file with a single read
    my $string = do { local $/ = undef; <$handle> };
    close $handle;

    # a failed read yields undef; treat it like an empty file
    $string //= $EMPTY;

    # same paragraph collection as for in-memory strings
    return parse_dpkg_control_string($string, $flags, $field_starts);
}

=item read_dpkg_control_lc(FILE[, FLAGS[, LINES]])

Like C<read_dpkg_control>, but the field names in the returned
paragraphs, and in LINES if given, are converted to lower case.

=cut

sub read_dpkg_control_lc {
    my ($file, $flags, $field_starts) = @_;

    my @result = read_dpkg_control($file, $flags, $field_starts);

    # both calls tolerate an undefined argument
    lowercase_field_names(\@result);
    lowercase_field_names($field_starts);

    return @result;
}

=item parse_dpkg_control_string(STRING[, FLAGS[, LINES]])

Reads debian control data from STRING and returns a list of
paragraphs in it.  A paragraph is represented via a hashref, which
maps field names (spelled as in the input) to their values.  Use
C<parse_dpkg_control_string_lc> to get lower-cased field names.

FLAGS (if given) is a bitmask of the I<DCTRL_*> constants imported
from L<Lintian::Deb822::Constants>.
The default value for FLAGS is 0.

If LINES is given, it should be a reference to an empty list.  On
return, LINES will be populated with a hashref for each paragraph (in
the same order as the returned list).  Each hashref will also have a
special key "I<START-OF-PARAGRAPH>" that gives the line number of the
first field in that paragraph.
These hashrefs will map the field name
of the given paragraph to the line number where the field name
appeared.

This sub is a thin wrapper around C<visit_dpkg_paragraph_string>: it
collects every paragraph handed to the visitor and returns them as a
list.  It can raise the same errors as C<visit_dpkg_paragraph_string>;
see there for the details of how the data is parsed.

=cut

sub parse_dpkg_control_string {
    my ($string, $flags, $field_starts) = @_;

    my @paragraphs;

    # visitor: remember each paragraph and, when the caller asked for
    # them, the matching line numbers
    my $collect = sub {
        my ($fields, $line_numbers) = @_;

        push(@paragraphs, $fields);

        push(@{$field_starts}, $line_numbers)
          if defined $field_starts;
    };

    visit_dpkg_paragraph_string($collect, $string, $flags);

    return @paragraphs;
}

=item parse_dpkg_control_string_lc(STRING[, FLAGS[, LINES]])

Same as C<parse_dpkg_control_string>, except that the field names in
the returned paragraphs, and in LINES if given, are lower-cased.

=cut

sub parse_dpkg_control_string_lc {
    my ($string, $flags, $field_starts) = @_;

    my @paragraphs
      = parse_dpkg_control_string($string, $flags, $field_starts);

    # the line-number list may be undefined; the helper copes with that
    lowercase_field_names($_) for (\@paragraphs, $field_starts);

    return @paragraphs;
}

=item lowercase_field_names(ARRAYREF)

Takes a reference to a list of hashrefs and renames, in place, every
key that is not already lower case to its lower-cased form.  The
special key "I<START-OF-PARAGRAPH>" is left alone.  Does nothing when
ARRAYREF is false.

=cut

sub lowercase_field_names {
    my ($arrayref) = @_;

    return
      unless $arrayref;

    for my $paragraph (@{$arrayref}) {

        # keys() hands back a copy, so the hash may be changed in the loop
        for my $name (keys %{$paragraph}) {

            # magic marker should only appear in field starts
            next
              if $name eq 'START-OF-PARAGRAPH';

            my $lowered = lc $name;
            next
              if $lowered eq $name;

            $paragraph->{$lowered} = delete $paragraph->{$name};
        }
    }

    return;
}

=item visit_dpkg_paragraph_string (CODE, STRING[, FLAGS])

Reads debian control data from STRING and passes each paragraph to
CODE.  A paragraph is represented via a hashref, which maps field
names (spelled as in the input) to their values.

FLAGS (if given) is a bitmask of the I<DCTRL_*> constants imported
from L<Lintian::Deb822::Constants>.
The default value for FLAGS is 0.

If the file is empty (i.e.
it contains no paragraphs), CODE is never called.  The deb822 contents
may be inside a I<signed> PGP message with a signature.

visit_dpkg_paragraph_string will require the PGP headers to be correct
(if present) and require that the entire file is covered by the
signature.  However, it will I<not> validate the signature (in fact,
the contents of the PGP SIGNATURE part can be empty).  The signature
should be validated separately.

visit_dpkg_paragraph_string will pass paragraphs to CODE as they are
completed.  If CODE can process the paragraphs as they are seen, very
large control files can be processed without keeping all the
paragraphs in memory.

As a consequence of how the data is parsed, CODE may be passed a
number of (valid) paragraphs before parsing is stopped due to a syntax
error.

NB: visit_dpkg_paragraph_string works on a string; there is no file
handle for the caller to close.

CODE is expected to be a callable reference (e.g. a sub) and will be
invoked as the following:

=over 4

=item CODE->(PARA, LINE_NUMBERS)

The first argument, PARA, is a hashref to the most recent paragraph
parsed.  The second argument, LINE_NUMBERS, is a hashref mapping each
of the field names to the line number where the field name appeared.
LINE_NUMBERS will also have a special key "I<START-OF-PARAGRAPH>" that
gives the line number of the first field in that paragraph.

The return value of CODE is ignored.

If the CODE invokes die (or similar) the error is propagated to the
caller.

=back


I<On syntax errors>, visit_dpkg_paragraph_string will call die with a
UTF-8 encoded message.  Most messages name the line where the problem
was found.  The message is one of:

=over

=item Duplicate field %s (line %d).

The field appeared twice in the paragraph.

=item Continuation line not in paragraph (line %d). Missing a dot on the previous line?

A continuation line appears outside a paragraph - usually caused by an
unintended empty line before it.

=item Unexpected whitespace (line %d). Missing a dot?

A continuation line consisting only of whitespace was found.  This
usually means that a period is missing to denote an "empty line" in
(e.g.) the long description of a package.  This is only raised when
FLAGS contains DCTRL_DEBCONF_TEMPLATE; otherwise such a line simply
ends the paragraph.

=item Cannot parse line %d: %s

Generic error containing the text of the line that confused the
parser.  Note that all non-printables in %s will be replaced by
underscores.

=item No comments allowed (line %d).

A comment line appeared and FLAGS contained DCTRL_NO_COMMENTS.

=item PGP signature before message (line %d).

A "BEGIN PGP SIGNATURE" header is seen and a "BEGIN PGP SIGNED MESSAGE"
header has not been seen yet.

=item Found two PGP signatures (line %d and line %d).

Two "BEGIN PGP SIGNATURE" headers are seen in the same file.

=item Unexpected %s header (line %d).

A valid PGP header appears that is not supported here (e.g.
"BEGIN PGP PUBLIC KEY BLOCK").

=item Malformed PGP header (line %d).

An invalid or malformed PGP header appears.

=item Multiple PGP messages (line %d and line %d).

Two "BEGIN PGP SIGNED MESSAGE" headers appear in the same file.

=item Cannot find END PGP SIGNATURE header.

The file ended after a "BEGIN PGP SIGNATURE" header without being
followed by an "END PGP SIGNATURE".

=item Expected PGP MESSAGE header (line %d).

The file had content before the "BEGIN PGP SIGNED MESSAGE" header.

=item Data after PGP SIGNATURE (line %d).

The file had data after the PGP SIGNATURE block ended.

=item Cannot find BEGIN PGP SIGNATURE

The file had a "BEGIN PGP SIGNED MESSAGE" header, but no signature was
present.

=back

=cut

sub visit_dpkg_paragraph_string {
    my ($code, $string, $flags) = @_;

    # Parses the deb822 data in $string line by line and calls
    # $code->($section, $field_starts) once for every completed paragraph.
    #
    #   $code    callable receiving a hashref of field => value and a
    #            hashref of field => line number (plus the key
    #            'START-OF-PARAGRAPH')
    #   $string  the control data
    #   $flags   bitmask of DCTRL_* constants; defaults to 0
    #
    # Returns nothing.  Dies with a UTF-8 encoded message on malformed
    # input; paragraphs completed before the error have already been
    # passed to $code by then.

    $flags//=0;
    my $field_starts = {};
    my $section = {};
    my $open_section = 0;
    my $last_tag;
    my $debconf = $flags & DCTRL_DEBCONF_TEMPLATE;

    # line numbers of the PGP headers; zero while not yet seen
    my $signed = 0;
    my $signature = 0;

    my @lines = split(/\n/, $string);

    # 1-based number of the line currently held in $line
    my $position = 1;

    my $line;
    while (defined($line = shift @lines)) {
        chomp $line;

        if (substr($line, 0, 1) eq $NUMBER_SIGN) {
            next
              unless $flags & DCTRL_NO_COMMENTS;
            die encode_utf8("No comments allowed (line $position).\n");
        }

        # empty line?
        if ($line eq $EMPTY || (!$debconf && $line =~ /^\s*$/)) {
            if ($open_section) { # end of current section
                 # pass the current section to the handler
                $code->($section, $field_starts);
                $section = {};
                $field_starts = {};
                $open_section = 0;
            }
        }
        # pgp sig? Be strict here (due to #696230)
        # According to http://tools.ietf.org/html/rfc4880#section-6.2
        # The header MUST start at the beginning of the line and MUST NOT have
        # any other text (except whitespace) after the header.
        elsif ($line =~ m/^-----BEGIN PGP SIGNATURE-----[ \r\t]*$/)
        { # skip until end of signature
            my $saw_end = 0;

            die encode_utf8("PGP signature before message (line $position).\n")
              unless $signed;

            die encode_utf8(
"Found two PGP signatures (line $signature and line $position).\n"
            )if $signature;

            $signature = $position;
            while (defined($line = shift @lines)) {

                # Count every consumed line right away.  Counting in a
                # continue block would miss the END line, because 'last'
                # skips it, and leave later line numbers one too small.
                ++$position;

                if ($line =~ /^-----END PGP SIGNATURE-----[ \r\t]*$/) {
                    $saw_end = 1;
                    last;
                }
            }

            die encode_utf8("Cannot find END PGP SIGNATURE header.\n")
              unless $saw_end;
        }
        # other pgp control?
        elsif ($line =~ /^-----(?:BEGIN|END) PGP/) {
            # At this point it could be a malformed PGP header or one
            # of the following valid headers (RFC4880):
            #  * BEGIN PGP MESSAGE
            #    - Possibly a signed Debian CTRL, so okay (for now)
            #  * BEGIN PGP {PUBLIC,PRIVATE} KEY BLOCK
            #    - Valid header, but not a Debian CTRL file.
            #  * BEGIN PGP MESSAGE, PART X{,/Y}
            #    - Valid, but we don't support partial messages, so
            #      bail on those.

            unless ($line =~ /^-----BEGIN PGP SIGNED MESSAGE-----[ \r\t]*$/) {
                # Not a (full) PGP MESSAGE; reject.

                my $key = qr/(?:BEGIN|END) PGP (?:PUBLIC|PRIVATE) KEY BLOCK/;
                my $msgpart = qr{BEGIN PGP MESSAGE, PART \d+(?:/\d+)?};
                my $msg
                  = qr/(?:BEGIN|END) PGP (?:(?:COMPRESSED|ENCRYPTED) )?MESSAGE/;

                if ($line =~ /^-----($key|$msgpart|$msg)-----[ \r\t]*$/) {
                    die encode_utf8(
                        "Unexpected $1 header (line $position).\n");
                }

                die encode_utf8("Malformed PGP header (line $position).\n");

            } else {
                die encode_utf8(
"Multiple PGP messages (line $signed and line $position).\n"
                )if $signed;

                # NB: If you remove this, keep in mind that it may
                # allow two paragraphs to merge.  Consider:
                #
                # Field-P1: some-value
                # -----BEGIN PGP SIGNATURE-----
                #
                # Field-P2: another value
                #
                # At the time of writing: If $open_section is
                # true, it will remain so until the empty line
                # after the PGP header.
                die encode_utf8(
                    "Expected PGP MESSAGE header (line $position).\n")
                  if $last_tag;

                $signed = $position;
            }

            # skip the armor headers, up to and including the next
            # blank line
            while (defined($line = shift @lines)) {

                # count the blank terminator too, so that the fields
                # that follow get their true line numbers
                ++$position;

                last
                  if $line =~ /^\s*$/;
            }
        }
        # did we see a signature already?  We allow all whitespace/comment lines
        # outside the signature.
        elsif ($signature) {
            # Accept empty lines after the signature.
            next
              if $line =~ /^\s*$/;

            # NB: If you remove this, keep in mind that it may allow
            # two paragraphs to merge.  Consider:
            #
            # Field-P1: some-value
            # -----BEGIN PGP SIGNATURE-----
            # [...]
            # -----END PGP SIGNATURE-----
            # Field-P2: another value
            #
            # At the time of writing: If $open_section is true, it
            # will remain so until the empty line after the PGP
            # header.
            die encode_utf8("Data after PGP SIGNATURE (line $position).\n");
        }
        # new field?  The value may be empty, e.g. when it only starts
        # on the continuation lines that follow.
        elsif ($line =~ /^([^: \t]+):\s*(.*)$/) {
            $field_starts->{'START-OF-PARAGRAPH'} = $position
              unless $open_section;
            $open_section = 1;

            # Policy: Horizontal whitespace (spaces and tabs) may occur
            # immediately before or after the value and is ignored there.
            my $tag = $1;
            my $value = $2;

            # trim right
            $value =~ s/\s+$//;

            # Policy: A paragraph must not contain more than one instance
            # of a particular field name.  This also covers fields with
            # an empty value, which must not silently replace an
            # earlier instance.
            die encode_utf8("Duplicate field $tag (line $position).\n")
              if exists $section->{$tag};

            $value =~ s/#.*$//
              if $flags & DCTRL_COMMENTS_AT_EOL;
            $section->{$tag} = $value;
            $field_starts->{$tag} = $position;

            $last_tag = $tag;
        }

        # continued field?
        elsif ($line =~ /^([ \t].*\S.*)$/) {
            die encode_utf8(
"Continuation line not in paragraph (line $position). Missing a dot on the previous line?\n"
            )unless $open_section;

            # Policy: Many fields' values may span several lines; in this case
            # each continuation line must start with a space or a tab.  Any
            # trailing spaces or tabs at the end of individual lines of a
            # field value are ignored.
            my $value = $1;

            # trim right
            $value =~ s/\s+$//;

            $value =~ s/#.*$//
              if $flags & DCTRL_COMMENTS_AT_EOL;
            $section->{$last_tag} .= "\n" . $value;
        }
        # None of the above => syntax error
        else {

            die encode_utf8(
                "Unexpected whitespace (line $position). Missing a dot?\n")
              if $line =~ /^\s+$/;

            # Replace non-printables and non-space characters with
            # "_" - just in case.
            $line =~ s/[^[:graph:][:space:]]/_/g;

            die encode_utf8("Cannot parse line $position: $line\n");
        }

    }continue {
        ++$position;
    }

    # pass the last section (if not already done).
    $code->($section, $field_starts)
      if $open_section;

    # Given the API, we cannot use this check to prevent any
    # paragraphs from being emitted to the code argument, so we might
    # as well just do this last.

    die encode_utf8("Cannot find BEGIN PGP SIGNATURE.\n")
      if $signed && !$signature;

    return;
}

=back

=head1 AUTHOR

Originally written by Christian Schwarz and many other people.

Moo version by Felix Lechner <felix.lechner@lease-up.com> for Lintian.

=head1 SEE ALSO

lintian(1)

=cut

1;

# Local Variables:
# indent-tabs-mode: nil
# cperl-indent-level: 4
# End:
# vim: syntax=perl sw=4 sts=4 sr et