summaryrefslogtreecommitdiffstats
path: root/lib/Lintian/Deb822.pm
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--lib/Lintian/Deb822.pm692
1 files changed, 692 insertions, 0 deletions
diff --git a/lib/Lintian/Deb822.pm b/lib/Lintian/Deb822.pm
new file mode 100644
index 0000000..c153415
--- /dev/null
+++ b/lib/Lintian/Deb822.pm
@@ -0,0 +1,692 @@
+# Copyright (C) 1998 Christian Schwarz
+# Copyright (C) 2020 Felix Lechner
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, you can find it on the World Wide
+# Web at https://www.gnu.org/copyleft/gpl.html, or write to the Free
+# Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
+# MA 02110-1301, USA.
+
+package Lintian::Deb822;
+
+use v5.20;
+use warnings;
+use utf8;
+
+use Const::Fast;
+use Path::Tiny;
+use Syntax::Keyword::Try;
+use Unicode::UTF8 qw(encode_utf8);
+
+use Lintian::Deb822::Constants qw(:constants);
+use Lintian::Deb822::Section;
+
+const my $EMPTY => q{};
+const my $NUMBER_SIGN => q{#};
+
+use Moo;
+use namespace::clean;
+
+=encoding utf-8
+
+=head1 NAME
+
+Lintian::Deb822 -- A deb822 control file
+
+=head1 SYNOPSIS
+
+ use Lintian::Deb822;
+
+=head1 DESCRIPTION
+
+Represents a paragraph in a Deb822 control file.
+
+=head1 INSTANCE METHODS
+
+=over 4
+
+=item sections
+
+Array of Deb822::Section objects in order of their original appearance.
+
+=item positions
+
+Line positions
+
+=cut
+
+has sections => (is => 'rw', default => sub { [] });
+has positions => (is => 'rw', default => sub { [] });
+
+=item first_mention
+
+=cut
+
+sub first_mention {
+ my ($self, $name) = @_;
+
+ my $earliest;
+
+ # empty when field not present
+ $earliest ||= $_->value($name) for @{$self->sections};
+
+ return ($earliest // $EMPTY);
+}
+
+=item last_mention
+
+=cut
+
+sub last_mention {
+ my ($self, $name) = @_;
+
+ my $latest;
+
+ for my $section (@{$self->sections}) {
+
+ # empty when field not present
+ $latest = $section->value($name)
+ if $section->declares($name);
+ }
+
+ return ($latest // $EMPTY);
+}
+
+=item read_file
+
+=cut
+
+sub read_file {
+ my ($self, $path, $flags) = @_;
+
+ my $contents = path($path)->slurp_utf8;
+
+ return $self->parse_string($contents, $flags);
+}
+
+=item parse_string
+
+=cut
+
+sub parse_string {
+ my ($self, $contents, $flags) = @_;
+
+ my (@paragraphs, @positions);
+
+ try {
+ @paragraphs= parse_dpkg_control_string($contents, $flags,\@positions);
+
+ } catch {
+ # ignore syntax errors here
+ die map { encode_utf8($_) } $@
+ unless $@ =~ /syntax error/;
+ }
+
+ my $index = 0;
+ for my $paragraph (@paragraphs) {
+
+ my $section = Lintian::Deb822::Section->new;
+ $section->verbatim($paragraph);
+ $section->positions($positions[$index]);
+
+ push(@{$self->sections}, $section);
+
+ } continue {
+ $index++;
+ }
+
+ return @{$self->sections};
+}
+
+=back
+
+=head1 FUNCTIONS
+
+=head2 Debian control parsers
+
+At first glance, this module appears to contain several debian control
+parsers. In practise, there is only one real parser
+(L</visit_dpkg_paragraph_string>) - the rest are convenience functions around
+it.
+
+=over 4
+
+=item read_dpkg_control(FILE[, FLAGS[, LINES]])
+
+This is a convenience function to ease using L</parse_dpkg_control>
+with paths to files (rather than open handles). The first argument
+must be the path to a FILE, which should be read as a debian control
+file. If the file is empty, an empty list is returned.
+
+Otherwise, this behaves like:
+
+ use autodie;
+
+ open(my $fd, '<:encoding(UTF-8)', FILE); # or '<'
+ my @p = parse_dpkg_control($fd, FLAGS, LINES);
+ close($fd);
+ return @p;
+
+This goes without saying that may fail with any of the messages that
+L</parse_dpkg_control(HANDLE[, FLAGS[, LINES]])> do. It can also emit
+autodie exceptions if open or close fails.
+
+=cut
+
+sub read_dpkg_control {
+ my ($file, $flags, $field_starts) = @_;
+
+ open(my $handle, '<:utf8_strict', $file)
+ or die encode_utf8("Cannot open $file");
+
+ local $/ = undef;
+ my $string = <$handle>;
+ close $handle;
+
+ my @result;
+
+ my $visitor = sub {
+ my ($paragraph, $line) = @_;
+
+ push(@result, $paragraph);
+ push(@{$field_starts}, $line) if defined $field_starts;
+ };
+
+ visit_dpkg_paragraph_string($visitor, $string, $flags);
+
+ return @result;
+}
+
+=item read_dpkg_control_lc(FILE[, FLAGS[, LINES]])
+
+=cut
+
+sub read_dpkg_control_lc {
+ my ($file, $flags, $field_starts) = @_;
+
+ my @result = read_dpkg_control($file, $flags, $field_starts);
+
+ lowercase_field_names(\@result);
+ lowercase_field_names($field_starts);
+
+ return @result;
+}
+
+=item parse_dpkg_control_string(STRING[, FLAGS[, LINES]])
+
+Reads debian control data from STRING and returns a list of
+paragraphs in it. A paragraph is represented via a hashref, which
+maps (lower cased) field names to their values.
+
+FLAGS (if given) is a bitmask of the I<DCTRL_*> constants. Please
+refer to L</CONSTANTS> for the list of constants and their meaning.
+The default value for FLAGS is 0.
+
+If LINES is given, it should be a reference to an empty list. On
+return, LINES will be populated with a hashref for each paragraph (in
+the same order as the returned list). Each hashref will also have a
+special key "I<START-OF-PARAGRAPH>" that gives the line number of the
+first field in that paragraph. These hashrefs will map the field name
+of the given paragraph to the line number where the field name
+appeared.
+
+This is a convenience sub around L</visit_dpkg_paragraph> and can
+therefore produce the same errors as it. Please see
+L</visit_dpkg_paragraph> for the finer semantics of how the
+control file is parsed.
+
+NB: parse_dpkg_control does I<not> close the handle for the caller.
+
+=cut
+
+sub parse_dpkg_control_string {
+ my ($string, $flags, $field_starts) = @_;
+ my @result;
+
+ my $c = sub {
+ my ($para, $line) = @_;
+
+ push(@result, $para);
+ push(@{$field_starts}, $line)
+ if defined $field_starts;
+ };
+
+ visit_dpkg_paragraph_string($c, $string, $flags);
+
+ return @result;
+}
+
+=item parse_dpkg_control_string_lc(STRING[, FLAGS[, LINES]])
+
+=cut
+
+sub parse_dpkg_control_string_lc {
+ my ($string, $flags, $field_starts) = @_;
+
+ my @result = parse_dpkg_control_string($string, $flags, $field_starts);
+
+ lowercase_field_names(\@result);
+ lowercase_field_names($field_starts);
+
+ return @result;
+}
+
+=item lowercase_field_names
+
+=cut
+
+sub lowercase_field_names {
+ my ($arrayref) = @_;
+
+ return
+ unless $arrayref;
+
+ for my $paragraph (@{$arrayref}) {
+
+ # magic marker should only appear in field starts
+ my @fields = grep { $_ ne 'START-OF-PARAGRAPH' } keys %{$paragraph};
+ my @mixedcase = grep { $_ ne lc } @fields;
+
+ for my $old (@mixedcase) {
+ $paragraph->{lc $old} = $paragraph->{$old};
+ delete $paragraph->{$old};
+ }
+ }
+
+ return;
+}
+
+=item visit_dpkg_paragraph_string (CODE, STRING[, FLAGS])
+
+Reads debian control data from STRING and passes each paragraph to
+CODE. A paragraph is represented via a hashref, which maps (lower
+cased) field names to their values.
+
+FLAGS (if given) is a bitmask of the I<DCTRL_*> constants. Please
+refer to L</CONSTANTS> for the list of constants and their meaning.
+The default value for FLAGS is 0.
+
+If the file is empty (i.e. it contains no paragraphs), the method will
+contain an I<empty> list. The deb822 contents may be inside a
+I<signed> PGP message with a signature.
+
+visit_dpkg_paragraph will require the PGP headers to be correct (if
+present) and require that the entire file is covered by the signature.
+However, it will I<not> validate the signature (in fact, the contents
+of the PGP SIGNATURE part can be empty). The signature should be
+validated separately.
+
+visit_dpkg_paragraph will pass paragraphs to CODE as they are
+completed. If CODE can process the paragraphs as they are seen, very
+large control files can be processed without keeping all the
+paragraphs in memory.
+
+As a consequence of how the file is parsed, CODE may be passed a
+number of (valid) paragraphs before parsing is stopped due to a syntax
+error.
+
+NB: visit_dpkg_paragraph does I<not> close the handle for the caller.
+
+CODE is expected to be a callable reference (e.g. a sub) and will be
+invoked as the following:
+
+=over 4
+
+=item CODE->(PARA, LINE_NUMBERS)
+
+The first argument, PARA, is a hashref to the most recent paragraph
+parsed. The second argument, LINE_NUMBERS, is a hashref mapping each
+of the field names to the line number where the field name appeared.
+LINE_NUMBERS will also have a special key "I<START-OF-PARAGRAPH>" that
+gives the line number of the first field in that paragraph.
+
+The return value of CODE is ignored.
+
+If the CODE invokes die (or similar) the error is propagated to the
+caller.
+
+=back
+
+
+I<On syntax errors>, visit_dpkg_paragraph will call die with the
+following string:
+
+ "syntax error at line %d: %s\n"
+
+Where %d is the line number of the issue and %s is one of:
+
+=over
+
+=item Duplicate field %s
+
+The field appeared twice in the paragraph.
+
+=item Continuation line outside a paragraph (maybe line %d should be " .")
+
+A continuation line appears outside a paragraph - usually caused by an
+unintended empty line before it.
+
+=item Whitespace line not allowed (possibly missing a ".")
+
+An empty continuation line was found. This usually means that a
+period is missing to denote an "empty line" in (e.g.) the long
+description of a package.
+
+=item Cannot parse line "%s"
+
+Generic error containing the text of the line that confused the
+parser. Note that all non-printables in %s will be replaced by
+underscores.
+
+=item Comments are not allowed
+
+A comment line appeared and FLAGS contained DCTRL_NO_COMMENTS.
+
+=item PGP signature seen before start of signed message
+
+A "BEGIN PGP SIGNATURE" header is seen and a "BEGIN PGP MESSAGE" has
+not been seen yet.
+
+=item Two PGP signatures (first one at line %d)
+
+Two "BEGIN PGP SIGNATURE" headers are seen in the same file.
+
+=item Unexpected %s header
+
+A valid PGP header appears (e.g. "BEGIN PUBLIC KEY BLOCK").
+
+=item Malformed PGP header
+
+An invalid or malformed PGP header appears.
+
+=item Expected at most one signed message (previous at line %d)
+
+Two "BEGIN PGP MESSAGE" headers appears in the same message.
+
+=item End of file but expected an "END PGP SIGNATURE" header
+
+The file ended after a "BEGIN PGP SIGNATURE" header without being
+followed by an "END PGP SIGNATURE".
+
+=item PGP MESSAGE header must be first content if present
+
+The file had content before PGP MESSAGE.
+
+=item Data after the PGP SIGNATURE
+
+The file had data after the PGP SIGNATURE block ended.
+
+=item End of file before "BEGIN PGP SIGNATURE"
+
+The file had a "BEGIN PGP MESSAGE" header, but no signature was
+present.
+
+=back
+
+=cut
+
+sub visit_dpkg_paragraph_string {
+ my ($code, $string, $flags) = @_;
+ $flags//=0;
+ my $field_starts = {};
+ my $section = {};
+ my $open_section = 0;
+ my $last_tag;
+ my $debconf = $flags & DCTRL_DEBCONF_TEMPLATE;
+ my $signed = 0;
+ my $signature = 0;
+
+ my @lines = split(/\n/, $string);
+
+ my $position = 1;
+
+ my $line;
+ while (defined($line = shift @lines)) {
+ chomp $line;
+
+ if (substr($line, 0, 1) eq $NUMBER_SIGN) {
+ next
+ unless $flags & DCTRL_NO_COMMENTS;
+ die encode_utf8("No comments allowed (line $position).\n");
+ }
+
+ # empty line?
+ if ($line eq $EMPTY || (!$debconf && $line =~ /^\s*$/)) {
+ if ($open_section) { # end of current section
+ # pass the current section to the handler
+ $code->($section, $field_starts);
+ $section = {};
+ $field_starts = {};
+ $open_section = 0;
+ }
+ }
+ # pgp sig? Be strict here (due to #696230)
+ # According to http://tools.ietf.org/html/rfc4880#section-6.2
+ # The header MUST start at the beginning of the line and MUST NOT have
+ # any other text (except whitespace) after the header.
+ elsif ($line =~ m/^-----BEGIN PGP SIGNATURE-----[ \r\t]*$/)
+ { # skip until end of signature
+ my $saw_end = 0;
+
+ die encode_utf8("PGP signature before message (line $position).\n")
+ unless $signed;
+
+ die encode_utf8(
+"Found two PGP signatures (line $signature and line $position).\n"
+ )if $signature;
+
+ $signature = $position;
+ while (defined($line = shift @lines)) {
+ if ($line =~ /^-----END PGP SIGNATURE-----[ \r\t]*$/) {
+ $saw_end = 1;
+ last;
+ }
+ }continue {
+ ++$position;
+ }
+
+ # The "at line X" may seem a little weird, but it keeps the
+ # message format identical.
+ die encode_utf8("Cannot find END PGP SIGNATURE header.\n")
+ unless $saw_end;
+ }
+ # other pgp control?
+ elsif ($line =~ /^-----(?:BEGIN|END) PGP/) {
+ # At this point it could be a malformed PGP header or one
+ # of the following valid headers (RFC4880):
+ # * BEGIN PGP MESSAGE
+ # - Possibly a signed Debian CTRL, so okay (for now)
+ # * BEGIN PGP {PUBLIC,PRIVATE} KEY BLOCK
+ # - Valid header, but not a Debian CTRL file.
+ # * BEGIN PGP MESSAGE, PART X{,/Y}
+ # - Valid, but we don't support partial messages, so
+ # bail on those.
+
+ unless ($line =~ /^-----BEGIN PGP SIGNED MESSAGE-----[ \r\t]*$/) {
+ # Not a (full) PGP MESSAGE; reject.
+
+ my $key = qr/(?:BEGIN|END) PGP (?:PUBLIC|PRIVATE) KEY BLOCK/;
+ my $msgpart = qr{BEGIN PGP MESSAGE, PART \d+(?:/\d+)?};
+ my $msg
+ = qr/(?:BEGIN|END) PGP (?:(?:COMPRESSED|ENCRYPTED) )?MESSAGE/;
+
+ if ($line =~ /^-----($key|$msgpart|$msg)-----[ \r\t]*$/) {
+ die encode_utf8(
+ "Unexpected $1 header (line $position).\n");
+ }
+
+ die encode_utf8("Malformed PGP header (line $position).\n");
+
+ } else {
+ die encode_utf8(
+"Multiple PGP messages (line $signed and line $position).\n"
+ )if $signed;
+
+ # NB: If you remove this, keep in mind that it may
+ # allow two paragraphs to merge. Consider:
+ #
+ # Field-P1: some-value
+ # -----BEGIN PGP SIGNATURE-----
+ #
+ # Field-P2: another value
+ #
+ # At the time of writing: If $open_section is
+ # true, it will remain so until the empty line
+ # after the PGP header.
+ die encode_utf8(
+ "Expected PGP MESSAGE header (line $position).\n")
+ if $last_tag;
+
+ $signed = $position;
+ }
+
+ # skip until the next blank line
+ while (defined($line = shift @lines)) {
+ last
+ if $line =~ /^\s*$/;
+ }continue {
+ ++$position;
+ }
+ }
+ # did we see a signature already? We allow all whitespace/comment lines
+ # outside the signature.
+ elsif ($signature) {
+ # Accept empty lines after the signature.
+ next
+ if $line =~ /^\s*$/;
+
+ # NB: If you remove this, keep in mind that it may allow
+ # two paragraphs to merge. Consider:
+ #
+ # Field-P1: some-value
+ # -----BEGIN PGP SIGNATURE-----
+ # [...]
+ # -----END PGP SIGNATURE-----
+ # Field-P2: another value
+ #
+ # At the time of writing: If $open_section is true, it
+ # will remain so until the empty line after the PGP
+ # header.
+ die encode_utf8("Data after PGP SIGNATURE (line $position).\n");
+ }
+ # new empty field?
+ elsif ($line =~ /^([^: \t]+):\s*$/) {
+ $field_starts->{'START-OF-PARAGRAPH'} = $position
+ unless $open_section;
+ $open_section = 1;
+
+ my $tag = $1;
+ $section->{$tag} = $EMPTY;
+ $field_starts->{$tag} = $position;
+
+ $last_tag = $tag;
+ }
+ # new field?
+ elsif ($line =~ /^([^: \t]+):\s*(.*)$/) {
+ $field_starts->{'START-OF-PARAGRAPH'} = $position
+ unless $open_section;
+ $open_section = 1;
+
+ # Policy: Horizontal whitespace (spaces and tabs) may occur
+ # immediately before or after the value and is ignored there.
+ my $tag = $1;
+ my $value = $2;
+
+ # trim right
+ $value =~ s/\s+$//;
+
+ if (exists $section->{$tag}) {
+ # Policy: A paragraph must not contain more than one instance
+ # of a particular field name.
+ die encode_utf8("Duplicate field $tag (line $position).\n");
+ }
+ $value =~ s/#.*$//
+ if $flags & DCTRL_COMMENTS_AT_EOL;
+ $section->{$tag} = $value;
+ $field_starts->{$tag} = $position;
+
+ $last_tag = $tag;
+ }
+
+ # continued field?
+ elsif ($line =~ /^([ \t].*\S.*)$/) {
+ die encode_utf8(
+"Continuation line not in paragraph (line $position). Missing a dot on the previous line?\n"
+ )unless $open_section;
+
+ # Policy: Many fields' values may span several lines; in this case
+ # each continuation line must start with a space or a tab. Any
+ # trailing spaces or tabs at the end of individual lines of a
+ # field value are ignored.
+ my $value = $1;
+
+ # trim right
+ $value =~ s/\s+$//;
+
+ $value =~ s/#.*$//
+ if $flags & DCTRL_COMMENTS_AT_EOL;
+ $section->{$last_tag} .= "\n" . $value;
+ }
+ # None of the above => syntax error
+ else {
+
+ die encode_utf8(
+ "Unexpected whitespace (line $position). Missing a dot?\n")
+ if $line =~ /^\s+$/;
+
+ # Replace non-printables and non-space characters with
+ # "_" - just in case.
+ $line =~ s/[^[:graph:][:space:]]/_/g;
+
+ die encode_utf8("Cannot parse line $position: $line\n");
+ }
+
+ }continue {
+ ++$position;
+ }
+
+ # pass the last section (if not already done).
+ $code->($section, $field_starts)
+ if $open_section;
+
+ # Given the API, we cannot use this check to prevent any
+ # paragraphs from being emitted to the code argument, so we might
+ # as well just do this last.
+
+ die encode_utf8("Cannot find BEGIN PGP SIGNATURE\n.")
+ if $signed && !$signature;
+
+ return;
+}
+
+=back
+
+=head1 AUTHOR
+
+Originally written Christian Schwarz and many other people.
+
+Moo version by Felix Lechner <felix.lechner@lease-up.com> for Lintian.
+
+=head1 SEE ALSO
+
+lintian(1)
+
+=cut
+
+1;
+
+# Local Variables:
+# indent-tabs-mode: nil
+# cperl-indent-level: 4
+# End:
+# vim: syntax=perl sw=4 sts=4 sr et