1 files changed, 674 insertions, 0 deletions
diff --git a/lib/Lintian/Util.pm b/lib/Lintian/Util.pm
new file mode 100644
index 0000000..c512451
--- /dev/null
+++ b/lib/Lintian/Util.pm
@@ -0,0 +1,674 @@
+# Hey emacs! This is a -*- Perl -*- script!
+# Lintian::Util -- Perl utility functions for lintian
+
+# Copyright (C) 1998 Christian Schwarz
+# Copyright (C) 2018-2019 Chris Lamb <lamby@debian.org>
+# Copyright (C) 2020 Felix Lechner
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, you can find it on the World Wide
+# Web at https://www.gnu.org/copyleft/gpl.html, or write to the Free
+# Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
+# MA 02110-1301, USA.
+
+package Lintian::Util;
+
+use v5.20;
+use warnings;
+use utf8;
+
+use Exporter qw(import);
+
+# Force export as soon as possible, since some of the modules we load also
+# depend on us and the sequencing can cause things not to be exported
+# otherwise.
+our @EXPORT_OK;
+
+BEGIN {
+
+    @EXPORT_OK = (
+        qw(
+          get_file_checksum
+          get_file_digest
+          human_bytes
+          perm2oct
+          locate_executable
+          match_glob
+          normalize_pkg_path
+          normalize_link_target
+          is_ancestor_of
+          drain_pipe
+          drop_relative_prefix
+          read_md5sums
+          utf8_clean_log
+          utf8_clean_bytes
+          version_from_changelog
+          $PKGNAME_REGEX
+          $PKGREPACK_REGEX
+          $PKGVERSION_REGEX
+        )
+    );
+}
+
+use Carp qw(croak);
+use Const::Fast;
+use Cwd qw(abs_path);
+use Digest::MD5;
+use Digest::SHA;
+use List::SomeUtils qw(first_value);
+use Path::Tiny;
+use Regexp::Wildcards;
+use Unicode::UTF8 qw(valid_utf8 encode_utf8);
+
+use Lintian::Deb822;
+use Lintian::Changelog;
+use Lintian::Relation::Version qw(versions_equal versions_comparator);
+
+const my $EMPTY => q{};
+const my $SPACE => q{ };
+const my $NEWLINE => qq{\n};
+const my $SLASH => q{/};
+const my $DOT => q{.};
+const my $DOUBLEDOT => q{..};
+const my $BACKSLASH => q{\\};
+
+const my $DEFAULT_READ_SIZE => 4096;
+const my $KIB_UNIT_FACTOR => 1024;
+const my $COMFORT_THRESHOLD => 1536;
+
+const my $OWNER_READ => oct(400);
+const my $OWNER_WRITE => oct(200);
+const my $OWNER_EXECUTE => oct(100);
+const my $SETUID => oct(4000);
+const my $SETUID_OWNER_EXECUTE => oct(4100);
+const my $GROUP_READ => oct(40);
+const my $GROUP_WRITE => oct(20);
+const my $GROUP_EXECUTE => oct(10);
+const my $SETGID => oct(2000);
+const my $SETGID_GROUP_EXECUTE => oct(2010);
+const my $WORLD_READ => oct(4);
+const my $WORLD_WRITE => oct(2);
+const my $WORLD_EXECUTE => oct(1);
+const my $STICKY => oct(1000);
+const my $STICKY_WORLD_EXECUTE => oct(1001);
+
+# preload cache for common permission strings
+# call overhead o perm2oct was measurable on chromium-browser/32.0.1700.123-2
+# load time went from ~1.5s to ~0.1s; of 115363 paths, only 306 were uncached
+# standard file, executable file, standard dir, dir with suid, symlink
+my %OCTAL_LOOKUP = map { $_ => perm2oct($_) } qw(
+  -rw-r--r--
+  -rwxr-xr-x
+  drwxr-xr-x
+  drwxr-sr-x
+  lrwxrwxrwx
+);
+
+my $rw = Regexp::Wildcards->new(type => 'jokers');
+
+=head1 NAME
+
+Lintian::Util - Lintian utility functions
+
+=head1 SYNOPSIS
+
+ use Lintian::Util;
+
+=head1 DESCRIPTION
+
+This module contains a number of utility subs that are nice to have,
+but on their own did not warrant their own module.
+
+Most subs are imported only on request.
+
+=head1 VARIABLES
+
+=over 4
+
+=item $PKGNAME_REGEX
+
+Regular expression that matches valid package names.  The expression
+is not anchored and does not enforce any "boundary" characters.
+
+=cut
+
+our $PKGNAME_REGEX = qr/[a-z0-9][-+\.a-z0-9]+/;
+
+=item $PKGREPACK_REGEX
+
+Regular expression that matches "repacked" package names.  The expression is
+not anchored and does not enforce any "boundary" characters. It should only
+be applied to the upstream portion (see #931846).
+
+=cut
+
+our $PKGREPACK_REGEX = qr/(dfsg|debian|ds|repack)/;
+
+=item $PKGVERSION_REGEX
+
+Regular expression that matches valid package versions.  The
+expression is not anchored and does not enforce any "boundary"
+characters.
+
+=cut
+
+our $PKGVERSION_REGEX = qr{
+                 (?: \d+ : )?                # Optional epoch
+                 [0-9][0-9A-Za-z.+:~]*       # Upstream version (with no hyphens)
+                 (?: - [0-9A-Za-z.+:~]+ )*   # Optional debian revision (+ upstreams versions with hyphens)
+                          }xa;
+
+=back
+
+=head1 FUNCTIONS
+
+=over 4
+
+=item drain_pipe(FD)
+
+Reads and discards any remaining contents from FD, which is assumed to
+be a pipe.  This is mostly done to avoid having the "write"-end die
+with a SIGPIPE due to a "broken pipe" (which can happen if you just
+close the pipe).
+
+May cause an exception if there are issues reading from the pipe.
+
+Caveat: This will block until the pipe is closed from the "write"-end,
+so only use it with pipes where the "write"-end will eventually close
+their end by themselves (or something else will make them close it).
+
+=cut
+
+sub drain_pipe {
+    my ($fd) = @_;
+    my $buffer;
+
+    1 while (read($fd, $buffer, $DEFAULT_READ_SIZE) > 0);
+
+    return 1;
+}
+
+=item get_file_digest(ALGO, FILE)
+
+Creates an ALGO digest object that is seeded with the contents of
+FILE.  If you just want the hex digest, please use
+L</get_file_checksum(ALGO, FILE)> instead.
+
+ALGO can be 'md5' or shaX, where X is any number supported by
+L<Digest::SHA> (e.g. 'sha256').
+
+This sub is a convenience wrapper around Digest::{MD5,SHA}.
+
+=cut
+
+sub get_file_digest {
+    my ($alg, $file) = @_;
+
+    open(my $fd, '<', $file)
+      or die encode_utf8("Cannot open $file");
+
+    my $digest;
+    if (lc($alg) eq 'md5') {
+        $digest = Digest::MD5->new;
+    } elsif (lc($alg) =~ /sha(\d+)/) {
+        $digest = Digest::SHA->new($1);
+    }
+    $digest->addfile($fd);
+    close($fd);
+
+    return $digest;
+}
+
+=item get_file_checksum(ALGO, FILE)
+
+Returns a hexadecimal string of the message digest checksum generated
+by the algorithm ALGO on FILE.
+
+ALGO can be 'md5' or shaX, where X is any number supported by
+L<Digest::SHA> (e.g. 'sha256').
+
+This sub is a convenience wrapper around Digest::{MD5,SHA}.
+
+=cut
+
+sub get_file_checksum {
+    my @paths = @_;
+
+    my $digest = get_file_digest(@paths);
+
+    return $digest->hexdigest;
+}
+
+=item perm2oct(PERM)
+
+Translates PERM to an octal permission.  PERM should be a string describing
+the permissions as done by I<tar t> or I<ls -l>.  That is, it should be a
+string like "-rw-r--r--".
+
+If the string does not appear to be a valid permission, it will cause
+a trappable error.
+
+Examples:
+
+ # Good
+ perm2oct('-rw-r--r--') == oct(644)
+ perm2oct('-rwxr-xr-x') == oct(755)
+
+ # Bad
+ perm2oct('broken')      # too short to be recognised
+ perm2oct('-resurunet')  # contains unknown permissions
+
+=cut
+
+sub perm2oct {
+    my ($text) = @_;
+
+    my $lookup = $OCTAL_LOOKUP{$text};
+    return $lookup
+      if defined $lookup;
+
+    my $octal = 0;
+
+    # Types:
+    #  file (-), block/character device (b & c), directory (d),
+    #  hardlink (h), symlink (l), named pipe (p).
+    if (
+        $text !~ m{^   [-bcdhlp]                # file type
+                    ([-r])([-w])([-xsS])     # user
+                    ([-r])([-w])([-xsS])     # group
+                    ([-r])([-w])([-xtT])     # other
+               }xsm
+    ) {
+        croak encode_utf8("$text does not appear to be a permission string");
+    }
+
+    $octal |= $OWNER_READ if $1 eq 'r';
+    $octal |= $OWNER_WRITE if $2 eq 'w';
+    $octal |= $OWNER_EXECUTE if $3 eq 'x';
+    $octal |= $SETUID if $3 eq 'S';
+    $octal |= $SETUID_OWNER_EXECUTE if $3 eq 's';
+    $octal |= $GROUP_READ if $4 eq 'r';
+    $octal |= $GROUP_WRITE if $5 eq 'w';
+    $octal |= $GROUP_EXECUTE if $6 eq 'x';
+    $octal |= $SETGID if $6 eq 'S';
+    $octal |= $SETGID_GROUP_EXECUTE if $6 eq 's';
+    $octal |= $WORLD_READ if $7 eq 'r';
+    $octal |= $WORLD_WRITE if $8 eq 'w';
+    $octal |= $WORLD_EXECUTE if $9 eq 'x';
+    $octal |= $STICKY if $9 eq 'T';
+    $octal |= $STICKY_WORLD_EXECUTE if $9 eq 't';
+
+    $OCTAL_LOOKUP{$text} = $octal;
+
+    return $octal;
+}
+
+=item human_bytes(SIZE)
+
+=cut
+
+sub human_bytes {
+    my ($size) = @_;
+
+    my @units = qw(B kiB MiB GiB);
+
+    my $unit = shift @units;
+
+    while ($size > $COMFORT_THRESHOLD && @units) {
+
+        $size /= $KIB_UNIT_FACTOR;
+        $unit = shift @units;
+    }
+
+    my $human = sprintf('%.0f %s', $size, $unit);
+
+    return $human;
+}
+
+=item locate_executable (CMD)
+
+=cut
+
+sub locate_executable {
+    my ($command) = @_;
+
+    return $EMPTY
+      unless exists $ENV{PATH};
+
+    my @folders =  grep { length } split(/:/, $ENV{PATH});
+    my $path = first_value { -x "$_/$command" } @folders;
+
+    return ($path // $EMPTY);
+}
+
+=item drop_relative_prefix(STRING)
+
+Remove an initial ./ from STRING, if present
+
+=cut
+
+sub drop_relative_prefix {
+    my ($name) = @_;
+
+    my $copy = $name;
+    $copy =~ s{^\./}{}s;
+
+    return $copy;
+}
+
+=item version_from_changelog
+
+=cut
+
+sub version_from_changelog {
+    my ($package_path) = @_;
+
+    my $changelog_path = "$package_path/debian/changelog";
+
+    return $EMPTY
+      unless -e $changelog_path;
+
+    my $contents = path($changelog_path)->slurp_utf8;
+    my $changelog = Lintian::Changelog->new;
+
+    $changelog->parse($contents);
+    my @entries = @{$changelog->entries};
+
+    return $entries[0]->{'Version'}
+      if @entries;
+
+    return $EMPTY;
+}
+
+=item match_glob( $glob, @things_to_test )
+
+Resembles the same semantic as Text::Glob's match_glob(), but with the
+proper escaping of Regexp::Wildcards and pre-configured for Lintian's
+purpose. No more directly having to access module variables either.
+
+=cut
+
+sub match_glob {
+    my ($glob, @things_to_test) = @_;
+    my $re = $rw->convert($glob);
+
+    return grep { /^$re\z/ } @things_to_test;
+}
+
+=item normalize_pkg_path(PATH)
+
+Normalize PATH by removing superfluous path segments.  PATH is assumed
+to be relative the package root.  Note that the result will never
+start nor end with a slash, even if PATH does.
+
+As the name suggests, this is a path "normalization" rather than a
+true path resolution (for that use Cwd::realpath).  Particularly,
+it assumes none of the path segments are symlinks.
+
+normalize_pkg_path will return C<q{}> (i.e. the empty string) if PATH
+is normalized to the root dir and C<undef> if the path cannot be
+normalized without escaping the package root.
+
+=item normalize_link_target(CURDIR, LINK_TARGET)
+
+Normalize the path obtained by following a link with LINK_TARGET as
+its target from CURDIR as the current directory.  CURDIR is assumed to
+be relative to the package root.  Note that the result will never
+start nor end with a slash, even if CURDIR or DEST does.
+
+normalize_pkg_path will return C<q{}> (i.e. the empty string) if the
+target is the root dir and C<undef> if the path cannot be normalized
+without escaping the package root.
+
+B<CAVEAT>: This function is I<not always sufficient> to test if it is
+safe to open a given symlink. Use C<is_ancestor_of(PARENTDIR, PATH)> for
+that.  If you must use this function, remember to check that the
+target is not a symlink (or if it is, that it can be resolved safely).
+
+=cut
+
+sub normalize_link_target {
+    my ($path, $target) = @_;
+
+    if (substr($target, 0, 1) eq $SLASH) {
+        # Link is absolute
+        $path = $target;
+    } else {
+        # link is relative
+        $path = "$path/$target";
+    }
+
+    return normalize_pkg_path($path);
+}
+
+sub normalize_pkg_path {
+    my ($path) = @_;
+
+    return $EMPTY
+      if $path eq $SLASH;
+
+    my @dirty = split(m{/}, $path);
+    my @clean = grep { length } @dirty;
+
+    my @final;
+    for my $component (@clean) {
+
+        if ($component eq $DOT) {
+            # do nothing
+
+        } elsif ($component eq $DOUBLEDOT) {
+            # are we out of bounds?
+            my $discard = pop @final;
+            return undef
+              unless defined $discard;
+
+        } else {
+            push(@final, $component);
+        }
+    }
+
+    # empty if we end in the root
+    my $normalized = join($SLASH, @final);
+
+    return $normalized;
+}
+
+=item is_ancestor_of(PARENTDIR, PATH)
+
+Returns true if and only if PATH is PARENTDIR or a path stored
+somewhere within PARENTDIR (or its subdirs).
+
+This function will resolve the paths; any failure to resolve the path
+will cause a trappable error.
+
+=cut
+
+sub is_ancestor_of {
+    my ($ancestor, $file) = @_;
+
+    my $resolved_file = abs_path($file);
+    croak encode_utf8("resolving $file failed: $!")
+      unless defined $resolved_file;
+
+    my $resolved_ancestor = abs_path($ancestor);
+    croak encode_utf8("resolving $ancestor failed: $!")
+      unless defined $resolved_ancestor;
+
+    my $len;
+    return 1 if $resolved_ancestor eq $resolved_file;
+    # add a slash, "path/some-dir" is not "path/some-dir-2" and this
+    # allows us to blindly match against the root dir.
+    $resolved_file .= $SLASH;
+    $resolved_ancestor .= $SLASH;
+
+    # If $resolved_file is contained within $resolved_ancestor, then
+    # $resolved_ancestor will be a prefix of $resolved_file.
+    $len = length($resolved_ancestor);
+    if (substr($resolved_file, 0, $len) eq $resolved_ancestor) {
+        return 1;
+    }
+    return 0;
+}
+
+=item read_md5sums
+
+=item unescape_md5sum_filename
+
+=cut
+
+sub unescape_md5sum_filename {
+    my ($string, $problematic) = @_;
+
+    # done if there are no escapes
+    return $string
+      unless $problematic;
+
+    # split into individual characters
+    my @array = split(//, $string);
+
+# https://www.gnu.org/software/coreutils/manual/html_node/md5sum-invocation.html
+    my $path;
+    my $escaped = 0;
+    for my $char (@array) {
+
+        # start escape sequence
+        if ($char eq $BACKSLASH && !$escaped) {
+            $escaped = 1;
+            next;
+        }
+
+        # unescape newline
+        $char = $NEWLINE
+          if $char eq 'n' && $escaped;
+
+        # append character
+        $path .= $char;
+
+        # end any escape sequence
+        $escaped = 0;
+    }
+
+    # do not stop inside an escape sequence
+    die encode_utf8('Name terminated inside an escape sequence')
+      if $escaped;
+
+    return $path;
+}
+
+sub read_md5sums {
+    my ($text) = @_;
+
+    my %checksums;
+    my @errors;
+
+    my @lines = split(/\n/, $text);
+
+    while (defined(my $line = shift @lines)) {
+
+        next
+          unless length $line;
+
+        # make sure there are two spaces in between
+        $line =~ /^((?:\\)?\S{32})  (.*)$/;
+
+        my $checksum = $1;
+        my $string = $2;
+
+        unless (length $checksum && length $string) {
+
+            push(@errors, "Odd text: $line");
+            next;
+        }
+
+        my $problematic = 0;
+
+        # leading slash in checksum indicates an escaped name
+        $problematic = 1
+          if $checksum =~ s{^\\}{};
+
+        my $path = unescape_md5sum_filename($string, $problematic);
+
+        push(@errors, "Empty name for checksum $checksum")
+          unless length $path;
+
+        $checksums{$path} = $checksum;
+    }
+
+    return (\%checksums, \@errors);
+}
+
+=item utf8_clean_log
+
+=cut
+
+sub utf8_clean_log {
+    my ($bytes) = @_;
+
+    my $hex_sequence = sub {
+        my ($unclean_bytes) = @_;
+        return '{hex:' . sprintf('%vX', $unclean_bytes) . '}';
+    };
+
+    my $utf8_clean_word = sub {
+        my ($word) = @_;
+        return utf8_clean_bytes($word, $SLASH, $hex_sequence);
+    };
+
+    my $utf8_clean_line = sub {
+        my ($line) = @_;
+        return utf8_clean_bytes($line, $SPACE, $utf8_clean_word);
+    };
+
+    return utf8_clean_bytes($bytes, $NEWLINE, $utf8_clean_line) . $NEWLINE;
+}
+
+=item utf8_clean_bytes
+
+=cut
+
+sub utf8_clean_bytes {
+    my ($bytes, $separator, $utf8_clean_part) = @_;
+
+    my @utf8_clean_parts;
+
+    my $regex = quotemeta($separator);
+    my @parts = split(/$regex/, $bytes);
+
+    for my $part (@parts) {
+
+        if (valid_utf8($part)) {
+            push(@utf8_clean_parts, $part);
+
+        } else {
+            push(@utf8_clean_parts, $utf8_clean_part->($part));
+        }
+    }
+
+    return join($separator, @utf8_clean_parts);
+}
+
+=back
+
+=head1 SEE ALSO
+
+lintian(1)
+
+=cut
+
+1;
+
+# Local Variables:
+# indent-tabs-mode: nil
+# cperl-indent-level: 4
+# End:
+# vim: syntax=perl sw=4 sts=4 sr et