diff options
Diffstat (limited to 'lib/Lintian/Util.pm')
-rw-r--r-- | lib/Lintian/Util.pm | 674 |
1 files changed, 674 insertions, 0 deletions
diff --git a/lib/Lintian/Util.pm b/lib/Lintian/Util.pm new file mode 100644 index 0000000..c512451 --- /dev/null +++ b/lib/Lintian/Util.pm @@ -0,0 +1,674 @@ +# Hey emacs! This is a -*- Perl -*- script! +# Lintian::Util -- Perl utility functions for lintian + +# Copyright (C) 1998 Christian Schwarz +# Copyright (C) 2018-2019 Chris Lamb <lamby@debian.org> +# Copyright (C) 2020 Felix Lechner +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, you can find it on the World Wide +# Web at https://www.gnu.org/copyleft/gpl.html, or write to the Free +# Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, +# MA 02110-1301, USA. + +package Lintian::Util; + +use v5.20; +use warnings; +use utf8; + +use Exporter qw(import); + +# Force export as soon as possible, since some of the modules we load also +# depend on us and the sequencing can cause things not to be exported +# otherwise. +our @EXPORT_OK; + +BEGIN { + + @EXPORT_OK = ( + qw( + get_file_checksum + get_file_digest + human_bytes + perm2oct + locate_executable + match_glob + normalize_pkg_path + normalize_link_target + is_ancestor_of + drain_pipe + drop_relative_prefix + read_md5sums + utf8_clean_log + utf8_clean_bytes + version_from_changelog + $PKGNAME_REGEX + $PKGREPACK_REGEX + $PKGVERSION_REGEX + ) + ); +} + +use Carp qw(croak); +use Const::Fast; +use Cwd qw(abs_path); +use Digest::MD5; +use Digest::SHA; +use List::SomeUtils qw(first_value); +use Path::Tiny; +use Regexp::Wildcards; +use Unicode::UTF8 qw(valid_utf8 encode_utf8); + +use Lintian::Deb822; +use Lintian::Changelog; +use Lintian::Relation::Version qw(versions_equal versions_comparator); + +const my $EMPTY => q{}; +const my $SPACE => q{ }; +const my $NEWLINE => qq{\n}; +const my $SLASH => q{/}; +const my $DOT => q{.}; +const my $DOUBLEDOT => q{..}; +const my $BACKSLASH => q{\\}; + +const my $DEFAULT_READ_SIZE => 4096; +const my $KIB_UNIT_FACTOR => 1024; +const my $COMFORT_THRESHOLD => 1536; + +const my $OWNER_READ => oct(400); +const my $OWNER_WRITE => oct(200); +const my $OWNER_EXECUTE => oct(100); +const my $SETUID => oct(4000); +const my $SETUID_OWNER_EXECUTE => oct(4100); +const my $GROUP_READ => oct(40); +const my $GROUP_WRITE => oct(20); +const my $GROUP_EXECUTE => oct(10); +const my $SETGID => oct(2000); +const my $SETGID_GROUP_EXECUTE => oct(2010); +const my $WORLD_READ => oct(4); +const my $WORLD_WRITE => oct(2); +const my $WORLD_EXECUTE => oct(1); +const my $STICKY => oct(1000); +const my $STICKY_WORLD_EXECUTE => oct(1001); + +# preload cache for common permission strings +# call overhead o perm2oct was measurable on chromium-browser/32.0.1700.123-2 +# load time went from ~1.5s to ~0.1s; of 115363 paths, only 306 were uncached +# standard file, executable file, standard dir, dir with suid, symlink +my %OCTAL_LOOKUP = map { $_ => perm2oct($_) } qw( + -rw-r--r-- + -rwxr-xr-x + drwxr-xr-x + drwxr-sr-x + lrwxrwxrwx +); + +my $rw = Regexp::Wildcards->new(type => 'jokers'); + +=head1 NAME + +Lintian::Util - Lintian utility functions + +=head1 SYNOPSIS + + use Lintian::Util; + +=head1 DESCRIPTION + +This module contains a number of utility subs that are nice to have, +but on their own did not warrant their own module. + +Most subs are imported only on request. + +=head1 VARIABLES + +=over 4 + +=item $PKGNAME_REGEX + +Regular expression that matches valid package names. The expression +is not anchored and does not enforce any "boundary" characters. + +=cut + +our $PKGNAME_REGEX = qr/[a-z0-9][-+\.a-z0-9]+/; + +=item $PKGREPACK_REGEX + +Regular expression that matches "repacked" package names. The expression is +not anchored and does not enforce any "boundary" characters. It should only +be applied to the upstream portion (see #931846). + +=cut + +our $PKGREPACK_REGEX = qr/(dfsg|debian|ds|repack)/; + +=item $PKGVERSION_REGEX + +Regular expression that matches valid package versions. The +expression is not anchored and does not enforce any "boundary" +characters. + +=cut + +our $PKGVERSION_REGEX = qr{ + (?: \d+ : )? # Optional epoch + [0-9][0-9A-Za-z.+:~]* # Upstream version (with no hyphens) + (?: - [0-9A-Za-z.+:~]+ )* # Optional debian revision (+ upstreams versions with hyphens) + }xa; + +=back + +=head1 FUNCTIONS + +=over 4 + +=item drain_pipe(FD) + +Reads and discards any remaining contents from FD, which is assumed to +be a pipe. This is mostly done to avoid having the "write"-end die +with a SIGPIPE due to a "broken pipe" (which can happen if you just +close the pipe). + +May cause an exception if there are issues reading from the pipe. + +Caveat: This will block until the pipe is closed from the "write"-end, +so only use it with pipes where the "write"-end will eventually close +their end by themselves (or something else will make them close it). + +=cut + +sub drain_pipe { + my ($fd) = @_; + my $buffer; + + 1 while (read($fd, $buffer, $DEFAULT_READ_SIZE) > 0); + + return 1; +} + +=item get_file_digest(ALGO, FILE) + +Creates an ALGO digest object that is seeded with the contents of +FILE. If you just want the hex digest, please use +L</get_file_checksum(ALGO, FILE)> instead. + +ALGO can be 'md5' or shaX, where X is any number supported by +L<Digest::SHA> (e.g. 'sha256'). + +This sub is a convenience wrapper around Digest::{MD5,SHA}. + +=cut + +sub get_file_digest { + my ($alg, $file) = @_; + + open(my $fd, '<', $file) + or die encode_utf8("Cannot open $file"); + + my $digest; + if (lc($alg) eq 'md5') { + $digest = Digest::MD5->new; + } elsif (lc($alg) =~ /sha(\d+)/) { + $digest = Digest::SHA->new($1); + } + $digest->addfile($fd); + close($fd); + + return $digest; +} + +=item get_file_checksum(ALGO, FILE) + +Returns a hexadecimal string of the message digest checksum generated +by the algorithm ALGO on FILE. + +ALGO can be 'md5' or shaX, where X is any number supported by +L<Digest::SHA> (e.g. 'sha256'). + +This sub is a convenience wrapper around Digest::{MD5,SHA}. + +=cut + +sub get_file_checksum { + my @paths = @_; + + my $digest = get_file_digest(@paths); + + return $digest->hexdigest; +} + +=item perm2oct(PERM) + +Translates PERM to an octal permission. PERM should be a string describing +the permissions as done by I<tar t> or I<ls -l>. That is, it should be a +string like "-rw-r--r--". + +If the string does not appear to be a valid permission, it will cause +a trappable error. + +Examples: + + # Good + perm2oct('-rw-r--r--') == oct(644) + perm2oct('-rwxr-xr-x') == oct(755) + + # Bad + perm2oct('broken') # too short to be recognised + perm2oct('-resurunet') # contains unknown permissions + +=cut + +sub perm2oct { + my ($text) = @_; + + my $lookup = $OCTAL_LOOKUP{$text}; + return $lookup + if defined $lookup; + + my $octal = 0; + + # Types: + # file (-), block/character device (b & c), directory (d), + # hardlink (h), symlink (l), named pipe (p). + if ( + $text !~ m{^ [-bcdhlp] # file type + ([-r])([-w])([-xsS]) # user + ([-r])([-w])([-xsS]) # group + ([-r])([-w])([-xtT]) # other + }xsm + ) { + croak encode_utf8("$text does not appear to be a permission string"); + } + + $octal |= $OWNER_READ if $1 eq 'r'; + $octal |= $OWNER_WRITE if $2 eq 'w'; + $octal |= $OWNER_EXECUTE if $3 eq 'x'; + $octal |= $SETUID if $3 eq 'S'; + $octal |= $SETUID_OWNER_EXECUTE if $3 eq 's'; + $octal |= $GROUP_READ if $4 eq 'r'; + $octal |= $GROUP_WRITE if $5 eq 'w'; + $octal |= $GROUP_EXECUTE if $6 eq 'x'; + $octal |= $SETGID if $6 eq 'S'; + $octal |= $SETGID_GROUP_EXECUTE if $6 eq 's'; + $octal |= $WORLD_READ if $7 eq 'r'; + $octal |= $WORLD_WRITE if $8 eq 'w'; + $octal |= $WORLD_EXECUTE if $9 eq 'x'; + $octal |= $STICKY if $9 eq 'T'; + $octal |= $STICKY_WORLD_EXECUTE if $9 eq 't'; + + $OCTAL_LOOKUP{$text} = $octal; + + return $octal; +} + +=item human_bytes(SIZE) + +=cut + +sub human_bytes { + my ($size) = @_; + + my @units = qw(B kiB MiB GiB); + + my $unit = shift @units; + + while ($size > $COMFORT_THRESHOLD && @units) { + + $size /= $KIB_UNIT_FACTOR; + $unit = shift @units; + } + + my $human = sprintf('%.0f %s', $size, $unit); + + return $human; +} + +=item locate_executable (CMD) + +=cut + +sub locate_executable { + my ($command) = @_; + + return $EMPTY + unless exists $ENV{PATH}; + + my @folders = grep { length } split(/:/, $ENV{PATH}); + my $path = first_value { -x "$_/$command" } @folders; + + return ($path // $EMPTY); +} + +=item drop_relative_prefix(STRING) + +Remove an initial ./ from STRING, if present + +=cut + +sub drop_relative_prefix { + my ($name) = @_; + + my $copy = $name; + $copy =~ s{^\./}{}s; + + return $copy; +} + +=item version_from_changelog + +=cut + +sub version_from_changelog { + my ($package_path) = @_; + + my $changelog_path = "$package_path/debian/changelog"; + + return $EMPTY + unless -e $changelog_path; + + my $contents = path($changelog_path)->slurp_utf8; + my $changelog = Lintian::Changelog->new; + + $changelog->parse($contents); + my @entries = @{$changelog->entries}; + + return $entries[0]->{'Version'} + if @entries; + + return $EMPTY; +} + +=item match_glob( $glob, @things_to_test ) + +Resembles the same semantic as Text::Glob's match_glob(), but with the +proper escaping of Regexp::Wildcards and pre-configured for Lintian's +purpose. No more directly having to access module variables either. + +=cut + +sub match_glob { + my ($glob, @things_to_test) = @_; + my $re = $rw->convert($glob); + + return grep { /^$re\z/ } @things_to_test; +} + +=item normalize_pkg_path(PATH) + +Normalize PATH by removing superfluous path segments. PATH is assumed +to be relative the package root. Note that the result will never +start nor end with a slash, even if PATH does. + +As the name suggests, this is a path "normalization" rather than a +true path resolution (for that use Cwd::realpath). Particularly, +it assumes none of the path segments are symlinks. + +normalize_pkg_path will return C<q{}> (i.e. the empty string) if PATH +is normalized to the root dir and C<undef> if the path cannot be +normalized without escaping the package root. + +=item normalize_link_target(CURDIR, LINK_TARGET) + +Normalize the path obtained by following a link with LINK_TARGET as +its target from CURDIR as the current directory. CURDIR is assumed to +be relative to the package root. Note that the result will never +start nor end with a slash, even if CURDIR or DEST does. + +normalize_pkg_path will return C<q{}> (i.e. the empty string) if the +target is the root dir and C<undef> if the path cannot be normalized +without escaping the package root. + +B<CAVEAT>: This function is I<not always sufficient> to test if it is +safe to open a given symlink. Use C<is_ancestor_of(PARENTDIR, PATH)> for +that. If you must use this function, remember to check that the +target is not a symlink (or if it is, that it can be resolved safely). + +=cut + +sub normalize_link_target { + my ($path, $target) = @_; + + if (substr($target, 0, 1) eq $SLASH) { + # Link is absolute + $path = $target; + } else { + # link is relative + $path = "$path/$target"; + } + + return normalize_pkg_path($path); +} + +sub normalize_pkg_path { + my ($path) = @_; + + return $EMPTY + if $path eq $SLASH; + + my @dirty = split(m{/}, $path); + my @clean = grep { length } @dirty; + + my @final; + for my $component (@clean) { + + if ($component eq $DOT) { + # do nothing + + } elsif ($component eq $DOUBLEDOT) { + # are we out of bounds? + my $discard = pop @final; + return undef + unless defined $discard; + + } else { + push(@final, $component); + } + } + + # empty if we end in the root + my $normalized = join($SLASH, @final); + + return $normalized; +} + +=item is_ancestor_of(PARENTDIR, PATH) + +Returns true if and only if PATH is PARENTDIR or a path stored +somewhere within PARENTDIR (or its subdirs). + +This function will resolve the paths; any failure to resolve the path +will cause a trappable error. + +=cut + +sub is_ancestor_of { + my ($ancestor, $file) = @_; + + my $resolved_file = abs_path($file); + croak encode_utf8("resolving $file failed: $!") + unless defined $resolved_file; + + my $resolved_ancestor = abs_path($ancestor); + croak encode_utf8("resolving $ancestor failed: $!") + unless defined $resolved_ancestor; + + my $len; + return 1 if $resolved_ancestor eq $resolved_file; + # add a slash, "path/some-dir" is not "path/some-dir-2" and this + # allows us to blindly match against the root dir. + $resolved_file .= $SLASH; + $resolved_ancestor .= $SLASH; + + # If $resolved_file is contained within $resolved_ancestor, then + # $resolved_ancestor will be a prefix of $resolved_file. + $len = length($resolved_ancestor); + if (substr($resolved_file, 0, $len) eq $resolved_ancestor) { + return 1; + } + return 0; +} + +=item read_md5sums + +=item unescape_md5sum_filename + +=cut + +sub unescape_md5sum_filename { + my ($string, $problematic) = @_; + + # done if there are no escapes + return $string + unless $problematic; + + # split into individual characters + my @array = split(//, $string); + +# https://www.gnu.org/software/coreutils/manual/html_node/md5sum-invocation.html + my $path; + my $escaped = 0; + for my $char (@array) { + + # start escape sequence + if ($char eq $BACKSLASH && !$escaped) { + $escaped = 1; + next; + } + + # unescape newline + $char = $NEWLINE + if $char eq 'n' && $escaped; + + # append character + $path .= $char; + + # end any escape sequence + $escaped = 0; + } + + # do not stop inside an escape sequence + die encode_utf8('Name terminated inside an escape sequence') + if $escaped; + + return $path; +} + +sub read_md5sums { + my ($text) = @_; + + my %checksums; + my @errors; + + my @lines = split(/\n/, $text); + + while (defined(my $line = shift @lines)) { + + next + unless length $line; + + # make sure there are two spaces in between + $line =~ /^((?:\\)?\S{32}) (.*)$/; + + my $checksum = $1; + my $string = $2; + + unless (length $checksum && length $string) { + + push(@errors, "Odd text: $line"); + next; + } + + my $problematic = 0; + + # leading slash in checksum indicates an escaped name + $problematic = 1 + if $checksum =~ s{^\\}{}; + + my $path = unescape_md5sum_filename($string, $problematic); + + push(@errors, "Empty name for checksum $checksum") + unless length $path; + + $checksums{$path} = $checksum; + } + + return (\%checksums, \@errors); +} + +=item utf8_clean_log + +=cut + +sub utf8_clean_log { + my ($bytes) = @_; + + my $hex_sequence = sub { + my ($unclean_bytes) = @_; + return '{hex:' . sprintf('%vX', $unclean_bytes) . '}'; + }; + + my $utf8_clean_word = sub { + my ($word) = @_; + return utf8_clean_bytes($word, $SLASH, $hex_sequence); + }; + + my $utf8_clean_line = sub { + my ($line) = @_; + return utf8_clean_bytes($line, $SPACE, $utf8_clean_word); + }; + + return utf8_clean_bytes($bytes, $NEWLINE, $utf8_clean_line) . $NEWLINE; +} + +=item utf8_clean_bytes + +=cut + +sub utf8_clean_bytes { + my ($bytes, $separator, $utf8_clean_part) = @_; + + my @utf8_clean_parts; + + my $regex = quotemeta($separator); + my @parts = split(/$regex/, $bytes); + + for my $part (@parts) { + + if (valid_utf8($part)) { + push(@utf8_clean_parts, $part); + + } else { + push(@utf8_clean_parts, $utf8_clean_part->($part)); + } + } + + return join($separator, @utf8_clean_parts); +} + +=back + +=head1 SEE ALSO + +lintian(1) + +=cut + +1; + +# Local Variables: +# indent-tabs-mode: nil +# cperl-indent-level: 4 +# End: +# vim: syntax=perl sw=4 sts=4 sr et |