# -*- perl -*- Lintian::Index
#
# Copyright (C) 2020 Felix Lechner
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
# more details.
#
# You should have received a copy of the GNU General Public License along with
# this program.  If not, see <http://www.gnu.org/licenses/>.

package Lintian::Index;

use v5.20;
use warnings;
use utf8;

use Carp;
use Const::Fast;
use Cwd;
use IPC::Run3;
use List::SomeUtils qw(any);
use Path::Tiny;
use Unicode::UTF8 qw(encode_utf8 decode_utf8);

use Lintian::Index::Item;
use Lintian::IO::Select qw(unpack_and_index_piped_tar);
use Lintian::IPC::Run3 qw(safe_qx);

use Lintian::Util qw(perm2oct);

const my $EMPTY => q{};
const my $SPACE => q{ };
const my $SLASH => q{/};
const my $BACKSLASH => q{\\};
const my $ZERO => q{0};
const my $HYPHEN => q{-};
const my $PERCENT => q{%};
const my $NEWLINE => qq{\n};

const my $WAIT_STATUS_SHIFT => 8;
const my $NO_LIMIT => -1;
const my $LINES_PER_FILE => 3;
const my $WIDELY_READABLE_FOLDER => oct(755);
const my $WORLD_WRITABLE_FOLDER => oct(777);

use Moo;
use namespace::clean;

with
  'Lintian::Index::Ar',
  'Lintian::Index::Elf',
  'Lintian::Index::FileTypes',
  'Lintian::Index::Java',
  'Lintian::Index::Md5sums',
  'Lintian::Index::Strings';

my %FILE_CODE2LPATH_TYPE = (
    $HYPHEN => Lintian::Index::Item::TYPE_FILE
      | Lintian::Index::Item::OPEN_IS_OK,
    'h' => Lintian::Index::Item::TYPE_HARDLINK
      | Lintian::Index::Item::OPEN_IS_OK,
    'd' => Lintian::Index::Item::TYPE_DIR| Lintian::Index::Item::FS_PATH_IS_OK,
    'l' => Lintian::Index::Item::TYPE_SYMLINK,
    'b' => Lintian::Index::Item::TYPE_BLOCK_DEV,
    'c' => Lintian::Index::Item::TYPE_CHAR_DEV,
    'p' => Lintian::Index::Item::TYPE_PIPE,
);

=head1 NAME

Lintian::Index - access to collected data about the upstream (orig) sources

=head1 SYNOPSIS

    use Lintian::Index;

=head1 DESCRIPTION

Lintian::Processable::Source::Orig::Index provides an interface to collected data about the upstream (orig) sources.

=head1 INSTANCE METHODS

=over 4

=item identifier

=item catalog

Returns a reference to a hash with elements catalogued by path names.

=item C<basedir>

Returns the base directory for file references.

=item C<anchored>

=item unpack_messages

=cut

has identifier => (is => 'rw', default => 'unnamed');

has catalog => (
    is => 'rw',
    default => sub {
        my ($self) = @_;

        # create an empty root
        my $root = Lintian::Index::Item->new;

        # associate with this index
        $root->index($self);

        my %catalog;
        $catalog{$EMPTY} = $root;

        return \%catalog;
    }
);

has basedir => (
    is => 'rw',
    trigger => sub {
        my ($self, $folder) = @_;

        return
          unless length $folder;

        # create directory
        path($folder)->mkpath({ chmod => $WORLD_WRITABLE_FOLDER })
          unless -e $folder;
    },
    default => $EMPTY
);

has anchored => (is => 'rw', default => 0);
has unpack_messages => (is => 'rw', default => sub { [] });

has sorted_list => (
    is => 'ro',
    lazy => 1,
    default => sub {
        my ($self) = @_;

        my @sorted = sort { $a->name cmp $b->name } values %{$self->catalog};

        # remove automatic root dir; list is sorted
        shift @sorted;

        const my @IMMUTABLE => @sorted;

        return \@IMMUTABLE;
    }
);

=item lookup (FILE)

Like L</index> except orig_index is based on the "orig tarballs" of
the source packages.

For native packages L</index> and L</orig_index> are generally
identical.

NB: If sorted_index includes a debian packaging, it is was
contained in upstream part of the source package (or the package is
native).

=cut

sub lookup {
    my ($self, $name) = @_;

    # get root dir by default
    $name //= $EMPTY;

    croak encode_utf8($self->identifier . ': Name is not a string')
      unless ref $name eq $EMPTY;

    my $found = $self->catalog->{$name};

    return $found
      if defined $found;

    return undef;
}

=item resolve_path

=cut

sub resolve_path {
    my ($self, $name) = @_;

    return $self->lookup->resolve_path($name);
}

=item create_from_basedir

=cut

sub create_from_basedir {
    my ($self) = @_;

    my $savedir = getcwd;
    chdir($self->basedir)
      or die encode_utf8(
        $self->identifier . ': Cannot change to directory ' . $self->basedir);

    # get times in UTC
    my $TIME_STAMP
      = $PERCENT . q{M} . $SPACE . $PERCENT . q{s} . $SPACE . $PERCENT . q{A+};
    my $FILE_NAME = $PERCENT . q{p};
    my $LINK_DESTINATION = $PERCENT . q{l};
    my $NULL_BREAK = $BACKSLASH . $ZERO;

    my @REQUESTED_FIELDS
      = map { $_ . $NULL_BREAK } ($TIME_STAMP, $FILE_NAME, $LINK_DESTINATION);

    my @index_command
      = ('env', 'TZ=UTC', 'find', '-printf', join($EMPTY, @REQUESTED_FIELDS));
    my $index_output;
    my $index_errors;

    run3(\@index_command, \undef, \$index_output, \$index_errors);

    chdir($savedir)
      or die encode_utf8(
        $self->identifier . ": Cannot change to directory $savedir");

    # allow processing of file names with non UTF-8 bytes
    $index_errors = decode_utf8($index_errors)
      if length $index_errors;

    my $permissionspattern = qr/\S{10}/;
    my $sizepattern = qr/\d+/;
    my $datepattern = qr/\d{4}-\d{2}-\d{2}/;
    my $timepattern = qr/\d{2}:\d{2}:\d{2}\.\d+/;
    my $pathpattern = qr/[^\0]*/;

    my %all;

    $index_output =~ s/\0$//;

    my @lines = split(/\0/, $index_output, $NO_LIMIT);
    die encode_utf8($self->identifier
          . ": Did not get a multiple of $LINES_PER_FILE lines from find.")
      unless @lines % $LINES_PER_FILE == 0;

    while (defined(my $first = shift @lines)) {

        my $entry = Lintian::Index::Item->new;
        $entry->index($self);

        $first
          =~ /^($permissionspattern)\ ($sizepattern)\ ($datepattern)\+($timepattern)$/s;

        $entry->perm($1);
        $entry->size($2);
        $entry->date($3);
        $entry->time($4);

        my $name = shift @lines;

        my $linktarget = shift @lines;

        # for non-links, string is empty
        $entry->link($linktarget)
          if length $linktarget;

        # find prints single dot for base; removed in next step
        $name =~ s{^\.$}{\./}s;

        # strip relative prefix
        $name =~ s{^\./+}{}s;

        # make sure directories end with a slash, except root
        $name .= $SLASH
          if length $name
          && $entry->perm =~ /^d/
          && $name !~ m{ /$ }msx;
        $entry->name($name);

        $all{$entry->name} = $entry;
    }

    $self->catalog(\%all);

    my $load_errors = $self->load;

    return $index_errors . $load_errors;
}

=item create_from_piped_tar

=cut

sub create_from_piped_tar {
    my ($self, $command) = @_;

    my $extract_dir = $self->basedir;

    my ($named, $numeric, $extract_errors, $index_errors)
      = unpack_and_index_piped_tar($command, $extract_dir);

    # fix permissions
    safe_qx('chmod', '-R', 'u+rwX,go-w', $extract_dir);

    # allow processing of file names with non UTF-8 bytes
    my @named_owner = split(/\n/, $named);
    my @numeric_owner = split(/\n/, $numeric);

    my %catalog;

    for my $line (@named_owner) {

        my $entry = Lintian::Index::Item->new;
        $entry->init_from_tar_output($line);
        $entry->index($self);

        $catalog{$entry->name} = $entry;
    }

    # get numerical owners from second list
    for my $line (@numeric_owner) {

        # entry not used outside this loop
        my $entry = Lintian::Index::Item->new;
        $entry->init_from_tar_output($line);

        die encode_utf8($self->identifier
              . ': Numerical index lists extra files for file name '
              . $entry->name)
          unless exists $catalog{$entry->name};

        # keep numerical uid and gid
        $catalog{$entry->name}->uid($entry->owner);
        $catalog{$entry->name}->gid($entry->group);
    }

    # tar produces spurious root entry when stripping slashes from member names
    delete $catalog{$SLASH}
      unless $self->anchored;

    $self->catalog(\%catalog);

    my $load_errors = $self->load;

    return $extract_errors . $index_errors . $load_errors;
}

=item load

=cut

sub load {
    my ($self) = @_;

    my $errors = $EMPTY;

    my %all = %{$self->catalog};

    # set internal permissions flags
    for my $entry (values %all) {

        my $raw_type = substr($entry->perm, 0, 1);

        my $operm = perm2oct($entry->perm);
        $entry->path_info(
            $operm | (
                $FILE_CODE2LPATH_TYPE{$raw_type}
                  // Lintian::Index::Item::TYPE_OTHER
            )
        );
    }

    # find all entries that are not regular files
    my @nosize
      = grep { !$_->path_info & Lintian::Index::Item::TYPE_FILE } values %all;

    # reset size for anything but regular files
    $_->size(0) for @nosize;

    if ($self->anchored) {

        my %relative;
        for my $name (keys %all) {
            my $entry = $all{$name};

            # remove leading slash from absolute names
            my $name = $entry->name;
            $name =~ s{^/+}{}s;
            $entry->name($name);

            # remove leading slash from absolute hardlink targets
            if ($entry->is_hardlink) {
                my $target = $entry->link;
                $target =~ s{^/+}{}s;
                $entry->link($target);
            }

            $relative{$name} = $entry;
        }

        %all = %relative;
    }

    # disallow absolute names
    die encode_utf8($self->identifier . ': Index contains absolute path names')
      if any { $_->name =~ m{^/}s } values %all;

    # disallow absolute hardlink targets
    die encode_utf8(
        $self->identifier . ': Index contains absolute hardlink targets')
      if any { $_->link =~ m{^/}s } grep { $_->is_hardlink } values %all;

    # add entries for missing directories
    for my $entry (values %all) {

        my $current = $entry;
        my $parentname;

        # travel up the directory tree
        do {
            $parentname = $current->dirname;

            # insert new entry for missing intermediate directories
            unless (exists $all{$parentname}) {

                my $added = Lintian::Index::Item->new;
                $added->index($self);

                $added->name($parentname);
                $added->path_info(
                    $FILE_CODE2LPATH_TYPE{'d'} | $WIDELY_READABLE_FOLDER);

                # random but fixed date; hint, it's a good read. :)
                $added->date('1998-01-25');
                $added->time('22:55:34');
                $added->faux(1);

                $all{$parentname} = $added;
            }

            $current = $all{$parentname};

        } while ($parentname ne $EMPTY);
    }

    # insert root for empty tarfies like suckless-tools_45.orig.tar.xz
    unless (exists $all{$EMPTY}) {

        my $root = Lintian::Index::Item->new;
        $root->index($self);

        $root->name($EMPTY);
        $root->path_info($FILE_CODE2LPATH_TYPE{'d'} | $WIDELY_READABLE_FOLDER);

        # random but fixed date; hint, it's a good read. :)
        $root->date('1998-01-25');
        $root->time('22:55:34');
        $root->faux(1);

        $all{$EMPTY} = $root;
    }

    my @directories
      = grep { $_->path_info & Lintian::Index::Item::TYPE_DIR } values %all;

    # make space for children
    my %children;
    $children{$_->name} = [] for @directories;

    # record children
    for my $entry (values %all) {

        my $parentname = $entry->dirname;

        # Ensure the "root" is not its own child.  It is not really helpful
        # from an analysis PoV and it creates ref cycles  (and by extension
        # leaks like #695866).
        push(@{ $children{$parentname} }, $entry)
          unless $parentname eq $entry->name;
    }

    foreach my $entry (@directories) {
        my %childnames
          = map {$_->basename => $_->name }@{ $children{$entry->name} };
        $entry->childnames(\%childnames);
    }

    # ensure root is not its own child; may create leaks like #695866
    die encode_utf8($self->identifier . ': Root directory is its own parent')
      if defined $all{$EMPTY} && defined $all{$EMPTY}->parent_dir;

    # find all hard links
    my @hardlinks
      = grep { $_->path_info & Lintian::Index::Item::TYPE_HARDLINK }
      values %all;

    # catalog where they point
    my %backlinks;
    push(@{$backlinks{$_->link}}, $_) for @hardlinks;

    # add the master files for proper sort results
    push(@{$backlinks{$_}}, $all{$_}) for keys %backlinks;

    # point hard links to shortest path
    for my $mastername (keys %backlinks) {

        my @group = @{$backlinks{$mastername}};

        # sort for path length
        my @links = sort { $a->name cmp $b->name } @group;

        # pick the shortest path
        my $preferred = shift @links;

        # get the previous master entry
        my $master = $all{$mastername};

        # skip if done
        next
          if $preferred->name eq $master->name;

        # unset link for preferred
        $preferred->link($EMPTY);

        # copy size from original
        $preferred->size($master->size);

        $preferred->path_info(
            ($preferred->path_info& ~Lintian::Index::Item::TYPE_HARDLINK)
            | Lintian::Index::Item::TYPE_FILE);

        foreach my $pointer (@links) {

            # turn into a hard link
            $pointer->path_info(
                ($pointer->path_info & ~Lintian::Index::Item::TYPE_FILE)
                | Lintian::Index::Item::TYPE_HARDLINK);

            # set link to preferred path
            $pointer->link($preferred->name);

            # no size for hardlinks
            $pointer->size(0);
        }
    }

    # make sure recorded names match hash keys
    $all{$_}->name($_) for keys %all;

    $self->catalog(\%all);

    $errors .= $self->add_md5sums;
    $errors .= $self->add_file_types;

    $errors .= $self->add_ar;
    $errors .= $self->add_elf;
    $errors .= $self->add_java;
    $errors .= $self->add_strings;

    return $errors;
}

=item merge_in

=cut

sub merge_in {
    my ($self, $other) = @_;

    die encode_utf8($self->identifier
          . ': Need same base directory ('
          . $self->basedir . ' vs '
          . $other->basedir . ')')
      unless $self->basedir eq $other->basedir;

    die encode_utf8($self->identifier . ': Need same anchoring status')
      unless $self->anchored == $other->anchored;

    # associate all new items with this index
    $_->index($self) for values %{$other->catalog};

    for my $item (values %{$other->catalog}) {

        # do not transfer root
        next
          if $item->name eq $EMPTY;

        # duplicates on disk are dropped with basedir segments
        $self->catalog->{$item->name} = $item;

        # when adding folder, delete potential file entry
        my $noslash = $item->name;
        if ($noslash =~ s{/$}{}) {
            delete $self->catalog->{$noslash};
        }
    }

    # add children that came from other root to current
    my @other_childnames = keys %{$other->catalog->{$EMPTY}->childnames};
    for my $name (@other_childnames) {

        $self->catalog->{$EMPTY}->childnames->{$name}
          = $self->catalog->{$name};
    }

    # remove items from other index
    $other->catalog({});

    # unset other base directory
    $other->basedir($EMPTY);

    return;
}

=item capture_common_prefix

=cut

sub capture_common_prefix {
    my ($self) = @_;

    my $new_basedir = path($self->basedir)->parent;

    # do nothing in root
    return
      if $new_basedir eq $SLASH;

    my $segment = path($self->basedir)->basename;
    die encode_utf8($self->identifier . ': Common path segment has no length')
      unless length $segment;

    my $prefix;
    if ($self->anchored) {
        $prefix = $SLASH . $segment;
    } else {
        $prefix = $segment . $SLASH;
    }

    my $new_root = Lintian::Index::Item->new;

    # associate new item with this index
    $new_root->index($self);

    $new_root->name($EMPTY);
    $new_root->childnames({ $segment => $prefix });

    # random but fixed date; hint, it's a good read. :)
    $new_root->date('1998-01-25');
    $new_root->time('22:55:34');
    $new_root->path_info($FILE_CODE2LPATH_TYPE{'d'} | $WIDELY_READABLE_FOLDER);
    $new_root->faux(1);

    my %new_catalog;
    for my $item (values %{$self->catalog}) {

        # drop common prefix from name
        my $new_name = $prefix . $item->name;
        $item->name($new_name);

        if (length $item->link) {

            # add common prefix from link target
            my $new_link = $prefix . $item->link;
            $item->link($new_link);
        }

        # adjust references to children
        for my $basename (keys %{$item->childnames}) {
            $item->childnames->{$basename}
              = $prefix . $item->childnames->{$basename};
        }

        $new_catalog{$new_name} = $item;
    }

    $new_catalog{$EMPTY} = $new_root;
    $new_catalog{$prefix}->parent_dir($new_root);

    $self->catalog(\%new_catalog);

    # remove segment from base directory
    $self->basedir($new_basedir);

    return;
}

=item drop_common_prefix

=cut

sub drop_common_prefix {
    my ($self) = @_;

    my $errors = $EMPTY;

    my @childnames = keys %{$self->catalog->{$EMPTY}->childnames};

    die encode_utf8($self->identifier . ': Not exactly one top-level child')
      unless @childnames == 1;

    my $segment = $childnames[0];
    die encode_utf8($self->identifier . ': Common path segment has no length')
      unless length $segment;

    my $new_root = $self->lookup($segment . $SLASH);
    die encode_utf8($self->identifier . ': New root is not a directory')
      unless $new_root->is_dir;

    my $prefix;
    if ($self->anchored) {
        $prefix = $SLASH . $segment;
    } else {
        $prefix = $segment . $SLASH;
    }

    my $regex = quotemeta($prefix);

    delete $self->catalog->{$EMPTY};

    my %new_catalog;
    for my $item (values %{$self->catalog}) {

        # drop common prefix from name
        my $new_name = $item->name;
        $new_name =~ s{^$regex}{};
        $item->name($new_name);

        if (length $item->link) {

            # drop common prefix from link target
            my $new_link = $item->link;
            $new_link =~ s{^$regex}{};
            $item->link($new_link);
        }

        # adjust references to children
        for my $basename (keys %{$item->childnames}) {
            $item->childnames->{$basename} =~ s{^$regex}{};
        }

        # unsure this works, but orig not anchored
        $new_name = $EMPTY
          if $new_name eq $SLASH && $self->anchored;

        $new_catalog{$new_name} = $item;
    }

    $self->catalog(\%new_catalog);

    # add dropped segment to base directory
    $self->basedir($self->basedir . $SLASH . $segment);

    my $other_errors = $self->drop_basedir_segment;

    return $errors . $other_errors;
}

=item drop_basedir_segment

=cut

sub drop_basedir_segment {
    my ($self) = @_;

    my $errors = $EMPTY;

    my $obsolete = path($self->basedir)->basename;
    die encode_utf8($self->identifier . ': Base directory has no name')
      unless length $obsolete;

    my $parent_dir = path($self->basedir)->parent->stringify;
    die encode_utf8($self->identifier . ': Base directory has no parent')
      if $parent_dir eq $SLASH;

    my $grandparent_dir = path($parent_dir)->parent->stringify;
    die encode_utf8(
        $self->identifier . ': Will not do anything in file system root')
      if $grandparent_dir eq $SLASH;

    # destroyed when object is lost
    my $tempdir_tiny
      = path($grandparent_dir)->tempdir(TEMPLATE => 'customXXXXXXXX');

    my $tempdir = $tempdir_tiny->stringify;

    # avoids conflict in case of repeating path segments
    for my $child (path($self->basedir)->children) {
        my $old_name = $child->stringify;

        # Perl unicode bug
        utf8::downgrade $old_name;
        utf8::downgrade $tempdir;

        my @command = ('mv', $old_name, $tempdir);
        my $stderr;
        run3(\@command, \undef, \undef, \$stderr);
        my $status = ($? >> $WAIT_STATUS_SHIFT);

        # already in UTF-8
        die $stderr
          if $status;
    }

    rmdir $self->basedir;
    $self->basedir($parent_dir);

    for my $child ($tempdir_tiny->children) {
        my $old_name = $child->stringify;

        my $target_dir = $parent_dir . $SLASH . $child->basename;

        # Perl unicode bug
        utf8::downgrade $target_dir;

        if (-e $target_dir) {

            # catalog items were dropped when index was merged
            my @command = (qw{rm -rf}, $target_dir);
            my $stderr;
            run3(\@command, \undef, \undef, \$stderr);
            my $status = ($? >> $WAIT_STATUS_SHIFT);

            # already in UTF-8
            die $stderr
              if $status;

            my $display_dir
              = path($parent_dir)->basename . $SLASH . $child->basename;
            $errors .= "removed existing $display_dir" . $NEWLINE;
        }

        # Perl unicode bug
        utf8::downgrade $old_name;
        utf8::downgrade $parent_dir;

        my @command = ('mv', $old_name, $parent_dir);
        my $stderr;
        run3(\@command, \undef, \undef, \$stderr);
        my $status = ($? >> $WAIT_STATUS_SHIFT);

        # already in UTF-8
        die $stderr
          if $status;
    }

    return $errors;
}

=back

=head1 AUTHOR

Originally written by Felix Lechner <felix.lechner@lease-up.com> for
Lintian.

=head1 SEE ALSO

lintian(1)

=cut

1;

# Local Variables:
# indent-tabs-mode: nil
# cperl-indent-level: 4
# End:
# vim: syntax=perl sw=4 sts=4 sr et