diff options
Diffstat (limited to 'commands/reporting-html-reports.pm')
-rw-r--r-- | commands/reporting-html-reports.pm | 1262 |
1 files changed, 1262 insertions, 0 deletions
diff --git a/commands/reporting-html-reports.pm b/commands/reporting-html-reports.pm new file mode 100644 index 0000000..2505538 --- /dev/null +++ b/commands/reporting-html-reports.pm @@ -0,0 +1,1262 @@ +#!/usr/bin/perl -w +# +# Lintian HTML reporting tool -- Create Lintian web reports +# +# Copyright (C) 1998 Christian Schwarz and Richard Braakman +# Copyright (C) 2007 Russ Allbery +# Copyright (C) 2017-2019 Chris Lamb <lamby@debian.org> +# +# This program is free software. It is distributed under the terms of +# the GNU General Public License as published by the Free Software +# Foundation; either version 2 of the License, or (at your option) any +# later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, you can find it on the World Wide +# Web at https://www.gnu.org/copyleft/gpl.html, or write to the Free +# Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, +# MA 02110-1301, USA. + +package reporting_html_reports; + +use v5.20; +use warnings; +use utf8; +use autodie; + +use Getopt::Long; +use POSIX qw(strftime); +use File::Copy qw(copy); +use Fcntl qw(SEEK_SET); +use List::Util qw(first); +use List::MoreUtils qw(uniq); +use Path::Tiny; +use Text::Template (); +use URI::Escape; +use YAML::XS (); + +use Lintian::Data; +use Lintian::Deb822::Parser qw(read_dpkg_control_lc); +use Lintian::IO::Async qw(safe_qx); +use Lintian::Profile; +use Lintian::Relation::Version qw(versions_comparator); +use Lintian::Reporting::ResourceManager; +use Lintian::Reporting::Util qw(load_state_cache find_backlog); +use Lintian::Util qw(copy_dir run_cmd locate_executable); + +my $CONFIG; +my %OPT; +my %OPT_HASH = ('reporting-config=s'=> \$OPT{'reporting-config'},); + +# ------------------------------ +# Global variables and configuration + +# Some globals initialised in init_global() +my ( + $RESOURCE_MANAGER, $LINTIAN_VERSION, $timestamp, + $TEMPLATE_CONFIG_VARS,$HARNESS_STATE_DIR, $HISTORY_DIR, + $HISTORY, $GRAPHS, $LINTIAN_BASE, + $HTML_TMP_DIR, $SCOUR_ENABLED, +); +# FIXME: Should become obsolete if gnuplot is replaced by R like piuparts.d.o / +# reproducible.d.n is using +my $GRAPHS_RANGE_DAYS = 366; + +# ------------------------------ +# Initialize templates + +# This only has to be done once, so do it at the start and then reuse the same +# templates throughout. +our %templates; + +# %statistics accumulates global statistics. For tags: errors, warnings, +# experimental, overridden, and info are the keys holding the count of tags of +# that sort. For packages: binary, udeb, and source are the number of +# packages of each type with Lintian errors or warnings. For maintainers: +# maintainers is the number of maintainers with Lintian errors or warnings. +# +# %tag_statistics holds a hash of tag-specific statistics. Each tag name is a +# key, and its value is a hash with the following keys: count and overrides +# (number of times the tag has been detected and overridden, respectively), and +# packages (number of packages with at least one such tag). +my (%statistics, %tag_statistics); + +# %by_maint holds a hash of maintainer names to packages and tags. Each +# maintainer is a key. The value is a hash of package names to hashes. Each +# package hash is in turn a hash of versions to an anonymous array of hashes, +# with each hash having keys code, package, type, tag, severity, +# extra, and xref. xref gets the partial URL of the maintainer page for that +# source package. +# +# In other words, the lintian output line: +# +# W: gnubg source: substvar-source-version-is-deprecated gnubg-data +# +# for gnubg 0.15~20061120-1 maintained by Russ Allbery <rra@debian.org> is +# turned into the following structure: +# +# { 'gnubg' => { +# '0.15~20061120-1' => [ +# { code => 'W', # Either 'O' or same as $tag_info->code +# pkg_info => { +# package => 'gnubg', +# version => '0.15~20061120-1', +# component => 'main', +# type => 'source', +# anchor => 'gnubg_0.15~20061120-1', +# xref => 'rra@debian.org.html#gnubg_0.15~20061120-1' +# }, +# tag_info => $tag_info, # an instance of Lintian::Tag::Info +# archs => { +# # Architectures we have seen this tag for +# 'amd64' => 1, +# 'i386' => 1, +# }, +# extra => 'gnubg-data' +# } ] } } +# +# and then stored under the key 'Russ Allbery <rra@debian.org>' +# +# %by_uploader holds the same thing except for packages for which the person +# is only an uploader. +# +# %by_tag is a hash of tag names to an anonymous array of tag information +# hashes just like the inside-most data structure above. +my (%by_maint, %by_uploader, %by_tag, %maintainer_table, %delta); +my @attrs = qw(maintainers source-packages binary-packages udeb-packages + errors warnings info experimental pedantic overridden groups-known + groups-backlog classifications groups-with-errors); + +my @RESTRICTED_CONFIG_DIRS= split(/:/, $ENV{'LINTIAN_RESTRICTED_CONFIG_DIRS'}); +my @CONFIG_DIRS = split(/:/, $ENV{'LINTIAN_CONFIG_DIRS'}); + +sub load_profile { + my ($profile_name, $options) = @_; + my %opt = ( + 'restricted-search-dirs' => \@RESTRICTED_CONFIG_DIRS, + %{$options // {}}, + ); + require Lintian::Profile; + + my $profile = Lintian::Profile->new; + $profile->load($profile_name, \@CONFIG_DIRS, \%opt); + + return $profile; +} + +sub required_cfg_value { + my (@keys) = @_; + my $v = $CONFIG; + for my $key (@keys) { + if (not exists($v->{$key})) { + my $k = join('.', @keys); + die("Missing required config parameter: ${k}\n"); + } + $v = $v->{$key}; + } + return $v; +} + +sub required_cfg_non_empty_list_value { + my (@keys) = @_; + my $v = required_cfg_value(@keys); + if (not defined($v) or ref($v) ne 'ARRAY' or scalar(@{$v}) < 1) { + my $k = join('.', @keys); + die("Invalid configuration: ${k} must be a non-empty list\n"); + } + return $v; +} + +# ------------------------------ +# Main routine + +sub main { + my $profile = init_globals(); + + setup_output_dir( + 'output_dir' => $HTML_TMP_DIR, + 'lintian_manual' => "${LINTIAN_BASE}/doc/lintian.html", + 'lintian_api_docs' => "${LINTIAN_BASE}/doc/api.html", + 'lintian_log_file' => $ARGV[0], + 'resource_dirs' => + [map { "${LINTIAN_BASE}/reporting/$_"} qw(images resources)], + ); + + load_templates("$LINTIAN_BASE/reporting/templates"); + + # Create lintian.css from a template, install the output file as a resource + # and discard the original output file. We do this after installing all + # resources, so the .css file can refer to resources. + output_template( + 'lintian.css', + $templates{'lintian.css'}, + { 'path_prefix' => '../' }); + $RESOURCE_MANAGER->install_resource("$HTML_TMP_DIR/lintian.css"); + + my $state_cache = load_state_cache($HARNESS_STATE_DIR); + + print "Parsing lintian log...\n"; + parse_lintian_log($profile, $state_cache); + + process_data($profile, $state_cache); + exit(0); +} + +# ------------------------------ +# Utility functions + +sub init_globals { + Getopt::Long::config('bundling', 'no_getopt_compat', 'no_auto_abbrev'); + Getopt::Long::GetOptions(%OPT_HASH) or die("error parsing options\n"); + + if (not $OPT{'reporting-config'} or not -f $OPT{'reporting-config'}) { + die("The --reporting-config parameter must point to an existing file\n" + ); + } + $LINTIAN_BASE = $ENV{'LINTIAN_BASE'}; + + $CONFIG = YAML::XS::LoadFile($OPT{'reporting-config'}); + $HARNESS_STATE_DIR = required_cfg_value('storage', 'state-cache'); + $HTML_TMP_DIR = required_cfg_value('storage', 'reports-work-dir'); + my $history_key = 'storage.historical-data-dir'; + if (exists($CONFIG->{'storage'}{'historical-data-dir'})) { + $HISTORY = 1; + $HISTORY_DIR = required_cfg_value('storage', 'historical-data-dir'); + print "Enabling history tracking as ${history_key} is set\n"; + if (length locate_executable('gnuplot')) { + $GRAPHS = 1; + print "Enabling graphs (gnuplot is in PATH)\n"; + } else { + $GRAPHS = 0; + print "No graphs as \"gnuplot\" is not in PATH\n"; + } + if ($GRAPHS) { + if (locate_executable('scour')) { + $SCOUR_ENABLED = 1; + print "Minimizing generated SVG files (scour is in PATH)\n"; + } else { + $SCOUR_ENABLED = 0; + print 'No minimization of generated SVG files' + . " as \"scour\" is not in PATH\n"; + } + } + } else { + $HISTORY = 0; + $GRAPHS = 0; + print "History tracking is disabled (${history_key} is unset)\n"; + print "Without history tracking, there will be no graphs\n"; + } + + if (exists($CONFIG->{'template-variables'})) { + $TEMPLATE_CONFIG_VARS = $CONFIG->{'template-variables'}; + } else { + $TEMPLATE_CONFIG_VARS = {}; + } + # Provide a default URL for the source code. It might not be correct for + # the given installation, but it is better than nothing. + $TEMPLATE_CONFIG_VARS->{'LINTIAN_SOURCE'} + //= 'https://salsa.debian.org/lintian/lintian.git'; + + my $profile = load_profile(); + + Lintian::Data->set_vendor($profile); + + $LINTIAN_VERSION = $ENV{LINTIAN_VERSION}; + $timestamp = safe_qx(qw(date -u --rfc-822)); + chomp($LINTIAN_VERSION, $timestamp); + + $RESOURCE_MANAGER + = Lintian::Reporting::ResourceManager->new('html_dir' => $HTML_TMP_DIR,); + return $profile; +} + +sub load_templates { + my ($template_dir) = @_; + for my $template ( + qw/head foot clean index maintainer maintainers packages tag + tags tags-severity tag-not-seen tags-all/ + ) { + open(my $fd, '<:encoding(UTF-8)', "${template_dir}/$template.tmpl"); + my %options = (TYPE => 'FILEHANDLE', SOURCE => $fd); + $templates{$template} = Text::Template->new(%options) + or die "cannot load template $template: $Text::Template::ERROR\n"; + close($fd); + } + + open(my $fd, '<:encoding(UTF-8)', "${template_dir}/lintian.css.tmpl"); + $templates{'lintian.css'} = Text::Template->new( + TYPE => 'FILEHANDLE', + SOURCE => $fd, + DELIMITERS => ['{{{', '}}}'], + ) + or die("cannot load template for lintian.css: $Text::Template::ERROR\n"); + close($fd); + return; +} + +sub process_data { + my ($profile, $state_cache) = @_; + my @maintainers = sort(uniq(keys(%by_maint), keys(%by_uploader))); + my $statistics_file = "$HARNESS_STATE_DIR/statistics"; + my ($old_statistics, $archives, @archive_info); + + { + # Scoped to allow memory to be re-purposed. The %qa and %sources + # structures are only used for a very few isolated items. + my (%qa, %sources); + print "Collecting statistics...\n"; + $old_statistics + = collect_statistics($profile, $state_cache, $statistics_file, + \@maintainers,\%sources, \%qa); + + generate_lookup_tables(\%sources); + + write_qa_list(\%qa); + + generate_package_index_packages(\%sources); + + if ($HISTORY) { + update_history_and_make_graphs(\@attrs, \%statistics, + \%tag_statistics); + } + } + + # Build a hash of all maintainers, not just those with Lintian tags. We + # use this later to generate stub pages for maintainers whose packages are + # all Lintian-clean. + my %clean; + for my $group_id (sort(keys(%{$state_cache->{'groups'}}))) { + my $maintainer + = $state_cache->{'groups'}{$group_id}{'mirror-metadata'} + {'maintainer'}; + my $id; + next if not $maintainer; + $id = maintainer_url($maintainer); + $clean{$id} = $maintainer; + } + + # Now, walk through the tags by source package (sorted by maintainer). + # Output a summary page of errors and warnings for each maintainer, output + # a full page that includes info, experimental, and overridden tags, and + # assemble the maintainer index and the QA package list as we go. + + for my $maintainer (@maintainers) { + my $id = maintainer_url($maintainer); + delete $clean{$id}; + + # Determine if the maintainer's page is clean. Check all packages for + # which they're either maintainer or uploader and set $error_clean if + # they have no errors or warnings. + # + # Also take this opportunity to sort the tags so that all similar tags + # will be grouped, which produces better HTML output. + my $error_clean = 1; + for my $source ( + keys %{ $by_maint{$maintainer} }, + keys %{ $by_uploader{$maintainer} } + ) { + my $versions = $by_maint{$maintainer}{$source} + || $by_uploader{$maintainer}{$source}; + for my $version (keys %$versions) { + $versions->{$version} + = [sort by_tag @{ $versions->{$version} }]; + next if not $error_clean; + my $tags = $versions->{$version}; + for my $tag (@$tags) { + if ($tag->{code} eq 'E' or $tag->{code} eq 'W') { + $error_clean = 0; + last; + } + } + } + } + + # Determine the parts of the maintainer and the file name for the + # maintainer page. + my ($name, $email) = extract_name_and_email($maintainer); + + my $regular = "maintainer/$id"; + my $full = "full/$id"; + + # Create the regular maintainer page (only errors and warnings) and the + # full maintainer page (all tags, including overrides and info tags). + print "Generating page for $id\n"; + my $q_name = html_quote($name); + my %data = ( + email => html_quote(uri_escape($email)), + errors => 1, + id => $id, + maintainer => html_quote($maintainer), + name => $q_name, + packages => $by_maint{$maintainer}, + uploads => $by_uploader{$maintainer}, + ); + my $template; + if ($error_clean) { + $template = $templates{clean}; + } else { + $template = $templates{maintainer}; + } + output_template($regular, $template, \%data); + $template = $templates{maintainer}; + $data{errors} = 0; + output_template($full, $template, \%data); + + my %index_data = (url => $id, name => $q_name); + # Add this maintainer to the hash of maintainer to URL mappings. + $maintainer_table{$maintainer} = \%index_data; + } + undef(@maintainers); + + # Write out the maintainer index. + my %data = (maintainers => \%maintainer_table,); + output_template('maintainers.html', $templates{maintainers}, \%data); + + # Now, generate stub pages for every maintainer who has only clean + # packages. + for my $id (keys %clean) { + my $maintainer = $clean{$id}; + my ($name, $email) = extract_name_and_email($maintainer); + my %maint_data = ( + id => $id, + email => html_quote(uri_escape($email)), + maintainer => html_quote($maintainer), + name => html_quote($name), + clean => 1, + ); + print "Generating clean page for $id\n"; + output_template("maintainer/$id", $templates{clean}, \%maint_data); + output_template("full/$id", $templates{clean}, \%maint_data); + } + + # Create the pages for each tag. Each page shows the extended description + # for the tag and all the packages for which that tag was issued. + for my $tag (sort $profile->known_tags) { + my $info = $profile->get_taginfo($tag); + my $description = $info->description('html', ' '); + my ($count, $overrides) = (0, 0); + my $tmpl = 'tag-not-seen'; + my $shown_count = 0; + my $tag_list = $by_tag{$tag}; + my $tag_limit_total = 1024; + my $tag_limit_per_package = 3; + + if (exists $by_tag{$tag}) { + $tmpl = 'tag'; + $count = $tag_statistics{$tag}{'count'}; + $overrides = $tag_statistics{$tag}{'overrides'}; + $shown_count = $count + $overrides; + } + if ($shown_count > $tag_limit_total) { + my (@replacement_list, %seen); + for my $orig_info ( + sort { $a->{pkg_info}{package} cmp $b->{pkg_info}{package} } + @{$tag_list}) { + my $pkg_info = $orig_info->{pkg_info}; + my $key + = "$pkg_info->{package} $pkg_info->{type} $pkg_info->{version}"; + next if ++$seen{$key} > $tag_limit_per_package; + push(@replacement_list, $orig_info); + last if @replacement_list >= $tag_limit_total; + } + $tag_list = \@replacement_list; + $shown_count = scalar(@replacement_list); + } + + my %maint_data = ( + description => $description, + tag => $tag, + code => $info->code, + tags => $tag_list, + shown_count => $shown_count, + tag_limit_per_package => $tag_limit_per_package, + graphs => $GRAPHS, + graphs_days => $GRAPHS_RANGE_DAYS, + statistics => { + count => $count, + overrides => $overrides, + total => $count + $overrides, + }, + ); + output_template("tags/$tag.html", $templates{$tmpl}, \%maint_data); + } + + # Create the general tag indices. + %data = ( + tags => \%by_tag, + stats => \%tag_statistics, + profile => \$profile, + ); + output_template('tags.html', $templates{tags}, \%data); + output_template('tags-severity.html', $templates{'tags-severity'}, \%data); + output_template('tags-all.html', $templates{'tags-all'}, \%data); + + # Update the statistics file. + open(my $stats_fd, '>', $statistics_file); + print {$stats_fd} "last-updated: $timestamp\n"; + for my $attr (@attrs) { + print {$stats_fd} "$attr: $statistics{$attr}\n"; + } + print {$stats_fd} "lintian-version: $LINTIAN_VERSION\n"; + close($stats_fd); + + $archives = required_cfg_value('archives'); + for my $archive (sort(keys(%{$archives}))) { + my $architectures + = required_cfg_non_empty_list_value('archives', $archive, + 'architectures'); + my $components + = required_cfg_non_empty_list_value('archives', $archive, + 'components'); + my $distributions + = required_cfg_non_empty_list_value('archives', $archive, + 'distributions'); + my $path = required_cfg_value('archives', $archive, 'base-dir'); + my $trace_basename + = required_cfg_value('archives', $archive, 'tracefile'); + + # The path to the mirror timestamp. + my $trace_file= "${path}/project/trace/${trace_basename}"; + my $mirror_timestamp = path($trace_file)->slurp; + $mirror_timestamp =~ s/\n.*//s; + $mirror_timestamp + = safe_qx('date', '-u', '--rfc-822', '-d', $mirror_timestamp); + my %info = ( + 'name' => $archive, + 'architectures' => $architectures, + 'components' => $components, + 'distributions' => $distributions, + 'timestamp' => $mirror_timestamp, + ); + push(@archive_info, \%info); + } + + # Finally, we can start creating the index page. + %data = ( + delta => \%delta, + archives => \@archive_info, + previous => $old_statistics->{'last-updated'}, + graphs => $GRAPHS, + graphs_days => $GRAPHS_RANGE_DAYS, + ); + output_template('index.html', $templates{index}, \%data); + return; +} + +sub setup_output_dir { + my (%args) = @_; + my $output_dir = $args{'output_dir'}; + my $lintian_manual = $args{'lintian_manual'}; + my $lintian_api = $args{'lintian_api_docs'}; + my $resource_dirs = $args{'resource_dirs'} // []; + my $lintian_log_file = $args{'lintian_log_file'}; + + # Create output directories. + mkdir($output_dir, 0777); + mkdir("$output_dir/full", 0777); + mkdir("$output_dir/maintainer", 0777); + mkdir("$output_dir/tags", 0777); + symlink('.', "$output_dir/reports"); + copy_dir($lintian_manual, "$output_dir/manual"); + copy_dir($lintian_api, "$output_dir/library-api"); + + if ($lintian_log_file) { + my %opts = ( + 'in' => $lintian_log_file, + 'out' => "$output_dir/lintian.log.gz", + ); + run_cmd(\%opts, 'gzip', '-9nc'); + $RESOURCE_MANAGER->install_resource("$output_dir/lintian.log.gz"); + symlink($RESOURCE_MANAGER->resource_URL('lintian.log.gz'), + "$output_dir/lintian.log.gz"); + } + + for my $dir (@{$resource_dirs}) { + next if not -d $dir; + opendir(my $dirfd, $dir); + for my $resname (readdir($dirfd)) { + next if $resname eq '.' or $resname eq '..'; + $RESOURCE_MANAGER->install_resource("$dir/$resname", + { install_method => 'copy' }); + } + closedir($dirfd); + } + return; +} + +sub collect_statistics { + my ($profile, $state_cache, $statistics_file, $maintainers_ref, + $sources_ref, $qa_list_ref) + = @_; + my $old_statistics; + + # For each of this maintainer's packages, add statistical information + # about the number of each type of tag to the QA data and build the + # packages hash used for the package index. We only do this for the + # maintainer packages, not the uploader packages, to avoid + # double-counting. + for my $maintainer (@{$maintainers_ref}) { + for my $source (keys %{ $by_maint{$maintainer} }) { + my %count; + for my $version ( + sort versions_comparator + keys %{ $by_maint{$maintainer}{$source} }){ + my $tags = $by_maint{$maintainer}{$source}{$version}; + for my $tag (@{$tags}) { + $count{$tag->{code}}++; + } + if (@$tags) { + $sources_ref->{$source}{$version} + = $tags->[0]{pkg_info}{xref}; + } + } + $qa_list_ref->{$source} = \%count; + } + } + + for my $tag ($profile->known_tags) { + my ($count, $overrides) = (0, 0); + my %seen_tags; + next if (not exists($by_tag{$tag})); + foreach (@{$by_tag{$tag}}) { + if ($_->{code} ne 'O') { + $count++; + $seen_tags{$_->{pkg_info}{xref}}++; + } else { + $overrides++; + } + } + $tag_statistics{$tag}{'count'} = $count; + $tag_statistics{$tag}{'overrides'} = $overrides; + $tag_statistics{$tag}{'packages'} = scalar(keys(%seen_tags)); + } + + # Read in the old statistics file so that we can calculate deltas for + # all of our statistics. + + if (-f $statistics_file) { + ($old_statistics) = read_dpkg_control_lc($statistics_file); + } + $statistics{'groups-known'} = scalar(keys(%{$state_cache->{'groups'}})); + $statistics{'groups-backlog'} + = scalar(find_backlog($LINTIAN_VERSION,$state_cache)); + my $pkgs_w_errors = 0; + for my $group_data (values(%{$state_cache->{'groups'}})) { + $pkgs_w_errors++ + if exists($group_data->{'processing-errors'}) + and $group_data->{'processing-errors'}; + } + $statistics{'groups-with-errors'} = $pkgs_w_errors; + + for my $attr (@attrs) { + my $old = $old_statistics->{$attr} || 0; + $statistics{$attr} ||= 0; + $delta{$attr} + = sprintf('%d (%+d)', $statistics{$attr},$statistics{$attr} - $old); + } + + return $old_statistics; +} + +sub extract_name_and_email { + my ($maintainer) = @_; + my ($name, $email) = ($maintainer =~ /^(.*) <([^>]+)>/); + $name = 'Unknown Maintainer' unless $name; + $email = 'unknown' unless $email; + return ($name, $email); +} + +# Generate the package lists. These are huge, so we break them into four +# separate pages. +# +# FIXME: Does anyone actually use these pages? They're basically unreadable. +sub generate_package_index_packages { + my ($sources_ref) = @_; + + my %list = ( + '0-9, A-F' => [], + 'G-L' => [], + 'M-R' => [], + 'S-Z' => [], + ); + for my $package (sort(keys(%{$sources_ref}))) { + my $first = uc(substr($package, 0, 1)); + if ($first le 'F') { push(@{ $list{'0-9, A-F'} }, $package) } + elsif ($first le 'L') { push(@{ $list{'G-L'} }, $package) } + elsif ($first le 'R') { push(@{ $list{'M-R'} }, $package) } + else { push(@{ $list{'S-Z'} }, $package) } + } + my %data = (sources => $sources_ref); + my $i = 1; + for my $section (sort(keys(%list))) { + $data{section} = $section; + $data{list} = $list{$section}; + output_template("packages_$i.html", $templates{packages}, \%data); + $i++; + } + return; +} + +sub run_scour { + my ($input_file, $output_file) = @_; + run_cmd('scour', '-i',$input_file, '-o',$output_file, '-q', + '--enable-id-stripping', '--enable-comment-stripping', + '--shorten-ids', '--indent=none'); + return 1; +} + +sub update_history_and_make_graphs { + my ($attrs_ref, $statistics_ref, $tag_statistics_ref) = @_; + # Update history. + my %versions; + my $graph_dir = "$HTML_TMP_DIR/graphs"; + my $commonf = "$graph_dir/common.gpi"; + my $unix_time = time(); + mkdir("$HISTORY_DIR") + if (not -d "$HISTORY_DIR"); + mkdir("$HISTORY_DIR/tags") + if (not -d "$HISTORY_DIR/tags"); + + my $history_file = "$HISTORY_DIR/statistics.dat"; + my $stats = ''; + for my $attr (@{$attrs_ref}) { + $stats .= ' ' . $statistics_ref->{$attr}; + } + open(my $hist_fd, '+>>', $history_file); + print {$hist_fd} "$unix_time $LINTIAN_VERSION$stats\n"; + + if ($GRAPHS) { + seek($hist_fd, 0, SEEK_SET); + while (<$hist_fd>) { + my @fields = split(); + $versions{$fields[1]} = $fields[0] + if not exists $versions{$fields[1]}; + } + } + close($hist_fd); + + if ($GRAPHS) { + mkdir("$graph_dir", 0777); + mkdir("$graph_dir/tags", 0777); + + my $date_min + = strftime('%s', + localtime($unix_time - 3600 * 24 * $GRAPHS_RANGE_DAYS)); + my $date_max = strftime('%s', localtime($unix_time)); + + # Generate loadable Gnuplot file with common variables and labels/arrows + # for Lintian versions. + open(my $common, '>', $commonf); + print {$common} "history_dir='$HISTORY_DIR'\n"; + print {$common} "graph_dir='$graph_dir'\n"; + print {$common} "date_min='$date_min'\n"; + print {$common} "date_max='$date_max'\n"; + my $last_version = 0; + for my $v (sort { $versions{$a} <=> $versions{$b} } keys %versions) { + next unless $versions{$v} > $date_min; + + print {$common} "set arrow from '$versions{$v}',graph 0 to ", + "'$versions{$v}',graph 1 nohead lw 0.4\n"; + + # Skip label if previous release is too close; graphs can't display + # more than ~32 labels. + my $min_spacing = 3600 * 24 * $GRAPHS_RANGE_DAYS / 32; + if ($versions{$v} - $last_version > $min_spacing) { + (my $label = $v) =~ s/\-[\w\d]+$//; + print {$common} "set label '$label' at '$versions{$v}',graph ", + "1.04 rotate by 90 font ',8'\n"; + + $last_version = $versions{$v}; + } + } + close($common); + + print "Plotting global statistics...\n"; + run_cmd({ 'chdir' => $graph_dir}, + 'gnuplot',"$LINTIAN_BASE/reporting/graphs/statistics.gpi"); + + if ($SCOUR_ENABLED) { + # Do a little "rename" dance to ensure that we keep the + # "statistics.svg"-basename without having to use a + # subdirectory. + rename( + "${graph_dir}/statistics.svg", + "${graph_dir}/_statistics-orig.svg" + ); + run_scour( + "${graph_dir}/_statistics-orig.svg", + "${graph_dir}/statistics.svg" + ); + } + $RESOURCE_MANAGER->install_resource("${graph_dir}/statistics.svg"); + } + + my $gnuplot_fd; + if ($GRAPHS) { + open($gnuplot_fd, '>', "$graph_dir/call.gpi"); + } + + for my $tag (sort(keys(%{$tag_statistics_ref}))) { + $history_file = "$HISTORY_DIR/tags/$tag.dat"; + $stats = $tag_statistics_ref->{$tag}; + open(my $tag_fd, '>>', $history_file); + print {$tag_fd} "$unix_time $stats->{'count'} $stats->{'overrides'} " + ."$stats->{'packages'}\n"; + close($tag_fd); + if ($GRAPHS) { + print {$gnuplot_fd} qq{print 'Plotting $tag statistics...'\n}; + print {$gnuplot_fd} + qq{call '$LINTIAN_BASE/reporting/graphs/tags.gpi' '$tag'\n}; + print {$gnuplot_fd} qq{reset\n}; + } + } + + if ($GRAPHS) { + my $svg_dir = "${graph_dir}/tags"; + close($gnuplot_fd); + run_cmd({'chdir' => $graph_dir}, 'gnuplot', 'call.gpi'); + unlink($commonf); + if ($SCOUR_ENABLED) { + # Obvious optimization potential; run scour in parallel + my $optimized_dir = "${graph_dir}/tags-optimized"; + mkdir($optimized_dir); + print "Minimizing tag graphs; this may take a while ...\n"; + for my $tag (sort(keys(%{$tag_statistics_ref}))) { + run_scour("${svg_dir}/${tag}.svg", + "${optimized_dir}/${tag}.svg"); + } + $svg_dir = $optimized_dir; + } + for my $tag (sort(keys(%{$tag_statistics_ref}))) { + my $graph_file = "${svg_dir}/${tag}.svg"; + $RESOURCE_MANAGER->install_resource($graph_file); + } + path($graph_dir)->remove_tree + if -d $graph_dir; + } + return; +} + +# Write out the QA package list. This is a space-delimited file that contains +# the package name and then the error count, warning count, info count, +# pedantic count, experimental count, and overridden tag count. +sub write_qa_list { + my ($qa_data) = @_; + + open(my $qa_fd, '>', "$HTML_TMP_DIR/qa-list.txt"); + for my $source (sort(keys(%{$qa_data}))) { + print {$qa_fd} $source; + for my $code (qw/E W I P X O/) { + my $count = $qa_data->{$source}{$code} || 0; + print {$qa_fd} " $count"; + } + print {$qa_fd} "\n"; + } + close($qa_fd); + return; +} + +# Generate a "redirect" lookup table for the webserver to power the +# "<site>/source/<source>[/<version>]" redirects. +sub generate_lookup_tables { + my ($sources_ref) = @_; + mkdir("$HTML_TMP_DIR/lookup-tables"); + open(my $table, '>', "$HTML_TMP_DIR/lookup-tables/source-packages"); + + foreach my $source (sort(keys(%{$sources_ref}))) { + my $first = 1; + for my $version ( + sort versions_comparator keys %{ $sources_ref->{$source} }) { + my $xref = $sources_ref->{$source}{$version}; + print {$table} "$source full/$xref\n" if $first; + print {$table} "$source/$version full/$xref\n"; + $first = 0; + } + } + + close($table); + return; +} + +# Determine the file name for the maintainer page given a maintainer. It +# should be <email>.html where <email> is their email address with all +# characters other than a-z A-Z 0-9 - _ . @ = + replaced with _. Don't change +# this without coordinating with QA. +sub maintainer_url { + my ($maintainer) = @_; + if ($maintainer =~ m/<([^>]+)>/) { + my $id = $1; + $id =~ tr/a-zA-Z0-9_.@=+-/_/c; + return "$id.html"; + } else { + return 'unsorted.html'; + } +} + +sub parse_lintian_log { + my ($profile, $state_cache) = @_; + # We take a lintian log file on either standard input or as the + # first argument. This log file contains all the tags lintian + # found, plus N: tags with informational messages. Ignore all the + # N: tags and load everything else into the hashes we use for all + # web page generation. + # + # We keep track of a hash from maintainer page URLs to maintainer + # values so that we don't have two maintainers who map to the same + # page and overwrite each other's pages. If we find two + # maintainers who map to the same URL, just assume that the second + # maintainer is the same as the first (but warn about it). + # + # The "last_*" are optimizations to avoid computing the same + # things over and over again when a package have multiple tags. + my (%seen, $last_info, $last_maintainer, %unknown_member_id, $info, + $last_pi, %map_maint, %arch_map); + my %expanded_code = ( + E => 'errors', + W => 'warnings', + I => 'info', + X => 'experimental', + O => 'overridden', + P => 'pedantic', + C => 'classifications', + ); + while (<>) { + my @parts; + chomp; + @parts = split_tag($_); + next unless @parts; + my ($code, $package, $type, $version, $arch, $tag, $extra) = @parts; + $type = 'binary' unless (defined $type); + next + unless ($type eq 'source' || $type eq 'binary' || $type eq 'udeb'); + # Ignore unknown tags - happens if we removed a tag that is + # still present in the log file. + my $tag_info = $profile->get_taginfo($tag); + next + if not $tag_info + or $tag_info->effective_severity eq 'classification'; + + # Update statistics. + my $key = $expanded_code{$code}; + $statistics{$key}++; + unless ($seen{"$package $type"}) { + $statistics{"$type-packages"}++; + $seen{"$package $type"} = 1; + } + + # Determine the source package for this package and warn if + # there appears to be no source package in the archive. + # Determine the maintainer, version, and archive component. Work + # around a missing source package by pulling information from + # a binary package or udeb of the same name if there is any. + my ($source, $component, $source_version, $maintainer, $uploaders); + my $member_id + = "${type}:${package}/${version}" + . ($type ne 'source' ? "/$arch" : q{}); + my $state_data = $state_cache->{'members-to-groups'}{$member_id}; + next if exists($unknown_member_id{$member_id}); + if ($type eq 'source') { + $source = $package; + $source_version = $version; + if (not defined($state_data)) { + warn "Source package ${member_id} not found in state-cache!\n"; + $unknown_member_id{$member_id} = 1; + } + } elsif (defined($state_data)) { + my $src_member + = first { s/^source:// } keys(%{$state_data->{'members'}}); + if ($src_member) { + ($source, $source_version) = split(m{/}, $src_member, 2); + } + } elsif (not defined($state_data)) { + warn "Package ${member_id} not found in state-cache!\n"; + $unknown_member_id{$member_id} = 1; + } + $state_data //= {}; + $component = $state_data->{'mirror-metadata'}{'component'} ||= 'main'; + $maintainer = $state_data->{'mirror-metadata'}{'maintainer'} + ||= '(unknown)'; + $uploaders = $state_data->{'mirror-metadata'}{'uploaders'}; + $source ||= ''; + $version = 'unknown' + unless (defined($version) and length($version) > 0); + $source_version = $version + unless (defined($source_version) and length($source_version) > 0); + + # Sanitize, just out of paranoia. + $package =~ tr/a-zA-Z0-9.+-/_/c; + $source =~ tr/a-zA-Z0-9.+-/_/c; + $version =~ tr/a-zA-Z0-9.+:~-/_/c; + $source_version =~ tr/a-zA-Z0-9.+:~-/_/c; + + # Conditionally call html_quote if needed. On average, 11-13% of + # all tags (emitted on lintian.d.o) have no "extra". That would be + # tags like "no-upstream-changelog" (now removed) + if (defined($extra)) { + $extra = html_quote($extra); + } else { + $extra = ''; + } + + # Store binary architectures + my $arch_key = join(':', $package, $type, $version, $tag, $extra); + $arch_map{$arch_key}{$arch} = 1 + unless $arch eq 'all' or $arch eq 'source'; + + # Don't duplicate entries if they only differ on architecture + next if scalar(keys %{$arch_map{$arch_key}}) > 1; + + # Add the tag information to our hashes. Share the data + # between the hashes to save space (which means we can't later + # do destructive tricks with it). + if ( $last_info + && $last_pi->{type} eq $type + && $last_pi->{package} eq $package + && $last_pi->{version} eq $version) { + + # There are something like 622k tags emitted on lintian.d.o, + # but only "some" 90k unique package+version(+arch) pairs. + # Therefore, we can conclude that the average package will + # have ~6 tags and optimise for that case. Indeed, this path + # seems to be taken about 90% of the time (561k/624k). + # - In fact, we see less than "90k" package+version(+arch) + # pairs here, since entries without tags never this far down + # in this loop (i.e. they are filtered out by split_tag + # above). + + # Copy the last info and then change the bits that can change + $info = {%{$last_info}}; + # Code depends on whether the given tag was overridden or not + $info->{code} = $code; + $info->{extra} = $extra; + if ($info->{tag_info}->name ne $tag) { + $info->{tag_info} = $tag_info; + } + # saves a map_maintainer call + $maintainer = $last_maintainer; + } else { + + my $anchor = "${source}_${source_version}"; + # Apparently "+" are not allowed in ids and I am guessing + # ":" is not either + if (index($anchor, '+') > -1 or index($anchor, ':') > -1) { + $anchor =~ s/[+]/_x2b/g; + $anchor =~ s/[:]/_x3a/g; + } + if (substr($maintainer, 0, 1) eq q{"}) { + # Strip out ""-quotes, which is required in d/control for some + # maintainers. + $maintainer =~ s/^"(.*)" <(.*)>$/$1 <$2>/; + } + + # Check if we've seen the URL for this maintainer before + # and, if so, map them to the same person as the previous + # one. + + $last_maintainer = $maintainer + = map_maintainer(\%map_maint, $maintainer); + + # Update maintainer statistics. + $statistics{maintainers}++ unless defined $by_maint{$maintainer}; + + $last_info = $info = { + # Tag instance specific data + + # split_tags ensures that $code is a single upper case letter + code => $code, + tag_info => $tag_info, + # extra is unsafe in general, but we already quote it above. + extra => $extra, + archs => $arch_map{$arch_key}, + + # Shareable data + pkg_info => { + package => $package, + version => $version, + # There is a check for type being in a fixed whitelist of + # HTML-safe keywords in the start of the loop., + type => $type, + component => html_quote($component), + # should be safe + anchor => $anchor, + xref => maintainer_url($maintainer). "#${anchor}", + 'state_data' => $state_data, + maintainer => html_quote($maintainer), + }, + }; + $last_pi = $info->{pkg_info}; + if (!$by_maint{$maintainer}{$source}{$source_version}) { + my $list_ref = []; + $by_maint{$maintainer}{$source}{$source_version} = $list_ref; + # If the package had uploaders listed, also add the + # information to %by_uploaders (still sharing the data + # between hashes). + if ($uploaders) { + for my $uploader (@{$uploaders}) { + if (substr($uploader, 0, 1) eq q{"}) { + # Strip out ""-quotes, which is required in + # d/control for some uploaders. + $uploader =~ s/^"(.*)" <(.*)>$/$1 <$2>/; + } + $uploader = map_maintainer(\%map_maint, $uploader); + next if $uploader eq $maintainer; + $by_uploader{$uploader}{$source}{$source_version} + = $list_ref; + } + } + } + } + + push(@{ $by_maint{$maintainer}{$source}{$source_version} }, $info); + $by_tag{$tag} ||= []; + push(@{ $by_tag{$tag} }, $info); + + } + return; +} + +# Deduplicate maintainers. Maintains a cache of the maintainers we've seen +# with a given e-mail address and returns the maintainer string that we +# should use (which is whatever maintainer we saw first with that e-mail). +sub map_maintainer { + my ($urlmap, $maintainer) = @_; + my $url = maintainer_url($maintainer); + if (defined(my $res = $urlmap->{$url})) { + $maintainer = $res; + } else { + $urlmap->{$url} = $maintainer; + } + return $maintainer; +} + +# Quote special characters for HTML output. +sub html_quote { + my ($text) = @_; + $text ||= ''; + # Use index to do a quick check before we bother requesting a + # subst. On average, this is cheaper than blindly s///'ing, since + # we rarely subst (all) of the characters below. + if (index($text, '&') > -1) { + $text =~ s/&/\&/g; + } + if (index($text, '<') > -1) { + $text =~ s/</\</g; + } + if (index($text, '>') > -1) { + $text =~ s/>/\>/g; + } + if (index($text, '/') > -1) { + $text =~ s/\//\//g; + } + return $text; +} + +# Given a file name, a template, and a data hash, fill out the template with +# that data hash and output the results to the file. +sub output_template { + my ($file, $template, $data) = @_; + my $path_prefix = $data->{path_prefix}; + if (not defined($path_prefix)) { + $path_prefix = ''; + if (index($file, '/') > -1) { + $path_prefix = '../' x ($file =~ tr|/||); + } + } + $data->{version} ||= $LINTIAN_VERSION; + $data->{timestamp} ||= $timestamp; + $data->{by_version} ||= \&versions_comparator; + $data->{path_prefix} ||= $path_prefix; + $data->{html_quote} ||= \&html_quote; + $data->{resource_path} ||= sub { + return $path_prefix . $RESOURCE_MANAGER->resource_URL($_[0]); + }; + $data->{resource_integrity} ||= sub { + return $RESOURCE_MANAGER->resource_integrity_value($_[0]); + }; + $data->{head} ||= sub { + $templates{head}->fill_in( + HASH => { + page_title => $_[0], + config_vars => $TEMPLATE_CONFIG_VARS, + %{$data}, + }) or die "Filling out head of $file: $Text::Template::ERROR\n"; + }; + $data->{foot} ||= sub { + $templates{foot}->fill_in( + HASH => { + config_vars => $TEMPLATE_CONFIG_VARS, + %{$data}, + }) or die "Filling out footer of $file: $Text::Template::ERROR\n"; + }; + $data->{config_vars} ||= $TEMPLATE_CONFIG_VARS; + open(my $fd, '>:encoding(UTF-8)', "$HTML_TMP_DIR/$file"); + $template->fill_in(OUTPUT => $fd, HASH => $data) + or die "filling out $file failed: $Text::Template::ERROR\n"; + close($fd); + return; +} + +# Sort function for sorting lists of tags. Sort by package, version, component, +# type, tag, and then any extra data. This will produce the best HTML output. +# +# Note that source tags must come before all other tags, hence the "unfair" +# priority for those. This is because the first tags listed are assumed to +# be source package tags. +sub by_tag { + my $a_pi = $a->{pkg_info}; + my $b_pi = $b->{pkg_info}; + if ($a_pi->{type} ne $b_pi->{type}) { + return -1 if $a_pi->{type} eq 'source'; + return 1 if $b_pi->{type} eq 'source'; + } + return + $a_pi->{package} cmp $b_pi->{package} + || $a_pi->{version} cmp $b_pi->{version} + || $a_pi->{component} cmp $b_pi->{component} + || $a_pi->{type} cmp $b_pi->{type} + || $a->{tag_info}->name cmp $b->{tag_info}->name + || $a->{extra} cmp $b->{extra}; +} + +=item split_tag + +=cut + +{ + # Matches something like: (1:2.0-3) [arch1 arch2] + # - captures the version and the architectures + my $verarchre = qr,(?: \s* \(( [^)]++ )\) \s* \[ ( [^]]++ ) \]),xo; + # ^^^^^^^^ ^^^^^^^^^^^^ + # ( version ) [architecture ] + + # matches the full deal: + # 1 222 3333 4444444 5555 666 777 + # - T: pkg type (version) [arch]: tag [...] + # ^^^^^^^^^^^^^^^^^^^^^ + # Where the marked part(s) are optional values. The numbers above + # the example are the capture groups. + my $TAG_REGEX + = qr/([EWIXOPC]): (\S+)(?: (\S+)(?:$verarchre)?)?: (\S+)(?:\s+(.*))?/; + + sub split_tag { + my ($tag_input) = @_; + my $pkg_type; + return unless $tag_input =~ /^${TAG_REGEX}$/; + # default value... + $pkg_type = $3//'binary'; + return ($1, $2, $pkg_type, $4, $5, $6, $7); + } +} + +1; + +# Local Variables: +# indent-tabs-mode: nil +# cperl-indent-level: 4 +# End: +# vim: syntax=perl sw=4 sts=4 sr et |