package Devscripts::Uscan::http;
use strict;
use Cwd qw/abs_path/;
use Devscripts::Uscan::Output;
use Devscripts::Uscan::Utils;
use Devscripts::Uscan::_xtp;
use Devscripts::Versort;
use HTTP::Request;
use Moo::Role;
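# http and ftp share the same newfile_base logic, provided by _xtp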
*http_newfile_base = \&Devscripts::Uscan::_xtp::_xtp_newfile_base;
##################################
# search $newversion (http mode)
##################################
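# Illustrative sketch (hypothetical watch line): for
#   https://example.org/releases/ foo-([\d.]+)\.tar\.gz
# this fetches the index page, collects the hrefs matching the pattern
# and returns the newest match, e.g. ("1.2.3", "foo-1.2.3.tar.gz").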
sub http_search {
my ($self) = @_;
# $content: web page to be scraped to find the URLs to be downloaded
if ($self->parse_result->{base} =~ m%^https%
    and !$self->downloader->ssl) {
    uscan_die
"you must have the liblwp-protocol-https-perl package installed\nto use https URLs";
}
uscan_verbose "Requesting URL:\n $self->{parse_result}->{base}";
my $request = HTTP::Request->new('GET', $self->parse_result->{base});
my $response = $self->downloader->user_agent->request($request);
if (!$response->is_success) {
uscan_warn
"In watchfile $self->{watchfile}, reading webpage\n $self->{parse_result}->{base} failed: "
. $response->status_line;
return undef;
}
my @redirections = @{ $self->downloader->user_agent->get_redirections };
uscan_verbose "redirections: @redirections" if @redirections;
foreach my $_redir (@redirections) {
my $base_dir = $_redir;
$base_dir =~ s%^\w+://[^/]+/%/%;
if ($_redir =~ m%^(\w+://[^/]+)%) {
my $base_site = $1;
push @{ $self->patterns },
"(?:(?:$base_site)?"
. quotemeta($base_dir)
. ")?$self->{parse_result}->{filepattern}";
push @{ $self->sites }, $base_site;
push @{ $self->basedirs }, $base_dir;
# remove the filename, if any
my $base_dir_orig = $base_dir;
$base_dir =~ s%/[^/]*$%/%;
if ($base_dir ne $base_dir_orig) {
push @{ $self->patterns },
"(?:(?:$base_site)?"
. quotemeta($base_dir)
. ")?$self->{parse_result}->{filepattern}";
push @{ $self->sites }, $base_site;
push @{ $self->basedirs }, $base_dir;
}
}
}
my $content = $response->decoded_content;
uscan_debug
"received content:\n$content\n[End of received content] by HTTP";
my @hrefs;
if (!$self->searchmode or $self->searchmode eq 'html') {
@hrefs = $self->html_search($content);
} elsif ($self->searchmode eq 'plain') {
@hrefs = $self->plain_search($content);
} else {
uscan_warn 'Unknown searchmode "' . $self->searchmode . '", skipping';
return undef;
}
if (@hrefs) {
@hrefs = Devscripts::Versort::versort(@hrefs);
my $msg
= "Found the following matching hrefs on the web page (newest first):\n";
foreach my $href (@hrefs) {
$msg .= " $$href[2] ($$href[1]) index=$$href[0] $$href[3]\n";
}
uscan_verbose $msg;
}
my ($newversion, $newfile);
if (defined $self->shared->{download_version}) {
# pick out the entries for which $match was set in parse_href()
my @vhrefs = grep { $$_[3] } @hrefs;
if (@vhrefs) {
(undef, $newversion, $newfile, undef) = @{ $vhrefs[0] };
} else {
uscan_warn
"In $self->{watchfile} no matching hrefs for version $self->{shared}->{download_version}"
. " in watch line\n $self->{line}";
return undef;
}
} else {
if (@hrefs) {
(undef, $newversion, $newfile, undef) = @{ $hrefs[0] };
} else {
uscan_warn
"In $self->{watchfile} no matching files for watch line\n $self->{line}";
return undef;
}
}
return ($newversion, $newfile);
}
#######################################################################
# determine $upstream_url (http mode)
#######################################################################
# http is complicated due to absolute/relative URL issue
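# Four forms of $newfile are handled below:
#   scheme://host/...  absolute URL, used as-is
#   //host/...         protocol-relative, inherits the site's scheme
#   /path              absolute path, prefixed with a (redirected) site
#   path               relative, prefixed with the base URL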
sub http_upstream_url {
my ($self) = @_;
my $upstream_url;
my $newfile = $self->search_result->{newfile};
if ($newfile =~ m%^\w+://%) {
$upstream_url = $newfile;
} elsif ($newfile =~ m%^//%) {
$upstream_url = $self->parse_result->{site};
$upstream_url =~ s/^(https?:).*/$1/;
$upstream_url .= $newfile;
} elsif ($newfile =~ m%^/%) {
# absolute filename
# Were there any redirections? If so try using those first
if ($#{ $self->patterns } > 0) {
# replace $site here with the one we were redirected to
foreach my $index (0 .. $#{ $self->patterns }) {
if ("$self->{sites}->[$index]$newfile"
=~ m&^$self->{patterns}->[$index]$&) {
$upstream_url = "$self->{sites}->[$index]$newfile";
last;
}
}
if (!defined($upstream_url)) {
uscan_verbose
"Unable to determine upstream url from redirections,\n"
. "defaulting to using site specified in watch file";
$upstream_url = "$self->{sites}->[0]$newfile";
}
} else {
$upstream_url = "$self->{sites}->[0]$newfile";
}
} else {
# relative filename, we hope
# Were there any redirections? If so try using those first
if ($#{ $self->patterns } > 0) {
# replace $site here with the one we were redirected to
foreach my $index (0 .. $#{ $self->patterns }) {
# skip unless the basedir looks like a directory
next unless $self->{basedirs}->[$index] =~ m%/$%;
my $nf = "$self->{basedirs}->[$index]$newfile";
if ("$self->{sites}->[$index]$nf"
=~ m&^$self->{patterns}->[$index]$&) {
$upstream_url = "$self->{sites}->[$index]$nf";
last;
}
}
if (!defined($upstream_url)) {
uscan_verbose
"Unable to determine upstream url from redirections,\n"
. "defaulting to using site specified in watch file";
$upstream_url = "$self->{parse_result}->{urlbase}$newfile";
}
} else {
$upstream_url = "$self->{parse_result}->{urlbase}$newfile";
}
}
# mangle if necessary
$upstream_url =~ s/&amp;/&/g;
uscan_verbose "Matching target for downloadurlmangle: $upstream_url";
if (@{ $self->downloadurlmangle }) {
if (
mangle(
$self->watchfile, \$self->line,
'downloadurlmangle:', \@{ $self->downloadurlmangle },
\$upstream_url
)
) {
$self->status(1);
return undef;
}
}
return $upstream_url;
}
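# Illustrative sketch (hypothetical values):
#   http_newdir('https', $downloader, 'https://example.org', '/releases/',
#       '([\d.]+)', [], $watchfile, \$line, undef)
# scans the /releases/ index and might return '1.2' for a href "1.2/".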
sub http_newdir {
my (
$https, $downloader, $site,
$dir, $pattern, $dirversionmangle,
$watchfile, $lineptr, $download_version
) = @_;
my ($request, $response, $newdir);
my ($download_version_short1, $download_version_short2,
$download_version_short3)
= partial_version($download_version);
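# partial_version("2.17.2") returns ("2", "2.17", "2.17.2"), letting a
# requested download version also match truncated directory names.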
my $base = $site . $dir;
if (defined($https) and !$downloader->ssl) {
uscan_die
"$progname: you must have the liblwp-protocol-https-perl package installed\n"
. "to use https URLs";
}
$request = HTTP::Request->new('GET', $base);
$response = $downloader->user_agent->request($request);
if (!$response->is_success) {
uscan_warn
"In watch file $watchfile, reading webpage\n $base failed: "
. $response->status_line;
return '';
}
my $content = $response->content;
uscan_debug
"received content:\n$content\n[End of received content] by HTTP";
clean_content(\$content);
my $dirpattern = "(?:(?:$site)?" . quotemeta($dir) . ")?$pattern";
uscan_verbose "Matching pattern:\n $dirpattern";
my @hrefs;
my $match = '';
while ($content =~ m/<\s*a\s+[^>]*href\s*=\s*([\"\'])(.*?)\1/gi) {
my $href = fix_href($2);
uscan_verbose "Matching target for dirversionmangle: $href";
if ($href =~ m&^$dirpattern/?$&) {
my $mangled_version
= join(".", map { $_ // '' } $href =~ m&^$dirpattern/?$&);
if (
mangle(
$watchfile, $lineptr,
'dirversionmangle:', \@{$dirversionmangle},
\$mangled_version
)
) {
return 1;
}
$match = '';
if (defined $download_version
and $mangled_version eq $download_version) {
$match = "matched with the download version";
}
if (defined $download_version_short3
and $mangled_version eq $download_version_short3) {
$match = "matched with the download version (partial 3)";
}
if (defined $download_version_short2
and $mangled_version eq $download_version_short2) {
$match = "matched with the download version (partial 2)";
}
if (defined $download_version_short1
and $mangled_version eq $download_version_short1) {
$match = "matched with the download version (partial 1)";
}
push @hrefs, [$mangled_version, $href, $match];
}
}
# pick out the entries for which $match was set in the loop above
my @vhrefs = grep { $$_[2] } @hrefs;
if (@vhrefs) {
@vhrefs = Devscripts::Versort::upstream_versort(@vhrefs);
$newdir = $vhrefs[0][1];
}
if (@hrefs) {
@hrefs = Devscripts::Versort::upstream_versort(@hrefs);
my $msg = "Found the following matching directories (newest first):\n";
foreach my $href (@hrefs) {
$msg .= " $$href[1] ($$href[0]) $$href[2]\n";
}
uscan_verbose $msg;
$newdir //= $hrefs[0][1];
} else {
uscan_warn
"In $watchfile,\n no matching hrefs for pattern\n $site$dir$pattern";
return '';
}
# just give the final directory component
$newdir =~ s%/$%%;
$newdir =~ s%^.*/%%;
return ($newdir);
}
# Nothing to clean here
sub http_clean { 0 }
sub clean_content {
my ($content) = @_;
# We need this horrid stuff to handle href=foo type
# links. OK, bad HTML, but we have to handle it nonetheless.
# It's bug #89749.
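# e.g. '<a href=foo.tar.gz>' becomes '<a href="foo.tar.gz">'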
$$content =~ s/href\s*=\s*(?=[^\"\'])([^\s>]+)/href="$1"/ig;
# Strip HTML comments
$$content =~ s/<!--.*?-->//sg;
return $content;
}
sub html_search {
my ($self, $content) = @_;
# pagemangle: a slow operation, so don't overuse it
if (
mangle(
$self->watchfile, \$self->line,
'pagemangle:\n', [@{ $self->pagemangle }],
\$content
)
) {
return undef;
}
if ( !$self->shared->{bare}
    and $content =~ m%^<[?]xml%i
    and $content =~ m%xmlns="http://s3.amazonaws.com/doc/2006-03-01/"%
    and $content !~ m%<Key><a\s+href%) {
    # This is an S3 bucket listing.  Insert an 'a href' tag for each
    # 'Key' so that the listing looks like HTML to the code below.
    $content =~ s%<Key>([^<]*)</Key>%<Key><a href="$1">$1</a></Key>%g;
    uscan_debug
"processed content:\n$content\n[End of processed content] by Amazon AWS special case code";
}
clean_content(\$content);
# Is there a base URL given?
if ($content =~ /<\s*base\s+[^>]*href\s*=\s*([\"\'])(.*?)\1/i) {
# Ensure it ends with /
$self->parse_result->{urlbase} = "$2/";
$self->parse_result->{urlbase} =~ s%//$%/%;
} else {
# May have to strip a base filename
($self->parse_result->{urlbase} = $self->parse_result->{base})
=~ s%/[^/]*$%/%;
}
uscan_debug
"processed content:\n$content\n[End of processed content] by fix bad HTML code";
# search hrefs in the web page to obtain a list of uversionmangled versions and matching download URLs
{
local $, = ',';
uscan_verbose "Matching pattern:\n @{$self->{patterns}}";
}
my @hrefs;
while ($content =~ m/<\s*a\s+[^>]*(?<=\s)href\s*=\s*([\"\'])(.*?)\1/sgi) {
my $href = $2;
$href = fix_href($href);
if (defined $self->hrefdecode) {
if ($self->hrefdecode eq 'percent-encoding') {
uscan_debug "... Decoding from href: $href";
$href =~ s/%([A-Fa-f\d]{2})/chr hex $1/eg;
} else {
uscan_warn "Illegal value for hrefdecode: "
. "$self->{hrefdecode}";
return undef;
}
}
uscan_debug "Checking href $href";
foreach my $_pattern (@{ $self->patterns }) {
if ($href =~ /^$_pattern$/) {
push @hrefs, $self->parse_href($href, $_pattern, $1);
}
}
}
return @hrefs;
}
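# plain searchmode: match the patterns against the raw page text (for
# non-HTML pages such as JSON) instead of scanning <a href> tags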
sub plain_search {
my ($self, $content) = @_;
my @hrefs;
foreach my $_pattern (@{ $self->patterns }) {
while ($content =~ s/.*?($_pattern)//) {
push @hrefs, $self->parse_href($1, $_pattern, $2);
}
}
return @hrefs;
}
sub parse_href {
my ($self, $href, $_pattern, $match) = @_;
my $mangled_version;
if ($self->watch_version == 2) {
# watch_version 2 only recognised one group; the code
# below will break version 2 watch files with a construction
# such as file-([\d\.]+(-\d+)?) (bug #327258)
$mangled_version = $match;
} else {
# need the map { ... } here to handle cases of (...)?
# which may match but then return undef values
if ($self->versionless) {
# exception, otherwise $mangled_version = 1
$mangled_version = '';
} else {
$mangled_version
= join(".", map { $_ if defined($_) } $href =~ m&^$_pattern$&);
}
if (
mangle(
$self->watchfile, \$self->line,
'uversionmangle:', \@{ $self->uversionmangle },
\$mangled_version
)
) {
return ();
}
}
$match = '';
if (defined $self->shared->{download_version}) {
if ($mangled_version eq $self->shared->{download_version}) {
$match = "matched with the download version";
}
}
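# Sort key: mangled version first, then get_priority()'s rank of the
# archive type (see Devscripts::Uscan::Utils), so that a preferred
# format wins among hrefs carrying the same version.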
my $priority = $mangled_version . '-' . get_priority($href);
return [$priority, $mangled_version, $href, $match];
}
1;