package Devscripts::Uscan::http;
use strict;
use Cwd qw/abs_path/;
use Devscripts::Uscan::Output;
use Devscripts::Uscan::Utils;
use Devscripts::Uscan::_xtp;
use Devscripts::Versort;
use HTTP::Request;
use Moo::Role;
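# http_newfile_base delegates to the helper shared with the FTP code.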
*http_newfile_base = \&Devscripts::Uscan::_xtp::_xtp_newfile_base;
##################################
# handle_redirection helper
##################################
# Collect the redirections recorded by the user agent (plus any
# @additional_bases) and derive, for each, an anchored search pattern
# together with its base site and base directory.
# Returns (\@patterns, \@base_sites, \@base_dirs).
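# Illustrative sketch (hypothetical redirect): a redirection to
#   https://www.example.org/downloads/foo-1.0.tar.gz
# yields base_site "https://www.example.org", base_dir "/downloads/",
# and a pattern anchored at the quotemeta'd site/dir prefix.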
sub handle_redirection {
my ($self, $pattern, @additional_bases) = @_;
my @redirections = @{ $self->downloader->user_agent->get_redirections };
my (@patterns, @base_sites, @base_dirs);
uscan_verbose "redirections: @redirections" if @redirections;
foreach my $_redir (@redirections, @additional_bases) {
my $base_dir = $_redir;
$base_dir =~ s%^\w+://[^/]+/%/%;
$base_dir =~ s%/[^/]*(?:[#?].*)?$%/%;
if ($_redir =~ m%^(\w+://[^/]+)%) {
my $base_site = $1;
push @patterns,
quotemeta($base_site) . quotemeta($base_dir) . "$pattern";
push @base_sites, $base_site;
push @base_dirs, $base_dir;
# remove the filename, if any
my $base_dir_orig = $base_dir;
$base_dir =~ s%/[^/]*$%/%;
if ($base_dir ne $base_dir_orig) {
push @patterns,
quotemeta($base_site) . quotemeta($base_dir) . "$pattern";
push @base_sites, $base_site;
push @base_dirs, $base_dir;
}
}
}
return (\@patterns, \@base_sites, \@base_dirs);
}
##################################
# search $newversion (http mode)
##################################
sub http_search {
my ($self) = @_;
# $content: web page to be scraped to find the URLs to be downloaded
if ($self->{parse_result}->{base} =~ /^https/ and !$self->downloader->ssl)
{
uscan_die
"you must have the liblwp-protocol-https-perl package installed\nto use https URLs";
}
uscan_verbose "Requesting URL:\n $self->{parse_result}->{base}";
my $request = HTTP::Request->new('GET', $self->parse_result->{base});
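    # Apply per-host custom headers.  Keys are expected in the form
    # "<base-url>@<header-name>"; the header is sent only when the
    # request URL starts with <base-url>.  A hypothetical example:
    #   "https://example.org@Authorization" => "Bearer s3cr3t"
    # adds "Authorization: Bearer s3cr3t" to example.org requests only.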
foreach my $k (keys %{ $self->downloader->headers }) {
if ($k =~ /^(.*?)@(.*)$/) {
my $baseUrl = $1;
my $hdr = $2;
if ($self->parse_result->{base} =~ m#^\Q$baseUrl\E(?:/.*)?$#) {
                $request->header($hdr => $self->downloader->headers->{$k});
uscan_verbose "Set per-host custom header $hdr for "
. $self->parse_result->{base};
} else {
                uscan_debug
                  "$self->{parse_result}->{base} does not start with $baseUrl";
}
} else {
uscan_warn "Malformed http-header: $k";
}
}
$request->header('Accept-Encoding' => 'gzip');
$request->header('Accept' => '*/*');
my $response = $self->downloader->user_agent->request($request);
if (!$response->is_success) {
uscan_warn
"In watchfile $self->{watchfile}, reading webpage\n $self->{parse_result}->{base} failed: "
. $response->status_line;
return undef;
}
my ($patterns, $base_sites, $base_dirs)
= handle_redirection($self, $self->{parse_result}->{filepattern});
push @{ $self->patterns }, @$patterns;
push @{ $self->sites }, @$base_sites;
push @{ $self->basedirs }, @$base_dirs;
my $content = $response->decoded_content;
uscan_extra_debug
"received content:\n$content\n[End of received content] by HTTP";
my @hrefs;
if (!$self->searchmode or $self->searchmode eq 'html') {
@hrefs = $self->html_search($content, $self->patterns);
} elsif ($self->searchmode eq 'plain') {
@hrefs = $self->plain_search($content);
} else {
uscan_warn 'Unknown searchmode "' . $self->searchmode . '", skipping';
return undef;
}
if (@hrefs) {
@hrefs = Devscripts::Versort::versort(@hrefs);
my $msg
= "Found the following matching hrefs on the web page (newest first):\n";
foreach my $href (@hrefs) {
$msg .= " $$href[2] ($$href[1]) index=$$href[0] $$href[3]\n";
}
uscan_verbose $msg;
}
my ($newversion, $newfile);
if (defined $self->shared->{download_version}
and not $self->versionmode eq 'ignore') {
        # keep only the hrefs whose mangled version matched the
        # requested download version (see parse_href)
my @vhrefs = grep { $$_[3] } @hrefs;
if (@vhrefs) {
(undef, $newversion, $newfile, undef) = @{ $vhrefs[0] };
} else {
uscan_warn
"In $self->{watchfile} no matching hrefs for version $self->{shared}->{download_version}"
. " in watch line\n $self->{line}";
return undef;
}
} else {
if (@hrefs) {
(undef, $newversion, $newfile, undef) = @{ $hrefs[0] };
} else {
uscan_warn
"In $self->{watchfile} no matching files for watch line\n $self->{line}";
return undef;
}
}
return ($newversion, $newfile);
}
#######################################################################
# determine $upstream_url (http mode)
#######################################################################
# http is complicated due to absolute/relative URL issue
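# A matched href can take four forms, resolved in order below
# (examples are illustrative):
#   scheme://host/path - full URL, used as-is
#   //host/path        - protocol-relative, inherits the site's scheme
#   /path              - absolute path, prefixed with a (possibly
#                        redirected) base site
#   path               - relative, appended to the page's base directory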
sub http_upstream_url {
my ($self) = @_;
my $upstream_url;
my $newfile = $self->search_result->{newfile};
if ($newfile =~ m%^\w+://%) {
$upstream_url = $newfile;
} elsif ($newfile =~ m%^//%) {
$upstream_url = $self->parse_result->{site};
$upstream_url =~ s/^(https?:).*/$1/;
$upstream_url .= $newfile;
} elsif ($newfile =~ m%^/%) {
# absolute filename
# Were there any redirections? If so try using those first
if ($#{ $self->patterns } > 0) {
# replace $site here with the one we were redirected to
foreach my $index (0 .. $#{ $self->patterns }) {
if ("$self->{sites}->[$index]$newfile"
=~ m&^$self->{patterns}->[$index]$&) {
$upstream_url = "$self->{sites}->[$index]$newfile";
last;
}
}
if (!defined($upstream_url)) {
uscan_verbose
"Unable to determine upstream url from redirections,\n"
. "defaulting to using site specified in watch file";
$upstream_url = "$self->{sites}->[0]$newfile";
}
} else {
$upstream_url = "$self->{sites}->[0]$newfile";
}
} else {
# relative filename, we hope
# Were there any redirections? If so try using those first
if ($#{ $self->patterns } > 0) {
# replace $site here with the one we were redirected to
foreach my $index (0 .. $#{ $self->patterns }) {
# skip unless the basedir looks like a directory
next unless $self->{basedirs}->[$index] =~ m%/$%;
my $nf = "$self->{basedirs}->[$index]$newfile";
if ("$self->{sites}->[$index]$nf"
=~ m&^$self->{patterns}->[$index]$&) {
$upstream_url = "$self->{sites}->[$index]$nf";
last;
}
}
if (!defined($upstream_url)) {
uscan_verbose
"Unable to determine upstream url from redirections,\n"
. "defaulting to using site specified in watch file";
$upstream_url = "$self->{parse_result}->{urlbase}$newfile";
}
} else {
$upstream_url = "$self->{parse_result}->{urlbase}$newfile";
}
}
# mangle if necessary
    $upstream_url =~ s/&amp;/&/g;
uscan_verbose "Matching target for downloadurlmangle: $upstream_url";
if (@{ $self->downloadurlmangle }) {
if (
mangle(
$self->watchfile, \$self->line,
'downloadurlmangle:', \@{ $self->downloadurlmangle },
\$upstream_url
)
) {
$self->status(1);
return undef;
}
}
return $upstream_url;
}
sub http_newdir {
my ($https, $line, $site, $dir, $pattern, $dirversionmangle,
$watchfile, $lineptr, $download_version)
= @_;
my $downloader = $line->downloader;
my ($request, $response, $newdir);
my ($download_version_short1, $download_version_short2,
$download_version_short3)
= partial_version($download_version);
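    # partial_version() (Devscripts::Uscan::Utils) yields the leading
    # components of the requested version, e.g. for a hypothetical
    # download_version "1.2.3.4" the partials are "1", "1.2" and
    # "1.2.3"; each is accepted as a directory match below.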
my $base = $site . $dir;
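    # directory hrefs may or may not carry a trailing slash; accept both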
$pattern .= "/?";
if (defined($https) and !$downloader->ssl) {
uscan_die
"$progname: you must have the liblwp-protocol-https-perl package installed\n"
. "to use https URLs";
}
# At least for now, set base in the line object - other methods need it
local $line->parse_result->{base} = $base;
$request = HTTP::Request->new('GET', $base);
$response = $downloader->user_agent->request($request);
if (!$response->is_success) {
uscan_warn
"In watch file $watchfile, reading webpage\n $base failed: "
. $response->status_line;
return '';
}
my $content = $response->content;
if ( $response->header('Content-Encoding')
and $response->header('Content-Encoding') =~ /^gzip$/i) {
require IO::Uncompress::Gunzip;
require IO::String;
uscan_debug "content seems gzip encoded, let's decode it";
my $out;
if (IO::Uncompress::Gunzip::gunzip(IO::String->new($content), \$out)) {
$content = $out;
} else {
uscan_warn 'Unable to decode remote content: '
. $IO::Uncompress::GunzipError;
return '';
}
}
uscan_extra_debug
"received content:\n$content\n[End of received content] by HTTP";
clean_content(\$content);
my ($dirpatterns, $base_sites, $base_dirs)
= handle_redirection($line, $pattern, $base);
$downloader->user_agent->clear_redirections; # we won't be needing that
my @hrefs;
for my $parsed (
html_search($line, $content, $dirpatterns, 'dirversionmangle')) {
my ($priority, $mangled_version, $href, $match) = @$parsed;
$match = '';
if (defined $download_version
and $mangled_version eq $download_version) {
$match = "matched with the download version";
}
if (defined $download_version_short3
and $mangled_version eq $download_version_short3) {
$match = "matched with the download version (partial 3)";
}
if (defined $download_version_short2
and $mangled_version eq $download_version_short2) {
$match = "matched with the download version (partial 2)";
}
if (defined $download_version_short1
and $mangled_version eq $download_version_short1) {
$match = "matched with the download version (partial 1)";
}
push @hrefs, [$mangled_version, $href, $match];
}
    # keep only the entries for which $match was set in the loop above
my @vhrefs = grep { $$_[2] } @hrefs;
if (@vhrefs) {
@vhrefs = Devscripts::Versort::upstream_versort(@vhrefs);
$newdir = $vhrefs[0][1];
}
if (@hrefs) {
@hrefs = Devscripts::Versort::upstream_versort(@hrefs);
my $msg = "Found the following matching directories (newest first):\n";
foreach my $href (@hrefs) {
$msg .= " $$href[1] ($$href[0]) $$href[2]\n";
}
uscan_verbose $msg;
$newdir //= $hrefs[0][1];
} else {
uscan_warn
"In $watchfile,\n no matching hrefs for pattern\n $site$dir$pattern";
return '';
}
# just give the final directory component
$newdir =~ s%/$%%;
$newdir =~ s%^.*/%%;
return ($newdir);
}
# Nothing to clean here
sub http_clean { 0 }
sub clean_content {
my ($content) = @_;
# We need this horrid stuff to handle href=foo type
# links. OK, bad HTML, but we have to handle it nonetheless.
# It's bug #89749.
$$content =~ s/href\s*=\s*(?=[^\"\'])([^\s>]+)/href="$1"/ig;
# Strip comments
    $$content =~ s/<!-- .*?-->//sg;
return $content;
}
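# Illustrative sketch (hypothetical input): clean_content() turns
#   <a href=foo-1.0.tar.gz>  into  <a href="foo-1.0.tar.gz">
# and strips <!-- ... --> comments, so the href scanner in html_search()
# only has to handle quoted attributes.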
sub url_canonicalize_dots {
my ($base, $url) = @_;
if ($url !~ m{^[^:#?/]+://}) {
if ($url =~ m{^//}) {
$base =~ m{^[^:#?/]+:}
and $url = $& . $url;
} elsif ($url =~ m{^/}) {
$base =~ m{^[^:#?/]+://[^/#?]*}
and $url = $& . $url;
} else {
uscan_debug "Resolving urls with query part unimplemented"
if ($url =~ m/^[#?]/);
$base =~ m{^[^:#?/]+://[^/#?]*(?:/(?:[^#?/]*/)*)?} and do {
my $base_to_path = $&;
$base_to_path .= '/' unless $base_to_path =~ m|/$|;
$url = $base_to_path . $url;
};
}
}
$url =~ s{^([^:#?/]+://[^/#?]*)(/[^#?]*)}{
my ($h, $p) = ($1, $2);
$p =~ s{/\.(?:/|$|(?=[#?]))}{/}g;
1 while $p =~ s{/(?!\.\./)[^/]*/\.\.(?:/|(?=[#?])|$)}{/}g;
$h.$p;}e;
$url;
}
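# Illustrative sketch (hypothetical values):
#   url_canonicalize_dots("http://example.org/a/b/", "../c/./d")
# resolves to "http://example.org/a/c/d": the relative path is joined to
# the base directory, then "." and ".." segments are collapsed.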
sub html_search {
my ($self, $content, $patterns, $mangle) = @_;
    # pagemangle: should not abuse this slow operation
if (
mangle(
$self->watchfile, \$self->line,
'pagemangle:\n', [@{ $self->pagemangle }],
\$content
)
) {
return undef;
}
    if (   !$self->shared->{bare}
        and $content =~ m%^<[?]xml%i
        and $content =~ m%xmlns="http://s3.amazonaws.com/doc/2006-03-01/"%
        and $content !~ m%<Key><a\s+href%) {
        # this is an S3 bucket listing.  Insert an 'a href' tag
        # into the content for each 'Key', so that it looks like html (LP: #798293)
        uscan_warn
          "*** Amazon AWS special case code is deprecated***\nUse opts=pagemangle rule, instead";
        $content =~ s%<Key>([^<]*)</Key>%<Key><a href="$1">$1</a></Key>%g;
        uscan_extra_debug
          "processed content:\n$content\n[End of processed content] by Amazon AWS special case code";
    }
clean_content(\$content);
# Is there a base URL given?
if ($content =~ /<\s*base\s+[^>]*href\s*=\s*([\"\'])(.*?)\1/i) {
$self->parse_result->{urlbase}
= url_canonicalize_dots($self->parse_result->{base}, $2);
} else {
$self->parse_result->{urlbase} = $self->parse_result->{base};
}
uscan_extra_debug
"processed content:\n$content\n[End of processed content] by fix bad HTML code";
    # search hrefs in the web page to obtain a list of uversionmangled
    # versions and their matching download URLs
{
local $, = ',';
uscan_verbose "Matching pattern:\n @{$self->{patterns}}";
}
my @hrefs;
while ($content =~ m/<\s*a\s+[^>]*(?<=\s)href\s*=\s*([\"\'])(.*?)\1/sgi) {
my $href = $2;
$href = fix_href($href);
my $href_canonical
= url_canonicalize_dots($self->parse_result->{urlbase}, $href);
if (defined $self->hrefdecode) {
if ($self->hrefdecode eq 'percent-encoding') {
uscan_debug "... Decoding from href: $href";
$href =~ s/%([A-Fa-f\d]{2})/chr hex $1/eg;
$href_canonical =~ s/%([A-Fa-f\d]{2})/chr hex $1/eg;
} else {
uscan_warn "Illegal value for hrefdecode: "
. "$self->{hrefdecode}";
return undef;
}
}
uscan_extra_debug "Checking href $href";
foreach my $_pattern (@$patterns) {
if (my @match = $href =~ /^$_pattern$/) {
push @hrefs,
parse_href($self, $href_canonical, $_pattern, \@match,
$mangle);
}
uscan_extra_debug "Checking href $href_canonical";
if (my @match = $href_canonical =~ /^$_pattern$/) {
push @hrefs,
parse_href($self, $href_canonical, $_pattern, \@match,
$mangle);
}
}
}
return @hrefs;
}
sub plain_search {
my ($self, $content) = @_;
my @hrefs;
foreach my $_pattern (@{ $self->patterns }) {
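        # after the substitution, $1 is the whole matched href and $2 is
        # the first capture group inside the user's pattern (normally
        # the version)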
while ($content =~ s/.*?($_pattern)//) {
push @hrefs, $self->parse_href($1, $_pattern, $2);
}
}
$self->parse_result->{urlbase} = $self->parse_result->{base};
return @hrefs;
}
sub parse_href {
my ($self, $href, $_pattern, $match, $mangle) = @_;
$mangle //= 'uversionmangle';
my $mangled_version;
if ($self->watch_version == 2) {
# watch_version 2 only recognised one group; the code
# below will break version 2 watch files with a construction
# such as file-([\d\.]+(-\d+)?) (bug #327258)
$mangled_version
= ref $match eq 'ARRAY'
? $match->[0]
: $match;
} else {
# need the map { ... } here to handle cases of (...)?
# which may match but then return undef values
if ($self->versionless) {
# exception, otherwise $mangled_version = 1
$mangled_version = '';
} else {
$mangled_version = join(".",
map { $_ if defined($_) }
ref $match eq 'ARRAY' ? @$match : $href =~ m&^$_pattern$&);
}
if (
mangle(
$self->watchfile, \$self->line,
"$mangle:", \@{ $self->$mangle },
\$mangled_version
)
) {
return ();
}
}
$match = '';
if (defined $self->shared->{download_version}) {
if ($mangled_version eq $self->shared->{download_version}) {
$match = "matched with the download version";
}
}
my $priority = $mangled_version . '-' . get_priority($href);
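    # get_priority() (Devscripts::Uscan::Utils) ranks archive types so
    # that, for equal versions, preferred formats (e.g. tar.xz over
    # tar.gz) sort first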
return [$priority, $mangled_version, $href, $match];
}
1;