package Devscripts::Uscan::http;

use strict;
use Cwd qw/abs_path/;
use Devscripts::Uscan::Output;
use Devscripts::Uscan::Utils;
use Devscripts::Uscan::_xtp;
use Moo::Role;

*http_newfile_base = \&Devscripts::Uscan::_xtp::_xtp_newfile_base;
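# The glob alias above reuses the newfile_base computation from
# Devscripts::Uscan::_xtp, the helper shared by the http and ftp modes.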

##################################
# search $newversion (http mode)
##################################

# returns (\@patterns, \@base_sites, \@base_dirs)
sub handle_redirection {
    my ($self, $pattern, @additional_bases) = @_;
    my @redirections = @{ $self->downloader->user_agent->get_redirections };
    my (@patterns, @base_sites, @base_dirs);

    uscan_verbose "redirections: @redirections" if @redirections;

    foreach my $_redir (@redirections, @additional_bases) {
        my $base_dir = $_redir;

        $base_dir =~ s%^\w+://[^/]+/%/%;
        $base_dir =~ s%/[^/]*(?:[#?].*)?$%/%;
        if ($_redir =~ m%^(\w+://[^/]+)%) {
            my $base_site = $1;

            push @patterns,
              quotemeta($base_site) . quotemeta($base_dir) . "$pattern";
            push @base_sites, $base_site;
            push @base_dirs,  $base_dir;

            # remove the filename, if any
            my $base_dir_orig = $base_dir;
            $base_dir =~ s%/[^/]*$%/%;
            if ($base_dir ne $base_dir_orig) {
                push @patterns,
                  quotemeta($base_site) . quotemeta($base_dir) . "$pattern";
                push @base_sites, $base_site;
                push @base_dirs,  $base_dir;
            }
        }
    }
    return (\@patterns, \@base_sites, \@base_dirs);
}
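# Illustrative only (hypothetical URL): a recorded redirection to
# https://mirror.example.org/pub/foo/ contributes the pattern
#   quotemeta('https://mirror.example.org') . quotemeta('/pub/foo/') . $pattern
# plus the matching base site and base directory, so hrefs found on the
# redirected page can be matched and resolved later.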
sub http_search {
    my ($self) = @_;

    # $content: web page to be scraped to find the URLs to be downloaded
    if ($self->{parse_result}->{base} =~ /^https/ and !$self->downloader->ssl)
    {
        uscan_die
"you must have the liblwp-protocol-https-perl package installed\nto use https URLs";
    }
    uscan_verbose "Requesting URL:\n $self->{parse_result}->{base}";
    my $request = HTTP::Request->new('GET', $self->parse_result->{base});
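    # Per-host headers (typically populated from uscan's --http-header
    # option) are keyed as '<base URL>@<header name>'; hypothetical example:
    #   'https://example.org@X-Token' => 'secret'
    # The loop below attaches the header only when the request URL lives
    # under that base URL.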
    foreach my $k (keys %{ $self->downloader->headers }) {
        if ($k =~ /^(.*?)@(.*)$/) {
            my $baseUrl = $1;
            my $hdr     = $2;
            if ($self->parse_result->{base} =~ m#^\Q$baseUrl\E(?:/.*)?$#) {
                $request->header($hdr => $self->downloader->headers->{$k});
                uscan_verbose "Set per-host custom header $hdr for "
                  . $self->parse_result->{base};
            } else {
                uscan_debug
                  "$self->{parse_result}->{base} does not start with $baseUrl";
            }
        } else {
            uscan_warn "Malformed http-header: $k";
        }
    }
    $request->header('Accept-Encoding' => 'gzip');
    $request->header('Accept'          => '*/*');
    my $response = $self->downloader->user_agent->request($request);
    if (!$response->is_success) {
        uscan_warn
"In watch file $self->{watchfile}, reading webpage\n $self->{parse_result}->{base} failed: "
          . $response->status_line;
        return undef;
    }
    my ($patterns, $base_sites, $base_dirs)
      = handle_redirection($self, $self->{parse_result}->{filepattern});
    push @{ $self->patterns }, @$patterns;
    push @{ $self->sites },    @$base_sites;
    push @{ $self->basedirs }, @$base_dirs;

    my $content = $response->decoded_content;
    uscan_extra_debug
      "received content:\n$content\n[End of received content] by HTTP";

    my @hrefs;
    if (!$self->searchmode or $self->searchmode eq 'html') {
        @hrefs = $self->html_search($content, $self->patterns);
    } elsif ($self->searchmode eq 'plain') {
        @hrefs = $self->plain_search($content);
    } else {
        uscan_warn 'Unknown searchmode "' . $self->searchmode . '", skipping';
        return undef;
    }
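    # Each element of @hrefs is an array ref of the form
    #   [ $priority, $mangled_version, $url, $match_note ]
    # as built by parse_href(); versort() below orders them newest-first.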
    if (@hrefs) {
        @hrefs = Devscripts::Versort::versort(@hrefs);
        my $msg
          = "Found the following matching hrefs on the web page (newest first):\n";
        foreach my $href (@hrefs) {
            $msg .= " $$href[2] ($$href[1]) index=$$href[0] $$href[3]\n";
        }
        uscan_verbose $msg;
    }
    my ($newversion, $newfile);
    if (defined $self->shared->{download_version}
        and not $self->versionmode eq 'ignore') {

        # extract the entries whose $match was set in parse_href()
        my @vhrefs = grep { $$_[3] } @hrefs;
        if (@vhrefs) {
            (undef, $newversion, $newfile, undef) = @{ $vhrefs[0] };
        } else {
            uscan_warn
"In $self->{watchfile} no matching hrefs for version $self->{shared}->{download_version}"
              . " in watch line\n $self->{line}";
            return undef;
        }
    } else {
        if (@hrefs) {
            (undef, $newversion, $newfile, undef) = @{ $hrefs[0] };
        } else {
            uscan_warn
"In $self->{watchfile} no matching files for watch line\n $self->{line}";
            return undef;
        }
    }
    return ($newversion, $newfile);
}
#######################################################################
# determine $upstream_url (http mode)
#######################################################################
# http is complicated due to absolute/relative URL issues
sub http_upstream_url {
    my ($self) = @_;
    my $upstream_url;
    my $newfile = $self->search_result->{newfile};
    if ($newfile =~ m%^\w+://%) {
        $upstream_url = $newfile;
    } elsif ($newfile =~ m%^//%) {
        $upstream_url = $self->parse_result->{site};
        $upstream_url =~ s/^(https?:).*/$1/;
        $upstream_url .= $newfile;
    } elsif ($newfile =~ m%^/%) {

        # absolute filename
        # Were there any redirections? If so try using those first
        if ($#{ $self->patterns } > 0) {

            # replace $site here with the one we were redirected to
            foreach my $index (0 .. $#{ $self->patterns }) {
                if ("$self->{sites}->[$index]$newfile"
                    =~ m&^$self->{patterns}->[$index]$&) {
                    $upstream_url = "$self->{sites}->[$index]$newfile";
                    last;
                }
            }
            if (!defined($upstream_url)) {
                uscan_verbose
                  "Unable to determine upstream url from redirections,\n"
                  . "defaulting to using site specified in watch file";
                $upstream_url = "$self->{sites}->[0]$newfile";
            }
        } else {
            $upstream_url = "$self->{sites}->[0]$newfile";
        }
    } else {

        # relative filename, we hope
        # Were there any redirections? If so try using those first
        if ($#{ $self->patterns } > 0) {

            # replace $site here with the one we were redirected to
            foreach my $index (0 .. $#{ $self->patterns }) {

                # skip unless the basedir looks like a directory
                next unless $self->{basedirs}->[$index] =~ m%/$%;
                my $nf = "$self->{basedirs}->[$index]$newfile";
                if ("$self->{sites}->[$index]$nf"
                    =~ m&^$self->{patterns}->[$index]$&) {
                    $upstream_url = "$self->{sites}->[$index]$nf";
                    last;
                }
            }
            if (!defined($upstream_url)) {
                uscan_verbose
                  "Unable to determine upstream url from redirections,\n"
                  . "defaulting to using site specified in watch file";
                $upstream_url = "$self->{parse_result}->{urlbase}$newfile";
            }
        } else {
            $upstream_url = "$self->{parse_result}->{urlbase}$newfile";
        }
    }

    # mangle if necessary
    $upstream_url =~ s/&amp;/&/g;
    uscan_verbose "Matching target for downloadurlmangle: $upstream_url";
    if (@{ $self->downloadurlmangle }) {
        if (
            mangle(
                $self->watchfile, \$self->line,
                'downloadurlmangle:', \@{ $self->downloadurlmangle },
                \$upstream_url
            )
        ) {
            $self->status(1);
            return undef;
        }
    }
    return $upstream_url;
}
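# Illustrative only (hypothetical values): with site 'https://example.org'
# and a scheme-relative newfile '//cdn.example.org/foo-1.0.tar.gz', the
# second branch above keeps only 'https:' from the site and produces
# 'https://cdn.example.org/foo-1.0.tar.gz'.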
sub http_newdir {
    my ($https, $line, $site, $dir, $pattern, $dirversionmangle,
        $watchfile, $lineptr, $download_version)
      = @_;

    my $downloader = $line->downloader;
    my ($request, $response, $newdir);
    my ($download_version_short1, $download_version_short2,
        $download_version_short3)
      = partial_version($download_version);
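    # partial_version() (from Devscripts::Uscan::Utils) derives shortened
    # prefixes of the requested version (e.g., hypothetically, '1.2.3'
    # yields '1', '1.2' and '1.2.3') so version directories named after a
    # partial version can still match below.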
    my $base = $site . $dir;

    $pattern .= "/?";

    if (defined($https) and !$downloader->ssl) {
        uscan_die
"$progname: you must have the liblwp-protocol-https-perl package installed\n"
          . "to use https URLs";
    }
    # At least for now, set base in the line object - other methods need it
    local $line->parse_result->{base} = $base;
    $request  = HTTP::Request->new('GET', $base);
    $response = $downloader->user_agent->request($request);
    if (!$response->is_success) {
        uscan_warn
          "In watch file $watchfile, reading webpage\n $base failed: "
          . $response->status_line;
        return '';
    }

    my $content = $response->content;
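    # NB: unlike http_search() above, this uses the raw ->content rather
    # than ->decoded_content, so gzip-encoded responses are inflated by
    # hand below.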
    if ( $response->header('Content-Encoding')
        and $response->header('Content-Encoding') =~ /^gzip$/i) {
        require IO::Uncompress::Gunzip;
        require IO::String;
        uscan_debug "content seems gzip encoded, let's decode it";
        my $out;
        if (IO::Uncompress::Gunzip::gunzip(IO::String->new($content), \$out)) {
            $content = $out;
        } else {
            uscan_warn 'Unable to decode remote content: '
              . $IO::Uncompress::Gunzip::GunzipError;
            return '';
        }
    }
    uscan_extra_debug
      "received content:\n$content\n[End of received content] by HTTP";

    clean_content(\$content);

    my ($dirpatterns, $base_sites, $base_dirs)
      = handle_redirection($line, $pattern, $base);
    $downloader->user_agent->clear_redirections;    # we won't be needing that

    my @hrefs;
    for my $parsed (
        html_search($line, $content, $dirpatterns, 'dirversionmangle')) {
        my ($priority, $mangled_version, $href, $match) = @$parsed;
        $match = '';
        if (defined $download_version
            and $mangled_version eq $download_version) {
            $match = "matched with the download version";
        }
        if (defined $download_version_short3
            and $mangled_version eq $download_version_short3) {
            $match = "matched with the download version (partial 3)";
        }
        if (defined $download_version_short2
            and $mangled_version eq $download_version_short2) {
            $match = "matched with the download version (partial 2)";
        }
        if (defined $download_version_short1
            and $mangled_version eq $download_version_short1) {
            $match = "matched with the download version (partial 1)";
        }
        push @hrefs, [$mangled_version, $href, $match];
    }

    # extract the entries whose $match was set in the loop above
    my @vhrefs = grep { $$_[2] } @hrefs;
    if (@vhrefs) {
        @vhrefs = Devscripts::Versort::upstream_versort(@vhrefs);
        $newdir = $vhrefs[0][1];
    }
    if (@hrefs) {
        @hrefs = Devscripts::Versort::upstream_versort(@hrefs);
        my $msg = "Found the following matching directories (newest first):\n";
        foreach my $href (@hrefs) {
            $msg .= " $$href[1] ($$href[0]) $$href[2]\n";
        }
        uscan_verbose $msg;
        $newdir //= $hrefs[0][1];
    } else {
        uscan_warn
          "In $watchfile,\n no matching hrefs for pattern\n $site$dir$pattern";
        return '';
    }

    # just give the final directory component
    $newdir =~ s%/$%%;
    $newdir =~ s%^.*/%%;
    return ($newdir);
}
# Nothing to clean here
sub http_clean { 0 }

sub clean_content {
    my ($content) = @_;

    # We need this horrid stuff to handle href=foo type
    # links. OK, bad HTML, but we have to handle it nonetheless.
    # It's bug #89749.
    $$content =~ s/href\s*=\s*(?=[^\"\'])([^\s>]+)/href="$1"/ig;
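    # e.g. a bare  href=foo-1.0.tar.gz  becomes  href="foo-1.0.tar.gz"
    # (hypothetical file name) so the quoted-href scanner in html_search()
    # can pick it up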
    # Strip comments
    $$content =~ s/<!-- .*?-->//sg;
    return $content;
}
sub url_canonicalize_dots {
    my ($base, $url) = @_;

    if ($url !~ m{^[^:#?/]+://}) {
        if ($url =~ m{^//}) {
            $base =~ m{^[^:#?/]+:}
              and $url = $& . $url;
        } elsif ($url =~ m{^/}) {
            $base =~ m{^[^:#?/]+://[^/#?]*}
              and $url = $& . $url;
        } else {
            uscan_debug "Resolving urls with query part unimplemented"
              if ($url =~ m/^[#?]/);
            $base =~ m{^[^:#?/]+://[^/#?]*(?:/(?:[^#?/]*/)*)?} and do {
                my $base_to_path = $&;
                $base_to_path .= '/' unless $base_to_path =~ m|/$|;
                $url = $base_to_path . $url;
            };
        }
    }
    $url =~ s{^([^:#?/]+://[^/#?]*)(/[^#?]*)}{
        my ($h, $p) = ($1, $2);
        $p =~ s{/\.(?:/|$|(?=[#?]))}{/}g;
        1 while $p =~ s{/(?!\.\./)[^/]*/\.\.(?:/|(?=[#?])|$)}{/}g;
        $h.$p;}e;
    $url;
}
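# Illustrative only (hypothetical URLs):
#   url_canonicalize_dots('http://example.org/a/b/', '../c.tar.gz')
#     => 'http://example.org/a/c.tar.gz'
#   url_canonicalize_dots('http://example.org/a/', '//cdn.example.org/c')
#     => 'http://cdn.example.org/c'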
sub html_search {
    my ($self, $content, $patterns, $mangle) = @_;

    # pagemangle: should not abuse this slow operation
    if (
        mangle(
            $self->watchfile, \$self->line,
            'pagemangle:\n', [@{ $self->pagemangle }],
            \$content
        )
    ) {
        return undef;
    }
    if ( !$self->shared->{bare}
        and $content =~ m%^<[?]xml%i
        and $content =~ m%xmlns="http://s3.amazonaws.com/doc/2006-03-01/"%
        and $content !~ m%<Key><a\s+href%) {
        # this is an S3 bucket listing. Insert an 'a href' tag
        # into the content for each 'Key', so that it looks like html (LP: #798293)
        uscan_warn
"*** Amazon AWS special case code is deprecated ***\nUse opts=pagemangle rule instead";
        $content =~ s%<Key>([^<]*)</Key>%<Key><a href="$1">$1</a></Key>%g;
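        # e.g. <Key>foo-1.0.tar.gz</Key> (hypothetical) becomes
        # <Key><a href="foo-1.0.tar.gz">foo-1.0.tar.gz</a></Key>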
        uscan_extra_debug
"processed content:\n$content\n[End of processed content] by Amazon AWS special case code";
    }
    clean_content(\$content);

    # Is there a base URL given?
    if ($content =~ /<\s*base\s+[^>]*href\s*=\s*([\"\'])(.*?)\1/i) {
        $self->parse_result->{urlbase}
          = url_canonicalize_dots($self->parse_result->{base}, $2);
    } else {
        $self->parse_result->{urlbase} = $self->parse_result->{base};
    }
    uscan_extra_debug
      "processed content:\n$content\n[End of processed content] by fix bad HTML code";

    # search hrefs in web page to obtain a list of uversionmangled version and matching download URL
    {
        local $, = ',';
        uscan_verbose "Matching pattern:\n @{$self->{patterns}}";
    }
    my @hrefs;
    while ($content =~ m/<\s*a\s+[^>]*(?<=\s)href\s*=\s*([\"\'])(.*?)\1/sgi) {
        my $href = $2;
        $href = fix_href($href);
        my $href_canonical
          = url_canonicalize_dots($self->parse_result->{urlbase}, $href);
        if (defined $self->hrefdecode) {
            if ($self->hrefdecode eq 'percent-encoding') {
                uscan_debug "... Decoding from href: $href";
                $href           =~ s/%([A-Fa-f\d]{2})/chr hex $1/eg;
                $href_canonical =~ s/%([A-Fa-f\d]{2})/chr hex $1/eg;
            } else {
                uscan_warn "Illegal value for hrefdecode: "
                  . "$self->{hrefdecode}";
                return undef;
            }
        }
        uscan_extra_debug "Checking href $href";
        foreach my $_pattern (@$patterns) {
            if (my @match = $href =~ /^$_pattern$/) {
                push @hrefs,
                  parse_href($self, $href_canonical, $_pattern, \@match,
                    $mangle);
            }
            uscan_extra_debug "Checking href $href_canonical";
            if (my @match = $href_canonical =~ /^$_pattern$/) {
                push @hrefs,
                  parse_href($self, $href_canonical, $_pattern, \@match,
                    $mangle);
            }
        }
    }
    return @hrefs;
}
sub plain_search {
    my ($self, $content) = @_;
    my @hrefs;
    foreach my $_pattern (@{ $self->patterns }) {
        while ($content =~ s/.*?($_pattern)//) {
            push @hrefs, $self->parse_href($1, $_pattern, $2);
        }
    }
    $self->parse_result->{urlbase} = $self->parse_result->{base};
    return @hrefs;
}
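# searchmode=plain matches each pattern against the raw page body instead of
# extracted hrefs, which is useful when the page is not HTML (e.g. a JSON
# index); the whole matched URL ($1) is handed to parse_href() together with
# $2, the first capture group inside the pattern (usually the version).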
sub parse_href {
    my ($self, $href, $_pattern, $match, $mangle) = @_;
    $mangle //= 'uversionmangle';

    my $mangled_version;
    if ($self->watch_version == 2) {

        # watch_version 2 only recognised one group; the code
        # below will break version 2 watch files with a construction
        # such as file-([\d\.]+(-\d+)?) (bug #327258)
        $mangled_version
          = ref $match eq 'ARRAY'
          ? $match->[0]
          : $match;
    } else {
        # need the map { ... } here to handle cases of (...)?
        # which may match but then return undef values
        if ($self->versionless) {

            # exception, otherwise $mangled_version = 1
            $mangled_version = '';
        } else {
            $mangled_version = join(".",
                map { $_ if defined($_) }
                  ref $match eq 'ARRAY' ? @$match : $href =~ m&^$_pattern$&);
        }

        if (
            mangle(
                $self->watchfile, \$self->line,
                "$mangle:", \@{ $self->$mangle },
                \$mangled_version
            )
        ) {
            return ();
        }
    }
    $match = '';
    if (defined $self->shared->{download_version}) {
        if ($mangled_version eq $self->shared->{download_version}) {
            $match = "matched with the download version";
        }
    }
    my $priority = $mangled_version . '-' . get_priority($href);
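    # Assumption: get_priority() (from Devscripts::Uscan::Utils) ranks the
    # archive type of $href, so the combined key lets versort() break ties
    # between identical versions by preferring better archive formats.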
    return [$priority, $mangled_version, $href, $match];
}

1;