diff options
Diffstat (limited to 'lib/Devscripts/Uscan/http.pm')
-rw-r--r-- | lib/Devscripts/Uscan/http.pm | 434 |
1 files changed, 434 insertions, 0 deletions
diff --git a/lib/Devscripts/Uscan/http.pm b/lib/Devscripts/Uscan/http.pm new file mode 100644 index 0000000..95fc08a --- /dev/null +++ b/lib/Devscripts/Uscan/http.pm @@ -0,0 +1,434 @@ +package Devscripts::Uscan::http; + +use strict; +use Cwd qw/abs_path/; +use Devscripts::Uscan::Output; +use Devscripts::Uscan::Utils; +use Devscripts::Uscan::_xtp; +use Moo::Role; + +*http_newfile_base = \&Devscripts::Uscan::_xtp::_xtp_newfile_base; + +################################## +# search $newversion (http mode) +################################## +sub http_search { + my ($self) = @_; + + # $content: web page to be scraped to find the URLs to be downloaded + if (defined($1) and $self->downloader->ssl) { + uscan_die +"you must have the liblwp-protocol-https-perl package installed\nto use https URLs"; + } + uscan_verbose "Requesting URL:\n $self->{parse_result}->{base}"; + my $request = HTTP::Request->new('GET', $self->parse_result->{base}); + my $response = $self->downloader->user_agent->request($request); + if (!$response->is_success) { + uscan_warn +"In watchfile $self->{watchfile}, reading webpage\n $self->{parse_result}->{base} failed: " + . $response->status_line; + return undef; + } + + my @redirections = @{ $self->downloader->user_agent->get_redirections }; + + uscan_verbose "redirections: @redirections" if @redirections; + + foreach my $_redir (@redirections) { + my $base_dir = $_redir; + + $base_dir =~ s%^\w+://[^/]+/%/%; + if ($_redir =~ m%^(\w+://[^/]+)%) { + my $base_site = $1; + + push @{ $self->patterns }, + "(?:(?:$base_site)?" + . quotemeta($base_dir) + . ")?$self->{parse_result}->{filepattern}"; + push @{ $self->sites }, $base_site; + push @{ $self->basedirs }, $base_dir; + + # remove the filename, if any + my $base_dir_orig = $base_dir; + $base_dir =~ s%/[^/]*$%/%; + if ($base_dir ne $base_dir_orig) { + push @{ $self->patterns }, + "(?:(?:$base_site)?" + . quotemeta($base_dir) + . ")?$self->{parse_result}->{filepattern}"; + push @{ $self->sites }, $base_site; + push @{ $self->basedirs }, $base_dir; + } + } + } + + my $content = $response->decoded_content; + uscan_debug + "received content:\n$content\n[End of received content] by HTTP"; + + my @hrefs; + if (!$self->searchmode or $self->searchmode eq 'html') { + @hrefs = $self->html_search($content); + } elsif ($self->searchmode eq 'plain') { + @hrefs = $self->plain_search($content); + } else { + uscan_warn 'Unknown searchmode "' . $self->searchmode . '", skipping'; + return undef; + } + + if (@hrefs) { + @hrefs = Devscripts::Versort::versort(@hrefs); + my $msg + = "Found the following matching hrefs on the web page (newest first):\n"; + foreach my $href (@hrefs) { + $msg .= " $$href[2] ($$href[1]) index=$$href[0] $$href[3]\n"; + } + uscan_verbose $msg; + } + my ($newversion, $newfile); + if (defined $self->shared->{download_version}) { + + # extract ones which has $match in the above loop defined + my @vhrefs = grep { $$_[3] } @hrefs; + if (@vhrefs) { + (undef, $newversion, $newfile, undef) = @{ $vhrefs[0] }; + } else { + uscan_warn +"In $self->{watchfile} no matching hrefs for version $self->{shared}->{download_version}" + . " in watch line\n $self->{line}"; + return undef; + } + } else { + if (@hrefs) { + (undef, $newversion, $newfile, undef) = @{ $hrefs[0] }; + } else { + uscan_warn +"In $self->{watchfile} no matching files for watch line\n $self->{line}"; + return undef; + } + } + return ($newversion, $newfile); +} + +####################################################################### +# determine $upstream_url (http mode) +####################################################################### +# http is complicated due to absolute/relative URL issue +sub http_upstream_url { + my ($self) = @_; + my $upstream_url; + my $newfile = $self->search_result->{newfile}; + if ($newfile =~ m%^\w+://%) { + $upstream_url = $newfile; + } elsif ($newfile =~ m%^//%) { + $upstream_url = $self->parse_result->{site}; + $upstream_url =~ s/^(https?:).*/$1/; + $upstream_url .= $newfile; + } elsif ($newfile =~ m%^/%) { + + # absolute filename + # Were there any redirections? If so try using those first + if ($#{ $self->patterns } > 0) { + + # replace $site here with the one we were redirected to + foreach my $index (0 .. $#{ $self->patterns }) { + if ("$self->{sites}->[$index]$newfile" + =~ m&^$self->{patterns}->[$index]$&) { + $upstream_url = "$self->{sites}->[$index]$newfile"; + last; + } + } + if (!defined($upstream_url)) { + uscan_verbose + "Unable to determine upstream url from redirections,\n" + . "defaulting to using site specified in watch file"; + $upstream_url = "$self->{sites}->[0]$newfile"; + } + } else { + $upstream_url = "$self->{sites}->[0]$newfile"; + } + } else { + # relative filename, we hope + # Were there any redirections? If so try using those first + if ($#{ $self->patterns } > 0) { + + # replace $site here with the one we were redirected to + foreach my $index (0 .. $#{ $self->patterns }) { + + # skip unless the basedir looks like a directory + next unless $self->{basedirs}->[$index] =~ m%/$%; + my $nf = "$self->{basedirs}->[$index]$newfile"; + if ("$self->{sites}->[$index]$nf" + =~ m&^$self->{patterns}->[$index]$&) { + $upstream_url = "$self->{sites}->[$index]$nf"; + last; + } + } + if (!defined($upstream_url)) { + uscan_verbose + "Unable to determine upstream url from redirections,\n" + . "defaulting to using site specified in watch file"; + $upstream_url = "$self->{parse_result}->{urlbase}$newfile"; + } + } else { + $upstream_url = "$self->{parse_result}->{urlbase}$newfile"; + } + } + + # mangle if necessary + $upstream_url =~ s/&/&/g; + uscan_verbose "Matching target for downloadurlmangle: $upstream_url"; + if (@{ $self->downloadurlmangle }) { + if ( + mangle( + $self->watchfile, \$self->line, + 'downloadurlmangle:', \@{ $self->downloadurlmangle }, + \$upstream_url + ) + ) { + $self->status(1); + return undef; + } + } + return $upstream_url; +} + +sub http_newdir { + my ( + $https, $downloader, $site, + $dir, $pattern, $dirversionmangle, + $watchfile, $lineptr, $download_version + ) = @_; + + my ($request, $response, $newdir); + my ($download_version_short1, $download_version_short2, + $download_version_short3) + = partial_version($download_version); + my $base = $site . $dir; + + if (defined($https) and !$downloader->ssl) { + uscan_die +"$progname: you must have the liblwp-protocol-https-perl package installed\n" + . "to use https URLs"; + } + $request = HTTP::Request->new('GET', $base); + $response = $downloader->user_agent->request($request); + if (!$response->is_success) { + uscan_warn + "In watch file $watchfile, reading webpage\n $base failed: " + . $response->status_line; + return ''; + } + + my $content = $response->content; + uscan_debug + "received content:\n$content\n[End of received content] by HTTP"; + + clean_content(\$content); + + my $dirpattern = "(?:(?:$site)?" . quotemeta($dir) . ")?$pattern"; + + uscan_verbose "Matching pattern:\n $dirpattern"; + my @hrefs; + my $match = ''; + while ($content =~ m/<\s*a\s+[^>]*href\s*=\s*([\"\'])(.*?)\1/gi) { + my $href = fix_href($2); + uscan_verbose "Matching target for dirversionmangle: $href"; + if ($href =~ m&^$dirpattern/?$&) { + my $mangled_version + = join(".", map { $_ // '' } $href =~ m&^$dirpattern/?$&); + if ( + mangle( + $watchfile, $lineptr, + 'dirversionmangle:', \@{$dirversionmangle}, + \$mangled_version + ) + ) { + return 1; + } + $match = ''; + if (defined $download_version + and $mangled_version eq $download_version) { + $match = "matched with the download version"; + } + if (defined $download_version_short3 + and $mangled_version eq $download_version_short3) { + $match = "matched with the download version (partial 3)"; + } + if (defined $download_version_short2 + and $mangled_version eq $download_version_short2) { + $match = "matched with the download version (partial 2)"; + } + if (defined $download_version_short1 + and $mangled_version eq $download_version_short1) { + $match = "matched with the download version (partial 1)"; + } + push @hrefs, [$mangled_version, $href, $match]; + } + } + + # extract ones which has $match in the above loop defined + my @vhrefs = grep { $$_[2] } @hrefs; + if (@vhrefs) { + @vhrefs = Devscripts::Versort::upstream_versort(@vhrefs); + $newdir = $vhrefs[0][1]; + } + if (@hrefs) { + @hrefs = Devscripts::Versort::upstream_versort(@hrefs); + my $msg = "Found the following matching directories (newest first):\n"; + foreach my $href (@hrefs) { + $msg .= " $$href[1] ($$href[0]) $$href[2]\n"; + } + uscan_verbose $msg; + $newdir //= $hrefs[0][1]; + } else { + uscan_warn +"In $watchfile,\n no matching hrefs for pattern\n $site$dir$pattern"; + return ''; + } + + # just give the final directory component + $newdir =~ s%/$%%; + $newdir =~ s%^.*/%%; + return ($newdir); +} + +# Nothing to clean here +sub http_clean { 0 } + +sub clean_content { + my ($content) = @_; + + # We need this horrid stuff to handle href=foo type + # links. OK, bad HTML, but we have to handle it nonetheless. + # It's bug #89749. + $$content =~ s/href\s*=\s*(?=[^\"\'])([^\s>]+)/href="$1"/ig; + + # Strip comments + $$content =~ s/<!-- .*?-->//sg; + return $content; +} + +sub html_search { + my ($self, $content) = @_; + + # pagenmangle: should not abuse this slow operation + if ( + mangle( + $self->watchfile, \$self->line, + 'pagemangle:\n', [@{ $self->pagemangle }], + \$content + ) + ) { + return undef; + } + if ( !$self->shared->{bare} + and $content =~ m%^<[?]xml%i + and $content =~ m%xmlns="http://s3.amazonaws.com/doc/2006-03-01/"% + and $content !~ m%<Key><a\s+href%) { + # this is an S3 bucket listing. Insert an 'a href' tag + # into the content for each 'Key', so that it looks like html (LP: #798293) + uscan_warn +"*** Amazon AWS special case code is deprecated***\nUse opts=pagemangle rule, instead"; + $content =~ s%<Key>([^<]*)</Key>%<Key><a href="$1">$1</a></Key>%g; + uscan_debug +"processed content:\n$content\n[End of processed content] by Amazon AWS special case code"; + } + clean_content(\$content); + + # Is there a base URL given? + if ($content =~ /<\s*base\s+[^>]*href\s*=\s*([\"\'])(.*?)\1/i) { + + # Ensure it ends with / + $self->parse_result->{urlbase} = "$2/"; + $self->parse_result->{urlbase} =~ s%//$%/%; + } else { + # May have to strip a base filename + ($self->parse_result->{urlbase} = $self->parse_result->{base}) + =~ s%/[^/]*$%/%; + } + uscan_debug +"processed content:\n$content\n[End of processed content] by fix bad HTML code"; + +# search hrefs in web page to obtain a list of uversionmangled version and matching download URL + { + local $, = ','; + uscan_verbose "Matching pattern:\n @{$self->{patterns}}"; + } + my @hrefs; + while ($content =~ m/<\s*a\s+[^>]*(?<=\s)href\s*=\s*([\"\'])(.*?)\1/sgi) { + my $href = $2; + $href = fix_href($href); + if (defined $self->hrefdecode) { + if ($self->hrefdecode eq 'percent-encoding') { + uscan_debug "... Decoding from href: $href"; + $href =~ s/%([A-Fa-f\d]{2})/chr hex $1/eg; + } else { + uscan_warn "Illegal value for hrefdecode: " + . "$self->{hrefdecode}"; + return undef; + } + } + uscan_debug "Checking href $href"; + foreach my $_pattern (@{ $self->patterns }) { + if ($href =~ /^$_pattern$/) { + push @hrefs, $self->parse_href($href, $_pattern, $1); + } + } + } + return @hrefs; +} + +sub plain_search { + my ($self, $content) = @_; + my @hrefs; + foreach my $_pattern (@{ $self->patterns }) { + while ($content =~ s/.*?($_pattern)//) { + push @hrefs, $self->parse_href($1, $_pattern, $2); + } + } + return @hrefs; +} + +sub parse_href { + my ($self, $href, $_pattern, $match) = @_; + my $mangled_version; + if ($self->watch_version == 2) { + + # watch_version 2 only recognised one group; the code + # below will break version 2 watch files with a construction + # such as file-([\d\.]+(-\d+)?) (bug #327258) + $mangled_version = $match; + } else { + # need the map { ... } here to handle cases of (...)? + # which may match but then return undef values + if ($self->versionless) { + + # exception, otherwise $mangled_version = 1 + $mangled_version = ''; + } else { + $mangled_version + = join(".", map { $_ if defined($_) } $href =~ m&^$_pattern$&); + } + + if ( + mangle( + $self->watchfile, \$self->line, + 'uversionmangle:', \@{ $self->uversionmangle }, + \$mangled_version + ) + ) { + return (); + } + } + $match = ''; + if (defined $self->shared->{download_version}) { + if ($mangled_version eq $self->shared->{download_version}) { + $match = "matched with the download version"; + } + } + my $priority = $mangled_version . '-' . get_priority($href); + return [$priority, $mangled_version, $href, $match]; +} + +1; |