Diffstat (limited to 'lib/Devscripts/Uscan/http.pm')
-rw-r--r-- | lib/Devscripts/Uscan/http.pm | 509
1 file changed, 509 insertions(+), 0 deletions(-)
diff --git a/lib/Devscripts/Uscan/http.pm b/lib/Devscripts/Uscan/http.pm
new file mode 100644
index 0000000..58aaa4c
--- /dev/null
+++ b/lib/Devscripts/Uscan/http.pm
@@ -0,0 +1,509 @@
+package Devscripts::Uscan::http;
+
+use strict;
+use Cwd qw/abs_path/;
+use Devscripts::Uscan::Output;
+use Devscripts::Uscan::Utils;
+use Devscripts::Uscan::_xtp;
+use Moo::Role;
+
+*http_newfile_base = \&Devscripts::Uscan::_xtp::_xtp_newfile_base;
+
+##################################
+# search $newversion (http mode)
+##################################
+
+# returns (\@patterns, \@base_sites, \@base_dirs)
+sub handle_redirection {
+    my ($self, $pattern, @additional_bases) = @_;
+    my @redirections = @{ $self->downloader->user_agent->get_redirections };
+    my (@patterns, @base_sites, @base_dirs);
+
+    uscan_verbose "redirections: @redirections" if @redirections;
+
+    foreach my $_redir (@redirections, @additional_bases) {
+        my $base_dir = $_redir;
+
+        $base_dir =~ s%^\w+://[^/]+/%/%;
+        $base_dir =~ s%/[^/]*(?:[#?].*)?$%/%;
+        if ($_redir =~ m%^(\w+://[^/]+)%) {
+            my $base_site = $1;
+
+            push @patterns,
+              quotemeta($base_site) . quotemeta($base_dir) . "$pattern";
+            push @base_sites, $base_site;
+            push @base_dirs,  $base_dir;
+
+            # remove the filename, if any
+            my $base_dir_orig = $base_dir;
+            $base_dir =~ s%/[^/]*$%/%;
+            if ($base_dir ne $base_dir_orig) {
+                push @patterns,
+                  quotemeta($base_site) . quotemeta($base_dir) . "$pattern";
+                push @base_sites, $base_site;
+                push @base_dirs,  $base_dir;
+            }
+        }
+    }
+    return (\@patterns, \@base_sites, \@base_dirs);
+}
+
+sub http_search {
+    my ($self) = @_;
+
+    # $content: web page to be scraped to find the URLs to be downloaded
+    if ($self->{parse_result}->{base} =~ /^https/ and !$self->downloader->ssl)
+    {
+        uscan_die
+"you must have the liblwp-protocol-https-perl package installed\nto use https URLs";
+    }
+    uscan_verbose "Requesting URL:\n   $self->{parse_result}->{base}";
+    my $request = HTTP::Request->new('GET', $self->parse_result->{base});
+    foreach my $k (keys %{ $self->downloader->headers }) {
+        if ($k =~ /^(.*?)@(.*)$/) {
+            my $baseUrl = $1;
+            my $hdr     = $2;
+            if ($self->parse_result->{base} =~ m#^\Q$baseUrl\E(?:/.*)?$#) {
+                $request->header($hdr => $self->downloader->headers->{$k});
+                uscan_verbose "Set per-host custom header $hdr for "
+                  . $self->parse_result->{base};
+            } else {
+                uscan_debug
+                  "$self->{parse_result}->{base} does not start with $baseUrl";
+            }
+        } else {
+            uscan_warn "Malformed http-header: $k";
+        }
+    }
+    $request->header('Accept-Encoding' => 'gzip');
+    $request->header('Accept'          => '*/*');
+    my $response = $self->downloader->user_agent->request($request);
+    if (!$response->is_success) {
+        uscan_warn
+"In watchfile $self->{watchfile}, reading webpage\n  $self->{parse_result}->{base} failed: "
+          . $response->status_line;
+        return undef;
+    }
+
+    my ($patterns, $base_sites, $base_dirs)
+      = handle_redirection($self, $self->{parse_result}->{filepattern});
+    push @{ $self->patterns }, @$patterns;
+    push @{ $self->sites },    @$base_sites;
+    push @{ $self->basedirs }, @$base_dirs;
+
+    my $content = $response->decoded_content;
+    uscan_extra_debug
+      "received content:\n$content\n[End of received content] by HTTP";
+
+    my @hrefs;
+    if (!$self->searchmode or $self->searchmode eq 'html') {
+        @hrefs = $self->html_search($content, $self->patterns);
+    } elsif ($self->searchmode eq 'plain') {
+        @hrefs = $self->plain_search($content);
+    } else {
+        uscan_warn 'Unknown searchmode "' . $self->searchmode . '", skipping';
+        return undef;
+    }
+
+    if (@hrefs) {
+        @hrefs = Devscripts::Versort::versort(@hrefs);
+        my $msg
+          = "Found the following matching hrefs on the web page (newest first):\n";
+        foreach my $href (@hrefs) {
+            $msg .= "   $$href[2] ($$href[1]) index=$$href[0] $$href[3]\n";
+        }
+        uscan_verbose $msg;
+    }
+    my ($newversion, $newfile);
+    if (defined $self->shared->{download_version}
+        and not $self->versionmode eq 'ignore') {
+
+        # extract the ones that had $match set in the loop above
+        my @vhrefs = grep { $$_[3] } @hrefs;
+        if (@vhrefs) {
+            (undef, $newversion, $newfile, undef) = @{ $vhrefs[0] };
+        } else {
+            uscan_warn
+"In $self->{watchfile} no matching hrefs for version $self->{shared}->{download_version}"
+              . " in watch line\n  $self->{line}";
+            return undef;
+        }
+    } else {
+        if (@hrefs) {
+            (undef, $newversion, $newfile, undef) = @{ $hrefs[0] };
+        } else {
+            uscan_warn
+"In $self->{watchfile} no matching files for watch line\n  $self->{line}";
+            return undef;
+        }
+    }
+    return ($newversion, $newfile);
+}
+
+#######################################################################
+# determine $upstream_url (http mode)
+#######################################################################
+# http is complicated due to absolute/relative URL issues
+sub http_upstream_url {
+    my ($self) = @_;
+    my $upstream_url;
+    my $newfile = $self->search_result->{newfile};
+    if ($newfile =~ m%^\w+://%) {
+        $upstream_url = $newfile;
+    } elsif ($newfile =~ m%^//%) {
+        $upstream_url = $self->parse_result->{site};
+        $upstream_url =~ s/^(https?:).*/$1/;
+        $upstream_url .= $newfile;
+    } elsif ($newfile =~ m%^/%) {
+
+        # absolute filename
+        # Were there any redirections? If so, try using those first
+        if ($#{ $self->patterns } > 0) {
+
+            # replace $site here with the one we were redirected to
+            foreach my $index (0 .. $#{ $self->patterns }) {
+                if ("$self->{sites}->[$index]$newfile"
+                    =~ m&^$self->{patterns}->[$index]$&) {
+                    $upstream_url = "$self->{sites}->[$index]$newfile";
+                    last;
+                }
+            }
+            if (!defined($upstream_url)) {
+                uscan_verbose
+                  "Unable to determine upstream url from redirections,\n"
+                  . "defaulting to using site specified in watch file";
+                $upstream_url = "$self->{sites}->[0]$newfile";
+            }
+        } else {
+            $upstream_url = "$self->{sites}->[0]$newfile";
+        }
+    } else {
+
+        # relative filename, we hope
+        # Were there any redirections? If so, try using those first
+        if ($#{ $self->patterns } > 0) {
+
+            # replace $site here with the one we were redirected to
+            foreach my $index (0 .. $#{ $self->patterns }) {
+
+                # skip unless the basedir looks like a directory
+                next unless $self->{basedirs}->[$index] =~ m%/$%;
+                my $nf = "$self->{basedirs}->[$index]$newfile";
+                if ("$self->{sites}->[$index]$nf"
+                    =~ m&^$self->{patterns}->[$index]$&) {
+                    $upstream_url = "$self->{sites}->[$index]$nf";
+                    last;
+                }
+            }
+            if (!defined($upstream_url)) {
+                uscan_verbose
+                  "Unable to determine upstream url from redirections,\n"
+                  . "defaulting to using site specified in watch file";
+                $upstream_url = "$self->{parse_result}->{urlbase}$newfile";
+            }
+        } else {
+            $upstream_url = "$self->{parse_result}->{urlbase}$newfile";
+        }
+    }
+
+    # mangle if necessary
+    $upstream_url =~ s/&amp;/&/g;
+    uscan_verbose "Matching target for downloadurlmangle: $upstream_url";
+    if (@{ $self->downloadurlmangle }) {
+        if (
+            mangle(
+                $self->watchfile,     \$self->line,
+                'downloadurlmangle:', \@{ $self->downloadurlmangle },
+                \$upstream_url
+            )
+        ) {
+            $self->status(1);
+            return undef;
+        }
+    }
+    return $upstream_url;
+}
+
+sub http_newdir {
+    my ($https, $line, $site, $dir, $pattern, $dirversionmangle,
+        $watchfile, $lineptr, $download_version)
+      = @_;
+
+    my $downloader = $line->downloader;
+    my ($request, $response, $newdir);
+    my ($download_version_short1, $download_version_short2,
+        $download_version_short3)
+      = partial_version($download_version);
+    my $base = $site . $dir;
+
+    $pattern .= "/?";
+
+    if (defined($https) and !$downloader->ssl) {
+        uscan_die
+"$progname: you must have the liblwp-protocol-https-perl package installed\n"
+          . "to use https URLs";
+    }
+
+    # At least for now, set base in the line object - other methods need it
+    local $line->parse_result->{base} = $base;
+    $request  = HTTP::Request->new('GET', $base);
+    $response = $downloader->user_agent->request($request);
+    if (!$response->is_success) {
+        uscan_warn
+          "In watch file $watchfile, reading webpage\n  $base failed: "
+          . $response->status_line;
+        return '';
+    }
+
+    my $content = $response->content;
+    if (    $response->header('Content-Encoding')
+        and $response->header('Content-Encoding') =~ /^gzip$/i) {
+        require IO::Uncompress::Gunzip;
+        require IO::String;
+        uscan_debug "content seems gzip encoded, let's decode it";
+        my $out;
+        if (IO::Uncompress::Gunzip::gunzip(IO::String->new($content), \$out)) {
+            $content = $out;
+        } else {
+            uscan_warn 'Unable to decode remote content: '
+              . $IO::Uncompress::Gunzip::GunzipError;
+            return '';
+        }
+    }
+    uscan_extra_debug
+      "received content:\n$content\n[End of received content] by HTTP";
+
+    clean_content(\$content);
+
+    my ($dirpatterns, $base_sites, $base_dirs)
+      = handle_redirection($line, $pattern, $base);
+    $downloader->user_agent->clear_redirections;    # we won't be needing that
+
+    my @hrefs;
+    for my $parsed (
+        html_search($line, $content, $dirpatterns, 'dirversionmangle')) {
+        my ($priority, $mangled_version, $href, $match) = @$parsed;
+        $match = '';
+        if (defined $download_version
+            and $mangled_version eq $download_version) {
+            $match = "matched with the download version";
+        }
+        if (defined $download_version_short3
+            and $mangled_version eq $download_version_short3) {
+            $match = "matched with the download version (partial 3)";
+        }
+        if (defined $download_version_short2
+            and $mangled_version eq $download_version_short2) {
+            $match = "matched with the download version (partial 2)";
+        }
+        if (defined $download_version_short1
+            and $mangled_version eq $download_version_short1) {
+            $match = "matched with the download version (partial 1)";
+        }
+        push @hrefs, [$mangled_version, $href, $match];
+    }
+
+    # extract the ones that had $match set in the loop above
+    my @vhrefs = grep { $$_[2] } @hrefs;
+    if (@vhrefs) {
+        @vhrefs = Devscripts::Versort::upstream_versort(@vhrefs);
+        $newdir = $vhrefs[0][1];
+    }
+    if (@hrefs) {
+        @hrefs = Devscripts::Versort::upstream_versort(@hrefs);
+        my $msg = "Found the following matching directories (newest first):\n";
+        foreach my $href (@hrefs) {
+            $msg .= "   $$href[1] ($$href[0]) $$href[2]\n";
+        }
+        uscan_verbose $msg;
+        $newdir //= $hrefs[0][1];
+    } else {
+        uscan_warn
+"In $watchfile,\n no matching hrefs for pattern\n  $site$dir$pattern";
+        return '';
+    }
+
+    # just give the final directory component
+    $newdir =~ s%/$%%;
+    $newdir =~ s%^.*/%%;
+    return ($newdir);
+}
+
+# Nothing to clean here
+sub http_clean { 0 }
+
+sub clean_content {
+    my ($content) = @_;
+
+    # We need this horrid stuff to handle href=foo type
+    # links. OK, bad HTML, but we have to handle it nonetheless.
+    # It's bug #89749.
+    $$content =~ s/href\s*=\s*(?=[^\"\'])([^\s>]+)/href="$1"/ig;
+
+    # Strip comments
+    $$content =~ s/<!-- .*?-->//sg;
+    return $content;
+}
+
+sub url_canonicalize_dots {
+    my ($base, $url) = @_;
+
+    if ($url !~ m{^[^:#?/]+://}) {
+        if ($url =~ m{^//}) {
+            $base =~ m{^[^:#?/]+:}
+              and $url = $& . $url;
+        } elsif ($url =~ m{^/}) {
+            $base =~ m{^[^:#?/]+://[^/#?]*}
+              and $url = $& . $url;
+        } else {
+            uscan_debug "Resolving urls with a query part is unimplemented"
+              if ($url =~ m/^[#?]/);
+            $base =~ m{^[^:#?/]+://[^/#?]*(?:/(?:[^#?/]*/)*)?} and do {
+                my $base_to_path = $&;
+                $base_to_path .= '/' unless $base_to_path =~ m|/$|;
+                $url = $base_to_path . $url;
+            };
+        }
+    }
+    $url =~ s{^([^:#?/]+://[^/#?]*)(/[^#?]*)}{
+        my ($h, $p) = ($1, $2);
+        $p =~ s{/\.(?:/|$|(?=[#?]))}{/}g;
+        1 while $p =~ s{/(?!\.\./)[^/]*/\.\.(?:/|(?=[#?])|$)}{/}g;
+        $h.$p;}e;
+    $url;
+}
+
+sub html_search {
+    my ($self, $content, $patterns, $mangle) = @_;
+
+    # pagemangle: should not abuse this slow operation
+    if (
+        mangle(
+            $self->watchfile, \$self->line,
+            'pagemangle:\n',  [@{ $self->pagemangle }],
+            \$content
+        )
+    ) {
+        return undef;
+    }
+    if (   !$self->shared->{bare}
+        and $content =~ m%^<[?]xml%i
+        and $content =~ m%xmlns="http://s3.amazonaws.com/doc/2006-03-01/"%
+        and $content !~ m%<Key><a\s+href%) {
+
+        # this is an S3 bucket listing.  Insert an 'a href' tag into the
+        # content for each 'Key', so that it looks like html (LP: #798293)
+        uscan_warn
+"*** Amazon AWS special case code is deprecated ***\nUse an opts=pagemangle rule instead";
+        $content =~ s%<Key>([^<]*)</Key>%<Key><a href="$1">$1</a></Key>%g;
+        uscan_extra_debug
+"processed content:\n$content\n[End of processed content] by Amazon AWS special case code";
+    }
+    clean_content(\$content);
+
+    # Is there a base URL given?
+    if ($content =~ /<\s*base\s+[^>]*href\s*=\s*([\"\'])(.*?)\1/i) {
+        $self->parse_result->{urlbase}
+          = url_canonicalize_dots($self->parse_result->{base}, $2);
+    } else {
+        $self->parse_result->{urlbase} = $self->parse_result->{base};
+    }
+    uscan_extra_debug
+"processed content:\n$content\n[End of processed content] by fix bad HTML code";
+
+    # search hrefs in the web page to obtain a list of uversionmangled
+    # versions and the matching download URLs
+    {
+        local $, = ',';
+        uscan_verbose "Matching pattern:\n   @{$self->{patterns}}";
+    }
+    my @hrefs;
+    while ($content =~ m/<\s*a\s+[^>]*(?<=\s)href\s*=\s*([\"\'])(.*?)\1/sgi) {
+        my $href = $2;
+        $href = fix_href($href);
+        my $href_canonical
+          = url_canonicalize_dots($self->parse_result->{urlbase}, $href);
+        if (defined $self->hrefdecode) {
+            if ($self->hrefdecode eq 'percent-encoding') {
+                uscan_debug "... Decoding from href: $href";
+                $href           =~ s/%([A-Fa-f\d]{2})/chr hex $1/eg;
+                $href_canonical =~ s/%([A-Fa-f\d]{2})/chr hex $1/eg;
+            } else {
+                uscan_warn "Illegal value for hrefdecode: "
+                  . "$self->{hrefdecode}";
+                return undef;
+            }
+        }
+        uscan_extra_debug "Checking href $href";
+        foreach my $_pattern (@$patterns) {
+            if (my @match = $href =~ /^$_pattern$/) {
+                push @hrefs,
+                  parse_href($self, $href_canonical, $_pattern, \@match,
+                    $mangle);
+            }
+            uscan_extra_debug "Checking href $href_canonical";
+            if (my @match = $href_canonical =~ /^$_pattern$/) {
+                push @hrefs,
+                  parse_href($self, $href_canonical, $_pattern, \@match,
+                    $mangle);
+            }
+        }
+    }
+    return @hrefs;
+}
+
+sub plain_search {
+    my ($self, $content) = @_;
+    my @hrefs;
+    foreach my $_pattern (@{ $self->patterns }) {
+        while ($content =~ s/.*?($_pattern)//) {
+            push @hrefs, $self->parse_href($1, $_pattern, $2);
+        }
+    }
+    return @hrefs;
+}
+
+sub parse_href {
+    my ($self, $href, $_pattern, $match, $mangle) = @_;
+    $mangle //= 'uversionmangle';
+
+    my $mangled_version;
+    if ($self->watch_version == 2) {
+
+        # watch_version 2 only recognised one group; the code
+        # below will break version 2 watch files with a construction
+        # such as file-([\d\.]+(-\d+)?) (bug #327258)
+        $mangled_version
+          = ref $match eq 'ARRAY'
+          ? $match->[0]
+          : $match;
+    } else {
+
+        # need the map { ... } here to handle cases of (...)?
+        # which may match but then return undef values
+        if ($self->versionless) {
+
+            # exception, otherwise $mangled_version = 1
+            $mangled_version = '';
+        } else {
+            $mangled_version = join(".",
+                map { $_ if defined($_) }
+                  ref $match eq 'ARRAY' ? @$match : $href =~ m&^$_pattern$&);
+        }
+
+        if (
+            mangle(
+                $self->watchfile, \$self->line,
+                "$mangle:",       \@{ $self->$mangle },
+                \$mangled_version
+            )
+        ) {
+            return ();
+        }
+    }
+    $match = '';
+    if (defined $self->shared->{download_version}) {
+        if ($mangled_version eq $self->shared->{download_version}) {
+            $match = "matched with the download version";
+        }
+    }
+    my $priority = $mangled_version . '-' . get_priority($href);
+    return [$priority, $mangled_version, $href, $match];
+}
+
+1;
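
Editor's note: url_canonicalize_dots above folds "./" and "../" segments into the
path, much like RFC 3986 dot-segment removal. A minimal sketch of calling it in
isolation follows; it assumes the devscripts Perl library is installed and on @INC
(the sub is a plain function in the role's package, so it can be called fully
qualified), and the example URLs are invented for illustration.

    #!/usr/bin/perl
    # resolve_demo.pl - hedged sketch, not part of the commit above
    use strict;
    use warnings;
    require Devscripts::Uscan::http;

    # relative href against a directory base: "../" is resolved away
    print Devscripts::Uscan::http::url_canonicalize_dots(
        'http://example.org/a/b/', '../c/d.tar.gz'
    ), "\n";    # expected: http://example.org/a/c/d.tar.gz

    # relative href against a page base: the filename part is dropped first
    print Devscripts::Uscan::http::url_canonicalize_dots(
        'http://example.org/a/b/index.html', './e.tar.gz'
    ), "\n";    # expected: http://example.org/a/b/e.tar.gz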
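Similarly, clean_content can be exercised on its own to show the bad-HTML fix-up
that html_search relies on (bug #89749): it quotes bare href values and strips
HTML comments, modifying the scalar in place through the reference. The sample
HTML here is invented.

    #!/usr/bin/perl
    # clean_demo.pl - hedged sketch, not part of the commit above
    use strict;
    use warnings;
    require Devscripts::Uscan::http;

    my $html = '<a href=foo-1.2.3.tar.gz>download</a><!-- a comment -->';
    Devscripts::Uscan::http::clean_content(\$html);
    print "$html\n";    # expected: <a href="foo-1.2.3.tar.gz">download</a>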