package Devscripts::Uscan::http; use strict; use Cwd qw/abs_path/; use Devscripts::Uscan::Output; use Devscripts::Uscan::Utils; use Devscripts::Uscan::_xtp; use Moo::Role; *http_newfile_base = \&Devscripts::Uscan::_xtp::_xtp_newfile_base; ################################## # search $newversion (http mode) ################################## sub http_search { my ($self) = @_; # $content: web page to be scraped to find the URLs to be downloaded if (defined($1) and $self->downloader->ssl) { uscan_die "you must have the liblwp-protocol-https-perl package installed\nto use https URLs"; } uscan_verbose "Requesting URL:\n $self->{parse_result}->{base}"; my $request = HTTP::Request->new('GET', $self->parse_result->{base}); my $response = $self->downloader->user_agent->request($request); if (!$response->is_success) { uscan_warn "In watchfile $self->{watchfile}, reading webpage\n $self->{parse_result}->{base} failed: " . $response->status_line; return undef; } my @redirections = @{ $self->downloader->user_agent->get_redirections }; uscan_verbose "redirections: @redirections" if @redirections; foreach my $_redir (@redirections) { my $base_dir = $_redir; $base_dir =~ s%^\w+://[^/]+/%/%; if ($_redir =~ m%^(\w+://[^/]+)%) { my $base_site = $1; push @{ $self->patterns }, "(?:(?:$base_site)?" . quotemeta($base_dir) . ")?$self->{parse_result}->{filepattern}"; push @{ $self->sites }, $base_site; push @{ $self->basedirs }, $base_dir; # remove the filename, if any my $base_dir_orig = $base_dir; $base_dir =~ s%/[^/]*$%/%; if ($base_dir ne $base_dir_orig) { push @{ $self->patterns }, "(?:(?:$base_site)?" . quotemeta($base_dir) . ")?$self->{parse_result}->{filepattern}"; push @{ $self->sites }, $base_site; push @{ $self->basedirs }, $base_dir; } } } my $content = $response->decoded_content; uscan_debug "received content:\n$content\n[End of received content] by HTTP"; my @hrefs; if (!$self->searchmode or $self->searchmode eq 'html') { @hrefs = $self->html_search($content); } elsif ($self->searchmode eq 'plain') { @hrefs = $self->plain_search($content); } else { uscan_warn 'Unknown searchmode "' . $self->searchmode . '", skipping'; return undef; } if (@hrefs) { @hrefs = Devscripts::Versort::versort(@hrefs); my $msg = "Found the following matching hrefs on the web page (newest first):\n"; foreach my $href (@hrefs) { $msg .= " $$href[2] ($$href[1]) index=$$href[0] $$href[3]\n"; } uscan_verbose $msg; } my ($newversion, $newfile); if (defined $self->shared->{download_version}) { # extract ones which has $match in the above loop defined my @vhrefs = grep { $$_[3] } @hrefs; if (@vhrefs) { (undef, $newversion, $newfile, undef) = @{ $vhrefs[0] }; } else { uscan_warn "In $self->{watchfile} no matching hrefs for version $self->{shared}->{download_version}" . " in watch line\n $self->{line}"; return undef; } } else { if (@hrefs) { (undef, $newversion, $newfile, undef) = @{ $hrefs[0] }; } else { uscan_warn "In $self->{watchfile} no matching files for watch line\n $self->{line}"; return undef; } } return ($newversion, $newfile); } ####################################################################### # determine $upstream_url (http mode) ####################################################################### # http is complicated due to absolute/relative URL issue sub http_upstream_url { my ($self) = @_; my $upstream_url; my $newfile = $self->search_result->{newfile}; if ($newfile =~ m%^\w+://%) { $upstream_url = $newfile; } elsif ($newfile =~ m%^//%) { $upstream_url = $self->parse_result->{site}; $upstream_url =~ s/^(https?:).*/$1/; $upstream_url .= $newfile; } elsif ($newfile =~ m%^/%) { # absolute filename # Were there any redirections? If so try using those first if ($#{ $self->patterns } > 0) { # replace $site here with the one we were redirected to foreach my $index (0 .. $#{ $self->patterns }) { if ("$self->{sites}->[$index]$newfile" =~ m&^$self->{patterns}->[$index]$&) { $upstream_url = "$self->{sites}->[$index]$newfile"; last; } } if (!defined($upstream_url)) { uscan_verbose "Unable to determine upstream url from redirections,\n" . "defaulting to using site specified in watch file"; $upstream_url = "$self->{sites}->[0]$newfile"; } } else { $upstream_url = "$self->{sites}->[0]$newfile"; } } else { # relative filename, we hope # Were there any redirections? If so try using those first if ($#{ $self->patterns } > 0) { # replace $site here with the one we were redirected to foreach my $index (0 .. $#{ $self->patterns }) { # skip unless the basedir looks like a directory next unless $self->{basedirs}->[$index] =~ m%/$%; my $nf = "$self->{basedirs}->[$index]$newfile"; if ("$self->{sites}->[$index]$nf" =~ m&^$self->{patterns}->[$index]$&) { $upstream_url = "$self->{sites}->[$index]$nf"; last; } } if (!defined($upstream_url)) { uscan_verbose "Unable to determine upstream url from redirections,\n" . "defaulting to using site specified in watch file"; $upstream_url = "$self->{parse_result}->{urlbase}$newfile"; } } else { $upstream_url = "$self->{parse_result}->{urlbase}$newfile"; } } # mangle if necessary $upstream_url =~ s/&/&/g; uscan_verbose "Matching target for downloadurlmangle: $upstream_url"; if (@{ $self->downloadurlmangle }) { if ( mangle( $self->watchfile, \$self->line, 'downloadurlmangle:', \@{ $self->downloadurlmangle }, \$upstream_url ) ) { $self->status(1); return undef; } } return $upstream_url; } sub http_newdir { my ( $https, $downloader, $site, $dir, $pattern, $dirversionmangle, $watchfile, $lineptr, $download_version ) = @_; my ($request, $response, $newdir); my ($download_version_short1, $download_version_short2, $download_version_short3) = partial_version($download_version); my $base = $site . $dir; if (defined($https) and !$downloader->ssl) { uscan_die "$progname: you must have the liblwp-protocol-https-perl package installed\n" . "to use https URLs"; } $request = HTTP::Request->new('GET', $base); $response = $downloader->user_agent->request($request); if (!$response->is_success) { uscan_warn "In watch file $watchfile, reading webpage\n $base failed: " . $response->status_line; return ''; } my $content = $response->content; uscan_debug "received content:\n$content\n[End of received content] by HTTP"; clean_content(\$content); my $dirpattern = "(?:(?:$site)?" . quotemeta($dir) . ")?$pattern"; uscan_verbose "Matching pattern:\n $dirpattern"; my @hrefs; my $match = ''; while ($content =~ m/<\s*a\s+[^>]*href\s*=\s*([\"\'])(.*?)\1/gi) { my $href = fix_href($2); uscan_verbose "Matching target for dirversionmangle: $href"; if ($href =~ m&^$dirpattern/?$&) { my $mangled_version = join(".", map { $_ // '' } $href =~ m&^$dirpattern/?$&); if ( mangle( $watchfile, $lineptr, 'dirversionmangle:', \@{$dirversionmangle}, \$mangled_version ) ) { return 1; } $match = ''; if (defined $download_version and $mangled_version eq $download_version) { $match = "matched with the download version"; } if (defined $download_version_short3 and $mangled_version eq $download_version_short3) { $match = "matched with the download version (partial 3)"; } if (defined $download_version_short2 and $mangled_version eq $download_version_short2) { $match = "matched with the download version (partial 2)"; } if (defined $download_version_short1 and $mangled_version eq $download_version_short1) { $match = "matched with the download version (partial 1)"; } push @hrefs, [$mangled_version, $href, $match]; } } # extract ones which has $match in the above loop defined my @vhrefs = grep { $$_[2] } @hrefs; if (@vhrefs) { @vhrefs = Devscripts::Versort::upstream_versort(@vhrefs); $newdir = $vhrefs[0][1]; } if (@hrefs) { @hrefs = Devscripts::Versort::upstream_versort(@hrefs); my $msg = "Found the following matching directories (newest first):\n"; foreach my $href (@hrefs) { $msg .= " $$href[1] ($$href[0]) $$href[2]\n"; } uscan_verbose $msg; $newdir //= $hrefs[0][1]; } else { uscan_warn "In $watchfile,\n no matching hrefs for pattern\n $site$dir$pattern"; return ''; } # just give the final directory component $newdir =~ s%/$%%; $newdir =~ s%^.*/%%; return ($newdir); } # Nothing to clean here sub http_clean { 0 } sub clean_content { my ($content) = @_; # We need this horrid stuff to handle href=foo type # links. OK, bad HTML, but we have to handle it nonetheless. # It's bug #89749. $$content =~ s/href\s*=\s*(?=[^\"\'])([^\s>]+)/href="$1"/ig; # Strip comments $$content =~ s///sg; return $content; } sub html_search { my ($self, $content) = @_; # pagenmangle: should not abuse this slow operation if ( mangle( $self->watchfile, \$self->line, 'pagemangle:\n', [@{ $self->pagemangle }], \$content ) ) { return undef; } if ( !$self->shared->{bare} and $content =~ m%^<[?]xml%i and $content =~ m%xmlns="http://s3.amazonaws.com/doc/2006-03-01/"% and $content !~ m%([^<]*)%$1%g; uscan_debug "processed content:\n$content\n[End of processed content] by Amazon AWS special case code"; } clean_content(\$content); # Is there a base URL given? if ($content =~ /<\s*base\s+[^>]*href\s*=\s*([\"\'])(.*?)\1/i) { # Ensure it ends with / $self->parse_result->{urlbase} = "$2/"; $self->parse_result->{urlbase} =~ s%//$%/%; } else { # May have to strip a base filename ($self->parse_result->{urlbase} = $self->parse_result->{base}) =~ s%/[^/]*$%/%; } uscan_debug "processed content:\n$content\n[End of processed content] by fix bad HTML code"; # search hrefs in web page to obtain a list of uversionmangled version and matching download URL { local $, = ','; uscan_verbose "Matching pattern:\n @{$self->{patterns}}"; } my @hrefs; while ($content =~ m/<\s*a\s+[^>]*(?<=\s)href\s*=\s*([\"\'])(.*?)\1/sgi) { my $href = $2; $href = fix_href($href); if (defined $self->hrefdecode) { if ($self->hrefdecode eq 'percent-encoding') { uscan_debug "... Decoding from href: $href"; $href =~ s/%([A-Fa-f\d]{2})/chr hex $1/eg; } else { uscan_warn "Illegal value for hrefdecode: " . "$self->{hrefdecode}"; return undef; } } uscan_debug "Checking href $href"; foreach my $_pattern (@{ $self->patterns }) { if ($href =~ /^$_pattern$/) { push @hrefs, $self->parse_href($href, $_pattern, $1); } } } return @hrefs; } sub plain_search { my ($self, $content) = @_; my @hrefs; foreach my $_pattern (@{ $self->patterns }) { while ($content =~ s/.*?($_pattern)//) { push @hrefs, $self->parse_href($1, $_pattern, $2); } } return @hrefs; } sub parse_href { my ($self, $href, $_pattern, $match) = @_; my $mangled_version; if ($self->watch_version == 2) { # watch_version 2 only recognised one group; the code # below will break version 2 watch files with a construction # such as file-([\d\.]+(-\d+)?) (bug #327258) $mangled_version = $match; } else { # need the map { ... } here to handle cases of (...)? # which may match but then return undef values if ($self->versionless) { # exception, otherwise $mangled_version = 1 $mangled_version = ''; } else { $mangled_version = join(".", map { $_ if defined($_) } $href =~ m&^$_pattern$&); } if ( mangle( $self->watchfile, \$self->line, 'uversionmangle:', \@{ $self->uversionmangle }, \$mangled_version ) ) { return (); } } $match = ''; if (defined $self->shared->{download_version}) { if ($mangled_version eq $self->shared->{download_version}) { $match = "matched with the download version"; } } my $priority = $mangled_version . '-' . get_priority($href); return [$priority, $mangled_version, $href, $match]; } 1;