Diffstat (limited to 'lib/Devscripts/Uscan/http.pm')
-rw-r--r--  lib/Devscripts/Uscan/http.pm | 434
1 file changed, 434 insertions(+), 0 deletions(-)
diff --git a/lib/Devscripts/Uscan/http.pm b/lib/Devscripts/Uscan/http.pm
new file mode 100644
index 0000000..95fc08a
--- /dev/null
+++ b/lib/Devscripts/Uscan/http.pm
@@ -0,0 +1,434 @@
+package Devscripts::Uscan::http;
+
+use strict;
+use Cwd qw/abs_path/;
+use Devscripts::Uscan::Output;
+use Devscripts::Uscan::Utils;
+use Devscripts::Uscan::_xtp;
+use Devscripts::Versort;
+use HTTP::Request;
+use Moo::Role;
+
+*http_newfile_base = \&Devscripts::Uscan::_xtp::_xtp_newfile_base;
+
+##################################
+# search $newversion (http mode)
+##################################
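+# Returns the list ($newversion, $newfile) on success, or undef on failure.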
+sub http_search {
+ my ($self) = @_;
+
+ # $content: web page to be scraped to find the URLs to be downloaded
+    if ($self->parse_result->{base} =~ /^https/ and !$self->downloader->ssl) {
+ uscan_die
+"you must have the liblwp-protocol-https-perl package installed\nto use https URLs";
+ }
+ uscan_verbose "Requesting URL:\n $self->{parse_result}->{base}";
+ my $request = HTTP::Request->new('GET', $self->parse_result->{base});
+ my $response = $self->downloader->user_agent->request($request);
+ if (!$response->is_success) {
+ uscan_warn
+"In watchfile $self->{watchfile}, reading webpage\n $self->{parse_result}->{base} failed: "
+ . $response->status_line;
+ return undef;
+ }
+
+ my @redirections = @{ $self->downloader->user_agent->get_redirections };
+
+ uscan_verbose "redirections: @redirections" if @redirections;
+
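+    # Derive extra match patterns from each redirection so that hrefs
+    # expressed relative to the redirected site or directory are still
+    # recognised.  Illustrative example: a redirection to
+    # https://example.com/pub/foo/ adds the pattern
+    #   (?:(?:https://example.com)?\/pub\/foo\/)?$filepattern
+    # (with the directory part quotemeta-escaped) alongside the original.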
+ foreach my $_redir (@redirections) {
+ my $base_dir = $_redir;
+
+ $base_dir =~ s%^\w+://[^/]+/%/%;
+ if ($_redir =~ m%^(\w+://[^/]+)%) {
+ my $base_site = $1;
+
+ push @{ $self->patterns },
+ "(?:(?:$base_site)?"
+ . quotemeta($base_dir)
+ . ")?$self->{parse_result}->{filepattern}";
+ push @{ $self->sites }, $base_site;
+ push @{ $self->basedirs }, $base_dir;
+
+ # remove the filename, if any
+ my $base_dir_orig = $base_dir;
+ $base_dir =~ s%/[^/]*$%/%;
+ if ($base_dir ne $base_dir_orig) {
+ push @{ $self->patterns },
+ "(?:(?:$base_site)?"
+ . quotemeta($base_dir)
+ . ")?$self->{parse_result}->{filepattern}";
+ push @{ $self->sites }, $base_site;
+ push @{ $self->basedirs }, $base_dir;
+ }
+ }
+ }
+
+ my $content = $response->decoded_content;
+ uscan_debug
+ "received content:\n$content\n[End of received content] by HTTP";
+
+ my @hrefs;
+ if (!$self->searchmode or $self->searchmode eq 'html') {
+ @hrefs = $self->html_search($content);
+ } elsif ($self->searchmode eq 'plain') {
+ @hrefs = $self->plain_search($content);
+ } else {
+ uscan_warn 'Unknown searchmode "' . $self->searchmode . '", skipping';
+ return undef;
+ }
+
+ if (@hrefs) {
+ @hrefs = Devscripts::Versort::versort(@hrefs);
+ my $msg
+ = "Found the following matching hrefs on the web page (newest first):\n";
+ foreach my $href (@hrefs) {
+ $msg .= " $$href[2] ($$href[1]) index=$$href[0] $$href[3]\n";
+ }
+ uscan_verbose $msg;
+ }
+ my ($newversion, $newfile);
+ if (defined $self->shared->{download_version}) {
+
+        # extract the entries for which $match was set in the loop above
+ my @vhrefs = grep { $$_[3] } @hrefs;
+ if (@vhrefs) {
+ (undef, $newversion, $newfile, undef) = @{ $vhrefs[0] };
+ } else {
+ uscan_warn
+"In $self->{watchfile} no matching hrefs for version $self->{shared}->{download_version}"
+ . " in watch line\n $self->{line}";
+ return undef;
+ }
+ } else {
+ if (@hrefs) {
+ (undef, $newversion, $newfile, undef) = @{ $hrefs[0] };
+ } else {
+ uscan_warn
+"In $self->{watchfile} no matching files for watch line\n $self->{line}";
+ return undef;
+ }
+ }
+ return ($newversion, $newfile);
+}
+
+#######################################################################
+# determine $upstream_url (http mode)
+#######################################################################
+# http is complicated due to absolute/relative URL issue
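+# Four href forms are handled below: absolute URLs (scheme://...),
+# scheme-relative URLs (//host/path), site-absolute paths (/path)
+# and relative paths.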
+sub http_upstream_url {
+ my ($self) = @_;
+ my $upstream_url;
+ my $newfile = $self->search_result->{newfile};
+ if ($newfile =~ m%^\w+://%) {
+ $upstream_url = $newfile;
+ } elsif ($newfile =~ m%^//%) {
+ $upstream_url = $self->parse_result->{site};
+ $upstream_url =~ s/^(https?:).*/$1/;
+ $upstream_url .= $newfile;
+ } elsif ($newfile =~ m%^/%) {
+
+ # absolute filename
+ # Were there any redirections? If so try using those first
+ if ($#{ $self->patterns } > 0) {
+
+ # replace $site here with the one we were redirected to
+ foreach my $index (0 .. $#{ $self->patterns }) {
+ if ("$self->{sites}->[$index]$newfile"
+ =~ m&^$self->{patterns}->[$index]$&) {
+ $upstream_url = "$self->{sites}->[$index]$newfile";
+ last;
+ }
+ }
+ if (!defined($upstream_url)) {
+ uscan_verbose
+ "Unable to determine upstream url from redirections,\n"
+ . "defaulting to using site specified in watch file";
+ $upstream_url = "$self->{sites}->[0]$newfile";
+ }
+ } else {
+ $upstream_url = "$self->{sites}->[0]$newfile";
+ }
+ } else {
+ # relative filename, we hope
+ # Were there any redirections? If so try using those first
+ if ($#{ $self->patterns } > 0) {
+
+ # replace $site here with the one we were redirected to
+ foreach my $index (0 .. $#{ $self->patterns }) {
+
+ # skip unless the basedir looks like a directory
+ next unless $self->{basedirs}->[$index] =~ m%/$%;
+ my $nf = "$self->{basedirs}->[$index]$newfile";
+ if ("$self->{sites}->[$index]$nf"
+ =~ m&^$self->{patterns}->[$index]$&) {
+ $upstream_url = "$self->{sites}->[$index]$nf";
+ last;
+ }
+ }
+ if (!defined($upstream_url)) {
+ uscan_verbose
+ "Unable to determine upstream url from redirections,\n"
+ . "defaulting to using site specified in watch file";
+ $upstream_url = "$self->{parse_result}->{urlbase}$newfile";
+ }
+ } else {
+ $upstream_url = "$self->{parse_result}->{urlbase}$newfile";
+ }
+ }
+
+ # mangle if necessary
+    $upstream_url =~ s/&amp;/&/g;
+ uscan_verbose "Matching target for downloadurlmangle: $upstream_url";
+ if (@{ $self->downloadurlmangle }) {
+ if (
+ mangle(
+ $self->watchfile, \$self->line,
+ 'downloadurlmangle:', \@{ $self->downloadurlmangle },
+ \$upstream_url
+ )
+ ) {
+ $self->status(1);
+ return undef;
+ }
+ }
+ return $upstream_url;
+}
+
+sub http_newdir {
+ my (
+ $https, $downloader, $site,
+ $dir, $pattern, $dirversionmangle,
+ $watchfile, $lineptr, $download_version
+ ) = @_;
+
+ my ($request, $response, $newdir);
+ my ($download_version_short1, $download_version_short2,
+ $download_version_short3)
+ = partial_version($download_version);
+ my $base = $site . $dir;
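+    # e.g. a requested download_version of 1.2.3.4 gives the partial
+    # versions 1 (short1), 1.2 (short2) and 1.2.3 (short3) above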
+
+ if (defined($https) and !$downloader->ssl) {
+ uscan_die
+"$progname: you must have the liblwp-protocol-https-perl package installed\n"
+ . "to use https URLs";
+ }
+ $request = HTTP::Request->new('GET', $base);
+ $response = $downloader->user_agent->request($request);
+ if (!$response->is_success) {
+ uscan_warn
+ "In watch file $watchfile, reading webpage\n $base failed: "
+ . $response->status_line;
+ return '';
+ }
+
+ my $content = $response->content;
+ uscan_debug
+ "received content:\n$content\n[End of received content] by HTTP";
+
+ clean_content(\$content);
+
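+    # make the site/directory prefix optional so hrefs match whether they
+    # are given absolutely, relative to the site root, or relative to $dir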
+ my $dirpattern = "(?:(?:$site)?" . quotemeta($dir) . ")?$pattern";
+
+ uscan_verbose "Matching pattern:\n $dirpattern";
+ my @hrefs;
+ my $match = '';
+ while ($content =~ m/<\s*a\s+[^>]*href\s*=\s*([\"\'])(.*?)\1/gi) {
+ my $href = fix_href($2);
+ uscan_verbose "Matching target for dirversionmangle: $href";
+ if ($href =~ m&^$dirpattern/?$&) {
+ my $mangled_version
+ = join(".", map { $_ // '' } $href =~ m&^$dirpattern/?$&);
+ if (
+ mangle(
+ $watchfile, $lineptr,
+ 'dirversionmangle:', \@{$dirversionmangle},
+ \$mangled_version
+ )
+ ) {
+ return 1;
+ }
+ $match = '';
+ if (defined $download_version
+ and $mangled_version eq $download_version) {
+ $match = "matched with the download version";
+ }
+ if (defined $download_version_short3
+ and $mangled_version eq $download_version_short3) {
+ $match = "matched with the download version (partial 3)";
+ }
+ if (defined $download_version_short2
+ and $mangled_version eq $download_version_short2) {
+ $match = "matched with the download version (partial 2)";
+ }
+ if (defined $download_version_short1
+ and $mangled_version eq $download_version_short1) {
+ $match = "matched with the download version (partial 1)";
+ }
+ push @hrefs, [$mangled_version, $href, $match];
+ }
+ }
+
+    # extract the entries for which $match was set in the loop above
+ my @vhrefs = grep { $$_[2] } @hrefs;
+ if (@vhrefs) {
+ @vhrefs = Devscripts::Versort::upstream_versort(@vhrefs);
+ $newdir = $vhrefs[0][1];
+ }
+ if (@hrefs) {
+ @hrefs = Devscripts::Versort::upstream_versort(@hrefs);
+ my $msg = "Found the following matching directories (newest first):\n";
+ foreach my $href (@hrefs) {
+ $msg .= " $$href[1] ($$href[0]) $$href[2]\n";
+ }
+ uscan_verbose $msg;
+ $newdir //= $hrefs[0][1];
+ } else {
+ uscan_warn
+"In $watchfile,\n no matching hrefs for pattern\n $site$dir$pattern";
+ return '';
+ }
+
+ # just give the final directory component
+ $newdir =~ s%/$%%;
+ $newdir =~ s%^.*/%%;
+ return ($newdir);
+}
+
+# Nothing to clean here
+sub http_clean { 0 }
+
+sub clean_content {
+ my ($content) = @_;
+
+ # We need this horrid stuff to handle href=foo type
+ # links. OK, bad HTML, but we have to handle it nonetheless.
+ # It's bug #89749.
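+    # e.g. <a href=foo-1.0.tar.gz> becomes <a href="foo-1.0.tar.gz">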
+ $$content =~ s/href\s*=\s*(?=[^\"\'])([^\s>]+)/href="$1"/ig;
+
+ # Strip comments
+ $$content =~ s/<!-- .*?-->//sg;
+ return $content;
+}
+
+sub html_search {
+ my ($self, $content) = @_;
+
+    # pagemangle: this slow operation should not be abused
+ if (
+ mangle(
+ $self->watchfile, \$self->line,
+ 'pagemangle:\n', [@{ $self->pagemangle }],
+ \$content
+ )
+ ) {
+ return undef;
+ }
+ if ( !$self->shared->{bare}
+ and $content =~ m%^<[?]xml%i
+ and $content =~ m%xmlns="http://s3.amazonaws.com/doc/2006-03-01/"%
+ and $content !~ m%<Key><a\s+href%) {
+        # This is an S3 bucket listing.  Insert an 'a href' tag into the
+        # content for each 'Key', so that it looks like HTML (LP: #798293)
+        uscan_warn
+"*** Amazon AWS special case code is deprecated ***\nUse an opts=pagemangle rule instead";
+ $content =~ s%<Key>([^<]*)</Key>%<Key><a href="$1">$1</a></Key>%g;
+ uscan_debug
+"processed content:\n$content\n[End of processed content] by Amazon AWS special case code";
+ }
+ clean_content(\$content);
+
+ # Is there a base URL given?
+ if ($content =~ /<\s*base\s+[^>]*href\s*=\s*([\"\'])(.*?)\1/i) {
+
+ # Ensure it ends with /
+ $self->parse_result->{urlbase} = "$2/";
+ $self->parse_result->{urlbase} =~ s%//$%/%;
+ } else {
+ # May have to strip a base filename
+ ($self->parse_result->{urlbase} = $self->parse_result->{base})
+ =~ s%/[^/]*$%/%;
+ }
+ uscan_debug
+"processed content:\n$content\n[End of processed content] by fix bad HTML code";
+
+# search hrefs in the web page to obtain a list of uversionmangled versions
+# and their matching download URLs
+ {
+ local $, = ',';
+ uscan_verbose "Matching pattern:\n @{$self->{patterns}}";
+ }
+ my @hrefs;
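+    # scan anchor tags; the look-behind requires whitespace before "href"
+    # so attributes such as data-href are not picked up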
+ while ($content =~ m/<\s*a\s+[^>]*(?<=\s)href\s*=\s*([\"\'])(.*?)\1/sgi) {
+ my $href = $2;
+ $href = fix_href($href);
+ if (defined $self->hrefdecode) {
+ if ($self->hrefdecode eq 'percent-encoding') {
+ uscan_debug "... Decoding from href: $href";
+ $href =~ s/%([A-Fa-f\d]{2})/chr hex $1/eg;
+ } else {
+ uscan_warn "Illegal value for hrefdecode: "
+ . "$self->{hrefdecode}";
+ return undef;
+ }
+ }
+ uscan_debug "Checking href $href";
+ foreach my $_pattern (@{ $self->patterns }) {
+ if ($href =~ /^$_pattern$/) {
+ push @hrefs, $self->parse_href($href, $_pattern, $1);
+ }
+ }
+ }
+ return @hrefs;
+}
+
+sub plain_search {
+ my ($self, $content) = @_;
+ my @hrefs;
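+    # searchmode=plain: match the patterns against the raw page body,
+    # consuming the content up to each match (useful for non-HTML pages
+    # such as JSON listings)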
+ foreach my $_pattern (@{ $self->patterns }) {
+ while ($content =~ s/.*?($_pattern)//) {
+ push @hrefs, $self->parse_href($1, $_pattern, $2);
+ }
+ }
+ return @hrefs;
+}
+
+sub parse_href {
+ my ($self, $href, $_pattern, $match) = @_;
+ my $mangled_version;
+ if ($self->watch_version == 2) {
+
+ # watch_version 2 only recognised one group; the code
+ # below will break version 2 watch files with a construction
+ # such as file-([\d\.]+(-\d+)?) (bug #327258)
+ $mangled_version = $match;
+ } else {
+ # need the map { ... } here to handle cases of (...)?
+ # which may match but then return undef values
+ if ($self->versionless) {
+
+            # exception: a versionless pattern has no capture group, so in
+            # list context a successful match would otherwise yield '1'
+ $mangled_version = '';
+ } else {
+ $mangled_version
+ = join(".", map { $_ if defined($_) } $href =~ m&^$_pattern$&);
+ }
+
+ if (
+ mangle(
+ $self->watchfile, \$self->line,
+ 'uversionmangle:', \@{ $self->uversionmangle },
+ \$mangled_version
+ )
+ ) {
+ return ();
+ }
+ }
+ $match = '';
+ if (defined $self->shared->{download_version}) {
+ if ($mangled_version eq $self->shared->{download_version}) {
+ $match = "matched with the download version";
+ }
+ }
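+    # get_priority() (from Devscripts::Uscan::Utils) ranks the href by
+    # archive type so that, for equal versions, preferred formats
+    # (e.g. .tar.xz over .tar.gz) sort first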
+ my $priority = $mangled_version . '-' . get_priority($href);
+ return [$priority, $mangled_version, $href, $match];
+}
+
+1;