summaryrefslogtreecommitdiffstats
path: root/lib/Devscripts/Uscan/http.pm
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--lib/Devscripts/Uscan/http.pm510
1 files changed, 510 insertions, 0 deletions
diff --git a/lib/Devscripts/Uscan/http.pm b/lib/Devscripts/Uscan/http.pm
new file mode 100644
index 0000000..0da9798
--- /dev/null
+++ b/lib/Devscripts/Uscan/http.pm
@@ -0,0 +1,510 @@
+package Devscripts::Uscan::http;
+
+use strict;
+use Cwd qw/abs_path/;
+use Devscripts::Uscan::Output;
+use Devscripts::Uscan::Utils;
+use Devscripts::Uscan::_xtp;
+use Moo::Role;
+
+*http_newfile_base = \&Devscripts::Uscan::_xtp::_xtp_newfile_base;
+
+##################################
+# search $newversion (http mode)
+##################################
+
+#returns (\@patterns, \@base_sites, \@base_dirs)
+sub handle_redirection {
+ my ($self, $pattern, @additional_bases) = @_;
+ my @redirections = @{ $self->downloader->user_agent->get_redirections };
+ my (@patterns, @base_sites, @base_dirs);
+
+ uscan_verbose "redirections: @redirections" if @redirections;
+
+ foreach my $_redir (@redirections, @additional_bases) {
+ my $base_dir = $_redir;
+
+ $base_dir =~ s%^\w+://[^/]+/%/%;
+ $base_dir =~ s%/[^/]*(?:[#?].*)?$%/%;
+ if ($_redir =~ m%^(\w+://[^/]+)%) {
+ my $base_site = $1;
+
+ push @patterns,
+ quotemeta($base_site) . quotemeta($base_dir) . "$pattern";
+ push @base_sites, $base_site;
+ push @base_dirs, $base_dir;
+
+ # remove the filename, if any
+ my $base_dir_orig = $base_dir;
+ $base_dir =~ s%/[^/]*$%/%;
+ if ($base_dir ne $base_dir_orig) {
+ push @patterns,
+ quotemeta($base_site) . quotemeta($base_dir) . "$pattern";
+ push @base_sites, $base_site;
+ push @base_dirs, $base_dir;
+ }
+ }
+ }
+ return (\@patterns, \@base_sites, \@base_dirs);
+}
+
+sub http_search {
+ my ($self) = @_;
+
+ # $content: web page to be scraped to find the URLs to be downloaded
+ if ($self->{parse_result}->{base} =~ /^https/ and !$self->downloader->ssl)
+ {
+ uscan_die
+"you must have the liblwp-protocol-https-perl package installed\nto use https URLs";
+ }
+ uscan_verbose "Requesting URL:\n $self->{parse_result}->{base}";
+ my $request = HTTP::Request->new('GET', $self->parse_result->{base});
+ foreach my $k (keys %{ $self->downloader->headers }) {
+ if ($k =~ /^(.*?)@(.*)$/) {
+ my $baseUrl = $1;
+ my $hdr = $2;
+ if ($self->parse_result->{base} =~ m#^\Q$baseUrl\E(?:/.*)?$#) {
+ $request->header($hdr => $self->headers->{$k});
+ uscan_verbose "Set per-host custom header $hdr for "
+ . $self->parse_result->{base};
+ } else {
+ uscan_debug
+ "$self->parse_result->{base} does not start with $1";
+ }
+ } else {
+ uscan_warn "Malformed http-header: $k";
+ }
+ }
+ $request->header('Accept-Encoding' => 'gzip');
+ $request->header('Accept' => '*/*');
+ my $response = $self->downloader->user_agent->request($request);
+ if (!$response->is_success) {
+ uscan_warn
+"In watchfile $self->{watchfile}, reading webpage\n $self->{parse_result}->{base} failed: "
+ . $response->status_line;
+ return undef;
+ }
+
+ my ($patterns, $base_sites, $base_dirs)
+ = handle_redirection($self, $self->{parse_result}->{filepattern});
+ push @{ $self->patterns }, @$patterns;
+ push @{ $self->sites }, @$base_sites;
+ push @{ $self->basedirs }, @$base_dirs;
+
+ my $content = $response->decoded_content;
+ uscan_extra_debug
+ "received content:\n$content\n[End of received content] by HTTP";
+
+ my @hrefs;
+ if (!$self->searchmode or $self->searchmode eq 'html') {
+ @hrefs = $self->html_search($content, $self->patterns);
+ } elsif ($self->searchmode eq 'plain') {
+ @hrefs = $self->plain_search($content);
+ } else {
+ uscan_warn 'Unknown searchmode "' . $self->searchmode . '", skipping';
+ return undef;
+ }
+
+ if (@hrefs) {
+ @hrefs = Devscripts::Versort::versort(@hrefs);
+ my $msg
+ = "Found the following matching hrefs on the web page (newest first):\n";
+ foreach my $href (@hrefs) {
+ $msg .= " $$href[2] ($$href[1]) index=$$href[0] $$href[3]\n";
+ }
+ uscan_verbose $msg;
+ }
+ my ($newversion, $newfile);
+ if (defined $self->shared->{download_version}
+ and not $self->versionmode eq 'ignore') {
+
+ # extract ones which has $match in the above loop defined
+ my @vhrefs = grep { $$_[3] } @hrefs;
+ if (@vhrefs) {
+ (undef, $newversion, $newfile, undef) = @{ $vhrefs[0] };
+ } else {
+ uscan_warn
+"In $self->{watchfile} no matching hrefs for version $self->{shared}->{download_version}"
+ . " in watch line\n $self->{line}";
+ return undef;
+ }
+ } else {
+ if (@hrefs) {
+ (undef, $newversion, $newfile, undef) = @{ $hrefs[0] };
+ } else {
+ uscan_warn
+"In $self->{watchfile} no matching files for watch line\n $self->{line}";
+ return undef;
+ }
+ }
+ return ($newversion, $newfile);
+}
+
+#######################################################################
+# determine $upstream_url (http mode)
+#######################################################################
+# http is complicated due to absolute/relative URL issue
+sub http_upstream_url {
+ my ($self) = @_;
+ my $upstream_url;
+ my $newfile = $self->search_result->{newfile};
+ if ($newfile =~ m%^\w+://%) {
+ $upstream_url = $newfile;
+ } elsif ($newfile =~ m%^//%) {
+ $upstream_url = $self->parse_result->{site};
+ $upstream_url =~ s/^(https?:).*/$1/;
+ $upstream_url .= $newfile;
+ } elsif ($newfile =~ m%^/%) {
+
+ # absolute filename
+ # Were there any redirections? If so try using those first
+ if ($#{ $self->patterns } > 0) {
+
+ # replace $site here with the one we were redirected to
+ foreach my $index (0 .. $#{ $self->patterns }) {
+ if ("$self->{sites}->[$index]$newfile"
+ =~ m&^$self->{patterns}->[$index]$&) {
+ $upstream_url = "$self->{sites}->[$index]$newfile";
+ last;
+ }
+ }
+ if (!defined($upstream_url)) {
+ uscan_verbose
+ "Unable to determine upstream url from redirections,\n"
+ . "defaulting to using site specified in watch file";
+ $upstream_url = "$self->{sites}->[0]$newfile";
+ }
+ } else {
+ $upstream_url = "$self->{sites}->[0]$newfile";
+ }
+ } else {
+ # relative filename, we hope
+ # Were there any redirections? If so try using those first
+ if ($#{ $self->patterns } > 0) {
+
+ # replace $site here with the one we were redirected to
+ foreach my $index (0 .. $#{ $self->patterns }) {
+
+ # skip unless the basedir looks like a directory
+ next unless $self->{basedirs}->[$index] =~ m%/$%;
+ my $nf = "$self->{basedirs}->[$index]$newfile";
+ if ("$self->{sites}->[$index]$nf"
+ =~ m&^$self->{patterns}->[$index]$&) {
+ $upstream_url = "$self->{sites}->[$index]$nf";
+ last;
+ }
+ }
+ if (!defined($upstream_url)) {
+ uscan_verbose
+ "Unable to determine upstream url from redirections,\n"
+ . "defaulting to using site specified in watch file";
+ $upstream_url = "$self->{parse_result}->{urlbase}$newfile";
+ }
+ } else {
+ $upstream_url = "$self->{parse_result}->{urlbase}$newfile";
+ }
+ }
+
+ # mangle if necessary
+ $upstream_url =~ s/&/&/g;
+ uscan_verbose "Matching target for downloadurlmangle: $upstream_url";
+ if (@{ $self->downloadurlmangle }) {
+ if (
+ mangle(
+ $self->watchfile, \$self->line,
+ 'downloadurlmangle:', \@{ $self->downloadurlmangle },
+ \$upstream_url
+ )
+ ) {
+ $self->status(1);
+ return undef;
+ }
+ }
+ return $upstream_url;
+}
+
+sub http_newdir {
+ my ($https, $line, $site, $dir, $pattern, $dirversionmangle,
+ $watchfile, $lineptr, $download_version)
+ = @_;
+
+ my $downloader = $line->downloader;
+ my ($request, $response, $newdir);
+ my ($download_version_short1, $download_version_short2,
+ $download_version_short3)
+ = partial_version($download_version);
+ my $base = $site . $dir;
+
+ $pattern .= "/?";
+
+ if (defined($https) and !$downloader->ssl) {
+ uscan_die
+"$progname: you must have the liblwp-protocol-https-perl package installed\n"
+ . "to use https URLs";
+ }
+ # At least for now, set base in the line object - other methods need it
+ local $line->parse_result->{base} = $base;
+ $request = HTTP::Request->new('GET', $base);
+ $response = $downloader->user_agent->request($request);
+ if (!$response->is_success) {
+ uscan_warn
+ "In watch file $watchfile, reading webpage\n $base failed: "
+ . $response->status_line;
+ return '';
+ }
+
+ my $content = $response->content;
+ if ( $response->header('Content-Encoding')
+ and $response->header('Content-Encoding') =~ /^gzip$/i) {
+ require IO::Uncompress::Gunzip;
+ require IO::String;
+ uscan_debug "content seems gzip encoded, let's decode it";
+ my $out;
+ if (IO::Uncompress::Gunzip::gunzip(IO::String->new($content), \$out)) {
+ $content = $out;
+ } else {
+ uscan_warn 'Unable to decode remote content: '
+ . $IO::Uncompress::GunzipError;
+ return '';
+ }
+ }
+ uscan_extra_debug
+ "received content:\n$content\n[End of received content] by HTTP";
+
+ clean_content(\$content);
+
+ my ($dirpatterns, $base_sites, $base_dirs)
+ = handle_redirection($line, $pattern, $base);
+ $downloader->user_agent->clear_redirections; # we won't be needing that
+
+ my @hrefs;
+ for my $parsed (
+ html_search($line, $content, $dirpatterns, 'dirversionmangle')) {
+ my ($priority, $mangled_version, $href, $match) = @$parsed;
+ $match = '';
+ if (defined $download_version
+ and $mangled_version eq $download_version) {
+ $match = "matched with the download version";
+ }
+ if (defined $download_version_short3
+ and $mangled_version eq $download_version_short3) {
+ $match = "matched with the download version (partial 3)";
+ }
+ if (defined $download_version_short2
+ and $mangled_version eq $download_version_short2) {
+ $match = "matched with the download version (partial 2)";
+ }
+ if (defined $download_version_short1
+ and $mangled_version eq $download_version_short1) {
+ $match = "matched with the download version (partial 1)";
+ }
+ push @hrefs, [$mangled_version, $href, $match];
+ }
+
+ # extract ones which has $match in the above loop defined
+ my @vhrefs = grep { $$_[2] } @hrefs;
+ if (@vhrefs) {
+ @vhrefs = Devscripts::Versort::upstream_versort(@vhrefs);
+ $newdir = $vhrefs[0][1];
+ }
+ if (@hrefs) {
+ @hrefs = Devscripts::Versort::upstream_versort(@hrefs);
+ my $msg = "Found the following matching directories (newest first):\n";
+ foreach my $href (@hrefs) {
+ $msg .= " $$href[1] ($$href[0]) $$href[2]\n";
+ }
+ uscan_verbose $msg;
+ $newdir //= $hrefs[0][1];
+ } else {
+ uscan_warn
+"In $watchfile,\n no matching hrefs for pattern\n $site$dir$pattern";
+ return '';
+ }
+
+ # just give the final directory component
+ $newdir =~ s%/$%%;
+ $newdir =~ s%^.*/%%;
+ return ($newdir);
+}
+
+# Nothing to clean here
+sub http_clean { 0 }
+
+sub clean_content {
+ my ($content) = @_;
+
+ # We need this horrid stuff to handle href=foo type
+ # links. OK, bad HTML, but we have to handle it nonetheless.
+ # It's bug #89749.
+ $$content =~ s/href\s*=\s*(?=[^\"\'])([^\s>]+)/href="$1"/ig;
+
+ # Strip comments
+ $$content =~ s/<!-- .*?-->//sg;
+ return $content;
+}
+
+sub url_canonicalize_dots {
+ my ($base, $url) = @_;
+
+ if ($url !~ m{^[^:#?/]+://}) {
+ if ($url =~ m{^//}) {
+ $base =~ m{^[^:#?/]+:}
+ and $url = $& . $url;
+ } elsif ($url =~ m{^/}) {
+ $base =~ m{^[^:#?/]+://[^/#?]*}
+ and $url = $& . $url;
+ } else {
+ uscan_debug "Resolving urls with query part unimplemented"
+ if ($url =~ m/^[#?]/);
+ $base =~ m{^[^:#?/]+://[^/#?]*(?:/(?:[^#?/]*/)*)?} and do {
+ my $base_to_path = $&;
+ $base_to_path .= '/' unless $base_to_path =~ m|/$|;
+ $url = $base_to_path . $url;
+ };
+ }
+ }
+ $url =~ s{^([^:#?/]+://[^/#?]*)(/[^#?]*)}{
+ my ($h, $p) = ($1, $2);
+ $p =~ s{/\.(?:/|$|(?=[#?]))}{/}g;
+ 1 while $p =~ s{/(?!\.\./)[^/]*/\.\.(?:/|(?=[#?])|$)}{/}g;
+ $h.$p;}e;
+ $url;
+}
+
+sub html_search {
+ my ($self, $content, $patterns, $mangle) = @_;
+
+ # pagenmangle: should not abuse this slow operation
+ if (
+ mangle(
+ $self->watchfile, \$self->line,
+ 'pagemangle:\n', [@{ $self->pagemangle }],
+ \$content
+ )
+ ) {
+ return undef;
+ }
+ if ( !$self->shared->{bare}
+ and $content =~ m%^<[?]xml%i
+ and $content =~ m%xmlns="http://s3.amazonaws.com/doc/2006-03-01/"%
+ and $content !~ m%<Key><a\s+href%) {
+ # this is an S3 bucket listing. Insert an 'a href' tag
+ # into the content for each 'Key', so that it looks like html (LP: #798293)
+ uscan_warn
+"*** Amazon AWS special case code is deprecated***\nUse opts=pagemangle rule, instead";
+ $content =~ s%<Key>([^<]*)</Key>%<Key><a href="$1">$1</a></Key>%g;
+ uscan_extra_debug
+"processed content:\n$content\n[End of processed content] by Amazon AWS special case code";
+ }
+ clean_content(\$content);
+
+ # Is there a base URL given?
+ if ($content =~ /<\s*base\s+[^>]*href\s*=\s*([\"\'])(.*?)\1/i) {
+ $self->parse_result->{urlbase}
+ = url_canonicalize_dots($self->parse_result->{base}, $2);
+ } else {
+ $self->parse_result->{urlbase} = $self->parse_result->{base};
+ }
+ uscan_extra_debug
+"processed content:\n$content\n[End of processed content] by fix bad HTML code";
+
+# search hrefs in web page to obtain a list of uversionmangled version and matching download URL
+ {
+ local $, = ',';
+ uscan_verbose "Matching pattern:\n @{$self->{patterns}}";
+ }
+ my @hrefs;
+ while ($content =~ m/<\s*a\s+[^>]*(?<=\s)href\s*=\s*([\"\'])(.*?)\1/sgi) {
+ my $href = $2;
+ $href = fix_href($href);
+ my $href_canonical
+ = url_canonicalize_dots($self->parse_result->{urlbase}, $href);
+ if (defined $self->hrefdecode) {
+ if ($self->hrefdecode eq 'percent-encoding') {
+ uscan_debug "... Decoding from href: $href";
+ $href =~ s/%([A-Fa-f\d]{2})/chr hex $1/eg;
+ $href_canonical =~ s/%([A-Fa-f\d]{2})/chr hex $1/eg;
+ } else {
+ uscan_warn "Illegal value for hrefdecode: "
+ . "$self->{hrefdecode}";
+ return undef;
+ }
+ }
+ uscan_extra_debug "Checking href $href";
+ foreach my $_pattern (@$patterns) {
+ if (my @match = $href =~ /^$_pattern$/) {
+ push @hrefs,
+ parse_href($self, $href_canonical, $_pattern, \@match,
+ $mangle);
+ }
+ uscan_extra_debug "Checking href $href_canonical";
+ if (my @match = $href_canonical =~ /^$_pattern$/) {
+ push @hrefs,
+ parse_href($self, $href_canonical, $_pattern, \@match,
+ $mangle);
+ }
+ }
+ }
+ return @hrefs;
+}
+
+sub plain_search {
+ my ($self, $content) = @_;
+ my @hrefs;
+ foreach my $_pattern (@{ $self->patterns }) {
+ while ($content =~ s/.*?($_pattern)//) {
+ push @hrefs, $self->parse_href($1, $_pattern, $2);
+ }
+ }
+ $self->parse_result->{urlbase} = $self->parse_result->{base};
+ return @hrefs;
+}
+
+sub parse_href {
+ my ($self, $href, $_pattern, $match, $mangle) = @_;
+ $mangle //= 'uversionmangle';
+
+ my $mangled_version;
+ if ($self->watch_version == 2) {
+
+ # watch_version 2 only recognised one group; the code
+ # below will break version 2 watch files with a construction
+ # such as file-([\d\.]+(-\d+)?) (bug #327258)
+ $mangled_version
+ = ref $match eq 'ARRAY'
+ ? $match->[0]
+ : $match;
+ } else {
+ # need the map { ... } here to handle cases of (...)?
+ # which may match but then return undef values
+ if ($self->versionless) {
+
+ # exception, otherwise $mangled_version = 1
+ $mangled_version = '';
+ } else {
+ $mangled_version = join(".",
+ map { $_ if defined($_) }
+ ref $match eq 'ARRAY' ? @$match : $href =~ m&^$_pattern$&);
+ }
+
+ if (
+ mangle(
+ $self->watchfile, \$self->line,
+ "$mangle:", \@{ $self->$mangle },
+ \$mangled_version
+ )
+ ) {
+ return ();
+ }
+ }
+ $match = '';
+ if (defined $self->shared->{download_version}) {
+ if ($mangled_version eq $self->shared->{download_version}) {
+ $match = "matched with the download version";
+ }
+ }
+ my $priority = $mangled_version . '-' . get_priority($href);
+ return [$priority, $mangled_version, $href, $match];
+}
+
+1;