1 files changed, 434 insertions, 0 deletions
diff --git a/lib/Devscripts/Uscan/http.pm b/lib/Devscripts/Uscan/http.pm
new file mode 100644
index 0000000..95fc08a
--- /dev/null
+++ b/lib/Devscripts/Uscan/http.pm
@@ -0,0 +1,434 @@
+package Devscripts::Uscan::http;
+
+use strict;
+use Cwd qw/abs_path/;
+use Devscripts::Uscan::Output;
+use Devscripts::Uscan::Utils;
+use Devscripts::Uscan::_xtp;
+use Moo::Role;
+
+*http_newfile_base = \&Devscripts::Uscan::_xtp::_xtp_newfile_base;
+
+##################################
+# search $newversion (http mode)
+##################################
+sub http_search {
+    my ($self) = @_;
+
+    # $content: web page to be scraped to find the URLs to be downloaded
+    if (defined($1) and $self->downloader->ssl) {
+        uscan_die
+"you must have the liblwp-protocol-https-perl package installed\nto use https URLs";
+    }
+    uscan_verbose "Requesting URL:\n   $self->{parse_result}->{base}";
+    my $request = HTTP::Request->new('GET', $self->parse_result->{base});
+    my $response = $self->downloader->user_agent->request($request);
+    if (!$response->is_success) {
+        uscan_warn
+"In watchfile $self->{watchfile}, reading webpage\n  $self->{parse_result}->{base} failed: "
+          . $response->status_line;
+        return undef;
+    }
+
+    my @redirections = @{ $self->downloader->user_agent->get_redirections };
+
+    uscan_verbose "redirections: @redirections" if @redirections;
+
+    foreach my $_redir (@redirections) {
+        my $base_dir = $_redir;
+
+        $base_dir =~ s%^\w+://[^/]+/%/%;
+        if ($_redir =~ m%^(\w+://[^/]+)%) {
+            my $base_site = $1;
+
+            push @{ $self->patterns },
+                "(?:(?:$base_site)?"
+              . quotemeta($base_dir)
+              . ")?$self->{parse_result}->{filepattern}";
+            push @{ $self->sites },    $base_site;
+            push @{ $self->basedirs }, $base_dir;
+
+            # remove the filename, if any
+            my $base_dir_orig = $base_dir;
+            $base_dir =~ s%/[^/]*$%/%;
+            if ($base_dir ne $base_dir_orig) {
+                push @{ $self->patterns },
+                    "(?:(?:$base_site)?"
+                  . quotemeta($base_dir)
+                  . ")?$self->{parse_result}->{filepattern}";
+                push @{ $self->sites },    $base_site;
+                push @{ $self->basedirs }, $base_dir;
+            }
+        }
+    }
+
+    my $content = $response->decoded_content;
+    uscan_debug
+      "received content:\n$content\n[End of received content] by HTTP";
+
+    my @hrefs;
+    if (!$self->searchmode or $self->searchmode eq 'html') {
+        @hrefs = $self->html_search($content);
+    } elsif ($self->searchmode eq 'plain') {
+        @hrefs = $self->plain_search($content);
+    } else {
+        uscan_warn 'Unknown searchmode "' . $self->searchmode . '", skipping';
+        return undef;
+    }
+
+    if (@hrefs) {
+        @hrefs = Devscripts::Versort::versort(@hrefs);
+        my $msg
+          = "Found the following matching hrefs on the web page (newest first):\n";
+        foreach my $href (@hrefs) {
+            $msg .= "   $$href[2] ($$href[1]) index=$$href[0] $$href[3]\n";
+        }
+        uscan_verbose $msg;
+    }
+    my ($newversion, $newfile);
+    if (defined $self->shared->{download_version}) {
+
+        # extract ones which has $match in the above loop defined
+        my @vhrefs = grep { $$_[3] } @hrefs;
+        if (@vhrefs) {
+            (undef, $newversion, $newfile, undef) = @{ $vhrefs[0] };
+        } else {
+            uscan_warn
+"In $self->{watchfile} no matching hrefs for version $self->{shared}->{download_version}"
+              . " in watch line\n  $self->{line}";
+            return undef;
+        }
+    } else {
+        if (@hrefs) {
+            (undef, $newversion, $newfile, undef) = @{ $hrefs[0] };
+        } else {
+            uscan_warn
+"In $self->{watchfile} no matching files for watch line\n  $self->{line}";
+            return undef;
+        }
+    }
+    return ($newversion, $newfile);
+}
+
+#######################################################################
+# determine $upstream_url (http mode)
+#######################################################################
+# http is complicated due to absolute/relative URL issue
+sub http_upstream_url {
+    my ($self) = @_;
+    my $upstream_url;
+    my $newfile = $self->search_result->{newfile};
+    if ($newfile =~ m%^\w+://%) {
+        $upstream_url = $newfile;
+    } elsif ($newfile =~ m%^//%) {
+        $upstream_url = $self->parse_result->{site};
+        $upstream_url =~ s/^(https?:).*/$1/;
+        $upstream_url .= $newfile;
+    } elsif ($newfile =~ m%^/%) {
+
+        # absolute filename
+        # Were there any redirections? If so try using those first
+        if ($#{ $self->patterns } > 0) {
+
+            # replace $site here with the one we were redirected to
+            foreach my $index (0 .. $#{ $self->patterns }) {
+                if ("$self->{sites}->[$index]$newfile"
+                    =~ m&^$self->{patterns}->[$index]$&) {
+                    $upstream_url = "$self->{sites}->[$index]$newfile";
+                    last;
+                }
+            }
+            if (!defined($upstream_url)) {
+                uscan_verbose
+                  "Unable to determine upstream url from redirections,\n"
+                  . "defaulting to using site specified in watch file";
+                $upstream_url = "$self->{sites}->[0]$newfile";
+            }
+        } else {
+            $upstream_url = "$self->{sites}->[0]$newfile";
+        }
+    } else {
+        # relative filename, we hope
+        # Were there any redirections? If so try using those first
+        if ($#{ $self->patterns } > 0) {
+
+            # replace $site here with the one we were redirected to
+            foreach my $index (0 .. $#{ $self->patterns }) {
+
+                # skip unless the basedir looks like a directory
+                next unless $self->{basedirs}->[$index] =~ m%/$%;
+                my $nf = "$self->{basedirs}->[$index]$newfile";
+                if ("$self->{sites}->[$index]$nf"
+                    =~ m&^$self->{patterns}->[$index]$&) {
+                    $upstream_url = "$self->{sites}->[$index]$nf";
+                    last;
+                }
+            }
+            if (!defined($upstream_url)) {
+                uscan_verbose
+                  "Unable to determine upstream url from redirections,\n"
+                  . "defaulting to using site specified in watch file";
+                $upstream_url = "$self->{parse_result}->{urlbase}$newfile";
+            }
+        } else {
+            $upstream_url = "$self->{parse_result}->{urlbase}$newfile";
+        }
+    }
+
+    # mangle if necessary
+    $upstream_url =~ s/&amp;/&/g;
+    uscan_verbose "Matching target for downloadurlmangle: $upstream_url";
+    if (@{ $self->downloadurlmangle }) {
+        if (
+            mangle(
+                $self->watchfile,     \$self->line,
+                'downloadurlmangle:', \@{ $self->downloadurlmangle },
+                \$upstream_url
+            )
+        ) {
+            $self->status(1);
+            return undef;
+        }
+    }
+    return $upstream_url;
+}
+
+sub http_newdir {
+    my (
+        $https,     $downloader, $site,
+        $dir,       $pattern,    $dirversionmangle,
+        $watchfile, $lineptr,    $download_version
+    ) = @_;
+
+    my ($request, $response, $newdir);
+    my ($download_version_short1, $download_version_short2,
+        $download_version_short3)
+      = partial_version($download_version);
+    my $base = $site . $dir;
+
+    if (defined($https) and !$downloader->ssl) {
+        uscan_die
+"$progname: you must have the liblwp-protocol-https-perl package installed\n"
+          . "to use https URLs";
+    }
+    $request = HTTP::Request->new('GET', $base);
+    $response = $downloader->user_agent->request($request);
+    if (!$response->is_success) {
+        uscan_warn
+          "In watch file $watchfile, reading webpage\n  $base failed: "
+          . $response->status_line;
+        return '';
+    }
+
+    my $content = $response->content;
+    uscan_debug
+      "received content:\n$content\n[End of received content] by HTTP";
+
+    clean_content(\$content);
+
+    my $dirpattern = "(?:(?:$site)?" . quotemeta($dir) . ")?$pattern";
+
+    uscan_verbose "Matching pattern:\n   $dirpattern";
+    my @hrefs;
+    my $match = '';
+    while ($content =~ m/<\s*a\s+[^>]*href\s*=\s*([\"\'])(.*?)\1/gi) {
+        my $href = fix_href($2);
+        uscan_verbose "Matching target for dirversionmangle:   $href";
+        if ($href =~ m&^$dirpattern/?$&) {
+            my $mangled_version
+              = join(".", map { $_ // '' } $href =~ m&^$dirpattern/?$&);
+            if (
+                mangle(
+                    $watchfile,          $lineptr,
+                    'dirversionmangle:', \@{$dirversionmangle},
+                    \$mangled_version
+                )
+            ) {
+                return 1;
+            }
+            $match = '';
+            if (defined $download_version
+                and $mangled_version eq $download_version) {
+                $match = "matched with the download version";
+            }
+            if (defined $download_version_short3
+                and $mangled_version eq $download_version_short3) {
+                $match = "matched with the download version (partial 3)";
+            }
+            if (defined $download_version_short2
+                and $mangled_version eq $download_version_short2) {
+                $match = "matched with the download version (partial 2)";
+            }
+            if (defined $download_version_short1
+                and $mangled_version eq $download_version_short1) {
+                $match = "matched with the download version (partial 1)";
+            }
+            push @hrefs, [$mangled_version, $href, $match];
+        }
+    }
+
+    # extract ones which has $match in the above loop defined
+    my @vhrefs = grep { $$_[2] } @hrefs;
+    if (@vhrefs) {
+        @vhrefs = Devscripts::Versort::upstream_versort(@vhrefs);
+        $newdir = $vhrefs[0][1];
+    }
+    if (@hrefs) {
+        @hrefs = Devscripts::Versort::upstream_versort(@hrefs);
+        my $msg = "Found the following matching directories (newest first):\n";
+        foreach my $href (@hrefs) {
+            $msg .= "   $$href[1] ($$href[0]) $$href[2]\n";
+        }
+        uscan_verbose $msg;
+        $newdir //= $hrefs[0][1];
+    } else {
+        uscan_warn
+"In $watchfile,\n  no matching hrefs for pattern\n  $site$dir$pattern";
+        return '';
+    }
+
+    # just give the final directory component
+    $newdir =~ s%/$%%;
+    $newdir =~ s%^.*/%%;
+    return ($newdir);
+}
+
+# Nothing to clean here
+sub http_clean { 0 }
+
+sub clean_content {
+    my ($content) = @_;
+
+    # We need this horrid stuff to handle href=foo type
+    # links.  OK, bad HTML, but we have to handle it nonetheless.
+    # It's bug #89749.
+    $$content =~ s/href\s*=\s*(?=[^\"\'])([^\s>]+)/href="$1"/ig;
+
+    # Strip comments
+    $$content =~ s/<!-- .*?-->//sg;
+    return $content;
+}
+
+sub html_search {
+    my ($self, $content) = @_;
+
+    # pagenmangle: should not abuse this slow operation
+    if (
+        mangle(
+            $self->watchfile, \$self->line,
+            'pagemangle:\n', [@{ $self->pagemangle }],
+            \$content
+        )
+    ) {
+        return undef;
+    }
+    if (   !$self->shared->{bare}
+        and $content =~ m%^<[?]xml%i
+        and $content =~ m%xmlns="http://s3.amazonaws.com/doc/2006-03-01/"%
+        and $content !~ m%<Key><a\s+href%) {
+    # this is an S3 bucket listing.  Insert an 'a href' tag
+    # into the content for each 'Key', so that it looks like html (LP: #798293)
+        uscan_warn
+"*** Amazon AWS special case code is deprecated***\nUse opts=pagemangle rule, instead";
+        $content =~ s%<Key>([^<]*)</Key>%<Key><a href="$1">$1</a></Key>%g;
+        uscan_debug
+"processed content:\n$content\n[End of processed content] by Amazon AWS special case code";
+    }
+    clean_content(\$content);
+
+    # Is there a base URL given?
+    if ($content =~ /<\s*base\s+[^>]*href\s*=\s*([\"\'])(.*?)\1/i) {
+
+        # Ensure it ends with /
+        $self->parse_result->{urlbase} = "$2/";
+        $self->parse_result->{urlbase} =~ s%//$%/%;
+    } else {
+        # May have to strip a base filename
+        ($self->parse_result->{urlbase} = $self->parse_result->{base})
+          =~ s%/[^/]*$%/%;
+    }
+    uscan_debug
+"processed content:\n$content\n[End of processed content] by fix bad HTML code";
+
+# search hrefs in web page to obtain a list of uversionmangled version and matching download URL
+    {
+        local $, = ',';
+        uscan_verbose "Matching pattern:\n   @{$self->{patterns}}";
+    }
+    my @hrefs;
+    while ($content =~ m/<\s*a\s+[^>]*(?<=\s)href\s*=\s*([\"\'])(.*?)\1/sgi) {
+        my $href = $2;
+        $href = fix_href($href);
+        if (defined $self->hrefdecode) {
+            if ($self->hrefdecode eq 'percent-encoding') {
+                uscan_debug "... Decoding from href: $href";
+                $href =~ s/%([A-Fa-f\d]{2})/chr hex $1/eg;
+            } else {
+                uscan_warn "Illegal value for hrefdecode: "
+                  . "$self->{hrefdecode}";
+                return undef;
+            }
+        }
+        uscan_debug "Checking href $href";
+        foreach my $_pattern (@{ $self->patterns }) {
+            if ($href =~ /^$_pattern$/) {
+                push @hrefs, $self->parse_href($href, $_pattern, $1);
+            }
+        }
+    }
+    return @hrefs;
+}
+
+sub plain_search {
+    my ($self, $content) = @_;
+    my @hrefs;
+    foreach my $_pattern (@{ $self->patterns }) {
+        while ($content =~ s/.*?($_pattern)//) {
+            push @hrefs, $self->parse_href($1, $_pattern, $2);
+        }
+    }
+    return @hrefs;
+}
+
+sub parse_href {
+    my ($self, $href, $_pattern, $match) = @_;
+    my $mangled_version;
+    if ($self->watch_version == 2) {
+
+        # watch_version 2 only recognised one group; the code
+        # below will break version 2 watch files with a construction
+        # such as file-([\d\.]+(-\d+)?) (bug #327258)
+        $mangled_version = $match;
+    } else {
+        # need the map { ... } here to handle cases of (...)?
+        # which may match but then return undef values
+        if ($self->versionless) {
+
+            # exception, otherwise $mangled_version = 1
+            $mangled_version = '';
+        } else {
+            $mangled_version
+              = join(".", map { $_ if defined($_) } $href =~ m&^$_pattern$&);
+        }
+
+        if (
+            mangle(
+                $self->watchfile,  \$self->line,
+                'uversionmangle:', \@{ $self->uversionmangle },
+                \$mangled_version
+            )
+        ) {
+            return ();
+        }
+    }
+    $match = '';
+    if (defined $self->shared->{download_version}) {
+        if ($mangled_version eq $self->shared->{download_version}) {
+            $match = "matched with the download version";
+        }
+    }
+    my $priority = $mangled_version . '-' . get_priority($href);
+    return [$priority, $mangled_version, $href, $match];
+}
+
+1;