Adding upstream version 1.21.3.upstream/1.21.3 upstream

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-07 17:04:52 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-07 17:04:52 +0000
commit: 5e03c718f4e7ff13cb6834eda737c269ebed02ad (patch)
tree: bfad3f5be123f000fdb03e26400050dece33d72f /src/url.c
parent: Initial commit. (diff)
download: wget-upstream.tar.xz
wget-upstream.zip
1 files changed, 2535 insertions, 0 deletions
diff --git a/src/url.c b/src/url.c
new file mode 100644
index 0000000..5dfb91a
--- /dev/null
+++ b/src/url.c
@@ -0,0 +1,2535 @@
+/* URL handling.
+   Copyright (C) 1996-2011, 2015, 2018-2022 Free Software Foundation,
+   Inc.
+
+This file is part of GNU Wget.
+
+GNU Wget is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at
+your option) any later version.
+
+GNU Wget is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with Wget.  If not, see <http://www.gnu.org/licenses/>.
+
+Additional permission under GNU GPL version 3 section 7
+
+If you modify this program, or any covered work, by linking or
+combining it with the OpenSSL project's OpenSSL library (or a
+modified version of that library), containing parts covered by the
+terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
+grants you additional permission to convey the resulting work.
+Corresponding Source for a non-source form of such a combination
+shall include the source code for the parts of OpenSSL used as well
+as that of the covered work.  */
+
+#include "wget.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <assert.h>
+
+#include "utils.h"
+#include "url.h"
+#include "host.h"  /* for is_valid_ipv6_address */
+#include "c-strcase.h"
+
+#ifdef HAVE_ICONV
+# include <iconv.h>
+#endif
+#include <langinfo.h>
+
+#ifdef __VMS
+#include "vms.h"
+#endif /* def __VMS */
+
+#ifdef TESTING
+#include "../tests/unit-tests.h"
+#endif
+
+enum {
+  scm_disabled = 1,             /* for https when OpenSSL fails to init. */
+  scm_has_params = 2,           /* whether scheme has ;params */
+  scm_has_query = 4,            /* whether scheme has ?query */
+  scm_has_fragment = 8          /* whether scheme has #fragment */
+};
+
+struct scheme_data
+{
+  /* Short name of the scheme, such as "http" or "ftp". */
+  const char *name;
+  /* Leading string that identifies the scheme, such as "https://". */
+  const char *leading_string;
+  /* Default port of the scheme when none is specified. */
+  int default_port;
+  /* Various flags. */
+  int flags;
+};
+
+/* Supported schemes: */
+static struct scheme_data supported_schemes[] =
+{
+  { "http",     "http://",  DEFAULT_HTTP_PORT,  scm_has_query|scm_has_fragment },
+#ifdef HAVE_SSL
+  { "https",    "https://", DEFAULT_HTTPS_PORT, scm_has_query|scm_has_fragment },
+#endif
+  { "ftp",      "ftp://",   DEFAULT_FTP_PORT,   scm_has_params|scm_has_fragment },
+#ifdef HAVE_SSL
+  /*
+   * Explicit FTPS uses the same port as FTP.
+   * Implicit FTPS has its own port (990), but it is disabled by default.
+   */
+  { "ftps",     "ftps://",  DEFAULT_FTP_PORT,  scm_has_params|scm_has_fragment },
+#endif
+
+  /* SCHEME_INVALID */
+  { NULL,       NULL,       -1,                 0 }
+};
+
+/* Forward declarations: */
+
+static bool path_simplify (enum url_scheme, char *);
+
+/* Support for escaping and unescaping of URL strings.  */
+
+/* Table of "reserved" and "unsafe" characters.  Those terms are
+   rfc1738-speak, as such largely obsoleted by rfc2396 and later
+   specs, but the general idea remains.
+
+   A reserved character is the one that you can't decode without
+   changing the meaning of the URL.  For example, you can't decode
+   "/foo/%2f/bar" into "/foo///bar" because the number and contents of
+   path components is different.  Non-reserved characters can be
+   changed, so "/foo/%78/bar" is safe to change to "/foo/x/bar".  The
+   unsafe characters are loosely based on rfc1738, plus "$" and ",",
+   as recommended by rfc2396, and minus "~", which is very frequently
+   used (and sometimes unrecognized as %7E by broken servers).
+
+   An unsafe character is the one that should be encoded when URLs are
+   placed in foreign environments.  E.g. space and newline are unsafe
+   in HTTP contexts because HTTP uses them as separator and line
+   terminator, so they must be encoded to %20 and %0A respectively.
+   "*" is unsafe in shell context, etc.
+
+   We determine whether a character is unsafe through static table
+   lookup.  This code assumes ASCII character set and 8-bit chars.  */
+
+enum {
+  /* rfc1738 reserved chars + "$" and ",".  */
+  urlchr_reserved = 1,
+
+  /* rfc1738 unsafe chars, plus non-printables.  */
+  urlchr_unsafe   = 2
+};
+
+#define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
+#define URL_RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
+#define URL_UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
+
+/* Shorthands for the table: */
+#define R  urlchr_reserved
+#define U  urlchr_unsafe
+#define RU R|U
+
+static const unsigned char urlchr_table[256] =
+{
+  U,  U,  U,  U,   U,  U,  U,  U,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
+  U,  U,  U,  U,   U,  U,  U,  U,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
+  U,  U,  U,  U,   U,  U,  U,  U,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
+  U,  U,  U,  U,   U,  U,  U,  U,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
+  U,  0,  U, RU,   R,  U,  R,  0,   /* SP  !   "   #    $   %   &   '   */
+  0,  0,  0,  R,   R,  0,  0,  R,   /* (   )   *   +    ,   -   .   /   */
+  0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
+  0,  0, RU,  R,   U,  R,  U,  R,   /* 8   9   :   ;    <   =   >   ?   */
+ RU,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
+  0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
+  0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
+  0,  0,  0, RU,   U, RU,  U,  0,   /* X   Y   Z   [    \   ]   ^   _   */
+  U,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
+  0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
+  0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
+  0,  0,  0,  U,   U,  U,  0,  U,   /* x   y   z   {    |   }   ~   DEL */
+
+  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
+  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
+  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
+  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
+
+  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
+  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
+  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
+  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
+};
+#undef R
+#undef U
+#undef RU
+
+static void
+url_unescape_1 (char *s, unsigned char mask)
+{
+  unsigned char *t = (unsigned char *) s; /* t - tortoise */
+  unsigned char *h = (unsigned char *) s; /* h - hare     */
+
+  for (; *h; h++, t++)
+    {
+      if (*h != '%')
+        {
+        copychar:
+          *t = *h;
+        }
+      else
+        {
+          unsigned char c;
+          /* Do nothing if '%' is not followed by two hex digits. */
+          if (!h[1] || !h[2] || !(c_isxdigit (h[1]) && c_isxdigit (h[2])))
+            goto copychar;
+          c = X2DIGITS_TO_NUM (h[1], h[2]);
+          if (urlchr_test(c, mask))
+            goto copychar;
+          /* Don't unescape %00 because there is no way to insert it
+             into a C string without effectively truncating it. */
+          if (c == '\0')
+            goto copychar;
+          *t = c;
+          h += 2;
+        }
+    }
+  *t = '\0';
+}
+
+/* URL-unescape the string S.
+
+   This is done by transforming the sequences "%HH" to the character
+   represented by the hexadecimal digits HH.  If % is not followed by
+   two hexadecimal digits, it is inserted literally.
+
+   The transformation is done in place.  If you need the original
+   string intact, make a copy before calling this function.  */
+void
+url_unescape (char *s)
+{
+  url_unescape_1 (s, 0);
+}
+
+/* URL-unescape the string S.
+
+   This functions behaves identically as url_unescape(), but does not
+   convert characters from "reserved". In other words, it only converts
+   "unsafe" characters.  */
+void
+url_unescape_except_reserved (char *s)
+{
+  url_unescape_1 (s, urlchr_reserved);
+}
+
+/* The core of url_escape_* functions.  Escapes the characters that
+   match the provided mask in urlchr_table.
+
+   If ALLOW_PASSTHROUGH is true, a string with no unsafe chars will be
+   returned unchanged.  If ALLOW_PASSTHROUGH is false, a freshly
+   allocated string will be returned in all cases.  */
+
+static char *
+url_escape_1 (const char *s, unsigned char mask, bool allow_passthrough)
+{
+  const char *p1;
+  char *p2, *newstr;
+  int newlen;
+  int addition = 0;
+
+  for (p1 = s; *p1; p1++)
+    if (urlchr_test (*p1, mask))
+      addition += 2;            /* Two more characters (hex digits) */
+
+  if (!addition)
+    return allow_passthrough ? (char *)s : xstrdup (s);
+
+  newlen = (p1 - s) + addition;
+  newstr = xmalloc (newlen + 1);
+
+  p1 = s;
+  p2 = newstr;
+  while (*p1)
+    {
+      /* Quote the characters that match the test mask. */
+      if (urlchr_test (*p1, mask))
+        {
+          unsigned char c = *p1++;
+          *p2++ = '%';
+          *p2++ = XNUM_TO_DIGIT (c >> 4);
+          *p2++ = XNUM_TO_DIGIT (c & 0xf);
+        }
+      else
+        *p2++ = *p1++;
+    }
+  assert (p2 - newstr == newlen);
+  *p2 = '\0';
+
+  return newstr;
+}
+
+/* URL-escape the unsafe characters (see urlchr_table) in a given
+   string, returning a freshly allocated string.  */
+
+char *
+url_escape (const char *s)
+{
+  return url_escape_1 (s, urlchr_unsafe, false);
+}
+
+/* URL-escape the unsafe and reserved characters (see urlchr_table) in
+   a given string, returning a freshly allocated string.  */
+
+char *
+url_escape_unsafe_and_reserved (const char *s)
+{
+  return url_escape_1 (s, urlchr_unsafe|urlchr_reserved, false);
+}
+
+/* URL-escape the unsafe characters (see urlchr_table) in a given
+   string.  If no characters are unsafe, S is returned.  */
+
+static char *
+url_escape_allow_passthrough (const char *s)
+{
+  return url_escape_1 (s, urlchr_unsafe, true);
+}
+
+/* Decide whether the char at position P needs to be encoded.  (It is
+   not enough to pass a single char *P because the function may need
+   to inspect the surrounding context.)
+
+   Return true if the char should be escaped as %XX, false otherwise.  */
+
+static inline bool
+char_needs_escaping (const char *p)
+{
+  if (*p == '%')
+    {
+      if (c_isxdigit (*(p + 1)) && c_isxdigit (*(p + 2)))
+        return false;
+      else
+        /* Garbled %.. sequence: encode `%'. */
+        return true;
+    }
+  else if (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p))
+    return true;
+  else
+    return false;
+}
+
+/* Translate a %-escaped (but possibly non-conformant) input string S
+   into a %-escaped (and conformant) output string.  If no characters
+   are encoded or decoded, return the same string S; otherwise, return
+   a freshly allocated string with the new contents.
+
+   After a URL has been run through this function, the protocols that
+   use `%' as the quote character can use the resulting string as-is,
+   while those that don't can use url_unescape to get to the intended
+   data.  This function is stable: once the input is transformed,
+   further transformations of the result yield the same output.
+
+   Let's discuss why this function is needed.
+
+   Imagine Wget is asked to retrieve `http://abc.xyz/abc def'.  Since
+   a raw space character would mess up the HTTP request, it needs to
+   be quoted, like this:
+
+       GET /abc%20def HTTP/1.0
+
+   It would appear that the unsafe chars need to be quoted, for
+   example with url_escape.  But what if we're requested to download
+   `abc%20def'?  url_escape transforms "%" to "%25", which would leave
+   us with `abc%2520def'.  This is incorrect -- since %-escapes are
+   part of URL syntax, "%20" is the correct way to denote a literal
+   space on the Wget command line.  This leads to the conclusion that
+   in that case Wget should not call url_escape, but leave the `%20'
+   as is.  This is clearly contradictory, but it only gets worse.
+
+   What if the requested URI is `abc%20 def'?  If we call url_escape,
+   we end up with `/abc%2520%20def', which is almost certainly not
+   intended.  If we don't call url_escape, we are left with the
+   embedded space and cannot complete the request.  What the user
+   meant was for Wget to request `/abc%20%20def', and this is where
+   reencode_escapes kicks in.
+
+   Wget used to solve this by first decoding %-quotes, and then
+   encoding all the "unsafe" characters found in the resulting string.
+   This was wrong because it didn't preserve certain URL special
+   (reserved) characters.  For instance, URI containing "a%2B+b" (0x2b
+   == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
+   whether we considered `+' reserved (it is).  One of these results
+   is inevitable because by the second step we would lose information
+   on whether the `+' was originally encoded or not.  Both results
+   were wrong because in CGI parameters + means space, while %2B means
+   literal plus.  reencode_escapes correctly translates the above to
+   "a%2B+b", i.e. returns the original string.
+
+   This function uses a modified version of the algorithm originally
+   proposed by Anon Sricharoenchai:
+
+   * Encode all "unsafe" characters, except those that are also
+     "reserved", to %XX.  See urlchr_table for which characters are
+     unsafe and reserved.
+
+   * Encode the "%" characters not followed by two hex digits to
+     "%25".
+
+   * Pass through all other characters and %XX escapes as-is.  (Up to
+     Wget 1.10 this decoded %XX escapes corresponding to "safe"
+     characters, but that was obtrusive and broke some servers.)
+
+   Anon's test case:
+
+   "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
+   ->
+   "http://abc.xyz/%20%3F%25%36%31%25aa%25%20a?a=%61+a%2Ba&b=b%26c%3Dc"
+
+   Simpler test cases:
+
+   "foo bar"         -> "foo%20bar"
+   "foo%20bar"       -> "foo%20bar"
+   "foo %20bar"      -> "foo%20%20bar"
+   "foo%%20bar"      -> "foo%25%20bar"       (0x25 == '%')
+   "foo%25%20bar"    -> "foo%25%20bar"
+   "foo%2%20bar"     -> "foo%252%20bar"
+   "foo+bar"         -> "foo+bar"            (plus is reserved!)
+   "foo%2b+bar"      -> "foo%2b+bar"  */
+
+static char *
+reencode_escapes (const char *s)
+{
+  const char *p1;
+  char *newstr, *p2;
+  int oldlen, newlen;
+
+  int encode_count = 0;
+
+  /* First pass: inspect the string to see if there's anything to do,
+     and to calculate the new length.  */
+  for (p1 = s; *p1; p1++)
+    if (char_needs_escaping (p1))
+      ++encode_count;
+
+  if (!encode_count)
+    /* The string is good as it is. */
+    return (char *) s;          /* C const model sucks. */
+
+  oldlen = p1 - s;
+  /* Each encoding adds two characters (hex digits).  */
+  newlen = oldlen + 2 * encode_count;
+  newstr = xmalloc (newlen + 1);
+
+  /* Second pass: copy the string to the destination address, encoding
+     chars when needed.  */
+  p1 = s;
+  p2 = newstr;
+
+  while (*p1)
+    if (char_needs_escaping (p1))
+      {
+        unsigned char c = *p1++;
+        *p2++ = '%';
+        *p2++ = XNUM_TO_DIGIT (c >> 4);
+        *p2++ = XNUM_TO_DIGIT (c & 0xf);
+      }
+    else
+      *p2++ = *p1++;
+
+  *p2 = '\0';
+  assert (p2 - newstr == newlen);
+  return newstr;
+}
+
+/* Returns the scheme type if the scheme is supported, or
+   SCHEME_INVALID if not.  */
+
+enum url_scheme
+url_scheme (const char *url)
+{
+  int i;
+
+  for (i = 0; supported_schemes[i].leading_string; i++)
+    if (0 == c_strncasecmp (url, supported_schemes[i].leading_string,
+                          strlen (supported_schemes[i].leading_string)))
+      {
+        if (!(supported_schemes[i].flags & scm_disabled))
+          return (enum url_scheme) i;
+        else
+          return SCHEME_INVALID;
+      }
+
+  return SCHEME_INVALID;
+}
+
+#define SCHEME_CHAR(ch) (c_isalnum (ch) || (ch) == '-' || (ch) == '+')
+
+/* Return 1 if the URL begins with any "scheme", 0 otherwise.  As
+   currently implemented, it returns true if URL begins with
+   [-+a-zA-Z0-9]+: .  */
+
+bool
+url_has_scheme (const char *url)
+{
+  const char *p = url;
+
+  /* The first char must be a scheme char. */
+  if (!*p || !SCHEME_CHAR (*p))
+    return false;
+  ++p;
+  /* Followed by 0 or more scheme chars. */
+  while (*p && SCHEME_CHAR (*p))
+    ++p;
+  /* Terminated by ':'. */
+  return *p == ':';
+}
+
+bool
+url_valid_scheme (const char *url)
+{
+  enum url_scheme scheme = url_scheme (url);
+  return scheme != SCHEME_INVALID;
+}
+
+int
+scheme_default_port (enum url_scheme scheme)
+{
+  return supported_schemes[scheme].default_port;
+}
+
+void
+scheme_disable (enum url_scheme scheme)
+{
+  supported_schemes[scheme].flags |= scm_disabled;
+}
+
+const char *
+scheme_leading_string (enum url_scheme scheme)
+{
+  return supported_schemes[scheme].leading_string;
+}
+
+/* Skip the username and password, if present in the URL.  The
+   function should *not* be called with the complete URL, but with the
+   portion after the scheme.
+
+   If no username and password are found, return URL.  */
+
+static const char *
+url_skip_credentials (const char *url)
+{
+  /* Look for '@' that comes before terminators, such as '/', '?',
+     '#', or ';'.  */
+  const char *p = (const char *)strpbrk (url, "@/?#;");
+  if (!p || *p != '@')
+    return url;
+  return p + 1;
+}
+
+/* Parse credentials contained in [BEG, END).  The region is expected
+   to have come from a URL and is unescaped.  */
+
+static bool
+parse_credentials (const char *beg, const char *end, char **user, char **passwd)
+{
+  char *colon;
+  const char *userend;
+
+  if (beg == end)
+    return false;               /* empty user name */
+
+  colon = memchr (beg, ':', end - beg);
+  if (colon == beg)
+    return false;               /* again empty user name */
+
+  if (colon)
+    {
+      *passwd = strdupdelim (colon + 1, end);
+      userend = colon;
+      url_unescape (*passwd);
+    }
+  else
+    {
+      *passwd = NULL;
+      userend = end;
+    }
+  *user = strdupdelim (beg, userend);
+  url_unescape (*user);
+  return true;
+}
+
+/* Used by main.c: detect URLs written using the "shorthand" URL forms
+   originally popularized by Netscape and NcFTP.  HTTP shorthands look
+   like this:
+
+   www.foo.com[:port]/dir/file   -> http://www.foo.com[:port]/dir/file
+   www.foo.com[:port]            -> http://www.foo.com[:port]
+
+   FTP shorthands look like this:
+
+   foo.bar.com:dir/file          -> ftp://foo.bar.com/dir/file
+   foo.bar.com:/absdir/file      -> ftp://foo.bar.com//absdir/file
+
+   If the URL needs not or cannot be rewritten, return NULL.  */
+
+char *
+rewrite_shorthand_url (const char *url)
+{
+  const char *p;
+  char *ret;
+
+  if (url_scheme (url) != SCHEME_INVALID)
+    return NULL;
+
+  /* Look for a ':' or '/'.  The former signifies NcFTP syntax, the
+     latter Netscape.  */
+  p = strpbrk (url, ":/");
+  if (p == url)
+    return NULL;
+
+  /* If we're looking at "://", it means the URL uses a scheme we
+     don't support, which may include "https" when compiled without
+     SSL support.  Don't bogusly rewrite such URLs.  */
+  if (p && p[0] == ':' && p[1] == '/' && p[2] == '/')
+    return NULL;
+
+  if (p && *p == ':')
+    {
+      /* Colon indicates ftp, as in foo.bar.com:path.  Check for
+         special case of http port number ("localhost:10000").  */
+      int digits = strspn (p + 1, "0123456789");
+      if (digits && (p[1 + digits] == '/' || p[1 + digits] == '\0'))
+        goto http;
+
+      /* Turn "foo.bar.com:path" to "ftp://foo.bar.com/path". */
+      if ((ret = aprintf ("ftp://%s", url)) != NULL)
+        ret[6 + (p - url)] = '/';
+    }
+  else
+    {
+    http:
+      /* Just prepend "http://" to URL. */
+      ret = aprintf ("http://%s", url);
+    }
+  return ret;
+}
+
+static void split_path (const char *, char **, char **);
+
+/* Like strpbrk, with the exception that it returns the pointer to the
+   terminating zero (end-of-string aka "eos") if no matching character
+   is found.  */
+
+static inline char *
+strpbrk_or_eos (const char *s, const char *accept)
+{
+  char *p = strpbrk (s, accept);
+  if (!p)
+    p = strchr (s, '\0');
+  return p;
+}
+
+/* Turn STR into lowercase; return true if a character was actually
+   changed. */
+
+static bool
+lowercase_str (char *str)
+{
+  bool changed = false;
+  for (; *str; str++)
+    if (c_isupper (*str))
+      {
+        changed = true;
+        *str = c_tolower (*str);
+      }
+  return changed;
+}
+
+static const char *
+init_seps (enum url_scheme scheme)
+{
+  static char seps[8] = ":/";
+  char *p = seps + 2;
+  int flags = supported_schemes[scheme].flags;
+
+  if (flags & scm_has_params)
+    *p++ = ';';
+  if (flags & scm_has_query)
+    *p++ = '?';
+  if (flags & scm_has_fragment)
+    *p++ = '#';
+  *p = '\0';
+  return seps;
+}
+
+static const char *parse_errors[] = {
+#define PE_NO_ERROR                     0
+  N_("No error"),
+#define PE_UNSUPPORTED_SCHEME           1
+  N_("Unsupported scheme %s"), /* support for format token only here */
+#define PE_MISSING_SCHEME               2
+  N_("Scheme missing"),
+#define PE_INVALID_HOST_NAME            3
+  N_("Invalid host name"),
+#define PE_BAD_PORT_NUMBER              4
+  N_("Bad port number"),
+#define PE_INVALID_USER_NAME            5
+  N_("Invalid user name"),
+#define PE_UNTERMINATED_IPV6_ADDRESS    6
+  N_("Unterminated IPv6 numeric address"),
+#define PE_IPV6_NOT_SUPPORTED           7
+  N_("IPv6 addresses not supported"),
+#define PE_INVALID_IPV6_ADDRESS         8
+  N_("Invalid IPv6 numeric address")
+};
+
+/* Parse a URL.
+
+   Return a new struct url if successful, NULL on error.  In case of
+   error, and if ERROR is not NULL, also set *ERROR to the appropriate
+   error code. */
+struct url *
+url_parse (const char *url, int *error, struct iri *iri, bool percent_encode)
+{
+  struct url *u;
+  const char *p;
+  bool path_modified, host_modified;
+
+  enum url_scheme scheme;
+  const char *seps;
+
+  const char *uname_b,     *uname_e;
+  const char *host_b,      *host_e;
+  const char *path_b,      *path_e;
+  const char *params_b,    *params_e;
+  const char *query_b,     *query_e;
+  const char *fragment_b,  *fragment_e;
+
+  int port;
+  char *user = NULL, *passwd = NULL;
+
+  const char *url_encoded = NULL;
+
+  int error_code;
+
+  scheme = url_scheme (url);
+  if (scheme == SCHEME_INVALID)
+    {
+      if (url_has_scheme (url))
+        error_code = PE_UNSUPPORTED_SCHEME;
+      else
+        error_code = PE_MISSING_SCHEME;
+      goto error;
+    }
+
+  url_encoded = url;
+
+  if (iri && iri->utf8_encode)
+    {
+      char *new_url = NULL;
+
+      iri->utf8_encode = remote_to_utf8 (iri, iri->orig_url ? iri->orig_url : url, &new_url);
+      if (!iri->utf8_encode)
+        new_url = NULL;
+      else
+        {
+          xfree (iri->orig_url);
+          iri->orig_url = xstrdup (url);
+          url_encoded = reencode_escapes (new_url);
+          if (url_encoded != new_url)
+            xfree (new_url);
+          percent_encode = false;
+        }
+    }
+
+  if (percent_encode)
+    url_encoded = reencode_escapes (url);
+
+  p = url_encoded;
+  p += strlen (supported_schemes[scheme].leading_string);
+  uname_b = p;
+  p = url_skip_credentials (p);
+  uname_e = p;
+
+  /* scheme://user:pass@host[:port]... */
+  /*                    ^              */
+
+  /* We attempt to break down the URL into the components path,
+     params, query, and fragment.  They are ordered like this:
+
+       scheme://host[:port][/path][;params][?query][#fragment]  */
+
+  path_b     = path_e     = NULL;
+  params_b   = params_e   = NULL;
+  query_b    = query_e    = NULL;
+  fragment_b = fragment_e = NULL;
+
+  /* Initialize separators for optional parts of URL, depending on the
+     scheme.  For example, FTP has params, and HTTP and HTTPS have
+     query string and fragment. */
+  seps = init_seps (scheme);
+
+  host_b = p;
+
+  if (*p == '[')
+    {
+      /* Handle IPv6 address inside square brackets.  Ideally we'd
+         just look for the terminating ']', but rfc2732 mandates
+         rejecting invalid IPv6 addresses.  */
+
+      /* The address begins after '['. */
+      host_b = p + 1;
+      host_e = strchr (host_b, ']');
+
+      if (!host_e)
+        {
+          error_code = PE_UNTERMINATED_IPV6_ADDRESS;
+          goto error;
+        }
+
+#ifdef ENABLE_IPV6
+      /* Check if the IPv6 address is valid. */
+      if (!is_valid_ipv6_address(host_b, host_e))
+        {
+          error_code = PE_INVALID_IPV6_ADDRESS;
+          goto error;
+        }
+
+      /* Continue parsing after the closing ']'. */
+      p = host_e + 1;
+#else
+      error_code = PE_IPV6_NOT_SUPPORTED;
+      goto error;
+#endif
+
+      /* The closing bracket must be followed by a separator or by the
+         null char.  */
+      /* http://[::1]... */
+      /*             ^   */
+      if (!strchr (seps, *p))
+        {
+          /* Trailing garbage after []-delimited IPv6 address. */
+          error_code = PE_INVALID_HOST_NAME;
+          goto error;
+        }
+    }
+  else
+    {
+      p = strpbrk_or_eos (p, seps);
+      host_e = p;
+    }
+  ++seps;                       /* advance to '/' */
+
+  if (host_b == host_e)
+    {
+      error_code = PE_INVALID_HOST_NAME;
+      goto error;
+    }
+
+  port = scheme_default_port (scheme);
+  if (*p == ':')
+    {
+      const char *port_b, *port_e, *pp;
+
+      /* scheme://host:port/tralala */
+      /*              ^             */
+      ++p;
+      port_b = p;
+      p = strpbrk_or_eos (p, seps);
+      port_e = p;
+
+      /* Allow empty port, as per rfc2396. */
+      if (port_b != port_e)
+        for (port = 0, pp = port_b; pp < port_e; pp++)
+          {
+            if (!c_isdigit (*pp))
+              {
+                /* http://host:12randomgarbage/blah */
+                /*               ^                  */
+                error_code = PE_BAD_PORT_NUMBER;
+                goto error;
+              }
+            port = 10 * port + (*pp - '0');
+            /* Check for too large port numbers here, before we have
+               a chance to overflow on bogus port values.  */
+            if (port > 0xffff)
+              {
+                error_code = PE_BAD_PORT_NUMBER;
+                goto error;
+              }
+          }
+    }
+  /* Advance to the first separator *after* '/' (either ';' or '?',
+     depending on the scheme).  */
+  ++seps;
+
+  /* Get the optional parts of URL, each part being delimited by
+     current location and the position of the next separator.  */
+#define GET_URL_PART(sepchar, var) do {                         \
+  if (*p == sepchar)                                            \
+    var##_b = ++p, var##_e = p = strpbrk_or_eos (p, seps);      \
+  ++seps;                                                       \
+} while (0)
+
+  GET_URL_PART ('/', path);
+  if (supported_schemes[scheme].flags & scm_has_params)
+    GET_URL_PART (';', params);
+  if (supported_schemes[scheme].flags & scm_has_query)
+    GET_URL_PART ('?', query);
+  if (supported_schemes[scheme].flags & scm_has_fragment)
+    GET_URL_PART ('#', fragment);
+
+#undef GET_URL_PART
+  assert (*p == 0);
+
+  if (uname_b != uname_e)
+    {
+      /* http://user:pass@host */
+      /*        ^         ^    */
+      /*     uname_b   uname_e */
+      if (!parse_credentials (uname_b, uname_e - 1, &user, &passwd))
+        {
+          error_code = PE_INVALID_USER_NAME;
+          goto error;
+        }
+    }
+
+  u = xnew0 (struct url);
+  u->scheme = scheme;
+  u->host   = strdupdelim (host_b, host_e);
+  u->port   = port;
+  u->user   = user;
+  u->passwd = passwd;
+
+  u->path = strdupdelim (path_b, path_e);
+  path_modified = path_simplify (scheme, u->path);
+  split_path (u->path, &u->dir, &u->file);
+
+  host_modified = lowercase_str (u->host);
+
+  /* Decode %HH sequences in host name.  This is important not so much
+     to support %HH sequences in host names (which other browser
+     don't), but to support binary characters (which will have been
+     converted to %HH by reencode_escapes).  */
+  if (strchr (u->host, '%'))
+    {
+      url_unescape (u->host);
+      host_modified = true;
+
+      /* check for invalid control characters in host name */
+      for (p = u->host; *p; p++)
+        {
+          if (c_iscntrl(*p))
+            {
+              url_free(u);
+              error_code = PE_INVALID_HOST_NAME;
+              goto error;
+            }
+        }
+
+      /* Apply IDNA regardless of iri->utf8_encode status */
+      if (opt.enable_iri && iri)
+        {
+          char *new = idn_encode (iri, u->host);
+          if (new)
+            {
+              xfree (u->host);
+              u->host = new;
+              host_modified = true;
+            }
+        }
+    }
+
+  if (params_b)
+    u->params = strdupdelim (params_b, params_e);
+  if (query_b)
+    u->query = strdupdelim (query_b, query_e);
+  if (fragment_b)
+    u->fragment = strdupdelim (fragment_b, fragment_e);
+
+  if (opt.enable_iri || path_modified || u->fragment || host_modified || path_b == path_e)
+    {
+      /* If we suspect that a transformation has rendered what
+         url_string might return different from URL_ENCODED, rebuild
+         u->url using url_string.  */
+      u->url = url_string (u, URL_AUTH_SHOW);
+
+      if (url_encoded != url)
+        xfree (url_encoded);
+    }
+  else
+    {
+      if (url_encoded == url)
+        u->url = xstrdup (url);
+      else
+        u->url = (char *) url_encoded;
+    }
+
+  return u;
+
+ error:
+  /* Cleanup in case of error: */
+  if (url_encoded && url_encoded != url)
+    xfree (url_encoded);
+
+  /* Transmit the error code to the caller, if the caller wants to
+     know.  */
+  if (error)
+    *error = error_code;
+  return NULL;
+}
+
+/* Return the error message string from ERROR_CODE, which should have
+   been retrieved from url_parse.  The error message is translated.  */
+
+char *
+url_error (const char *url, int error_code)
+{
+  assert (error_code >= 0 && ((size_t) error_code) < countof (parse_errors));
+
+  if (error_code == PE_UNSUPPORTED_SCHEME)
+    {
+      char *error, *p;
+      char *scheme = xstrdup (url);
+      assert (url_has_scheme (url));
+
+      if ((p = strchr (scheme, ':')))
+        *p = '\0';
+      if (!c_strcasecmp (scheme, "https"))
+        error = aprintf (_("HTTPS support not compiled in"));
+      else
+        error = aprintf (_(parse_errors[error_code]), quote (scheme));
+      xfree (scheme);
+
+      return error;
+    }
+  else
+    return xstrdup (_(parse_errors[error_code]));
+}
+
+/* Split PATH into DIR and FILE.  PATH comes from the URL and is
+   expected to be URL-escaped.
+
+   The path is split into directory (the part up to the last slash)
+   and file (the part after the last slash), which are subsequently
+   unescaped.  Examples:
+
+   PATH                 DIR           FILE
+   "foo/bar/baz"        "foo/bar"     "baz"
+   "foo/bar/"           "foo/bar"     ""
+   "foo"                ""            "foo"
+   "foo/bar/baz%2fqux"  "foo/bar"     "baz/qux" (!)
+
+   DIR and FILE are freshly allocated.  */
+
+static void
+split_path (const char *path, char **dir, char **file)
+{
+  char *last_slash = strrchr (path, '/');
+  if (!last_slash)
+    {
+      *dir = xstrdup ("");
+      *file = xstrdup (path);
+    }
+  else
+    {
+      *dir = strdupdelim (path, last_slash);
+      *file = xstrdup (last_slash + 1);
+    }
+  url_unescape (*dir);
+  url_unescape (*file);
+}
+
+/* Note: URL's "full path" is the path with the query string and
+   params appended.  The "fragment" (#foo) is intentionally ignored,
+   but that might be changed.  For example, if the original URL was
+   "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
+   the full path will be "/foo/bar/baz;bullshit?querystring".  */
+
+/* Return the length of the full path, without the terminating
+   zero.  */
+
+static int
+full_path_length (const struct url *url)
+{
+  int len = 0;
+
+#define FROB(el) if (url->el) len += 1 + strlen (url->el)
+
+  FROB (path);
+  FROB (params);
+  FROB (query);
+
+#undef FROB
+
+  return len;
+}
+
+/* Write out the full path. */
+
+static void
+full_path_write (const struct url *url, char *where)
+{
+#define FROB(el, chr) do {                      \
+  char *f_el = url->el;                         \
+  if (f_el) {                                   \
+    int l = strlen (f_el);                      \
+    *where++ = chr;                             \
+    memcpy (where, f_el, l);                    \
+    where += l;                                 \
+  }                                             \
+} while (0)
+
+  FROB (path, '/');
+  FROB (params, ';');
+  FROB (query, '?');
+
+#undef FROB
+}
+
+/* Public function for getting the "full path".  E.g. if u->path is
+   "foo/bar" and u->query is "param=value", full_path will be
+   "/foo/bar?param=value". */
+
+char *
+url_full_path (const struct url *url)
+{
+  int length = full_path_length (url);
+  char *full_path = xmalloc (length + 1);
+
+  full_path_write (url, full_path);
+  full_path[length] = '\0';
+
+  return full_path;
+}
+
+/* Unescape CHR in an otherwise escaped STR.  Used to selectively
+   escaping of certain characters, such as "/" and ":".  Returns a
+   count of unescaped chars.  */
+
+static void
+unescape_single_char (char *str, char chr)
+{
+  const char c1 = XNUM_TO_DIGIT (chr >> 4);
+  const char c2 = XNUM_TO_DIGIT (chr & 0xf);
+  char *h = str;                /* hare */
+  char *t = str;                /* tortoise */
+  for (; *h; h++, t++)
+    {
+      if (h[0] == '%' && h[1] == c1 && h[2] == c2)
+        {
+          *t = chr;
+          h += 2;
+        }
+      else
+        *t = *h;
+    }
+  *t = '\0';
+}
+
+/* Escape unsafe and reserved characters, except for the slash
+   characters.  */
+
+static char *
+url_escape_dir (const char *dir)
+{
+  char *newdir = url_escape_1 (dir, urlchr_unsafe | urlchr_reserved, 1);
+  if (newdir == dir)
+    return (char *)dir;
+
+  unescape_single_char (newdir, '/');
+  return newdir;
+}
+
+/* Sync u->path and u->url with u->dir and u->file.  Called after
+   u->file or u->dir have been changed, typically by the FTP code.  */
+
+static void
+sync_path (struct url *u)
+{
+  char *newpath, *efile, *edir;
+
+  xfree (u->path);
+
+  /* u->dir and u->file are not escaped.  URL-escape them before
+     reassembling them into u->path.  That way, if they contain
+     separators like '?' or even if u->file contains slashes, the
+     path will be correctly assembled.  (u->file can contain slashes
+     if the URL specifies it with %2f, or if an FTP server returns
+     it.)  */
+  edir = url_escape_dir (u->dir);
+  efile = url_escape_1 (u->file, urlchr_unsafe | urlchr_reserved, 1);
+
+  if (!*edir)
+    newpath = xstrdup (efile);
+  else
+    {
+      int dirlen = strlen (edir);
+      int filelen = strlen (efile);
+
+      /* Copy "DIR/FILE" to newpath. */
+      char *p = newpath = xmalloc (dirlen + 1 + filelen + 1);
+      memcpy (p, edir, dirlen);
+      p += dirlen;
+      *p++ = '/';
+      memcpy (p, efile, filelen);
+      p += filelen;
+      *p = '\0';
+    }
+
+  u->path = newpath;
+
+  if (edir != u->dir)
+    xfree (edir);
+  if (efile != u->file)
+    xfree (efile);
+
+  /* Regenerate u->url as well.  */
+  xfree (u->url);
+  u->url = url_string (u, URL_AUTH_SHOW);
+}
+
+/* Mutators.  Code in ftp.c insists on changing u->dir and u->file.
+   This way we can sync u->path and u->url when they get changed.  */
+
+void
+url_set_dir (struct url *url, const char *newdir)
+{
+  xfree (url->dir);
+  url->dir = xstrdup (newdir);
+  sync_path (url);
+}
+
+void
+url_set_file (struct url *url, const char *newfile)
+{
+  xfree (url->file);
+  url->file = xstrdup (newfile);
+  sync_path (url);
+}
+
+void
+url_free (struct url *url)
+{
+  if (url)
+    {
+      xfree (url->host);
+
+      xfree (url->path);
+      xfree (url->url);
+
+      xfree (url->params);
+      xfree (url->query);
+      xfree (url->fragment);
+      xfree (url->user);
+      xfree (url->passwd);
+
+      xfree (url->dir);
+      xfree (url->file);
+
+      xfree (url);
+    }
+}
+
+/* Create all the necessary directories for PATH (a file).  Calls
+   make_directory internally.  */
+int
+mkalldirs (const char *path)
+{
+  const char *p;
+  char *t;
+  struct stat st;
+  int res;
+
+  p = strrchr(path, '/');
+  p = p == NULL ? path : p;
+
+  /* Don't create if it's just a file.  */
+  if ((p == path) && (*p != '/'))
+    return 0;
+  t = strdupdelim (path, p);
+
+  /* Check whether the directory exists.  */
+  if ((stat (t, &st) == 0))
+    {
+      if (S_ISDIR (st.st_mode))
+        {
+          xfree (t);
+          return 0;
+        }
+      else
+        {
+          /* If the dir exists as a file name, remove it first.  This
+             is *only* for Wget to work with buggy old CERN http
+             servers.  Here is the scenario: When Wget tries to
+             retrieve a directory without a slash, e.g.
+             http://foo/bar (bar being a directory), CERN server will
+             not redirect it too http://foo/bar/ -- it will generate a
+             directory listing containing links to bar/file1,
+             bar/file2, etc.  Wget will lose because it saves this
+             HTML listing to a file `bar', so it cannot create the
+             directory.  To work around this, if the file of the same
+             name exists, we just remove it and create the directory
+             anyway.  */
+          DEBUGP (("Removing %s because of directory danger!\n", t));
+          if (unlink (t))
+            logprintf (LOG_NOTQUIET, "Failed to unlink %s (%d): %s\n",
+                       t, errno, strerror(errno));
+        }
+    }
+  res = make_directory (t);
+  if (res != 0)
+    logprintf (LOG_NOTQUIET, "%s: %s\n", t, strerror (errno));
+  xfree (t);
+  return res;
+}
+
+/* Functions for constructing the file name out of URL components.  */
+
+/* A growable string structure, used by url_file_name and friends.
+   This should perhaps be moved to utils.c.
+
+   The idea is to have a convenient and efficient way to construct a
+   string by having various functions append data to it.  Instead of
+   passing the obligatory BASEVAR, SIZEVAR and TAILPOS to all the
+   functions in questions, we pass the pointer to this struct.
+
+   Functions that write to the members in this struct must make sure
+   that base remains null terminated by calling append_null().
+   */
+
+struct growable {
+  char *base;
+  int size;   /* memory allocated */
+  int tail;   /* string length */
+};
+
+/* Ensure that the string can accept APPEND_COUNT more characters past
+   the current TAIL position.  If necessary, this will grow the string
+   and update its allocated size.  If the string is already large
+   enough to take TAIL+APPEND_COUNT characters, this does nothing.  */
+#define GROW(g, append_size) do {                                       \
+  struct growable *G_ = g;                                              \
+  DO_REALLOC (G_->base, G_->size, G_->tail + append_size, char);        \
+} while (0)
+
+/* Return the tail position of the string. */
+#define TAIL(r) ((r)->base + (r)->tail)
+
+/* Move the tail position by APPEND_COUNT characters. */
+#define TAIL_INCR(r, append_count) ((r)->tail += append_count)
+
+
+/* Append NULL to DEST. */
+static void
+append_null (struct growable *dest)
+{
+  GROW (dest, 1);
+  *TAIL (dest) = 0;
+}
+
+/* Append CH to DEST. */
+static void
+append_char (char ch, struct growable *dest)
+{
+  if (ch)
+    {
+      GROW (dest, 1);
+      *TAIL (dest) = ch;
+      TAIL_INCR (dest, 1);
+    }
+
+  append_null (dest);
+}
+
+/* Append the string STR to DEST. */
+static void
+append_string (const char *str, struct growable *dest)
+{
+  int l = strlen (str);
+
+  if (l)
+    {
+      GROW (dest, l);
+      memcpy (TAIL (dest), str, l);
+      TAIL_INCR (dest, l);
+    }
+
+  append_null (dest);
+}
+
+
+enum {
+  filechr_not_unix    = 1,      /* unusable on Unix, / and \0 */
+  filechr_not_vms     = 2,      /* unusable on VMS (ODS5), 0x00-0x1F * ? */
+  filechr_not_windows = 4,      /* unusable on Windows, one of \|/<>?:*" */
+  filechr_control     = 8       /* a control character, e.g. 0-31 */
+};
+
+#define FILE_CHAR_TEST(c, mask) \
+    ((opt.restrict_files_nonascii && !c_isascii ((unsigned char)(c))) || \
+    (filechr_table[(unsigned char)(c)] & (mask)))
+
+/* Shorthands for the table: */
+#define U filechr_not_unix
+#define V filechr_not_vms
+#define W filechr_not_windows
+#define C filechr_control
+
+#define UVWC U|V|W|C
+#define UW U|W
+#define VC V|C
+#define VW V|W
+
+/* Table of characters unsafe under various conditions (see above).
+
+   Arguably we could also claim `%' to be unsafe, since we use it as
+   the escape character.  If we ever want to be able to reliably
+   translate file name back to URL, this would become important
+   crucial.  Right now, it's better to be minimal in escaping.  */
+
+static const unsigned char filechr_table[256] =
+{
+UVWC, VC, VC, VC,  VC, VC, VC, VC,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
+  VC, VC, VC, VC,  VC, VC, VC, VC,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
+  VC, VC, VC, VC,  VC, VC, VC, VC,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
+  VC, VC, VC, VC,  VC, VC, VC, VC,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
+   0,  0,  W,  0,   0,  0,  0,  0,   /* SP  !   "   #    $   %   &   '   */
+   0,  0, VW,  0,   0,  0,  0, UW,   /* (   )   *   +    ,   -   .   /   */
+   0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
+   0,  0,  W,  0,   W,  0,  W, VW,   /* 8   9   :   ;    <   =   >   ?   */
+   0,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
+   0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
+   0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
+   0,  0,  0,  0,   W,  0,  0,  0,   /* X   Y   Z   [    \   ]   ^   _   */
+   0,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
+   0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
+   0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
+   0,  0,  0,  0,   W,  0,  0,  C,   /* x   y   z   {    |   }   ~   DEL */
+
+  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 128-143 */
+  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0, /* 144-159 */
+  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
+  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
+
+  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
+  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
+  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
+  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
+};
+#undef U
+#undef V
+#undef W
+#undef C
+#undef UW
+#undef UVWC
+#undef VC
+#undef VW
+
+/* FN_PORT_SEP is the separator between host and port in file names
+   for non-standard port numbers.  On Unix this is normally ':', as in
+   "www.xemacs.org:4001/index.html".  Under Windows, we set it to +
+   because Windows can't handle ':' in file names.  */
+#define FN_PORT_SEP  (opt.restrict_files_os != restrict_windows ? ':' : '+')
+
+/* FN_QUERY_SEP is the separator between the file name and the URL
+   query, normally '?'.  Because VMS and Windows cannot handle '?' in a
+   file name, we use '@' instead there.  */
+#define FN_QUERY_SEP \
+ (((opt.restrict_files_os != restrict_vms) && \
+   (opt.restrict_files_os != restrict_windows)) ? '?' : '@')
+#define FN_QUERY_SEP_STR \
+ (((opt.restrict_files_os != restrict_vms) && \
+   (opt.restrict_files_os != restrict_windows)) ? "?" : "@")
+
+/* Quote path element, characters in [b, e), as file name, and append
+   the quoted string to DEST.  Each character is quoted as per
+   file_unsafe_char and the corresponding table.
+
+   If ESCAPED is true, the path element is considered to be
+   URL-escaped and will be unescaped prior to inspection.  */
+
+static void
+append_uri_pathel (const char *b, const char *e, bool escaped,
+                   struct growable *dest)
+{
+  const char *p;
+  char buf[1024];
+  char *unescaped = NULL;
+  int quoted, outlen;
+  int mask;
+  int max_length;
+
+  if (!dest)
+    return;
+
+  if (opt.restrict_files_os == restrict_unix)
+    mask = filechr_not_unix;
+  else if (opt.restrict_files_os == restrict_vms)
+    mask = filechr_not_vms;
+  else
+    mask = filechr_not_windows;
+
+  if (opt.restrict_files_ctrl)
+    mask |= filechr_control;
+
+  /* Copy [b, e) to PATHEL and URL-unescape it. */
+  if (escaped)
+    {
+      size_t len = e - b;
+		if (len < sizeof (buf))
+        unescaped = buf;
+      else
+        unescaped = xmalloc(len + 1);
+
+		memcpy(unescaped, b, len);
+		unescaped[len] = 0;
+
+      url_unescape (unescaped);
+      b = unescaped;
+      e = unescaped + strlen (unescaped);
+    }
+
+  /* Defang ".." when found as component of path.  Remember that path
+     comes from the URL and might contain malicious input.  */
+  if (e - b == 2 && b[0] == '.' && b[1] == '.')
+    {
+      b = "%2E%2E";
+      e = b + 6;
+    }
+
+  /* Walk the PATHEL string and check how many characters we'll need
+     to quote.  */
+  quoted = 0;
+  for (p = b; p < e; p++)
+    if (FILE_CHAR_TEST (*p, mask))
+      ++quoted;
+
+  /* Calculate the length of the output string.  e-b is the input
+     string length.  Each quoted char introduces two additional
+     characters in the string, hence 2*quoted.  */
+  outlen = (e - b) + (2 * quoted);
+# ifdef WINDOWS
+  max_length = MAX_PATH;
+# else
+  max_length = get_max_length(dest->base, dest->tail, _PC_NAME_MAX);
+# endif
+  max_length -= CHOMP_BUFFER;
+  if (max_length > 0 && outlen > max_length)
+    {
+      logprintf (LOG_NOTQUIET, "The destination name is too long (%d), reducing to %d\n", outlen, max_length);
+
+      outlen = max_length;
+    }
+  GROW (dest, outlen);
+
+  // This should not happen, but it's impossible to argue with static analysis that it can't happen
+  // (in theory it can). So give static analyzers a hint.
+  if (!dest->base)
+    return;
+
+  if (!quoted)
+    {
+      /* If there's nothing to quote, we can simply append the string
+         without processing it again.  */
+      memcpy (TAIL (dest), b, outlen);
+    }
+  else
+    {
+      char *q = TAIL (dest);
+      int i;
+
+      for (i = 0, p = b; p < e; p++)
+        {
+          if (!FILE_CHAR_TEST (*p, mask))
+	    {
+	      if (i == outlen)
+	        break;
+	      *q++ = *p;
+	      i++;
+	    }
+          else if (i + 3 > outlen)
+	    break;
+	  else
+            {
+              unsigned char ch = *p;
+              *q++ = '%';
+              *q++ = XNUM_TO_DIGIT (ch >> 4);
+              *q++ = XNUM_TO_DIGIT (ch & 0xf);
+	      i += 3;
+            }
+        }
+      assert (q - TAIL (dest) <= outlen);
+    }
+
+  /* Perform inline case transformation if required.  */
+  if (opt.restrict_files_case == restrict_lowercase
+      || opt.restrict_files_case == restrict_uppercase)
+    {
+      char *q;
+      for (q = TAIL (dest); q < TAIL (dest) + outlen; ++q)
+        {
+          if (opt.restrict_files_case == restrict_lowercase)
+            *q = c_tolower (*q);
+          else
+            *q = c_toupper (*q);
+        }
+    }
+
+  TAIL_INCR (dest, outlen);
+  append_null (dest);
+
+  if (unescaped && unescaped != buf)
+	  free (unescaped);
+}
+
+#ifdef HAVE_ICONV
+static char *
+convert_fname (char *fname)
+{
+  char *converted_fname;
+  const char *from_encoding = opt.encoding_remote;
+  const char *to_encoding = opt.locale;
+  iconv_t cd;
+  size_t len, done, inlen, outlen;
+  char *s;
+  const char *orig_fname;
+
+  /* Defaults for remote and local encodings.  */
+  if (!from_encoding)
+    from_encoding = "UTF-8";
+  if (!to_encoding)
+    to_encoding = nl_langinfo (CODESET);
+
+  cd = iconv_open (to_encoding, from_encoding);
+  if (cd == (iconv_t) (-1))
+    {
+      logprintf (LOG_VERBOSE, _ ("Conversion from %s to %s isn't supported\n"),
+                 quote_n (0, from_encoding), quote_n (1, to_encoding));
+      return fname;
+    }
+
+  orig_fname = fname;
+  inlen = strlen (fname);
+  len = outlen = inlen * 2;
+  converted_fname = s = xmalloc (outlen + 1);
+  done = 0;
+
+  for (;;)
+    {
+      errno = 0;
+      if (iconv (cd, (ICONV_CONST char **) &fname, &inlen, &s, &outlen) == 0
+          && iconv (cd, NULL, NULL, &s, &outlen) == 0)
+        {
+          *(converted_fname + len - outlen - done) = '\0';
+          iconv_close (cd);
+          DEBUGP (("Converted file name '%s' (%s) -> '%s' (%s)\n",
+                   orig_fname, from_encoding, converted_fname, to_encoding));
+          xfree (orig_fname);
+          return converted_fname;
+        }
+
+      /* Incomplete or invalid multibyte sequence */
+      if (errno == EINVAL || errno == EILSEQ || errno == 0)
+        {
+          if (errno)
+            logprintf (LOG_VERBOSE,
+                       _ ("Incomplete or invalid multibyte sequence encountered\n"));
+          else
+            logprintf (LOG_VERBOSE,
+                       _ ("Unconvertable multibyte sequence encountered\n"));
+          xfree (converted_fname);
+          converted_fname = (char *) orig_fname;
+          break;
+        }
+      else if (errno == E2BIG) /* Output buffer full */
+        {
+          done = len;
+          len = outlen = done + inlen * 2;
+          converted_fname = xrealloc (converted_fname, outlen + 1);
+          s = converted_fname + done;
+        }
+      else /* Weird, we got an unspecified error */
+        {
+          logprintf (LOG_VERBOSE, _ ("Unhandled errno %d\n"), errno);
+          xfree (converted_fname);
+          converted_fname = (char *) orig_fname;
+          break;
+        }
+    }
+  DEBUGP (("Failed to convert file name '%s' (%s) -> '?' (%s)\n",
+           orig_fname, from_encoding, to_encoding));
+
+  iconv_close (cd);
+
+  return converted_fname;
+}
+#else
+static char *
+convert_fname (char *fname)
+{
+  return fname;
+}
+#endif
+
+/* Append to DEST the directory structure that corresponds the
+   directory part of URL's path.  For example, if the URL is
+   http://server/dir1/dir2/file, this appends "/dir1/dir2".
+
+   Each path element ("dir1" and "dir2" in the above example) is
+   examined, url-unescaped, and re-escaped as file name element.
+
+   Additionally, it cuts as many directories from the path as
+   specified by opt.cut_dirs.  For example, if opt.cut_dirs is 1, it
+   will produce "bar" for the above example.  For 2 or more, it will
+   produce "".
+
+   Each component of the path is quoted for use as file name.  */
+
+static void
+append_dir_structure (const struct url *u, struct growable *dest)
+{
+  char *pathel, *next;
+  int cut = opt.cut_dirs;
+
+  /* Go through the path components, de-URL-quote them, and quote them
+     (if necessary) as file names.  */
+
+  pathel = u->path;
+  for (; (next = strchr (pathel, '/')) != NULL; pathel = next + 1)
+    {
+      if (cut-- > 0)
+        continue;
+      if (pathel == next)
+        /* Ignore empty pathels.  */
+        continue;
+
+      if (dest->tail)
+        append_char ('/', dest);
+
+      append_uri_pathel (pathel, next, true, dest);
+    }
+}
+
+/* Return a unique file name that matches the given URL as well as
+   possible.  Does not create directories on the file system.  */
+
+char *
+url_file_name (const struct url *u, char *replaced_filename)
+{
+  struct growable fnres;        /* stands for "file name result" */
+  struct growable temp_fnres;
+
+  const char *u_file;
+  char *fname, *unique, *fname_len_check;
+  const char *index_filename = "index.html"; /* The default index file is index.html */
+
+  fnres.base = NULL;
+  fnres.size = 0;
+  fnres.tail = 0;
+
+  temp_fnres.base = NULL;
+  temp_fnres.size = 0;
+  temp_fnres.tail = 0;
+
+  /* If an alternative index file was defined, change index_filename */
+  if (opt.default_page)
+    index_filename = opt.default_page;
+
+
+  /* Start with the directory prefix, if specified. */
+  if (opt.dir_prefix)
+    append_string (opt.dir_prefix, &fnres);
+
+  /* If "dirstruct" is turned on (typically the case with -r), add
+     the host and port (unless those have been turned off) and
+     directory structure.  */
+  /* All safe remote chars are unescaped and stored in temp_fnres,
+     then converted to local and appended to fnres.
+     Internationalized URL/IDN will produce punycode to lookup IP from DNS:
+     https://en.wikipedia.org/wiki/URL
+     https://en.wikipedia.org/wiki/Internationalized_domain_name
+     Non-ASCII code chars in the path:
+     https://en.wikipedia.org/wiki/List_of_Unicode_characters
+     https://en.wikipedia.org/wiki/List_of_writing_systems */
+  if (opt.dirstruct)
+    {
+      if (opt.protocol_directories)
+        {
+          if (temp_fnres.tail)
+            append_char ('/', &temp_fnres);
+          append_string (supported_schemes[u->scheme].name, &temp_fnres);
+        }
+      if (opt.add_hostdir)
+        {
+          if (temp_fnres.tail)
+            append_char ('/', &temp_fnres);
+          if (0 != strcmp (u->host, ".."))
+            append_string (u->host, &temp_fnres);
+          else
+            /* Host name can come from the network; malicious DNS may
+               allow ".." to be resolved, causing us to write to
+               "../<file>".  Defang such host names.  */
+            append_string ("%2E%2E", &temp_fnres);
+          if (u->port != scheme_default_port (u->scheme))
+            {
+              char portstr[24];
+              number_to_string (portstr, u->port);
+              append_char (FN_PORT_SEP, &temp_fnres);
+              append_string (portstr, &temp_fnres);
+            }
+        }
+
+      append_dir_structure (u, &temp_fnres);
+    }
+
+  if (!replaced_filename)
+    {
+      /* Create the filename. */
+      u_file = *u->file ? u->file : index_filename;
+
+      /* Append "?query" to the file name, even if empty,
+       * and create fname_len_check. */
+      if (u->query)
+        fname_len_check = concat_strings (u_file, FN_QUERY_SEP_STR, u->query, NULL);
+      else
+        fname_len_check = strdupdelim (u_file, u_file + strlen (u_file));
+    }
+  else
+    {
+      u_file = replaced_filename;
+      fname_len_check = strdupdelim (u_file, u_file + strlen (u_file));
+    }
+
+  if (temp_fnres.tail)
+    append_char ('/', &temp_fnres);
+
+  append_uri_pathel (fname_len_check,
+    fname_len_check + strlen (fname_len_check), true, &temp_fnres);
+
+  /* Zero-terminate the temporary file name. */
+  append_char ('\0', &temp_fnres);
+
+  /* convert all remote chars before length check and appending to local path */
+  fname = convert_fname (temp_fnres.base);
+  temp_fnres.base = NULL;
+  temp_fnres.size = 0;
+  temp_fnres.tail = 0;
+  append_string (fname, &temp_fnres);
+
+  xfree (fname);
+  xfree (fname_len_check);
+
+  /* The filename has already been 'cleaned' by append_uri_pathel() above.  So,
+   * just append it. */
+  if (fnres.tail)
+    append_char ('/', &fnres);
+  append_string (temp_fnres.base, &fnres);
+
+  fname = fnres.base;
+
+  /* Make a final check that the path length is acceptable? */
+  /* TODO: check fnres.base for path length problem */
+
+  xfree (temp_fnres.base);
+
+  /* Check the cases in which the unique extensions are not used:
+     1) Clobbering is turned off (-nc).
+     2) Retrieval with regetting.
+     3) Timestamping is used.
+     4) Hierarchy is built.
+     5) Backups are specified.
+
+     The exception is the case when file does exist and is a
+     directory (see `mkalldirs' for explanation).  */
+
+  if (ALLOW_CLOBBER
+      && !(file_exists_p (fname, NULL) && !file_non_directory_p (fname)))
+    {
+      unique = fname;
+    }
+  else
+    {
+      unique = unique_name_passthrough (fname);
+      if (unique != fname)
+        xfree (fname);
+    }
+
+/* On VMS, alter the name as required. */
+#ifdef __VMS
+  {
+    char *unique2;
+
+    unique2 = ods_conform( unique);
+    if (unique2 != unique)
+      {
+        xfree (unique);
+        unique = unique2;
+      }
+  }
+#endif /* def __VMS */
+
+  return unique;
+}
+
+/* Resolve "." and ".." elements of PATH by destructively modifying
+   PATH and return true if PATH has been modified, false otherwise.
+
+   The algorithm is in spirit similar to the one described in rfc1808,
+   although implemented differently, in one pass.  To recap, path
+   elements containing only "." are removed, and ".." is taken to mean
+   "back up one element".  Single leading and trailing slashes are
+   preserved.
+
+   For example, "a/b/c/./../d/.." will yield "a/b/".  More exhaustive
+   test examples are provided below.  If you change anything in this
+   function, run test_path_simplify to make sure you haven't broken a
+   test case.  */
+
+static bool
+path_simplify (enum url_scheme scheme, char *path)
+{
+  char *h = path;               /* hare */
+  char *t = path;               /* tortoise */
+  char *beg = path;
+  char *end = strchr (path, '\0');
+
+  while (h < end)
+    {
+      /* Hare should be at the beginning of a path element. */
+
+      if (h[0] == '.' && (h[1] == '/' || h[1] == '\0'))
+        {
+          /* Ignore "./". */
+          h += 2;
+        }
+      else if (h[0] == '.' && h[1] == '.' && (h[2] == '/' || h[2] == '\0'))
+        {
+          /* Handle "../" by retreating the tortoise by one path
+             element -- but not past beginning.  */
+          if (t > beg)
+            {
+              /* Move backwards until T hits the beginning of the
+                 previous path element or the beginning of path. */
+              for (--t; t > beg && t[-1] != '/'; t--)
+                ;
+            }
+          else if (scheme == SCHEME_FTP
+#ifdef HAVE_SSL
+              || scheme == SCHEME_FTPS
+#endif
+              )
+            {
+              /* If we're at the beginning, copy the "../" literally
+                 and move the beginning so a later ".." doesn't remove
+                 it.  This violates RFC 3986; but we do it for FTP
+                 anyway because there is otherwise no way to get at a
+                 parent directory, when the FTP server drops us in a
+                 non-root directory (which is not uncommon). */
+              beg = t + 3;
+              goto regular;
+            }
+          h += 3;
+        }
+      else
+        {
+        regular:
+          /* A regular path element.  If H hasn't advanced past T,
+             simply skip to the next path element.  Otherwise, copy
+             the path element until the next slash.  */
+          if (t == h)
+            {
+              /* Skip the path element, including the slash.  */
+              while (h < end && *h != '/')
+                t++, h++;
+              if (h < end)
+                t++, h++;
+            }
+          else
+            {
+              /* Copy the path element, including the final slash.  */
+              while (h < end && *h != '/')
+                *t++ = *h++;
+              if (h < end)
+                *t++ = *h++;
+            }
+        }
+    }
+
+  if (t != h)
+    *t = '\0';
+
+  return t != h;
+}
+
+/* Return the length of URL's path.  Path is considered to be
+   terminated by one or more of the ?query or ;params or #fragment,
+   depending on the scheme.  */
+
+static const char *
+path_end (const char *url)
+{
+  enum url_scheme scheme = url_scheme (url);
+  const char *seps;
+  if (scheme == SCHEME_INVALID)
+    scheme = SCHEME_HTTP;       /* use http semantics for rel links */
+  /* +2 to ignore the first two separators ':' and '/' */
+  seps = init_seps (scheme) + 2;
+  return strpbrk_or_eos (url, seps);
+}
+
+/* Find the last occurrence of character C in the range [b, e), or
+   NULL, if none are present.  */
+#define find_last_char(b, e, c) memrchr ((b), (c), (e) - (b))
+
+/* Merge BASE with LINK and return the resulting URI.
+
+   Either of the URIs may be absolute or relative, complete with the
+   host name, or path only.  This tries to reasonably handle all
+   foreseeable cases.  It only employs minimal URL parsing, without
+   knowledge of the specifics of schemes.
+
+   I briefly considered making this function call path_simplify after
+   the merging process, as rfc1738 seems to suggest.  This is a bad
+   idea for several reasons: 1) it complexifies the code, and 2)
+   url_parse has to simplify path anyway, so it's wasteful to boot.  */
+
+char *
+uri_merge (const char *base, const char *link)
+{
+  int linklength;
+  const char *end;
+  char *merge;
+
+  if (url_has_scheme (link))
+    return xstrdup (link);
+
+  /* We may not examine BASE past END. */
+  end = path_end (base);
+  linklength = strlen (link);
+
+  if (!*link)
+    {
+      /* Empty LINK points back to BASE, query string and all. */
+      return xstrdup (base);
+    }
+  else if (*link == '?')
+    {
+      /* LINK points to the same location, but changes the query
+         string.  Examples: */
+      /* uri_merge("path",         "?new") -> "path?new"     */
+      /* uri_merge("path?foo",     "?new") -> "path?new"     */
+      /* uri_merge("path?foo#bar", "?new") -> "path?new"     */
+      /* uri_merge("path#foo",     "?new") -> "path?new"     */
+      int baselength = end - base;
+      merge = xmalloc (baselength + linklength + 1);
+      memcpy (merge, base, baselength);
+      memcpy (merge + baselength, link, linklength);
+      merge[baselength + linklength] = '\0';
+    }
+  else if (*link == '#')
+    {
+      /* uri_merge("path",         "#new") -> "path#new"     */
+      /* uri_merge("path#foo",     "#new") -> "path#new"     */
+      /* uri_merge("path?foo",     "#new") -> "path?foo#new" */
+      /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
+      int baselength;
+      const char *end1 = strchr (base, '#');
+      if (!end1)
+        end1 = base + strlen (base);
+      baselength = end1 - base;
+      merge = xmalloc (baselength + linklength + 1);
+      memcpy (merge, base, baselength);
+      memcpy (merge + baselength, link, linklength);
+      merge[baselength + linklength] = '\0';
+    }
+  else if (*link == '/' && *(link + 1) == '/')
+    {
+      /* LINK begins with "//" and so is a net path: we need to
+         replace everything after (and including) the double slash
+         with LINK. */
+
+      /* uri_merge("foo", "//new/bar")            -> "//new/bar"      */
+      /* uri_merge("//old/foo", "//new/bar")      -> "//new/bar"      */
+      /* uri_merge("http://old/foo", "//new/bar") -> "http://new/bar" */
+
+      int span;
+      const char *slash;
+      const char *start_insert;
+
+      /* Look for first slash. */
+      slash = memchr (base, '/', end - base);
+      /* If found slash and it is a double slash, then replace
+         from this point, else default to replacing from the
+         beginning.  */
+      if (slash && *(slash + 1) == '/')
+        start_insert = slash;
+      else
+        start_insert = base;
+
+      span = start_insert - base;
+      merge = xmalloc (span + linklength + 1);
+      if (span)
+        memcpy (merge, base, span);
+      memcpy (merge + span, link, linklength);
+      merge[span + linklength] = '\0';
+    }
+  else if (*link == '/')
+    {
+      /* LINK is an absolute path: we need to replace everything
+         after (and including) the FIRST slash with LINK.
+
+         So, if BASE is "http://host/whatever/foo/bar", and LINK is
+         "/qux/xyzzy", our result should be
+         "http://host/qux/xyzzy".  */
+      int span;
+      const char *slash;
+      const char *start_insert = NULL; /* for gcc to shut up. */
+      const char *pos = base;
+      bool seen_slash_slash = false;
+      /* We're looking for the first slash, but want to ignore
+         double slash. */
+    again:
+      slash = memchr (pos, '/', end - pos);
+      if (slash && !seen_slash_slash)
+        if (*(slash + 1) == '/')
+          {
+            pos = slash + 2;
+            seen_slash_slash = true;
+            goto again;
+          }
+
+      /* At this point, SLASH is the location of the first / after
+         "//", or the first slash altogether.  START_INSERT is the
+         pointer to the location where LINK will be inserted.  When
+         examining the last two examples, keep in mind that LINK
+         begins with '/'. */
+
+      if (!slash && !seen_slash_slash)
+        /* example: "foo" */
+        /*           ^    */
+        start_insert = base;
+      else if (!slash && seen_slash_slash)
+        /* example: "http://foo" */
+        /*                     ^ */
+        start_insert = end;
+      else if (slash && !seen_slash_slash)
+        /* example: "foo/bar" */
+        /*           ^        */
+        start_insert = base;
+      else if (slash && seen_slash_slash)
+        /* example: "http://something/" */
+        /*                           ^  */
+        start_insert = slash;
+
+      span = start_insert - base;
+      merge = xmalloc (span + linklength + 1);
+      if (span)
+        memcpy (merge, base, span);
+      memcpy (merge + span, link, linklength);
+      merge[span + linklength] = '\0';
+    }
+  else
+    {
+      /* LINK is a relative URL: we need to replace everything
+         after last slash (possibly empty) with LINK.
+
+         So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
+         our result should be "whatever/foo/qux/xyzzy".  */
+      bool need_explicit_slash = false;
+      int span;
+      const char *start_insert;
+      const char *last_slash = find_last_char (base, end, '/');
+      if (!last_slash)
+        {
+          /* No slash found at all.  Replace what we have with LINK. */
+          start_insert = base;
+        }
+      else if (last_slash && last_slash >= base + 2
+               && last_slash[-2] == ':' && last_slash[-1] == '/')
+        {
+          /* example: http://host"  */
+          /*                      ^ */
+          start_insert = end + 1;
+          need_explicit_slash = true;
+        }
+      else
+        {
+          /* example: "whatever/foo/bar" */
+          /*                        ^    */
+          start_insert = last_slash + 1;
+        }
+
+      span = start_insert - base;
+      merge = xmalloc (span + linklength + 1);
+      if (span)
+        memcpy (merge, base, span);
+      if (need_explicit_slash)
+        merge[span - 1] = '/';
+      memcpy (merge + span, link, linklength);
+      merge[span + linklength] = '\0';
+    }
+
+  return merge;
+}
+
+#define APPEND(p, s) do {                       \
+  int len = strlen (s);                         \
+  memcpy (p, s, len);                           \
+  p += len;                                     \
+} while (0)
+
+/* Use this instead of password when the actual password is supposed
+   to be hidden.  We intentionally use a generic string without giving
+   away the number of characters in the password, like previous
+   versions did.  */
+#define HIDDEN_PASSWORD "*password*"
+
+/* Recreate the URL string from the data in URL.
+
+   If HIDE is true (as it is when we're calling this on a URL we plan
+   to print, but not when calling it to canonicalize a URL for use
+   within the program), password will be hidden.  Unsafe characters in
+   the URL will be quoted.  */
+
+char *
+url_string (const struct url *url, enum url_auth_mode auth_mode)
+{
+  int size;
+  char *result, *p;
+  char *quoted_host, *quoted_user = NULL, *quoted_passwd = NULL;
+
+  int scheme_port = supported_schemes[url->scheme].default_port;
+  const char *scheme_str = supported_schemes[url->scheme].leading_string;
+  int fplen = full_path_length (url);
+
+  bool brackets_around_host;
+
+  assert (scheme_str != NULL);
+
+  /* Make sure the user name and password are quoted. */
+  if (url->user)
+    {
+      if (auth_mode != URL_AUTH_HIDE)
+        {
+          quoted_user = url_escape_allow_passthrough (url->user);
+          if (url->passwd)
+            {
+              if (auth_mode == URL_AUTH_HIDE_PASSWD)
+                quoted_passwd = (char *) HIDDEN_PASSWORD;
+              else
+                quoted_passwd = url_escape_allow_passthrough (url->passwd);
+            }
+        }
+    }
+
+  /* In the unlikely event that the host name contains non-printable
+     characters, quote it for displaying to the user.  */
+  quoted_host = url_escape_allow_passthrough (url->host);
+
+  /* Undo the quoting of colons that URL escaping performs.  IPv6
+     addresses may legally contain colons, and in that case must be
+     placed in square brackets.  */
+  if (quoted_host != url->host)
+    unescape_single_char (quoted_host, ':');
+  brackets_around_host = strchr (quoted_host, ':') != NULL;
+
+  size = (strlen (scheme_str)
+          + strlen (quoted_host)
+          + (brackets_around_host ? 2 : 0)
+          + fplen
+          + 1);
+  if (url->port != scheme_port)
+    size += 1 + numdigit (url->port);
+  if (quoted_user)
+    {
+      size += 1 + strlen (quoted_user);
+      if (quoted_passwd)
+        size += 1 + strlen (quoted_passwd);
+    }
+
+  p = result = xmalloc (size);
+
+  APPEND (p, scheme_str);
+  if (quoted_user)
+    {
+      APPEND (p, quoted_user);
+      if (quoted_passwd)
+        {
+          *p++ = ':';
+          APPEND (p, quoted_passwd);
+        }
+      *p++ = '@';
+    }
+
+  if (brackets_around_host)
+    *p++ = '[';
+  APPEND (p, quoted_host);
+  if (brackets_around_host)
+    *p++ = ']';
+  if (url->port != scheme_port)
+    {
+      *p++ = ':';
+      p = number_to_string (p, url->port);
+    }
+
+  full_path_write (url, p);
+  p += fplen;
+  *p++ = '\0';
+
+  assert (p - result == size);
+
+  if (quoted_user && quoted_user != url->user)
+    xfree (quoted_user);
+  if (quoted_passwd && auth_mode == URL_AUTH_SHOW
+      && quoted_passwd != url->passwd)
+    xfree (quoted_passwd);
+  if (quoted_host != url->host)
+    xfree (quoted_host);
+
+  return result;
+}
+
+/* Return true if scheme a is similar to scheme b.
+
+   Schemes are similar if they are equal.  If SSL is supported, schemes
+   are also similar if one is http (SCHEME_HTTP) and the other is https
+   (SCHEME_HTTPS).  */
+bool
+schemes_are_similar_p (enum url_scheme a, enum url_scheme b)
+{
+  if (a == b)
+    return true;
+#ifdef HAVE_SSL
+  if ((a == SCHEME_HTTP && b == SCHEME_HTTPS)
+      || (a == SCHEME_HTTPS && b == SCHEME_HTTP))
+    return true;
+#endif
+  return false;
+}
+
+static int
+getchar_from_escaped_string (const char *str, char *c)
+{
+  const char *p = str;
+
+  assert (str && *str);
+  assert (c);
+
+  if (p[0] == '%')
+    {
+      if (!c_isxdigit(p[1]) || !c_isxdigit(p[2]))
+        {
+          *c = '%';
+          return 1;
+        }
+      else
+        {
+          if (p[2] == 0)
+            return 0; /* error: invalid string */
+
+          *c = X2DIGITS_TO_NUM (p[1], p[2]);
+          if (URL_RESERVED_CHAR(*c))
+            {
+              *c = '%';
+              return 1;
+            }
+          else
+            return 3;
+        }
+    }
+  else
+    {
+      *c = p[0];
+    }
+
+  return 1;
+}
+
+bool
+are_urls_equal (const char *u1, const char *u2)
+{
+  const char *p, *q;
+  int pp, qq;
+  char ch1, ch2;
+  assert(u1 && u2);
+
+  p = u1;
+  q = u2;
+
+  while (*p && *q
+         && (pp = getchar_from_escaped_string (p, &ch1))
+         && (qq = getchar_from_escaped_string (q, &ch2))
+         && (c_tolower(ch1) == c_tolower(ch2)))
+    {
+      p += pp;
+      q += qq;
+    }
+
+  return (*p == 0 && *q == 0 ? true : false);
+}
+
+#ifdef TESTING
+/* Debugging and testing support for path_simplify. */
+
+#if 0
+/* Debug: run path_simplify on PATH and return the result in a new
+   string.  Useful for calling from the debugger.  */
+static char *
+ps (char *path)
+{
+  char *copy = xstrdup (path);
+  path_simplify (copy);
+  return copy;
+}
+#endif
+
+static const char *
+run_test (const char *test, const char *expected_result, enum url_scheme scheme,
+          bool expected_change)
+{
+  char *test_copy = xstrdup (test);
+  bool modified = path_simplify (scheme, test_copy);
+
+  if (0 != strcmp (test_copy, expected_result))
+    {
+      printf ("Failed path_simplify(\"%s\"): expected \"%s\", got \"%s\".\n",
+              test, expected_result, test_copy);
+      mu_assert ("", 0);
+    }
+  if (modified != expected_change)
+    {
+      if (expected_change)
+        printf ("Expected modification with path_simplify(\"%s\").\n",
+                test);
+      else
+        printf ("Expected no modification with path_simplify(\"%s\").\n",
+                test);
+    }
+  xfree (test_copy);
+  mu_assert ("", modified == expected_change);
+  return NULL;
+}
+
+const char *
+test_path_simplify (void)
+{
+  static const struct {
+    const char *test, *result;
+    enum url_scheme scheme;
+    bool should_modify;
+  } tests[] = {
+    { "",                       "",             SCHEME_HTTP, false },
+    { ".",                      "",             SCHEME_HTTP, true },
+    { "./",                     "",             SCHEME_HTTP, true },
+    { "..",                     "",             SCHEME_HTTP, true },
+    { "../",                    "",             SCHEME_HTTP, true },
+    { "..",                     "..",           SCHEME_FTP,  false },
+    { "../",                    "../",          SCHEME_FTP,  false },
+    { "foo",                    "foo",          SCHEME_HTTP, false },
+    { "foo/bar",                "foo/bar",      SCHEME_HTTP, false },
+    { "foo///bar",              "foo///bar",    SCHEME_HTTP, false },
+    { "foo/.",                  "foo/",         SCHEME_HTTP, true },
+    { "foo/./",                 "foo/",         SCHEME_HTTP, true },
+    { "foo./",                  "foo./",        SCHEME_HTTP, false },
+    { "foo/../bar",             "bar",          SCHEME_HTTP, true },
+    { "foo/../bar/",            "bar/",         SCHEME_HTTP, true },
+    { "foo/bar/..",             "foo/",         SCHEME_HTTP, true },
+    { "foo/bar/../x",           "foo/x",        SCHEME_HTTP, true },
+    { "foo/bar/../x/",          "foo/x/",       SCHEME_HTTP, true },
+    { "foo/..",                 "",             SCHEME_HTTP, true },
+    { "foo/../..",              "",             SCHEME_HTTP, true },
+    { "foo/../../..",           "",             SCHEME_HTTP, true },
+    { "foo/../../bar/../../baz", "baz",         SCHEME_HTTP, true },
+    { "foo/../..",              "..",           SCHEME_FTP,  true },
+    { "foo/../../..",           "../..",        SCHEME_FTP,  true },
+    { "foo/../../bar/../../baz", "../../baz",   SCHEME_FTP,  true },
+    { "a/b/../../c",            "c",            SCHEME_HTTP, true },
+    { "./a/../b",               "b",            SCHEME_HTTP, true }
+  };
+  unsigned i;
+
+  for (i = 0; i < countof (tests); i++)
+    {
+      const char *message;
+      const char *test = tests[i].test;
+      const char *expected_result = tests[i].result;
+      enum url_scheme scheme = tests[i].scheme;
+      bool  expected_change = tests[i].should_modify;
+
+      message = run_test (test, expected_result, scheme, expected_change);
+      if (message) return message;
+    }
+  return NULL;
+}
+
+const char *
+test_append_uri_pathel(void)
+{
+  unsigned i;
+  static const struct {
+    const char *original_url;
+    const char *input;
+    bool escaped;
+    const char *expected_result;
+  } test_array[] = {
+    { "http://www.yoyodyne.com/path/", "somepage.html", false, "http://www.yoyodyne.com/path/somepage.html" },
+  };
+
+  for (i = 0; i < countof(test_array); ++i)
+    {
+      struct growable dest;
+      const char *p = test_array[i].input;
+
+      memset (&dest, 0, sizeof (dest));
+
+      append_string (test_array[i].original_url, &dest);
+      append_uri_pathel (p, p + strlen(p), test_array[i].escaped, &dest);
+
+      mu_assert ("test_append_uri_pathel: wrong result",
+                 strcmp (dest.base, test_array[i].expected_result) == 0);
+      xfree (dest.base);
+    }
+
+  return NULL;
+}
+
+const char *
+test_are_urls_equal(void)
+{
+  unsigned i;
+  static const struct {
+    const char *url1;
+    const char *url2;
+    bool expected_result;
+  } test_array[] = {
+    { "http://www.adomain.com/apath/", "http://www.adomain.com/apath/",       true },
+    { "http://www.adomain.com/apath/", "http://www.adomain.com/anotherpath/", false },
+    { "http://www.adomain.com/apath/", "http://www.anotherdomain.com/path/",  false },
+    { "http://www.adomain.com/~path/", "http://www.adomain.com/%7epath/",     true },
+    { "http://www.adomain.com/longer-path/", "http://www.adomain.com/path/",  false },
+    { "http://www.adomain.com/path%2f", "http://www.adomain.com/path/",       false },
+  };
+
+  for (i = 0; i < countof(test_array); ++i)
+    {
+      mu_assert ("test_are_urls_equal: wrong result",
+                 are_urls_equal (test_array[i].url1, test_array[i].url2) == test_array[i].expected_result);
+    }
+
+  return NULL;
+}
+
+#endif /* TESTING */
+
+/*
+ * vim: et ts=2 sw=2
+ */
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-07 17:04:52 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-07 17:04:52 +0000
commit	5e03c718f4e7ff13cb6834eda737c269ebed02ad (patch)
tree	bfad3f5be123f000fdb03e26400050dece33d72f /src/url.c
parent	Initial commit. (diff)
download	wget-upstream.tar.xz wget-upstream.zip