summaryrefslogtreecommitdiffstats
path: root/WWW/Library/Implementation/HTParse.c
diff options
context:
space:
mode:
Diffstat (limited to 'WWW/Library/Implementation/HTParse.c')
-rw-r--r--WWW/Library/Implementation/HTParse.c1339
1 files changed, 1339 insertions, 0 deletions
diff --git a/WWW/Library/Implementation/HTParse.c b/WWW/Library/Implementation/HTParse.c
new file mode 100644
index 0000000..a7f648f
--- /dev/null
+++ b/WWW/Library/Implementation/HTParse.c
@@ -0,0 +1,1339 @@
+/*
+ * $LynxId: HTParse.c,v 1.90 2020/09/05 00:19:54 tom Exp $
+ *
+ * Parse HyperText Document Address HTParse.c
+ * ================================
+ */
+
+#include <HTUtils.h>
+#include <HTParse.h>
+
+#include <LYUtils.h>
+#include <LYLeaks.h>
+#include <LYStrings.h>
+#include <LYCharUtils.h>
+#include <LYGlobalDefs.h>
+
+#ifdef HAVE_ALLOCA_H
+#include <alloca.h>
+#else
+#ifdef __MINGW32__
+#include <malloc.h>
+#endif /* __MINGW32__ */
+#endif
+
+#ifdef USE_IDNA
+#include <idna.h>
+#include <idn-free.h>
+#endif
+
+#define HEX_ESCAPE '%'
+
+struct struct_parts { /* pieces of one address as split by scan(); each member is NULL or points into the scanned buffer */
+ char *access; /* scheme: the text before the first ':' */
+ char *host; /* text following "//", ended at the next '/' (or '?') */
+ char *absolute; /* path that followed a leading '/', without that '/' */
+ char *relative; /* remainder when it does not begin with '/' */
+ char *search; /* treated normally as part of path; scan() sets it only for "//host?query" with no '/' after the host */
+ char *anchor; /* text following the first '#' */
+};
+
+#if 0 /* for debugging: change the 0 to compile show_parts(), which writes
+ * every member of a struct_parts to the trace output when TRACE is set;
+ * otherwise SHOW_PARTS() expands to nothing */
+static void show_parts(const char *name, struct struct_parts *parts, int line)
+{
+ if (TRACE) {
+ CTRACE((tfp, "struct_parts(%s) %s@%d\n", name, __FILE__, line));
+ CTRACE((tfp, " access '%s'\n", NONNULL(parts->access)));
+ CTRACE((tfp, " host '%s'\n", NONNULL(parts->host)));
+ CTRACE((tfp, " absolute '%s'\n", NONNULL(parts->absolute)));
+ CTRACE((tfp, " relative '%s'\n", NONNULL(parts->relative)));
+ CTRACE((tfp, " search '%s'\n", NONNULL(parts->search)));
+ CTRACE((tfp, " anchor '%s'\n", NONNULL(parts->anchor)));
+ }
+}
+#define SHOW_PARTS(name) show_parts(#name, &name, __LINE__) /* passes the variable's own name as text, plus the calling line */
+#else
+#define SHOW_PARTS(name) /* nothing */
+#endif
+
+/* Strip white space off a string. HTStrip()
+ * -------------------------------
+ *
+ * On entry,
+ *   s   points to a writable, NUL-terminated string (must not be NULL).
+ * On exit,
+ *   Return value points to first non-white character, or to the
+ *   terminating NUL if there is none.
+ *   All trailing white space is OVERWRITTEN with zero.
+ *
+ * White space here means ' ', '\t' and '\n' only.
+ */
+char *HTStrip(char *s)
+{
+#define SPACE(c) (((c) == ' ') || ((c) == '\t') || ((c) == '\n'))
+    char *end = s + strlen(s);  /* one past the last character */
+
+    /*
+     * Compare against 's' before stepping back, so that no pointer below
+     * the start of the string is ever formed, even for empty or all-blank
+     * input.
+     */
+    while (end > s && SPACE(end[-1]))
+        *--end = '\0';          /* Zap trailing blanks */
+    while (SPACE(*s))
+        s++;                    /* Strip leading blanks */
+    return s;
+}
+
+/* Scan a filename for its constituents. scan()
+ * -------------------------------------
+ *
+ * On entry,
+ *   name   points to a writable copy of a document name, which may be
+ *          incomplete.  It is cut up in place: delimiters are overwritten
+ *          with NUL.
+ * On exit,
+ *   every member of *parts is either NULL or points into 'name':
+ *   absolute or relative may be set (but not both);
+ *   host, anchor and access are set if they were specified;
+ *   search is set only for "//host?query" with no '/' after the host.
+ */
+static void scan(char *name,
+                 struct struct_parts *parts)
+{
+    char *rest = name;          /* text after "scheme:", else all of name */
+    char *cp;
+
+    parts->access = NULL;
+    parts->host = NULL;
+    parts->absolute = NULL;
+    parts->relative = NULL;
+    parts->search = NULL;       /* normally not used - kw */
+    parts->anchor = NULL;
+
+    /*
+     * Scheme (access): a ':' seen before any of "/#;?" ends it.
+     */
+    for (cp = name; *cp != '\0'; cp++) {
+        if (*cp == ':') {
+            *cp = '\0';
+            parts->access = name;
+            rest = cp + 1;
+            break;
+        } else if (*cp == '/' || *cp == '#' || *cp == ';' || *cp == '?') {
+            break;
+        }
+    }
+
+    /*
+     * Fragment (anchor): split at the first '#' only, later ones are left
+     * alone.
+     */
+    cp = StrChr(rest, '#');
+    if (cp != NULL) {
+        parts->anchor = cp + 1;
+        *cp = '\0';
+    }
+
+    /*
+     * Host and/or path.
+     */
+    if (*rest != '/') {
+        /* an empty remainder counts as "no relative part" */
+        parts->relative = (*rest != '\0') ? rest : NULL;
+    } else if (rest[1] != '/') {
+        parts->absolute = rest + 1;     /* root found but no host */
+    } else {
+        parts->host = rest + 2;
+        *rest = '\0';           /* terminate what precedes "//" */
+        cp = StrChr(parts->host, '/');
+        if (cp != NULL) {
+            *cp = '\0';         /* host ends at the first '/' */
+            parts->absolute = cp + 1;
+        } else if ((cp = StrChr(parts->host, '?')) != NULL) {
+            *cp = '\0';         /* host ends directly at a '?' */
+            parts->search = cp + 1;
+        }
+    }
+
+    /*
+     * Some schemes commonly carry an unescaped '#' that is not a fragment
+     * delimiter, e.g., news:j462#36487@foo.bar.  That applies when there is
+     * a scheme but no host (lynxcgi excepted), or the scheme is nntp, snews,
+     * news or data.  The first-letter test is only a quick filter.
+     */
+    if (parts->access != NULL && parts->anchor != NULL &&
+        StrChr("lnsdLNSD", *parts->access) != NULL) {
+        int hash_is_literal =
+            ((parts->host == NULL &&
+              strcasecomp(parts->access, "lynxcgi") != 0) ||
+             strcasecomp(parts->access, "nntp") == 0 ||
+             strcasecomp(parts->access, "snews") == 0 ||
+             strcasecomp(parts->access, "news") == 0 ||
+             strcasecomp(parts->access, "data") == 0);
+
+        /*
+         * Put the '#' back, but only if there is a path component for it
+         * to become part of. - kw
+         */
+        if (hash_is_literal &&
+            (parts->relative != NULL || parts->absolute != NULL)) {
+            *(parts->anchor - 1) = '#';
+            parts->anchor = NULL;
+        }
+    }
+}                               /*scan */
+
+#if defined(HAVE_ALLOCA) && !defined(LY_FIND_LEAKS) /* scratch buffers for the parsers in this file: alloca() when available and not leak-checking */
+#define LYalloca(x) alloca((size_t)(x))
+#define LYalloca_free(x) {} /* nothing to release in the alloca() case */
+#else
+#define LYalloca(x) malloc((size_t)(x)) /* heap fallback; callers check the result for NULL */
+#define LYalloca_free(x) free((void *)(x))
+#endif
+
+/*
+ * Like StrChr(), but when 'ch' does not occur in 'string', return a pointer
+ * to the string's terminating NUL rather than NULL.
+ */
+static char *strchr_or_end(char *string, int ch)
+{
+    char *hit = StrChr(string, ch);
+
+    return (hit != 0) ? hit : (string + strlen(string));
+}
+
+/*
+ * Given a host specification that may end with a port number, e.g.,
+ *      foobar:123
+ * return a pointer to the ':' which begins the ":port", so the caller can
+ * handle that substring easily.  A ':' inside square brackets (IPv6
+ * literal) is not taken as a port separator.
+ *
+ * The number is stored in *portp, which is first reset to 0.  It is stored
+ * whenever a ':' followed by a digit is examined, even if trailing text then
+ * makes the function reject that ':' and go on.
+ *
+ * If no port is found (or a syntax error), return null.
+ */
+char *HTParsePort(char *host, int *portp)
+{
+    int depth = 0;              /* nesting level of '[' ... ']' */
+    char *colon = NULL;
+    char *cp;
+
+    *portp = 0;
+    if (host == NULL)
+        return NULL;
+
+    for (cp = host; *cp != '\0' && colon == 0; cp++) {
+        if (*cp == '[') {       /* for ipv6 */
+            ++depth;
+        } else if (*cp == ']') {        /* for ipv6 */
+            --depth;
+        } else if (*cp == ':' && depth == 0 && isdigit(UCH(cp[1]))) {
+            char *digits = cp + 1;
+            char *stop = NULL;
+
+            *portp = (int) strtol(digits, &stop, 10);
+            /* accept only when the digits run to the end of the string */
+            if (stop != 0 && stop != digits && *stop == '\0') {
+                colon = cp;
+                CTRACE((tfp, "HTParsePort %d\n", *portp));
+            }
+        }
+    }
+    return colon;
+}
+
+#ifdef USE_IDNA
+/*
+ * Return the value (0..15) of one hexadecimal digit, in either case, or -1
+ * if 'ch' is not a hexadecimal digit.
+ */
+static int hex_decode(int ch)
+{
+    if (ch >= '0' && ch <= '9')
+        return ch - '0';
+    if (ch >= 'a' && ch <= 'f')
+        return (ch - 'a') + 10;
+    if (ch >= 'A' && ch <= 'F')
+        return (ch - 'A') + 10;
+    return -1;
+}
+
+/*
+ * Convert in-place the given hostname to IDNA form.  That requires up to 64
+ * characters, and we've allowed for that, with MIN_PARSE.
+ *
+ * The host is first copied to a scratch buffer with %xx escapes decoded,
+ * stopping at the first RFC 3986 gen-delim; everything from that delimiter
+ * on is set aside and appended again after the conversion.  If an escape is
+ * malformed, an allocation fails, or the conversion fails, 'host' is left as
+ * it was.
+ */
+static void convert_to_idna(char *host)
+{
+    size_t length = strlen(host);
+    char *endhost = host + length;
+    char *buffer = malloc(length + 1);  /* decoded host name */
+    char *params = malloc(length + 1);  /* delimiter and what follows */
+    char *output = NULL;
+    char *src = host;
+    char *dst = buffer;
+    int usable = TRUE;
+    int code;
+    int hi, lo;
+
+    if (buffer == NULL || params == NULL)
+        goto cleanup;
+
+    *params = '\0';
+    while (src < endhost) {
+        int ch = *src++;
+
+        if (RFC_3986_GEN_DELIMS(ch)) {
+            strcpy(params, src - 1);
+            break;
+        }
+        if (ch != HEX_ESCAPE) {
+            *dst++ = (char) ch;
+            continue;
+        }
+        /* an escape needs two hex digits inside the string */
+        if ((src + 1) < endhost
+            && (hi = hex_decode(src[0])) >= 0
+            && (lo = hex_decode(src[1])) >= 0) {
+            *dst++ = (char) ((hi << 4) | lo);
+            src += 2;
+        } else {
+            CTRACE((tfp, "convert_to_idna: `%s' is malformed\n", host));
+            usable = FALSE;
+            break;
+        }
+    }
+
+    if (usable) {
+        *dst = '\0';
+        code = idna_to_ascii_8z(buffer, &output, IDNA_USE_STD3_ASCII_RULES);
+        if (code == IDNA_SUCCESS) {
+            strcpy(host, output);
+            strcat(host, params);
+        } else {
+            CTRACE((tfp, "convert_to_idna: `%s': %s\n",
+                    buffer,
+                    idna_strerror((Idna_rc) code)));
+        }
+        if (output)
+            idn_free(output);
+    }
+
+  cleanup:
+    free(buffer);
+    free(params);
+}
+#define MIN_PARSE 80
+#else
+#define MIN_PARSE 8
+#endif
+
+/* Parse a Name relative to another name. HTParse()
+ * --------------------------------------
+ *
+ * This returns those parts of a name which are given (and requested)
+ * substituting bits from the related name where necessary.
+ *
+ * Originally based on RFC 1808, some details in RFC 3986 are used.
+ *
+ * On entry,
+ *   aName        A filename given
+ *   relatedName  A name relative to which aName is to be parsed
+ *   wanted       A mask for the bits which are wanted:
+ *                PARSE_ACCESS, PARSE_HOST, PARSE_PATH, PARSE_ANCHOR select
+ *                the parts; PARSE_PUNCTUATION adds the ":", "//", "/" and
+ *                "#" separators.  PARSE_STRICTPATH asks for the path
+ *                without its "?query", PARSE_QUERY for the query without
+ *                the path; both together, or either with PARSE_PATH, are
+ *                treated as plain PARSE_PATH.
+ *
+ * On exit,
+ *   returns      A pointer to a malloc'd string which MUST BE FREED.
+ *                It is an empty string if the work buffer would exceed
+ *                max_uri_size.
+ *
+ * Both names must be non-null; their lengths are taken unconditionally.
+ */
+char *HTParse(const char *aName,
+              const char *relatedName,
+              int wanted)
+{
+    char *result = NULL;
+    char *tail = NULL;          /* a pointer to the end of the 'result' string */
+    char *return_value = NULL;
+    size_t len, len1, len2;
+    size_t need;
+    char *name = NULL;
+    char *rel = NULL;
+    char *p, *q;
+    char *acc_method;
+    struct struct_parts given, related;
+
+    CTRACE((tfp, "HTParse: aName:`%s'\n", aName));
+    CTRACE((tfp, " relatedName:`%s'\n", relatedName));
+
+    if (wanted & (PARSE_STRICTPATH | PARSE_QUERY)) {    /* if detail wanted... */
+        if ((wanted & (PARSE_STRICTPATH | PARSE_QUERY))
+            == (PARSE_STRICTPATH | PARSE_QUERY))        /* if strictpath AND query */
+            wanted |= PARSE_PATH;       /* then treat as if PARSE_PATH wanted */
+        if (wanted & PARSE_PATH)        /* if PARSE_PATH wanted */
+            wanted &= ~(PARSE_STRICTPATH | PARSE_QUERY);        /* ignore details */
+    }
+/* *INDENT-OFF* */
+    CTRACE((tfp, " want:%s%s%s%s%s%s%s\n",
+            wanted & PARSE_PUNCTUATION ? " punc" : "",
+            wanted & PARSE_ANCHOR ? " anchor" : "",
+            wanted & PARSE_PATH ? " path" : "",
+            wanted & PARSE_HOST ? " host" : "",
+            wanted & PARSE_ACCESS ? " access" : "",
+            wanted & PARSE_STRICTPATH ? " PATH" : "",
+            wanted & PARSE_QUERY ? " QUERY" : ""));
+/* *INDENT-ON* */
+
+    /*
+     * Allocate the temporary string.  Optimized.
+     *
+     * One buffer holds three areas: the result under construction ('len'
+     * bytes), then the working copy of aName, then that of relatedName.
+     */
+    len1 = strlen(aName) + 1;
+    len2 = strlen(relatedName) + 1;
+    len = len1 + len2 + MIN_PARSE;      /* Lots of space: more than enough */
+
+    need = (len * 2 + len1 + len2);
+    if (need > (size_t) max_uri_size ||
+        (int) need < (int) len1 ||
+        (int) need < (int) len2)
+        return StrAllocCopy(return_value, "");
+
+    result = tail = (char *) LYalloca(need);
+    if (result == NULL) {
+        outofmem(__FILE__, "HTParse");
+    }
+    *result = '\0';
+    name = result + len;
+    rel = name + len1;
+
+    /*
+     * Make working copy of the input string to cut up.
+     */
+    MemCpy(name, aName, len1);
+
+    /*
+     * Cut up the string into URL fields.
+     */
+    scan(name, &given);
+    SHOW_PARTS(given);
+
+    /*
+     * Now related string.
+     */
+    if ((given.access && given.host && given.absolute) || !*relatedName) {
+        /*
+         * Inherit nothing!
+         */
+        related.access = NULL;
+        related.host = NULL;
+        related.absolute = NULL;
+        related.relative = NULL;
+        related.search = NULL;
+        related.anchor = NULL;
+    } else {
+        MemCpy(rel, relatedName, len2);
+        scan(rel, &related);
+    }
+    SHOW_PARTS(related);
+
+    /*
+     * Handle the scheme (access) field.
+     */
+    if (given.access && given.host && !given.relative && !given.absolute) {
+        if (!strcmp(given.access, "http") ||
+            !strcmp(given.access, "https") ||
+            !strcmp(given.access, "ftp")) {
+
+            /*
+             * Assume root.
+             */
+            given.absolute = empty_string;
+        }
+    }
+    acc_method = given.access ? given.access : related.access;
+    if (wanted & PARSE_ACCESS) {
+        if (acc_method) {
+            strcpy(tail, acc_method);
+            tail += strlen(tail);
+            if (wanted & PARSE_PUNCTUATION) {
+                *tail++ = ':';
+                *tail = '\0';
+            }
+        }
+    }
+
+    /*
+     * If different schemes, inherit nothing.
+     *
+     * We'll try complying with RFC 1808 and the Fielding draft, and inherit
+     * nothing if both schemes are given, rather than only when they differ,
+     * except for file URLs - FM
+     *
+     * After trying it for a while, it's still premature, IHMO, to go along
+     * with it, so this is back to inheriting for identical schemes whether or
+     * not they are "file".  If you want to try it again yourself, uncomment
+     * the strcasecomp() below.  - FM
+     */
+    if ((given.access && related.access) &&
+        ( /* strcasecomp(given.access, "file") || */
+         strcmp(given.access, related.access))) {
+        related.host = NULL;
+        related.absolute = NULL;
+        related.relative = NULL;
+        related.search = NULL;
+        related.anchor = NULL;
+    }
+
+    /*
+     * Handle the host field.
+     */
+    if (wanted & PARSE_HOST) {
+        if (given.host || related.host) {
+            if (wanted & PARSE_PUNCTUATION) {
+                *tail++ = '/';
+                *tail++ = '/';
+            }
+            strcpy(tail, given.host ? given.host : related.host);
+            /*
+             * Ignore default port numbers, and trailing dots on FQDNs, which
+             * will only cause identical addresses to look different.  (related
+             * is already a clean url).
+             */
+            {
+                char *p2, *h;
+                int portnumber;
+                int gen_delims = 0;
+
+                if ((p2 = HTSkipToAt(result, &gen_delims)) != NULL
+                    && gen_delims == 0) {
+                    tail = (p2 + 1);
+                }
+                p2 = HTParsePort(result, &portnumber);
+                if (p2 != NULL && acc_method != NULL) {
+                    /*
+                     * Port specified.
+                     */
+#define ACC_METHOD(a,b) (!strcmp(acc_method, a) && (portnumber == b))
+                    if (ACC_METHOD("http", 80) ||
+                        ACC_METHOD("https", 443) ||
+                        ACC_METHOD("gopher", 70) ||
+                        ACC_METHOD("ftp", 21) ||
+                        ACC_METHOD("wais", 210) ||
+                        ACC_METHOD("nntp", 119) ||
+                        ACC_METHOD("news", 119) ||
+                        ACC_METHOD("newspost", 119) ||
+                        ACC_METHOD("newsreply", 119) ||
+                        ACC_METHOD("snews", 563) ||
+                        ACC_METHOD("snewspost", 563) ||
+                        ACC_METHOD("snewsreply", 563) ||
+                        ACC_METHOD("finger", 79) ||
+                        ACC_METHOD("telnet", 23) ||
+                        ACC_METHOD("tn3270", 23) ||
+                        ACC_METHOD("rlogin", 513) ||
+                        ACC_METHOD("cso", 105))
+                        *p2 = '\0';     /* It is the default: ignore it */
+                }
+                if (p2 == NULL) {
+                    int len3 = (int) strlen(tail);
+
+                    if (len3 > 0) {
+                        h = tail + len3 - 1;    /* last char of hostname */
+                        if (*h == '.')
+                            *h = '\0';  /* chop final . */
+                    }
+                } else if (p2 != result) {
+                    h = p2;
+                    h--;        /* End of hostname */
+                    if (*h == '.') {
+                        /*
+                         * Slide p2 over h.
+                         */
+                        while (*p2 != '\0')
+                            *h++ = *p2++;
+                        *h = '\0';      /* terminate */
+                    }
+                }
+            }
+#ifdef USE_IDNA
+            /*
+             * Depending on locale-support, we could have a literal UTF-8
+             * string as a host name, or a URL-encoded form of that.
+             */
+            convert_to_idna(tail);
+#endif
+        }
+    }
+
+    /*
+     * Trim any blanks from the result so far - there's no excuse for blanks
+     * in a hostname.  Also update the tail here.
+     */
+    tail = LYRemoveBlanks(result);
+
+    /*
+     * If host in given or related was ended directly with a '?' (no slash),
+     * fake the search part into absolute.  This is the only case search is
+     * returned from scan.  A host must have been present.  this restores the
+     * '?' at which the host part had been truncated in scan, we have to do
+     * this after host part handling is done.  - kw
+     */
+    if (given.search && *(given.search - 1) == '\0') {
+        given.absolute = given.search - 1;
+        given.absolute[0] = '?';
+    } else if (related.search && !related.absolute &&
+               *(related.search - 1) == '\0') {
+        related.absolute = related.search - 1;
+        related.absolute[0] = '?';
+    }
+
+    /*
+     * If different hosts, inherit no path.
+     */
+    if (given.host && related.host)
+        if (strcmp(given.host, related.host) != 0) {
+            related.absolute = NULL;
+            related.relative = NULL;
+            related.anchor = NULL;
+        }
+
+    /*
+     * Handle the path.
+     */
+    if (wanted & (PARSE_PATH | PARSE_STRICTPATH | PARSE_QUERY)) {
+        int want_detail = (wanted & (PARSE_STRICTPATH | PARSE_QUERY));
+
+        if (acc_method && !given.absolute && given.relative) {
+            /*
+             * Treat all given nntp or snews paths, or given paths for news
+             * URLs with a host, as absolute.
+             */
+            switch (*acc_method) {
+            case 'N':
+            case 'n':
+                if (!strcasecomp(acc_method, "nntp") ||
+                    (!strcasecomp(acc_method, "news") &&
+                     !strncasecomp(result, "news://", 7))) {
+                    given.absolute = given.relative;
+                    given.relative = NULL;
+                }
+                break;
+            case 'S':
+            case 's':
+                if (!strcasecomp(acc_method, "snews")) {
+                    given.absolute = given.relative;
+                    given.relative = NULL;
+                }
+                break;
+            }
+        }
+
+        if (given.absolute) {   /* All is given */
+            char *base = tail;
+
+            if (wanted & PARSE_PUNCTUATION)
+                *tail++ = '/';
+            strcpy(tail, given.absolute);
+            HTSimplify(base, TRUE);
+            CTRACE((tfp, "HTParse: (ABS)\n"));
+        } else if (related.absolute) {  /* Adopt path not name */
+            char *base = tail;
+
+            *tail++ = '/';
+            strcpy(tail, related.absolute);
+            if (given.relative) {
+                /* RFC 1808 part 4 step 5 (if URL path is empty) */
+                /* a) if given has params, add/replace that */
+                if (given.relative[0] == ';') {
+                    strcpy(strchr_or_end(tail, ';'), given.relative);
+                }
+                /* b) if given has query, add/replace that */
+                else if (given.relative[0] == '?') {
+                    strcpy(strchr_or_end(tail, '?'), given.relative);
+                }
+                /* otherwise fall through to RFC 1808 part 4 step 6 */
+                else {
+                    p = StrChr(tail, '?');      /* Search part? */
+                    if (p == NULL)
+                        p = (tail + strlen(tail) - 1);
+                    /* stops at the '/' stored at 'base' if not sooner */
+                    for (; *p != '/'; p--) ;    /* last / */
+                    p[1] = '\0';        /* Remove filename */
+                    strcat(p, given.relative);  /* Add given one */
+                }
+                HTSimplify(base, FALSE);
+                if (*base == '\0')
+                    strcpy(base, "/");
+            } else {
+                HTSimplify(base, TRUE);
+            }
+            CTRACE((tfp, "HTParse: (Related-ABS)\n"));
+        } else if (given.relative) {
+            strcpy(tail, given.relative);       /* what we've got */
+            HTSimplify(tail, FALSE);
+            CTRACE((tfp, "HTParse: (REL)\n"));
+        } else if (related.relative) {
+            strcpy(tail, related.relative);
+            HTSimplify(tail, FALSE);
+            CTRACE((tfp, "HTParse: (Related-REL)\n"));
+        } else {                /* No inheritance */
+            if (!isLYNXCGI(aName) &&
+                !isLYNXEXEC(aName) &&
+                !isLYNXPROG(aName)) {
+                *tail++ = '/';
+                *tail = '\0';
+            } else {
+                HTSimplify(tail, FALSE);
+            }
+            if (!strcmp(result, "news:/"))
+                result[5] = '*';
+            CTRACE((tfp, "HTParse: (No inheritance)\n"));
+        }
+        if (want_detail) {
+            /*
+             * After the normalization at the top of this function, exactly
+             * one of PARSE_STRICTPATH / PARSE_QUERY is set here.
+             */
+            p = StrChr(tail, '?');      /* Search part? */
+            if (p) {
+                /*
+                 * Test the caller's mask, not the (always nonzero)
+                 * PARSE_STRICTPATH constant itself; otherwise a PARSE_QUERY
+                 * request would get the query cut off as well.
+                 */
+                if (wanted & PARSE_STRICTPATH) {
+                    *p = '\0';  /* keep the path, drop the query */
+                } else {
+                    /* keep the query, moved down over the path */
+                    if (!(wanted & PARSE_PUNCTUATION))
+                        p++;
+                    /*
+                     * Leave 'tail' on the terminator, so that an anchor
+                     * appended below stays part of the result.
+                     */
+                    while ((*tail = *p++) != '\0')
+                        tail++;
+                }
+            } else {
+                if (wanted & PARSE_QUERY)
+                    *tail = '\0';       /* no query present: nothing to return */
+            }
+        }
+    }
+
+    /*
+     * Handle the fragment (anchor).  Never inherit.
+     */
+    if (wanted & PARSE_ANCHOR) {
+        if (given.anchor && *given.anchor) {
+            tail += strlen(tail);
+            if (wanted & PARSE_PUNCTUATION)
+                *tail++ = '#';
+            strcpy(tail, given.anchor);
+        }
+    }
+
+    /*
+     * If there are any blanks remaining in the string, escape them as needed.
+     * See the discussion in LYLegitimizeHREF() for example.
+     */
+    if ((p = StrChr(result, ' ')) != 0) {
+        switch (is_url(result)) {
+        case UNKNOWN_URL_TYPE:
+            CTRACE((tfp, "HTParse: ignore:`%s'\n", result));
+            break;
+        case LYNXEXEC_URL_TYPE:
+        case LYNXPROG_URL_TYPE:
+        case LYNXCGI_URL_TYPE:
+        case LYNXPRINT_URL_TYPE:
+        case LYNXHIST_URL_TYPE:
+        case LYNXDOWNLOAD_URL_TYPE:
+        case LYNXKEYMAP_URL_TYPE:
+        case LYNXIMGMAP_URL_TYPE:
+        case LYNXCOOKIE_URL_TYPE:
+        case LYNXCACHE_URL_TYPE:
+        case LYNXDIRED_URL_TYPE:
+        case LYNXOPTIONS_URL_TYPE:
+        case LYNXCFG_URL_TYPE:
+        case LYNXCOMPILE_OPTS_URL_TYPE:
+        case LYNXMESSAGES_URL_TYPE:
+            CTRACE((tfp, "HTParse: spaces:`%s'\n", result));
+            break;
+        case NOT_A_URL_TYPE:
+        default:
+            CTRACE((tfp, "HTParse: encode:`%s'\n", result));
+            do {
+                /* shift the rest (and its NUL) up by two, then write %20 */
+                q = p + strlen(p) + 2;
+
+                while (q != p + 1) {
+                    q[0] = q[-2];
+                    --q;
+                }
+                p[0] = HEX_ESCAPE;
+                p[1] = '2';
+                p[2] = '0';
+            } while ((p = StrChr(result, ' ')) != 0);
+            break;
+        }
+    }
+    CTRACE((tfp, "HTParse: result:`%s'\n", result));
+
+    StrAllocCopy(return_value, result);
+    LYalloca_free(result);
+
+    /* FIXME: could be optimized using HTParse() internals */
+    if (*relatedName &&
+        ((wanted & PARSE_ALL_WITHOUT_ANCHOR) == PARSE_ALL_WITHOUT_ANCHOR)) {
+        /*
+         * Check whether to fill in localhost.  - FM
+         */
+        LYFillLocalFileURL(&return_value, relatedName);
+        CTRACE((tfp, "pass LYFillLocalFile:`%s'\n", return_value));
+    }
+
+    return return_value;        /* exactly the right length */
+}
+
+/* HTParseAnchor(), fast HTParse() specialization
+ * ----------------------------------------------
+ *
+ * On entry,
+ *   aName   A NUL-terminated address; it is not modified.
+ *
+ * On exit,
+ *   returns A pointer within input string: just past the first '#' when
+ *           scan() accepts that '#' as a fragment delimiter, otherwise to
+ *           the string's end '\0'.  The end is also returned when there is
+ *           no '#', or when the copy needed for scan() would exceed
+ *           max_uri_size.
+ */
+const char *HTParseAnchor(const char *aName)
+{
+    const char *p = aName;
+
+    for (; *p && *p != '#'; p++) {
+        ;
+    }
+    if (*p == '#') {
+        /* the safe way based on HTParse() -
+         * keeping in mind scan() peculiarities on schemes:
+         */
+        struct struct_parts given;
+        size_t need = ((unsigned) ((p - aName) + (int) strlen(p) + 1));
+        char *name;
+        int has_anchor;
+
+        if (need > (size_t) max_uri_size) {
+            p += strlen(p);
+        } else {
+            name = (char *) LYalloca(need);
+
+            if (name == NULL) {
+                outofmem(__FILE__, "HTParseAnchor");
+            }
+            strcpy(name, aName);
+            scan(name, &given);
+            /*
+             * given.anchor points into 'name'; note the verdict before the
+             * buffer is released, so the pointer is not read afterwards.
+             */
+            has_anchor = (given.anchor != NULL);
+            LYalloca_free(name);
+
+            p++;                /*next to '#' */
+            if (!has_anchor) {
+                for (; *p; p++) /*scroll to end '\0' */
+                    ;
+            }
+        }
+    }
+    return p;
+}
+
+/* Simplify a filename. HTSimplify()
+ * --------------------
+ *
+ * A unix-style file is allowed to contain the sequence xxx/../ which may
+ * be replaced by "" , and the sequence "/./" which may be replaced by "/".
+ * Simplification helps us recognize duplicate filenames.
+ *
+ * RFC 3986 section 5.2.4 says to do this whether or not the path was relative.
+ *
+ * On entry,
+ *   filename  a writable, NUL-terminated path; it is edited in place and
+ *             never grows.
+ *   absolute  if FALSE and the path begins with a path separator, that
+ *             first character is stepped over and left untouched.
+ *
+ * Only the part before the first '?' or '#' is examined; the text from that
+ * character on is moved along with the rest but is not itself inspected.
+ * The labels 2A..2D in the body show up in the trace output (they look like
+ * the step names of RFC 3986 section 5.2.4 -- NOTE(review): confirm).
+ */
+void HTSimplify(char *filename, BOOL absolute)
+{
+#define MY_FMT "HTParse HTSimplify\t(%s)"
+#ifdef NO_LYNX_TRACE
+#define debug_at(at) /* nothing */
+#define atln "?"
+#else
+ const char *atln;
+
+#define debug_at(at) atln = at
+#endif
+ char *mark; /* start of the text being simplified; backing up stops here */
+ char *p;
+ size_t limit; /* bytes from 'filename' up to the first '?' or '#', or the end */
+
+ CTRACE2(TRACE_HTPARSE,
+ (tfp, MY_FMT " %s\n",
+ filename,
+ absolute ? "ABS" : "REL"));
+
+ if (LYIsPathSep(*filename) && !absolute)
+ ++filename;
+ mark = filename;
+ limit = strlen(filename);
+
+ for (p = filename; *p; ++p) {
+ if (*p == '?' || *p == '#') {
+ limit = (size_t) (p - filename);
+ break;
+ }
+ }
+ while ((limit != 0) && (*filename != '\0')) {
+ size_t trim = 0; /* number of bytes to delete at 'filename' */
+ size_t skip = 0; /* nonzero: the first byte kept after the deletion is written as '/' */
+ size_t last = 0; /* nonzero: also delete the segment before 'filename' */
+
+ debug_at("?");
+ p = filename;
+ if (limit >= 2 && !memcmp(p, "./", 2)) { /* 2A: segment starts with "./" */
+ debug_at("2A");
+ trim = 2;
+ } else if (limit >= 3 && !memcmp(p, "../", 3)) { /* segment starts with "../" */
+ debug_at("2A2");
+ trim = 3;
+ } else if (limit >= 3 && !memcmp(p, "/./", 3)) { /* 2B: "/./" becomes "/" */
+ debug_at("2B");
+ trim = 2;
+ skip = 1;
+ } else if (limit == 2 && !memcmp(p, "/.", 2)) { /* "/." is all that remains of the path part */
+ debug_at("2B2");
+ trim = 1;
+ skip = 1;
+ } else if (limit >= 4 && !memcmp(p, "/../", 4)) { /* 2C: "/../" also takes the previous segment */
+ debug_at("2C");
+ trim = 3;
+ skip = 1;
+ last = 1;
+ } else if (limit == 3 && !memcmp(p, "/..", 3)) { /* "/.." is all that remains of the path part */
+ debug_at("2C2");
+ trim = 2;
+ skip = 1;
+ last = 1;
+ } else if (limit == 2 && !memcmp(p, "..", 2)) { /* 2D: only ".." remains */
+ debug_at("2D");
+ trim = 2;
+ } else if (limit == 1 && !memcmp(p, ".", 1)) { /* only "." remains */
+ debug_at("2D2");
+ trim = 1;
+ }
+ if (trim) {
+ CTRACE2(TRACE_HTPARSE,
+ (tfp, MY_FMT " trim %lu/%lu (%.*s) '%.*s' @%s\n",
+ mark, (unsigned long) trim, (unsigned long) limit,
+ (int) trim, p + skip, (int) limit, p, atln));
+ }
+ if (last) {
+ char *prior = filename; /* will point at the '/' opening the previous segment, or at 'mark' */
+
+ if (prior != mark) {
+ --prior;
+ while (prior != mark && *prior != '/') {
+ --prior;
+ }
+ }
+ if (prior != filename) { /* widen the deletion to start at 'prior' */
+ trim += (size_t) (filename - prior);
+ limit += (size_t) (filename - prior);
+ filename = p = prior;
+ CTRACE2(TRACE_HTPARSE,
+ (tfp, MY_FMT " TRIM %lu/%lu (%.*s)\n",
+ mark, (unsigned long) trim, (unsigned long) limit,
+ (int) trim, filename + skip));
+ }
+ }
+ if (trim) {
+ limit -= trim;
+ for (p = filename;; ++p) { /* slide the remainder, terminator included, down by 'trim' bytes */
+ if ((p[0] = p[trim]) == '\0') {
+ break;
+ }
+ if (skip) { /* only on the first pass of this loop */
+ p[0] = '/';
+ skip = 0;
+ }
+ }
+ CTRACE2(TRACE_HTPARSE,
+ (tfp, MY_FMT " loop %lu\n", mark, (unsigned long) limit));
+ } else { /* nothing to remove here: step over this segment, up to the next '/' */
+ if (*filename == '/') {
+ ++filename;
+ --limit;
+ }
+ while ((limit != 0) && (*filename != '/')) {
+ ++filename;
+ --limit;
+ }
+ }
+ }
+ CTRACE2(TRACE_HTPARSE, (tfp, MY_FMT " done\n", mark));
+#undef MY_FMT
+}
+
+/* Make Relative Name. HTRelative()
+ * -------------------
+ *
+ * Build a string expressing one address relative to another.  Where the two
+ * have nothing in common, the absolute address is returned.
+ *
+ * On entry,
+ *   Both names must be absolute, fully qualified names of nodes
+ *   (no anchor bits)
+ *
+ * On exit,
+ *   The return result points to a newly allocated name which, if
+ *   parsed by HTParse relative to relatedName, will yield aName.
+ *   The caller is responsible for freeing the resulting name later.
+ *
+ */
+char *HTRelative(const char *aName,
+                 const char *relatedName)
+{
+    char *result = NULL;
+    const char *mine = aName;
+    const char *theirs = relatedName;
+    const char *after_access = NULL;    /* just past the last matching ':' */
+    const char *path = NULL;            /* the third matching '/' */
+    const char *last_slash = NULL;      /* the last matching '/' */
+    int slashes = 0;
+
+    /* Find extent of match */
+    while (*mine != '\0' && *mine == *theirs) {
+        if (*mine == ':') {
+            after_access = mine + 1;
+        } else if (*mine == '/') {
+            last_slash = mine;
+            if (++slashes == 3)
+                path = mine;
+        }
+        mine++;
+        theirs++;
+    }
+
+    /* mine, theirs point to the first non-matching character or zero */
+
+    if (after_access == NULL) { /* Different access */
+        StrAllocCopy(result, aName);
+    } else if (slashes < 3) {   /* Different nodes */
+        StrAllocCopy(result, after_access);
+    } else if (slashes == 3) {  /* Same node, different path */
+        StrAllocCopy(result, path);
+    } else {                    /* Some path in common */
+        unsigned levels = 0;
+        char *out;
+
+        /* one "../" per '/' left in the unmatched part of relatedName */
+        for (; *theirs != '\0' && *theirs != '#'; theirs++) {
+            if (*theirs == '/')
+                levels++;
+        }
+        result = typecallocn(char, 3 * levels + strlen(last_slash) + 1);
+
+        if (result == NULL)
+            outofmem(__FILE__, "HTRelative");
+
+        out = result;
+        for (; levels != 0; levels--) {
+            strcpy(out, "../");
+            out += 3;
+        }
+        strcpy(out, last_slash + 1);
+    }
+    CTRACE((tfp,
+            "HTparse: `%s' expressed relative to\n `%s' is\n `%s'.\n",
+            aName, relatedName, result));
+    return result;
+}
+
+/*
+ * Allocate (via typecallocn) room for the (next - base) characters between
+ * two pointers into one string, plus 'extra' more.  The arguments are
+ * parenthesized so that any expression may be passed.
+ */
+#define AlloCopy(next,base,extra) \
+    typecallocn(char, (((next) - (base)) + ((int) (extra))))
+
+/* Escape undesirable characters using % HTEscape()
+ * -------------------------------------
+ *
+ * This function takes a pointer to a string in which
+ * some characters may be unacceptable unescaped.
+ * It returns a string which has these characters
+ * represented by a '%' character followed by two hex digits.
+ *
+ * Unlike HTUnEscape(), this routine returns a calloc'd string.
+ */
+/* *INDENT-OFF* */
+static const unsigned char isAcceptable[96] =
+
+/* Bit 0 xalpha -- see HTFile.h
+ * Bit 1 xpalpha -- as xalpha but with plus.
+ * Bit 2 ... path -- as xpalphas but with /
+ *
+ * One entry per ASCII code 32..127, indexed by (code - 32).  ACCEPTABLE()
+ * ANDs the entry with the caller's mask; a zero result means the character
+ * gets escaped.  Entries here are 0 (always escape), 4 ('/' only), 6 ('+')
+ * or 7.
+ */
+ /* 0 1 2 3 4 5 6 7 8 9 A B C D E F */
+ { 0,0,0,0,0,0,0,0,0,0,7,6,0,7,7,4, /* 2x !"#$%&'()*+,-./ */
+ 7,7,7,7,7,7,7,7,7,7,0,0,0,0,0,0, /* 3x 0123456789:;<=>? */
+ 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, /* 4x @ABCDEFGHIJKLMNO */
+ 7,7,7,7,7,7,7,7,7,7,7,0,0,0,0,7, /* 5X PQRSTUVWXYZ[\]^_ */
+ 0,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, /* 6x `abcdefghijklmno */
+ 7,7,7,7,7,7,7,7,7,7,7,0,0,0,0,0 }; /* 7X pqrstuvwxyz{|}~ DEL */
+/* *INDENT-ON* */
+
+/* Digits for the two hexadecimal characters of a %xx escape. */
+static const char *hex = "0123456789ABCDEF";
+
+/*
+ * Nonzero if the ASCII code 'a' may be left unescaped.  Codes outside
+ * 32..127 never are; otherwise the isAcceptable[] entry is tested against
+ * 'mask'.  Note that 'mask' is not a parameter: it is taken from the scope
+ * where the macro is used.  'a' is evaluated up to three times, so pass an
+ * expression without side effects.
+ */
+#define ACCEPTABLE(a) ( (a)>=32 && (a)<128 && ((isAcceptable[(a)-32]) & mask))
+
+/*
+ * Return a newly allocated copy of 'str' in which each character that
+ * ACCEPTABLE() rejects for 'mask' is written as '%' and two uppercase hex
+ * digits of its ASCII code.  The caller frees the result.
+ */
+char *HTEscape(const char *str,
+               unsigned mask)
+{
+    const char *src;
+    char *dst;
+    char *escaped;
+    size_t extra = 0;           /* bytes needed beyond the input length */
+
+    /* first pass: each escaped character grows from one byte to three */
+    for (src = str; *src != '\0'; src++) {
+        if (!ACCEPTABLE(UCH(TOASCII(*src))))
+            extra += 2;
+    }
+    escaped = AlloCopy(src, str, extra + 1);
+
+    if (escaped == NULL)
+        outofmem(__FILE__, "HTEscape");
+
+    /* second pass: copy or escape */
+    dst = escaped;
+    for (src = str; *src != '\0'; src++) {
+        unsigned char code = UCH(TOASCII(*src));
+
+        if (ACCEPTABLE(code)) {
+            *dst++ = *src;
+        } else {
+            *dst++ = HEX_ESCAPE;        /* Means hex coming */
+            *dst++ = hex[code >> 4];
+            *dst++ = hex[code & 15];
+        }
+    }
+    *dst = '\0';                /* Terminate */
+    return escaped;
+}
+
+/* Escape unsafe characters using % HTEscapeUnsafe()
+ * --------------------------------
+ *
+ * This function takes a pointer to a string in which some unsafe
+ * characters may be present unescaped.  Unsafe here means an ASCII code
+ * of 32 (space) or below, or of 127 or above.
+ * It returns a string which has these characters
+ * represented by a '%' character followed by two hex digits.
+ *
+ * Unlike HTUnEscape(), this routine returns a newly allocated string.
+ */
+#define UNSAFE(ch) (((ch) <= 32) || ((ch) >= 127))
+
+char *HTEscapeUnsafe(const char *str)
+{
+    const char *src;
+    char *dst;
+    char *escaped;
+    size_t extra = 0;           /* bytes needed beyond the input length */
+
+    /* first pass: each escaped character grows from one byte to three */
+    for (src = str; *src != '\0'; src++) {
+        if (UNSAFE(UCH(TOASCII(*src))))
+            extra += 2;
+    }
+    escaped = AlloCopy(src, str, extra + 1);
+
+    if (escaped == NULL)
+        outofmem(__FILE__, "HTEscapeUnsafe");
+
+    /* second pass: copy or escape */
+    dst = escaped;
+    for (src = str; *src != '\0'; src++) {
+        unsigned char code = UCH(TOASCII(*src));
+
+        if (!UNSAFE(code)) {
+            *dst++ = *src;
+        } else {
+            *dst++ = HEX_ESCAPE;        /* Means hex coming */
+            *dst++ = hex[code >> 4];
+            *dst++ = hex[code & 15];
+        }
+    }
+    *dst = '\0';                /* Terminate */
+    return escaped;
+}
+
+/* Escape undesirable characters using % but space to +. HTEscapeSP()
+ * -----------------------------------------------------
+ *
+ * This function takes a pointer to a string in which
+ * some characters may be unacceptable unescaped.
+ * It returns a string which has these characters
+ * represented by a '%' character followed by two hex digits,
+ * except that spaces are converted to '+' instead of %20.
+ *
+ * On entry,
+ *   str   the NUL-terminated string to escape
+ *   mask  selects, via ACCEPTABLE(), which characters may stay as they are
+ *
+ * Unlike HTUnEscape(), this routine returns a calloced string,
+ * which the caller must free.
+ */
+char *HTEscapeSP(const char *str,
+                 unsigned mask)
+{
+    const char *p;
+    char *q;
+    char *result;
+    size_t unacceptable = 0;
+
+    /*
+     * Count the characters that will grow to three bytes.  A space (ASCII
+     * 32) becomes a single '+', so it is not counted.  The test is made on
+     * the ASCII code, exactly as in the copying loop below.
+     */
+    for (p = str; *p; p++) {
+        unsigned char a = UCH(TOASCII(*p));
+
+        if (a != 32 && !ACCEPTABLE(a))
+            unacceptable++;
+    }
+    result = AlloCopy(p, str, (unacceptable * 2) + 1);
+
+    if (result == NULL)
+        outofmem(__FILE__, "HTEscapeSP");
+
+    for (q = result, p = str; *p; p++) {
+        unsigned char a = UCH(TOASCII(*p));
+
+        if (a == 32) {
+            *q++ = '+';
+        } else if (!ACCEPTABLE(a)) {
+            *q++ = HEX_ESCAPE;  /* Means hex coming */
+            *q++ = hex[a >> 4];
+            *q++ = hex[a & 15];
+        } else {
+            *q++ = *p;
+        }
+    }
+    *q = '\0';                  /* Terminate */
+    return result;
+}
+
+/* Decode %xx escaped characters. HTUnEscape()
+ * ------------------------------
+ *
+ * This function takes a pointer to a string in which some
+ * characters may have been encoded in %xy form, where xy is
+ * the ASCII hex code for character 16x+y.
+ * The string is converted in place, as it will never grow.
+ */
+/*
+ * Value of the hexadecimal digit 'c'.  Anything that is not '0'..'9' or
+ * 'A'..'F' is treated as a small letter, so callers must check the
+ * character with isxdigit() first.
+ */
+static char from_hex(int c)
+{
+    int value;
+
+    if (c >= '0' && c <= '9') {
+        value = c - '0';
+    } else if (c >= 'A' && c <= 'F') {
+        value = c - 'A' + 10;
+    } else {
+        value = c - 'a' + 10;   /* accept small letters just in case */
+    }
+    return (char) value;
+}
+
+/*
+ * Decode, in place, every "%xy" in 'str' whose x and y are both hex
+ * digits; anything else is copied unchanged.  Returns 'str' (also when it
+ * is NULL or empty).
+ */
+char *HTUnEscape(char *str)
+{
+    char *src = str;
+    char *dst = str;
+
+    if (src == NULL || *src == '\0')
+        return str;
+
+    while (*src != '\0') {
+        /*
+         * The tests for NUL shouldn't be needed, but better safe than sorry.
+         */
+        if (*src == HEX_ESCAPE &&
+            src[1] && src[2] &&
+            isxdigit(UCH(src[1])) &&
+            isxdigit(UCH(src[2]))) {
+            char decoded = (char) (from_hex(src[1]) * 16);
+
+            decoded = (char) (decoded + from_hex(src[2]));
+            /*
+             * Careful!  FROMASCII() may evaluate its arg more than once!
+             */
+            /* S/390 -- gil -- 0221 */
+            *dst++ = FROMASCII(decoded);
+            src += 3;
+        } else {
+            *dst++ = *src++;
+        }
+    }
+
+    *dst = '\0';
+    return str;
+
+}                               /* HTUnEscape */
+
+/* Decode some %xx escaped characters. HTUnEscapeSome()
+ * ----------------------------------- Klaus Weide
+ * (kweide@tezcat.com)
+ * This function takes a pointer to a string in which some
+ * characters may have been encoded in %xy form, where xy is
+ * the ASCII hex code for character 16x+y, and a pointer to
+ * a second string listing the characters which should be
+ * unescaped if they occur escaped in the first string.
+ * Other escapes, including %00, are left as they are.
+ * The first string is converted in place, as it will never grow.
+ * The first string is returned; nothing is done if either
+ * string is null or empty.
+ */
+char *HTUnEscapeSome(char *str,
+                     const char *do_trans)
+{
+    char *src = str;
+    char *dst = str;
+    char testcode;
+
+    if (src == NULL || *src == '\0' || do_trans == NULL || *do_trans == '\0')
+        return str;
+
+    while (*src != '\0') {
+        testcode = '\0';
+        if (*src == HEX_ESCAPE &&
+            src[1] && src[2] && /* tests shouldn't be needed, but.. */
+            isxdigit(UCH(src[1])) &&
+            isxdigit(UCH(src[2]))) {
+            testcode = (char) FROMASCII(from_hex(src[1]) * 16 +
+                                        from_hex(src[2]));
+        }
+        /* %00 no good; otherwise decode only the characters asked for */
+        if (testcode != '\0' && StrChr(do_trans, testcode)) {
+            *dst++ = testcode;
+            src += 3;
+        } else {
+            *dst++ = *src++;
+        }
+    }
+
+    *dst = '\0';
+    return str;
+
+}                               /* HTUnEscapeSome */
+/* *INDENT-OFF* */
+static const unsigned char crfc[96] =
+
+/* Bit 0 xalpha -- need "quoting"
+ * Bit 1 xpalpha -- need \escape if quoted
+ *
+ * One entry per ASCII code 32..127, indexed by (code - 32), as used by
+ * HTMake822Word(): bit 0 (value 1) marks a character that makes the word
+ * need quoting, bit 1 (value 2) one that gets a backslash in front of it.
+ * NOTE(review): the "xalpha"/"xpalpha" labels above look carried over from
+ * the isAcceptable[] table -- confirm.
+ */
+ /* 0 1 2 3 4 5 6 7 8 9 A B C D E F */
+ { 1,0,3,0,0,0,0,0,1,1,0,0,1,0,1,0, /* 2x !"#$%&'()*+,-./ */
+ 0,0,0,0,0,0,0,0,0,0,1,1,1,0,1,0, /* 3x 0123456789:;<=>? */
+ 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 4x @ABCDEFGHIJKLMNO */
+ 0,0,0,0,0,0,0,0,0,0,0,1,2,1,0,0, /* 5X PQRSTUVWXYZ[\]^_ */
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 6x `abcdefghijklmno */
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3 }; /* 7X pqrstuvwxyz{|}~ DEL */
+/* *INDENT-ON* */
+
+#define ASCII_TAB '\011' /* octal ASCII codes, for comparing with TOASCII() results in HTMake822Word(): horizontal tab */
+#define ASCII_LF '\012' /* line feed */
+#define ASCII_CR '\015' /* carriage return */
+#define ASCII_SPC '\040' /* space */
+#define ASCII_BAK '\134' /* backslash */
+
+/*
+ * Turn a string which is not a RFC 822 token into a quoted-string.  - KW
+ * The "quoted" parameter tells whether we need the beginning/ending quote
+ * marks.  If not, the caller will provide them -TD
+ *
+ * An empty or null *str is replaced by "\"\"" (or by "" if !quoted).  A
+ * string that needs no quoting is left alone.  Otherwise *str is freed and
+ * replaced by a newly allocated string.
+ */
+void HTMake822Word(char **str,
+                   int quoted)
+{
+    const char *src;
+    char *dst;
+    char *word;
+    unsigned char a;
+    unsigned added = 0;         /* extra bytes needed; 0 means "no quoting" */
+
+    if (isEmpty(*str)) {
+        StrAllocCopy(*str, quoted ? "\"\"" : "");
+        return;
+    }
+
+    /*
+     * First pass: decide whether quoting is needed, and how much room the
+     * quote marks, backslashes and folding blanks will take.
+     */
+    for (src = *str; *src != '\0'; src++) {
+        a = UCH(TOASCII(*src)); /* S/390 -- gil -- 0240 */
+        if (a >= 32 && a < 128 && !((crfc[a - 32]) & 1))
+            continue;           /* fine inside a token */
+        if (added == 0)
+            added = 2;          /* the two quote marks */
+        if (a >= 160 || a == '\t')
+            continue;
+        if (a == '\r' || a == '\n') {
+            added += 2;         /* backslash and a following blank */
+        } else if ((a & 127) < 32 || ((crfc[a - 32]) & 2)) {
+            added++;            /* backslash */
+        }
+    }
+    if (added == 0)
+        return;
+
+    word = AlloCopy(src, *str, added + 1);
+    if (word == NULL)
+        outofmem(__FILE__, "HTMake822Word");
+
+    dst = word;
+    if (quoted)
+        *dst++ = '"';
+    /*
+     * Having converted the character to ASCII, we can't use symbolic
+     * escape codes, since they're in the host character set, which
+     * is not necessarily ASCII.  Thus we use octal escape codes instead.
+     * -- gil (Paul Gilmartin) <pg@sweng.stortek.com>
+     */
+    /* S/390 -- gil -- 0268 */
+    for (src = *str; *src != '\0'; src++) {
+        int escape;
+
+        a = UCH(TOASCII(*src));
+        escape = ((a != ASCII_TAB) &&
+                  ((a & 127) < ASCII_SPC ||
+                   (a < 128 && ((crfc[a - 32]) & 2))));
+        if (escape)
+            *dst++ = ASCII_BAK;
+        *dst++ = *src;
+        /* a blank follows LF, and CR unless that CR is followed by LF */
+        if (a == ASCII_LF) {
+            *dst++ = ' ';
+        } else if (a == ASCII_CR && (TOASCII(*(src + 1)) != ASCII_LF)) {
+            *dst++ = ' ';
+        }
+    }
+    if (quoted)
+        *dst++ = '"';
+    *dst = '\0';                /* Terminate */
+    FREE(*str);
+    *str = word;
+}