diff options
Diffstat (limited to 'WWW/Library/Implementation/HTParse.c')
-rw-r--r-- | WWW/Library/Implementation/HTParse.c | 1383 |
1 files changed, 1383 insertions, 0 deletions
diff --git a/WWW/Library/Implementation/HTParse.c b/WWW/Library/Implementation/HTParse.c new file mode 100644 index 0000000..2e45441 --- /dev/null +++ b/WWW/Library/Implementation/HTParse.c @@ -0,0 +1,1383 @@ +/* + * $LynxId: HTParse.c,v 1.98 2021/07/27 21:29:49 tom Exp $ + * + * Parse HyperText Document Address HTParse.c + * ================================ + */ + +#include <HTUtils.h> +#include <HTParse.h> + +#include <LYUtils.h> +#include <LYLeaks.h> +#include <LYStrings.h> +#include <LYCharUtils.h> +#include <LYGlobalDefs.h> + +#ifdef HAVE_ALLOCA_H +#include <alloca.h> +#else +#ifdef __MINGW32__ +#include <malloc.h> +#endif /* __MINGW32__ */ +#endif + +#ifdef USE_IDN2 +#include <idn2.h> +#define FreeIdna(out) idn2_free(out) +#elif defined(USE_IDNA) +#include <idna.h> +#include <idn-free.h> +#define FreeIdna(out) idn_free(out) +#define IDN2_OK IDNA_SUCCESS +#endif + +#define HEX_ESCAPE '%' + +struct struct_parts { + char *access; + char *host; + char *absolute; + char *relative; + char *search; /* treated normally as part of path */ + char *anchor; +}; + +#if 0 /* for debugging */ +static void show_parts(const char *name, struct struct_parts *parts, int line) +{ + if (TRACE) { + CTRACE((tfp, "struct_parts(%s) %s@%d\n", name, __FILE__, line)); + CTRACE((tfp, " access '%s'\n", NONNULL(parts->access))); + CTRACE((tfp, " host '%s'\n", NONNULL(parts->host))); + CTRACE((tfp, " absolute '%s'\n", NONNULL(parts->absolute))); + CTRACE((tfp, " relative '%s'\n", NONNULL(parts->relative))); + CTRACE((tfp, " search '%s'\n", NONNULL(parts->search))); + CTRACE((tfp, " anchor '%s'\n", NONNULL(parts->anchor))); + } +} +#define SHOW_PARTS(name) show_parts(#name, &name, __LINE__) +#else +#define SHOW_PARTS(name) /* nothing */ +#endif + +/* Strip white space off a string. HTStrip() + * ------------------------------- + * + * On exit, + * Return value points to first non-white character, or to 0 if none. + * All trailing white space is OVERWRITTEN with zero. + */ +char *HTStrip(char *s) +{ +#define SPACE(c) ((c == ' ') || (c == '\t') || (c == '\n')) + char *p; + + for (p = s; *p; p++) { /* Find end of string */ + ; + } + for (p--; p >= s; p--) { + if (SPACE(*p)) + *p = '\0'; /* Zap trailing blanks */ + else + break; + } + while (SPACE(*s)) + s++; /* Strip leading blanks */ + return s; +} + +/* Scan a filename for its constituents. scan() + * ------------------------------------- + * + * On entry, + * name points to a document name which may be incomplete. + * On exit, + * absolute or relative may be nonzero (but not both). + * host, anchor and access may be nonzero if they were specified. + * Any which are nonzero point to zero terminated strings. + */ +static void scan(char *name, + struct struct_parts *parts) +{ + char *after_access; + char *p; + + parts->access = NULL; + parts->host = NULL; + parts->absolute = NULL; + parts->relative = NULL; + parts->search = NULL; /* normally not used - kw */ + parts->anchor = NULL; + + /* + * Scan left-to-right for a scheme (access). + */ + after_access = name; + for (p = name; *p; p++) { + if (*p == ':') { + *p = '\0'; + parts->access = name; /* Access name has been specified */ + after_access = (p + 1); + break; + } + if (*p == '/' || *p == '#' || *p == ';' || *p == '?') + break; + } + + /* + * Scan left-to-right for a fragment (anchor). + */ + for (p = after_access; *p; p++) { + if (*p == '#') { + parts->anchor = (p + 1); + *p = '\0'; /* terminate the rest */ + break; /* leave things after first # alone - kw */ + } + } + + /* + * Scan left-to-right for a host or absolute path. + */ + p = after_access; + if (*p == '/') { + if (p[1] == '/') { + parts->host = (p + 2); /* host has been specified */ + *p = '\0'; /* Terminate access */ + p = StrChr(parts->host, '/'); /* look for end of host name if any */ + if (p != NULL) { + *p = '\0'; /* Terminate host */ + parts->absolute = (p + 1); /* Root has been found */ + } else { + p = StrChr(parts->host, '?'); + if (p != NULL) { + *p = '\0'; /* Terminate host */ + parts->search = (p + 1); + } + } + } else { + parts->absolute = (p + 1); /* Root found but no host */ + } + } else { + parts->relative = (*after_access) ? + after_access : NULL; /* NULL for "" */ + } + + /* + * Check schemes that commonly have unescaped hashes. + */ + if (parts->access && parts->anchor && + /* optimize */ StrChr("lnsdLNSD", *parts->access) != NULL) { + if ((!parts->host && strcasecomp(parts->access, "lynxcgi")) || + !strcasecomp(parts->access, "nntp") || + !strcasecomp(parts->access, "snews") || + !strcasecomp(parts->access, "news") || + !strcasecomp(parts->access, "data")) { + /* + * Access specified but no host and not a lynxcgi URL, so the + * anchor may not really be one, e.g., news:j462#36487@foo.bar, or + * it's an nntp or snews URL, or news URL with a host. Restore the + * '#' in the address. + */ + /* but only if we have found a path component of which this will + * become part. - kw */ + if (parts->relative || parts->absolute) { + *(parts->anchor - 1) = '#'; + parts->anchor = NULL; + } + } + } +} /*scan */ + +#if defined(HAVE_ALLOCA) && !defined(LY_FIND_LEAKS) +#define LYalloca(x) alloca((size_t)(x)) +#define LYalloca_free(x) {} +#else +#define LYalloca(x) malloc((size_t)(x)) +#define LYalloca_free(x) free((void *)(x)) +#endif + +static char *strchr_or_end(char *string, int ch) +{ + char *result = StrChr(string, ch); + + if (result == 0) { + result = string + strlen(string); + } + return result; +} + +/* + * Given a host specification that may end with a port number, e.g., + * foobar:123 + * point to the ':' which begins the ":port" to make it simple to handle the + * substring. + * + * If no port is found (or a syntax error), return null. + */ +char *HTParsePort(char *host, int *portp) +{ + int brackets = 0; + char *result = NULL; + + *portp = 0; + if (host != NULL) { + while (*host != '\0' && result == 0) { + switch (*host++) { + case ':': + if (brackets == 0 && isdigit(UCH(*host))) { + char *next = NULL; + + *portp = (int) strtol(host, &next, 10); + if (next != 0 && next != host && *next == '\0') { + result = (host - 1); + CTRACE((tfp, "HTParsePort %d\n", *portp)); + } + } + break; + case '[': /* for ipv6 */ + ++brackets; + break; + case ']': /* for ipv6 */ + --brackets; + break; + } + } + } + return result; +} + +#if defined(USE_IDNA) || defined(USE_IDN2) +static int hex_decode(int ch) +{ + int result = -1; + + if (ch >= '0' && ch <= '9') + result = (ch - '0'); + else if (ch >= 'a' && ch <= 'f') + result = (ch - 'a') + 10; + else if (ch >= 'A' && ch <= 'F') + result = (ch - 'A') + 10; + return result; +} + +/* + * Convert in-place the given hostname to IDNA form. That requires up to 64 + * characters, and we've allowed for that, with MIN_PARSE. + */ +static void convert_to_idna(char *host) +{ + size_t length = strlen(host); + char *endhost = host + length; + char *buffer = malloc(length + 1); + char *params = malloc(length + 1); + char *output = NULL; + char *src, *dst; + int code; + int hi, lo; + + if (buffer != NULL && params != NULL) { + code = TRUE; + *params = '\0'; + for (dst = buffer, src = host; src < endhost; ++dst) { + int ch = *src++; + + if (RFC_3986_GEN_DELIMS(ch)) { + strcpy(params, src - 1); + *dst = '\0'; + break; + } else if (ch == HEX_ESCAPE) { + if ((src + 1) < endhost + && (hi = hex_decode(src[0])) >= 0 + && (lo = hex_decode(src[1])) >= 0) { + + *dst = (char) ((hi << 4) | lo); + src += 2; + } else { + CTRACE((tfp, "convert_to_idna: `%s' is malformed\n", host)); + code = FALSE; + break; + } + } else { + *dst = (char) ch; + } + } + if (code) { + *dst = '\0'; +#ifdef USE_IDN2 +#if (!defined(IDN2_VERSION_NUMBER) || IDN2_VERSION_NUMBER < 0x02000003) + /* + * Older libidn2 mishandles STD3, stripping underscores. + */ + if (strchr(buffer, '_') != NULL) { + code = -1; + } else +#endif + switch (LYidnaMode) { + case LYidna2003: + code = idn2_to_ascii_8z(buffer, &output, IDN2_TRANSITIONAL); + break; + case LYidna2008: + /* IDNA2008 rules without the TR46 amendments */ + code = idn2_to_ascii_8z(buffer, &output, 0); + break; + case LYidnaTR46: + code = idn2_to_ascii_8z(buffer, &output, IDN2_NONTRANSITIONAL + | IDN2_NFC_INPUT); + break; + case LYidnaCompat: + /* IDNA2008 */ + code = idn2_to_ascii_8z(buffer, &output, IDN2_NONTRANSITIONAL + | IDN2_NFC_INPUT); + if (code == IDN2_DISALLOWED) { + /* IDNA2003 - compatible */ + code = idn2_to_ascii_8z(buffer, &output, IDN2_TRANSITIONAL); + } + break; + } +#else + code = idna_to_ascii_8z(buffer, &output, IDNA_USE_STD3_ASCII_RULES); +#endif + if (code == IDN2_OK) { + CTRACE((tfp, "convert_to_idna: `%s' -> `%s': OK\n", buffer, output)); + strcpy(host, output); + strcat(host, params); + } else { + CTRACE((tfp, "convert_to_idna: `%s': %s\n", + buffer, + idna_strerror((Idna_rc) code))); + } + if (output) + FreeIdna(output); + } + } + free(buffer); + free(params); +} +#define MIN_PARSE 80 +#else +#define MIN_PARSE 8 +#endif + +/* Parse a Name relative to another name. HTParse() + * -------------------------------------- + * + * This returns those parts of a name which are given (and requested) + * substituting bits from the related name where necessary. + * + * Originally based on RFC 1808, some details in RFC 3986 are used. + * + * On entry, + * aName A filename given + * relatedName A name relative to which aName is to be parsed + * wanted A mask for the bits which are wanted. + * + * On exit, + * returns A pointer to a malloc'd string which MUST BE FREED + */ +char *HTParse(const char *aName, + const char *relatedName, + int wanted) +{ + char *result = NULL; + char *tail = NULL; /* a pointer to the end of the 'result' string */ + char *return_value = NULL; + size_t len, len1, len2; + size_t need; + char *name = NULL; + char *rel = NULL; + char *p, *q; + char *acc_method; + struct struct_parts given, related; + + CTRACE((tfp, "HTParse: aName:`%s'\n", aName)); + CTRACE((tfp, " relatedName:`%s'\n", relatedName)); + + if (wanted & (PARSE_STRICTPATH | PARSE_QUERY)) { /* if detail wanted... */ + if ((wanted & (PARSE_STRICTPATH | PARSE_QUERY)) + == (PARSE_STRICTPATH | PARSE_QUERY)) /* if strictpath AND query */ + wanted |= PARSE_PATH; /* then treat as if PARSE_PATH wanted */ + if (wanted & PARSE_PATH) /* if PARSE_PATH wanted */ + wanted &= ~(PARSE_STRICTPATH | PARSE_QUERY); /* ignore details */ + } +/* *INDENT-OFF* */ + CTRACE((tfp, " want:%s%s%s%s%s%s%s\n", + wanted & PARSE_PUNCTUATION ? " punc" : "", + wanted & PARSE_ANCHOR ? " anchor" : "", + wanted & PARSE_PATH ? " path" : "", + wanted & PARSE_HOST ? " host" : "", + wanted & PARSE_ACCESS ? " access" : "", + wanted & PARSE_STRICTPATH ? " PATH" : "", + wanted & PARSE_QUERY ? " QUERY" : "")); +/* *INDENT-ON* */ + + /* + * Allocate the temporary string. Optimized. + */ + len1 = strlen(aName) + 1; + len2 = strlen(relatedName) + 1; + len = len1 + len2 + MIN_PARSE; /* Lots of space: more than enough */ + + need = (len * 2 + len1 + len2); + if (need > (size_t) max_uri_size || + (int) need < (int) len1 || + (int) need < (int) len2) + return StrAllocCopy(return_value, ""); + + result = tail = (char *) LYalloca(need); + if (result == NULL) { + outofmem(__FILE__, "HTParse"); + } + *result = '\0'; + name = result + len; + rel = name + len1; + + /* + * Make working copy of the input string to cut up. + */ + MemCpy(name, aName, len1); + + /* + * Cut up the string into URL fields. + */ + scan(name, &given); + SHOW_PARTS(given); + + /* + * Now related string. + */ + if ((given.access && given.host && given.absolute) || !*relatedName) { + /* + * Inherit nothing! + */ + related.access = NULL; + related.host = NULL; + related.absolute = NULL; + related.relative = NULL; + related.search = NULL; + related.anchor = NULL; + } else { + MemCpy(rel, relatedName, len2); + scan(rel, &related); + } + SHOW_PARTS(related); + + /* + * Handle the scheme (access) field. + */ + if (given.access && given.host && !given.relative && !given.absolute) { + if (!strcmp(given.access, "http") || + !strcmp(given.access, "https") || + !strcmp(given.access, "ftp")) { + + /* + * Assume root. + */ + given.absolute = empty_string; + } + } + acc_method = given.access ? given.access : related.access; + if (wanted & PARSE_ACCESS) { + if (acc_method) { + strcpy(tail, acc_method); + tail += strlen(tail); + if (wanted & PARSE_PUNCTUATION) { + *tail++ = ':'; + *tail = '\0'; + } + } + } + + /* + * If different schemes, inherit nothing. + * + * We'll try complying with RFC 1808 and the Fielding draft, and inherit + * nothing if both schemes are given, rather than only when they differ, + * except for file URLs - FM + * + * After trying it for a while, it's still premature, IHMO, to go along + * with it, so this is back to inheriting for identical schemes whether or + * not they are "file". If you want to try it again yourself, uncomment + * the strcasecomp() below. - FM + */ + if ((given.access && related.access) && + ( /* strcasecomp(given.access, "file") || */ + strcmp(given.access, related.access))) { + related.host = NULL; + related.absolute = NULL; + related.relative = NULL; + related.search = NULL; + related.anchor = NULL; + } + + /* + * Handle the host field. + */ + if (wanted & PARSE_HOST) { + if (given.host || related.host) { + if (wanted & PARSE_PUNCTUATION) { + *tail++ = '/'; + *tail++ = '/'; + } + strcpy(tail, given.host ? given.host : related.host); + /* + * Ignore default port numbers, and trailing dots on FQDNs, which + * will only cause identical addresses to look different. (related + * is already a clean url). + */ + { + char *p2, *h; + int portnumber; + int gen_delims = 0; + + if ((p2 = HTSkipToAt(result, &gen_delims)) != NULL + && gen_delims == 0) { + tail = (p2 + 1); + } + p2 = HTParsePort(result, &portnumber); + if (p2 != NULL && acc_method != NULL) { + /* + * Port specified. + */ +#define ACC_METHOD(a,b) (!strcmp(acc_method, a) && (portnumber == b)) + if (ACC_METHOD("http", 80) || + ACC_METHOD("https", 443) || + ACC_METHOD("gopher", 70) || + ACC_METHOD("ftp", 21) || + ACC_METHOD("wais", 210) || + ACC_METHOD("nntp", 119) || + ACC_METHOD("news", 119) || + ACC_METHOD("newspost", 119) || + ACC_METHOD("newsreply", 119) || + ACC_METHOD("snews", 563) || + ACC_METHOD("snewspost", 563) || + ACC_METHOD("snewsreply", 563) || + ACC_METHOD("finger", 79) || + ACC_METHOD("telnet", 23) || + ACC_METHOD("tn3270", 23) || + ACC_METHOD("rlogin", 513) || + ACC_METHOD("cso", 105)) + *p2 = '\0'; /* It is the default: ignore it */ + } + if (p2 == NULL) { + int len3 = (int) strlen(tail); + + if (len3 > 0) { + h = tail + len3 - 1; /* last char of hostname */ + if (*h == '.') + *h = '\0'; /* chop final . */ + } + } else if (p2 != result) { + h = p2; + h--; /* End of hostname */ + if (*h == '.') { + /* + * Slide p2 over h. + */ + while (*p2 != '\0') + *h++ = *p2++; + *h = '\0'; /* terminate */ + } + } + } +#if defined(USE_IDNA) || defined(USE_IDN2) + /* + * Depending on locale-support, we could have a literal UTF-8 + * string as a host name, or a URL-encoded form of that. + */ + convert_to_idna(tail); +#endif + } + } + + /* + * Trim any blanks from the result so far - there's no excuse for blanks + * in a hostname. Also update the tail here. + */ + tail = LYRemoveBlanks(result); + + /* + * If host in given or related was ended directly with a '?' (no slash), + * fake the search part into absolute. This is the only case search is + * returned from scan. A host must have been present. this restores the + * '?' at which the host part had been truncated in scan, we have to do + * this after host part handling is done. - kw + */ + if (given.search && *(given.search - 1) == '\0') { + given.absolute = given.search - 1; + given.absolute[0] = '?'; + } else if (related.search && !related.absolute && + *(related.search - 1) == '\0') { + related.absolute = related.search - 1; + related.absolute[0] = '?'; + } + + /* + * If different hosts, inherit no path. + */ + if (given.host && related.host) + if (strcmp(given.host, related.host) != 0) { + related.absolute = NULL; + related.relative = NULL; + related.anchor = NULL; + } + + /* + * Handle the path. + */ + if (wanted & (PARSE_PATH | PARSE_STRICTPATH | PARSE_QUERY)) { + int want_detail = (wanted & (PARSE_STRICTPATH | PARSE_QUERY)); + + if (acc_method && !given.absolute && given.relative) { + /* + * Treat all given nntp or snews paths, or given paths for news + * URLs with a host, as absolute. + */ + switch (*acc_method) { + case 'N': + case 'n': + if (!strcasecomp(acc_method, "nntp") || + (!strcasecomp(acc_method, "news") && + !strncasecomp(result, "news://", 7))) { + given.absolute = given.relative; + given.relative = NULL; + } + break; + case 'S': + case 's': + if (!strcasecomp(acc_method, "snews")) { + given.absolute = given.relative; + given.relative = NULL; + } + break; + } + } + + if (given.absolute) { /* All is given */ + char *base = tail; + + if (wanted & PARSE_PUNCTUATION) + *tail++ = '/'; + strcpy(tail, given.absolute); + HTSimplify(base, TRUE); + CTRACE((tfp, "HTParse: (ABS)\n")); + } else if (related.absolute) { /* Adopt path not name */ + char *base = tail; + + *tail++ = '/'; + strcpy(tail, related.absolute); + if (given.relative) { + /* RFC 1808 part 4 step 5 (if URL path is empty) */ + /* a) if given has params, add/replace that */ + if (given.relative[0] == ';') { + strcpy(strchr_or_end(tail, ';'), given.relative); + } + /* b) if given has query, add/replace that */ + else if (given.relative[0] == '?') { + strcpy(strchr_or_end(tail, '?'), given.relative); + } + /* otherwise fall through to RFC 1808 part 4 step 6 */ + else { + p = StrChr(tail, '?'); /* Search part? */ + if (p == NULL) + p = (tail + strlen(tail) - 1); + for (; *p != '/'; p--) ; /* last / */ + p[1] = '\0'; /* Remove filename */ + strcat(p, given.relative); /* Add given one */ + } + HTSimplify(base, FALSE); + if (*base == '\0') + strcpy(base, "/"); + } else { + HTSimplify(base, TRUE); + } + if (base[0] == '/' && base[1] == '/') { + char *pz; + + for (pz = base; (pz[0] = pz[1]) != '\0'; ++pz) ; + } + CTRACE((tfp, "HTParse: (Related-ABS)\n")); + } else if (given.relative) { + strcpy(tail, given.relative); /* what we've got */ + HTSimplify(tail, FALSE); + CTRACE((tfp, "HTParse: (REL)\n")); + } else if (related.relative) { + strcpy(tail, related.relative); + HTSimplify(tail, FALSE); + CTRACE((tfp, "HTParse: (Related-REL)\n")); + } else { /* No inheritance */ + if (!isLYNXCGI(aName) && + !isLYNXEXEC(aName) && + !isLYNXPROG(aName)) { + *tail++ = '/'; + *tail = '\0'; + } else { + HTSimplify(tail, FALSE); + } + if (!strcmp(result, "news:/")) + result[5] = '*'; + CTRACE((tfp, "HTParse: (No inheritance)\n")); + } + if (want_detail) { + p = StrChr(tail, '?'); /* Search part? */ + if (p) { + if (PARSE_STRICTPATH) { + *p = '\0'; + } else { + if (!(wanted & PARSE_PUNCTUATION)) + p++; + do { + *tail++ = *p; + } while (*p++); + } + } else { + if (wanted & PARSE_QUERY) + *tail = '\0'; + } + } + } + + /* + * Handle the fragment (anchor). Never inherit. + */ + if (wanted & PARSE_ANCHOR) { + if (given.anchor && *given.anchor) { + tail += strlen(tail); + if (wanted & PARSE_PUNCTUATION) + *tail++ = '#'; + strcpy(tail, given.anchor); + } + } + + /* + * If there are any blanks remaining in the string, escape them as needed. + * See the discussion in LYLegitimizeHREF() for example. + */ + if ((p = StrChr(result, ' ')) != 0) { + switch (is_url(result)) { + case UNKNOWN_URL_TYPE: + CTRACE((tfp, "HTParse: ignore:`%s'\n", result)); + break; + case LYNXEXEC_URL_TYPE: + case LYNXPROG_URL_TYPE: + case LYNXCGI_URL_TYPE: + case LYNXPRINT_URL_TYPE: + case LYNXHIST_URL_TYPE: + case LYNXDOWNLOAD_URL_TYPE: + case LYNXKEYMAP_URL_TYPE: + case LYNXIMGMAP_URL_TYPE: + case LYNXCOOKIE_URL_TYPE: + case LYNXCACHE_URL_TYPE: + case LYNXDIRED_URL_TYPE: + case LYNXOPTIONS_URL_TYPE: + case LYNXCFG_URL_TYPE: + case LYNXCOMPILE_OPTS_URL_TYPE: + case LYNXMESSAGES_URL_TYPE: + CTRACE((tfp, "HTParse: spaces:`%s'\n", result)); + break; + case NOT_A_URL_TYPE: + default: + CTRACE((tfp, "HTParse: encode:`%s'\n", result)); + do { + q = p + strlen(p) + 2; + + while (q != p + 1) { + q[0] = q[-2]; + --q; + } + p[0] = HEX_ESCAPE; + p[1] = '2'; + p[2] = '0'; + } while ((p = StrChr(result, ' ')) != 0); + break; + } + } + CTRACE((tfp, "HTParse: result:`%s'\n", result)); + + StrAllocCopy(return_value, result); + LYalloca_free(result); + + /* FIXME: could be optimized using HTParse() internals */ + if (*relatedName && + ((wanted & PARSE_ALL_WITHOUT_ANCHOR) == PARSE_ALL_WITHOUT_ANCHOR)) { + /* + * Check whether to fill in localhost. - FM + */ + LYFillLocalFileURL(&return_value, relatedName); + CTRACE((tfp, "pass LYFillLocalFile:`%s'\n", return_value)); + } + + return return_value; /* exactly the right length */ +} + +/* HTParseAnchor(), fast HTParse() specialization + * ---------------------------------------------- + * + * On exit, + * returns A pointer within input string (probably to its end '\0') + */ +const char *HTParseAnchor(const char *aName) +{ + const char *p = aName; + + for (; *p && *p != '#'; p++) { + ; + } + if (*p == '#') { + /* the safe way based on HTParse() - + * keeping in mind scan() peculiarities on schemes: + */ + struct struct_parts given; + size_t need = ((unsigned) ((p - aName) + (int) strlen(p) + 1)); + char *name; + + if (need > (size_t) max_uri_size) { + p += strlen(p); + } else { + name = (char *) LYalloca(need); + + if (name == NULL) { + outofmem(__FILE__, "HTParseAnchor"); + } + strcpy(name, aName); + scan(name, &given); + LYalloca_free(name); + + p++; /*next to '#' */ + if (given.anchor == NULL) { + for (; *p; p++) /*scroll to end '\0' */ + ; + } + } + } + return p; +} + +/* Simplify a filename. HTSimplify() + * -------------------- + * + * A unix-style file is allowed to contain the sequence xxx/../ which may + * be replaced by "" , and the sequence "/./" which may be replaced by "/". + * Simplification helps us recognize duplicate filenames. + * + * RFC 3986 section 5.2.4 says to do this whether or not the path was relative. + */ +void HTSimplify(char *filename, BOOL absolute) +{ +#define MY_FMT "HTParse HTSimplify\t(%s)" +#ifdef NO_LYNX_TRACE +#define debug_at(at) /* nothing */ +#define atln "?" +#else + const char *atln; + +#define debug_at(at) atln = at +#endif + char *mark; + char *p; + size_t limit; + + CTRACE2(TRACE_HTPARSE, + (tfp, MY_FMT " %s\n", + filename, + absolute ? "ABS" : "REL")); + + if (LYIsPathSep(*filename) && !absolute) + ++filename; + mark = filename; + limit = strlen(filename); + + for (p = filename; *p; ++p) { + if (*p == '?' || *p == '#') { + limit = (size_t) (p - filename); + break; + } + } + while ((limit != 0) && (*filename != '\0')) { + size_t trim = 0; + size_t skip = 0; + size_t last = 0; + + debug_at("?"); + p = filename; + if (limit >= 2 && !memcmp(p, "./", 2)) { /* 2A */ + debug_at("2A"); + trim = 2; + } else if (limit >= 3 && !memcmp(p, "../", 3)) { + debug_at("2A2"); + trim = 3; + } else if (limit >= 3 && !memcmp(p, "/./", 3)) { /* 2B */ + debug_at("2B"); + trim = 2; + skip = 1; + } else if (limit == 2 && !memcmp(p, "/.", 2)) { + debug_at("2B2"); + trim = 1; + skip = 1; + } else if (limit >= 4 && !memcmp(p, "/../", 4)) { /* 2C */ + debug_at("2C"); + trim = 3; + skip = 1; + last = 1; + } else if (limit == 3 && !memcmp(p, "/..", 3)) { + debug_at("2C2"); + trim = 2; + skip = 1; + last = 1; + } else if (limit == 2 && !memcmp(p, "..", 2)) { /* 2D */ + debug_at("2D"); + trim = 2; + } else if (limit == 1 && !memcmp(p, ".", 1)) { + debug_at("2D2"); + trim = 1; + } + if (trim) { + CTRACE2(TRACE_HTPARSE, + (tfp, MY_FMT " trim %lu/%lu (%.*s) '%.*s' @%s\n", + mark, (unsigned long) trim, (unsigned long) limit, + (int) trim, p + skip, (int) limit, p, atln)); + } + if (last) { + char *prior = filename; + + if (prior != mark) { + --prior; + while (prior != mark && *prior != '/') { + --prior; + } + } + if (prior != filename) { + trim += (size_t) (filename - prior); + limit += (size_t) (filename - prior); + filename = prior; + CTRACE2(TRACE_HTPARSE, + (tfp, MY_FMT " TRIM %lu/%lu (%.*s)\n", + mark, (unsigned long) trim, (unsigned long) limit, + (int) trim, filename + skip)); + } + } + if (trim) { + limit -= trim; + for (p = filename;; ++p) { + if ((p[0] = p[trim]) == '\0') { + break; + } + if (skip) { + p[0] = '/'; + skip = 0; + } + } + CTRACE2(TRACE_HTPARSE, + (tfp, MY_FMT " loop %lu\n", mark, (unsigned long) limit)); + } else { + if (*filename == '/') { + ++filename; + --limit; + } + while ((limit != 0) && (*filename != '/')) { + ++filename; + --limit; + } + } + } + CTRACE2(TRACE_HTPARSE, (tfp, MY_FMT " done\n", mark)); +#undef MY_FMT +} + +/* Make Relative Name. HTRelative() + * ------------------- + * + * This function creates and returns a string which gives an expression of + * one address as related to another. Where there is no relation, an absolute + * address is returned. + * + * On entry, + * Both names must be absolute, fully qualified names of nodes + * (no anchor bits) + * + * On exit, + * The return result points to a newly allocated name which, if + * parsed by HTParse relative to relatedName, will yield aName. + * The caller is responsible for freeing the resulting name later. + * + */ +char *HTRelative(const char *aName, + const char *relatedName) +{ + char *result = NULL; + const char *p = aName; + const char *q = relatedName; + const char *after_access = NULL; + const char *path = NULL; + const char *last_slash = NULL; + int slashes = 0; + + for (; *p; p++, q++) { /* Find extent of match */ + if (*p != *q) + break; + if (*p == ':') + after_access = p + 1; + if (*p == '/') { + last_slash = p; + slashes++; + if (slashes == 3) + path = p; + } + } + + /* q, p point to the first non-matching character or zero */ + + if (!after_access) { /* Different access */ + StrAllocCopy(result, aName); + } else if (slashes < 3) { /* Different nodes */ + StrAllocCopy(result, after_access); + } else if (slashes == 3) { /* Same node, different path */ + StrAllocCopy(result, path); + } else { /* Some path in common */ + unsigned levels = 0; + + for (; *q && (*q != '#'); q++) + if (*q == '/') + levels++; + result = typecallocn(char, 3 * levels + strlen(last_slash) + 1); + + if (result == NULL) + outofmem(__FILE__, "HTRelative"); + + result[0] = '\0'; + for (; levels; levels--) + strcat(result, "../"); + strcat(result, last_slash + 1); + } + CTRACE((tfp, + "HTparse: `%s' expressed relative to\n `%s' is\n `%s'.\n", + aName, relatedName, result)); + return result; +} + +#define AlloCopy(next,base,extra) \ + typecallocn(char, ((next - base) + ((int) extra))) + +/* Escape undesirable characters using % HTEscape() + * ------------------------------------- + * + * This function takes a pointer to a string in which + * some characters may be unacceptable unescaped. + * It returns a string which has these characters + * represented by a '%' character followed by two hex digits. + * + * Unlike HTUnEscape(), this routine returns a calloc'd string. + */ +/* *INDENT-OFF* */ +static const unsigned char isAcceptable[96] = + +/* Bit 0 xalpha -- see HTFile.h + * Bit 1 xpalpha -- as xalpha but with plus. + * Bit 2 ... path -- as xpalphas but with / + */ + /* 0 1 2 3 4 5 6 7 8 9 A B C D E F */ + { 0,0,0,0,0,0,0,0,0,0,7,6,0,7,7,4, /* 2x !"#$%&'()*+,-./ */ + 7,7,7,7,7,7,7,7,7,7,0,0,0,0,0,0, /* 3x 0123456789:;<=>? */ + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, /* 4x @ABCDEFGHIJKLMNO */ + 7,7,7,7,7,7,7,7,7,7,7,0,0,0,0,7, /* 5X PQRSTUVWXYZ[\]^_ */ + 0,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, /* 6x `abcdefghijklmno */ + 7,7,7,7,7,7,7,7,7,7,7,0,0,0,0,0 }; /* 7X pqrstuvwxyz{|}~ DEL */ +/* *INDENT-ON* */ + +static const char *hex = "0123456789ABCDEF"; + +#define ACCEPTABLE(a) ( a>=32 && a<128 && ((isAcceptable[a-32]) & mask)) + +char *HTEscape(const char *str, + unsigned mask) +{ + const char *p; + char *q; + char *result; + size_t unacceptable = 0; + + for (p = str; *p; p++) + if (!ACCEPTABLE(UCH(TOASCII(*p)))) + unacceptable++; + result = AlloCopy(p, str, (unacceptable * 2) + 1); + + if (result == NULL) + outofmem(__FILE__, "HTEscape"); + + for (q = result, p = str; *p; p++) { + unsigned char a = UCH(TOASCII(*p)); + + if (!ACCEPTABLE(a)) { + *q++ = HEX_ESCAPE; /* Means hex coming */ + *q++ = hex[a >> 4]; + *q++ = hex[a & 15]; + } else + *q++ = *p; + } + *q = '\0'; /* Terminate */ + return result; +} + +/* Escape unsafe characters using % HTEscapeUnsafe() + * -------------------------------- + * + * This function takes a pointer to a string in which + * some characters may be that may be unsafe are unescaped. + * It returns a string which has these characters + * represented by a '%' character followed by two hex digits. + * + * Unlike HTUnEscape(), this routine returns a malloc'd string. + */ +#define UNSAFE(ch) (((ch) <= 32) || ((ch) >= 127)) + +char *HTEscapeUnsafe(const char *str) +{ + const char *p; + char *q; + char *result; + size_t unacceptable = 0; + + for (p = str; *p; p++) + if (UNSAFE(UCH(TOASCII(*p)))) + unacceptable++; + result = AlloCopy(p, str, (unacceptable * 2) + 1); + + if (result == NULL) + outofmem(__FILE__, "HTEscapeUnsafe"); + + for (q = result, p = str; *p; p++) { + unsigned char a = UCH(TOASCII(*p)); + + if (UNSAFE(a)) { + *q++ = HEX_ESCAPE; /* Means hex coming */ + *q++ = hex[a >> 4]; + *q++ = hex[a & 15]; + } else + *q++ = *p; + } + *q = '\0'; /* Terminate */ + return result; +} + +/* Escape undesirable characters using % but space to +. HTEscapeSP() + * ----------------------------------------------------- + * + * This function takes a pointer to a string in which + * some characters may be unacceptable unescaped. + * It returns a string which has these characters + * represented by a '%' character followed by two hex digits, + * except that spaces are converted to '+' instead of %2B. + * + * Unlike HTUnEscape(), this routine returns a calloced string. + */ +char *HTEscapeSP(const char *str, + unsigned mask) +{ + const char *p; + char *q; + char *result; + size_t unacceptable = 0; + + for (p = str; *p; p++) + if (!(*p == ' ' || ACCEPTABLE(UCH(TOASCII(*p))))) + unacceptable++; + result = AlloCopy(p, str, (unacceptable * 2) + 1); + + if (result == NULL) + outofmem(__FILE__, "HTEscape"); + + for (q = result, p = str; *p; p++) { + unsigned char a = UCH(TOASCII(*p)); + + if (a == 32) { + *q++ = '+'; + } else if (!ACCEPTABLE(a)) { + *q++ = HEX_ESCAPE; /* Means hex coming */ + *q++ = hex[a >> 4]; + *q++ = hex[a & 15]; + } else { + *q++ = *p; + } + } + *q = '\0'; /* Terminate */ + return result; +} + +/* Decode %xx escaped characters. HTUnEscape() + * ------------------------------ + * + * This function takes a pointer to a string in which some + * characters may have been encoded in %xy form, where xy is + * the ASCII hex code for character 16x+y. + * The string is converted in place, as it will never grow. + */ +static char from_hex(int c) +{ + return (char) (c >= '0' && c <= '9' ? c - '0' + : c >= 'A' && c <= 'F' ? c - 'A' + 10 + : c - 'a' + 10); /* accept small letters just in case */ +} + +char *HTUnEscape(char *str) +{ + char *p = str; + char *q = str; + + if (!(p && *p)) + return str; + + while (*p != '\0') { + if (*p == HEX_ESCAPE && + /* + * Tests shouldn't be needed, but better safe than sorry. + */ + p[1] && p[2] && + isxdigit(UCH(p[1])) && + isxdigit(UCH(p[2]))) { + p++; + if (*p) + *q = (char) (from_hex(*p++) * 16); + if (*p) { + /* + * Careful! FROMASCII() may evaluate its arg more than once! + */ + /* S/390 -- gil -- 0221 */ + *q = (char) (*q + from_hex(*p++)); + } + *q = FROMASCII(*q); + q++; + } else { + *q++ = *p++; + } + } + + *q = '\0'; + return str; + +} /* HTUnEscape */ + +/* Decode some %xx escaped characters. HTUnEscapeSome() + * ----------------------------------- Klaus Weide + * (kweide@tezcat.com) + * This function takes a pointer to a string in which some + * characters may have been encoded in %xy form, where xy is + * the ASCII hex code for character 16x+y, and a pointer to + * a second string containing one or more characters which + * should be unescaped if escaped in the first string. + * The first string is converted in place, as it will never grow. + */ +char *HTUnEscapeSome(char *str, + const char *do_trans) +{ + char *p = str; + char *q = str; + char testcode; + + if (p == NULL || *p == '\0' || do_trans == NULL || *do_trans == '\0') + return str; + + while (*p != '\0') { + if (*p == HEX_ESCAPE && + p[1] && p[2] && /* tests shouldn't be needed, but.. */ + isxdigit(UCH(p[1])) && + isxdigit(UCH(p[2])) && + (testcode = (char) FROMASCII(from_hex(p[1]) * 16 + + from_hex(p[2]))) && /* %00 no good */ + StrChr(do_trans, testcode)) { /* it's one of the ones we want */ + *q++ = testcode; + p += 3; + } else { + *q++ = *p++; + } + } + + *q = '\0'; + return str; + +} /* HTUnEscapeSome */ +/* *INDENT-OFF* */ +static const unsigned char crfc[96] = + +/* Bit 0 xalpha -- need "quoting" + * Bit 1 xpalpha -- need \escape if quoted + */ + /* 0 1 2 3 4 5 6 7 8 9 A B C D E F */ + { 1,0,3,0,0,0,0,0,1,1,0,0,1,0,1,0, /* 2x !"#$%&'()*+,-./ */ + 0,0,0,0,0,0,0,0,0,0,1,1,1,0,1,0, /* 3x 0123456789:;<=>? */ + 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 4x @ABCDEFGHIJKLMNO */ + 0,0,0,0,0,0,0,0,0,0,0,1,2,1,0,0, /* 5X PQRSTUVWXYZ[\]^_ */ + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 6x `abcdefghijklmno */ + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3 }; /* 7X pqrstuvwxyz{|}~ DEL */ +/* *INDENT-ON* */ + +#define ASCII_TAB '\011' +#define ASCII_LF '\012' +#define ASCII_CR '\015' +#define ASCII_SPC '\040' +#define ASCII_BAK '\134' + +/* + * Turn a string which is not a RFC 822 token into a quoted-string. - KW + * The "quoted" parameter tells whether we need the beginning/ending quote + * marks. If not, the caller will provide them -TD + */ +void HTMake822Word(char **str, + int quoted) +{ + const char *p; + char *q; + char *result; + unsigned char a; + unsigned added = 0; + + if (isEmpty(*str)) { + StrAllocCopy(*str, quoted ? "\"\"" : ""); + return; + } + for (p = *str; *p; p++) { + a = UCH(TOASCII(*p)); /* S/390 -- gil -- 0240 */ + if (a < 32 || a >= 128 || + ((crfc[a - 32]) & 1)) { + if (!added) + added = 2; + if (a >= 160 || a == '\t') + continue; + if (a == '\r' || a == '\n') + added += 2; + else if ((a & 127) < 32 || ((crfc[a - 32]) & 2)) + added++; + } + } + if (!added) + return; + result = AlloCopy(p, *str, added + 1); + if (result == NULL) + outofmem(__FILE__, "HTMake822Word"); + + q = result; + if (quoted) + *q++ = '"'; + /* + * Having converted the character to ASCII, we can't use symbolic + * escape codes, since they're in the host character set, which + * is not necessarily ASCII. Thus we use octal escape codes instead. + * -- gil (Paul Gilmartin) <pg@sweng.stortek.com> + */ + /* S/390 -- gil -- 0268 */ + for (p = *str; *p; p++) { + a = UCH(TOASCII(*p)); + if ((a != ASCII_TAB) && + ((a & 127) < ASCII_SPC || + (a < 128 && ((crfc[a - 32]) & 2)))) + *q++ = ASCII_BAK; + *q++ = *p; + if (a == ASCII_LF || + (a == ASCII_CR && (TOASCII(*(p + 1)) != ASCII_LF))) + *q++ = ' '; + } + if (quoted) + *q++ = '"'; + *q = '\0'; /* Terminate */ + FREE(*str); + *str = result; +} |