/*
 * $LynxId: HTParse.c,v 1.98 2021/07/27 21:29:49 tom Exp $
 *
 *		Parse HyperText Document Address		HTParse.c
 *		================================
 */

#include <HTUtils.h>
#include <HTParse.h>

#include <LYUtils.h>
#include <LYLeaks.h>
#include <LYStrings.h>
#include <LYCharUtils.h>
#include <LYGlobalDefs.h>

#ifdef HAVE_ALLOCA_H
#include <alloca.h>
#else
#ifdef __MINGW32__
#include <malloc.h>
#endif /* __MINGW32__ */
#endif

/*
 * Select the IDNA library.  FreeIdna() releases a string returned by the
 * conversion routine; with the older libidn, IDN2_OK is mapped onto that
 * library's success code so one comparison serves both.
 */
#ifdef USE_IDN2
#include <idn2.h>
#define FreeIdna(out)             idn2_free(out)
#elif defined(USE_IDNA)
#include <idna.h>
#include <idn-free.h>
#define FreeIdna(out)             idn_free(out)
#define IDN2_OK                   IDNA_SUCCESS
#endif

/* Character which introduces a two-digit hexadecimal escape in a URL. */
#define HEX_ESCAPE '%'

/*
 * The pieces of a URL as split up by scan().  Each member is either NULL or
 * points into the (modified) buffer which was handed to scan().
 */
struct struct_parts {
    char *access;		/* scheme: the text before the first ':' */
    char *host;			/* text following a leading "//" */
    char *absolute;		/* path following a leading '/' */
    char *relative;		/* remaining text when there is no leading '/' */
    char *search;		/* treated normally as part of path */
    char *anchor;		/* text following the first '#' */
};

#if 0				/* for debugging */
/*
 * Write every member of a struct_parts to the trace file, labelled with the
 * variable's name and the source line of the call.
 */
static void show_parts(const char *name, struct struct_parts *parts, int line)
{
    if (TRACE) {
	CTRACE((tfp, "struct_parts(%s) %s@%d\n", name, __FILE__, line));
	CTRACE((tfp, " access '%s'\n", NONNULL(parts->access)));
	CTRACE((tfp, " host '%s'\n", NONNULL(parts->host)));
	CTRACE((tfp, " absolute '%s'\n", NONNULL(parts->absolute)));
	CTRACE((tfp, " relative '%s'\n", NONNULL(parts->relative)));
	CTRACE((tfp, " search '%s'\n", NONNULL(parts->search)));
	CTRACE((tfp, " anchor '%s'\n", NONNULL(parts->anchor)));
    }
}
#define SHOW_PARTS(name) show_parts(#name, &name, __LINE__)
#else
#define SHOW_PARTS(name)	/* nothing */
#endif

/*	Strip white space off a string.				HTStrip()
 *	-------------------------------
 *
 * On exit,
 *	Return value points to first non-white character, or to the
 *	terminating zero if there is none.
 *	All trailing white space is OVERWRITTEN with zero.
*/ char *HTStrip(char *s) { #define SPACE(c) ((c == ' ') || (c == '\t') || (c == '\n')) char *p; for (p = s; *p; p++) { /* Find end of string */ ; } for (p--; p >= s; p--) { if (SPACE(*p)) *p = '\0'; /* Zap trailing blanks */ else break; } while (SPACE(*s)) s++; /* Strip leading blanks */ return s; } /* Scan a filename for its constituents. scan() * ------------------------------------- * * On entry, * name points to a document name which may be incomplete. * On exit, * absolute or relative may be nonzero (but not both). * host, anchor and access may be nonzero if they were specified. * Any which are nonzero point to zero terminated strings. */ static void scan(char *name, struct struct_parts *parts) { char *after_access; char *p; parts->access = NULL; parts->host = NULL; parts->absolute = NULL; parts->relative = NULL; parts->search = NULL; /* normally not used - kw */ parts->anchor = NULL; /* * Scan left-to-right for a scheme (access). */ after_access = name; for (p = name; *p; p++) { if (*p == ':') { *p = '\0'; parts->access = name; /* Access name has been specified */ after_access = (p + 1); break; } if (*p == '/' || *p == '#' || *p == ';' || *p == '?') break; } /* * Scan left-to-right for a fragment (anchor). */ for (p = after_access; *p; p++) { if (*p == '#') { parts->anchor = (p + 1); *p = '\0'; /* terminate the rest */ break; /* leave things after first # alone - kw */ } } /* * Scan left-to-right for a host or absolute path. */ p = after_access; if (*p == '/') { if (p[1] == '/') { parts->host = (p + 2); /* host has been specified */ *p = '\0'; /* Terminate access */ p = StrChr(parts->host, '/'); /* look for end of host name if any */ if (p != NULL) { *p = '\0'; /* Terminate host */ parts->absolute = (p + 1); /* Root has been found */ } else { p = StrChr(parts->host, '?'); if (p != NULL) { *p = '\0'; /* Terminate host */ parts->search = (p + 1); } } } else { parts->absolute = (p + 1); /* Root found but no host */ } } else { parts->relative = (*after_access) ? 
after_access : NULL; /* NULL for "" */ } /* * Check schemes that commonly have unescaped hashes. */ if (parts->access && parts->anchor && /* optimize */ StrChr("lnsdLNSD", *parts->access) != NULL) { if ((!parts->host && strcasecomp(parts->access, "lynxcgi")) || !strcasecomp(parts->access, "nntp") || !strcasecomp(parts->access, "snews") || !strcasecomp(parts->access, "news") || !strcasecomp(parts->access, "data")) { /* * Access specified but no host and not a lynxcgi URL, so the * anchor may not really be one, e.g., news:j462#36487@foo.bar, or * it's an nntp or snews URL, or news URL with a host. Restore the * '#' in the address. */ /* but only if we have found a path component of which this will * become part. - kw */ if (parts->relative || parts->absolute) { *(parts->anchor - 1) = '#'; parts->anchor = NULL; } } } } /*scan */ #if defined(HAVE_ALLOCA) && !defined(LY_FIND_LEAKS) #define LYalloca(x) alloca((size_t)(x)) #define LYalloca_free(x) {} #else #define LYalloca(x) malloc((size_t)(x)) #define LYalloca_free(x) free((void *)(x)) #endif static char *strchr_or_end(char *string, int ch) { char *result = StrChr(string, ch); if (result == 0) { result = string + strlen(string); } return result; } /* * Given a host specification that may end with a port number, e.g., * foobar:123 * point to the ':' which begins the ":port" to make it simple to handle the * substring. * * If no port is found (or a syntax error), return null. 
*/ char *HTParsePort(char *host, int *portp) { int brackets = 0; char *result = NULL; *portp = 0; if (host != NULL) { while (*host != '\0' && result == 0) { switch (*host++) { case ':': if (brackets == 0 && isdigit(UCH(*host))) { char *next = NULL; *portp = (int) strtol(host, &next, 10); if (next != 0 && next != host && *next == '\0') { result = (host - 1); CTRACE((tfp, "HTParsePort %d\n", *portp)); } } break; case '[': /* for ipv6 */ ++brackets; break; case ']': /* for ipv6 */ --brackets; break; } } } return result; } #if defined(USE_IDNA) || defined(USE_IDN2) static int hex_decode(int ch) { int result = -1; if (ch >= '0' && ch <= '9') result = (ch - '0'); else if (ch >= 'a' && ch <= 'f') result = (ch - 'a') + 10; else if (ch >= 'A' && ch <= 'F') result = (ch - 'A') + 10; return result; } /* * Convert in-place the given hostname to IDNA form. That requires up to 64 * characters, and we've allowed for that, with MIN_PARSE. */ static void convert_to_idna(char *host) { size_t length = strlen(host); char *endhost = host + length; char *buffer = malloc(length + 1); char *params = malloc(length + 1); char *output = NULL; char *src, *dst; int code; int hi, lo; if (buffer != NULL && params != NULL) { code = TRUE; *params = '\0'; for (dst = buffer, src = host; src < endhost; ++dst) { int ch = *src++; if (RFC_3986_GEN_DELIMS(ch)) { strcpy(params, src - 1); *dst = '\0'; break; } else if (ch == HEX_ESCAPE) { if ((src + 1) < endhost && (hi = hex_decode(src[0])) >= 0 && (lo = hex_decode(src[1])) >= 0) { *dst = (char) ((hi << 4) | lo); src += 2; } else { CTRACE((tfp, "convert_to_idna: `%s' is malformed\n", host)); code = FALSE; break; } } else { *dst = (char) ch; } } if (code) { *dst = '\0'; #ifdef USE_IDN2 #if (!defined(IDN2_VERSION_NUMBER) || IDN2_VERSION_NUMBER < 0x02000003) /* * Older libidn2 mishandles STD3, stripping underscores. 
*/ if (strchr(buffer, '_') != NULL) { code = -1; } else #endif switch (LYidnaMode) { case LYidna2003: code = idn2_to_ascii_8z(buffer, &output, IDN2_TRANSITIONAL); break; case LYidna2008: /* IDNA2008 rules without the TR46 amendments */ code = idn2_to_ascii_8z(buffer, &output, 0); break; case LYidnaTR46: code = idn2_to_ascii_8z(buffer, &output, IDN2_NONTRANSITIONAL | IDN2_NFC_INPUT); break; case LYidnaCompat: /* IDNA2008 */ code = idn2_to_ascii_8z(buffer, &output, IDN2_NONTRANSITIONAL | IDN2_NFC_INPUT); if (code == IDN2_DISALLOWED) { /* IDNA2003 - compatible */ code = idn2_to_ascii_8z(buffer, &output, IDN2_TRANSITIONAL); } break; } #else code = idna_to_ascii_8z(buffer, &output, IDNA_USE_STD3_ASCII_RULES); #endif if (code == IDN2_OK) { CTRACE((tfp, "convert_to_idna: `%s' -> `%s': OK\n", buffer, output)); strcpy(host, output); strcat(host, params); } else { CTRACE((tfp, "convert_to_idna: `%s': %s\n", buffer, idna_strerror((Idna_rc) code))); } if (output) FreeIdna(output); } } free(buffer); free(params); } #define MIN_PARSE 80 #else #define MIN_PARSE 8 #endif /* Parse a Name relative to another name. HTParse() * -------------------------------------- * * This returns those parts of a name which are given (and requested) * substituting bits from the related name where necessary. * * Originally based on RFC 1808, some details in RFC 3986 are used. * * On entry, * aName A filename given * relatedName A name relative to which aName is to be parsed * wanted A mask for the bits which are wanted. 
* * On exit, * returns A pointer to a malloc'd string which MUST BE FREED */ char *HTParse(const char *aName, const char *relatedName, int wanted) { char *result = NULL; char *tail = NULL; /* a pointer to the end of the 'result' string */ char *return_value = NULL; size_t len, len1, len2; size_t need; char *name = NULL; char *rel = NULL; char *p, *q; char *acc_method; struct struct_parts given, related; CTRACE((tfp, "HTParse: aName:`%s'\n", aName)); CTRACE((tfp, " relatedName:`%s'\n", relatedName)); if (wanted & (PARSE_STRICTPATH | PARSE_QUERY)) { /* if detail wanted... */ if ((wanted & (PARSE_STRICTPATH | PARSE_QUERY)) == (PARSE_STRICTPATH | PARSE_QUERY)) /* if strictpath AND query */ wanted |= PARSE_PATH; /* then treat as if PARSE_PATH wanted */ if (wanted & PARSE_PATH) /* if PARSE_PATH wanted */ wanted &= ~(PARSE_STRICTPATH | PARSE_QUERY); /* ignore details */ } /* *INDENT-OFF* */ CTRACE((tfp, " want:%s%s%s%s%s%s%s\n", wanted & PARSE_PUNCTUATION ? " punc" : "", wanted & PARSE_ANCHOR ? " anchor" : "", wanted & PARSE_PATH ? " path" : "", wanted & PARSE_HOST ? " host" : "", wanted & PARSE_ACCESS ? " access" : "", wanted & PARSE_STRICTPATH ? " PATH" : "", wanted & PARSE_QUERY ? " QUERY" : "")); /* *INDENT-ON* */ /* * Allocate the temporary string. Optimized. */ len1 = strlen(aName) + 1; len2 = strlen(relatedName) + 1; len = len1 + len2 + MIN_PARSE; /* Lots of space: more than enough */ need = (len * 2 + len1 + len2); if (need > (size_t) max_uri_size || (int) need < (int) len1 || (int) need < (int) len2) return StrAllocCopy(return_value, ""); result = tail = (char *) LYalloca(need); if (result == NULL) { outofmem(__FILE__, "HTParse"); } *result = '\0'; name = result + len; rel = name + len1; /* * Make working copy of the input string to cut up. */ MemCpy(name, aName, len1); /* * Cut up the string into URL fields. */ scan(name, &given); SHOW_PARTS(given); /* * Now related string. 
*/ if ((given.access && given.host && given.absolute) || !*relatedName) { /* * Inherit nothing! */ related.access = NULL; related.host = NULL; related.absolute = NULL; related.relative = NULL; related.search = NULL; related.anchor = NULL; } else { MemCpy(rel, relatedName, len2); scan(rel, &related); } SHOW_PARTS(related); /* * Handle the scheme (access) field. */ if (given.access && given.host && !given.relative && !given.absolute) { if (!strcmp(given.access, "http") || !strcmp(given.access, "https") || !strcmp(given.access, "ftp")) { /* * Assume root. */ given.absolute = empty_string; } } acc_method = given.access ? given.access : related.access; if (wanted & PARSE_ACCESS) { if (acc_method) { strcpy(tail, acc_method); tail += strlen(tail); if (wanted & PARSE_PUNCTUATION) { *tail++ = ':'; *tail = '\0'; } } } /* * If different schemes, inherit nothing. * * We'll try complying with RFC 1808 and the Fielding draft, and inherit * nothing if both schemes are given, rather than only when they differ, * except for file URLs - FM * * After trying it for a while, it's still premature, IHMO, to go along * with it, so this is back to inheriting for identical schemes whether or * not they are "file". If you want to try it again yourself, uncomment * the strcasecomp() below. - FM */ if ((given.access && related.access) && ( /* strcasecomp(given.access, "file") || */ strcmp(given.access, related.access))) { related.host = NULL; related.absolute = NULL; related.relative = NULL; related.search = NULL; related.anchor = NULL; } /* * Handle the host field. */ if (wanted & PARSE_HOST) { if (given.host || related.host) { if (wanted & PARSE_PUNCTUATION) { *tail++ = '/'; *tail++ = '/'; } strcpy(tail, given.host ? given.host : related.host); /* * Ignore default port numbers, and trailing dots on FQDNs, which * will only cause identical addresses to look different. (related * is already a clean url). 
*/ { char *p2, *h; int portnumber; int gen_delims = 0; if ((p2 = HTSkipToAt(result, &gen_delims)) != NULL && gen_delims == 0) { tail = (p2 + 1); } p2 = HTParsePort(result, &portnumber); if (p2 != NULL && acc_method != NULL) { /* * Port specified. */ #define ACC_METHOD(a,b) (!strcmp(acc_method, a) && (portnumber == b)) if (ACC_METHOD("http", 80) || ACC_METHOD("https", 443) || ACC_METHOD("gopher", 70) || ACC_METHOD("ftp", 21) || ACC_METHOD("wais", 210) || ACC_METHOD("nntp", 119) || ACC_METHOD("news", 119) || ACC_METHOD("newspost", 119) || ACC_METHOD("newsreply", 119) || ACC_METHOD("snews", 563) || ACC_METHOD("snewspost", 563) || ACC_METHOD("snewsreply", 563) || ACC_METHOD("finger", 79) || ACC_METHOD("telnet", 23) || ACC_METHOD("tn3270", 23) || ACC_METHOD("rlogin", 513) || ACC_METHOD("cso", 105)) *p2 = '\0'; /* It is the default: ignore it */ } if (p2 == NULL) { int len3 = (int) strlen(tail); if (len3 > 0) { h = tail + len3 - 1; /* last char of hostname */ if (*h == '.') *h = '\0'; /* chop final . */ } } else if (p2 != result) { h = p2; h--; /* End of hostname */ if (*h == '.') { /* * Slide p2 over h. */ while (*p2 != '\0') *h++ = *p2++; *h = '\0'; /* terminate */ } } } #if defined(USE_IDNA) || defined(USE_IDN2) /* * Depending on locale-support, we could have a literal UTF-8 * string as a host name, or a URL-encoded form of that. */ convert_to_idna(tail); #endif } } /* * Trim any blanks from the result so far - there's no excuse for blanks * in a hostname. Also update the tail here. */ tail = LYRemoveBlanks(result); /* * If host in given or related was ended directly with a '?' (no slash), * fake the search part into absolute. This is the only case search is * returned from scan. A host must have been present. this restores the * '?' at which the host part had been truncated in scan, we have to do * this after host part handling is done. 
- kw */ if (given.search && *(given.search - 1) == '\0') { given.absolute = given.search - 1; given.absolute[0] = '?'; } else if (related.search && !related.absolute && *(related.search - 1) == '\0') { related.absolute = related.search - 1; related.absolute[0] = '?'; } /* * If different hosts, inherit no path. */ if (given.host && related.host) if (strcmp(given.host, related.host) != 0) { related.absolute = NULL; related.relative = NULL; related.anchor = NULL; } /* * Handle the path. */ if (wanted & (PARSE_PATH | PARSE_STRICTPATH | PARSE_QUERY)) { int want_detail = (wanted & (PARSE_STRICTPATH | PARSE_QUERY)); if (acc_method && !given.absolute && given.relative) { /* * Treat all given nntp or snews paths, or given paths for news * URLs with a host, as absolute. */ switch (*acc_method) { case 'N': case 'n': if (!strcasecomp(acc_method, "nntp") || (!strcasecomp(acc_method, "news") && !strncasecomp(result, "news://", 7))) { given.absolute = given.relative; given.relative = NULL; } break; case 'S': case 's': if (!strcasecomp(acc_method, "snews")) { given.absolute = given.relative; given.relative = NULL; } break; } } if (given.absolute) { /* All is given */ char *base = tail; if (wanted & PARSE_PUNCTUATION) *tail++ = '/'; strcpy(tail, given.absolute); HTSimplify(base, TRUE); CTRACE((tfp, "HTParse: (ABS)\n")); } else if (related.absolute) { /* Adopt path not name */ char *base = tail; *tail++ = '/'; strcpy(tail, related.absolute); if (given.relative) { /* RFC 1808 part 4 step 5 (if URL path is empty) */ /* a) if given has params, add/replace that */ if (given.relative[0] == ';') { strcpy(strchr_or_end(tail, ';'), given.relative); } /* b) if given has query, add/replace that */ else if (given.relative[0] == '?') { strcpy(strchr_or_end(tail, '?'), given.relative); } /* otherwise fall through to RFC 1808 part 4 step 6 */ else { p = StrChr(tail, '?'); /* Search part? 
*/ if (p == NULL) p = (tail + strlen(tail) - 1); for (; *p != '/'; p--) ; /* last / */ p[1] = '\0'; /* Remove filename */ strcat(p, given.relative); /* Add given one */ } HTSimplify(base, FALSE); if (*base == '\0') strcpy(base, "/"); } else { HTSimplify(base, TRUE); } if (base[0] == '/' && base[1] == '/') { char *pz; for (pz = base; (pz[0] = pz[1]) != '\0'; ++pz) ; } CTRACE((tfp, "HTParse: (Related-ABS)\n")); } else if (given.relative) { strcpy(tail, given.relative); /* what we've got */ HTSimplify(tail, FALSE); CTRACE((tfp, "HTParse: (REL)\n")); } else if (related.relative) { strcpy(tail, related.relative); HTSimplify(tail, FALSE); CTRACE((tfp, "HTParse: (Related-REL)\n")); } else { /* No inheritance */ if (!isLYNXCGI(aName) && !isLYNXEXEC(aName) && !isLYNXPROG(aName)) { *tail++ = '/'; *tail = '\0'; } else { HTSimplify(tail, FALSE); } if (!strcmp(result, "news:/")) result[5] = '*'; CTRACE((tfp, "HTParse: (No inheritance)\n")); } if (want_detail) { p = StrChr(tail, '?'); /* Search part? */ if (p) { if (PARSE_STRICTPATH) { *p = '\0'; } else { if (!(wanted & PARSE_PUNCTUATION)) p++; do { *tail++ = *p; } while (*p++); } } else { if (wanted & PARSE_QUERY) *tail = '\0'; } } } /* * Handle the fragment (anchor). Never inherit. */ if (wanted & PARSE_ANCHOR) { if (given.anchor && *given.anchor) { tail += strlen(tail); if (wanted & PARSE_PUNCTUATION) *tail++ = '#'; strcpy(tail, given.anchor); } } /* * If there are any blanks remaining in the string, escape them as needed. * See the discussion in LYLegitimizeHREF() for example. 
*/ if ((p = StrChr(result, ' ')) != 0) { switch (is_url(result)) { case UNKNOWN_URL_TYPE: CTRACE((tfp, "HTParse: ignore:`%s'\n", result)); break; case LYNXEXEC_URL_TYPE: case LYNXPROG_URL_TYPE: case LYNXCGI_URL_TYPE: case LYNXPRINT_URL_TYPE: case LYNXHIST_URL_TYPE: case LYNXDOWNLOAD_URL_TYPE: case LYNXKEYMAP_URL_TYPE: case LYNXIMGMAP_URL_TYPE: case LYNXCOOKIE_URL_TYPE: case LYNXCACHE_URL_TYPE: case LYNXDIRED_URL_TYPE: case LYNXOPTIONS_URL_TYPE: case LYNXCFG_URL_TYPE: case LYNXCOMPILE_OPTS_URL_TYPE: case LYNXMESSAGES_URL_TYPE: CTRACE((tfp, "HTParse: spaces:`%s'\n", result)); break; case NOT_A_URL_TYPE: default: CTRACE((tfp, "HTParse: encode:`%s'\n", result)); do { q = p + strlen(p) + 2; while (q != p + 1) { q[0] = q[-2]; --q; } p[0] = HEX_ESCAPE; p[1] = '2'; p[2] = '0'; } while ((p = StrChr(result, ' ')) != 0); break; } } CTRACE((tfp, "HTParse: result:`%s'\n", result)); StrAllocCopy(return_value, result); LYalloca_free(result); /* FIXME: could be optimized using HTParse() internals */ if (*relatedName && ((wanted & PARSE_ALL_WITHOUT_ANCHOR) == PARSE_ALL_WITHOUT_ANCHOR)) { /* * Check whether to fill in localhost. 
- FM */ LYFillLocalFileURL(&return_value, relatedName); CTRACE((tfp, "pass LYFillLocalFile:`%s'\n", return_value)); } return return_value; /* exactly the right length */ } /* HTParseAnchor(), fast HTParse() specialization * ---------------------------------------------- * * On exit, * returns A pointer within input string (probably to its end '\0') */ const char *HTParseAnchor(const char *aName) { const char *p = aName; for (; *p && *p != '#'; p++) { ; } if (*p == '#') { /* the safe way based on HTParse() - * keeping in mind scan() peculiarities on schemes: */ struct struct_parts given; size_t need = ((unsigned) ((p - aName) + (int) strlen(p) + 1)); char *name; if (need > (size_t) max_uri_size) { p += strlen(p); } else { name = (char *) LYalloca(need); if (name == NULL) { outofmem(__FILE__, "HTParseAnchor"); } strcpy(name, aName); scan(name, &given); LYalloca_free(name); p++; /*next to '#' */ if (given.anchor == NULL) { for (; *p; p++) /*scroll to end '\0' */ ; } } } return p; } /* Simplify a filename. HTSimplify() * -------------------- * * A unix-style file is allowed to contain the sequence xxx/../ which may * be replaced by "" , and the sequence "/./" which may be replaced by "/". * Simplification helps us recognize duplicate filenames. * * RFC 3986 section 5.2.4 says to do this whether or not the path was relative. */ void HTSimplify(char *filename, BOOL absolute) { #define MY_FMT "HTParse HTSimplify\t(%s)" #ifdef NO_LYNX_TRACE #define debug_at(at) /* nothing */ #define atln "?" #else const char *atln; #define debug_at(at) atln = at #endif char *mark; char *p; size_t limit; CTRACE2(TRACE_HTPARSE, (tfp, MY_FMT " %s\n", filename, absolute ? "ABS" : "REL")); if (LYIsPathSep(*filename) && !absolute) ++filename; mark = filename; limit = strlen(filename); for (p = filename; *p; ++p) { if (*p == '?' 
|| *p == '#') { limit = (size_t) (p - filename); break; } } while ((limit != 0) && (*filename != '\0')) { size_t trim = 0; size_t skip = 0; size_t last = 0; debug_at("?"); p = filename; if (limit >= 2 && !memcmp(p, "./", 2)) { /* 2A */ debug_at("2A"); trim = 2; } else if (limit >= 3 && !memcmp(p, "../", 3)) { debug_at("2A2"); trim = 3; } else if (limit >= 3 && !memcmp(p, "/./", 3)) { /* 2B */ debug_at("2B"); trim = 2; skip = 1; } else if (limit == 2 && !memcmp(p, "/.", 2)) { debug_at("2B2"); trim = 1; skip = 1; } else if (limit >= 4 && !memcmp(p, "/../", 4)) { /* 2C */ debug_at("2C"); trim = 3; skip = 1; last = 1; } else if (limit == 3 && !memcmp(p, "/..", 3)) { debug_at("2C2"); trim = 2; skip = 1; last = 1; } else if (limit == 2 && !memcmp(p, "..", 2)) { /* 2D */ debug_at("2D"); trim = 2; } else if (limit == 1 && !memcmp(p, ".", 1)) { debug_at("2D2"); trim = 1; } if (trim) { CTRACE2(TRACE_HTPARSE, (tfp, MY_FMT " trim %lu/%lu (%.*s) '%.*s' @%s\n", mark, (unsigned long) trim, (unsigned long) limit, (int) trim, p + skip, (int) limit, p, atln)); } if (last) { char *prior = filename; if (prior != mark) { --prior; while (prior != mark && *prior != '/') { --prior; } } if (prior != filename) { trim += (size_t) (filename - prior); limit += (size_t) (filename - prior); filename = prior; CTRACE2(TRACE_HTPARSE, (tfp, MY_FMT " TRIM %lu/%lu (%.*s)\n", mark, (unsigned long) trim, (unsigned long) limit, (int) trim, filename + skip)); } } if (trim) { limit -= trim; for (p = filename;; ++p) { if ((p[0] = p[trim]) == '\0') { break; } if (skip) { p[0] = '/'; skip = 0; } } CTRACE2(TRACE_HTPARSE, (tfp, MY_FMT " loop %lu\n", mark, (unsigned long) limit)); } else { if (*filename == '/') { ++filename; --limit; } while ((limit != 0) && (*filename != '/')) { ++filename; --limit; } } } CTRACE2(TRACE_HTPARSE, (tfp, MY_FMT " done\n", mark)); #undef MY_FMT } /* Make Relative Name. 
HTRelative() * ------------------- * * This function creates and returns a string which gives an expression of * one address as related to another. Where there is no relation, an absolute * address is returned. * * On entry, * Both names must be absolute, fully qualified names of nodes * (no anchor bits) * * On exit, * The return result points to a newly allocated name which, if * parsed by HTParse relative to relatedName, will yield aName. * The caller is responsible for freeing the resulting name later. * */ char *HTRelative(const char *aName, const char *relatedName) { char *result = NULL; const char *p = aName; const char *q = relatedName; const char *after_access = NULL; const char *path = NULL; const char *last_slash = NULL; int slashes = 0; for (; *p; p++, q++) { /* Find extent of match */ if (*p != *q) break; if (*p == ':') after_access = p + 1; if (*p == '/') { last_slash = p; slashes++; if (slashes == 3) path = p; } } /* q, p point to the first non-matching character or zero */ if (!after_access) { /* Different access */ StrAllocCopy(result, aName); } else if (slashes < 3) { /* Different nodes */ StrAllocCopy(result, after_access); } else if (slashes == 3) { /* Same node, different path */ StrAllocCopy(result, path); } else { /* Some path in common */ unsigned levels = 0; for (; *q && (*q != '#'); q++) if (*q == '/') levels++; result = typecallocn(char, 3 * levels + strlen(last_slash) + 1); if (result == NULL) outofmem(__FILE__, "HTRelative"); result[0] = '\0'; for (; levels; levels--) strcat(result, "../"); strcat(result, last_slash + 1); } CTRACE((tfp, "HTparse: `%s' expressed relative to\n `%s' is\n `%s'.\n", aName, relatedName, result)); return result; } #define AlloCopy(next,base,extra) \ typecallocn(char, ((next - base) + ((int) extra))) /* Escape undesirable characters using % HTEscape() * ------------------------------------- * * This function takes a pointer to a string in which * some characters may be unacceptable unescaped. 
* It returns a string which has these characters * represented by a '%' character followed by two hex digits. * * Unlike HTUnEscape(), this routine returns a calloc'd string. */ /* *INDENT-OFF* */ static const unsigned char isAcceptable[96] = /* Bit 0 xalpha -- see HTFile.h * Bit 1 xpalpha -- as xalpha but with plus. * Bit 2 ... path -- as xpalphas but with / */ /* 0 1 2 3 4 5 6 7 8 9 A B C D E F */ { 0,0,0,0,0,0,0,0,0,0,7,6,0,7,7,4, /* 2x !"#$%&'()*+,-./ */ 7,7,7,7,7,7,7,7,7,7,0,0,0,0,0,0, /* 3x 0123456789:;<=>? */ 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, /* 4x @ABCDEFGHIJKLMNO */ 7,7,7,7,7,7,7,7,7,7,7,0,0,0,0,7, /* 5X PQRSTUVWXYZ[\]^_ */ 0,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, /* 6x `abcdefghijklmno */ 7,7,7,7,7,7,7,7,7,7,7,0,0,0,0,0 }; /* 7X pqrstuvwxyz{|}~ DEL */ /* *INDENT-ON* */ static const char *hex = "0123456789ABCDEF"; #define ACCEPTABLE(a) ( a>=32 && a<128 && ((isAcceptable[a-32]) & mask)) char *HTEscape(const char *str, unsigned mask) { const char *p; char *q; char *result; size_t unacceptable = 0; for (p = str; *p; p++) if (!ACCEPTABLE(UCH(TOASCII(*p)))) unacceptable++; result = AlloCopy(p, str, (unacceptable * 2) + 1); if (result == NULL) outofmem(__FILE__, "HTEscape"); for (q = result, p = str; *p; p++) { unsigned char a = UCH(TOASCII(*p)); if (!ACCEPTABLE(a)) { *q++ = HEX_ESCAPE; /* Means hex coming */ *q++ = hex[a >> 4]; *q++ = hex[a & 15]; } else *q++ = *p; } *q = '\0'; /* Terminate */ return result; } /* Escape unsafe characters using % HTEscapeUnsafe() * -------------------------------- * * This function takes a pointer to a string in which * some characters may be that may be unsafe are unescaped. * It returns a string which has these characters * represented by a '%' character followed by two hex digits. * * Unlike HTUnEscape(), this routine returns a malloc'd string. 
*/ #define UNSAFE(ch) (((ch) <= 32) || ((ch) >= 127)) char *HTEscapeUnsafe(const char *str) { const char *p; char *q; char *result; size_t unacceptable = 0; for (p = str; *p; p++) if (UNSAFE(UCH(TOASCII(*p)))) unacceptable++; result = AlloCopy(p, str, (unacceptable * 2) + 1); if (result == NULL) outofmem(__FILE__, "HTEscapeUnsafe"); for (q = result, p = str; *p; p++) { unsigned char a = UCH(TOASCII(*p)); if (UNSAFE(a)) { *q++ = HEX_ESCAPE; /* Means hex coming */ *q++ = hex[a >> 4]; *q++ = hex[a & 15]; } else *q++ = *p; } *q = '\0'; /* Terminate */ return result; } /* Escape undesirable characters using % but space to +. HTEscapeSP() * ----------------------------------------------------- * * This function takes a pointer to a string in which * some characters may be unacceptable unescaped. * It returns a string which has these characters * represented by a '%' character followed by two hex digits, * except that spaces are converted to '+' instead of %2B. * * Unlike HTUnEscape(), this routine returns a calloced string. */ char *HTEscapeSP(const char *str, unsigned mask) { const char *p; char *q; char *result; size_t unacceptable = 0; for (p = str; *p; p++) if (!(*p == ' ' || ACCEPTABLE(UCH(TOASCII(*p))))) unacceptable++; result = AlloCopy(p, str, (unacceptable * 2) + 1); if (result == NULL) outofmem(__FILE__, "HTEscape"); for (q = result, p = str; *p; p++) { unsigned char a = UCH(TOASCII(*p)); if (a == 32) { *q++ = '+'; } else if (!ACCEPTABLE(a)) { *q++ = HEX_ESCAPE; /* Means hex coming */ *q++ = hex[a >> 4]; *q++ = hex[a & 15]; } else { *q++ = *p; } } *q = '\0'; /* Terminate */ return result; } /* Decode %xx escaped characters. HTUnEscape() * ------------------------------ * * This function takes a pointer to a string in which some * characters may have been encoded in %xy form, where xy is * the ASCII hex code for character 16x+y. * The string is converted in place, as it will never grow. */ static char from_hex(int c) { return (char) (c >= '0' && c <= '9' ? 
c - '0' : c >= 'A' && c <= 'F' ? c - 'A' + 10 : c - 'a' + 10); /* accept small letters just in case */ } char *HTUnEscape(char *str) { char *p = str; char *q = str; if (!(p && *p)) return str; while (*p != '\0') { if (*p == HEX_ESCAPE && /* * Tests shouldn't be needed, but better safe than sorry. */ p[1] && p[2] && isxdigit(UCH(p[1])) && isxdigit(UCH(p[2]))) { p++; if (*p) *q = (char) (from_hex(*p++) * 16); if (*p) { /* * Careful! FROMASCII() may evaluate its arg more than once! */ /* S/390 -- gil -- 0221 */ *q = (char) (*q + from_hex(*p++)); } *q = FROMASCII(*q); q++; } else { *q++ = *p++; } } *q = '\0'; return str; } /* HTUnEscape */ /* Decode some %xx escaped characters. HTUnEscapeSome() * ----------------------------------- Klaus Weide * (kweide@tezcat.com) * This function takes a pointer to a string in which some * characters may have been encoded in %xy form, where xy is * the ASCII hex code for character 16x+y, and a pointer to * a second string containing one or more characters which * should be unescaped if escaped in the first string. * The first string is converted in place, as it will never grow. */ char *HTUnEscapeSome(char *str, const char *do_trans) { char *p = str; char *q = str; char testcode; if (p == NULL || *p == '\0' || do_trans == NULL || *do_trans == '\0') return str; while (*p != '\0') { if (*p == HEX_ESCAPE && p[1] && p[2] && /* tests shouldn't be needed, but.. */ isxdigit(UCH(p[1])) && isxdigit(UCH(p[2])) && (testcode = (char) FROMASCII(from_hex(p[1]) * 16 + from_hex(p[2]))) && /* %00 no good */ StrChr(do_trans, testcode)) { /* it's one of the ones we want */ *q++ = testcode; p += 3; } else { *q++ = *p++; } } *q = '\0'; return str; } /* HTUnEscapeSome */ /* *INDENT-OFF* */ static const unsigned char crfc[96] = /* Bit 0 xalpha -- need "quoting" * Bit 1 xpalpha -- need \escape if quoted */ /* 0 1 2 3 4 5 6 7 8 9 A B C D E F */ { 1,0,3,0,0,0,0,0,1,1,0,0,1,0,1,0, /* 2x !"#$%&'()*+,-./ */ 0,0,0,0,0,0,0,0,0,0,1,1,1,0,1,0, /* 3x 0123456789:;<=>? 
*/ 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 4x @ABCDEFGHIJKLMNO */ 0,0,0,0,0,0,0,0,0,0,0,1,2,1,0,0, /* 5X PQRSTUVWXYZ[\]^_ */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 6x `abcdefghijklmno */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3 }; /* 7X pqrstuvwxyz{|}~ DEL */ /* *INDENT-ON* */ #define ASCII_TAB '\011' #define ASCII_LF '\012' #define ASCII_CR '\015' #define ASCII_SPC '\040' #define ASCII_BAK '\134' /* * Turn a string which is not a RFC 822 token into a quoted-string. - KW * The "quoted" parameter tells whether we need the beginning/ending quote * marks. If not, the caller will provide them -TD */ void HTMake822Word(char **str, int quoted) { const char *p; char *q; char *result; unsigned char a; unsigned added = 0; if (isEmpty(*str)) { StrAllocCopy(*str, quoted ? "\"\"" : ""); return; } for (p = *str; *p; p++) { a = UCH(TOASCII(*p)); /* S/390 -- gil -- 0240 */ if (a < 32 || a >= 128 || ((crfc[a - 32]) & 1)) { if (!added) added = 2; if (a >= 160 || a == '\t') continue; if (a == '\r' || a == '\n') added += 2; else if ((a & 127) < 32 || ((crfc[a - 32]) & 2)) added++; } } if (!added) return; result = AlloCopy(p, *str, added + 1); if (result == NULL) outofmem(__FILE__, "HTMake822Word"); q = result; if (quoted) *q++ = '"'; /* * Having converted the character to ASCII, we can't use symbolic * escape codes, since they're in the host character set, which * is not necessarily ASCII. Thus we use octal escape codes instead. * -- gil (Paul Gilmartin) <pg@sweng.stortek.com> */ /* S/390 -- gil -- 0268 */ for (p = *str; *p; p++) { a = UCH(TOASCII(*p)); if ((a != ASCII_TAB) && ((a & 127) < ASCII_SPC || (a < 128 && ((crfc[a - 32]) & 2)))) *q++ = ASCII_BAK; *q++ = *p; if (a == ASCII_LF || (a == ASCII_CR && (TOASCII(*(p + 1)) != ASCII_LF))) *q++ = ' '; } if (quoted) *q++ = '"'; *q = '\0'; /* Terminate */ FREE(*str); *str = result; }