From 2faa747e2303ee774a4b4aace961188e950e185a Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Mon, 8 Apr 2024 21:09:22 +0200 Subject: Adding upstream version 2.4.58. Signed-off-by: Daniel Baumann --- modules/filters/mod_proxy_html.c | 1353 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 1353 insertions(+) create mode 100644 modules/filters/mod_proxy_html.c (limited to 'modules/filters/mod_proxy_html.c') diff --git a/modules/filters/mod_proxy_html.c b/modules/filters/mod_proxy_html.c new file mode 100644 index 0000000..7783da1 --- /dev/null +++ b/modules/filters/mod_proxy_html.c @@ -0,0 +1,1353 @@ +/* Copyright (c) 2003-11, WebThing Ltd + * Copyright (c) 2011-, The Apache Software Foundation + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* GO_FASTER + You can #define GO_FASTER to disable trace logging. +*/ + +#ifdef GO_FASTER +#define VERBOSE(x) +#define VERBOSEB(x) +#else +#define VERBOSE(x) if (verbose) x +#define VERBOSEB(x) if (verbose) {x} +#endif + +/* libxml2 includes unicode/[...].h files which uses C++ comments */ +#if defined(__clang__) +#pragma clang diagnostic push +#pragma clang diagnostic warning "-Wcomment" +#elif defined(__GNUC__) +#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) +#pragma GCC diagnostic push +#pragma GCC diagnostic warning "-Wcomment" +#endif +#endif + +/* libxml2 */ +#include + +#if defined(__clang__) +#pragma clang diagnostic pop +#elif defined(__GNUC__) +#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) +#pragma GCC diagnostic pop +#endif +#endif + +#include "http_protocol.h" +#include "http_config.h" +#include "http_log.h" +#include "apr_strings.h" +#include "apr_hash.h" +#include "apr_strmatch.h" +#include "apr_lib.h" + +#include "apr_optional.h" +#include "mod_xml2enc.h" +#include "http_request.h" +#include "ap_expr.h" + +/* globals set once at startup */ +static ap_rxplus_t *old_expr; +static ap_regex_t *seek_meta; +static const apr_strmatch_pattern* seek_content; +static apr_status_t (*xml2enc_charset)(request_rec*, xmlCharEncoding*, const char**) = NULL; +static apr_status_t (*xml2enc_filter)(request_rec*, const char*, unsigned int) = NULL; + +module AP_MODULE_DECLARE_DATA proxy_html_module; + +#define M_HTML 0x01 +#define M_EVENTS 0x02 +#define M_CDATA 0x04 +#define M_REGEX 0x08 +#define M_ATSTART 0x10 +#define M_ATEND 0x20 +#define M_LAST 0x40 +#define M_NOTLAST 0x80 +#define M_INTERPOLATE_TO 0x100 +#define M_INTERPOLATE_FROM 0x200 + +typedef struct { + const char *val; +} tattr; +typedef struct { + unsigned int start; + unsigned int end; +} meta; +typedef struct urlmap { + struct urlmap *next; + unsigned int flags; + unsigned int regflags; + union { + const char *c; + ap_regex_t *r; + } from; + const char *to; + ap_expr_info_t *cond; +} urlmap; +typedef struct { + urlmap *map; + const char *doctype; + const char *etag; + unsigned int flags; + int bufsz; + apr_hash_t *links; + apr_array_header_t *events; + const char *charset_out; + int extfix; + int metafix; + int strip_comments; + int interp; + int enabled; +} proxy_html_conf; +typedef struct { + ap_filter_t *f; + proxy_html_conf *cfg; + htmlParserCtxtPtr parser; + apr_bucket_brigade *bb; + char *buf; + size_t offset; + size_t avail; + const char *encoding; + urlmap *map; + char rbuf[4]; + apr_size_t rlen; + apr_size_t rmin; +} saxctxt; + + +#define NORM_LC 0x1 +#define NORM_MSSLASH 0x2 +#define NORM_RESET 0x4 +static htmlSAXHandler sax; + +typedef enum { ATTR_IGNORE, ATTR_URI, ATTR_EVENT } rewrite_t; + +static const char *const fpi_html = + "\n"; +static const char *const fpi_html_legacy = + "\n"; +static const char *const fpi_xhtml = + "\n"; +static const char *const fpi_xhtml_legacy = + "\n"; +static const char *const fpi_html5 = "\n"; +static const char *const html_etag = ">"; +static const char *const xhtml_etag = " />"; +/*#define DEFAULT_DOCTYPE fpi_html */ +static const char *const DEFAULT_DOCTYPE = ""; +#define DEFAULT_ETAG html_etag + +static void normalise(unsigned int flags, char *str) +{ + char *p; + if (flags & NORM_LC) + for (p = str; *p; ++p) + if (isupper(*p)) + *p = tolower(*p); + + if (flags & NORM_MSSLASH) + for (p = ap_strchr(str, '\\'); p; p = ap_strchr(p+1, '\\')) + *p = '/'; + +} +#define consume_buffer(ctx,inbuf,bytes,flag) \ + htmlParseChunk(ctx->parser, inbuf, bytes, flag) + +#define AP_fwrite(ctx,inbuf,bytes,flush) \ + ap_fwrite(ctx->f->next, ctx->bb, inbuf, bytes); + +/* This is always utf-8 on entry. We can convert charset within FLUSH */ +#define FLUSH AP_fwrite(ctx, (chars+begin), (i-begin), 0); begin = i+1 +static void pcharacters(void *ctxt, const xmlChar *uchars, int length) +{ + const char *chars = (const char*) uchars; + saxctxt *ctx = (saxctxt*) ctxt; + int i; + int begin; + for (begin=i=0; if->next, ctx->bb, "&"); break; + case '<' : FLUSH; ap_fputs(ctx->f->next, ctx->bb, "<"); break; + case '>' : FLUSH; ap_fputs(ctx->f->next, ctx->bb, ">"); break; + case '"' : FLUSH; ap_fputs(ctx->f->next, ctx->bb, """); break; + default : break; + } + } + FLUSH; +} + +static void preserve(saxctxt *ctx, const size_t len) +{ + char *newbuf; + if (len <= (ctx->avail - ctx->offset)) + return; + else while (len > (ctx->avail - ctx->offset)) + ctx->avail += ctx->cfg->bufsz; + + newbuf = realloc(ctx->buf, ctx->avail); + if (newbuf != ctx->buf) { + if (ctx->buf) + apr_pool_cleanup_kill(ctx->f->r->pool, ctx->buf, + (int(*)(void*))free); + apr_pool_cleanup_register(ctx->f->r->pool, newbuf, + (int(*)(void*))free, apr_pool_cleanup_null); + ctx->buf = newbuf; + } +} + +static void pappend(saxctxt *ctx, const char *buf, const size_t len) +{ + preserve(ctx, len); + memcpy(ctx->buf+ctx->offset, buf, len); + ctx->offset += len; +} + +static void dump_content(saxctxt *ctx) +{ + urlmap *m; + char *found; + size_t s_from, s_to; + size_t match; + char c = 0; + int nmatch; + ap_regmatch_t pmatch[10]; + char *subs; + size_t len, offs; + urlmap *themap = ctx->map; +#ifndef GO_FASTER + int verbose = APLOGrtrace1(ctx->f->r); +#endif + + pappend(ctx, &c, 1); /* append null byte */ + /* parse the text for URLs */ + for (m = themap; m; m = m->next) { + if (!(m->flags & M_CDATA)) + continue; + if (m->flags & M_REGEX) { + nmatch = 10; + offs = 0; + while (!ap_regexec(m->from.r, ctx->buf+offs, nmatch, pmatch, 0)) { + match = pmatch[0].rm_so; + s_from = pmatch[0].rm_eo - match; + subs = ap_pregsub(ctx->f->r->pool, m->to, ctx->buf+offs, + nmatch, pmatch); + s_to = strlen(subs); + len = strlen(ctx->buf); + offs += match; + VERBOSEB( + const char *f = apr_pstrndup(ctx->f->r->pool, + ctx->buf + offs, s_from); + ap_log_rerror(APLOG_MARK, APLOG_TRACE3, 0, ctx->f->r, + "C/RX: match at %s, substituting %s", f, subs); + ) + if (s_to > s_from) { + preserve(ctx, s_to - s_from); + memmove(ctx->buf+offs+s_to, ctx->buf+offs+s_from, + len + 1 - s_from - offs); + memcpy(ctx->buf+offs, subs, s_to); + } + else { + memcpy(ctx->buf + offs, subs, s_to); + memmove(ctx->buf+offs+s_to, ctx->buf+offs+s_from, + len + 1 - s_from - offs); + } + offs += s_to; + } + } + else { + s_from = strlen(m->from.c); + s_to = strlen(m->to); + for (found = strstr(ctx->buf, m->from.c); found; + found = strstr(ctx->buf+match+s_to, m->from.c)) { + match = found - ctx->buf; + if ((m->flags & M_ATSTART) && (match != 0)) + break; + len = strlen(ctx->buf); + if ((m->flags & M_ATEND) && (match < (len - s_from))) + continue; + VERBOSE(ap_log_rerror(APLOG_MARK, APLOG_TRACE3, 0, ctx->f->r, + "C: matched %s, substituting %s", + m->from.c, m->to)); + if (s_to > s_from) { + preserve(ctx, s_to - s_from); + memmove(ctx->buf+match+s_to, ctx->buf+match+s_from, + len + 1 - s_from - match); + memcpy(ctx->buf+match, m->to, s_to); + } + else { + memcpy(ctx->buf+match, m->to, s_to); + memmove(ctx->buf+match+s_to, ctx->buf+match+s_from, + len + 1 - s_from - match); + } + } + } + } + AP_fwrite(ctx, ctx->buf, strlen(ctx->buf), 1); +} +static void pcdata(void *ctxt, const xmlChar *uchars, int length) +{ + const char *chars = (const char*) uchars; + saxctxt *ctx = (saxctxt*) ctxt; + if (ctx->cfg->extfix) { + pappend(ctx, chars, length); + } + else { + /* not sure if this should force-flush + * (i.e. can one cdata section come in multiple calls?) + */ + AP_fwrite(ctx, chars, length, 0); + } +} +static void pcomment(void *ctxt, const xmlChar *uchars) +{ + const char *chars = (const char*) uchars; + saxctxt *ctx = (saxctxt*) ctxt; + if (ctx->cfg->strip_comments) + return; + + if (ctx->cfg->extfix) { + pappend(ctx, "", 3); + } + else { + ap_fputs(ctx->f->next, ctx->bb, ""); + dump_content(ctx); + } +} +static void pendElement(void *ctxt, const xmlChar *uname) +{ + saxctxt *ctx = (saxctxt*) ctxt; + const char *name = (const char*) uname; + const htmlElemDesc* desc = htmlTagLookup(uname); + + if ((ctx->cfg->doctype == fpi_html) || (ctx->cfg->doctype == fpi_xhtml)) { + /* enforce html */ + if (!desc || desc->depr) + return; + + } + else if ((ctx->cfg->doctype == fpi_html_legacy) + || (ctx->cfg->doctype == fpi_xhtml_legacy)) { + /* enforce html legacy */ + if (!desc) + return; + } + /* TODO - implement HTML "allowed here" using the stack */ + /* nah. Keeping the stack is too much overhead */ + + if (ctx->offset > 0) { + dump_content(ctx); + ctx->offset = 0; /* having dumped it, we can re-use the memory */ + } + if (!desc || !desc->empty) { + ap_fprintf(ctx->f->next, ctx->bb, "", name); + } +} + +static void pstartElement(void *ctxt, const xmlChar *uname, + const xmlChar** uattrs) +{ + int required_attrs; + int num_match; + size_t offs, len; + char *subs; + rewrite_t is_uri; + const char** a; + urlmap *m; + size_t s_to, s_from, match; + char *found; + saxctxt *ctx = (saxctxt*) ctxt; + size_t nmatch; + ap_regmatch_t pmatch[10]; +#ifndef GO_FASTER + int verbose = APLOGrtrace1(ctx->f->r); +#endif + apr_array_header_t *linkattrs; + int i; + const char *name = (const char*) uname; + const char** attrs = (const char**) uattrs; + const htmlElemDesc* desc = htmlTagLookup(uname); + urlmap *themap = ctx->map; +#ifdef HAVE_STACK + const void** descp; +#endif + int enforce = 0; + if ((ctx->cfg->doctype == fpi_html) || (ctx->cfg->doctype == fpi_xhtml)) { + /* enforce html */ + if (!desc || desc->depr) { + ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, ctx->f->r, APLOGNO(01416) + "Bogus HTML element %s dropped", name); + return; + } + enforce = 2; + } + else if ((ctx->cfg->doctype == fpi_html_legacy) + || (ctx->cfg->doctype == fpi_xhtml_legacy)) { + /* enforce html legacy */ + if (!desc) { + ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, ctx->f->r, APLOGNO(01417) + "Deprecated HTML element %s dropped", name); + return; + } + enforce = 1; + } +#ifdef HAVE_STACK + descp = apr_array_push(ctx->stack); + *descp = desc; + /* TODO - implement HTML "allowed here" */ +#endif + + ap_fputc(ctx->f->next, ctx->bb, '<'); + ap_fputs(ctx->f->next, ctx->bb, name); + + required_attrs = 0; + if ((enforce > 0) && (desc != NULL) && (desc->attrs_req != NULL)) + for (a = desc->attrs_req; *a; a++) + ++required_attrs; + + if (attrs) { + linkattrs = apr_hash_get(ctx->cfg->links, name, APR_HASH_KEY_STRING); + for (a = attrs; *a; a += 2) { + if (desc && enforce > 0) { + switch (htmlAttrAllowed(desc, (xmlChar*)*a, 2-enforce)) { + case HTML_INVALID: + ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, ctx->f->r, APLOGNO(01418) + "Bogus HTML attribute %s of %s dropped", + *a, name); + continue; + case HTML_DEPRECATED: + ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, ctx->f->r, APLOGNO(01419) + "Deprecated HTML attribute %s of %s dropped", + *a, name); + continue; + case HTML_REQUIRED: + required_attrs--; /* cross off the number still needed */ + /* fallthrough - required implies valid */ + default: + break; + } + } + ctx->offset = 0; + if (a[1]) { + pappend(ctx, a[1], strlen(a[1])+1); + is_uri = ATTR_IGNORE; + if (linkattrs) { + tattr *attrs = (tattr*) linkattrs->elts; + for (i=0; i < linkattrs->nelts; ++i) { + if (!strcmp(*a, attrs[i].val)) { + is_uri = ATTR_URI; + break; + } + } + } + if ((is_uri == ATTR_IGNORE) && ctx->cfg->extfix + && (ctx->cfg->events != NULL)) { + for (i=0; i < ctx->cfg->events->nelts; ++i) { + tattr *attrs = (tattr*) ctx->cfg->events->elts; + if (!strcmp(*a, attrs[i].val)) { + is_uri = ATTR_EVENT; + break; + } + } + } + switch (is_uri) { + case ATTR_URI: + num_match = 0; + for (m = themap; m; m = m->next) { + if (!(m->flags & M_HTML)) + continue; + if (m->flags & M_REGEX) { + nmatch = 10; + if (!ap_regexec(m->from.r, ctx->buf, nmatch, + pmatch, 0)) { + ++num_match; + offs = match = pmatch[0].rm_so; + s_from = pmatch[0].rm_eo - match; + subs = ap_pregsub(ctx->f->r->pool, m->to, + ctx->buf, nmatch, pmatch); + VERBOSE({ + const char *f; + f = apr_pstrndup(ctx->f->r->pool, + ctx->buf + offs, s_from); + ap_log_rerror(APLOG_MARK, APLOG_TRACE3, 0, + ctx->f->r, + "H/RX: match at %s, substituting %s", + f, subs); + }) + s_to = strlen(subs); + len = strlen(ctx->buf); + if (s_to > s_from) { + preserve(ctx, s_to - s_from); + memmove(ctx->buf+offs+s_to, + ctx->buf+offs+s_from, + len + 1 - s_from - offs); + memcpy(ctx->buf+offs, subs, s_to); + } + else { + memcpy(ctx->buf + offs, subs, s_to); + memmove(ctx->buf+offs+s_to, + ctx->buf+offs+s_from, + len + 1 - s_from - offs); + } + } + } else { + s_from = strlen(m->from.c); + if (!strncasecmp(ctx->buf, m->from.c, s_from)) { + ++num_match; + s_to = strlen(m->to); + len = strlen(ctx->buf); + VERBOSE(ap_log_rerror(APLOG_MARK, APLOG_TRACE3, + 0, ctx->f->r, + "H: matched %s, substituting %s", + m->from.c, m->to)); + if (s_to > s_from) { + preserve(ctx, s_to - s_from); + memmove(ctx->buf+s_to, ctx->buf+s_from, + len + 1 - s_from); + memcpy(ctx->buf, m->to, s_to); + } + else { /* it fits in the existing space */ + memcpy(ctx->buf, m->to, s_to); + memmove(ctx->buf+s_to, ctx->buf+s_from, + len + 1 - s_from); + } + break; + } + } + /* URIs only want one match unless overridden in the config */ + if ((num_match > 0) && !(m->flags & M_NOTLAST)) + break; + } + break; + case ATTR_EVENT: + for (m = themap; m; m = m->next) { + num_match = 0; /* reset here since we're working per-rule */ + if (!(m->flags & M_EVENTS)) + continue; + if (m->flags & M_REGEX) { + nmatch = 10; + offs = 0; + while (!ap_regexec(m->from.r, ctx->buf+offs, + nmatch, pmatch, 0)) { + match = pmatch[0].rm_so; + s_from = pmatch[0].rm_eo - match; + subs = ap_pregsub(ctx->f->r->pool, m->to, ctx->buf+offs, + nmatch, pmatch); + VERBOSE({ + const char *f; + f = apr_pstrndup(ctx->f->r->pool, + ctx->buf + offs, s_from); + ap_log_rerror(APLOG_MARK, APLOG_TRACE3, 0, + ctx->f->r, + "E/RX: match at %s, substituting %s", + f, subs); + }) + s_to = strlen(subs); + offs += match; + len = strlen(ctx->buf); + if (s_to > s_from) { + preserve(ctx, s_to - s_from); + memmove(ctx->buf+offs+s_to, + ctx->buf+offs+s_from, + len + 1 - s_from - offs); + memcpy(ctx->buf+offs, subs, s_to); + } + else { + memcpy(ctx->buf + offs, subs, s_to); + memmove(ctx->buf+offs+s_to, + ctx->buf+offs+s_from, + len + 1 - s_from - offs); + } + offs += s_to; + ++num_match; + } + } + else { + found = strstr(ctx->buf, m->from.c); + if ((m->flags & M_ATSTART) && (found != ctx->buf)) + continue; + while (found) { + s_from = strlen(m->from.c); + s_to = strlen(m->to); + match = found - ctx->buf; + if ((s_from < strlen(found)) + && (m->flags & M_ATEND)) { + found = strstr(ctx->buf+match+s_from, + m->from.c); + continue; + } + else { + found = strstr(ctx->buf+match+s_to, + m->from.c); + } + VERBOSE(ap_log_rerror(APLOG_MARK, APLOG_TRACE3, + 0, ctx->f->r, + "E: matched %s, substituting %s", + m->from.c, m->to)); + len = strlen(ctx->buf); + if (s_to > s_from) { + preserve(ctx, s_to - s_from); + memmove(ctx->buf+match+s_to, + ctx->buf+match+s_from, + len + 1 - s_from - match); + memcpy(ctx->buf+match, m->to, s_to); + } + else { + memcpy(ctx->buf+match, m->to, s_to); + memmove(ctx->buf+match+s_to, + ctx->buf+match+s_from, + len + 1 - s_from - match); + } + ++num_match; + } + } + if (num_match && (m->flags & M_LAST)) + break; + } + break; + case ATTR_IGNORE: + break; + } + } + if (!a[1]) + ap_fputstrs(ctx->f->next, ctx->bb, " ", a[0], NULL); + else { + + if (ctx->cfg->flags != 0) + normalise(ctx->cfg->flags, ctx->buf); + + /* write the attribute, using pcharacters to html-escape + anything that needs it in the value. + */ + ap_fputstrs(ctx->f->next, ctx->bb, " ", a[0], "=\"", NULL); + pcharacters(ctx, (const xmlChar*)ctx->buf, strlen(ctx->buf)); + ap_fputc(ctx->f->next, ctx->bb, '"'); + } + } + } + ctx->offset = 0; + if (desc && desc->empty) + ap_fputs(ctx->f->next, ctx->bb, ctx->cfg->etag); + else + ap_fputc(ctx->f->next, ctx->bb, '>'); + + if ((enforce > 0) && (required_attrs > 0)) { + /* if there are more required attributes than we found then complain */ + ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, ctx->f->r, APLOGNO(01420) + "HTML element %s is missing %d required attributes", + name, required_attrs); + } +} + +static meta *metafix(request_rec *r, const char *buf, apr_size_t len) +{ + meta *ret = NULL; + size_t offs = 0; + const char *p; + const char *q; + char *header; + char *content; + ap_regmatch_t pmatch[2]; + char delim; + + while (offs < len && + !ap_regexec_len(seek_meta, buf + offs, len - offs, 2, pmatch, 0)) { + header = NULL; + content = NULL; + p = buf+offs+pmatch[1].rm_eo; + while (!apr_isalpha(*++p)); + for (q = p; apr_isalnum(*q) || (*q == '-'); ++q); + header = apr_pstrmemdup(r->pool, p, q-p); + if (ap_cstr_casecmpn(header, "Content-", 8)) { + /* find content=... string */ + p = apr_strmatch(seek_content, buf+offs+pmatch[0].rm_so, + pmatch[0].rm_eo - pmatch[0].rm_so); + /* if it doesn't contain "content", ignore, don't crash! */ + if (p != NULL) { + while (*p) { + p += 7; + while (apr_isspace(*p)) + ++p; + /* XXX Should we search for another content= pattern? */ + if (*p != '=') + break; + while (*p && apr_isspace(*++p)); + if ((*p == '\'') || (*p == '"')) { + delim = *p++; + for (q = p; *q && *q != delim; ++q); + /* No terminating delimiter found? Skip the bogus directive */ + if (*q != delim) + break; + } else { + for (q = p; *q && !apr_isspace(*q) && (*q != '>'); ++q); + } + content = apr_pstrmemdup(r->pool, p, q-p); + break; + } + } + } + else if (!ap_cstr_casecmpn(header, "Content-Type", 12)) { + ret = apr_palloc(r->pool, sizeof(meta)); + ret->start = offs+pmatch[0].rm_so; + ret->end = offs+pmatch[0].rm_eo; + } + if (header && content) { +#ifndef GO_FASTER + ap_log_rerror(APLOG_MARK, APLOG_TRACE2, 0, r, + "Adding header [%s: %s] from HTML META", + header, content); +#endif + apr_table_setn(r->headers_out, header, content); + } + offs += pmatch[0].rm_eo; + } + return ret; +} + +static const char *interpolate_vars(request_rec *r, const char *str) +{ + const char *start; + const char *end; + const char *delim; + const char *before; + const char *after; + const char *replacement; + const char *var; + for (;;) { + if ((start = ap_strstr_c(str, "${")) == NULL) + break; + + if ((end = ap_strchr_c(start+2, '}')) == NULL) + break; + + delim = ap_strchr_c(start+2, '|'); + + /* Restrict delim to ${...} */ + if (delim && delim >= end) { + delim = NULL; + } + + before = apr_pstrmemdup(r->pool, str, start-str); + after = end+1; + if (delim) { + var = apr_pstrmemdup(r->pool, start+2, delim-start-2); + } + else { + var = apr_pstrmemdup(r->pool, start+2, end-start-2); + } + replacement = apr_table_get(r->subprocess_env, var); + if (!replacement) { + if (delim) + replacement = apr_pstrmemdup(r->pool, delim+1, end-delim-1); + else + replacement = ""; + } + str = apr_pstrcat(r->pool, before, replacement, after, NULL); + ap_log_rerror(APLOG_MARK, APLOG_TRACE1, 0, r, + "Interpolating %s => %s", var, replacement); + } + return str; +} +static void fixup_rules(saxctxt *ctx) +{ + urlmap *newp; + urlmap *p; + urlmap *prev = NULL; + request_rec *r = ctx->f->r; + + for (p = ctx->cfg->map; p; p = p->next) { + if (p->cond != NULL) { + const char *err; + int ok = ap_expr_exec(r, p->cond, &err); + if (err) { + ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, r, APLOGNO(01421) + "Error evaluating expr: %s", err); + } + if (ok == 0) { + continue; /* condition is unsatisfied */ + } + } + + newp = apr_pmemdup(r->pool, p, sizeof(urlmap)); + + if (newp->flags & M_INTERPOLATE_FROM) { + newp->from.c = interpolate_vars(r, newp->from.c); + if (!newp->from.c || !*newp->from.c) + continue; /* don't use empty from-pattern */ + if (newp->flags & M_REGEX) { + newp->from.r = ap_pregcomp(r->pool, newp->from.c, + newp->regflags); + } + } + if (newp->flags & M_INTERPOLATE_TO) { + newp->to = interpolate_vars(r, newp->to); + } + /* evaluate p->cond; continue if unsatisfied */ + /* create new urlmap with memcpy and append to map */ + /* interpolate from if flagged to do so */ + /* interpolate to if flagged to do so */ + + if (prev != NULL) + prev->next = newp; + else + ctx->map = newp; + prev = newp; + } + + if (prev) + prev->next = NULL; +} + +static saxctxt *check_filter_init (ap_filter_t *f) +{ + saxctxt *fctx; + if (!f->ctx) { + proxy_html_conf *cfg; + const char *force; + const char *errmsg = NULL; + cfg = ap_get_module_config(f->r->per_dir_config, &proxy_html_module); + force = apr_table_get(f->r->subprocess_env, "PROXY_HTML_FORCE"); + + if (!force) { + if (!f->r->proxyreq) { + errmsg = "Non-proxy request; not inserting proxy-html filter"; + } + else if (!f->r->content_type) { + errmsg = "No content-type; bailing out of proxy-html filter"; + } + else if (ap_cstr_casecmpn(f->r->content_type, "text/html", 9) && + ap_cstr_casecmpn(f->r->content_type, + "application/xhtml+xml", 21)) { + errmsg = "Non-HTML content; not inserting proxy-html filter"; + } + } + if (!cfg->links) { + errmsg = "No links configured: nothing for proxy-html filter to do"; + } + + if (errmsg) { +#ifndef GO_FASTER + ap_log_rerror(APLOG_MARK, APLOG_TRACE1, 0, f->r, "%s", errmsg); +#endif + ap_remove_output_filter(f); + return NULL; + } + + fctx = f->ctx = apr_pcalloc(f->r->pool, sizeof(saxctxt)); + fctx->f = f; + fctx->bb = apr_brigade_create(f->r->pool, + f->r->connection->bucket_alloc); + fctx->cfg = cfg; + apr_table_unset(f->r->headers_out, "Content-Length"); + + if (cfg->interp) + fixup_rules(fctx); + else + fctx->map = cfg->map; + /* defer dealing with charset_out until after sniffing charset_in + * so we can support setting one to t'other. + */ + } + return f->ctx; +} + +static void prepend_rbuf(saxctxt *ctxt, apr_bucket_brigade *bb) +{ + if (ctxt->rlen) { + apr_bucket *b = apr_bucket_transient_create(ctxt->rbuf, + ctxt->rlen, + bb->bucket_alloc); + APR_BRIGADE_INSERT_HEAD(bb, b); + ctxt->rlen = 0; + } +} + +static apr_status_t proxy_html_filter(ap_filter_t *f, apr_bucket_brigade *bb) +{ + apr_bucket* b; + meta *m = NULL; + xmlCharEncoding enc; + const char *buf = 0; + apr_size_t bytes = 0; +#ifndef USE_OLD_LIBXML2 + int xmlopts = XML_PARSE_RECOVER | XML_PARSE_NONET | + XML_PARSE_NOBLANKS | XML_PARSE_NOERROR | XML_PARSE_NOWARNING; +#endif + + saxctxt *ctxt = check_filter_init(f); + if (!ctxt) + return ap_pass_brigade(f->next, bb); + for (b = APR_BRIGADE_FIRST(bb); + b != APR_BRIGADE_SENTINEL(bb); + b = APR_BUCKET_NEXT(b)) { + if (APR_BUCKET_IS_METADATA(b)) { + if (APR_BUCKET_IS_EOS(b)) { + if (ctxt->parser != NULL) { + consume_buffer(ctxt, "", 0, 1); + } + else { + prepend_rbuf(ctxt, ctxt->bb); + } + APR_BRIGADE_INSERT_TAIL(ctxt->bb, + apr_bucket_eos_create(ctxt->bb->bucket_alloc)); + ap_pass_brigade(ctxt->f->next, ctxt->bb); + apr_brigade_cleanup(ctxt->bb); + } + else if (APR_BUCKET_IS_FLUSH(b)) { + /* pass on flush, except at start where it would cause + * headers to be sent before doc sniffing + */ + if (ctxt->parser != NULL) { + ap_fflush(ctxt->f->next, ctxt->bb); + } + } + } + else if (apr_bucket_read(b, &buf, &bytes, APR_BLOCK_READ) + == APR_SUCCESS) { + if (ctxt->parser == NULL) { + const char *cenc; + + /* For documents smaller than four bytes, there is no reason to do + * HTML rewriting. The URL schema (i.e. 'http') needs four bytes alone. + * And the HTML parser needs at least four bytes to initialise correctly. + */ + ctxt->rmin += bytes; + if (ctxt->rmin < sizeof(ctxt->rbuf)) { + memcpy(ctxt->rbuf + ctxt->rlen, buf, bytes); + ctxt->rlen += bytes; + continue; + } + if (ctxt->rlen && ctxt->rlen < sizeof(ctxt->rbuf)) { + apr_size_t rem = sizeof(ctxt->rbuf) - ctxt->rlen; + memcpy(ctxt->rbuf + ctxt->rlen, buf, rem); + ctxt->rlen += rem; + buf += rem; + bytes -= rem; + } + + if (!xml2enc_charset || + (xml2enc_charset(f->r, &enc, &cenc) != APR_SUCCESS)) { + if (!xml2enc_charset) + ap_log_rerror(APLOG_MARK, APLOG_WARNING, 0, f->r, APLOGNO(01422) + "No i18n support found. Install mod_xml2enc if required"); + enc = XML_CHAR_ENCODING_NONE; + ap_set_content_type(f->r, "text/html;charset=utf-8"); + } + else { + /* if we wanted a non-default charset_out, insert the + * xml2enc filter now that we've sniffed it + */ + if (ctxt->cfg->charset_out && xml2enc_filter) { + if (*ctxt->cfg->charset_out != '*') + cenc = ctxt->cfg->charset_out; + xml2enc_filter(f->r, cenc, ENCIO_OUTPUT); + ap_set_content_type(f->r, + apr_pstrcat(f->r->pool, + "text/html;charset=", + cenc, NULL)); + } + else /* Normal case, everything worked, utf-8 output */ + ap_set_content_type(f->r, "text/html;charset=utf-8"); + } + + ap_fputs(f->next, ctxt->bb, ctxt->cfg->doctype); + + if (ctxt->rlen) { + ctxt->parser = htmlCreatePushParserCtxt(&sax, ctxt, + ctxt->rbuf, + ctxt->rlen, + NULL, enc); + } + else { + ctxt->parser = htmlCreatePushParserCtxt(&sax, ctxt, buf, 4, + NULL, enc); + buf += 4; + bytes -= 4; + } + if (ctxt->parser == NULL) { + prepend_rbuf(ctxt, bb); + ap_remove_output_filter(f); + return ap_pass_brigade(f->next, bb); + } + ctxt->rlen = 0; + apr_pool_cleanup_register(f->r->pool, ctxt->parser, + (int(*)(void*))htmlFreeParserCtxt, + apr_pool_cleanup_null); +#ifndef USE_OLD_LIBXML2 + if (xmlopts = xmlCtxtUseOptions(ctxt->parser, xmlopts), xmlopts) + ap_log_rerror(APLOG_MARK, APLOG_WARNING, 0, f->r, APLOGNO(01423) + "Unsupported parser opts %x", xmlopts); +#endif + if (ctxt->cfg->metafix) + m = metafix(f->r, buf, bytes); + if (m) { + consume_buffer(ctxt, buf, m->start, 0); + consume_buffer(ctxt, buf+m->end, bytes-m->end, 0); + } + else { + consume_buffer(ctxt, buf, bytes, 0); + } + } + else { + consume_buffer(ctxt, buf, bytes, 0); + } + } + else { + ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, f->r, APLOGNO(01424) + "Error in bucket read"); + } + } + /*ap_fflush(ctxt->f->next, ctxt->bb); // uncomment for debug */ + apr_brigade_cleanup(bb); + return APR_SUCCESS; +} + +static void *proxy_html_config(apr_pool_t *pool, char *x) +{ + proxy_html_conf *ret = apr_pcalloc(pool, sizeof(proxy_html_conf)); + ret->doctype = DEFAULT_DOCTYPE; + ret->etag = DEFAULT_ETAG; + ret->bufsz = 8192; + /* ret->interp = 1; */ + /* don't initialise links and events until they get set/used */ + return ret; +} + +static void *proxy_html_merge(apr_pool_t *pool, void *BASE, void *ADD) +{ + proxy_html_conf *base = (proxy_html_conf *) BASE; + proxy_html_conf *add = (proxy_html_conf *) ADD; + proxy_html_conf *conf = apr_palloc(pool, sizeof(proxy_html_conf)); + + /* don't merge declarations - just use the most specific */ + conf->links = (add->links == NULL) ? base->links : add->links; + conf->events = (add->events == NULL) ? base->events : add->events; + + conf->charset_out = (add->charset_out == NULL) + ? base->charset_out : add->charset_out; + + if (add->map && base->map) { + urlmap *a; + conf->map = NULL; + for (a = base->map; a; a = a->next) { + urlmap *save = conf->map; + conf->map = apr_pmemdup(pool, a, sizeof(urlmap)); + conf->map->next = save; + } + for (a = add->map; a; a = a->next) { + urlmap *save = conf->map; + conf->map = apr_pmemdup(pool, a, sizeof(urlmap)); + conf->map->next = save; + } + } + else + conf->map = add->map ? add->map : base->map; + + conf->doctype = (add->doctype == DEFAULT_DOCTYPE) + ? base->doctype : add->doctype; + conf->etag = (add->etag == DEFAULT_ETAG) ? base->etag : add->etag; + conf->bufsz = add->bufsz; + if (add->flags & NORM_RESET) { + conf->flags = add->flags ^ NORM_RESET; + conf->metafix = add->metafix; + conf->extfix = add->extfix; + conf->interp = add->interp; + conf->strip_comments = add->strip_comments; + conf->enabled = add->enabled; + } + else { + conf->flags = base->flags | add->flags; + conf->metafix = base->metafix | add->metafix; + conf->extfix = base->extfix | add->extfix; + conf->interp = base->interp | add->interp; + conf->strip_comments = base->strip_comments | add->strip_comments; + conf->enabled = add->enabled | base->enabled; + } + return conf; +} +#define REGFLAG(n,s,c) ((s&&(ap_strchr_c((s),(c))!=NULL)) ? (n) : 0) +#define XREGFLAG(n,s,c) ((!s||(ap_strchr_c((s),(c))==NULL)) ? (n) : 0) +static const char *comp_urlmap(cmd_parms *cmd, urlmap *newmap, + const char *from, const char *to, + const char *flags, const char *cond) +{ + const char *err = NULL; + newmap->flags + = XREGFLAG(M_HTML,flags,'h') + | XREGFLAG(M_EVENTS,flags,'e') + | XREGFLAG(M_CDATA,flags,'c') + | REGFLAG(M_ATSTART,flags,'^') + | REGFLAG(M_ATEND,flags,'$') + | REGFLAG(M_REGEX,flags,'R') + | REGFLAG(M_LAST,flags,'L') + | REGFLAG(M_NOTLAST,flags,'l') + | REGFLAG(M_INTERPOLATE_TO,flags,'V') + | REGFLAG(M_INTERPOLATE_FROM,flags,'v'); + + if ((newmap->flags & M_INTERPOLATE_FROM) || !(newmap->flags & M_REGEX)) { + newmap->from.c = from; + newmap->to = to; + } + else { + newmap->regflags + = REGFLAG(AP_REG_EXTENDED,flags,'x') + | REGFLAG(AP_REG_ICASE,flags,'i') + | REGFLAG(AP_REG_NOSUB,flags,'n') + | REGFLAG(AP_REG_NEWLINE,flags,'s'); + newmap->from.r = ap_pregcomp(cmd->pool, from, newmap->regflags); + newmap->to = to; + } + if (cond != NULL) { + /* back-compatibility: support old-style ENV expressions + * by converting to ap_expr syntax. + * + * 1. var --> env(var) + * 2. var=val --> env(var)=val + * 3. !var --> !env(var) + * 4. !var=val --> env(var)!=val + */ + char *newcond = NULL; + if (ap_rxplus_exec(cmd->temp_pool, old_expr, cond, &newcond)) { + /* we got a substitution. Check for the case (3) above + * that the regexp gets wrong: a negation without a comparison. + */ + if ((cond[0] == '!') && !ap_strchr_c(cond, '=')) { + memmove(newcond+1, newcond, strlen(newcond)-1); + newcond[0] = '!'; + } + cond = newcond; + } + newmap->cond = ap_expr_parse_cmd(cmd, cond, 0, &err, NULL); + } + else { + newmap->cond = NULL; + } + return err; +} + +static const char *set_urlmap(cmd_parms *cmd, void *CFG, const char *args) +{ + proxy_html_conf *cfg = (proxy_html_conf *)CFG; + urlmap *map; + apr_pool_t *pool = cmd->pool; + urlmap *newmap; + const char *usage = + "Usage: ProxyHTMLURLMap from-pattern to-pattern [flags] [cond]"; + const char *from; + const char *to; + const char *flags; + const char *cond = NULL; + + if (from = ap_getword_conf(cmd->pool, &args), !from) + return usage; + if (to = ap_getword_conf(cmd->pool, &args), !to) + return usage; + flags = ap_getword_conf(cmd->pool, &args); + if (flags && *flags) + cond = ap_getword_conf(cmd->pool, &args); + if (cond && !*cond) + cond = NULL; + + /* the args look OK, so let's use them */ + newmap = apr_palloc(pool, sizeof(urlmap)); + newmap->next = NULL; + if (cfg->map) { + for (map = cfg->map; map->next; map = map->next); + map->next = newmap; + } + else + cfg->map = newmap; + + return comp_urlmap(cmd, newmap, from, to, flags, cond); +} + +static const char *set_doctype(cmd_parms *cmd, void *CFG, + const char *t, const char *l) +{ + proxy_html_conf *cfg = (proxy_html_conf *)CFG; + if (!strcasecmp(t, "xhtml")) { + cfg->etag = xhtml_etag; + if (l && !strcasecmp(l, "legacy")) + cfg->doctype = fpi_xhtml_legacy; + else + cfg->doctype = fpi_xhtml; + } + else if (!strcasecmp(t, "html")) { + cfg->etag = html_etag; + if (l && !strcasecmp(l, "legacy")) + cfg->doctype = fpi_html_legacy; + else + cfg->doctype = fpi_html; + } + else if (!strcasecmp(t, "html5")) { + cfg->etag = html_etag; + cfg->doctype = fpi_html5; + } + else { + cfg->doctype = t; + if (l && ((l[0] == 'x') || (l[0] == 'X'))) + cfg->etag = xhtml_etag; + else + cfg->etag = html_etag; + } + return NULL; +} + +static const char *set_flags(cmd_parms *cmd, void *CFG, const char *arg) +{ + proxy_html_conf *cfg = CFG; + if (arg && *arg) { + if (!strcasecmp(arg, "lowercase")) + cfg->flags |= NORM_LC; + else if (!strcasecmp(arg, "dospath")) + cfg->flags |= NORM_MSSLASH; + else if (!strcasecmp(arg, "reset")) + cfg->flags |= NORM_RESET; + } + return NULL; +} + +static const char *set_events(cmd_parms *cmd, void *CFG, const char *arg) +{ + tattr *attr; + proxy_html_conf *cfg = CFG; + if (cfg->events == NULL) + cfg->events = apr_array_make(cmd->pool, 20, sizeof(tattr)); + attr = apr_array_push(cfg->events); + attr->val = arg; + return NULL; +} + +static const char *set_links(cmd_parms *cmd, void *CFG, + const char *elt, const char *att) +{ + apr_array_header_t *attrs; + tattr *attr; + proxy_html_conf *cfg = CFG; + + if (cfg->links == NULL) + cfg->links = apr_hash_make(cmd->pool); + + attrs = apr_hash_get(cfg->links, elt, APR_HASH_KEY_STRING); + if (!attrs) { + attrs = apr_array_make(cmd->pool, 2, sizeof(tattr*)); + apr_hash_set(cfg->links, elt, APR_HASH_KEY_STRING, attrs); + } + attr = apr_array_push(attrs); + attr->val = att; + return NULL; +} +static const command_rec proxy_html_cmds[] = { + AP_INIT_ITERATE("ProxyHTMLEvents", set_events, NULL, + RSRC_CONF|ACCESS_CONF, + "Strings to be treated as scripting events"), + AP_INIT_ITERATE2("ProxyHTMLLinks", set_links, NULL, + RSRC_CONF|ACCESS_CONF, "Declare HTML Attributes"), + AP_INIT_RAW_ARGS("ProxyHTMLURLMap", set_urlmap, NULL, + RSRC_CONF|ACCESS_CONF, "Map URL From To"), + AP_INIT_TAKE12("ProxyHTMLDoctype", set_doctype, NULL, + RSRC_CONF|ACCESS_CONF, "(HTML|XHTML) [Legacy]"), + AP_INIT_ITERATE("ProxyHTMLFixups", set_flags, NULL, + RSRC_CONF|ACCESS_CONF, "Options are lowercase, dospath"), + AP_INIT_FLAG("ProxyHTMLMeta", ap_set_flag_slot, + (void*)APR_OFFSETOF(proxy_html_conf, metafix), + RSRC_CONF|ACCESS_CONF, "Fix META http-equiv elements"), + AP_INIT_FLAG("ProxyHTMLInterp", ap_set_flag_slot, + (void*)APR_OFFSETOF(proxy_html_conf, interp), + RSRC_CONF|ACCESS_CONF, + "Support interpolation and conditions in URLMaps"), + AP_INIT_FLAG("ProxyHTMLExtended", ap_set_flag_slot, + (void*)APR_OFFSETOF(proxy_html_conf, extfix), + RSRC_CONF|ACCESS_CONF, "Map URLs in Javascript and CSS"), + AP_INIT_FLAG("ProxyHTMLStripComments", ap_set_flag_slot, + (void*)APR_OFFSETOF(proxy_html_conf, strip_comments), + RSRC_CONF|ACCESS_CONF, "Strip out comments"), + AP_INIT_TAKE1("ProxyHTMLBufSize", ap_set_int_slot, + (void*)APR_OFFSETOF(proxy_html_conf, bufsz), + RSRC_CONF|ACCESS_CONF, "Buffer size"), + AP_INIT_TAKE1("ProxyHTMLCharsetOut", ap_set_string_slot, + (void*)APR_OFFSETOF(proxy_html_conf, charset_out), + RSRC_CONF|ACCESS_CONF, "Usage: ProxyHTMLCharsetOut charset"), + AP_INIT_FLAG("ProxyHTMLEnable", ap_set_flag_slot, + (void*)APR_OFFSETOF(proxy_html_conf, enabled), + RSRC_CONF|ACCESS_CONF, + "Enable proxy-html and xml2enc filters"), + { NULL } +}; +static int mod_proxy_html(apr_pool_t *p, apr_pool_t *p1, apr_pool_t *p2) +{ + seek_meta = ap_pregcomp(p, "]*(http-equiv)[^>]*>", + AP_REG_EXTENDED|AP_REG_ICASE); + seek_content = apr_strmatch_precompile(p, "content", 0); + memset(&sax, 0, sizeof(htmlSAXHandler)); + sax.startElement = pstartElement; + sax.endElement = pendElement; + sax.characters = pcharacters; + sax.comment = pcomment; + sax.cdataBlock = pcdata; + xml2enc_charset = APR_RETRIEVE_OPTIONAL_FN(xml2enc_charset); + xml2enc_filter = APR_RETRIEVE_OPTIONAL_FN(xml2enc_filter); + if (!xml2enc_charset) { + ap_log_perror(APLOG_MARK, APLOG_NOTICE, 0, p2, APLOGNO(01425) + "I18n support in mod_proxy_html requires mod_xml2enc. " + "Without it, non-ASCII characters in proxied pages are " + "likely to display incorrectly."); + } + + /* old_expr only needs to last the life of the config phase */ + old_expr = ap_rxplus_compile(p1, "s/^(!)?(\\w+)((=)(.+))?$/reqenv('$2')$1$4'$5'/"); + return OK; +} +static void proxy_html_insert(request_rec *r) +{ + proxy_html_conf *cfg; + cfg = ap_get_module_config(r->per_dir_config, &proxy_html_module); + if (cfg->enabled) { + if (xml2enc_filter) + xml2enc_filter(r, NULL, ENCIO_INPUT_CHECKS); + ap_add_output_filter("proxy-html", NULL, r, r->connection); + } +} +static void proxy_html_hooks(apr_pool_t *p) +{ + static const char *aszSucc[] = { "mod_filter.c", NULL }; + ap_register_output_filter_protocol("proxy-html", proxy_html_filter, + NULL, AP_FTYPE_RESOURCE, + AP_FILTER_PROTO_CHANGE|AP_FILTER_PROTO_CHANGE_LENGTH); + /* move this to pre_config so old_expr is available to interpret + * old-style conditions on URL maps. + */ + ap_hook_pre_config(mod_proxy_html, NULL, NULL, APR_HOOK_MIDDLE); + ap_hook_insert_filter(proxy_html_insert, NULL, aszSucc, APR_HOOK_MIDDLE); +} + +AP_DECLARE_MODULE(proxy_html) = { + STANDARD20_MODULE_STUFF, + proxy_html_config, + proxy_html_merge, + NULL, + NULL, + proxy_html_cmds, + proxy_html_hooks +}; -- cgit v1.2.3