summary | refs | log | tree | commit | diff | stats
path: root/modules/mappers/mod_speling.c
diff options
context:
space:
mode:
Diffstat (limited to 'modules/mappers/mod_speling.c')
-rw-r--r-- modules/mappers/mod_speling.c 528
1 files changed, 528 insertions, 0 deletions
diff --git a/modules/mappers/mod_speling.c b/modules/mappers/mod_speling.c
new file mode 100644
index 0000000..2ed65eb
--- /dev/null
+++ b/modules/mappers/mod_speling.c
@@ -0,0 +1,528 @@
+/* Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "apr.h"
+#include "apr_file_io.h"
+#include "apr_strings.h"
+#include "apr_lib.h"
+
+#define APR_WANT_STRFUNC
+#include "apr_want.h"
+
+#include "httpd.h"
+#include "http_core.h"
+#include "http_config.h"
+#include "http_request.h"
+#include "http_log.h"
+
+/* mod_speling.c - by Alexei Kosut <akosut@organic.com> June, 1996
+ *
+ * This module is transparent, and simple. It attempts to correct
+ * misspellings of URLs that users might have entered, namely by checking
+ * capitalizations. If it finds a match, it sends a redirect.
+ *
+ * Sep-1999 Hugo Haas <hugo@w3.org>
+ * o Added a CheckCaseOnly option to check only miscapitalized words.
+ *
+ * 08-Aug-1997 <Martin.Kraemer@Mch.SNI.De>
+ * o Upgraded module interface to apache_1.3a2-dev API (more NULL's in
+ * speling_module).
+ * o Integrated tcsh's "spelling correction" routine which allows one
+ * misspelling (character insertion/omission/typo/transposition).
+ * Rewrote it to ignore case as well. This ought to catch the majority
+ * of misspelled requests.
+ * o Commented out the second pass where files' suffixes are stripped.
+ * Given the better hit rate of the first pass, this rather ugly
+ * (request index.html, receive index.db ?!?!) solution can be
+ * omitted.
+ * o wrote a "kind of" html page for mod_speling
+ *
+ * Activate it with "CheckSpelling On"
+ */
+
+module AP_MODULE_DECLARE_DATA speling_module; /* forward declaration so check_speling() can look up its config; the record itself is filled in by AP_DECLARE_MODULE(speling) */
+
+typedef struct { /* settings record for this module, created by mkconfig() */
+ int enabled; /* set by the CheckSpelling flag; when 0, check_speling() declines immediately */
+ int check_case_only; /* set by the CheckCaseOnly flag; when non-zero the spdist() typo matching is skipped */
+ int check_basename_match; /* set by the CheckBasenameMatch flag; when 1, entries sharing the part before the first '.' are also offered */
+} spconfig;
+
+/*
+ * Build a configuration record for this module out of pool p and load it
+ * with the default settings: spelling correction disabled, case-only
+ * checking disabled, basename matching enabled. Used for both server and
+ * directory scopes.
+ *
+ * The API says that in the absence of a merge function, the record for the
+ * closest ancestor is used exclusively. That is the behaviour we want, so
+ * no merge function is provided.
+ */
+
+static void *mkconfig(apr_pool_t *p)
+{
+    spconfig *conf = apr_pcalloc(p, sizeof(*conf));
+
+    /*
+     * Every default is spelled out, even the zero ones, so that the
+     * defaults can be read off in one place.
+     */
+    conf->check_basename_match = 1;
+    conf->check_case_only = 0;
+    conf->enabled = 0;
+
+    return conf;
+}
+
+/*
+ * Callback that creates the configuration record for a server or vhost
+ * environment. It returns a record holding the module defaults, allocated
+ * from p; the server_rec argument is not consulted.
+ */
+static void *create_mconfig_for_server(apr_pool_t *p, server_rec *s)
+{
+    void *server_conf = mkconfig(p);
+
+    return server_conf;
+}
+
+/*
+ * Callback that creates the configuration record for a specific directory.
+ * It returns a record holding the module defaults, allocated from p; the
+ * directory name is not consulted.
+ */
+static void *create_mconfig_for_directory(apr_pool_t *p, char *dir)
+{
+    void *dir_conf = mkconfig(p);
+
+    return dir_conf;
+}
+
+/*
+ * Define the directives specific to this module. This structure is referenced
+ * later by the 'module' structure.
+ *
+ * All three directives are flags declared with AP_INIT_FLAG and handled by
+ * ap_set_flag_slot, which is given the offset of the spconfig field that the
+ * directive controls. Each one is registered with OR_OPTIONS. The list is
+ * terminated by the { NULL } entry.
+ */
+static const command_rec speling_cmds[] =
+{
+ AP_INIT_FLAG("CheckSpelling", ap_set_flag_slot,
+ (void*)APR_OFFSETOF(spconfig, enabled), OR_OPTIONS,
+ "whether or not to fix miscapitalized/misspelled requests"),
+ AP_INIT_FLAG("CheckCaseOnly", ap_set_flag_slot,
+ (void*)APR_OFFSETOF(spconfig, check_case_only), OR_OPTIONS,
+ "whether or not to fix only miscapitalized requests"),
+ AP_INIT_FLAG("CheckBasenameMatch", ap_set_flag_slot,
+ (void*)APR_OFFSETOF(spconfig, check_basename_match), OR_OPTIONS,
+ "whether or not to fix files with the same base name"),
+ { NULL }
+};
+
+typedef enum { /* how a directory entry relates to the requested name; sort_by_quality() orders candidates by ascending value */
+ SP_IDENTICAL = 0, /* not assigned anywhere in the code shown here */
+ SP_MISCAPITALIZED = 1, /* names are equal when case is ignored */
+ SP_TRANSPOSITION = 2, /* two adjacent characters are swapped */
+ SP_MISSINGCHAR = 3, /* the requested name lacks one character of the entry */
+ SP_EXTRACHAR = 4, /* the requested name has one character too many */
+ SP_SIMPLETYPO = 5, /* exactly one character differs */
+ SP_VERYDIFFERENT = 6 /* spdist() found no single-error match; also tags candidates found by basename matching */
+} sp_reason;
+
+static const char *sp_reason_str[] = /* human-readable text for each sp_reason; indexed by the enum value when the variant list is built */
+{
+ "identical", /* SP_IDENTICAL */
+ "miscapitalized", /* SP_MISCAPITALIZED */
+ "transposed characters", /* SP_TRANSPOSITION */
+ "character missing", /* SP_MISSINGCHAR */
+ "extra character", /* SP_EXTRACHAR */
+ "mistyped character", /* SP_SIMPLETYPO */
+ "common basename", /* SP_VERYDIFFERENT */
+};
+
+typedef struct { /* one correction candidate collected while scanning the directory */
+ const char *name; /* directory entry name, copied into the request pool */
+ sp_reason quality; /* why this entry was considered a match; lower values sort first */
+} misspelled_file;
+
+/*
+ * spdist() classifies how the name s differs from the name t, ignoring
+ * case, and allowing at most one error.
+ *
+ * It is taken from Kernighan & Pike,
+ * _The_UNIX_Programming_Environment_
+ * and adapted somewhat to correspond better to psychological reality.
+ * (Note the changes to the return values)
+ *
+ * According to Pollock and Zamora, CACM April 1984 (V. 27, No. 4),
+ * page 363, the correct order for this is:
+ * OMISSION = TRANSPOSITION > INSERTION > SUBSTITUTION
+ * thus, it was exactly backwards in the old version. -- PWP
+ *
+ * This routine was taken out of tcsh's spelling correction code
+ * (tcsh-6.07.04) and re-converted to apache data types ("char" type
+ * instead of tcsh's NLS'ed "Char"). Plus it now ignores the case
+ * during comparisons, so it is an "approximate strcasecmp()".
+ * NOTE that it still allows only _one_ real "typo";
+ * it does NOT try to correct multiple errors.
+ *
+ * Returns:
+ *   SP_MISCAPITALIZED  s and t are equal when case is ignored (this
+ *                      includes byte-identical strings)
+ *   SP_TRANSPOSITION   two adjacent characters are swapped
+ *   SP_SIMPLETYPO      exactly one character differs
+ *   SP_EXTRACHAR       s has one character that t lacks
+ *   SP_MISSINGCHAR     t has one character that s lacks
+ *   SP_VERYDIFFERENT   none of the above
+ */
+
+static sp_reason spdist(const char *s, const char *t)
+{
+    /* Step over the prefix both names share, case-insensitively. */
+    while (apr_tolower(*s) == apr_tolower(*t)) {
+        if (*t == '\0') {
+            /* Reached the end of both strings without a difference. */
+            return SP_MISCAPITALIZED;
+        }
+        ++s;
+        ++t;
+    }
+
+    /* Both names still have a character at the point of divergence. */
+    if (*s != '\0' && *t != '\0') {
+        int swapped = s[1] != '\0' && t[1] != '\0'
+                      && apr_tolower(*s) == apr_tolower(t[1])
+                      && apr_tolower(*t) == apr_tolower(s[1]);
+
+        if (swapped && strcasecmp(s + 2, t + 2) == 0) {
+            return SP_TRANSPOSITION;
+        }
+        if (strcasecmp(s + 1, t + 1) == 0) {
+            return SP_SIMPLETYPO;
+        }
+    }
+
+    /* Dropping the current character of s makes the tails agree. */
+    if (*s != '\0' && strcasecmp(s + 1, t) == 0) {
+        return SP_EXTRACHAR;
+    }
+
+    /* Dropping the current character of t makes the tails agree. */
+    if (*t != '\0' && strcasecmp(s, t + 1) == 0) {
+        return SP_MISSINGCHAR;
+    }
+
+    /* Distance too large to fix. */
+    return SP_VERYDIFFERENT;
+}
+
+/*
+ * qsort() comparator for misspelled_file records. Orders by ascending
+ * quality value, so the closest matches come first.
+ */
+static int sort_by_quality(const void *left, const void *rite)
+{
+    const misspelled_file *lhs = left;
+    const misspelled_file *rhs = rite;
+    int lhs_quality = (int) lhs->quality;
+    int rhs_quality = (int) rhs->quality;
+
+    return lhs_quality - rhs_quality;
+}
+
+/*
+ * Fixups hook. When a GET request did not map to an existing file, scan the
+ * directory the file would have lived in for entries whose names are close
+ * to the requested one, and answer with a redirect or a list of choices.
+ *
+ * Returns:
+ *   DECLINED                the module is disabled; the request is not a
+ *                           GET, already has a file, is a proxy request,
+ *                           has no filename or is a sub request; the URI
+ *                           and filename pieces do not line up; the
+ *                           directory cannot be opened; or the scratch
+ *                           pool cannot be created
+ *   OK                      no candidate was found, or the directory holds
+ *                           an entry with exactly the requested name
+ *   HTTP_MOVED_PERMANENTLY  there is one unambiguous best candidate; the
+ *                           "Location" response header is set
+ *   HTTP_MULTIPLE_CHOICES   otherwise; the "variant-list" note and the
+ *                           "VARIANTS" subprocess_env entry are set
+ */
+static int check_speling(request_rec *r)
+{
+    spconfig *cfg;
+    char *good, *bad, *postgood, *url;
+    const char *path_info;
+    apr_finfo_t dirent;
+    int filoc, dotloc;
+    apr_size_t urlen, pglen;
+    apr_array_header_t *candidates = NULL;
+    apr_dir_t *dir;
+
+    cfg = ap_get_module_config(r->per_dir_config, &speling_module);
+    if (!cfg->enabled) {
+        return DECLINED;
+    }
+
+    /* We only want to worry about GETs */
+    if (r->method_number != M_GET) {
+        return DECLINED;
+    }
+
+    /* We've already got a file of some kind or another */
+    if (r->finfo.filetype != APR_NOFILE) {
+        return DECLINED;
+    }
+
+    /* Not a file request */
+    if (r->proxyreq || !r->filename) {
+        return DECLINED;
+    }
+
+    /* This is a sub request - don't mess with it */
+    if (r->main) {
+        return DECLINED;
+    }
+
+    /*
+     * The request should end up looking like this:
+     * r->uri: /correct-url/mispelling/more
+     * r->filename: /correct-file/mispelling r->path_info: /more
+     *
+     * So we do this in steps. First break r->filename into two pieces
+     */
+
+    filoc = ap_rind(r->filename, '/');
+    /*
+     * Don't do anything if the request doesn't contain a slash, or
+     * requests "/"
+     */
+    if (filoc == -1 || strcmp(r->uri, "/") == 0) {
+        return DECLINED;
+    }
+
+    /*
+     * apr_pstrcat() stops at the first NULL argument, so a NULL path_info
+     * in the middle of an argument list would silently drop everything
+     * after it. Use an empty string instead.
+     */
+    path_info = r->path_info ? r->path_info : "";
+
+    /* good = /correct-file */
+    good = apr_pstrndup(r->pool, r->filename, filoc);
+    /* bad = mispelling */
+    bad = apr_pstrdup(r->pool, r->filename + filoc + 1);
+    /* postgood = mispelling/more */
+    postgood = apr_pstrcat(r->pool, bad, path_info, NULL);
+
+    urlen = strlen(r->uri);
+    pglen = strlen(postgood);
+
+    /*
+     * Check to see if the URL pieces add up. The filename tail can be
+     * longer than the whole URI; in that case it cannot be a suffix of it,
+     * and r->uri + (urlen - pglen) would point before the start of the
+     * string.
+     */
+    if (pglen > urlen
+        || strcmp(postgood, r->uri + (urlen - pglen)) != 0) {
+        return DECLINED;
+    }
+
+    /* url = /correct-url */
+    url = apr_pstrndup(r->pool, r->uri, (urlen - pglen));
+
+    /* Now open the directory and do ourselves a check... */
+    if (apr_dir_open(&dir, good, r->pool) != APR_SUCCESS) {
+        /* Oops, not a directory... */
+        return DECLINED;
+    }
+
+    candidates = apr_array_make(r->pool, 2, sizeof(misspelled_file));
+
+    /* Length of the requested name up to (not including) its first dot. */
+    dotloc = ap_ind(bad, '.');
+    if (dotloc == -1) {
+        dotloc = (int) strlen(bad);
+    }
+
+    while (apr_dir_read(&dirent, APR_FINFO_DIRENT, dir) == APR_SUCCESS) {
+        sp_reason q;
+
+        /*
+         * If we end up with a "fixed" URL which is identical to the
+         * requested one, we must have found a broken symlink or some such.
+         * Do _not_ try to redirect this, it causes a loop!
+         */
+        if (strcmp(bad, dirent.name) == 0) {
+            apr_dir_close(dir);
+            return OK;
+        }
+
+        /*
+         * miscapitalization errors are checked first (like, e.g., lower case
+         * file, upper case request)
+         */
+        else if (strcasecmp(bad, dirent.name) == 0) {
+            misspelled_file *sp_new;
+
+            sp_new = (misspelled_file *) apr_array_push(candidates);
+            sp_new->name = apr_pstrdup(r->pool, dirent.name);
+            sp_new->quality = SP_MISCAPITALIZED;
+        }
+
+        /*
+         * simple typing errors are checked next (like, e.g.,
+         * missing/extra/transposed char)
+         */
+        else if ((cfg->check_case_only == 0)
+                 && ((q = spdist(bad, dirent.name)) != SP_VERYDIFFERENT)) {
+            misspelled_file *sp_new;
+
+            sp_new = (misspelled_file *) apr_array_push(candidates);
+            sp_new->name = apr_pstrdup(r->pool, dirent.name);
+            sp_new->quality = q;
+        }
+
+        /*
+         * The spdist() should have found the majority of the misspelled
+         * requests. It is of questionable use to continue looking for
+         * files with the same base name, but potentially of totally wrong
+         * type (index.html <-> index.db).
+         *
+         * If you're using MultiViews, and have a file named foobar.html,
+         * which you refer to as "foobar", and someone tried to access
+         * "Foobar", without CheckBasenameMatch, mod_speling won't find it,
+         * because it won't find anything matching that spelling.
+         * With the extension-munging, it would locate "foobar.html".
+         */
+        else if (cfg->check_basename_match == 1) {
+            /*
+             * Okay... we didn't find anything. Now we take out the hard-core
+             * power tools. There are several cases here. Someone might have
+             * entered a wrong extension (.htm instead of .html or vice
+             * versa) or the document could be negotiated. At any rate, now
+             * we just compare stuff before the first dot. If it matches, we
+             * figure we got us a match. This can result in wrong things if
+             * there are files of different content types but the same prefix
+             * (e.g. foo.gif and foo.html) This code will pick the first one
+             * it finds. Better than a Not Found, though.
+             */
+            int entloc = ap_ind(dirent.name, '.');
+            if (entloc == -1) {
+                entloc = (int) strlen(dirent.name);
+            }
+
+            if ((dotloc == entloc)
+                && !strncasecmp(bad, dirent.name, dotloc)) {
+                misspelled_file *sp_new;
+
+                sp_new = (misspelled_file *) apr_array_push(candidates);
+                sp_new->name = apr_pstrdup(r->pool, dirent.name);
+                sp_new->quality = SP_VERYDIFFERENT;
+            }
+        }
+    }
+    apr_dir_close(dir);
+
+    if (candidates->nelts != 0) {
+        /* Wow... we found us a mispelling. Construct a fixed url */
+        char *nuri;
+        const char *ref;
+        misspelled_file *variant = (misspelled_file *) candidates->elts;
+        int i;
+
+        ref = apr_table_get(r->headers_in, "Referer");
+
+        /* Best (lowest quality value) candidates first. */
+        qsort((void *) candidates->elts, candidates->nelts,
+              sizeof(misspelled_file), sort_by_quality);
+
+        /*
+         * Conditions for immediate redirection:
+         * a) the first candidate was not found by stripping the suffix
+         * AND b) there exists only one candidate OR the best match is not
+         * ambiguous
+         * then return a redirection right away.
+         */
+        if (variant[0].quality != SP_VERYDIFFERENT
+            && (candidates->nelts == 1
+                || variant[0].quality != variant[1].quality)) {
+
+            nuri = ap_escape_uri(r->pool, apr_pstrcat(r->pool, url,
+                                                      variant[0].name,
+                                                      path_info, NULL));
+            if (r->parsed_uri.query)
+                nuri = apr_pstrcat(r->pool, nuri, "?", r->parsed_uri.query, NULL);
+
+            apr_table_setn(r->headers_out, "Location",
+                           ap_construct_url(r->pool, nuri, r));
+
+            ap_log_rerror(APLOG_MARK, APLOG_INFO, APR_SUCCESS,
+                          r,
+                          ref ? APLOGNO(03224) "Fixed spelling: %s to %s from %s"
+                              : APLOGNO(03225) "Fixed spelling: %s to %s%s",
+                          r->uri, nuri,
+                          (ref ? ref : ""));
+
+            return HTTP_MOVED_PERMANENTLY;
+        }
+        /*
+         * Otherwise, a "[300] Multiple Choices" list with the variants is
+         * returned.
+         */
+        else {
+            apr_pool_t *p;
+            apr_table_t *notes;
+            apr_pool_t *sub_pool;
+            apr_array_header_t *t;
+            apr_array_header_t *v;
+
+
+            /*
+             * Sub requests were declined above, so r->main is NULL here
+             * and the first branch is the one taken.
+             */
+            if (r->main == NULL) {
+                p = r->pool;
+                notes = r->notes;
+            }
+            else {
+                p = r->main->pool;
+                notes = r->main->notes;
+            }
+
+            /* Scratch pool for the string pieces; destroyed below. */
+            if (apr_pool_create(&sub_pool, p) != APR_SUCCESS)
+                return DECLINED;
+            apr_pool_tag(sub_pool, "speling_sub");
+
+            /* t collects the HTML text, v the VARIANTS list. */
+            t = apr_array_make(sub_pool, candidates->nelts * 8 + 8,
+                               sizeof(char *));
+            v = apr_array_make(sub_pool, candidates->nelts * 5,
+                               sizeof(char *));
+
+            /* Generate the response text. */
+
+            *(const char **)apr_array_push(t) =
+                "The document name you requested (<code>";
+            *(const char **)apr_array_push(t) = ap_escape_html(sub_pool, r->uri);
+            *(const char **)apr_array_push(t) =
+                "</code>) could not be found on this server.\n"
+                "However, we found documents with names similar "
+                "to the one you requested.<p>"
+                "Available documents:\n<ul>\n";
+
+            for (i = 0; i < candidates->nelts; ++i) {
+                char *vuri;
+                const char *reason;
+
+                reason = sp_reason_str[(int) (variant[i].quality)];
+                /*
+                 * The format isn't very neat...
+                 *
+                 * NOTE(review): the query string is appended before
+                 * ap_escape_uri() is applied below, whereas the redirect
+                 * branch escapes the path first and appends the query
+                 * afterwards. Confirm whether the '?' separator survives
+                 * escaping here.
+                 */
+                vuri = apr_pstrcat(sub_pool, url, variant[i].name, path_info,
+                                   (r->parsed_uri.query != NULL) ? "?" : "",
+                                   (r->parsed_uri.query != NULL)
+                                       ? r->parsed_uri.query : "",
+                                   NULL);
+                *(const char **)apr_array_push(v) = "\"";
+                *(const char **)apr_array_push(v) = ap_escape_uri(sub_pool, vuri);
+                *(const char **)apr_array_push(v) = "\";\"";
+                *(const char **)apr_array_push(v) = reason;
+                *(const char **)apr_array_push(v) = "\"";
+
+                *(const char **)apr_array_push(t) = "<li><a href=\"";
+                *(const char **)apr_array_push(t) = ap_escape_uri(sub_pool, vuri);
+                *(const char **)apr_array_push(t) = "\">";
+                *(const char **)apr_array_push(t) = ap_escape_html(sub_pool, vuri);
+                *(const char **)apr_array_push(t) = "</a> (";
+                *(const char **)apr_array_push(t) = reason;
+                *(const char **)apr_array_push(t) = ")\n";
+
+                /*
+                 * when we have printed the "close matches" and there are
+                 * more "distant matches" (matched by stripping the suffix),
+                 * then we insert an additional separator text to suggest
+                 * that the user LOOK CLOSELY whether these are really the
+                 * files she wanted.
+                 */
+                if (i > 0 && i < candidates->nelts - 1
+                    && variant[i].quality != SP_VERYDIFFERENT
+                    && variant[i + 1].quality == SP_VERYDIFFERENT) {
+                    *(const char **)apr_array_push(t) =
+                        "</ul>\nFurthermore, the following related "
+                        "documents were found:\n<ul>\n";
+                }
+            }
+            *(const char **)apr_array_push(t) = "</ul>\n";
+
+            /* If we know there was a referring page, add a note: */
+            if (ref != NULL) {
+                *(const char **)apr_array_push(t) =
+                    "Please consider informing the owner of the "
+                    "referring page <tt>";
+                *(const char **)apr_array_push(t) = ap_escape_html(sub_pool, ref);
+                *(const char **)apr_array_push(t) =
+                    "</tt> about the broken link.\n";
+            }
+
+
+            /*
+             * Pass our apr_table_t to http_protocol.c (see mod_negotiation).
+             * The joined strings are allocated from p, not sub_pool, so they
+             * outlive the apr_pool_destroy() below.
+             */
+            apr_table_setn(notes, "variant-list", apr_array_pstrcat(p, t, 0));
+
+            apr_table_mergen(r->subprocess_env, "VARIANTS",
+                             apr_array_pstrcat(p, v, ','));
+
+            apr_pool_destroy(sub_pool);
+
+            ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, r,
+                          ref ? APLOGNO(03226) "Spelling fix: %s: %d candidates from %s"
+                              : APLOGNO(03227) "Spelling fix: %s: %d candidates%s",
+                          r->uri, candidates->nelts,
+                          (ref ? ref : ""));
+
+            return HTTP_MULTIPLE_CHOICES;
+        }
+    }
+
+    return OK;
+}
+
+static void register_hooks(apr_pool_t *p) /* hook registration callback named in the module record; p is not used */
+{
+ ap_hook_fixups(check_speling,NULL,NULL,APR_HOOK_LAST); /* run check_speling() as a fixups hook, with no predecessor/successor lists and APR_HOOK_LAST ordering */
+}
+
+AP_DECLARE_MODULE(speling) = /* module record tying together the config creators, directive table and hook registration */
+{
+ STANDARD20_MODULE_STUFF,
+ create_mconfig_for_directory, /* create per-dir config */
+ NULL, /* merge per-dir config: none, the closest ancestor's record is used as is */
+ create_mconfig_for_server, /* create per-server config */
+ NULL, /* merge server config: none */
+ speling_cmds, /* directive table (command_rec array) */
+ register_hooks /* register hooks */
+};