diff options
Diffstat (limited to 'modules/mappers/mod_speling.c')
-rw-r--r-- | modules/mappers/mod_speling.c | 528 |
1 files changed, 528 insertions, 0 deletions
diff --git a/modules/mappers/mod_speling.c b/modules/mappers/mod_speling.c new file mode 100644 index 0000000..2ed65eb --- /dev/null +++ b/modules/mappers/mod_speling.c @@ -0,0 +1,528 @@ +/* Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "apr.h" +#include "apr_file_io.h" +#include "apr_strings.h" +#include "apr_lib.h" + +#define APR_WANT_STRFUNC +#include "apr_want.h" + +#include "httpd.h" +#include "http_core.h" +#include "http_config.h" +#include "http_request.h" +#include "http_log.h" + +/* mod_speling.c - by Alexei Kosut <akosut@organic.com> June, 1996 + * + * This module is transparent, and simple. It attempts to correct + * misspellings of URLs that users might have entered, namely by checking + * capitalizations. If it finds a match, it sends a redirect. + * + * Sep-1999 Hugo Haas <hugo@w3.org> + * o Added a CheckCaseOnly option to check only miscapitalized words. + * + * 08-Aug-1997 <Martin.Kraemer@Mch.SNI.De> + * o Upgraded module interface to apache_1.3a2-dev API (more NULL's in + * speling_module). + * o Integrated tcsh's "spelling correction" routine which allows one + * misspelling (character insertion/omission/typo/transposition). + * Rewrote it to ignore case as well. This ought to catch the majority + * of misspelled requests. + * o Commented out the second pass where files' suffixes are stripped. + * Given the better hit rate of the first pass, this rather ugly + * (request index.html, receive index.db ?!?!) solution can be + * omitted. + * o wrote a "kind of" html page for mod_speling + * + * Activate it with "CheckSpelling On" + */ + +module AP_MODULE_DECLARE_DATA speling_module; + +typedef struct { + int enabled; + int check_case_only; + int check_basename_match; +} spconfig; + +/* + * Create a configuration specific to this module for a server or directory + * location, and fill it with the default settings. + * + * The API says that in the absence of a merge function, the record for the + * closest ancestor is used exclusively. That's what we want, so we don't + * bother to have such a function. + */ + +static void *mkconfig(apr_pool_t *p) +{ + spconfig *cfg = apr_pcalloc(p, sizeof(spconfig)); + + cfg->enabled = 0; + cfg->check_case_only = 0; + cfg->check_basename_match = 1; + return cfg; +} + +/* + * Respond to a callback to create configuration record for a server or + * vhost environment. + */ +static void *create_mconfig_for_server(apr_pool_t *p, server_rec *s) +{ + return mkconfig(p); +} + +/* + * Respond to a callback to create a config record for a specific directory. + */ +static void *create_mconfig_for_directory(apr_pool_t *p, char *dir) +{ + return mkconfig(p); +} + +/* + * Define the directives specific to this module. This structure is referenced + * later by the 'module' structure. + */ +static const command_rec speling_cmds[] = +{ + AP_INIT_FLAG("CheckSpelling", ap_set_flag_slot, + (void*)APR_OFFSETOF(spconfig, enabled), OR_OPTIONS, + "whether or not to fix miscapitalized/misspelled requests"), + AP_INIT_FLAG("CheckCaseOnly", ap_set_flag_slot, + (void*)APR_OFFSETOF(spconfig, check_case_only), OR_OPTIONS, + "whether or not to fix only miscapitalized requests"), + AP_INIT_FLAG("CheckBasenameMatch", ap_set_flag_slot, + (void*)APR_OFFSETOF(spconfig, check_basename_match), OR_OPTIONS, + "whether or not to fix files with the same base name"), + { NULL } +}; + +typedef enum { + SP_IDENTICAL = 0, + SP_MISCAPITALIZED = 1, + SP_TRANSPOSITION = 2, + SP_MISSINGCHAR = 3, + SP_EXTRACHAR = 4, + SP_SIMPLETYPO = 5, + SP_VERYDIFFERENT = 6 +} sp_reason; + +static const char *sp_reason_str[] = +{ + "identical", + "miscapitalized", + "transposed characters", + "character missing", + "extra character", + "mistyped character", + "common basename", +}; + +typedef struct { + const char *name; + sp_reason quality; +} misspelled_file; + +/* + * spdist() is taken from Kernighan & Pike, + * _The_UNIX_Programming_Environment_ + * and adapted somewhat to correspond better to psychological reality. + * (Note the changes to the return values) + * + * According to Pollock and Zamora, CACM April 1984 (V. 27, No. 4), + * page 363, the correct order for this is: + * OMISSION = TRANSPOSITION > INSERTION > SUBSTITUTION + * thus, it was exactly backwards in the old version. -- PWP + * + * This routine was taken out of tcsh's spelling correction code + * (tcsh-6.07.04) and re-converted to apache data types ("char" type + * instead of tcsh's NLS'ed "Char"). Plus it now ignores the case + * during comparisons, so is a "approximate strcasecmp()". + * NOTE that is still allows only _one_ real "typo", + * it does NOT try to correct multiple errors. + */ + +static sp_reason spdist(const char *s, const char *t) +{ + for (; apr_tolower(*s) == apr_tolower(*t); t++, s++) { + if (*t == '\0') { + return SP_MISCAPITALIZED; /* exact match (sans case) */ + } + } + if (*s) { + if (*t) { + if (s[1] && t[1] && apr_tolower(*s) == apr_tolower(t[1]) + && apr_tolower(*t) == apr_tolower(s[1]) + && strcasecmp(s + 2, t + 2) == 0) { + return SP_TRANSPOSITION; /* transposition */ + } + if (strcasecmp(s + 1, t + 1) == 0) { + return SP_SIMPLETYPO; /* 1 char mismatch */ + } + } + if (strcasecmp(s + 1, t) == 0) { + return SP_EXTRACHAR; /* extra character */ + } + } + if (*t && strcasecmp(s, t + 1) == 0) { + return SP_MISSINGCHAR; /* missing character */ + } + return SP_VERYDIFFERENT; /* distance too large to fix. */ +} + +static int sort_by_quality(const void *left, const void *rite) +{ + return (int) (((misspelled_file *) left)->quality) + - (int) (((misspelled_file *) rite)->quality); +} + +static int check_speling(request_rec *r) +{ + spconfig *cfg; + char *good, *bad, *postgood, *url; + apr_finfo_t dirent; + int filoc, dotloc, urlen, pglen; + apr_array_header_t *candidates = NULL; + apr_dir_t *dir; + + cfg = ap_get_module_config(r->per_dir_config, &speling_module); + if (!cfg->enabled) { + return DECLINED; + } + + /* We only want to worry about GETs */ + if (r->method_number != M_GET) { + return DECLINED; + } + + /* We've already got a file of some kind or another */ + if (r->finfo.filetype != APR_NOFILE) { + return DECLINED; + } + + /* Not a file request */ + if (r->proxyreq || !r->filename) { + return DECLINED; + } + + /* This is a sub request - don't mess with it */ + if (r->main) { + return DECLINED; + } + + /* + * The request should end up looking like this: + * r->uri: /correct-url/mispelling/more + * r->filename: /correct-file/mispelling r->path_info: /more + * + * So we do this in steps. First break r->filename into two pieces + */ + + filoc = ap_rind(r->filename, '/'); + /* + * Don't do anything if the request doesn't contain a slash, or + * requests "/" + */ + if (filoc == -1 || strcmp(r->uri, "/") == 0) { + return DECLINED; + } + + /* good = /correct-file */ + good = apr_pstrndup(r->pool, r->filename, filoc); + /* bad = mispelling */ + bad = apr_pstrdup(r->pool, r->filename + filoc + 1); + /* postgood = mispelling/more */ + postgood = apr_pstrcat(r->pool, bad, r->path_info, NULL); + + urlen = strlen(r->uri); + pglen = strlen(postgood); + + /* Check to see if the URL pieces add up */ + if (strcmp(postgood, r->uri + (urlen - pglen))) { + return DECLINED; + } + + /* url = /correct-url */ + url = apr_pstrndup(r->pool, r->uri, (urlen - pglen)); + + /* Now open the directory and do ourselves a check... */ + if (apr_dir_open(&dir, good, r->pool) != APR_SUCCESS) { + /* Oops, not a directory... */ + return DECLINED; + } + + candidates = apr_array_make(r->pool, 2, sizeof(misspelled_file)); + + dotloc = ap_ind(bad, '.'); + if (dotloc == -1) { + dotloc = strlen(bad); + } + + while (apr_dir_read(&dirent, APR_FINFO_DIRENT, dir) == APR_SUCCESS) { + sp_reason q; + + /* + * If we end up with a "fixed" URL which is identical to the + * requested one, we must have found a broken symlink or some such. + * Do _not_ try to redirect this, it causes a loop! + */ + if (strcmp(bad, dirent.name) == 0) { + apr_dir_close(dir); + return OK; + } + + /* + * miscapitalization errors are checked first (like, e.g., lower case + * file, upper case request) + */ + else if (strcasecmp(bad, dirent.name) == 0) { + misspelled_file *sp_new; + + sp_new = (misspelled_file *) apr_array_push(candidates); + sp_new->name = apr_pstrdup(r->pool, dirent.name); + sp_new->quality = SP_MISCAPITALIZED; + } + + /* + * simple typing errors are checked next (like, e.g., + * missing/extra/transposed char) + */ + else if ((cfg->check_case_only == 0) + && ((q = spdist(bad, dirent.name)) != SP_VERYDIFFERENT)) { + misspelled_file *sp_new; + + sp_new = (misspelled_file *) apr_array_push(candidates); + sp_new->name = apr_pstrdup(r->pool, dirent.name); + sp_new->quality = q; + } + + /* + * The spdist() should have found the majority of the misspelled + * requests. It is of questionable use to continue looking for + * files with the same base name, but potentially of totally wrong + * type (index.html <-> index.db). + * + * If you're using MultiViews, and have a file named foobar.html, + * which you refer to as "foobar", and someone tried to access + * "Foobar", without CheckBasenameMatch, mod_speling won't find it, + * because it won't find anything matching that spelling. + * With the extension-munging, it would locate "foobar.html". + */ + else if (cfg->check_basename_match == 1) { + /* + * Okay... we didn't find anything. Now we take out the hard-core + * power tools. There are several cases here. Someone might have + * entered a wrong extension (.htm instead of .html or vice + * versa) or the document could be negotiated. At any rate, now + * we just compare stuff before the first dot. If it matches, we + * figure we got us a match. This can result in wrong things if + * there are files of different content types but the same prefix + * (e.g. foo.gif and foo.html) This code will pick the first one + * it finds. Better than a Not Found, though. + */ + int entloc = ap_ind(dirent.name, '.'); + if (entloc == -1) { + entloc = strlen(dirent.name); + } + + if ((dotloc == entloc) + && !strncasecmp(bad, dirent.name, dotloc)) { + misspelled_file *sp_new; + + sp_new = (misspelled_file *) apr_array_push(candidates); + sp_new->name = apr_pstrdup(r->pool, dirent.name); + sp_new->quality = SP_VERYDIFFERENT; + } + } + } + apr_dir_close(dir); + + if (candidates->nelts != 0) { + /* Wow... we found us a mispelling. Construct a fixed url */ + char *nuri; + const char *ref; + misspelled_file *variant = (misspelled_file *) candidates->elts; + int i; + + ref = apr_table_get(r->headers_in, "Referer"); + + qsort((void *) candidates->elts, candidates->nelts, + sizeof(misspelled_file), sort_by_quality); + + /* + * Conditions for immediate redirection: + * a) the first candidate was not found by stripping the suffix + * AND b) there exists only one candidate OR the best match is not + * ambiguous + * then return a redirection right away. + */ + if (variant[0].quality != SP_VERYDIFFERENT + && (candidates->nelts == 1 + || variant[0].quality != variant[1].quality)) { + + nuri = ap_escape_uri(r->pool, apr_pstrcat(r->pool, url, + variant[0].name, + r->path_info, NULL)); + if (r->parsed_uri.query) + nuri = apr_pstrcat(r->pool, nuri, "?", r->parsed_uri.query, NULL); + + apr_table_setn(r->headers_out, "Location", + ap_construct_url(r->pool, nuri, r)); + + ap_log_rerror(APLOG_MARK, APLOG_INFO, APR_SUCCESS, + r, + ref ? APLOGNO(03224) "Fixed spelling: %s to %s from %s" + : APLOGNO(03225) "Fixed spelling: %s to %s%s", + r->uri, nuri, + (ref ? ref : "")); + + return HTTP_MOVED_PERMANENTLY; + } + /* + * Otherwise, a "[300] Multiple Choices" list with the variants is + * returned. + */ + else { + apr_pool_t *p; + apr_table_t *notes; + apr_pool_t *sub_pool; + apr_array_header_t *t; + apr_array_header_t *v; + + + if (r->main == NULL) { + p = r->pool; + notes = r->notes; + } + else { + p = r->main->pool; + notes = r->main->notes; + } + + if (apr_pool_create(&sub_pool, p) != APR_SUCCESS) + return DECLINED; + apr_pool_tag(sub_pool, "speling_sub"); + + t = apr_array_make(sub_pool, candidates->nelts * 8 + 8, + sizeof(char *)); + v = apr_array_make(sub_pool, candidates->nelts * 5, + sizeof(char *)); + + /* Generate the response text. */ + + *(const char **)apr_array_push(t) = + "The document name you requested (<code>"; + *(const char **)apr_array_push(t) = ap_escape_html(sub_pool, r->uri); + *(const char **)apr_array_push(t) = + "</code>) could not be found on this server.\n" + "However, we found documents with names similar " + "to the one you requested.<p>" + "Available documents:\n<ul>\n"; + + for (i = 0; i < candidates->nelts; ++i) { + char *vuri; + const char *reason; + + reason = sp_reason_str[(int) (variant[i].quality)]; + /* The format isn't very neat... */ + vuri = apr_pstrcat(sub_pool, url, variant[i].name, r->path_info, + (r->parsed_uri.query != NULL) ? "?" : "", + (r->parsed_uri.query != NULL) + ? r->parsed_uri.query : "", + NULL); + *(const char **)apr_array_push(v) = "\""; + *(const char **)apr_array_push(v) = ap_escape_uri(sub_pool, vuri); + *(const char **)apr_array_push(v) = "\";\""; + *(const char **)apr_array_push(v) = reason; + *(const char **)apr_array_push(v) = "\""; + + *(const char **)apr_array_push(t) = "<li><a href=\""; + *(const char **)apr_array_push(t) = ap_escape_uri(sub_pool, vuri); + *(const char **)apr_array_push(t) = "\">"; + *(const char **)apr_array_push(t) = ap_escape_html(sub_pool, vuri); + *(const char **)apr_array_push(t) = "</a> ("; + *(const char **)apr_array_push(t) = reason; + *(const char **)apr_array_push(t) = ")\n"; + + /* + * when we have printed the "close matches" and there are + * more "distant matches" (matched by stripping the suffix), + * then we insert an additional separator text to suggest + * that the user LOOK CLOSELY whether these are really the + * files she wanted. + */ + if (i > 0 && i < candidates->nelts - 1 + && variant[i].quality != SP_VERYDIFFERENT + && variant[i + 1].quality == SP_VERYDIFFERENT) { + *(const char **)apr_array_push(t) = + "</ul>\nFurthermore, the following related " + "documents were found:\n<ul>\n"; + } + } + *(const char **)apr_array_push(t) = "</ul>\n"; + + /* If we know there was a referring page, add a note: */ + if (ref != NULL) { + *(const char **)apr_array_push(t) = + "Please consider informing the owner of the " + "referring page <tt>"; + *(const char **)apr_array_push(t) = ap_escape_html(sub_pool, ref); + *(const char **)apr_array_push(t) = + "</tt> about the broken link.\n"; + } + + + /* Pass our apr_table_t to http_protocol.c (see mod_negotiation): */ + apr_table_setn(notes, "variant-list", apr_array_pstrcat(p, t, 0)); + + apr_table_mergen(r->subprocess_env, "VARIANTS", + apr_array_pstrcat(p, v, ',')); + + apr_pool_destroy(sub_pool); + + ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, r, + ref ? APLOGNO(03226) "Spelling fix: %s: %d candidates from %s" + : APLOGNO(03227) "Spelling fix: %s: %d candidates%s", + r->uri, candidates->nelts, + (ref ? ref : "")); + + return HTTP_MULTIPLE_CHOICES; + } + } + + return OK; +} + +static void register_hooks(apr_pool_t *p) +{ + ap_hook_fixups(check_speling,NULL,NULL,APR_HOOK_LAST); +} + +AP_DECLARE_MODULE(speling) = +{ + STANDARD20_MODULE_STUFF, + create_mconfig_for_directory, /* create per-dir config */ + NULL, /* merge per-dir config */ + create_mconfig_for_server, /* server config */ + NULL, /* merge server config */ + speling_cmds, /* command apr_table_t */ + register_hooks /* register hooks */ +}; |