dgaudet 97/09/11 11:50:35
Modified: src Configuration.tmpl Added: htdocs/manual/mod mod_speling.html src/modules/standard mod_speling.c Log: Add in mod_speling. Submitted by: Martin Kraemer, Alexei Kosut Reviewed by: various Revision Changes Path 1.1 apachen/htdocs/manual/mod/mod_speling.html Index: mod_speling.html =================================================================== <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN"> <HTML> <HEAD> <TITLE>Apache module mod_speling</TITLE> </HEAD> <!-- Background white, links blue (unvisited), navy (visited), red (active) --> <BODY BGCOLOR="#FFFFFF" TEXT="#000000" LINK="#0000FF" VLINK="#000080" ALINK="#FF0000" > <!--#include virtual="header.html" --> <H1 ALIGN="CENTER">Module mod_speling</H1> <P> This module is contained in the <code>mod_speling.c</code> file, and is <strong>not</strong> compiled in by default. It attempts to correct misspellings of URLs that users might have entered, by ignoring capitalization and by allowing up to one misspelling.<br> This catches the majority of misspelled requests. An automatic "spelling corrected" redirection is returned if only one matching document was found, and a list of matches is returned if more than one document with a sufficiently similar name is found. </P> <h2>Summary</h2> <p> Requests to documents sometimes cannot be served by the core Apache server because the request was misspelled or miscapitalized. This module addresses this problem by trying to find a matching document, even after all other modules gave up. It does its work by comparing each document name in the requested directory against the requested document name <STRONG>without regard to case</STRONG>, and allowing <STRONG>up to one misspelling</STRONG> (character insertion / omission / transposition or wrong character). A list is built with all document names which were matched using this strategy. 
</p> <p> If, after scanning the directory, <ul> <li>no matching document was found, Apache will proceed as usual and return a "document not found" error. <li>only one document is found that "almost" matches the request, then it is returned in the form of a redirection response. <li>more than one document with a close match was found, then the list of the matches is returned to the client, and the client can select the correct candidate. </ul> </p> <h2>Directives</h2> <menu> <li><A HREF="#checkspelling">CheckSpelling</A> </menu> <HR> <!-- the HR is part of the directive description --> <A name="checkspelling"><h2>CheckSpelling</h2></A> <!--%plaintext <?INDEX {\tt CheckSpelling} directive> --> <strong>Syntax:</strong> CheckSpelling <em>on/off</em><br> <strong>Default:</strong> <code>CheckSpelling Off</code><br> <strong>Context:</strong> server config, virtual host<br> <strong>Status:</strong> Base<br> <strong>Module:</strong> mod_speling<br> <strong>Compatibility:</strong> CheckSpelling was available as a separate module for Apache 1.1, but was limited to miscapitalizations. As of Apache 1.3, it is part of the Apache distribution<!-- or: available as a separate module-->.<p> This directive enables or disables the spelling module. When enabled, keep in mind that <UL> <LI>the directory scan which is necessary for the spelling correction will have an impact on the server's performance when many spelling corrections have to be performed at the same time. <LI>the document trees should not contain sensitive files which could be matched inadvertently by a spelling "correction". <LI>the module is unable to correct misspelled user names (as in <code>http://my.host/~apahce/</code>), just file names or directory names. 
</UL> <!--#include virtual="footer.html" --> </BODY> </HTML> 1.76 +9 -0 apachen/src/Configuration.tmpl Index: Configuration.tmpl =================================================================== RCS file: /export/home/cvs/apachen/src/Configuration.tmpl,v retrieving revision 1.75 retrieving revision 1.76 diff -u -r1.75 -r1.76 --- Configuration.tmpl 1997/08/31 22:36:22 1.75 +++ Configuration.tmpl 1997/09/11 18:50:31 1.76 @@ -211,6 +211,15 @@ ## ## URL translation modules. ## + +## The Speling module attempts to correct misspellings of URLs that +## users might have entered, namely by checking capitalizations +## or by allowing up to one misspelling (character insertion / omission / +## transposition/typo). This catches the majority of misspelled requests. +## If it finds a match, a "spelling corrected" redirection is returned. + +# AddModule modules/standard/mod_speling.o + ## The UserDir module for selecting resource directories by user name ## and a common prefix, e.g., /~<user> , /usr/web/<user> , etc. 1.1 apachen/src/modules/standard/mod_speling.c Index: mod_speling.c =================================================================== #define WANT_BASENAME_MATCH /* ==================================================================== * Copyright (c) 1996,1997 The Apache Group. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * 3. 
All advertising materials mentioning features or use of this * software must display the following acknowledgment: * "This product includes software developed by the Apache Group * for use in the Apache HTTP server project (http://www.apache.org/)." * * 4. The names "Apache Server" and "Apache Group" must not be used to * endorse or promote products derived from this software without * prior written permission. * * 5. Redistributions of any form whatsoever must retain the following * acknowledgment: * "This product includes software developed by the Apache Group * for use in the Apache HTTP server project (http://www.apache.org/)." * * THIS SOFTWARE IS PROVIDED BY THE APACHE GROUP ``AS IS'' AND ANY * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE APACHE GROUP OR * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED * OF THE POSSIBILITY OF SUCH DAMAGE. * ==================================================================== * * This software consists of voluntary contributions made by many * individuals on behalf of the Apache Group and was originally based * on public domain software written at the National Center for * Supercomputing Applications, University of Illinois, Urbana-Champaign. * For more information on the Apache Group and the Apache HTTP server * project, please see <http://www.apache.org/>. 
 * */ #include "httpd.h" #include "http_config.h" #include "http_log.h" /* mod_speling.c - by Alexei Kosut <[EMAIL PROTECTED]> June, 1996 * * This module is transparent, and simple. It attempts to correct * misspellings of URLs that users might have entered, namely by checking * capitalizations. If it finds a match, it sends a redirect. * * 08-Aug-1997 <[EMAIL PROTECTED]> * o Upgraded module interface to apache_1.3a2-dev API (more NULL's in speling_module). * o Integrated tcsh's "spelling correction" routine which allows one * misspelling (character insertion/omission/typo/transposition). * Rewrote it to ignore case as well. This ought to catch the majority * of misspelled requests. * o Commented out the second pass where files' suffixes are stripped. * Given the better hit rate of the first pass, this rather ugly * (request index.html, receive index.db ?!?!) solution can be * omitted. * o wrote a "kind of" html page for mod_speling * * Activate it with "CheckSpelling On" */ module speling_module; /* We use the "unconventional" mod_userdir approach here. And heck, * here it's just one int! 
*/ static void *create_speling_config(pool * dummy, server_rec * s) { return (void *) 0; } static const char *set_speling(cmd_parms * cmd, void *dummy, int arg) { void *server_conf = cmd->server->module_config; set_module_config(server_conf, &speling_module, (void *) arg); return NULL; } command_rec speling_cmds[] = { {"CheckSpelling", set_speling, NULL, RSRC_CONF, FLAG, "whether or not to fix miscapitalized/misspelled requests"}, {NULL} }; typedef enum { SP_IDENTICAL = 0, SP_MISCAPITALIZED = 1, SP_TRANSPOSITION = 2, SP_MISSINGCHAR = 3, SP_EXTRACHAR = 4, SP_SIMPLETYPO = 5, SP_VERYDIFFERENT = 6 } sp_reason; static const char *sp_reason_str[] = { "identical", "miscapitalized", "transposed characters", "character missing", "extra character", "mistyped character", "common basename", }; typedef struct { const char *name; sp_reason quality; } misspelled_file; /* * spdist() is taken from Kernighan & Pike, * _The_UNIX_Programming_Environment_ * and adapted somewhat to correspond better to psychological reality. * (Note the changes to the return values) * * According to Pollock and Zamora, CACM April 1984 (V. 27, No. 4), * page 363, the correct order for this is: * OMISSION = TRANSPOSITION > INSERTION > SUBSTITUTION * thus, it was exactly backwards in the old version. -- PWP * * This routine was taken out of tcsh's spelling correction code * (tcsh-6.07.04) and re-converted to apache data types ("char" type * instead of tcsh's NLS'ed "Char"). Plus it now ignores the case * during comparisons, so is a "approximate strcasecmp()". * NOTE that is still allows only _one_ real "typo", * it does NOT try to correct multiple errors. 
*/ static sp_reason spdist(const char *s, const char *t) { for (; tolower(*s) == tolower(*t); t++, s++) if (*t == '\0') return SP_MISCAPITALIZED; /* exact match (sans case) */ if (*s) { if (*t) { if (s[1] && t[1] && tolower(*s) == tolower(t[1]) && tolower(*t) == tolower(s[1]) && strcasecmp(s + 2, t + 2) == 0) return SP_TRANSPOSITION; /* transposition */ if (strcasecmp(s + 1, t + 1) == 0) return SP_SIMPLETYPO; /* 1 char mismatch */ } if (strcasecmp(s + 1, t) == 0) return SP_EXTRACHAR; /* extra character */ } if (*t && strcasecmp(s, t + 1) == 0) return SP_MISSINGCHAR; /* missing character */ return SP_VERYDIFFERENT; /* distance too large to fix. */ } static int sort_by_quality(const void *left, const void *rite) { return (int) (((misspelled_file *) left)->quality) - (int) (((misspelled_file *) rite)->quality); } static int check_speling(request_rec * r) { void *server_conf = r->server->module_config; char *good, *bad, *postgood, *url; int filoc, dotloc, urlen, pglen; DIR *dirp; struct DIR_TYPE *dir_entry; array_header *candidates = NULL; if (!(int) get_module_config(server_conf, &speling_module)) return DECLINED; /* We only want to worry about GETs */ if (r->method_number != M_GET) return DECLINED; /* We've already got a file of some kind or another */ if (r->proxyreq || (r->finfo.st_mode != 0)) return DECLINED; /* This is a sub request - don't mess with it */ if (r->main) return DECLINED; /* The request should end up looking like this: * r->uri: /correct-url/mispelling/more * r->filename: /correct-file/mispelling r->path_info: /more * * So we do this in steps. 
First break r->filename into two peices */ filoc = rind(r->filename, '/'); if (filoc == -1) return DECLINED; /* good = /correct-file */ good = pstrndup(r->pool, r->filename, filoc); /* bad = mispelling */ bad = pstrdup(r->pool, r->filename + filoc + 1); /* postgood = mispelling/more */ postgood = pstrcat(r->pool, bad, r->path_info, NULL); urlen = strlen(r->uri); pglen = strlen(postgood); /* Check to see if the URL pieces add up */ if (strcmp(postgood, r->uri + (urlen - pglen))) return DECLINED; /* url = /correct-url */ url = pstrndup(r->pool, r->uri, (urlen - pglen)); /* Now open the directory and do ourselves a check... */ dirp = opendir(good); if (dirp == NULL) /* Oops, not a directory... */ return DECLINED; candidates = make_array(r->pool, 2, sizeof(misspelled_file)); dotloc = ind(bad, '.'); if (dotloc == -1) dotloc = strlen(bad); while ((dir_entry = readdir(dirp))) { sp_reason q; /* If we end up with a "fixed" URL which is identical to the * requested one, we must have found a broken symlink or some such. * Do _not_ try to redirect this, it causes a loop! */ if (strcmp(bad, dir_entry->d_name) == 0) { closedir(dirp); return OK; } /* * miscapitalization errors are checked first * (like, e.g., lower case file, upper case request) */ else if (strcasecmp(bad, dir_entry->d_name) == 0) { misspelled_file *sp_new = (misspelled_file *) push_array(candidates); sp_new->name = pstrdup(r->pool, dir_entry->d_name); sp_new->quality = SP_MISCAPITALIZED; } /* * simple typing errors are checked next * (like, e.g., missing/extra/transposed char) */ else if ((q = spdist(bad, dir_entry->d_name)) != SP_VERYDIFFERENT) { misspelled_file *sp_new = (misspelled_file *) push_array(candidates); sp_new->name = pstrdup(r->pool, dir_entry->d_name); sp_new->quality = q; } /* The spdist() should have found the majority of the misspelled requests. 
* it is of questionable use to continue looking for files with the same * base name, but potentially of totally wrong type (index.html <-> index.db) * I would propose to not set the WANT_BASENAME_MATCH define. * 08-Aug-1997 <[EMAIL PROTECTED]> * * However, Alexei replied giving some reasons to add it anyway: * > Oh, by the way, I remembered why having the * > extension-stripping-and-matching stuff is a good idea: * > * > If you're using MultiViews, and have a file named foobar.html, which you * > refer to as "foobar", and someone tried to access "Foobar", mod_speling * > won't find it, because it won't find anything matching that * > spelling. With the extension-munging, it would locate "foobar.html". Not * > perfect, but I ran into that problem when I first wrote the module. */ else { #ifdef WANT_BASENAME_MATCH /* Okay... we didn't find anything. Now we take out the hard-core * power tools. There are several cases here. Someone might have * entered a wrong extension (.htm instead of .html or vice versa) * or the document could be negotiated. At any rate, now we just compare * stuff before the first dot. If it matches, we figure we got us a * match. This can result in wrong things if there are files of * different content types but the same prefix (e.g. foo.gif and foo.html) * This code will pick the first one it finds. Better than a Not Found, * though. */ int entloc = ind(dir_entry->d_name, '.'); if (entloc == -1) entloc = strlen(dir_entry->d_name); if ((dotloc == entloc) && !strncasecmp(bad, dir_entry->d_name, dotloc)) { misspelled_file *sp_new = (misspelled_file *) push_array(candidates); sp_new->name = pstrdup(r->pool, dir_entry->d_name); sp_new->quality = SP_VERYDIFFERENT; } #endif } } closedir(dirp); if (candidates->nelts != 0) { /* Wow... we found us a mispelling. 
Construct a fixed url */ char *nuri, *ref; misspelled_file *variant = (misspelled_file *) candidates->elts; int i; ref = table_get(r->headers_in, "Referer"); qsort((void *) candidates->elts, candidates->nelts, sizeof(misspelled_file), sort_by_quality); /* * Conditions for immediate redirection: * a) the first candidate was not found by stripping the suffix * AND b) there exists only one candidate OR the best match is not ambigous * * Otherwise, a "[300] Multiple Choices" list with the variants is returned. */ if (variant[0].quality != SP_VERYDIFFERENT && (candidates->nelts == 1 || variant[0].quality != variant[1].quality)) { nuri = pstrcat(r->pool, url, variant[0].name, r->path_info, NULL); table_set(r->headers_out, "Location", construct_url(r->pool, nuri, r->server)); aplog_error(APLOG_MARK, APLOG_ERR, r->server, ref ? "Fixed spelling: %s to %s from %s" : "Fixed spelling: %s to %s", r->uri, nuri, ref); return HTTP_MOVED_PERMANENTLY; } /* * Otherwise, a "[300] Multiple Choices" list with the variants is returned. */ else { char *t; pool *pool; table *notes; if (r->main == NULL) { pool = r->pool; notes = r->notes; } else { pool = r->main->pool; notes = r->main->notes; } /* Generate the reponse text. */ t = pstrcat(pool, "The document name you requested (<code>", r->uri, "</code>) could not be found on this server.\n" "However, we found documents with names similar to the one you requested.<p>" "Available documents:\n<ul>\n", NULL); for (i = 0; i < candidates->nelts; ++i) { /* The format isn't very neat... */ t = pstrcat(pool, t, "<li><a href=\"", variant[i].name, "\">", variant[i].name, "</a> (", sp_reason_str[(int) (variant[i].quality)], ")\n", NULL); /* when we have printed the "close matches" and there * are more "distant matches" (matched by stripping the * suffix), then we insert an additional separator text * to suggest that the user LOOK CLOSELY whether these * are really the files she wanted. 
*/ if (i > 0 && i < candidates->nelts - 1 && variant[i].quality != SP_VERYDIFFERENT && variant[i + 1].quality == SP_VERYDIFFERENT) { t = pstrcat(pool, t, "</ul>\nFurthermore, the following related documents were found:\n<ul>\n", NULL); } } t = pstrcat(pool, t, "</ul>\n", NULL); /* If we know there was a referring page, add a note: */ if (ref != NULL) t = pstrcat(pool, t, "Please consider informing the owner of the <a href=\"", ref, "\">referring page</a> about the broken link.\n", NULL); /* Pass our table to http_protocol.c (see mod_negotiation): */ table_set(notes, "variant-list", t); aplog_error(APLOG_MARK, APLOG_WARNING, r->server, ref ? "Spelling fix: %s: %d candidates from %s" : "Spelling fix: %s: %d candidates", r->uri, candidates->nelts, ref); return HTTP_MULTIPLE_CHOICES; } } return OK; } module MODULE_VAR_EXPORT speling_module = { STANDARD_MODULE_STUFF, NULL, /* initializer */ NULL, /* create per-dir config */ NULL, /* merge per-dir config */ create_speling_config, /* server config */ NULL, /* merge server config */ speling_cmds, /* command table */ NULL, /* handlers */ NULL, /* filename translation */ NULL, /* check_user_id */ NULL, /* check auth */ NULL, /* check access */ NULL, /* type_checker */ check_speling, /* fixups */ NULL, /* logger */ NULL, /* header parser */ NULL, /* child_init */ NULL, /* child_exit */ NULL /* post read-request */ };