Author: siren Date: Thu Jun 1 09:52:23 2006 New Revision: 410885 URL: http://svn.apache.org/viewvc?rev=410885&view=rev Log: initial import of spellcheck query proposer
Added: lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/ lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/build.xml lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/plugin.xml lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/ lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/conf/ lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/conf/tiles-defs.xml lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/ lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/ lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/apache/ lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/apache/nutch/ lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/apache/nutch/spell/ lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/apache/nutch/spell/NGramSpeller.java lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/apache/nutch/spell/SpellCheckerBean.java lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/apache/nutch/spell/SpellCheckerTerm.java lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/apache/nutch/spell/SpellCheckerTerms.java lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/apache/nutch/webapp/ lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/apache/nutch/webapp/controller/ lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/apache/nutch/webapp/controller/SpellCheckController.java lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/resources/ lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/test/ 
lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/web/ lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/web/web-query-propose-spellcheck/ lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/web/web-query-propose-spellcheck/propose.jsp Modified: lucene/nutch/trunk/contrib/web2/plugins/build.xml Modified: lucene/nutch/trunk/contrib/web2/plugins/build.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/contrib/web2/plugins/build.xml?rev=410885&r1=410884&r2=410885&view=diff ============================================================================== --- lucene/nutch/trunk/contrib/web2/plugins/build.xml (original) +++ lucene/nutch/trunk/contrib/web2/plugins/build.xml Thu Jun 1 09:52:23 2006 @@ -16,6 +16,7 @@ <ant dir="web-resources" target="deploy"/> <ant dir="web-clustering" target="deploy"/> <ant dir="web-query-propose-ontology" target="deploy"/> + <ant dir="web-query-propose-spellcheck" target="deploy"/> </target> <!-- ====================================================== --> @@ -36,6 +37,7 @@ <ant dir="web-more" target="clean"/> <ant dir="web-clustering" target="clean"/> <ant dir="web-query-propose-ontology" target="clean"/> + <ant dir="web-query-propose-spellcheck" target="clean"/> </target> </project> Added: lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/build.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/build.xml?rev=410885&view=auto ============================================================================== --- lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/build.xml (added) +++ lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/build.xml Thu Jun 1 09:52:23 2006 @@ -0,0 +1,19 @@ +<?xml version="1.0"?> +<project name="web-query-propose-spellcheck" default="jar-core"> + <import file="../build-plugin.xml" /> + <property name="nutch.root" 
location="${root}/../../../../" /> + <target name="init-plugin"> + <echo>Copying resources templates</echo> + <copy todir="${build.classes}/resources"> + <fileset dir="${resources.dir}" includes="**/*" /> + </copy> + <echo>Copying UI configuration</echo> + <copy todir="${build.classes}"> + <fileset dir="src/conf" includes="**/*"/> + </copy> + <echo>Copying UI templates</echo> + <copy todir="${deploy.dir}/web"> + <fileset dir="src/web" includes="**/*"/> + </copy> + </target> +</project> \ No newline at end of file Added: lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/plugin.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/plugin.xml?rev=410885&view=auto ============================================================================== --- lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/plugin.xml (added) +++ lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/plugin.xml Thu Jun 1 09:52:23 2006 @@ -0,0 +1,25 @@ +<?xml version="1.0" encoding="UTF-8"?> +<plugin + id="web-query-propose-spellcheck" + name="Spellcheck query proposer" + version="1.0.0" + provider-name="apache.org"> + + <runtime> + <library name="web-query-propose-spellcheck.jar"> + <export name="*"/> + </library> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + </requires> + + <extension id="org.apache.nutch.webapp.extension.UIExtensionPoint" + name="Nutch ui extension point" + point="org.apache.nutch.webapp.extension.UIExtensionPoint"> + <implementation id="web-query-propose-spellcheck" + class="org.apache.nutch.webapp.extension.UIExtension.VoidImplementation"/> + </extension> + +</plugin> Added: lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/conf/tiles-defs.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/conf/tiles-defs.xml?rev=410885&view=auto 
/**
 * Copyright 2005 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.nutch.spell;

import org.apache.lucene.analysis.*;
import org.apache.lucene.document.*;
import org.apache.lucene.index.*;
import org.apache.lucene.search.*;
import org.apache.lucene.store.*;

import java.io.*;

import java.text.*;

import java.util.*;

/**
 * Do spelling correction based on ngram frequency of terms in an index.
 *
 * Developed based on <a
 * href="http://marc.theaimsgroup.com/?l=lucene-user&m=109474652805339&w=2">this
 * message</a> in the lucene-user list.
 *
 * <p>
 * There are two parts to this algorithm. First an ngram lookup table is formed
 * for all terms in an index. Then suggested spelling corrections can be done
 * based on this table.
 * <p>
 * The "lookup table" is actually another Lucene index. It is built by going
 * through all terms in your original index and storing the term in a Document
 * with all ngrams that make it up. Ngrams of length 3 and 4 are suggested.
 * <p>
 * In addition the prefix and suffix ngrams are stored in case you want to use
 * a heuristic that people usually know the first few characters of a word.
 * <p>
 * The entry's boost is set by default to log(word_freq)/log(num_docs).
 * <p>
 * For a word like "kings" a {@link Document} with the following fields is made
 * in the ngram index:
 *
 * <pre>
 * word:kings
 * gram3:kin
 * gram3:ing
 * gram3:ngs
 * gram4:king
 * gram4:ings
 * start3:kin
 * start4:king
 * end3:ngs
 * end4:ings
 *
 * boost: log(freq('kings'))/log(num_docs).
 * </pre>
 *
 * When a lookup is done a query is formed with all ngrams in the misspelled
 * word.
 * <p>
 * For a word like <code>"kingz"</code> a query is formed like this:
 *
 * Query: <br>
 * <code>
 * gram3:kin gram3:ing gram3:ngz start3:kin^B1 end3:ngz^B2 start4:king^B1 end4:ingz^B2
 * </code>
 * <br>
 *
 * Above B1 and B2 are the prefix and suffix boosts. The prefix boost should
 * probably be &gt;= 2.0 and the suffix boost should probably be just a little
 * above 1.
 * <p>
 * <b>To build</b> the ngram index based on the "contents" field in an existing
 * index 'orig_index' you run the main() driver like this:<br>
 * <code>
 * java org.apache.lucene.spell.NGramSpeller -f contents -i orig_index -o ngram_index
 * </code>
 * <p>
 * Once you build an index you can <b>perform spelling corrections using</b>
 * {@link #suggestUsingNGrams suggestUsingNGrams(...)}.
 * <p>
 * Of interest might be the <a
 * href="http://secondstring.sourceforge.net/">secondstring</a> string matching
 * package and <a
 * href="http://specialist.nlm.nih.gov/nls/gspell/doc/apiDoc/overview-summary.html">gspell</a>.
 *
 * @author David Spencer
 *
 * Slightly modified from original version for use in Nutch project.
 */
public final class NGramSpeller {
  /**
   * Field name for each word in the ngram index.
   */
  public static final String F_WORD = "word";

  /**
   * Frequency, for the popularity cutoff option which says to only return
   * suggestions that occur more frequently than the misspelled word.
   */
  public static final String F_FREQ = "freq";

  /**
   * Field that stores simple transpositions of each word too.
   */
  public static final String F_TRANSPOSITION = "transposition";

  /** Console output for the command-line driver. */
  private static final PrintStream o = System.out;

  /** Shared number formatter for status output. */
  private static final NumberFormat nf = NumberFormat.getInstance();

  /** The last query built by suggestUsingNGrams, exposed for debugging. */
  public static Query lastQuery;

  /**
   * Static-utility class; not instantiable.
   */
  private NGramSpeller() {
  }

  /**
   * Main driver, used to build an index. You probably want to invoke like
   * this: <br>
   * <code>
   * java org.apache.lucene.spell.NGramSpeller -f contents -i orig_index -o ngram_index
   * </code>
   */
  public static void main(String[] args) throws Throwable {
    int minThreshold = 5;
    int ng1 = 3;
    int ng2 = 4;
    int maxr = 10;
    int maxd = 5;
    String out = "gram_index";
    String gi = "gram_index";

    String name = null;
    String field = "contents";

    for (int i = 0; i < args.length; i++) {
      if (args[i].equals("-i")) {
        name = args[++i];
      } else if (args[i].equals("-minThreshold")) {
        minThreshold = Integer.parseInt(args[++i]);
      } else if (args[i].equals("-gi")) {
        gi = args[++i];
      } else if (args[i].equals("-o")) {
        out = args[++i];
      } else if (args[i].equals("-t")) { // test transpositions

        String s = args[++i];
        o.println("TRANS: " + s);

        String[] ar = formTranspositions(s);

        for (int j = 0; j < ar.length; j++)
          o.println("\t" + ar[j]);

        System.exit(0);
      } else if (args[i].equals("-ng1")) {
        ng1 = Integer.parseInt(args[++i]);
      } else if (args[i].equals("-ng2")) {
        ng2 = Integer.parseInt(args[++i]);
      } else if (args[i].equals("-help") || args[i].equals("--help")
          || args[i].equals("-h")) {
        o.println("To form an ngram index:");
        o
            .println("NGramSpeller -i ORIG_INDEX -o NGRAM_INDEX [-ng1 MIN] [-ng2 MAX] [-f FIELD]");
        o.println("Defaults are ng1=3, ng2=4, field='contents'");
        System.exit(100);
      } else if (args[i].equals("-q")) {
        String goal = args[++i];
        o.println("[NGrams] for " + goal + " from " + gi);

        float bStart = 2.0f;
        float bEnd = 1.0f;
        float bTransposition = 0f;

        o.println("bStart: " + bStart);
        o.println("bEnd: " + bEnd);
        o.println("bTrans: " + bTransposition);
        o.println("ng1: " + ng1);
        o.println("ng2: " + ng2);

        IndexReader ir = IndexReader.open(gi);
        IndexSearcher searcher = new IndexSearcher(gi);
        List lis = new ArrayList(maxr);
        String[] res = suggestUsingNGrams(searcher, goal, ng1, ng2, maxr,
            bStart, bEnd, bTransposition, maxd, lis, true); // more popular
        o.println("Returned " + res.length + " from " + gi + " which has "
            + ir.numDocs() + " words in it");

        Iterator it = lis.iterator();

        while (it.hasNext()) {
          o.println(it.next().toString());
        }

        o.println();
        o.println("query: " + lastQuery.toString("contents"));

        Hits ghits = searcher.search(new TermQuery(
            new Term(F_WORD, "recursive")));

        if (ghits.length() >= 1) // umm, should only be 0 or 1
        {
          Document doc = ghits.doc(0);
          o.println("TEST DOC: " + doc);
        }

        searcher.close();
        ir.close();

        return;
      } else if (args[i].equals("-f")) {
        field = args[++i];
      } else {
        o.println("hmm? " + args[i]);
        System.exit(1);
      }
    }

    if (name == null) {
      // FIX: corrected "opps" typo in user-facing message
      o.println("oops, you need to specify the input index w/ -i");
      System.exit(1);
    }

    o.println("Opening " + name);
    IndexReader.unlock(FSDirectory.getDirectory(name, false));

    final IndexReader r = IndexReader.open(name);

    o.println("Docs: " + nf.format(r.numDocs()));
    o.println("Using field: " + field);

    IndexWriter writer = new IndexWriter(out, new WhitespaceAnalyzer(), true);
    writer.setMergeFactor(writer.getMergeFactor() * 50);
    writer.setMaxBufferedDocs(writer.getMaxBufferedDocs() * 50);

    o.println("Forming index from " + name + " to " + out);

    int res = formNGramIndex(r, writer, ng1, ng2, field, minThreshold);

    o.println("done, did " + res + " ngrams");
    writer.optimize();
    writer.close();
    r.close();
  }

  /**
   * Using an NGram algorithm try to find alternate spellings for a "goal"
   * word based on the ngrams in it.
   *
   * @param searcher
   *          the searcher for the "ngram" index
   *
   * @param goal
   *          the word you want a spell check done on
   *
   * @param ng1
   *          the min ngram length to use, probably 3 and it defaults to 3 if
   *          you pass in a value &lt;= 0
   *
   * @param ng2
   *          the max ngram length to use, probably 3 or 4
   *
   * @param maxr
   *          max results to return, probably a small number like 5 for normal
   *          use or 10-100 for testing
   *
   * @param bStart
   *          how to boost matches that start the same way as the goal word,
   *          probably greater than 2
   *
   * @param bEnd
   *          how to boost matches that end the same way as the goal word,
   *          probably greater than or equal to 1
   *
   * @param bTransposition
   *          how to boost matches that are also simple transpositions, or 0
   *          to disable
   *
   * @param maxd
   *          filter for the max Levenshtein string distance for matches,
   *          probably a number like 3, 4, or 5, or use 0 for it to be
   *          ignored. This prevents words radically longer but similar to the
   *          goal word from being returned.
   *
   * @param details
   *          if non null is a list with one entry per match. Each entry is a
   *          SpellSuggestionDetails (word, score, Levenshtein string
   *          distance, word freq, ngram match counts).
   *
   * @param morePopular
   *          if true says to only return suggestions more popular than the
   *          misspelled word. This prevents rare words from being suggested.
   *          Note that for words that don't appear in the index at all this
   *          has no effect as those words will have a frequency of 0 anyway.
   *
   * @return the strings suggested with the best one first
   */
  public static String[] suggestUsingNGrams(Searcher searcher, String goal,
      int ng1, int ng2, int maxr, float bStart, float bEnd,
      float bTransposition, int maxd, List details, boolean morePopular)
      throws Throwable {
    List res = new ArrayList(maxr);
    BooleanQuery query = new BooleanQuery();

    if (ng1 <= 0) {
      ng1 = 3; // guess
    }

    if (ng2 < ng1) {
      ng2 = ng1;
    }

    if (bStart < 0) {
      bStart = 0;
    }

    if (bEnd < 0) {
      bEnd = 0;
    }

    if (bTransposition < 0) {
      bTransposition = 0;
    }

    // calculate table of all ngrams for goal word; entries may be empty
    // arrays when the goal word is shorter than the ngram length
    String[][] gramt = new String[ng2 + 1][];

    for (int ng = ng1; ng <= ng2; ng++)
      gramt[ng] = formGrams(goal, ng);

    int goalFreq = 0;

    if (morePopular) {
      Hits ghits = searcher.search(new TermQuery(new Term(F_WORD, goal)));

      if (ghits.length() >= 1) // umm, should only be 0 or 1
      {
        Document doc = ghits.doc(0);
        goalFreq = Integer.parseInt(doc.get(F_FREQ));
      }
    }

    if (bTransposition > 0) {
      add(query, F_TRANSPOSITION, goal, bTransposition);
    }

    TRStringDistance sd = new TRStringDistance(goal);

    for (int ng = ng1; ng <= ng2; ng++) // for every ngram in range
    {
      String[] grams = gramt[ng]; // form word into ngrams (allow dups too)

      if (grams.length == 0) {
        continue; // goal word too short for this ngram length
      }

      String key = "gram" + ng; // form key

      if (bStart > 0) { // should we boost prefixes?
        add(query, "start" + ng, grams[0], bStart); // matches start of word
      }

      if (bEnd > 0) { // should we boost suffixes
        add(query, "end" + ng, grams[grams.length - 1], bEnd); // matches end
                                                               // of word
      }

      // match ngrams anywhere, w/o a boost
      for (int i = 0; i < grams.length; i++) {
        add(query, key, grams[i]);
      }
    }

    Hits hits = searcher.search(query);
    int len = hits.length();
    int remain = maxr;
    int stop = Math.min(len, 100 * maxr); // go thru more than 'maxr' matches
                                          // in case the distance filter
                                          // triggers

    for (int i = 0; (i < stop) && (remain > 0); i++) {
      Document d = hits.doc(i);
      String word = d.get(F_WORD); // get orig word

      if (word.equals(goal)) {
        continue; // don't suggest a word for itself, that would be silly
      }

      int dist = sd.getDistance(word); // use distance filter

      if ((maxd > 0) && (dist > maxd)) {
        continue;
      }

      int suggestionFreq = Integer.parseInt(d.get(F_FREQ));

      if (morePopular && (goalFreq > suggestionFreq)) {
        continue; // don't suggest a rarer word
      }

      remain--;
      res.add(word);

      if (details != null) // only non-null for testing probably
      {
        // count how many ngrams of each size the suggestion shares with
        // the goal word, for diagnostic output
        int[] matches = new int[ng2 + 1];

        for (int ng = ng1; ng <= ng2; ng++) {
          String[] have = formGrams(word, ng);
          int match = 0;
          String[] cur = gramt[ng];

          for (int k = 0; k < have.length; k++) {
            boolean looking = true;

            for (int j = 0; (j < cur.length) && looking; j++) {
              if (have[k].equals(cur[j])) {
                match++;
                looking = false;
              }
            }
          }

          matches[ng] = match;
        }

        details.add(new SpellSuggestionDetails(word, hits.score(i), dist,
            suggestionFreq, matches, ng1));
      }
    }

    lastQuery = query; // hack for now

    return (String[]) res.toArray(new String[0]);
  }

  /**
   * Go thru all terms and form an index of the "ngrams" of length 'ng1' to
   * 'ng2' in each term. The ngrams have field names like "gram3" for a 3 char
   * ngram, and "gram4" for a 4 char one. The starting and ending (or prefix
   * and suffix) "n" characters are also stored for each word with field names
   * "start3" and "end3".
   *
   * @param r
   *          the index to read terms from
   *
   * @param _w
   *          the writer to write the ngrams to, or if null an index named
   *          "gram_index" will be created. If you pass in non-null then you
   *          should optimize and close the index.
   *
   * @param ng1
   *          the min number of chars to form ngrams with (3 is suggested)
   *
   * @param ng2
   *          the max number of chars to form ngrams with, can be equal to ng1
   *
   * @param field
   *          the field name to process ngrams from
   *
   * @param minThreshold
   *          terms must appear in at least this many docs else they're
   *          ignored as the assumption is that they're so rare (...)
   *
   * @return the number of ngrams added
   */
  private static int formNGramIndex(IndexReader r, IndexWriter _w, int ng1,
      int ng2, String field, int minThreshold) throws IOException {
    int mins = 0;
    float nudge = 0.01f; // don't allow boosts to be too small
    IndexWriter w;

    if (_w == null) {
      w = new IndexWriter("gram_index", new WhitespaceAnalyzer(), // should
          // have no effect
          true);
    } else {
      w = _w;
    }

    int mod = 1000; // for status
    int nd = r.numDocs();
    final float base = (float) Math.log(1.0d / ((double) nd));

    if (field == null) {
      field = "contents"; // def field
    }

    field = field.intern(); // is it doced that you can use == on fields?

    int grams = 0; // # of ngrams added
    final TermEnum te = r.terms(new Term(field, ""));
    int n = 0;
    int skips = 0;

    while (te.next()) {
      boolean show = false; // for debugging
      Term t = te.term();
      String have = t.field();

      if ((have != field) && !have.equals(field)) // wrong field
      {
        break;
      }

      if (t.text().indexOf("-") >= 0) {
        continue;
      }

      int df = te.docFreq();

      if ((++n % mod) == 0) {
        show = true;
        o.println("term: " + t + " n=" + nf.format(n) + " grams="
            + nf.format(grams) + " mins=" + nf.format(mins) + " skip="
            + nf.format(skips) + " docFreq=" + df);
      }

      if (df < minThreshold) // not freq enough, too rare to consider
      {
        mins++;

        continue;
      }

      String text = t.text();
      int len = text.length();

      if (len < ng1) {
        continue; // too short we bail but "too long" is fine...
      }

      // but note that long tokens that are rare prob won't get here anyway as
      // they won't pass the 'minThreshold' check above
      Document doc = new Document();
      doc.add(new Field(F_WORD, text, Field.Store.YES, Field.Index.UN_TOKENIZED)); // orig term
      doc.add(new Field(F_FREQ, "" + df, Field.Store.YES, Field.Index.UN_TOKENIZED)); // for popularity cutoff option

      String[] trans = formTranspositions(text);

      for (int i = 0; i < trans.length; i++)
        doc.add(new Field(F_TRANSPOSITION, trans[i], Field.Store.YES, Field.Index.UN_TOKENIZED));

      // now loop thru all ngrams of lengths 'ng1' to 'ng2'
      for (int ng = ng1; ng <= ng2; ng++) {
        String key = "gram" + ng;
        String end = null;

        for (int i = 0; i < (len - ng + 1); i++) {
          String gram = text.substring(i, i + ng);
          doc.add(new Field(key, gram, Field.Store.YES, Field.Index.UN_TOKENIZED));

          if (i == 0) {
            doc.add(new Field("start" + ng, gram, Field.Store.YES, Field.Index.UN_TOKENIZED));
          }

          end = gram;
          grams++;
        }

        if (end != null) { // may not be present if len==ng1
          doc.add(new Field("end" + ng, end, Field.Store.YES, Field.Index.UN_TOKENIZED));
        }
      }

      float f1 = te.docFreq();
      float f2 = nd;

      float bo = (float) ((Math.log(f1) / Math.log(f2)) + nudge);
      doc.setBoost(bo);

      if (show) {
        o.println("f1=" + f1 + " nd=" + nd + " boost=" + bo + " base=" + base
            + " word=" + text);
      }

      w.addDocument(doc);
    }

    if (_w == null) // else you have to optimize/close
    {
      w.optimize();
      w.close();
    }

    return grams;
  }

  /**
   * Add a boosted clause to a boolean query.
   */
  private static void add(BooleanQuery q, String k, String v, float boost) {
    Query tq = new TermQuery(new Term(k, v));
    tq.setBoost(boost);
    q.add(new BooleanClause(tq, BooleanClause.Occur.SHOULD));
  }

  /**
   * Form all single-transposition variants of a word (adjacent character
   * pairs swapped), skipping swaps of identical characters.
   *
   * @param s
   *          the word to transpose
   * @return the transposed variants, possibly empty but never null
   */
  public static String[] formTranspositions(String s) {
    int len = s.length();

    // FIX: guard short input — the original passed (len - 1) straight to the
    // ArrayList constructor, which throws IllegalArgumentException for ""
    if (len < 2) {
      return new String[0];
    }

    List res = new ArrayList(len - 1);

    for (int i = 0; i < (len - 1); i++) {
      char c1 = s.charAt(i);
      char c2 = s.charAt(i + 1);

      if (c1 == c2) {
        continue;
      }

      res.add(s.substring(0, i) + c2 + c1 + s.substring(i + 2));
    }

    return (String[]) res.toArray(new String[0]);
  }

  /**
   * Form all ngrams for a given word.
   *
   * @param text
   *          the word to parse
   * @param ng
   *          the ngram length e.g. 3
   * @return an array of all ngrams in the word and note that duplicates are
   *         not removed; empty (never null) when the word is shorter than ng
   */
  public static String[] formGrams(String text, int ng) {
    int len = text.length();

    // FIX: guard short input — the original passed (len - ng + 1) straight to
    // the ArrayList constructor, which throws IllegalArgumentException when
    // the word is more than one char shorter than the ngram length; callers
    // (suggestUsingNGrams) already expect an empty array in that case
    if (len < ng) {
      return new String[0];
    }

    List res = new ArrayList(len - ng + 1);

    for (int i = 0; i < (len - ng + 1); i++) {
      res.add(text.substring(i, i + ng));
    }

    return (String[]) res.toArray(new String[0]);
  }

  /**
   * Add an unboosted clause to a boolean query.
   */
  private static void add(BooleanQuery q, String k, String v) {
    q.add(new BooleanClause(new TermQuery(new Term(k, v)), BooleanClause.Occur.SHOULD));
  }

  /**
   * Levenshtein string distance. Presumably this is implemented somewhere in
   * the apache/jakarta/commons area but I couldn't find it.
   *
   * @link http://www.merriampark.com/ld.htm
   */
  private static class TRStringDistance {
    final char[] sa;

    final int n;

    // per-target-length matrix cache to avoid reallocating on every call
    final int[][][] cache = new int[30][][];

    /**
     * Optimized to run a bit faster than the static getDistance(). In one
     * benchmark times were 5.3sec using ctr vs 8.5sec w/ static method, thus
     * 37% faster.
     */
    private TRStringDistance(String target) {
      sa = target.toCharArray();
      n = sa.length;
    }

    // *****************************
    // Compute Levenshtein distance
    // *****************************
    public int getDistance(String other) {
      int[][] d; // matrix
      int cost; // cost

      // Step 1
      final char[] ta = other.toCharArray();
      final int m = ta.length;

      if (n == 0) {
        return m;
      }

      if (m == 0) {
        return n;
      }

      if (m >= cache.length) {
        d = form(n, m);
      } else if (cache[m] != null) {
        d = cache[m];
      } else {
        d = cache[m] = form(n, m);
      }

      // Step 3
      for (int i = 1; i <= n; i++) {
        final char s_i = sa[i - 1];

        // Step 4
        for (int j = 1; j <= m; j++) {
          final char t_j = ta[j - 1];

          // Step 5
          if (s_i == t_j) { // same
            cost = 0;
          } else { // not a match
            cost = 1;
          }

          // Step 6
          d[i][j] = min3(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1]
              + cost);
        }
      }

      // Step 7
      return d[n][m];
    }

    /**
     * Allocate and border-initialize an (n+1) x (m+1) distance matrix.
     */
    private static int[][] form(int n, int m) {
      int[][] d = new int[n + 1][m + 1];

      // Step 2
      for (int i = 0; i <= n; i++)
        d[i][0] = i;

      for (int j = 0; j <= m; j++)
        d[0][j] = j;

      return d;
    }

    // ****************************
    // Get minimum of three values
    // ****************************
    private static int min3(int a, int b, int c) {
      int mi;

      mi = a;

      if (b < mi) {
        mi = b;
      }

      if (c < mi) {
        mi = c;
      }

      return mi;
    }

    // *****************************
    // Compute Levenshtein distance
    // *****************************
    public static int getDistance(String s, String t) {
      return getDistance(s.toCharArray(), t.toCharArray());
    }

    // *****************************
    // Compute Levenshtein distance
    // *****************************
    public static int getDistance(final char[] sa, final char[] ta) {
      int[][] d; // matrix
      int i; // iterates through s
      int j; // iterates through t
      char s_i; // ith character of s
      char t_j; // jth character of t
      int cost; // cost

      // Step 1
      final int n = sa.length;
      final int m = ta.length;

      if (n == 0) {
        return m;
      }

      if (m == 0) {
        return n;
      }

      d = new int[n + 1][m + 1];

      // Step 2
      for (i = 0; i <= n; i++) {
        d[i][0] = i;
      }

      for (j = 0; j <= m; j++) {
        d[0][j] = j;
      }

      // Step 3
      for (i = 1; i <= n; i++) {
        s_i = sa[i - 1];

        // Step 4
        for (j = 1; j <= m; j++) {
          t_j = ta[j - 1];

          // Step 5
          if (s_i == t_j) {
            cost = 0;
          } else {
            cost = 1;
          }

          // Step 6
          d[i][j] = min3(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1]
              + cost);
        }
      }

      // Step 7
      return d[n][m];
    }
  }

  /* Added by Andy Liu for Nutch */
  public static class SpellSuggestionDetails {
    public String word;

    public double score;

    public int dist;

    public int docFreq;

    public int[] matches;

    public int ng1;

    public SpellSuggestionDetails(String word, double score, int dist,
        int docFreq, int[] matches, int ng1) {
      super();
      this.word = word;
      this.score = score;
      this.dist = dist;
      this.docFreq = docFreq;
      this.matches = matches;
      this.ng1 = ng1;
    }

    public String toString() {
      StringBuffer buf = new StringBuffer("word=" + word + " score=" + score
          + " dist=" + dist + " freq=" + docFreq + "\n");

      for (int j = ng1; j < matches.length; j++)
        buf.append("\tmm[ " + j + " ] = " + matches[j]);

      return buf.toString();
    }
  }
}
http://svn.apache.org/viewvc/lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/apache/nutch/spell/SpellCheckerBean.java?rev=410885&view=auto ============================================================================== --- lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/apache/nutch/spell/SpellCheckerBean.java (added) +++ lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/apache/nutch/spell/SpellCheckerBean.java Thu Jun 1 09:52:23 2006 @@ -0,0 +1,328 @@ +/** + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.spell; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.logging.Logger; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.util.LogFormatter; + +import org.apache.lucene.search.IndexSearcher; +import org.apache.nutch.searcher.Query; +import org.apache.nutch.util.NutchConfiguration; + +/** + * Parses queries and sends them to NGramSpeller for spell checking. 
+ * + * @author Andy Liu <[EMAIL PROTECTED]> + */ +public class SpellCheckerBean { + public static final Logger LOG = LogFormatter + .getLogger(SpellCheckerBean.class.toString()); + + IndexSearcher spellingSearcher; + + // + // Configuration parameters used by NGramSpeller . Hardcoded for now. + // + final int minThreshold = 5; + + final int ng1 = 3; + + final int ng2 = 4; + + final int maxr = 10; + + final int maxd = 5; + + final float bStart = 2.0f; + + final float bEnd = 1.0f; + + final float bTransposition = 6.5f; + + // configuration variable names + public static final String SPELLING_INDEX_LOCATION = "spell.index.dir"; + + public static final String SPELLING_DOCFREQ_THRESHOLD = "spell.docfreq.threshold"; + + public static final String SPELLING_DOCFREQ_THRESHOLD_FACTOR = "spell.docfreq.threshold.factor"; + + String indexLocation; + + int threshold; + + int thresholdFactor; + + Configuration conf; + + public SpellCheckerBean(Configuration conf) { + this.conf=conf; + indexLocation = conf.get(SPELLING_INDEX_LOCATION, "./spelling"); + threshold = conf.getInt(SPELLING_DOCFREQ_THRESHOLD, 100); + thresholdFactor = conf.getInt(SPELLING_DOCFREQ_THRESHOLD_FACTOR, 10); + try { + spellingSearcher = new IndexSearcher(indexLocation); + } catch (IOException ioe) { + LOG.info("error opening spell checking index"); + ioe.printStackTrace(System.out); + } + } + + /** Cache in Configuration. 
*/ + public static SpellCheckerBean get(Configuration conf) { + SpellCheckerBean spellCheckerBean = (SpellCheckerBean) conf + .getObject(SpellCheckerBean.class.getName()); + + if (spellCheckerBean == null) { + LOG.info("creating new spell checker bean"); + spellCheckerBean = new SpellCheckerBean(conf); + conf.setObject(SpellCheckerBean.class.getName(), spellCheckerBean); + } + return spellCheckerBean; + } + + public SpellCheckerTerms checkSpelling(Query query, String queryString) { + + return checkSpelling(query, queryString, threshold, thresholdFactor); + } + + /** + * Parses original query, retrieves suggestions from ngrams spelling index + * + * @param originalQuery + * Query to be spell-checked + * @param docFreqThreshold + * Terms in query that have a docFreq lower than this threshold + * qualify as "mispelled" + * @param factorThreshold + * The suggested term must have a docFreq at least factorThreshold + * times more than the mispelled term. Set to 1 to disable. + * @return terms with corrected spelling + */ + public SpellCheckerTerms checkSpelling(Query query, String queryString, + int docFreqThreshold, int factorThreshold) { + SpellCheckerTerms spellCheckerTerms = null; + + try { + spellCheckerTerms = parseOriginalQuery(query, queryString); + + for (int i = 0; i < spellCheckerTerms.size(); i++) { + SpellCheckerTerm currentTerm = spellCheckerTerms.getSpellCheckerTerm(i); + String originalTerm = currentTerm.getOriginalTerm(); + + spellCheckerTerms.getSpellCheckerTerm(i).setOriginalDocFreq( + getDocFreq(originalTerm)); + + // + // Spell checking is not effective for words under 4 letters long + // Any words over 25 letters long isn't worth checking either. 
+ // + if (originalTerm.length() < 4) + continue; + + if (originalTerm.length() > 25) + continue; + + List lis = new ArrayList(maxr); + + String[] suggestions = NGramSpeller.suggestUsingNGrams(spellingSearcher + , originalTerm, ng1, ng2, maxr, bStart, bEnd, + bTransposition, maxd, lis, true); + + Iterator it = lis.iterator(); + + while (it.hasNext()) { + LOG.fine(it.next().toString()); + } + + if (suggestions.length > 0) { + currentTerm.setSuggestedTerm(suggestions[0]); + + if (lis != null) { + NGramSpeller.SpellSuggestionDetails detail = (NGramSpeller.SpellSuggestionDetails) lis + .get(0); + currentTerm.setSuggestedTermDocFreq(detail.docFreq); + } + + // We use document frequencies of the original term and the suggested + // term to guess + // whether or not a term is mispelled. The criteria is as follows: + // + // 1. The term's document frequency must be under a constant threshold + // 2. The suggested term's docFreq must be greater than the original + // term's docFreq * constant factor + // + if ((currentTerm.originalDocFreq < docFreqThreshold) + && ((currentTerm.originalDocFreq * factorThreshold) < (currentTerm.suggestedTermDocFreq))) { + spellCheckerTerms.setHasMispelledTerms(true); + currentTerm.setMispelled(true); + } + } + + } + + } catch (Throwable t) { + t.printStackTrace(); + } + + return spellCheckerTerms; + } + + /** + * + * Parses the query and preserves characters and formatting surrounding terms + * to be spell-checked. This is done so that we can present the query in the + * "Did you mean: XYZ" message in the same format the user originally typed + * it. 
+ * + * @param originalQuery + * text to be parsed + * @return spell checker terms + */ + public SpellCheckerTerms parseOriginalQuery(Query query, String queryString) + throws IOException { + String[] terms = query.getTerms(); + SpellCheckerTerms spellCheckerTerms = new SpellCheckerTerms(); + + int previousTermPos = 0; + for (int i = 0; i < terms.length; i++) { + + int termPos = queryString.toLowerCase().indexOf(terms[i]); + + String charsBefore = ""; + String charsAfter = ""; + + // Is this the first term? If so, we need to check for characters + // before the first term. + if (i == 0) { + + if (termPos > 0) { + charsBefore = queryString.substring(0, termPos); + } + + // We're in-between terms... + } else { + int endOfLastTerm = previousTermPos + terms[i - 1].length(); + + if (endOfLastTerm < termPos) { + charsBefore = queryString.substring(endOfLastTerm, termPos); + } + } + + // Is this the last term? If so, we need to check for characters + // after the last term. + if (i == (terms.length - 1)) { + + int endOfCurrentTerm = termPos + terms[i].length(); + + if (endOfCurrentTerm < queryString.length()) { + charsAfter = queryString.substring(endOfCurrentTerm, queryString + .length()); + } + + } + + previousTermPos = termPos; + + spellCheckerTerms.add(new SpellCheckerTerm(terms[i], charsBefore, + charsAfter)); + + } + + return spellCheckerTerms; + + } + + public SpellCheckerTerms parseOriginalQuery(String queryString) + throws IOException { + return parseOriginalQuery(Query.parse(queryString, conf), queryString); + } + + /** + * Retrieves docFreq as stored within spelling index. Alternatively, we could + * simply consult the main index for a docFreq() of a term (which would be + * faster) but it's nice to have a separate, spelling index that can stand on + * its own. 
 + * + * @param term + * @return document frequency of term + */ + private int getDocFreq(String term) throws IOException { + /* + * Hits hits = this.spellingSearcher.getLuceneSearcher().search(new + * TermQuery(new Term( NGramSpeller.F_WORD, term))); if (hits.length() > 0) { + * Document doc = hits.doc(0); String docFreq = + * doc.get(NGramSpeller.F_FREQ); return Integer.parseInt(docFreq); } + */ + return 0; + } + + public static void main(String[] args) throws Throwable { + if (args.length < 1) { + System.out.println("usage: SpellCheckerBean [ngrams spelling index]"); + return; + } + + Configuration conf = NutchConfiguration.create(); + + conf.set("spell.index.dir", args[0]); + + SpellCheckerBean checker = new SpellCheckerBean(conf); + BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); + + String line; + + while ((line = in.readLine()) != null) { + Query query = Query.parse(line, conf); + SpellCheckerTerms terms = checker.checkSpelling(query, line); + StringBuffer buf = new StringBuffer(); + + for (int i = 0; i < terms.size(); i++) { + SpellCheckerTerm currentTerm = terms.getSpellCheckerTerm(i); + buf.append(currentTerm.getCharsBefore()); + + if (currentTerm.isMispelled()) { + buf.append(currentTerm.getSuggestedTerm()); + } else { + buf.append(currentTerm.getOriginalTerm()); + } + buf.append(currentTerm.getCharsAfter()); + } + + System.out.println("Spell checked: " + buf); + } + } + + public void init() { + //do initialization here + } + + public String[] suggest(Query query) { + // TODO Auto-generated method stub + return null; + } + + public String getID() { + return "SPELLER"; + } +} Added: lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/apache/nutch/spell/SpellCheckerTerm.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/apache/nutch/spell/SpellCheckerTerm.java?rev=410885&view=auto ============================================================================== --- 
lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/apache/nutch/spell/SpellCheckerTerm.java (added) +++ lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/apache/nutch/spell/SpellCheckerTerm.java Thu Jun 1 09:52:23 2006 @@ -0,0 +1,103 @@ +/** + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.spell; + +public class SpellCheckerTerm { + String originalTerm = null; + + String charsBefore = null; + + String charsAfter = null; + + String suggestedTerm = null; + + int originalDocFreq = -1; + + int suggestedTermDocFreq = -1; + + boolean isMispelled = false; + + public SpellCheckerTerm(String originalTerm, String charsBefore, + String charsAfter) { + super(); + this.originalTerm = originalTerm; + this.charsBefore = charsBefore; + this.charsAfter = charsAfter; + } + + public String getCharsAfter() { + return charsAfter; + } + + public void setCharsAfter(String charsAfter) { + this.charsAfter = charsAfter; + } + + public String getCharsBefore() { + return charsBefore; + } + + public void setCharsBefore(String charsBefore) { + this.charsBefore = charsBefore; + } + + public int getOriginalDocFreq() { + return originalDocFreq; + } + + public void setOriginalDocFreq(int originalDocFreq) { + this.originalDocFreq = originalDocFreq; + } + + public String getOriginalTerm() { + return originalTerm; + } + + public void 
setOriginalTerm(String originalTerm) { + this.originalTerm = originalTerm; + } + + public int getSuggestedTermDocFreq() { + return suggestedTermDocFreq; + } + + public void setSuggestedTermDocFreq(int suggestedTermDocFreq) { + this.suggestedTermDocFreq = suggestedTermDocFreq; + } + + public String getSuggestedTerm() { + return suggestedTerm; + } + + public void setSuggestedTerm(String suggestedTerm) { + this.suggestedTerm = suggestedTerm; + } + + public boolean isMispelled() { + return isMispelled; + } + + public void setMispelled(boolean isMispelled) { + this.isMispelled = isMispelled; + } + + public String toString() { + return "[" + originalTerm + ", " + charsBefore + ", " + charsAfter + ", " + + suggestedTerm + ", " + originalDocFreq + ", " + suggestedTermDocFreq + + ", " + isMispelled + "]"; + } +} Added: lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/apache/nutch/spell/SpellCheckerTerms.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/apache/nutch/spell/SpellCheckerTerms.java?rev=410885&view=auto ============================================================================== --- lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/apache/nutch/spell/SpellCheckerTerms.java (added) +++ lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/apache/nutch/spell/SpellCheckerTerms.java Thu Jun 1 09:52:23 2006 @@ -0,0 +1,66 @@ +/** + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.spell; + +import java.util.ArrayList; + +public class SpellCheckerTerms { + boolean hasMispelledTerms = false; + ArrayList terms=new ArrayList(); + + public SpellCheckerTerm getSpellCheckerTerm(int i) { + return (SpellCheckerTerm) terms.get(i); + } + + public ArrayList getTerms(){ + return terms; + } + + public String getSpellCheckedQuery() { + StringBuffer buf = new StringBuffer(); + for (int i = 0; i < terms.size(); i++) { + SpellCheckerTerm term = getSpellCheckerTerm(i); + buf.append(term.charsBefore); + if (term.isMispelled) + buf.append(term.suggestedTerm); + else + buf.append(term.originalTerm); + buf.append(term.charsAfter); + } + return buf.toString(); + } + + public boolean getHasMispelledTerms() { + return hasMispelledTerms; + } + + public void setHasMispelledTerms(boolean hasMispelledTerms) { + this.hasMispelledTerms = hasMispelledTerms; + } + + public Object get(int index) { + return terms.get(index); + } + + public int size() { + return terms.size(); + } + + public boolean add(Object arg0) { + return terms.add(arg0); + } +} Added: lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/apache/nutch/webapp/controller/SpellCheckController.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/apache/nutch/webapp/controller/SpellCheckController.java?rev=410885&view=auto ============================================================================== --- 
lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/apache/nutch/webapp/controller/SpellCheckController.java (added) +++ lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/apache/nutch/webapp/controller/SpellCheckController.java Thu Jun 1 09:52:23 2006 @@ -0,0 +1,49 @@ +package org.apache.nutch.webapp.controller; + +import java.io.IOException; + +import javax.servlet.ServletContext; +import javax.servlet.ServletException; +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletResponse; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.spell.SpellCheckerBean; +import org.apache.nutch.spell.SpellCheckerTerms; +import org.apache.nutch.webapp.common.SearchForm; +import org.apache.nutch.webapp.common.ServiceLocator; +import org.apache.nutch.webapp.common.Startable; +import org.apache.nutch.webapp.controller.NutchController; +import org.apache.struts.tiles.ComponentContext; + +public class SpellCheckController extends NutchController implements Startable { + + SpellCheckerBean spellCheckerBean=null; + + public static final String ATTR_SPELL_TERMS="spellCheckerTerms"; + public static final String ATTR_SPELL_QUERY="spellCheckerQuery"; + + public void nutchPerform(ComponentContext tileContext, + HttpServletRequest request, HttpServletResponse response, + ServletContext servletContext) throws ServletException, IOException { + ServiceLocator serviceLocator=getServiceLocator(request); + + SpellCheckerTerms spellCheckerTerms = null; + if (spellCheckerBean != null) { + spellCheckerTerms = spellCheckerBean.checkSpelling(serviceLocator.getSearch().getQuery(), serviceLocator.getSearch().getQueryString()); + } + + SearchForm form=(SearchForm)serviceLocator.getSearchForm().clone(); + if (spellCheckerTerms != null) form.setValue(SearchForm.NAME_QUERYSTRING,spellCheckerTerms.getSpellCheckedQuery()); + String spellQuery = form.getParameterString("utf-8"); + + request.setAttribute(ATTR_SPELL_TERMS, 
spellCheckerTerms); + request.setAttribute(ATTR_SPELL_QUERY, spellQuery); + } + + public void start(ServletContext servletContext) { + Configuration conf=getServiceLocator(servletContext).getConfiguration(); + LOG.info("Initializing spellchecker"); + spellCheckerBean = SpellCheckerBean.get(conf); + } +} Added: lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/web/web-query-propose-spellcheck/propose.jsp URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/web/web-query-propose-spellcheck/propose.jsp?rev=410885&view=auto ============================================================================== --- lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/web/web-query-propose-spellcheck/propose.jsp (added) +++ lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/web/web-query-propose-spellcheck/propose.jsp Thu Jun 1 09:52:23 2006 @@ -0,0 +1,20 @@ +<%@ page session="false"%> +<%@ taglib prefix="tiles" uri="http://jakarta.apache.org/struts/tags-tiles"%> +<%@ taglib prefix="c" uri="http://java.sun.com/jstl/core"%> +<%@ taglib prefix="fmt" uri="http://java.sun.com/jstl/fmt"%> +<c:if test="${spellCheckerTerms!=null && spellCheckerTerms.hasMispelledTerms}"> + <p>Did you mean <a href="search.do?<c:out value="${spellCheckerQuery}"/>"> + <c:forEach + var="currentTerm" items="${spellCheckerTerms.terms}"> + <c:out value="${currentTerm.charsBefore}" /> + <c:choose> + <c:when test="${currentTerm.mispelled}"> + <i><b> <c:out value="${currentTerm.suggestedTerm}" /> </b></i> + </c:when> + <c:otherwise> + <c:out value="${currentTerm.originalTerm}" /> + </c:otherwise> + </c:choose> + <c:out value="${currentTerm.charsAfter}" /> + </c:forEach> </a></p> +</c:if>