Author: siren
Date: Thu Jun 1 09:52:23 2006
New Revision: 410885
URL: http://svn.apache.org/viewvc?rev=410885&view=rev
Log:
initial import of spellcheck query proposer
Added:
lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/
lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/build.xml
lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/plugin.xml
lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/
lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/conf/
lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/conf/tiles-defs.xml
lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/
lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/
lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/apache/
lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/apache/nutch/
lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/apache/nutch/spell/
lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/apache/nutch/spell/NGramSpeller.java
lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/apache/nutch/spell/SpellCheckerBean.java
lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/apache/nutch/spell/SpellCheckerTerm.java
lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/apache/nutch/spell/SpellCheckerTerms.java
lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/apache/nutch/webapp/
lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/apache/nutch/webapp/controller/
lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/apache/nutch/webapp/controller/SpellCheckController.java
lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/resources/
lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/test/
lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/web/
lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/web/web-query-propose-spellcheck/
lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/web/web-query-propose-spellcheck/propose.jsp
Modified:
lucene/nutch/trunk/contrib/web2/plugins/build.xml
Modified: lucene/nutch/trunk/contrib/web2/plugins/build.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/contrib/web2/plugins/build.xml?rev=410885&r1=410884&r2=410885&view=diff
==============================================================================
--- lucene/nutch/trunk/contrib/web2/plugins/build.xml (original)
+++ lucene/nutch/trunk/contrib/web2/plugins/build.xml Thu Jun 1 09:52:23 2006
@@ -16,6 +16,7 @@
<ant dir="web-resources" target="deploy"/>
<ant dir="web-clustering" target="deploy"/>
<ant dir="web-query-propose-ontology" target="deploy"/>
+ <ant dir="web-query-propose-spellcheck" target="deploy"/>
</target>
<!-- ====================================================== -->
@@ -36,6 +37,7 @@
<ant dir="web-more" target="clean"/>
<ant dir="web-clustering" target="clean"/>
<ant dir="web-query-propose-ontology" target="clean"/>
+ <ant dir="web-query-propose-spellcheck" target="clean"/>
</target>
</project>
Added: lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/build.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/build.xml?rev=410885&view=auto
==============================================================================
--- lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/build.xml (added)
+++ lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/build.xml Thu Jun 1 09:52:23 2006
@@ -0,0 +1,19 @@
+<?xml version="1.0"?>
+<project name="web-query-propose-spellcheck" default="jar-core">
+ <import file="../build-plugin.xml" />
+ <property name="nutch.root" location="${root}/../../../../" />
+ <target name="init-plugin">
+ <echo>Copying resources templates</echo>
+ <copy todir="${build.classes}/resources">
+ <fileset dir="${resources.dir}" includes="**/*" />
+ </copy>
+ <echo>Copying UI configuration</echo>
+ <copy todir="${build.classes}">
+ <fileset dir="src/conf" includes="**/*"/>
+ </copy>
+ <echo>Copying UI templates</echo>
+ <copy todir="${deploy.dir}/web">
+ <fileset dir="src/web" includes="**/*"/>
+ </copy>
+ </target>
+</project>
\ No newline at end of file
Added: lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/plugin.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/plugin.xml?rev=410885&view=auto
==============================================================================
--- lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/plugin.xml (added)
+++ lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/plugin.xml Thu Jun 1 09:52:23 2006
@@ -0,0 +1,25 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<plugin
+ id="web-query-propose-spellcheck"
+ name="Spellcheck query proposer"
+ version="1.0.0"
+ provider-name="apache.org">
+
+ <runtime>
+ <library name="web-query-propose-spellcheck.jar">
+ <export name="*"/>
+ </library>
+ </runtime>
+
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ </requires>
+
+ <extension id="org.apache.nutch.webapp.extension.UIExtensionPoint"
+ name="Nutch ui extension point"
+ point="org.apache.nutch.webapp.extension.UIExtensionPoint">
+ <implementation id="web-query-propose-spellcheck"
+ class="org.apache.nutch.webapp.extension.UIExtension.VoidImplementation"/>
+ </extension>
+
+</plugin>
Added: lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/conf/tiles-defs.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/conf/tiles-defs.xml?rev=410885&view=auto
==============================================================================
--- lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/conf/tiles-defs.xml (added)
+++ lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/conf/tiles-defs.xml Thu Jun 1 09:52:23 2006
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE tiles-definitions PUBLIC "-//Apache Software Foundation//DTD Tiles Configuration 1.1//EN" "/WEB-INF/dtd/tiles-config_1_1.dtd">
+<tiles-definitions>
+ <definition name="propose"
+ path="/plugin/web-query-propose-spellcheck/propose.jsp"
+ controllerClass="org.apache.nutch.webapp.controller.SpellCheckController"
+ />
+</tiles-definitions>
Added: lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/apache/nutch/spell/NGramSpeller.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/apache/nutch/spell/NGramSpeller.java?rev=410885&view=auto
==============================================================================
--- lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/apache/nutch/spell/NGramSpeller.java (added)
+++ lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/apache/nutch/spell/NGramSpeller.java Thu Jun 1 09:52:23 2006
@@ -0,0 +1,907 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.spell;
+
+import org.apache.lucene.analysis.*;
+import org.apache.lucene.document.*;
+import org.apache.lucene.index.*;
+import org.apache.lucene.search.*;
+import org.apache.lucene.store.*;
+
+import java.io.*;
+
+import java.text.*;
+
+import java.util.*;
+
+/**
+ * Do spelling correction based on ngram frequency of terms in an index.
+ *
+ * Developed based on <a
+ * href="http://marc.theaimsgroup.com/?l=lucene-user&m=109474652805339&w=2">this
+ * message</a> in the lucene-user list.
+ *
+ * <p>
+ * There are two parts to this algorithm. First a ngram lookup table is formed
+ * for all terms in an index. Then suggested spelling corrections can be done
+ * based on this table.
+ * <p>
+ * The "lookup table" is actually another Lucene index. It is built by going
+ * through all terms in your original index and storing the term in a Document
+ * with all ngrams that make it up. Ngrams of length 3 and 4 are suggested.
+ * <p>
+ *
+ * In addition the prefix and suffix ngrams are stored in case you want to use a
+ * heuristic that people usually know the first few characters of a word.
+ *
+ * <p>
+ * The entry's boost is set by default to log(word_freq)/log(num_docs).
+ *
+ * <p>
+ *
+ * For a word like "kings" a {@link Document} with the following fields is made
+ * in the ngram index:
+ *
+ * <pre>
+ * word:kings
+ * gram3:kin
+ * gram3:ing
+ * gram3:ngs
+ * gram4:king
+ * gram4:ings
+ * start3:kin
+ * start4:king
+ * end3:ngs
+ * end4:ings
+ *
+ * boost: log(freq('kings'))/log(num_docs).
+ * </pre>
+ *
+ *
+ * When a lookup is done a query is formed with all ngrams in the misspelled
+ * word.
+ *
+ * <p>
+ * For a word like <code>"kingz"</code> a query is formed like this.
+ *
+ * Query: <br>
+ * <code>
+ * gram3:kin gram3:ing gram3:ngz start3:kin^B1 end3:ngz^B2 start4:king^B1 end4:ingz^B2
+ * </code>
+ * <br>
+ *
+ * Above B1 and B2 are the prefix and suffix boosts. The prefix boost should
+ * probably be >= 2.0 and the suffix boost should probably be just a little
+ * above 1.
+ *
+ * <p>
+ * <b>To build</b> the ngram index based on the "contents" field in an existing
+ * index 'orig_index' you run the main() driver like this:<br>
+ * <code>
+ * java org.apache.lucene.spell.NGramSpeller -f contents -i orig_index -o ngram_index
+ * </code>
+ *
+ * <p>
+ * Once you build an index you can <b>perform spelling corrections using</b>
+ * {@link #suggestUsingNGrams suggestUsingNGrams(...)}.
+ *
+ *
+ * <p>
+ *
+ * To play around with the code against an index of approx 100k javadoc-generated
+ * web pages circa Sept/2004 go here: <a
+ * href='http://www.searchmorph.com/kat/spell.jsp'>http://www.searchmorph.com/kat/spell.jsp</a>.
+ *
+ * <p>
+ * Of interest might be the <a
+ * href="http://secondstring.sourceforge.net/">secondstring</a> string matching
+ * package and <a
+ * href="http://specialist.nlm.nih.gov/nls/gspell/doc/apiDoc/overview-summary.html">gspell</a>.
+ *
+ * @author <a href="mailto:dave@tropo.com?subject=NGramSpeller">David
+ * Spencer</a>
+ *
+ * Slightly modified from original version for use in Nutch project.
+ *
+ */
+public final class NGramSpeller {
+ /**
+ * Field name for each word in the ngram index.
+ */
+ public static final String F_WORD = "word";
+
+ /**
+ * Frequency, for the popularity cutoff option which says to only return
+ * suggestions that occur more frequently than the misspelled word.
+ */
+ public static final String F_FREQ = "freq";
+
+ /**
+ * Store transpositions too.
+ */
+ public static final String F_TRANSPOSITION = "transposition";
+
+ /**
+ *
+ */
+ private static final PrintStream o = System.out;
+
+ /**
+ *
+ */
+ private static final NumberFormat nf = NumberFormat.getInstance();
+
+ public static Query lastQuery;
+
+ /**
+ *
+ */
+ private NGramSpeller() {
+ }
+
+ /**
+ * Main driver, used to build an index. You probably want to invoke it like this:
+ * <br>
+ * <code>
+ * java org.apache.lucene.spell.NGramSpeller -f contents -i orig_index -o ngram_index
+ * </code>
+ */
+ public static void main(String[] args) throws Throwable {
+ int minThreshold = 5;
+ int ng1 = 3;
+ int ng2 = 4;
+ int maxr = 10;
+ int maxd = 5;
+ String out = "gram_index";
+ String gi = "gram_index";
+
+ String name = null;
+ String field = "contents";
+
+ for (int i = 0; i < args.length; i++) {
+ if (args[i].equals("-i")) {
+ name = args[++i];
+ } else if (args[i].equals("-minThreshold")) {
+ minThreshold = Integer.parseInt(args[++i]);
+ } else if (args[i].equals("-gi")) {
+ gi = args[++i];
+ } else if (args[i].equals("-o")) {
+ out = args[++i];
+ } else if (args[i].equals("-t")) { // test transpositions
+
+ String s = args[++i];
+ o.println("TRANS: " + s);
+
+ String[] ar = formTranspositions(s);
+
+ for (int j = 0; j < ar.length; j++)
+ o.println("\t" + ar[j]);
+
+ System.exit(0);
+ } else if (args[i].equals("-ng1")) {
+ ng1 = Integer.parseInt(args[++i]);
+ } else if (args[i].equals("-ng2")) {
+ ng2 = Integer.parseInt(args[++i]);
+ } else if (args[i].equals("-help") || args[i].equals("--help")
+ || args[i].equals("-h")) {
+ o.println("To form an ngram index:");
+ o
+ .println("NGramSpeller -i ORIG_INDEX -o NGRAM_INDEX [-ng1 MIN] [-ng2 MAX] [-f FIELD]");
+ o.println("Defaults are ng1=3, ng2=4, field='contents'");
+ System.exit(100);
+ } else if (args[i].equals("-q")) {
+ String goal = args[++i];
+ o.println("[NGrams] for " + goal + " from " + gi);
+
+ float bStart = 2.0f;
+ float bEnd = 1.0f;
+ float bTransposition = 0f;
+
+ o.println("bStart: " + bStart);
+ o.println("bEnd: " + bEnd);
+ o.println("bTrans: " + bTransposition);
+ o.println("ng1: " + ng1);
+ o.println("ng2: " + ng2);
+
+ IndexReader ir = IndexReader.open(gi);
+ IndexSearcher searcher = new IndexSearcher(gi);
+ List lis = new ArrayList(maxr);
+ String[] res = suggestUsingNGrams(searcher, goal, ng1, ng2, maxr,
+ bStart, bEnd, bTransposition, maxd, lis, true); // more popular
+ o.println("Returned " + res.length + " from " + gi + " which has "
+ + ir.numDocs() + " words in it");
+
+ Iterator it = lis.iterator();
+
+ while (it.hasNext()) {
+ o.println(it.next().toString());
+ }
+
+ o.println();
+ o.println("query: " + lastQuery.toString("contents"));
+
+ Hits ghits = searcher.search(new TermQuery(
+ new Term(F_WORD, "recursive")));
+
+ if (ghits.length() >= 1) // umm, should only be 0 or 1
+ {
+ Document doc = ghits.doc(0);
+ o.println("TEST DOC: " + doc);
+ }
+
+ searcher.close();
+ ir.close();
+
+ return;
+ } else if (args[i].equals("-f")) {
+ field = args[++i];
+ } else {
+ o.println("hmm? " + args[i]);
+ System.exit(1);
+ }
+ }
+
+ if (name == null) {
+ o.println("opps, you need to specify the input index w/ -i");
+ System.exit(1);
+ }
+
+ o.println("Opening " + name);
+ IndexReader.unlock(FSDirectory.getDirectory(name, false));
+
+ final IndexReader r = IndexReader.open(name);
+
+ o.println("Docs: " + nf.format(r.numDocs()));
+ o.println("Using field: " + field);
+
+ IndexWriter writer = new IndexWriter(out, new WhitespaceAnalyzer(), true);
+ writer.setMergeFactor(writer.getMergeFactor()*50);
+ writer.setMaxBufferedDocs(writer.getMaxBufferedDocs()*50);
+
+ o.println("Forming index from " + name + " to " + out);
+
+ int res = formNGramIndex(r, writer, ng1, ng2, field, minThreshold);
+
+ o.println("done, did " + res + " ngrams");
+ writer.optimize();
+ writer.close();
+ r.close();
+ }
+
+ /**
+ * Using an NGram algorithm try to find alternate spellings for a "goal" word
+ * based on the ngrams in it.
+ *
+ * @param searcher
+ * the searcher for the "ngram" index
+ *
+ * @param goal
+ * the word you want a spell check done on
+ *
+ * @param ng1
+ * the min ngram length to use, probably 3 and it defaults to 3 if
+ * you pass in a value <= 0
+ *
+ * @param ng2
+ * the max ngram length to use, probably 3 or 4
+ *
+ * @param maxr
+ * max results to return, probably a small number like 5 for normal
+ * use or 10-100 for testing
+ *
+ * @param bStart
+ * how to boost matches that start the same way as the goal word,
+ * probably greater than 2
+ *
+ * @param bEnd
+ * how to boost matches that end the same way as the goal word,
+ * probably greater than or equal to 1
+ *
+ * @param bTransposition
+ * how to boost matches that are also simple transpositions, or 0 to
+ * disable
+ *
+ * @param maxd
+ * filter for the max Levenshtein string distance for matches,
+ * probably a number like 3, 4, or 5, or use 0 for it to be ignored.
+ * This prevents words radically longer but similar to the goal word
+ * from being returned.
+ *
+ * @param details
+ * if non null is a list with one entry per match. Each entry is an
+ * array of ([String] word, [Double] score, [Integer] Levenshtein
+ * string distance, [Integer] word freq).
+ *
+ * @param morePopular
+ * if true says to only return suggestions more popular than the
+ * misspelled word. This prevents rare words from being suggested.
+ * Note that for words that don't appear in the index at all this has
+ * no effect as those words will have a frequency of 0 anyway.
+ *
+ * @return the strings suggested with the best one first
+ */
+ public static String[] suggestUsingNGrams(Searcher searcher, String goal,
+ int ng1, int ng2, int maxr, float bStart, float bEnd,
+ float bTransposition, int maxd, List details, boolean morePopular)
+ throws Throwable {
+ List res = new ArrayList(maxr);
+ BooleanQuery query = new BooleanQuery();
+
+ if (ng1 <= 0) {
+ ng1 = 3; // guess
+ }
+
+ if (ng2 < ng1) {
+ ng2 = ng1;
+ }
+
+ if (bStart < 0) {
+ bStart = 0;
+ }
+
+ if (bEnd < 0) {
+ bEnd = 0;
+ }
+
+ if (bTransposition < 0) {
+ bTransposition = 0;
+ }
+
+ // calculate table of all ngrams for goal word
+ String[][] gramt = new String[ng2 + 1][];
+
+ for (int ng = ng1; ng <= ng2; ng++)
+ gramt[ng] = formGrams(goal, ng);
+
+ int goalFreq = 0;
+
+ if (morePopular) {
+ Hits ghits = searcher.search(new TermQuery(new Term(F_WORD, goal)));
+
+ if (ghits.length() >= 1) // umm, should only be 0 or 1
+ {
+ Document doc = ghits.doc(0);
+ goalFreq = Integer.parseInt(doc.get(F_FREQ));
+ }
+ }
+
+ if (bTransposition > 0) {
+ add(query, F_TRANSPOSITION, goal, bTransposition);
+ }
+
+ TRStringDistance sd = new TRStringDistance(goal);
+
+ for (int ng = ng1; ng <= ng2; ng++) // for every ngram in range
+ {
+ String[] grams = gramt[ng]; // form word into ngrams (allow dups too)
+
+ if (grams.length == 0) {
+ continue; // hmm
+ }
+
+ String key = "gram" + ng; // form key
+
+ if (bStart > 0) { // should we boost prefixes?
+ add(query, "start" + ng, grams[0], bStart); // matches start of word
+ }
+
+ if (bEnd > 0) { // should we boost suffixes
+ add(query, "end" + ng, grams[grams.length - 1], bEnd); // matches end
+ // of word
+ }
+
+ // match ngrams anywhere, w/o a boost
+ for (int i = 0; i < grams.length; i++) {
+ add(query, key, grams[i]);
+ }
+ }
+
+ Hits hits = searcher.search(query);
+ int len = hits.length();
+ int remain = maxr;
+ int stop = Math.min(len, 100 * maxr); // go thru more than 'maxr' matches in
+ // case the distance filter triggers
+
+ for (int i = 0; (i < stop) && (remain > 0); i++) {
+ Document d = hits.doc(i);
+ String word = d.get(F_WORD); // get orig word
+
+ if (word.equals(goal)) {
+ continue; // don't suggest a word for itself, that would be silly
+ }
+
+ int dist = sd.getDistance(word); // use distance filter
+
+ if ((maxd > 0) && (dist > maxd)) {
+ continue;
+ }
+
+ int suggestionFreq = Integer.parseInt(d.get(F_FREQ));
+
+ if (morePopular && (goalFreq > suggestionFreq)) {
+ continue; // don't suggest a rarer word
+ }
+
+ remain--;
+ res.add(word);
+
+ if (details != null) // only non-null for testing probably
+ {
+ int[] matches = new int[ng2 + 1];
+
+ for (int ng = ng1; ng <= ng2; ng++) {
+ String[] have = formGrams(word, ng);
+ int match = 0;
+ String[] cur = gramt[ng];
+
+ for (int k = 0; k < have.length; k++) {
+ boolean looking = true;
+
+ for (int j = 0; (j < cur.length) && looking; j++) {
+ if (have[k].equals(cur[j])) {
+ // o.println( "\t\tmatch: " + have[ k] + " on " + word);
+ match++;
+ looking = false;
+ }
+ }
+
+ /*
+ * if ( looking) o.println( "\t\tNO MATCH: " + have[ k] + " on " +
+ * word);
+ */
+ }
+
+ matches[ng] = match;
+ }
+
+ details.add(new SpellSuggestionDetails(word, hits.score(i), dist,
+ suggestionFreq, matches, ng1));
+ }
+ }
+
+ lastQuery = query; // hack for now
+
+ return (String[]) res.toArray(new String[0]);
+ }
+
+ /**
+ * Go thru all terms and form an index of the "ngrams" of length 'ng1' to
+ * 'ng2' in each term. The ngrams have field names like "gram3" for a 3 char
+ * ngram, and "gram4" for a 4 char one. The starting and ending (or prefix and
+ * suffix) "n" characters are also stored for each word with field names
+ * "start3" and "end3".
+ *
+ *
+ * @param r
+ * the index to read terms from
+ *
+ * @param w
+ * the writer to write the ngrams to, or if null an index named
+ * "gram_index" will be created. If you pass in non-null then you
+ * should optimize and close the index.
+ *
+ * @param ng1
+ * the min number of chars to form ngrams with (3 is suggested)
+ *
+ * @param ng2
+ * the max number of chars to form ngrams with, can be equal to ng1
+ *
+ * @param field
+ * the field name to process ngrams from.
+ *
+ * @param minThreshold
+ * terms must appear in at least this many docs else they're ignored
+ * as the assumption is that they're so rare (...)
+ *
+ * @return the number of ngrams added
+ *
+ */
+ private static int formNGramIndex(IndexReader r, IndexWriter _w, int ng1,
+ int ng2, String field, int minThreshold) throws IOException {
+ int mins = 0;
+ float nudge = 0.01f; // don't allow boosts to be too small
+ IndexWriter w;
+
+ if (_w == null) {
+ w = new IndexWriter("gram_index", new WhitespaceAnalyzer(), // should have
+ // no effect
+ true);
+ } else {
+ w = _w;
+ }
+
+ int mod = 1000; // for status
+ int nd = r.numDocs();
+ final float base = (float) Math.log(1.0d / ((double) nd));
+
+ if (field == null) {
+ field = "contents"; // def field
+ }
+
+ field = field.intern(); // is it doced that you can use == on fields?
+
+ int grams = 0; // # of ngrams added
+ final TermEnum te = r.terms(new Term(field, ""));
+ int n = 0;
+ int skips = 0;
+
+ while (te.next()) {
+ boolean show = false; // for debugging
+ Term t = te.term();
+ String have = t.field();
+
+ if ((have != field) && !have.equals(field)) // wrong field
+ {
+ break;
+ }
+
+ if (t.text().indexOf("-") >= 0) {
+ continue;
+ }
+
+ int df = te.docFreq();
+
+ if ((++n % mod) == 0) {
+ show = true;
+ o.println("term: " + t + " n=" + nf.format(n) + " grams="
+ + nf.format(grams) + " mins=" + nf.format(mins) + " skip="
+ + nf.format(skips) + " docFreq=" + df);
+ }
+
+ if (df < minThreshold) // not freq enough, too rare to consider
+ {
+ mins++;
+
+ continue;
+ }
+
+ String text = t.text();
+ int len = text.length();
+
+ if (len < ng1) {
+ continue; // too short we bail but "too long" is fine...
+ }
+
+ // but note that long tokens that are rare prob won't get here anyway as
+ // they won't
+ // pass the 'minThreshold' check above
+ Document doc = new Document();
+ doc.add(new Field(F_WORD, text, Field.Store.YES, Field.Index.UN_TOKENIZED)); // orig term
+ doc.add(new Field(F_FREQ, "" + df, Field.Store.YES, Field.Index.UN_TOKENIZED)); // for popularity cutoff option
+
+ String[] trans = formTranspositions(text);
+
+ for (int i = 0; i < trans.length; i++)
+ doc.add(new Field(F_TRANSPOSITION, trans[i], Field.Store.YES, Field.Index.UN_TOKENIZED));
+
+ // now loop thru all ngrams of lengths 'ng1' to 'ng2'
+ for (int ng = ng1; ng <= ng2; ng++) {
+ String key = "gram" + ng;
+ String end = null;
+
+ for (int i = 0; i < (len - ng + 1); i++) {
+ String gram = text.substring(i, i + ng);
+ doc.add(new Field(key, gram, Field.Store.YES, Field.Index.UN_TOKENIZED));
+
+ if (i == 0) {
+ doc.add(new Field("start" + ng, gram, Field.Store.YES,
Field.Index.UN_TOKENIZED));
+ }
+
+ end = gram;
+ grams++;
+ }
+
+ if (end != null) { // may not be present if len==ng1
+ doc.add(new Field("end" + ng, end, Field.Store.YES,
Field.Index.UN_TOKENIZED));
+ }
+ }
+
+ float f1 = te.docFreq();
+ float f2 = nd;
+
+ float bo = (float) ((Math.log(f1) / Math.log(f2)) + nudge);
+ doc.setBoost(bo);
+
+ if (show) {
+ o.println("f1=" + f1 + " nd=" + nd + " boost=" + bo + " base=" + base
+ + " word=" + text);
+ }
+
+ w.addDocument(doc);
+ }
+
+ if (_w == null) // else you have to optimize/close
+ {
+ w.optimize();
+ w.close();
+ }
+
+ return grams;
+ }
+
+ /**
+ * Add a clause to a boolean query.
+ */
+ private static void add(BooleanQuery q, String k, String v, float boost) {
+ Query tq = new TermQuery(new Term(k, v));
+ tq.setBoost(boost);
+ q.add(new BooleanClause(tq, BooleanClause.Occur.SHOULD));
+ }
+
+ /**
+ *
+ */
+ public static String[] formTranspositions(String s) {
+ int len = s.length();
+ List res = new ArrayList(len - 1);
+
+ for (int i = 0; i < (len - 1); i++) {
+ char c1 = s.charAt(i);
+ char c2 = s.charAt(i + 1);
+
+ if (c1 == c2) {
+ continue;
+ }
+
+ res.add(s.substring(0, i) + c2 + c1 + s.substring(i + 2));
+ }
+
+ return (String[]) res.toArray(new String[0]);
+ }
+
+ /**
+ * Form all ngrams for a given word.
+ *
+ * @param text
+ * the word to parse
+ * @param ng
+ * the ngram length e.g. 3
+ * @return an array of all ngrams in the word and note that duplicates are not
+ * removed
+ */
+ public static String[] formGrams(String text, int ng) {
+ List res = new ArrayList(text.length() - ng + 1);
+ int len = text.length();
+
+ for (int i = 0; i < (len - ng + 1); i++) {
+ res.add(text.substring(i, i + ng));
+ }
+
+ return (String[]) res.toArray(new String[0]);
+ }
+
+ /**
+ * Add a clause to a boolean query.
+ */
+ private static void add(BooleanQuery q, String k, String v) {
+ q.add(new BooleanClause(new TermQuery(new Term(k, v)), BooleanClause.Occur.SHOULD));
+ }
+
+ /**
+ * Presumably this is implemented somewhere in the apache/jakarta/commons area
+ * but I couldn't find it.
+ *
+ * @link http://www.merriampark.com/ld.htm
+ *
+ */
+ private static class TRStringDistance {
+ final char[] sa;
+
+ final int n;
+
+ final int[][][] cache = new int[30][][];
+
+ /**
+ * Optimized to run a bit faster than the static getDistance(). In one
+ * benchmark times were 5.3sec using ctr vs 8.5sec w/ static method, thus
+ * 37% faster.
+ */
+ private TRStringDistance(String target) {
+ sa = target.toCharArray();
+ n = sa.length;
+ }
+
+ // *****************************
+ // Compute Levenshtein distance
+ // *****************************
+ public int getDistance(String other) {
+ int[][] d; // matrix
+ int cost; // cost
+
+ // Step 1
+ final char[] ta = other.toCharArray();
+ final int m = ta.length;
+
+ if (n == 0) {
+ return m;
+ }
+
+ if (m == 0) {
+ return n;
+ }
+
+ if (m >= cache.length) {
+ d = form(n, m);
+ } else if (cache[m] != null) {
+ d = cache[m];
+ } else {
+ d = cache[m] = form(n, m);
+ }
+
+ // Step 3
+ for (int i = 1; i <= n; i++) {
+ final char s_i = sa[i - 1];
+
+ // Step 4
+ for (int j = 1; j <= m; j++) {
+ final char t_j = ta[j - 1];
+
+ // Step 5
+ if (s_i == t_j) { // same
+ cost = 0;
+ } else { // not a match
+ cost = 1;
+ }
+
+ // Step 6
+ d[i][j] = min3(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1]
+ + cost);
+ }
+ }
+
+ // Step 7
+ return d[n][m];
+ }
+
+ /**
+ *
+ */
+ private static int[][] form(int n, int m) {
+ int[][] d = new int[n + 1][m + 1];
+
+ // Step 2
+ for (int i = 0; i <= n; i++)
+ d[i][0] = i;
+
+ for (int j = 0; j <= m; j++)
+ d[0][j] = j;
+
+ return d;
+ }
+
+ // ****************************
+ // Get minimum of three values
+ // ****************************
+ private static int min3(int a, int b, int c) {
+ int mi;
+
+ mi = a;
+
+ if (b < mi) {
+ mi = b;
+ }
+
+ if (c < mi) {
+ mi = c;
+ }
+
+ return mi;
+ }
+
+ // *****************************
+ // Compute Levenshtein distance
+ // *****************************
+ public static int getDistance(String s, String t) {
+ return getDistance(s.toCharArray(), t.toCharArray());
+ }
+
+ // *****************************
+ // Compute Levenshtein distance
+ // *****************************
+ public static int getDistance(final char[] sa, final char[] ta) {
+ int[][] d; // matrix
+ int i; // iterates through s
+ int j; // iterates through t
+ char s_i; // ith character of s
+ char t_j; // jth character of t
+ int cost; // cost
+
+ // Step 1
+ final int n = sa.length;
+ final int m = ta.length;
+
+ if (n == 0) {
+ return m;
+ }
+
+ if (m == 0) {
+ return n;
+ }
+
+ d = new int[n + 1][m + 1];
+
+ // Step 2
+ for (i = 0; i <= n; i++) {
+ d[i][0] = i;
+ }
+
+ for (j = 0; j <= m; j++) {
+ d[0][j] = j;
+ }
+
+ // Step 3
+ for (i = 1; i <= n; i++) {
+ s_i = sa[i - 1];
+
+ // Step 4
+ for (j = 1; j <= m; j++) {
+ t_j = ta[j - 1];
+
+ // Step 5
+ if (s_i == t_j) {
+ cost = 0;
+ } else {
+ cost = 1;
+ }
+
+ // Step 6
+ d[i][j] = min3(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1]
+ + cost);
+ }
+ }
+
+ // Step 7
+ return d[n][m];
+ }
+ }
+
+ /* Added by Andy Liu for Nutch */
+ public static class SpellSuggestionDetails {
+ public String word;
+
+ public double score;
+
+ public int dist;
+
+ public int docFreq;
+
+ public int[] matches;
+
+ public int ng1;
+
+ public SpellSuggestionDetails(String word, double score, int dist,
+ int docFreq, int[] matches, int ng1) {
+ super();
+ this.word = word;
+ this.score = score;
+ this.dist = dist;
+ this.docFreq = docFreq;
+ this.matches = matches;
+ this.ng1 = ng1;
+ }
+
+ public String toString() {
+ StringBuffer buf = new StringBuffer("word=" + word + " score=" + score
+ + " dist=" + dist + " freq=" + docFreq + "\n");
+
+ for (int j = ng1; j < matches.length; j++)
+ buf.append("\tmm[ " + j + " ] = " + matches[j]);
+
+ return buf.toString();
+ }
+ }
+}
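
For reference, here is a minimal caller sketch (not part of this commit; the
"ngram_index" path and the probe word "kingz" are illustrative) showing how an
index built by the main() driver above can be queried through
suggestUsingNGrams(). The parameter values mirror the -q branch of main():

    import java.util.ArrayList;
    import java.util.List;

    import org.apache.lucene.search.IndexSearcher;
    import org.apache.nutch.spell.NGramSpeller;

    public class NGramSpellerExample {
      public static void main(String[] args) throws Throwable {
        // open an ngram index previously built with, e.g.:
        //   java org.apache.nutch.spell.NGramSpeller -f contents -i orig_index -o ngram_index
        IndexSearcher searcher = new IndexSearcher("ngram_index");
        List details = new ArrayList();
        // ngrams of length 3-4, up to 10 results, prefix boost 2.0, suffix
        // boost 1.0, no transposition boost, max edit distance 5, and only
        // suggest words more popular than the misspelled one
        String[] suggestions = NGramSpeller.suggestUsingNGrams(searcher,
            "kingz", 3, 4, 10, 2.0f, 1.0f, 0f, 5, details, true);
        for (int i = 0; i < suggestions.length; i++)
          System.out.println(suggestions[i]); // best suggestion comes first
        searcher.close();
      }
    }
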
Added: lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/apache/nutch/spell/SpellCheckerBean.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/apache/nutch/spell/SpellCheckerBean.java?rev=410885&view=auto
==============================================================================
--- lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/apache/nutch/spell/SpellCheckerBean.java (added)
+++ lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/apache/nutch/spell/SpellCheckerBean.java Thu Jun 1 09:52:23 2006
@@ -0,0 +1,328 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.spell;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.logging.Logger;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.util.LogFormatter;
+
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.nutch.searcher.Query;
+import org.apache.nutch.util.NutchConfiguration;
+
+/**
+ * Parses queries and sends them to NGramSpeller for spell checking.
+ *
+ * @author Andy Liu <[EMAIL PROTECTED]>
+ */
+public class SpellCheckerBean {
+ public static final Logger LOG = LogFormatter
+ .getLogger(SpellCheckerBean.class.toString());
+
+ IndexSearcher spellingSearcher;
+
+ //
+ // Configuration parameters used by NGramSpeller. Hardcoded for now.
+ //
+ final int minThreshold = 5;
+
+ final int ng1 = 3;
+
+ final int ng2 = 4;
+
+ final int maxr = 10;
+
+ final int maxd = 5;
+
+ final float bStart = 2.0f;
+
+ final float bEnd = 1.0f;
+
+ final float bTransposition = 6.5f;
+
+ // configuration variable names
+ public static final String SPELLING_INDEX_LOCATION = "spell.index.dir";
+
+ public static final String SPELLING_DOCFREQ_THRESHOLD = "spell.docfreq.threshold";
+
+ public static final String SPELLING_DOCFREQ_THRESHOLD_FACTOR = "spell.docfreq.threshold.factor";
+
+ String indexLocation;
+
+ int threshold;
+
+ int thresholdFactor;
+
+ Configuration conf;
+
+ public SpellCheckerBean(Configuration conf) {
+ this.conf=conf;
+ indexLocation = conf.get(SPELLING_INDEX_LOCATION, "./spelling");
+ threshold = conf.getInt(SPELLING_DOCFREQ_THRESHOLD, 100);
+ thresholdFactor = conf.getInt(SPELLING_DOCFREQ_THRESHOLD_FACTOR, 10);
+ try {
+ spellingSearcher = new IndexSearcher(indexLocation);
+ } catch (IOException ioe) {
+ LOG.info("error opening spell checking index");
+ ioe.printStackTrace(System.out);
+ }
+ }
+
+ /** Cache in Configuration. */
+ public static SpellCheckerBean get(Configuration conf) {
+ SpellCheckerBean spellCheckerBean = (SpellCheckerBean) conf
+ .getObject(SpellCheckerBean.class.getName());
+
+ if (spellCheckerBean == null) {
+ LOG.info("creating new spell checker bean");
+ spellCheckerBean = new SpellCheckerBean(conf);
+ conf.setObject(SpellCheckerBean.class.getName(), spellCheckerBean);
+ }
+ return spellCheckerBean;
+ }
+
+ public SpellCheckerTerms checkSpelling(Query query, String queryString) {
+
+ return checkSpelling(query, queryString, threshold, thresholdFactor);
+ }
+
+ /**
+ * Parses original query, retrieves suggestions from ngrams spelling index
+ *
+ * @param query
+ * Query to be spell-checked
+ * @param docFreqThreshold
+ * Terms in the query that have a docFreq lower than this threshold
+ * qualify as "misspelled"
+ * @param factorThreshold
+ * The suggested term must have a docFreq at least factorThreshold
+ * times that of the misspelled term. Set to 1 to disable.
+ * @return terms with corrected spelling
+ */
+ public SpellCheckerTerms checkSpelling(Query query, String queryString,
+ int docFreqThreshold, int factorThreshold) {
+ SpellCheckerTerms spellCheckerTerms = null;
+
+ try {
+ spellCheckerTerms = parseOriginalQuery(query, queryString);
+
+ for (int i = 0; i < spellCheckerTerms.size(); i++) {
+ SpellCheckerTerm currentTerm = spellCheckerTerms.getSpellCheckerTerm(i);
+ String originalTerm = currentTerm.getOriginalTerm();
+
+ spellCheckerTerms.getSpellCheckerTerm(i).setOriginalDocFreq(
+ getDocFreq(originalTerm));
+
+ //
+ // Spell checking is not effective for words under 4 letters long.
+ // Words over 25 letters long aren't worth checking either.
+ //
+ if (originalTerm.length() < 4)
+ continue;
+
+ if (originalTerm.length() > 25)
+ continue;
+
+ List lis = new ArrayList(maxr);
+
+ String[] suggestions = NGramSpeller.suggestUsingNGrams(spellingSearcher,
+ originalTerm, ng1, ng2, maxr, bStart, bEnd,
+ bTransposition, maxd, lis, true);
+
+ Iterator it = lis.iterator();
+
+ while (it.hasNext()) {
+ LOG.fine(it.next().toString());
+ }
+
+ if (suggestions.length > 0) {
+ currentTerm.setSuggestedTerm(suggestions[0]);
+
+ if (lis != null) {
+ NGramSpeller.SpellSuggestionDetails detail = (NGramSpeller.SpellSuggestionDetails) lis
+ .get(0);
+ currentTerm.setSuggestedTermDocFreq(detail.docFreq);
+ }
+
+ // We use the document frequencies of the original term and the suggested
+ // term to guess whether or not a term is misspelled. The criteria are as
+ // follows:
+ //
+ // 1. The term's document frequency must be under a constant threshold
+ // 2. The suggested term's docFreq must be greater than the original
+ // term's docFreq * constant factor
+ //
+ if ((currentTerm.originalDocFreq < docFreqThreshold)
+ && ((currentTerm.originalDocFreq * factorThreshold) < (currentTerm.suggestedTermDocFreq))) {
+ spellCheckerTerms.setHasMispelledTerms(true);
+ currentTerm.setMispelled(true);
+ }
+ }
+
+ }
+
+ } catch (Throwable t) {
+ t.printStackTrace();
+ }
+
+ return spellCheckerTerms;
+ }
+
+ /**
+ *
+ * Parses the query and preserves characters and formatting surrounding terms
+ * to be spell-checked. This is done so that we can present the query in the
+ * "Did you mean: XYZ" message in the same format the user originally typed
+ * it.
+ *
+ * @param queryString
+ * text to be parsed
+ * @return spell checker terms
+ */
+ public SpellCheckerTerms parseOriginalQuery(Query query, String queryString)
+ throws IOException {
+ String[] terms = query.getTerms();
+ SpellCheckerTerms spellCheckerTerms = new SpellCheckerTerms();
+
+ int previousTermPos = 0;
+ for (int i = 0; i < terms.length; i++) {
+
+ int termPos = queryString.toLowerCase().indexOf(terms[i]);
+
+ String charsBefore = "";
+ String charsAfter = "";
+
+ // Is this the first term? If so, we need to check for characters
+ // before the first term.
+ if (i == 0) {
+
+ if (termPos > 0) {
+ charsBefore = queryString.substring(0, termPos);
+ }
+
+ // We're in-between terms...
+ } else {
+ int endOfLastTerm = previousTermPos + terms[i - 1].length();
+
+ if (endOfLastTerm < termPos) {
+ charsBefore = queryString.substring(endOfLastTerm, termPos);
+ }
+ }
+
+ // Is this the last term? If so, we need to check for characters
+ // after the last term.
+ if (i == (terms.length - 1)) {
+
+ int endOfCurrentTerm = termPos + terms[i].length();
+
+ if (endOfCurrentTerm < queryString.length()) {
+ charsAfter = queryString.substring(endOfCurrentTerm, queryString
+ .length());
+ }
+
+ }
+
+ previousTermPos = termPos;
+
+ spellCheckerTerms.add(new SpellCheckerTerm(terms[i], charsBefore,
+ charsAfter));
+
+ }
+
+ return spellCheckerTerms;
+
+ }
+
+ public SpellCheckerTerms parseOriginalQuery(String queryString)
+ throws IOException {
+ return parseOriginalQuery(Query.parse(queryString, conf), queryString);
+ }
+
+ /**
+ * Retrieves docFreq as stored within spelling index. Alternatively, we could
+ * simply consult the main index for a docFreq() of a term (which would be
+ * faster) but it's nice to have a separate, spelling index that can stand on
+ * its own.
+ *
+ * @param term
+ * @return document frequency of term
+ */
+ private int getDocFreq(String term) throws IOException {
+ /*
+ * Hits hits = this.spellingSearcher.getLuceneSearcher().search(new
+ * TermQuery(new Term( NGramSpeller.F_WORD, term))); if (hits.length() > 0) {
+ * Document doc = hits.doc(0); String docFreq =
+ * doc.get(NGramSpeller.F_FREQ); return Integer.parseInt(docFreq); }
+ */
+ return 0;
+ }
+
+ public static void main(String[] args) throws Throwable {
+ if (args.length < 1) {
+ System.out.println("usage: SpellCheckerBean [ngrams spelling index]");
+ return;
+ }
+
+ Configuration conf = NutchConfiguration.create();
+
+ conf.set("spell.index.dir", args[0]);
+
+ SpellCheckerBean checker = new SpellCheckerBean(conf);
+ BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
+
+ String line;
+
+ while ((line = in.readLine()) != null) {
+ Query query = Query.parse(line, conf);
+ SpellCheckerTerms terms = checker.checkSpelling(query, line);
+ StringBuffer buf = new StringBuffer();
+
+ for (int i = 0; i < terms.size(); i++) {
+ SpellCheckerTerm currentTerm = terms.getSpellCheckerTerm(i);
+ buf.append(currentTerm.getCharsBefore());
+
+ if (currentTerm.isMispelled()) {
+ buf.append(currentTerm.getSuggestedTerm());
+ } else {
+ buf.append(currentTerm.getOriginalTerm());
+ }
+ }
+
+ System.out.println("Spell checked: " + buf);
+ }
+ }
+
+ public void init() {
+ //do initialization here
+ }
+
+ public String[] suggest(Query query) {
+ // TODO Auto-generated method stub
+ return null;
+ }
+
+ public String getID() {
+ return "SPELLER";
+ }
+}
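
A minimal sketch (not part of this commit) of driving the bean from code,
assuming a prebuilt ngram spelling index under ./spelling and using the same
parsing path as the main() driver above; the query text is illustrative:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.nutch.searcher.Query;
    import org.apache.nutch.spell.SpellCheckerBean;
    import org.apache.nutch.spell.SpellCheckerTerms;
    import org.apache.nutch.util.NutchConfiguration;

    public class SpellCheckerBeanExample {
      public static void main(String[] args) throws Throwable {
        Configuration conf = NutchConfiguration.create();
        conf.set(SpellCheckerBean.SPELLING_INDEX_LOCATION, "./spelling");
        // with the defaults above, a term is flagged as misspelled when its
        // docFreq is under 100 and the suggestion's docFreq is more than 10
        // times larger
        SpellCheckerBean checker = SpellCheckerBean.get(conf);
        String queryString = "recursivz search";
        Query query = Query.parse(queryString, conf);
        SpellCheckerTerms terms = checker.checkSpelling(query, queryString);
        if (terms != null && terms.getHasMispelledTerms())
          System.out.println("Did you mean: " + terms.getSpellCheckedQuery());
      }
    }
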
Added: lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/apache/nutch/spell/SpellCheckerTerm.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/apache/nutch/spell/SpellCheckerTerm.java?rev=410885&view=auto
==============================================================================
--- lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/apache/nutch/spell/SpellCheckerTerm.java (added)
+++ lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/apache/nutch/spell/SpellCheckerTerm.java Thu Jun 1 09:52:23 2006
@@ -0,0 +1,103 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.spell;
+
+public class SpellCheckerTerm {
+ String originalTerm = null;
+
+ String charsBefore = null;
+
+ String charsAfter = null;
+
+ String suggestedTerm = null;
+
+ int originalDocFreq = -1;
+
+ int suggestedTermDocFreq = -1;
+
+ boolean isMispelled = false;
+
+ public SpellCheckerTerm(String originalTerm, String charsBefore,
+ String charsAfter) {
+ super();
+ this.originalTerm = originalTerm;
+ this.charsBefore = charsBefore;
+ this.charsAfter = charsAfter;
+ }
+
+ public String getCharsAfter() {
+ return charsAfter;
+ }
+
+ public void setCharsAfter(String charsAfter) {
+ this.charsAfter = charsAfter;
+ }
+
+ public String getCharsBefore() {
+ return charsBefore;
+ }
+
+ public void setCharsBefore(String charsBefore) {
+ this.charsBefore = charsBefore;
+ }
+
+ public int getOriginalDocFreq() {
+ return originalDocFreq;
+ }
+
+ public void setOriginalDocFreq(int originalDocFreq) {
+ this.originalDocFreq = originalDocFreq;
+ }
+
+ public String getOriginalTerm() {
+ return originalTerm;
+ }
+
+ public void setOriginalTerm(String originalTerm) {
+ this.originalTerm = originalTerm;
+ }
+
+ public int getSuggestedTermDocFreq() {
+ return suggestedTermDocFreq;
+ }
+
+ public void setSuggestedTermDocFreq(int suggestedTermDocFreq) {
+ this.suggestedTermDocFreq = suggestedTermDocFreq;
+ }
+
+ public String getSuggestedTerm() {
+ return suggestedTerm;
+ }
+
+ public void setSuggestedTerm(String suggestedTerm) {
+ this.suggestedTerm = suggestedTerm;
+ }
+
+ public boolean isMispelled() {
+ return isMispelled;
+ }
+
+ public void setMispelled(boolean isMispelled) {
+ this.isMispelled = isMispelled;
+ }
+
+ public String toString() {
+ return "[" + originalTerm + ", " + charsBefore + ", " + charsAfter + ", "
+ + suggestedTerm + ", " + originalDocFreq + ", " + suggestedTermDocFreq
+ + ", " + isMispelled + "]";
+ }
+}
Added: lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/apache/nutch/spell/SpellCheckerTerms.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/apache/nutch/spell/SpellCheckerTerms.java?rev=410885&view=auto
==============================================================================
--- lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/apache/nutch/spell/SpellCheckerTerms.java (added)
+++ lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/apache/nutch/spell/SpellCheckerTerms.java Thu Jun 1 09:52:23 2006
@@ -0,0 +1,66 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.spell;
+
+import java.util.ArrayList;
+
+public class SpellCheckerTerms {
+ boolean hasMispelledTerms = false;
+ ArrayList terms=new ArrayList();
+
+ public SpellCheckerTerm getSpellCheckerTerm(int i) {
+ return (SpellCheckerTerm) terms.get(i);
+ }
+
+ public ArrayList getTerms(){
+ return terms;
+ }
+
+ public String getSpellCheckedQuery() {
+ StringBuffer buf = new StringBuffer();
+ for (int i = 0; i < terms.size(); i++) {
+ SpellCheckerTerm term = getSpellCheckerTerm(i);
+ buf.append(term.charsBefore);
+ if (term.isMispelled)
+ buf.append(term.suggestedTerm);
+ else
+ buf.append(term.originalTerm);
+ buf.append(term.charsAfter);
+ }
+ return buf.toString();
+ }
+
+ public boolean getHasMispelledTerms() {
+ return hasMispelledTerms;
+ }
+
+ public void setHasMispelledTerms(boolean hasMispelledTerms) {
+ this.hasMispelledTerms = hasMispelledTerms;
+ }
+
+ public Object get(int index) {
+ return terms.get(index);
+ }
+
+ public int size() {
+ return terms.size();
+ }
+
+ public boolean add(Object arg0) {
+ return terms.add(arg0);
+ }
+}
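
A small illustration (not part of this commit) of how getSpellCheckedQuery()
preserves the user's original punctuation; the charsBefore/charsAfter values
below are what SpellCheckerBean.parseOriginalQuery() would capture for the
quoted query "recursivz search":

    import org.apache.nutch.spell.SpellCheckerTerm;
    import org.apache.nutch.spell.SpellCheckerTerms;

    public class SpellCheckedQueryExample {
      public static void main(String[] args) {
        SpellCheckerTerms terms = new SpellCheckerTerms();

        // first term: preceded by the opening quote, flagged as misspelled
        SpellCheckerTerm first = new SpellCheckerTerm("recursivz", "\"", "");
        first.setSuggestedTerm("recursive");
        first.setMispelled(true);
        terms.add(first);
        terms.setHasMispelledTerms(true);

        // last term: preceded by a space, followed by the closing quote
        terms.add(new SpellCheckerTerm("search", " ", "\""));

        // prints: "recursive search"
        System.out.println(terms.getSpellCheckedQuery());
      }
    }
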
Added: lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/apache/nutch/webapp/controller/SpellCheckController.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/apache/nutch/webapp/controller/SpellCheckController.java?rev=410885&view=auto
==============================================================================
--- lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/apache/nutch/webapp/controller/SpellCheckController.java (added)
+++ lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/java/org/apache/nutch/webapp/controller/SpellCheckController.java Thu Jun 1 09:52:23 2006
@@ -0,0 +1,49 @@
+package org.apache.nutch.webapp.controller;
+
+import java.io.IOException;
+
+import javax.servlet.ServletContext;
+import javax.servlet.ServletException;
+import javax.servlet.http.HttpServletRequest;
+import javax.servlet.http.HttpServletResponse;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.spell.SpellCheckerBean;
+import org.apache.nutch.spell.SpellCheckerTerms;
+import org.apache.nutch.webapp.common.SearchForm;
+import org.apache.nutch.webapp.common.ServiceLocator;
+import org.apache.nutch.webapp.common.Startable;
+import org.apache.nutch.webapp.controller.NutchController;
+import org.apache.struts.tiles.ComponentContext;
+
+public class SpellCheckController extends NutchController implements Startable {
+
+ SpellCheckerBean spellCheckerBean=null;
+
+ public static final String ATTR_SPELL_TERMS="spellCheckerTerms";
+ public static final String ATTR_SPELL_QUERY="spellCheckerQuery";
+
+ public void nutchPerform(ComponentContext tileContext,
+ HttpServletRequest request, HttpServletResponse response,
+ ServletContext servletContext) throws ServletException, IOException {
+ ServiceLocator serviceLocator=getServiceLocator(request);
+
+ SpellCheckerTerms spellCheckerTerms = null;
+ if (spellCheckerBean != null) {
+ spellCheckerTerms = spellCheckerBean.checkSpelling(serviceLocator.getSearch().getQuery(), serviceLocator.getSearch().getQueryString());
+ }
+
+ SearchForm form=(SearchForm)serviceLocator.getSearchForm().clone();
+
form.setValue(SearchForm.NAME_QUERYSTRING,spellCheckerTerms.getSpellCheckedQuery());
+ String spellQuery = form.getParameterString("utf-8");
+
+ request.setAttribute(ATTR_SPELL_TERMS, spellCheckerTerms);
+ request.setAttribute(ATTR_SPELL_QUERY, spellQuery);
+ }
+
+ public void start(ServletContext servletContext) {
+ Configuration conf=getServiceLocator(servletContext).getConfiguration();
+ LOG.info("Initializing spellchecker");
+ spellCheckerBean = SpellCheckerBean.get(conf);
+ }
+}
Added: lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/web/web-query-propose-spellcheck/propose.jsp
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/web/web-query-propose-spellcheck/propose.jsp?rev=410885&view=auto
==============================================================================
--- lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/web/web-query-propose-spellcheck/propose.jsp (added)
+++ lucene/nutch/trunk/contrib/web2/plugins/web-query-propose-spellcheck/src/web/web-query-propose-spellcheck/propose.jsp Thu Jun 1 09:52:23 2006
@@ -0,0 +1,20 @@
+<%@ page session="false"%>
+<%@ taglib prefix="tiles" uri="http://jakarta.apache.org/struts/tags-tiles"%>
+<%@ taglib prefix="c" uri="http://java.sun.com/jstl/core"%>
+<%@ taglib prefix="fmt" uri="http://java.sun.com/jstl/fmt"%>
+<c:if test="${spellCheckerTerms!=null && spellCheckerTerms.hasMispelledTerms}">
+ <p>Did you mean <a href="search.do?<c:out value="${spellCheckerQuery}"/>">
+ <c:forEach
+ var="currentTerm" items="${spellCheckerTerms.terms}">
+ <c:out value="${currentTerm.charsBefore}" />
+ <c:choose>
+ <c:when test="${currentTerm.mispelled}">
+ <i><b> <c:out value="${currentTerm.suggestedTerm}" /> </b></i>
+ </c:when>
+ <c:otherwise>
+ <c:out value="${currentTerm.originalTerm}" />
+ </c:otherwise>
+ </c:choose>
+ <c:out value="${currentTerm.charsAfter}" />
+ </c:forEach> </a></p>
+</c:if>