dnaber 2004/08/13 11:35:02 Modified: src/java/org/apache/lucene/search FuzzyTermEnum.java FuzzyQuery.java src/test/org/apache/lucene/search TestFuzzyQuery.java Log: You can now set the required fuzziness of FuzzyQuery. Note that QueryParser does not support this (yet?). Revision Changes Path 1.7 +11 -6 jakarta-lucene/src/java/org/apache/lucene/search/FuzzyTermEnum.java Index: FuzzyTermEnum.java =================================================================== RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/FuzzyTermEnum.java,v retrieving revision 1.6 retrieving revision 1.7 diff -u -r1.6 -r1.7 --- FuzzyTermEnum.java 11 May 2004 17:23:21 -0000 1.6 +++ FuzzyTermEnum.java 13 Aug 2004 18:35:01 -0000 1.7 @@ -26,16 +26,24 @@ the enumeration is greater than all that precede it. */ public final class FuzzyTermEnum extends FilteredTermEnum { double distance; - boolean fieldMatch = false; boolean endEnum = false; Term searchTerm = null; String field = ""; String text = ""; int textlen; + float minimumSimilarity; + double scale_factor; + public FuzzyTermEnum(IndexReader reader, Term term) throws IOException { + this(reader, term, 0.5f); + } + + public FuzzyTermEnum(IndexReader reader, Term term, float minSimilarity) throws IOException { super(); + minimumSimilarity = minSimilarity; + scale_factor = 1.0f / (1.0f - minimumSimilarity); searchTerm = term; field = searchTerm.field(); text = searchTerm.text(); @@ -53,14 +61,14 @@ int targetlen = target.length(); int dist = editDistance(text, target, textlen, targetlen); distance = 1 - ((double)dist / (double)Math.min(textlen, targetlen)); - return (distance > FUZZY_THRESHOLD); + return (distance > minimumSimilarity); } endEnum = true; return false; } protected final float difference() { - return (float)((distance - FUZZY_THRESHOLD) * SCALE_FACTOR); + return (float)((distance - minimumSimilarity) * scale_factor); } public final boolean endEnum() { @@ -70,9 +78,6 @@ /****************************** * Compute 
Levenshtein distance ******************************/ - - public static final double FUZZY_THRESHOLD = 0.5; - public static final double SCALE_FACTOR = 1.0f / (1.0f - FUZZY_THRESHOLD); /** Finds and returns the smallest of three integers 1.5 +32 -3 jakarta-lucene/src/java/org/apache/lucene/search/FuzzyQuery.java Index: FuzzyQuery.java =================================================================== RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/FuzzyQuery.java,v retrieving revision 1.4 retrieving revision 1.5 diff -u -r1.4 -r1.5 --- FuzzyQuery.java 29 Mar 2004 22:48:03 -0000 1.4 +++ FuzzyQuery.java 13 Aug 2004 18:35:01 -0000 1.5 @@ -20,14 +20,43 @@ import org.apache.lucene.index.Term; import java.io.IOException; -/** Implements the fuzzy search query */ +/** Implements the fuzzy search query. The similiarity measurement + * is based on the Levenshtein (edit distance) algorithm. + */ public final class FuzzyQuery extends MultiTermQuery { - public FuzzyQuery(Term term) { + + private float minimumSimilarity; + + /** + * Create a new FuzzyQuery that will match terms with a similarity + * of at least <code>minimumSimilarity</code> to <code>term</code>. + * + * @param term the term to search for + * @param minimumSimilarity a value between 0 and 1 to set the required similarity + * between the query term and the matching terms. For example, for a + * <code>minimumSimilarity</code> of <code>0.5</code> a term of the same length + * as the query term is considered similar to the query term if the edit distance + * between both terms is less than <code>length(term)*0.5</code>. 
+ * @throws IllegalArgumentException if minimumSimilarity is > 1 or < 0 + */ + public FuzzyQuery(Term term, float minimumSimilarity) throws IllegalArgumentException { super(term); + if (minimumSimilarity > 1.0f) + throw new IllegalArgumentException("minimumSimilarity > 1"); + else if (minimumSimilarity < 0.0f) + throw new IllegalArgumentException("minimumSimilarity < 0"); + this.minimumSimilarity = minimumSimilarity; + } + + /** + * Calls {@link #FuzzyQuery(Term, float) FuzzyQuery(term, 0.5f)}. + */ + public FuzzyQuery(Term term) { + this(term, 0.5f); } protected FilteredTermEnum getEnum(IndexReader reader) throws IOException { - return new FuzzyTermEnum(reader, getTerm()); + return new FuzzyTermEnum(reader, getTerm(), minimumSimilarity); } public String toString(String field) { 1.2 +20 -2 jakarta-lucene/src/test/org/apache/lucene/search/TestFuzzyQuery.java Index: TestFuzzyQuery.java =================================================================== RCS file: /home/cvs/jakarta-lucene/src/test/org/apache/lucene/search/TestFuzzyQuery.java,v retrieving revision 1.1 retrieving revision 1.2 diff -u -r1.1 -r1.2 --- TestFuzzyQuery.java 1 Aug 2004 22:19:59 -0000 1.1 +++ TestFuzzyQuery.java 13 Aug 2004 18:35:02 -0000 1.2 @@ -33,7 +33,7 @@ */ public class TestFuzzyQuery extends TestCase { - public void testDefaultFuzziness() throws Exception { + public void testFuzziness() throws Exception { RAMDirectory directory = new RAMDirectory(); IndexWriter writer = new IndexWriter(directory, new WhitespaceAnalyzer(), true); addDoc("aaaaa", writer); @@ -90,7 +90,7 @@ directory.close(); } - public void testDefaultFuzzinessLong() throws Exception { + public void testFuzzinessLong() throws Exception { RAMDirectory directory = new RAMDirectory(); IndexWriter writer = new IndexWriter(directory, new WhitespaceAnalyzer(), true); addDoc("aaaaaaa", writer); @@ -123,6 +123,24 @@ query = new FuzzyQuery(new Term("field", "stellent")); hits = searcher.search(query); assertEquals(1, 
hits.length()); + + // "student" doesn't match anymore thanks to increased minimum similarity: + query = new FuzzyQuery(new Term("field", "student"), 0.6f); + hits = searcher.search(query); + assertEquals(0, hits.length()); + + try { + query = new FuzzyQuery(new Term("field", "student"), 1.1f); + fail("Expected IllegalArgumentException"); + } catch (IllegalArgumentException e) { + // expecting exception + } + try { + query = new FuzzyQuery(new Term("field", "student"), -0.1f); + fail("Expected IllegalArgumentException"); + } catch (IllegalArgumentException e) { + // expecting exception + } searcher.close(); directory.close();
--------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]