Hi,

I think FuzzyQuery is not as useful as it could be, because it's too fuzzy. 
For a word with 10 characters it allows an edit distance of 4, i.e. almost 
half of the word can be different. I suggest adding an option so that the 
fuzziness can be configured, as in the attached patch. If nobody objects, 
I will commit it (plus test cases). Later I'll also try to modify 
QueryParser to support this, but I cannot promise to get that working.

One thing I don't quite understand is the meaning of scale_factor. Does it 
make sense to configure that from outside, too?

Regards
 Daniel

-- 
http://www.danielnaber.de
Index: FuzzyTermEnum.java
===================================================================
RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/FuzzyTermEnum.java,v
retrieving revision 1.6
diff -u -r1.6 FuzzyTermEnum.java
--- FuzzyTermEnum.java	11 May 2004 17:23:21 -0000	1.6
+++ FuzzyTermEnum.java	8 Aug 2004 14:48:08 -0000
@@ -33,9 +33,18 @@
     String field = "";
     String text = "";
     int textlen;
+    float minimumSimilarity;
+    double scale_factor;
+    
     
     public FuzzyTermEnum(IndexReader reader, Term term) throws IOException {
+      this(reader, term, 0.5f);
+    }
+    
+    public FuzzyTermEnum(IndexReader reader, Term term, float minSimilarity) throws IOException {
         super();
+        minimumSimilarity = minSimilarity;
+        scale_factor = 1.0f / (1.0f - minimumSimilarity);
         searchTerm = term;
         field = searchTerm.field();
         text = searchTerm.text();
@@ -53,14 +62,14 @@
             int targetlen = target.length();
             int dist = editDistance(text, target, textlen, targetlen);
             distance = 1 - ((double)dist / (double)Math.min(textlen, targetlen));
-            return (distance > FUZZY_THRESHOLD);
+            return (distance > minimumSimilarity);
         }
         endEnum = true;
         return false;
     }
     
     protected final float difference() {
-        return (float)((distance - FUZZY_THRESHOLD) * SCALE_FACTOR);
+        return (float)((distance - minimumSimilarity) * scale_factor);
     }
     
     public final boolean endEnum() {
@@ -70,9 +79,6 @@
     /******************************
      * Compute Levenshtein distance
      ******************************/
-    
-    public static final double FUZZY_THRESHOLD = 0.5;
-    public static final double SCALE_FACTOR = 1.0f / (1.0f - FUZZY_THRESHOLD);
     
     /**
      Finds and returns the smallest of three integers 
Index: FuzzyQuery.java
===================================================================
RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/FuzzyQuery.java,v
retrieving revision 1.4
diff -u -r1.4 FuzzyQuery.java
--- FuzzyQuery.java	29 Mar 2004 22:48:03 -0000	1.4
+++ FuzzyQuery.java	8 Aug 2004 14:47:49 -0000
@@ -20,14 +20,43 @@
 import org.apache.lucene.index.Term;
 import java.io.IOException;
 
-/** Implements the fuzzy search query */
+/** Implements the fuzzy search query. The similarity measurement
+ * is based on the Levenshtein (edit distance) algorithm.
+ */
 public final class FuzzyQuery extends MultiTermQuery {
-  public FuzzyQuery(Term term) {
+  
+  private float minimumSimilarity;
+  
+  /**
+   * Create a new FuzzyQuery that will match terms with a similarity 
+   * of at least <code>minimumSimilarity</code> to <code>term</code>.
+   * 
+   * @param term the term to search for
+   * @param minimumSimilarity a value between 0 and 1 to set the required similarity
+   *  between the query term and the matching terms. For example, for a
+   *  <code>minimumSimilarity</code> of <code>0.5</code> a term of the same length
+   *  as the query term is considered similar to the query term if the edit distance
+   *  between both terms is less than <code>length(term)*0.5</code>.
+   * @throws IllegalArgumentException if minimumSimilarity is &gt; 1 or &lt; 0
+   */
+  public FuzzyQuery(Term term, float minimumSimilarity) throws IllegalArgumentException {
     super(term);
+    if (minimumSimilarity > 1.0f)
+      throw new IllegalArgumentException("minimumSimilarity > 1");
+    else if (minimumSimilarity < 0.0f)
+      throw new IllegalArgumentException("minimumSimilarity < 0");
+    this.minimumSimilarity = minimumSimilarity;
+  }
+
+  /**
+   * Calls {@link #FuzzyQuery(Term, float) FuzzyQuery(term, 0.5f)}.
+   */
+  public FuzzyQuery(Term term) {
+    this(term, 0.5f);
   }
     
   protected FilteredTermEnum getEnum(IndexReader reader) throws IOException {
-    return new FuzzyTermEnum(reader, getTerm());
+    return new FuzzyTermEnum(reader, getTerm(), minimumSimilarity);
   }
     
   public String toString(String field) {

---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]

Reply via email to