dnaber      2004/08/13 11:35:02

  Modified:    src/java/org/apache/lucene/search FuzzyTermEnum.java
                        FuzzyQuery.java
               src/test/org/apache/lucene/search TestFuzzyQuery.java
  Log:
  You can now set the required fuzziness of FuzzyQuery. Note that QueryParser does not support this (yet?).
  
  Revision  Changes    Path
  1.7       +11 -6     jakarta-lucene/src/java/org/apache/lucene/search/FuzzyTermEnum.java
  
  Index: FuzzyTermEnum.java
  ===================================================================
  RCS file: 
/home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/FuzzyTermEnum.java,v
  retrieving revision 1.6
  retrieving revision 1.7
  diff -u -r1.6 -r1.7
  --- FuzzyTermEnum.java        11 May 2004 17:23:21 -0000      1.6
  +++ FuzzyTermEnum.java        13 Aug 2004 18:35:01 -0000      1.7
  @@ -26,16 +26,24 @@
     the enumeration is greater than all that precede it.  */
   public final class FuzzyTermEnum extends FilteredTermEnum {
       double distance;
  -    boolean fieldMatch = false;
       boolean endEnum = false;
   
       Term searchTerm = null;
       String field = "";
       String text = "";
       int textlen;
  +    float minimumSimilarity;
  +    double scale_factor;
  +    
       
       public FuzzyTermEnum(IndexReader reader, Term term) throws IOException {
  +      this(reader, term, 0.5f);
  +    }
  +    
  +    public FuzzyTermEnum(IndexReader reader, Term term, float minSimilarity) throws IOException {
           super();
  +        minimumSimilarity = minSimilarity;
  +        scale_factor = 1.0f / (1.0f - minimumSimilarity);
           searchTerm = term;
           field = searchTerm.field();
           text = searchTerm.text();
  @@ -53,14 +61,14 @@
               int targetlen = target.length();
               int dist = editDistance(text, target, textlen, targetlen);
               distance = 1 - ((double)dist / (double)Math.min(textlen, targetlen));
  -            return (distance > FUZZY_THRESHOLD);
  +            return (distance > minimumSimilarity);
           }
           endEnum = true;
           return false;
       }
       
       protected final float difference() {
  -        return (float)((distance - FUZZY_THRESHOLD) * SCALE_FACTOR);
  +        return (float)((distance - minimumSimilarity) * scale_factor);
       }
       
       public final boolean endEnum() {
  @@ -70,9 +78,6 @@
       /******************************
        * Compute Levenshtein distance
        ******************************/
  -    
  -    public static final double FUZZY_THRESHOLD = 0.5;
  -    public static final double SCALE_FACTOR = 1.0f / (1.0f - FUZZY_THRESHOLD);
       
       /**
        Finds and returns the smallest of three integers 
  
  
  
  1.5       +32 -3     jakarta-lucene/src/java/org/apache/lucene/search/FuzzyQuery.java
  
  Index: FuzzyQuery.java
  ===================================================================
  RCS file: 
/home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/FuzzyQuery.java,v
  retrieving revision 1.4
  retrieving revision 1.5
  diff -u -r1.4 -r1.5
  --- FuzzyQuery.java   29 Mar 2004 22:48:03 -0000      1.4
  +++ FuzzyQuery.java   13 Aug 2004 18:35:01 -0000      1.5
  @@ -20,14 +20,43 @@
   import org.apache.lucene.index.Term;
   import java.io.IOException;
   
  -/** Implements the fuzzy search query */
  +/** Implements the fuzzy search query. The similarity measurement
  + * is based on the Levenshtein (edit distance) algorithm.
  + */
   public final class FuzzyQuery extends MultiTermQuery {
  -  public FuzzyQuery(Term term) {
  +  
  +  private float minimumSimilarity;
  +  
  +  /**
  +   * Create a new FuzzyQuery that will match terms with a similarity 
  +   * of at least <code>minimumSimilarity</code> to <code>term</code>.
  +   * 
  +   * @param term the term to search for
  +   * @param minimumSimilarity a value between 0 and 1 to set the required similarity
  +   *  between the query term and the matching terms. For example, for a
  +   *  <code>minimumSimilarity</code> of <code>0.5</code> a term of the same length
  +   *  as the query term is considered similar to the query term if the edit distance
  +   *  between both terms is less than <code>length(term)*0.5</code>.
  +   * @throws IllegalArgumentException if minimumSimilarity is &gt; 1 or &lt; 0
  +   */
  +  public FuzzyQuery(Term term, float minimumSimilarity) throws IllegalArgumentException {
       super(term);
  +    if (minimumSimilarity > 1.0f)
  +      throw new IllegalArgumentException("minimumSimilarity > 1");
  +    else if (minimumSimilarity < 0.0f)
  +      throw new IllegalArgumentException("minimumSimilarity < 0");
  +    this.minimumSimilarity = minimumSimilarity;
  +  }
  +
  +  /**
  +   * Calls {@link #FuzzyQuery(Term, float) FuzzyQuery(term, 0.5f)}.
  +   */
  +  public FuzzyQuery(Term term) {
  +    this(term, 0.5f);
     }
       
     protected FilteredTermEnum getEnum(IndexReader reader) throws IOException {
  -    return new FuzzyTermEnum(reader, getTerm());
  +    return new FuzzyTermEnum(reader, getTerm(), minimumSimilarity);
     }
       
     public String toString(String field) {
  
  
  
  1.2       +20 -2     jakarta-lucene/src/test/org/apache/lucene/search/TestFuzzyQuery.java
  
  Index: TestFuzzyQuery.java
  ===================================================================
  RCS file: 
/home/cvs/jakarta-lucene/src/test/org/apache/lucene/search/TestFuzzyQuery.java,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -r1.1 -r1.2
  --- TestFuzzyQuery.java       1 Aug 2004 22:19:59 -0000       1.1
  +++ TestFuzzyQuery.java       13 Aug 2004 18:35:02 -0000      1.2
  @@ -33,7 +33,7 @@
    */
   public class TestFuzzyQuery extends TestCase {
   
  -  public void testDefaultFuzziness() throws Exception {
  +  public void testFuzziness() throws Exception {
       RAMDirectory directory = new RAMDirectory();
       IndexWriter writer = new IndexWriter(directory, new WhitespaceAnalyzer(), true);
       addDoc("aaaaa", writer);
  @@ -90,7 +90,7 @@
       directory.close();
     }
   
  -  public void testDefaultFuzzinessLong() throws Exception {
  +  public void testFuzzinessLong() throws Exception {
       RAMDirectory directory = new RAMDirectory();
       IndexWriter writer = new IndexWriter(directory, new WhitespaceAnalyzer(), true);
       addDoc("aaaaaaa", writer);
  @@ -123,6 +123,24 @@
       query = new FuzzyQuery(new Term("field", "stellent"));   
       hits = searcher.search(query);
       assertEquals(1, hits.length());
  +
  +    // "student" doesn't match anymore thanks to increased minimum similarity:
  +    query = new FuzzyQuery(new Term("field", "student"), 0.6f);   
  +    hits = searcher.search(query);
  +    assertEquals(0, hits.length());
  +
  +    try {
  +      query = new FuzzyQuery(new Term("field", "student"), 1.1f);
  +      fail("Expected IllegalArgumentException");
  +    } catch (IllegalArgumentException e) {
  +      // expecting exception
  +    }
  +    try {
  +      query = new FuzzyQuery(new Term("field", "student"), -0.1f);
  +      fail("Expected IllegalArgumentException");
  +    } catch (IllegalArgumentException e) {
  +      // expecting exception
  +    }
   
       searcher.close();
       directory.close();
  
  
  

---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]

Reply via email to