goller      2004/09/14 06:45:16

  Modified:    src/java/org/apache/lucene/queryParser
                        QueryParserTokenManager.java QueryParser.jj
                        QueryParser.java QueryParserConstants.java
               src/java/org/apache/lucene/search FuzzyQuery.java
                        FuzzyTermEnum.java
  Log:
  QueryParser can now handle minimumSimilarity parameter
  of FuzzyQuery; FuzzyQuery extended to allow for non-fuzzy
  prefixes.
  
  Revision  Changes    Path
  1.4       +133 -117  
jakarta-lucene/src/java/org/apache/lucene/queryParser/QueryParserTokenManager.java
  
  Index: QueryParserTokenManager.java
  ===================================================================
  RCS file: 
/home/cvs/jakarta-lucene/src/java/org/apache/lucene/queryParser/QueryParserTokenManager.java,v
  retrieving revision 1.3
  retrieving revision 1.4
  diff -u -r1.3 -r1.4
  --- QueryParserTokenManager.java      24 Mar 2004 10:12:27 -0000      1.3
  +++ QueryParserTokenManager.java      14 Sep 2004 13:45:15 -0000      1.4
  @@ -54,13 +54,11 @@
         case 58:
            return jjStopAtPos(0, 14);
         case 91:
  -         return jjStopAtPos(0, 22);
  +         return jjStopAtPos(0, 21);
         case 94:
            return jjStopAtPos(0, 15);
         case 123:
  -         return jjStopAtPos(0, 23);
  -      case 126:
  -         return jjStartNfaWithStates_3(0, 18, 18);
  +         return jjStopAtPos(0, 22);
         default :
            return jjMoveNfa_3(0, 0);
      }
  @@ -105,7 +103,7 @@
   {
      int[] nextStates;
      int startsAt = 0;
  -   jjnewStateCnt = 31;
  +   jjnewStateCnt = 33;
      int i = 1;
      jjstateSet[0] = startState;
      int j, kind = 0x7fffffff;
  @@ -169,56 +167,67 @@
                  case 18:
                     if ((0x3ff000000000000L & l) == 0L)
                        break;
  -                  if (kind > 19)
  -                     kind = 19;
  -                  jjstateSet[jjnewStateCnt++] = 18;
  +                  if (kind > 18)
  +                     kind = 18;
  +                  jjAddStates(7, 8);
                     break;
                  case 19:
  +                  if (curChar == 46)
  +                     jjCheckNAdd(20);
  +                  break;
  +               case 20:
  +                  if ((0x3ff000000000000L & l) == 0L)
  +                     break;
  +                  if (kind > 18)
  +                     kind = 18;
  +                  jjCheckNAdd(20);
  +                  break;
  +               case 21:
                     if ((0x7bffd0f8ffffd9ffL & l) == 0L)
                        break;
                     if (kind > 17)
                        kind = 17;
                     jjCheckNAddStates(0, 6);
                     break;
  -               case 20:
  +               case 22:
                     if ((0x7bfff8f8ffffd9ffL & l) == 0L)
                        break;
                     if (kind > 17)
                        kind = 17;
  -                  jjCheckNAddTwoStates(20, 21);
  +                  jjCheckNAddTwoStates(22, 23);
                     break;
  -               case 22:
  +               case 24:
                     if ((0x84002f0600000000L & l) == 0L)
                        break;
                     if (kind > 17)
                        kind = 17;
  -                  jjCheckNAddTwoStates(20, 21);
  +                  jjCheckNAddTwoStates(22, 23);
                     break;
  -               case 23:
  +               case 25:
                     if ((0x7bfff8f8ffffd9ffL & l) != 0L)
  -                     jjCheckNAddStates(7, 9);
  -                  break;
  -               case 24:
  -                  if (curChar == 42 && kind > 20)
  -                     kind = 20;
  +                     jjCheckNAddStates(9, 11);
                     break;
                  case 26:
  +                  if (curChar == 42 && kind > 19)
  +                     kind = 19;
  +                  break;
  +               case 28:
                     if ((0x84002f0600000000L & l) != 0L)
  -                     jjCheckNAddStates(7, 9);
  +                     jjCheckNAddStates(9, 11);
                     break;
  -               case 27:
  +               case 29:
                     if ((0xfbfffcf8ffffd9ffL & l) == 0L)
                        break;
  -                  if (kind > 21)
  -                     kind = 21;
  -                  jjCheckNAddTwoStates(27, 28);
  +                  if (kind > 20)
  +                     kind = 20;
  +                  jjCheckNAddTwoStates(29, 30);
                     break;
  -               case 29:
  +               case 31:
                     if ((0x84002f0600000000L & l) == 0L)
                        break;
  -                  if (kind > 21)
  -                     kind = 21;
  -                  jjCheckNAddTwoStates(27, 28);
  +                  if (kind > 20)
  +                     kind = 20;
  +                  jjCheckNAddTwoStates(29, 30);
                     break;
                  default : break;
               }
  @@ -239,9 +248,13 @@
                        jjCheckNAddStates(0, 6);
                     }
                     else if (curChar == 126)
  +                  {
  +                     if (kind > 18)
  +                        kind = 18;
                        jjstateSet[jjnewStateCnt++] = 18;
  +                  }
                     if (curChar == 92)
  -                     jjCheckNAddStates(10, 12);
  +                     jjCheckNAddStates(12, 14);
                     else if (curChar == 78)
                        jjstateSet[jjnewStateCnt++] = 11;
                     else if (curChar == 124)
  @@ -292,70 +305,73 @@
                        jjstateSet[jjnewStateCnt++] = 11;
                     break;
                  case 15:
  -                  jjAddStates(13, 14);
  +                  jjAddStates(15, 16);
                     break;
                  case 17:
  -                  if (curChar == 126)
  -                     jjstateSet[jjnewStateCnt++] = 18;
  +                  if (curChar != 126)
  +                     break;
  +                  if (kind > 18)
  +                     kind = 18;
  +                  jjstateSet[jjnewStateCnt++] = 18;
                     break;
  -               case 19:
  +               case 21:
                     if ((0x97ffffff97ffffffL & l) == 0L)
                        break;
                     if (kind > 17)
                        kind = 17;
                     jjCheckNAddStates(0, 6);
                     break;
  -               case 20:
  +               case 22:
                     if ((0x97ffffff97ffffffL & l) == 0L)
                        break;
                     if (kind > 17)
                        kind = 17;
  -                  jjCheckNAddTwoStates(20, 21);
  +                  jjCheckNAddTwoStates(22, 23);
                     break;
  -               case 21:
  +               case 23:
                     if (curChar == 92)
  -                     jjCheckNAddTwoStates(22, 22);
  +                     jjCheckNAddTwoStates(24, 24);
                     break;
  -               case 22:
  +               case 24:
                     if ((0x6800000078000000L & l) == 0L)
                        break;
                     if (kind > 17)
                        kind = 17;
  -                  jjCheckNAddTwoStates(20, 21);
  +                  jjCheckNAddTwoStates(22, 23);
                     break;
  -               case 23:
  +               case 25:
                     if ((0x97ffffff97ffffffL & l) != 0L)
  -                     jjCheckNAddStates(7, 9);
  +                     jjCheckNAddStates(9, 11);
                     break;
  -               case 25:
  +               case 27:
                     if (curChar == 92)
  -                     jjCheckNAddTwoStates(26, 26);
  +                     jjCheckNAddTwoStates(28, 28);
                     break;
  -               case 26:
  +               case 28:
                     if ((0x6800000078000000L & l) != 0L)
  -                     jjCheckNAddStates(7, 9);
  +                     jjCheckNAddStates(9, 11);
                     break;
  -               case 27:
  +               case 29:
                     if ((0x97ffffff97ffffffL & l) == 0L)
                        break;
  -                  if (kind > 21)
  -                     kind = 21;
  -                  jjCheckNAddTwoStates(27, 28);
  +                  if (kind > 20)
  +                     kind = 20;
  +                  jjCheckNAddTwoStates(29, 30);
                     break;
  -               case 28:
  +               case 30:
                     if (curChar == 92)
  -                     jjCheckNAddTwoStates(29, 29);
  +                     jjCheckNAddTwoStates(31, 31);
                     break;
  -               case 29:
  +               case 31:
                     if ((0x6800000078000000L & l) == 0L)
                        break;
  -                  if (kind > 21)
  -                     kind = 21;
  -                  jjCheckNAddTwoStates(27, 28);
  +                  if (kind > 20)
  +                     kind = 20;
  +                  jjCheckNAddTwoStates(29, 30);
                     break;
  -               case 30:
  +               case 32:
                     if (curChar == 92)
  -                     jjCheckNAddStates(10, 12);
  +                     jjCheckNAddStates(12, 14);
                     break;
                  default : break;
               }
  @@ -381,25 +397,25 @@
                     break;
                  case 15:
                     if (jjCanMove_0(hiByte, i1, i2, l1, l2))
  -                     jjAddStates(13, 14);
  +                     jjAddStates(15, 16);
                     break;
  -               case 20:
  +               case 22:
                     if (!jjCanMove_0(hiByte, i1, i2, l1, l2))
                        break;
                     if (kind > 17)
                        kind = 17;
  -                  jjCheckNAddTwoStates(20, 21);
  +                  jjCheckNAddTwoStates(22, 23);
                     break;
  -               case 23:
  +               case 25:
                     if (jjCanMove_0(hiByte, i1, i2, l1, l2))
  -                     jjCheckNAddStates(7, 9);
  +                     jjCheckNAddStates(9, 11);
                     break;
  -               case 27:
  +               case 29:
                     if (!jjCanMove_0(hiByte, i1, i2, l1, l2))
                        break;
  -                  if (kind > 21)
  -                     kind = 21;
  -                  jjCheckNAddTwoStates(27, 28);
  +                  if (kind > 20)
  +                     kind = 20;
  +                  jjCheckNAddTwoStates(29, 30);
                     break;
                  default : break;
               }
  @@ -412,7 +428,7 @@
            kind = 0x7fffffff;
         }
         ++curPos;
  -      if ((i = jjnewStateCnt) == (startsAt = 31 - (jjnewStateCnt = startsAt)))
  +      if ((i = jjnewStateCnt) == (startsAt = 33 - (jjnewStateCnt = startsAt)))
            return curPos;
         try { curChar = input_stream.readChar(); }
         catch(java.io.IOException e) { return curPos; }
  @@ -423,9 +439,9 @@
      switch (pos)
      {
         case 0:
  -         if ((active0 & 0x20000000L) != 0L)
  +         if ((active0 & 0x10000000L) != 0L)
            {
  -            jjmatchedKind = 32;
  +            jjmatchedKind = 31;
               return 4;
            }
            return -1;
  @@ -450,9 +466,9 @@
      switch(curChar)
      {
         case 84:
  -         return jjMoveStringLiteralDfa1_1(0x20000000L);
  +         return jjMoveStringLiteralDfa1_1(0x10000000L);
         case 125:
  -         return jjStopAtPos(0, 30);
  +         return jjStopAtPos(0, 29);
         default :
            return jjMoveNfa_1(0, 0);
      }
  @@ -467,8 +483,8 @@
      switch(curChar)
      {
         case 79:
  -         if ((active0 & 0x20000000L) != 0L)
  -            return jjStartNfaWithStates_1(1, 29, 4);
  +         if ((active0 & 0x10000000L) != 0L)
  +            return jjStartNfaWithStates_1(1, 28, 4);
            break;
         default :
            break;
  @@ -497,8 +513,8 @@
                  case 0:
                     if ((0xfffffffeffffffffL & l) != 0L)
                     {
  -                     if (kind > 32)
  -                        kind = 32;
  +                     if (kind > 31)
  +                        kind = 31;
                        jjCheckNAdd(4);
                     }
                     if ((0x100002600L & l) != 0L)
  @@ -518,14 +534,14 @@
                        jjCheckNAddTwoStates(2, 3);
                     break;
                  case 3:
  -                  if (curChar == 34 && kind > 31)
  -                     kind = 31;
  +                  if (curChar == 34 && kind > 30)
  +                     kind = 30;
                     break;
                  case 4:
                     if ((0xfffffffeffffffffL & l) == 0L)
                        break;
  -                  if (kind > 32)
  -                     kind = 32;
  +                  if (kind > 31)
  +                     kind = 31;
                     jjCheckNAdd(4);
                     break;
                  default : break;
  @@ -543,12 +559,12 @@
                  case 4:
                     if ((0xdfffffffffffffffL & l) == 0L)
                        break;
  -                  if (kind > 32)
  -                     kind = 32;
  +                  if (kind > 31)
  +                     kind = 31;
                     jjCheckNAdd(4);
                     break;
                  case 2:
  -                  jjAddStates(15, 16);
  +                  jjAddStates(17, 18);
                     break;
                  default : break;
               }
  @@ -569,13 +585,13 @@
                  case 4:
                     if (!jjCanMove_0(hiByte, i1, i2, l1, l2))
                        break;
  -                  if (kind > 32)
  -                     kind = 32;
  +                  if (kind > 31)
  +                     kind = 31;
                     jjCheckNAdd(4);
                     break;
                  case 2:
                     if (jjCanMove_0(hiByte, i1, i2, l1, l2))
  -                     jjAddStates(15, 16);
  +                     jjAddStates(17, 18);
                     break;
                  default : break;
               }
  @@ -620,9 +636,9 @@
                  case 0:
                     if ((0x3ff000000000000L & l) == 0L)
                        break;
  -                  if (kind > 24)
  -                     kind = 24;
  -                  jjAddStates(17, 18);
  +                  if (kind > 23)
  +                     kind = 23;
  +                  jjAddStates(19, 20);
                     break;
                  case 1:
                     if (curChar == 46)
  @@ -631,8 +647,8 @@
                  case 2:
                     if ((0x3ff000000000000L & l) == 0L)
                        break;
  -                  if (kind > 24)
  -                     kind = 24;
  +                  if (kind > 23)
  +                     kind = 23;
                     jjCheckNAdd(2);
                     break;
                  default : break;
  @@ -683,9 +699,9 @@
      switch (pos)
      {
         case 0:
  -         if ((active0 & 0x2000000L) != 0L)
  +         if ((active0 & 0x1000000L) != 0L)
            {
  -            jjmatchedKind = 28;
  +            jjmatchedKind = 27;
               return 4;
            }
            return -1;
  @@ -710,9 +726,9 @@
      switch(curChar)
      {
         case 84:
  -         return jjMoveStringLiteralDfa1_2(0x2000000L);
  +         return jjMoveStringLiteralDfa1_2(0x1000000L);
         case 93:
  -         return jjStopAtPos(0, 26);
  +         return jjStopAtPos(0, 25);
         default :
            return jjMoveNfa_2(0, 0);
      }
  @@ -727,8 +743,8 @@
      switch(curChar)
      {
         case 79:
  -         if ((active0 & 0x2000000L) != 0L)
  -            return jjStartNfaWithStates_2(1, 25, 4);
  +         if ((active0 & 0x1000000L) != 0L)
  +            return jjStartNfaWithStates_2(1, 24, 4);
            break;
         default :
            break;
  @@ -757,8 +773,8 @@
                  case 0:
                     if ((0xfffffffeffffffffL & l) != 0L)
                     {
  -                     if (kind > 28)
  -                        kind = 28;
  +                     if (kind > 27)
  +                        kind = 27;
                        jjCheckNAdd(4);
                     }
                     if ((0x100002600L & l) != 0L)
  @@ -778,14 +794,14 @@
                        jjCheckNAddTwoStates(2, 3);
                     break;
                  case 3:
  -                  if (curChar == 34 && kind > 27)
  -                     kind = 27;
  +                  if (curChar == 34 && kind > 26)
  +                     kind = 26;
                     break;
                  case 4:
                     if ((0xfffffffeffffffffL & l) == 0L)
                        break;
  -                  if (kind > 28)
  -                     kind = 28;
  +                  if (kind > 27)
  +                     kind = 27;
                     jjCheckNAdd(4);
                     break;
                  default : break;
  @@ -803,12 +819,12 @@
                  case 4:
                     if ((0xffffffffdfffffffL & l) == 0L)
                        break;
  -                  if (kind > 28)
  -                     kind = 28;
  +                  if (kind > 27)
  +                     kind = 27;
                     jjCheckNAdd(4);
                     break;
                  case 2:
  -                  jjAddStates(15, 16);
  +                  jjAddStates(17, 18);
                     break;
                  default : break;
               }
  @@ -829,13 +845,13 @@
                  case 4:
                     if (!jjCanMove_0(hiByte, i1, i2, l1, l2))
                        break;
  -                  if (kind > 28)
  -                     kind = 28;
  +                  if (kind > 27)
  +                     kind = 27;
                     jjCheckNAdd(4);
                     break;
                  case 2:
                     if (jjCanMove_0(hiByte, i1, i2, l1, l2))
  -                     jjAddStates(15, 16);
  +                     jjAddStates(17, 18);
                     break;
                  default : break;
               }
  @@ -855,8 +871,8 @@
      }
   }
   static final int[] jjnextStates = {
  -   20, 23, 24, 27, 28, 25, 21, 23, 24, 25, 22, 26, 29, 15, 16, 2, 
  -   3, 0, 1, 
  +   22, 25, 26, 29, 30, 27, 23, 18, 19, 25, 26, 27, 24, 28, 31, 15, 
  +   16, 2, 3, 0, 1, 
   };
   private static final boolean jjCanMove_0(int hiByte, int i1, int i2, long l1, long 
l2)
   {
  @@ -872,8 +888,8 @@
   }
   public static final String[] jjstrLiteralImages = {
   "", null, null, null, null, null, null, null, null, null, "\53", "\55", "\50", 
  -"\51", "\72", "\136", null, null, "\176", null, null, null, "\133", "\173", null, 
  -"\124\117", "\135", null, null, "\124\117", "\175", null, null, };
  +"\51", "\72", "\136", null, null, null, null, null, "\133", "\173", null, 
"\124\117", 
  +"\135", null, null, "\124\117", "\175", null, null, };
   public static final String[] lexStateNames = {
      "Boost", 
      "RangeEx", 
  @@ -881,18 +897,18 @@
      "DEFAULT", 
   };
   public static final int[] jjnewLexState = {
  -   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, -1, -1, -1, -1, 
-1, -1, 2, 1, 3, 
  -   -1, 3, -1, -1, -1, 3, -1, -1, 
  +   -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, -1, -1, -1, -1, 
-1, 2, 1, 3, -1, 
  +   3, -1, -1, -1, 3, -1, -1, 
   };
   static final long[] jjtoToken = {
  -   0x1ffffff81L, 
  +   0xffffff81L, 
   };
   static final long[] jjtoSkip = {
      0x40L, 
   };
   protected CharStream input_stream;
  -private final int[] jjrounds = new int[31];
  -private final int[] jjstateSet = new int[62];
  +private final int[] jjrounds = new int[33];
  +private final int[] jjstateSet = new int[66];
   protected char curChar;
   public QueryParserTokenManager(CharStream stream)
   {
  @@ -914,7 +930,7 @@
   {
      int i;
      jjround = 0x80000001;
  -   for (i = 31; i-- > 0;)
  +   for (i = 33; i-- > 0;)
         jjrounds[i] = 0x80000000;
   }
   public void ReInit(CharStream stream, int lexState)
  
  
  
  1.49      +28 -11    
jakarta-lucene/src/java/org/apache/lucene/queryParser/QueryParser.jj
  
  Index: QueryParser.jj
  ===================================================================
  RCS file: 
/home/cvs/jakarta-lucene/src/java/org/apache/lucene/queryParser/QueryParser.jj,v
  retrieving revision 1.48
  retrieving revision 1.49
  diff -u -r1.48 -r1.49
  --- QueryParser.jj    8 Sep 2004 15:06:42 -0000       1.48
  +++ QueryParser.jj    14 Sep 2004 13:45:15 -0000      1.49
  @@ -96,6 +96,7 @@
     Analyzer analyzer;
     String field;
     int phraseSlop = 0;
  +  float fuzzyMinSim = FuzzyQuery.defaultMinSimilarity;
     Locale locale = Locale.getDefault();
   
     /** Parses a query string, returning a {@link 
org.apache.lucene.search.Query}.
  @@ -151,6 +152,19 @@
     public String getField() {
       return field;
     }
  +  
  +   /**
  +   * Get the default minimal similarity for fuzzy queries.
  +   */
  +  public float getFuzzyMinSim() {
  +      return fuzzyMinSim;
  +  }
  +  /**
  +   *Set the default minimum similarity for fuzzy queries.
  +   */
  +  public void setFuzzyMinSim(float fuzzyMinSim) {
  +      this.fuzzyMinSim = fuzzyMinSim;
  +  }
   
     /**
      * Sets the default slop for phrases.  If zero, then exact phrase matches
  @@ -439,10 +453,10 @@
      * @return Resulting {@link Query} built for the term
      * @exception ParseException throw in overridden method to disallow
      */
  -  protected Query getFuzzyQuery(String field, String termStr) throws ParseException
  +  protected Query getFuzzyQuery(String field, String termStr, float minSimilarity) 
throws ParseException
     {
       Term t = new Term(field, termStr);
  -    return new FuzzyQuery(t);
  +    return new FuzzyQuery(t, minSimilarity);
     }
   
     /**
  @@ -531,8 +545,7 @@
   | <CARAT:     "^" > : Boost
   | <QUOTED:     "\"" (~["\""])+ "\"">
   | <TERM:      <_TERM_START_CHAR> (<_TERM_CHAR>)*  >
  -| <FUZZY:     "~" >
  -| <SLOP:      "~" (<_NUM_CHAR>)+ >
  +| <FUZZY_SLOP:     "~" ( (<_NUM_CHAR>)+ ( "." (<_NUM_CHAR>)+ )? )? >
   | <PREFIXTERM:  <_TERM_START_CHAR> (<_TERM_CHAR>)* "*" >
   | <WILDTERM:  <_TERM_START_CHAR>
                 (<_TERM_CHAR> | ( [ "*", "?" ] ))* >
  @@ -641,7 +654,7 @@
   
   
   Query Term(String field) : {
  -  Token term, boost=null, slop=null, goop1, goop2;
  +  Token term, boost=null, fuzzySlop=null, goop1, goop2;
     boolean prefix = false;
     boolean wildcard = false;
     boolean fuzzy = false;
  @@ -656,8 +669,8 @@
          | term=<WILDTERM> { wildcard=true; }
          | term=<NUMBER>
        )
  -     [ <FUZZY> { fuzzy=true; } ]
  -     [ <CARAT> boost=<NUMBER> [ <FUZZY> { fuzzy=true; } ] ]
  +     [ fuzzySlop=<FUZZY_SLOP> { fuzzy=true; } ]
  +     [ <CARAT> boost=<NUMBER> [ fuzzySlop=<FUZZY_SLOP> { fuzzy=true; } ] ]
        {
          String termImage=discardEscapeChar(term.image);
          if (wildcard) {
  @@ -667,7 +680,11 @@
              discardEscapeChar(term.image.substring
             (0, term.image.length()-1)));
          } else if (fuzzy) {
  -         q = getFuzzyQuery(field, termImage);
  +               float fms = fuzzyMinSim;
  +               try {
  +            fms = Float.valueOf(fuzzySlop.image.substring(1)).floatValue();
  +               } catch (Exception ignored) { }
  +         q = getFuzzyQuery(field, termImage, fms);
          } else {
            q = getFieldQuery(field, termImage);
          }
  @@ -708,14 +725,14 @@
             q = getRangeQuery(field, goop1.image, goop2.image, false);
           }
        | term=<QUOTED>
  -       [ slop=<SLOP> ]
  +       [ fuzzySlop=<FUZZY_SLOP> ]
          [ <CARAT> boost=<NUMBER> ]
          {
            int s = phraseSlop;
   
  -         if (slop != null) {
  +         if (fuzzySlop != null) {
              try {
  -             s = Float.valueOf(slop.image.substring(1)).intValue();
  +             s = Float.valueOf(fuzzySlop.image.substring(1)).intValue();
              }
              catch (Exception ignored) { }
            }
  
  
  
  1.16      +36 -26    
jakarta-lucene/src/java/org/apache/lucene/queryParser/QueryParser.java
  
  Index: QueryParser.java
  ===================================================================
  RCS file: 
/home/cvs/jakarta-lucene/src/java/org/apache/lucene/queryParser/QueryParser.java,v
  retrieving revision 1.15
  retrieving revision 1.16
  diff -u -r1.15 -r1.16
  --- QueryParser.java  8 Sep 2004 15:06:42 -0000       1.15
  +++ QueryParser.java  14 Sep 2004 13:45:15 -0000      1.16
  @@ -73,6 +73,7 @@
     Analyzer analyzer;
     String field;
     int phraseSlop = 0;
  +  float fuzzyMinSim = FuzzyQuery.defaultMinSimilarity;
     Locale locale = Locale.getDefault();
   
     /** Parses a query string, returning a {@link 
org.apache.lucene.search.Query}.
  @@ -129,6 +130,19 @@
       return field;
     }
   
  +   /**
  +   * Get the default minimal similarity for fuzzy queries.
  +   */
  +  public float getFuzzyMinSim() {
  +      return fuzzyMinSim;
  +  }
  +  /**
  +   *Set the default minimum similarity for fuzzy queries.
  +   */
  +  public void setFuzzyMinSim(float fuzzyMinSim) {
  +      this.fuzzyMinSim = fuzzyMinSim;
  +  }
  +
     /**
      * Sets the default slop for phrases.  If zero, then exact phrase matches
      * are required.  Default value is zero.
  @@ -416,10 +430,10 @@
      * @return Resulting {@link Query} built for the term
      * @exception ParseException throw in overridden method to disallow
      */
  -  protected Query getFuzzyQuery(String field, String termStr) throws ParseException
  +  protected Query getFuzzyQuery(String field, String termStr, float minSimilarity) 
throws ParseException
     {
       Term t = new Term(field, termStr);
  -    return new FuzzyQuery(t);
  +    return new FuzzyQuery(t, minSimilarity);
     }
   
     /**
  @@ -622,7 +636,7 @@
     }
   
     final public Query Term(String field) throws ParseException {
  -  Token term, boost=null, slop=null, goop1, goop2;
  +  Token term, boost=null, fuzzySlop=null, goop1, goop2;
     boolean prefix = false;
     boolean wildcard = false;
     boolean fuzzy = false;
  @@ -654,9 +668,9 @@
           throw new ParseException();
         }
         switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
  -      case FUZZY:
  -        jj_consume_token(FUZZY);
  -                 fuzzy=true;
  +      case FUZZY_SLOP:
  +        fuzzySlop = jj_consume_token(FUZZY_SLOP);
  +                                fuzzy=true;
           break;
         default:
           jj_la1[8] = jj_gen;
  @@ -667,9 +681,9 @@
           jj_consume_token(CARAT);
           boost = jj_consume_token(NUMBER);
           switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
  -        case FUZZY:
  -          jj_consume_token(FUZZY);
  -                                          fuzzy=true;
  +        case FUZZY_SLOP:
  +          fuzzySlop = jj_consume_token(FUZZY_SLOP);
  +                                                         fuzzy=true;
             break;
           default:
             jj_la1[9] = jj_gen;
  @@ -688,7 +702,11 @@
              discardEscapeChar(term.image.substring
             (0, term.image.length()-1)));
          } else if (fuzzy) {
  -         q = getFuzzyQuery(field, termImage);
  +          float fms = fuzzyMinSim;
  +          try {
  +            fms = Float.valueOf(fuzzySlop.image.substring(1)).floatValue();
  +          } catch (Exception ignored) { }
  +         q = getFuzzyQuery(field, termImage, fms);
          } else {
            q = getFieldQuery(field, termImage);
          }
  @@ -809,8 +827,8 @@
       case QUOTED:
         term = jj_consume_token(QUOTED);
         switch ((jj_ntk==-1)?jj_ntk():jj_ntk) {
  -      case SLOP:
  -        slop = jj_consume_token(SLOP);
  +      case FUZZY_SLOP:
  +        fuzzySlop = jj_consume_token(FUZZY_SLOP);
           break;
         default:
           jj_la1[19] = jj_gen;
  @@ -827,9 +845,9 @@
         }
            int s = phraseSlop;
   
  -         if (slop != null) {
  +         if (fuzzySlop != null) {
              try {
  -             s = Float.valueOf(slop.image.substring(1)).intValue();
  +             s = Float.valueOf(fuzzySlop.image.substring(1)).intValue();
              }
              catch (Exception ignored) { }
            }
  @@ -883,16 +901,11 @@
     private int jj_gen;
     final private int[] jj_la1 = new int[22];
     static private int[] jj_la1_0;
  -  static private int[] jj_la1_1;
     static {
         jj_la1_0();
  -      jj_la1_1();
      }
      private static void jj_la1_0() {
  -      jj_la1_0 = new int[] 
{0x180,0x180,0xe00,0xe00,0x1f31f80,0x8000,0x1f31000,0x1320000,0x40000,0x40000,0x8000,0x18000000,0x2000000,0x18000000,0x8000,0x80000000,0x20000000,0x80000000,0x8000,0x80000,0x8000,0x1f30000,};
  -   }
  -   private static void jj_la1_1() {
  -      jj_la1_1 = new int[] 
{0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x0,0x1,0x0,0x0,0x0,0x0,};
  +      jj_la1_0 = new int[] 
{0x180,0x180,0xe00,0xe00,0xfb1f80,0x8000,0xfb1000,0x9a0000,0x40000,0x40000,0x8000,0xc000000,0x1000000,0xc000000,0x8000,0xc0000000,0x10000000,0xc0000000,0x8000,0x40000,0x8000,0xfb0000,};
      }
     final private JJCalls[] jj_2_rtns = new JJCalls[1];
     private boolean jj_rescan = false;
  @@ -1041,8 +1054,8 @@
   
     public ParseException generateParseException() {
       jj_expentries.removeAllElements();
  -    boolean[] la1tokens = new boolean[33];
  -    for (int i = 0; i < 33; i++) {
  +    boolean[] la1tokens = new boolean[32];
  +    for (int i = 0; i < 32; i++) {
         la1tokens[i] = false;
       }
       if (jj_kind >= 0) {
  @@ -1055,13 +1068,10 @@
             if ((jj_la1_0[i] & (1<<j)) != 0) {
               la1tokens[j] = true;
             }
  -          if ((jj_la1_1[i] & (1<<j)) != 0) {
  -            la1tokens[32+j] = true;
  -          }
           }
         }
       }
  -    for (int i = 0; i < 33; i++) {
  +    for (int i = 0; i < 32; i++) {
         if (la1tokens[i]) {
           jj_expentry = new int[1];
           jj_expentry[0] = i;
  
  
  
  1.2       +15 -17    
jakarta-lucene/src/java/org/apache/lucene/queryParser/QueryParserConstants.java
  
  Index: QueryParserConstants.java
  ===================================================================
  RCS file: 
/home/cvs/jakarta-lucene/src/java/org/apache/lucene/queryParser/QueryParserConstants.java,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -r1.1 -r1.2
  --- QueryParserConstants.java 11 Sep 2003 01:51:33 -0000      1.1
  +++ QueryParserConstants.java 14 Sep 2004 13:45:15 -0000      1.2
  @@ -20,21 +20,20 @@
     int CARAT = 15;
     int QUOTED = 16;
     int TERM = 17;
  -  int FUZZY = 18;
  -  int SLOP = 19;
  -  int PREFIXTERM = 20;
  -  int WILDTERM = 21;
  -  int RANGEIN_START = 22;
  -  int RANGEEX_START = 23;
  -  int NUMBER = 24;
  -  int RANGEIN_TO = 25;
  -  int RANGEIN_END = 26;
  -  int RANGEIN_QUOTED = 27;
  -  int RANGEIN_GOOP = 28;
  -  int RANGEEX_TO = 29;
  -  int RANGEEX_END = 30;
  -  int RANGEEX_QUOTED = 31;
  -  int RANGEEX_GOOP = 32;
  +  int FUZZY_SLOP = 18;
  +  int PREFIXTERM = 19;
  +  int WILDTERM = 20;
  +  int RANGEIN_START = 21;
  +  int RANGEEX_START = 22;
  +  int NUMBER = 23;
  +  int RANGEIN_TO = 24;
  +  int RANGEIN_END = 25;
  +  int RANGEIN_QUOTED = 26;
  +  int RANGEIN_GOOP = 27;
  +  int RANGEEX_TO = 28;
  +  int RANGEEX_END = 29;
  +  int RANGEEX_QUOTED = 30;
  +  int RANGEEX_GOOP = 31;
   
     int Boost = 0;
     int RangeEx = 1;
  @@ -60,8 +59,7 @@
       "\"^\"",
       "<QUOTED>",
       "<TERM>",
  -    "\"~\"",
  -    "<SLOP>",
  +    "<FUZZY_SLOP>",
       "<PREFIXTERM>",
       "<WILDTERM>",
       "\"[\"",
  
  
  
  1.6       +25 -5     jakarta-lucene/src/java/org/apache/lucene/search/FuzzyQuery.java
  
  Index: FuzzyQuery.java
  ===================================================================
  RCS file: 
/home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/FuzzyQuery.java,v
  retrieving revision 1.5
  retrieving revision 1.6
  diff -u -r1.5 -r1.6
  --- FuzzyQuery.java   13 Aug 2004 18:35:01 -0000      1.5
  +++ FuzzyQuery.java   14 Sep 2004 13:45:15 -0000      1.6
  @@ -25,11 +25,15 @@
    */
   public final class FuzzyQuery extends MultiTermQuery {
     
  +  public final static float defaultMinSimilarity = 0.5f;
     private float minimumSimilarity;
  +  private int prefixLength;
     
     /**
      * Create a new FuzzyQuery that will match terms with a similarity 
      * of at least <code>minimumSimilarity</code> to <code>term</code>.
  +   * If a <code>prefixLength</code> &gt; 0 is specified, a common prefix
  +   * of that length is also required.
      * 
      * @param term the term to search for
      * @param minimumSimilarity a value between 0 and 1 to set the required similarity
  @@ -37,29 +41,45 @@
      *  <code>minimumSimilarity</code> of <code>0.5</code> a term of the same length
      *  as the query term is considered similar to the query term if the edit distance
      *  between both terms is less than <code>length(term)*0.5</code>.
  +   * @param prefixLength length of common prefix.
      * @throws IllegalArgumentException if minimumSimilarity is &gt; 1 or &lt; 0
  +   * or if prefixLength &lt; 0 or &gt; <code>term.text().length()</code>.
      */
  -  public FuzzyQuery(Term term, float minimumSimilarity) throws 
IllegalArgumentException {
  +  public FuzzyQuery(Term term, float minimumSimilarity, int prefixLength) throws 
IllegalArgumentException {
       super(term);
  +    
       if (minimumSimilarity > 1.0f)
         throw new IllegalArgumentException("minimumSimilarity > 1");
       else if (minimumSimilarity < 0.0f)
         throw new IllegalArgumentException("minimumSimilarity < 0");
       this.minimumSimilarity = minimumSimilarity;
  +    
  +    if(prefixLength < 0)
  +        throw new IllegalArgumentException("prefixLength < 0");
  +    else if(prefixLength >= term.text().length())
  +        throw new IllegalArgumentException("prefixLength >= term.text().length()");
  +    this.prefixLength = prefixLength;
  +  }
  +  
  +  /**
  +   * Calls {@link #FuzzyQuery(Term, float) FuzzyQuery(term, minimumSimilarity, 0)}.
  +   */
  +  public FuzzyQuery(Term term, float minimumSimilarity) throws 
IllegalArgumentException {
  +      this(term, minimumSimilarity, 0);
     }
   
     /**
  -   * Calls {@link #FuzzyQuery(Term, float) FuzzyQuery(term, 0.5f)}.
  +   * Calls {@link #FuzzyQuery(Term, float) FuzzyQuery(term, 0.5f, 0)}.
      */
     public FuzzyQuery(Term term) {
  -    this(term, 0.5f);
  +    this(term, defaultMinSimilarity, 0);
     }
       
     protected FilteredTermEnum getEnum(IndexReader reader) throws IOException {
  -    return new FuzzyTermEnum(reader, getTerm(), minimumSimilarity);
  +    return new FuzzyTermEnum(reader, getTerm(), minimumSimilarity, prefixLength);
     }
       
     public String toString(String field) {
  -    return super.toString(field) + '~';
  +    return super.toString(field) + '~' + Float.toString(minimumSimilarity);
     }
   }
  
  
  
  1.8       +45 -4     
jakarta-lucene/src/java/org/apache/lucene/search/FuzzyTermEnum.java
  
  Index: FuzzyTermEnum.java
  ===================================================================
  RCS file: 
/home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/FuzzyTermEnum.java,v
  retrieving revision 1.7
  retrieving revision 1.8
  diff -u -r1.7 -r1.8
  --- FuzzyTermEnum.java        13 Aug 2004 18:35:01 -0000      1.7
  +++ FuzzyTermEnum.java        14 Sep 2004 13:45:15 -0000      1.8
  @@ -32,15 +32,49 @@
       String field = "";
       String text = "";
       int textlen;
  +    String prefix = "";
  +    int prefixLength = 0;
       float minimumSimilarity;
       double scale_factor;
       
       
  +    /**
  +     * Empty prefix and minSimilarity of 0.5f are used.
  +     * 
  +     * @param reader
  +     * @param term
  +     * @throws IOException
  +     * @see #FuzzyTermEnum(IndexReader, Term, float, int)
  +     */
       public FuzzyTermEnum(IndexReader reader, Term term) throws IOException {
  -      this(reader, term, 0.5f);
  +      this(reader, term, FuzzyQuery.defaultMinSimilarity, 0);
       }
       
  +    /**
  +     * This is the standard FuzzyTermEnum with an empty prefix.
  +     * 
  +     * @param reader
  +     * @param term
  +     * @param minSimilarity
  +     * @throws IOException
  +     * @see #FuzzyTermEnum(IndexReader, Term, float, int)
  +     */
       public FuzzyTermEnum(IndexReader reader, Term term, float minSimilarity) throws 
IOException {
  +      this(reader, term, minSimilarity, 0);
  +    }
  +    
  +    /**
  +     * Constructor for enumeration of all terms from specified <code>reader</code> 
which share a prefix of
  +     * length <code>prefixLength</code> with <code>term</code> and which have a 
fuzzy similarity &gt;
  +     * <code>minSimilarity</code>. 
  +     * 
  +     * @param reader Delivers terms.
  +     * @param term Pattern term.
  +     * @param minSimilarity Minimum required similarity for terms from the reader. 
Default value is 0.5f.
  +     * @param prefixLength Length of required common prefix. Default value is 0.
  +     * @throws IOException
  +     */
  +    public FuzzyTermEnum(IndexReader reader, Term term, float minSimilarity, int 
prefixLength) throws IOException {
           super();
           minimumSimilarity = minSimilarity;
           scale_factor = 1.0f / (1.0f - minimumSimilarity);
  @@ -48,7 +82,13 @@
           field = searchTerm.field();
           text = searchTerm.text();
           textlen = text.length();
  -        setEnum(reader.terms(new Term(searchTerm.field(), "")));
  +        if(prefixLength > 0 && prefixLength < textlen){
  +            this.prefixLength = prefixLength;
  +            prefix = text.substring(0, prefixLength);
  +            text = text.substring(prefixLength);
  +            textlen = text.length();
  +        }
  +        setEnum(reader.terms(new Term(searchTerm.field(), prefix)));
       }
       
       /**
  @@ -56,8 +96,9 @@
        calculate the distance between the given term and the comparing term. 
        */
       protected final boolean termCompare(Term term) {
  -        if (field == term.field()) {
  -            String target = term.text();
  +        String termText = term.text();
  +        if (field == term.field() && termText.startsWith(prefix)) {
  +            String target = termText.substring(prefixLength);
               int targetlen = target.length();
               int dist = editDistance(text, target, textlen, targetlen);
               distance = 1 - ((double)dist / (double)Math.min(textlen, targetlen));
  
  
  

---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]

Reply via email to