[PATCH] Refactoring QueryParser.jj, setLowercaseWildcardTerms()

Tatu Saloranta Tue, 11 Feb 2003 20:38:40 -0800

(since this is the first patch I've done to Lucene, let me know if there are 
some preferences or if I missed something...)


I did some small refactoring of QueryParser.jj, to make it easier to 
create subclasses that override the actual query building (without modifying 
the actual parser part). I'll probably try to create my own MultiFieldQueryParser 
with some changes (different weights for different fields, not necessarily 
duplicating the whole query etc.), and these changes should make it possible 
to do that without touching QueryParser.jj itself.

I also added a simple improvement to prefix/wildcard query handling; it is now 
possible to set a property to enable automatic lowercasing of prefix/wildcard 
terms (the default is false, to stay backwards compatible). This is not a complete 
solution to the wildcard query problems, but it should help a bit.

I ran unit tests (without errors), but I wasn't sure if there are unit tests 
for QueryParser. Hopefully I didn't break anything; changes are fairly minor.

Let me know if the proposed changes make sense,

-+ Tatu +-

--- QueryParser.jj.orig 2003-02-11 21:25:19.000000000 -0700
+++ QueryParser.jj      2003-02-11 21:29:03.000000000 -0700
@@ -129,6 +129,12 @@
   Analyzer analyzer;
   String field;
   int phraseSlop = 0;
+  /**
+   * Whether terms of wildcard and prefix queries are to be automatically
+   * lowercased or not. Default is false (for backwards compatibility)
+   * to indicate those terms are not modified in any way.
+   */
+  boolean lowercaseWildcardTerms = false;
 
   /** Constructs a query parser.
    *  @param field     the default field for query terms.
@@ -181,6 +187,12 @@
        return this.operator;
     }
 
+    public void setLowercaseWildcardTerms(boolean b) { // true: lowercase wildcard/prefix terms
+       lowercaseWildcardTerms = b;
+    }
+
+    public boolean getLowercaseWildcardTerms() { return lowercaseWildcardTerms; } // current setting
+
   private void addClause(Vector clauses, int conj, int mods, Query q) {
     boolean required, prohibited;
 
@@ -291,6 +303,103 @@
                           inclusive);
   }
 
+  /**
+   * Factory method for generating a query from a set of clauses.
+   * By default creates a {@link BooleanQuery} composed of the clauses passed in.
+   *<p>
+   * Can be overridden by extending classes to modify the query being
+   * returned.
+   *
+   * @param clauses Vector that contains {@link BooleanClause} instances
+   *    to join; each element is cast to {@link BooleanClause}.
+   *
+   * @return Resulting {@link Query} object.
+   */
+  protected Query getBooleanQuery(Vector clauses)
+  {
+    BooleanQuery query = new BooleanQuery();
+    for (int i = 0; i < clauses.size(); i++) {
+       query.add((BooleanClause)clauses.elementAt(i));
+    }
+    return query;
+  }
+
+  /**
+   * Factory method for generating a query. Called when parser
+   * parses an input term token that contains one or more wildcard
+   * characters (? and *), but is not a prefix term token (one
+   * that has just a single * character at the end).
+   *<p>
+   * If {@link #setLowercaseWildcardTerms lowercasing} is enabled, the
+   * wildcard term is lower-cased automatically. It will not go through
+   * the default analyzer, however, since normal analyzers are unlikely
+   * to work properly with wildcard templates.
+   *<p>
+   * Can be overridden by extending classes, to provide custom handling for
+   * wildcard queries (which may be necessary due to missing analyzer calls).
+   *
+   * @param field Name of the field query will use.
+   * @param termStr Term token that contains one or more wildcard
+   *   characters (? or *), but is not a simple prefix term
+   *
+   * @return Resulting query built for the term
+   */
+  protected Query getWildcardQuery(String field, String termStr)
+  {
+    if (lowercaseWildcardTerms) {
+       termStr = termStr.toLowerCase(); // NOTE(review): default-locale lowercasing - confirm it matches how indexed terms were lower-cased
+    }
+    Term t = new Term(field, termStr);
+    return new WildcardQuery(t);
+  }
+
+  /**
+   * Factory method for generating a query (similar to
+   * {@link #getWildcardQuery}). Called when parser parses an input term
+   * token that uses prefix notation; that is, contains a single '*'
+   * wildcard character as its last character. Since this is a special case
+   * of the generic wildcard term, and such a query can be optimized easily,
+   * this results in a different query object ({@link PrefixQuery} by default).
+   *<p>
+   * If {@link #setLowercaseWildcardTerms lowercasing} is enabled, the
+   * prefix term is lower-cased automatically. It will not go through
+   * the default analyzer, however, since normal analyzers are unlikely
+   * to work properly with wildcard templates.
+   *<p>
+   * Can be overridden by extending classes, to provide custom handling for
+   * wildcard queries (which may be necessary due to missing analyzer calls).
+   *
+   * @param field Name of the field query will use.
+   * @param termStr Term token to use for building term for the query
+   *    (<b>without</b> trailing '*' character!)
+   *
+   * @return Resulting query built for the term
+   */
+  protected Query getPrefixQuery(String field, String termStr)
+  {
+    if (lowercaseWildcardTerms) {
+       termStr = termStr.toLowerCase(); // NOTE(review): default-locale lowercasing - confirm it matches how indexed terms were lower-cased
+    }
+    Term t = new Term(field, termStr);
+    return new PrefixQuery(t);
+  }
+
+  /**
+   * Factory method for generating a query (similar to
+   * {@link #getWildcardQuery}). Called when parser parses
+   * an input term token that has the fuzzy suffix (~) appended.
+   *
+   * @param field Name of the field query will use.
+   * @param termStr Term token to use for building term for the query (used unmodified)
+   *
+   * @return Resulting query built for the term
+   */
+  protected Query getFuzzyQuery(String field, String termStr)
+  {
+    Term t = new Term(field, termStr);
+    return new FuzzyQuery(t);
+  }
+
   public static void main(String[] args) throws Exception {
     QueryParser qp = new QueryParser("field",
                            new org.apache.lucene.analysis.SimpleAnalyzer());
@@ -423,10 +532,7 @@
       if (clauses.size() == 1 && firstQuery != null)
         return firstQuery;
       else {
-        BooleanQuery query = new BooleanQuery();
-        for (int i = 0; i < clauses.size(); i++)
-         query.add((BooleanClause)clauses.elementAt(i));
-        return query;
+       return getBooleanQuery(clauses);
       }
     }
 }
@@ -478,15 +584,16 @@
      [ <FUZZY> { fuzzy=true; } ]
      [ <CARAT> boost=<NUMBER> [ <FUZZY> { fuzzy=true; } ] ]
      {
-       if (wildcard)
-         q = new WildcardQuery(new Term(field, term.image));
-       else if (prefix)
-         q = new PrefixQuery(new Term(field, term.image.substring
-                                      (0, term.image.length()-1)));
-       else if (fuzzy)
-         q = new FuzzyQuery(new Term(field, term.image));
-       else
+       if (wildcard) {
+        q = getWildcardQuery(field, term.image);
+       } else if (prefix) {
+         q = getPrefixQuery(field, term.image.substring
+                           (0, term.image.length()-1));
+       } else if (fuzzy) {
+         q = getFuzzyQuery(field, term.image);
+       } else {
          q = getFieldQuery(field, analyzer, term.image);
+       }
      }
      | ( <RANGEIN_START> ( goop1=<RANGEIN_GOOP>|goop1=<RANGEIN_QUOTED> )
          [ <RANGEIN_TO> ] ( goop2=<RANGEIN_GOOP>|goop2=<RANGEIN_QUOTED> )
@@ -533,7 +640,11 @@
       try {
         f = Float.valueOf(boost.image).floatValue();
       }
-      catch (Exception ignored) { }
+      catch (Exception ignored) {
+         /* Should this be handled somehow? (defaults to "no boost", if
+          * boost number is invalid)
+          */
+      }
 
       // avoid boosting null queries, such as those caused by stop words
       if (q != null) {

---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]

[PATCH] Refactoring QueryParser.jj, setLowercaseWildcardTerms()

Reply via email to