Brian, here is another idea for the query parser: add the ability to mark
terms as 'non-analyzed'.
For example
+body:xyz +folder:a.b.c.d
when 'folder' is a non-tokenized field, the query will not match if a.b.c.d
is tokenized.
A possible syntax may be
+body:xyz +folder:'a.b.c.d'
BTW, it would be great if the syntax of the query parser allowed
describing
any query that is supported by Lucene's standard classes. This would
provide a common language for
describing queries, as well as an alternative, more intuitive,
way to construct them.
Tal
> -----Original Message-----
> From: [EMAIL PROTECTED]
> [mailto:[EMAIL PROTECTED]]On Behalf Of Brian Goetz
> Sent: Wednesday, June 13, 2001 3:11 AM
> To: [EMAIL PROTECTED]
> Subject: [Lucene-dev] New QueryParser
>
>
> I think I've got the query parser overhauled. It addresses all the
> concerns that have been raised so far, plus a few others that people
> haven't yet raised (like what happens if a field name is also a stop
> word.)
>
> It accepts the same language as before (plus and minus, parens), plus
> AND, && -> both terms required
> OR, || -> default combination
> NOT, ! -> next term is prohibited
>
> and you can put a boost factor after the term with
> ^n.n (need digits both before and after the decimal)
>
> It is also savvy about the analyzer turning one term into more than one,
> or zero, and only applies the analyzer to the term text, not the entire
> query. Examples:
>
> a AND NOT b
> +a -b
> a b^2.0
> field1:a field2:b
> a -(c || d || e)
> a "b c d"
> a +"b c d"
>
> Here's the QueryParser.jj (total rewrite). I've also included my
> JUnit test case for it afterwards, as an example of some of the
> cases I've tested. If people would please try it out, and get me
> some feedback before I check it in, that would be helpful.
>
>
> ----- BEGIN QueryParser.jj
>
> // QueryParser.jj
> // Copyright (c) 1997-2001 Douglass R. Cutting.
> // Author: Brian Goetz
>
> options {
> STATIC= false;
> }
>
> PARSER_BEGIN(QueryParser)
>
> package com.lucene.queryParser;
>
> import java.util.Vector;
> import java.io.*;
> import com.lucene.index.Term;
> import com.lucene.analysis.*;
> import com.lucene.search.*;
>
> /**
> * This class is generated by JavaCC. The only method that
> clients should need
> * to call is <a href="#parse">parse()</a>.
> *
> * The syntax for query strings is as follows:
> * A Query is a series of clauses.
> * A clause may be prefixed by:
> * <ul>
> * <li> a plus (<code>+</code>) or a minus (<code>-</code>) sign,
> indicating
> * that the clause is required or prohibited respectively; or
> * <li> a term followed by a colon, indicating the field to be searched.
> * This enables one to construct queries which search multiple fields.
> * </ul>
> *
> * A clause may be either a:
> * <ul>
> * <li> a term, indicating all the documents that contain this term; or
> * <li> a nested query, enclosed in parentheses. Note that this
> may be used
> * with a <code>+</code>/<code>-</code> prefix to require any of a set of
> * terms.
> * </ul>
> *
> * Thus, in BNF, the query grammar is:
> * <pre>
> * Query ::= ( Clause )*
> * Clause ::= ["+", "-"] [<TERM> ":"] ( <TERM> | "(" Query ")" )
> * </pre>
> */
>
> public class QueryParser {
> /** Parses a query string, returning a
> * <a href="lucene.search.Query.html">Query</a>.
> * @param query the query string to be parsed.
> * @param field the default field for query terms.
> * @param analyzer used to find terms in the query text.
> */
> static public Query parse(String query, String field, Analyzer analyzer)
> throws ParseException {
> QueryParser parser = new QueryParser(new StringReader(query));
> return parser.Query(field);
> }
>
> Analyzer analyzer;
> String field;
> int phraseSlop = 0;
>
> /** Constructs a query parser.
> * @param field the default field for query terms.
> * @param analyzer used to find terms in the query text.
> */
> public QueryParser(String f, Analyzer a) {
> this(new StringReader(""));
> analyzer = a;
> field = f;
> }
>
> /** Parses a query string, returning a
> * <a href="lucene.search.Query.html">Query</a>.
> * @param query the query string to be parsed.
> */
> public Query parse(String query) throws ParseException {
> ReInit(new StringReader(query));
> return Query(field);
> }
>
> /** Sets the default slop for phrases. If zero, then exact
> phrase matches
> are required. Zero by default. */
> public void setPhraseSlop(int s) { phraseSlop = s; }
> /** Gets the default slop for phrases. */
> public int getPhraseSlop() { return phraseSlop; }
>
> private void addClause(Vector clauses, int conj, int mods,
> Query q) {
> boolean required, prohibited;
>
> // If this term is introduced by AND, make the preceding term
> required,
> // unless it's already prohibited
> if (conj == CONJ_AND) {
> BooleanClause c = (BooleanClause)
> clauses.elementAt(clauses.size()-1);
> if (!c.prohibited)
> c.required = true;
> }
>
> // We might have been passed a null query; the term might have been
> // filtered away by the analyzer.
> if (q == null)
> return;
>
> // We set REQUIRED if we're introduced by AND or +; PROHIBITED if
> // introduced by NOT or -; make sure not to set both.
> prohibited = (mods == MOD_NOT);
> required = (mods == MOD_REQ);
> if (conj == CONJ_AND && !prohibited)
> required = true;
> clauses.addElement(new BooleanClause(q, required, prohibited));
> }
>
> private Query getFieldQuery(String field, Analyzer analyzer,
> String queryText) {
> // Use the analyzer to get all the tokens, and then build a TermQuery,
> // PhraseQuery, or nothing based on the term count
>
> TokenStream source = analyzer.tokenStream(new
> StringReader(queryText));
> Vector v = new Vector();
> com.lucene.analysis.Token t;
>
> while (true) {
> try {
> t = source.next();
> }
> catch (IOException e) {
> t = null;
> }
> if (t == null)
> break;
> v.addElement(t.termText());
> }
> if (v.size() == 0)
> return null;
> else if (v.size() == 1)
> return new TermQuery(new Term(field, (String) v.elementAt(0)));
> else {
> PhraseQuery q = new PhraseQuery();
> q.setSlop(phraseSlop);
> for (int i=0; i<v.size(); i++) {
> q.add(new Term(field, (String) v.elementAt(i)));
> }
> return q;
> }
> }
>
> public static void main(String[] args) throws Exception {
> QueryParser qp = new QueryParser("field",
> new
> com.lucene.analysis.SimpleAnalyzer());
> Query q = qp.parse(args[0]);
> System.out.println(q.toString("field"));
> }
>
> private static final int CONJ_NONE = 0;
> private static final int CONJ_AND = 1;
> private static final int CONJ_OR = 2;
>
> private static final int MOD_NONE = 0;
> private static final int MOD_NOT = 10;
> private static final int MOD_REQ = 11;
> }
>
> PARSER_END(QueryParser)
>
> /* ***************** */
> /* Token Definitions */
> /* ***************** */
>
> <*> TOKEN : {
> <#_ALPHA_CHAR: ["a"-"z", "A"-"Z"] >
> | <#_NUM_CHAR: ["0"-"9"] >
> | <#_ALPHANUM_CHAR: [ "a"-"z", "A"-"Z", "0"-"9" ] >
> | <#_IDENTIFIER_CHAR: [ "a"-"z", "A"-"Z", "0"-"9", "_" ] >
> | <#_IDENTIFIER: <_ALPHA_CHAR> (<_IDENTIFIER_CHAR>)* >
> | <#_NEWLINE: ( "\r\n" | "\r" | "\n" ) >
> | <#_WHITESPACE: ( " " | "\t" ) >
> | <#_QCHAR: ( "\\" (<_NEWLINE> | ~["a"-"z", "A"-"Z", "0"-"9"] ) ) >
> | <#_RESTOFLINE: (~["\r", "\n"])* >
> }
>
> <DEFAULT> TOKEN : {
> <AND: ("AND" | "&&") >
> | <OR: ("OR" | "||") >
> | <NOT: ("NOT" | "!") >
> | <PLUS: "+" >
> | <MINUS: "-" >
> | <LPAREN: "(" >
> | <RPAREN: ")" >
> | <COLON: ":" >
> | <CARAT: "^" >
> | <QUOTED: "\"" (~["\""])+ "\"">
> | <NUMBER: (<_NUM_CHAR>)+ "." (<_NUM_CHAR>)+ >
> | <TERM: <_IDENTIFIER_CHAR>
> ( ~["\"", " ", "\t", "(", ")", ":", "&", "|", "^" ] )* >
> }
>
> <DEFAULT> SKIP : {
> <<_WHITESPACE>>
> }
>
> // * Query ::= ( Clause )*
> // * Clause ::= ["+", "-"] [<TERM> ":"] ( <TERM> | "(" Query ")" )
>
> int Conjunction() :
> int ret = CONJ_NONE;
> }
> {
>
> <AND> { ret = CONJ_AND; }
> | <OR> { ret = CONJ_OR; }
> ]
> { return ret; }
> }
>
> int Modifiers() :
> int ret = MOD_NONE;
> }
> {
>
> <PLUS> { ret = MOD_REQ; }
> | <MINUS> { ret = MOD_NOT; }
> | <NOT> { ret = MOD_NOT; }
> ]
> { return ret; }
> }
>
> Query Query(String field) :
> {
> Vector clauses = new Vector();
> Query q;
> int conj, mods;
> }
> {
> mods=Modifiers() q=Clause(field)
> { addClause(clauses, CONJ_NONE, mods, q); }
>
> conj=Conjunction() mods=Modifiers() q=Clause(field)
> { addClause(clauses, conj, mods, q); }
> )*
> {
> BooleanQuery query = new BooleanQuery();
> for (int i = 0; i < clauses.size(); i++)
> query.add((BooleanClause)clauses.elementAt(i));
> return query;
> }
> }
>
> Query Clause(String field) : {
> Query q;
> Token fieldToken=null;
> }
> {
> [
> LOOKAHEAD(2)
> fieldToken=<TERM> <COLON> { field = fieldToken.image; }
> ]
>
> (
> q=Term(field)
> | <LPAREN> q=Query(field) <RPAREN>
> )
> {
> return q;
> }
> }
>
>
> Query Term(String field) :
> Token term, boost=null;
> Query q;
> }
> {
>
> (term=<TERM>|term=<NUMBER>) [ <CARAT> boost=<NUMBER> ]
> { q = getFieldQuery(field, analyzer, term.image); }
> | term=<QUOTED>
> { q = getFieldQuery(field, analyzer,
> term.image.substring(1,
> term.image.length()-1)); }
> )
>
> if (boost != null) {
> float f = (float) 1.0;
> try
> f = Float.parseFloat(boost.image);
> }
> catch (Exception ignored) { }
>
> if (q instanceof TermQuery)
> ((TermQuery) q).setBoost(f);
> else if (q instanceof PhraseQuery)
> ((PhraseQuery) q).setBoost(f);
> }
> return q;
> }
> }
>
>
>
> ----- END
>
>
> ----- BEGIN TestQueryParser.java
>
> package com.lucene.queryParser;
>
> import java.io.*;
> import junit.framework.*;
>
> import com.lucene.*;
> import com.lucene.queryParser.*;
> import com.lucene.search.*;
> import com.lucene.analysis.*;
> import com.lucene.analysis.Token;
>
> public class TestQueryParser extends TestCase {
>
> public TestQueryParser(String name) {
> super(name);
> }
>
> public static Analyzer qpAnalyzer = new QPTestAnalyzer();
>
> public static class QPTestFilter extends TokenFilter {
>
> /**
> * Filter which discards the token 'stop' and which expands the
> * token 'phrase' into 'phrase1 phrase2'
> */
> public QPTestFilter(TokenStream in) {
> input = in;
> }
>
> boolean inPhrase = false;
> int savedStart=0, savedEnd=0;
>
> public Token next() throws IOException {
> if (inPhrase) {
> inPhrase = false;
> return new Token("phrase2", savedStart, savedEnd);
> }
> else
> for (Token token = input.next(); token != null; token =
> input.next())
> if (token.termText().equals("phrase")) {
> inPhrase = true;
> savedStart = token.startOffset();
> savedEnd = token.endOffset();
> return new Token("phrase1", savedStart, savedEnd);
> }
> else if (!token.termText().equals("stop"))
> return token;
> return null;
> }
> }
>
> public static class QPTestAnalyzer extends Analyzer {
>
> public QPTestAnalyzer() {
> }
>
> /** Filters LowerCaseTokenizer with StopFilter. */
> public final TokenStream tokenStream(Reader reader) {
> return new QPTestFilter(new LowerCaseTokenizer(reader));
> }
> }
>
> /**
> * initialize this TemplateTester by creating a WebMacro instance
> * and a default Context.
> */
> public void init () throws Exception
> {
> }
>
> public void assertQueryEquals(String query, Analyzer a, String result)
> throws Exception {
> if (a == null)
> a = new SimpleAnalyzer();
> QueryParser qp = new QueryParser("field", a);
> Query q = qp.parse(query);
> String s = q.toString("field");
> if (!s.equals(result)) {
> System.err.println("Query /" + query + "/ yielded /" + s
> + "/, expecting /" + result + "/");
> assert(false);
> }
> }
>
> public void testSimple() throws Exception {
> assertQueryEquals("term term term", null, "(term term term)");
> assertQueryEquals("term term1 term2", null, "(term term term)");
> assertQueryEquals("term 1.0 1 2", null, "(term)");
>
> assertQueryEquals("a AND b", null, "(+a +b)");
> assertQueryEquals("a AND NOT b", null, "(+a -b)");
> assertQueryEquals("a AND -b", null, "(+a -b)");
> assertQueryEquals("a AND !b", null, "(+a -b)");
> assertQueryEquals("a && b", null, "(+a +b)");
> assertQueryEquals("a&&b", null, "(+a +b)");
> assertQueryEquals("a && ! b", null, "(+a -b)");
>
> assertQueryEquals("a OR b", null, "(a b)");
> assertQueryEquals("a || b", null, "(a b)");
> assertQueryEquals("a OR !b", null, "(a -b)");
> assertQueryEquals("a OR ! b", null, "(a -b)");
> assertQueryEquals("a OR -b", null, "(a -b)");
>
> assertQueryEquals("+term -term term", null, "(+term -term term)");
> assertQueryEquals("foo:term AND field:anotherTerm", null,
> "(+foo:term +anotherterm)");
> assertQueryEquals("term AND \"phrase phrase\"", null,
> "(+term +\"phrase phrase\")");
>
> assertQueryEquals("germ term^2.0", null, "(germ term^2.0)");
> assertQueryEquals("term^2.0", null, "(term^2.0)");
> }
>
> public void testQPA() throws Exception {
> assertQueryEquals("term term term", qpAnalyzer, "(term term term)");
> assertQueryEquals("term +stop term", qpAnalyzer, "(term term)");
> assertQueryEquals("term -stop term", qpAnalyzer, "(term term)");
> assertQueryEquals("drop AND stop AND roll", qpAnalyzer,
> "(+drop +roll)");
> assertQueryEquals("term phrase term", qpAnalyzer,
> "(term \"phrase1 phrase2\" term)");
> assertQueryEquals("term AND NOT phrase term", qpAnalyzer,
> "(+term -\"phrase1 phrase2\" term)");
> assertQueryEquals("stop", qpAnalyzer, "()");
> }
> }
>
> ----- END
>
> _______________________________________________
> Lucene-dev mailing list
> [EMAIL PROTECTED]
> http://lists.sourceforge.net/lists/listinfo/lucene-dev
>
_______________________________________________
Lucene-dev mailing list
[EMAIL PROTECTED]
http://lists.sourceforge.net/lists/listinfo/lucene-dev