Brian, here is another idea for the query parser: add the ability to mark
terms as 'non-analyzed'.
For example
+body:xyz +folder:a.b.c.d
when 'folder' is a non-tokenized field, the query will not match if a.b.c.d
is tokenized.
A possible syntax may be
+body:xyz +folder:'a.b.c.d'
BTW, it would be great if the syntax of the query parser allowed
describing
any query that is supported by Lucene's standard classes. This would
provide a common language for
describing queries, as well as an alternative, more intuitive,
way to construct them.
Tal
> -----Original Message-----
> From: [EMAIL PROTECTED]
> [mailto:[EMAIL PROTECTED]]On Behalf Of Brian Goetz
> Sent: Wednesday, June 13, 2001 3:11 AM
> To: [EMAIL PROTECTED]
> Subject: [Lucene-dev] New QueryParser
>
>
> I think I've got the query parser overhauled. It addresses all the
> concerns that have been raised so far, plus a few others that people
> haven't yet raised (like what happens if a field name is also a stop
> word.)
>
> It accepts the same language as before (plus and minus, parens), plus
> AND, && -> both terms required
> OR, || -> default combination
> NOT, ! -> next term is prohibited
>
> and you can put a boost factor after the term with
> ^n.n (need digits both before and after the decimal)
>
> It is also savvy about the analyzer turning one term into more than one,
> or zero, and only applies the analyzer to the term text, not the entire
> query. Examples:
>
> a AND NOT b
> +a -b
> a b^2.0
> field1:a field2:b
> a -(c || d || e)
> a "b c d"
> a +"b c d"
>
> Here's the QueryParser.jj (total rewrite). I've also included my
> JUnit test case for it afterwards, as an example of some of the
> cases I've tested. If people would please try it out, and get me
> some feedback before I check it in, that would be helpful.
>
>
> ----- BEGIN QueryParser.jj
>
> // QueryParser.jj
> // Copyright (c) 1997-2001 Douglass R. Cutting.
> // Author: Brian Goetz
>
> options {
> STATIC= false;
> }
>
> PARSER_BEGIN(QueryParser)
>
> package com.lucene.queryParser;
>
> import java.util.Vector;
> import java.io.*;
> import com.lucene.index.Term;
> import com.lucene.analysis.*;
> import com.lucene.search.*;
>
> /**
> * This class is generated by JavaCC. The only method that
> clients should need
> * to call is <a href="#parse">parse()</a>.
> *
> * The syntax for query strings is as follows:
> * A Query is a series of clauses.
> * A clause may be prefixed by:
> * <ul>
> * <li> a plus (<code>+</code>) or a minus (<code>-</code>) sign,
> indicating
> * that the clause is required or prohibited respectively; or
> * <li> a term followed by a colon, indicating the field to be searched.
> * This enables one to construct queries which search multiple fields.
> * </ul>
> *
> * A clause may be either a:
> * <ul>
> * <li> a term, indicating all the documents that contain this term; or
> * <li> a nested query, enclosed in parentheses. Note that this
> may be used
> * with a <code>+</code>/<code>-</code> prefix to require any of a set of
> * terms.
> * </ul>
> *
> * Thus, in BNF, the query grammar is:
> * <pre>
> * Query ::= ( Clause )*
> * Clause ::= ["+", "-"] [<TERM> ":"] ( <TERM> | "(" Query ")" )
> * </pre>
> */
>
> public class QueryParser {
> /** Parses a query string, returning a
> * <a href="lucene.search.Query.html">Query</a>.
> * @param query the query string to be parsed.
> * @param field the default field for query terms.
> * @param analyzer used to find terms in the query text.
> */
> static public Query parse(String query, String field, Analyzer analyzer)
> throws ParseException {
> QueryParser parser = new QueryParser(new StringReader(query));
> return parser.Query(field);
> }
>
> Analyzer analyzer;
> String field;
> int phraseSlop = 0;
>
> /** Constructs a query parser.
> * @param field the default field for query terms.
> * @param analyzer used to find terms in the query text.
> */
> public QueryParser(String f, Analyzer a) {
> this(new StringReader(""));
> analyzer = a;
> field = f;
> }
>
> /** Parses a query string, returning a
> * <a href="lucene.search.Query.html">Query</a>.
> * @param query the query string to be parsed.
> */
> public Query parse(String query) throws ParseException {
> ReInit(new StringReader(query));
> return Query(field);
> }
>
> /** Sets the default slop for phrases. If zero, then exact
> phrase matches
> are required. Zero by default. */
> public void setPhraseSlop(int s) { phraseSlop = s; }
> /** Gets the default slop for phrases. */
> public int getPhraseSlop() { return phraseSlop; }
>
> private void addClause(Vector clauses, int conj, int mods,
> Query q) {
> boolean required, prohibited;
>
> // If this term is introduced by AND, make the preceding term
> required,
> // unless it's already prohibited
> if (conj == CONJ_AND) {
> BooleanClause c = (BooleanClause)
> clauses.elementAt(clauses.size()-1);
> if (!c.prohibited)
> c.required = true;
> }
>
> // We might have been passed a null query; the term might have been
> // filtered away by the analyzer.
> if (q == null)
> return;
>
> // We set REQUIRED if we're introduced by AND or +; PROHIBITED if
> // introduced by NOT or -; make sure not to set both.
> prohibited = (mods == MOD_NOT);
> required = (mods == MOD_REQ);
> if (conj == CONJ_AND && !prohibited)
> required = true;
> clauses.addElement(new BooleanClause(q, required, prohibited));
> }
>
> private Query getFieldQuery(String field, Analyzer analyzer,
> String queryText) {
> // Use the analyzer to get all the tokens, and then build a TermQuery,
> // PhraseQuery, or nothing based on the term count
>
> TokenStream source = analyzer.tokenStream(new
> StringReader(queryText));
> Vector v = new Vector();
> com.lucene.analysis.Token t;
>
> while (true) {
> try {
> t = source.next();
> }
> catch (IOException e) {
> t = null;
> }
> if (t == null)
> break;
> v.addElement(t.termText());
> }
> if (v.size() == 0)
> return null;
> else if (v.size() == 1)
> return new TermQuery(new Term(field, (String) v.elementAt(0)));
> else {
> PhraseQuery q = new PhraseQuery();
> q.setSlop(phraseSlop);
> for (int i=0; i<v.size(); i++) {
> q.add(new Term(field, (String) v.elementAt(i)));
> }
> return q;
> }
> }
>
> public static void main(String[] args) throws Exception {
> QueryParser qp = new QueryParser("field",
> new
> com.lucene.analysis.SimpleAnalyzer());
> Query q = qp.parse(args[0]);
> System.out.println(q.toString("field"));
> }
>
> private static final int CONJ_NONE = 0;
> private static final int CONJ_AND = 1;
> private static final int CONJ_OR = 2;
>
> private static final int MOD_NONE = 0;
> private static final int MOD_NOT = 10;
> private static final int MOD_REQ = 11;
> }
>
> PARSER_END(QueryParser)
>
> /* ***************** */
> /* Token Definitions */
> /* ***************** */
>
> <*> TOKEN : {
> <#_ALPHA_CHAR: ["a"-"z", "A"-"Z"] >
> | <#_NUM_CHAR: ["0"-"9"] >
> | <#_ALPHANUM_CHAR: [ "a"-"z", "A"-"Z", "0"-"9" ] >
> | <#_IDENTIFIER_CHAR: [ "a"-"z", "A"-"Z", "0"-"9", "_" ] >
> | <#_IDENTIFIER: <_ALPHA_CHAR> (<_IDENTIFIER_CHAR>)* >
> | <#_NEWLINE: ( "\r\n" | "\r" | "\n" ) >
> | <#_WHITESPACE: ( " " | "\t" ) >
> | <#_QCHAR: ( "\\" (<_NEWLINE> | ~["a"-"z", "A"-"Z", "0"-"9"] ) ) >
> | <#_RESTOFLINE: (~["\r", "\n"])* >
> }
>
> <DEFAULT> TOKEN : {
> <AND: ("AND" | "&&") >
> | <OR: ("OR" | "||") >
> | <NOT: ("NOT" | "!") >
> | <PLUS: "+" >
> | <MINUS: "-" >
> | <LPAREN: "(" >
> | <RPAREN: ")" >
> | <COLON: ":" >
> | <CARAT: "^" >
> | <QUOTED: "\"" (~["\""])+ "\"">
> | <NUMBER: (<_NUM_CHAR>)+ "." (<_NUM_CHAR>)+ >
> | <TERM: <_IDENTIFIER_CHAR>
> ( ~["\"", " ", "\t", "(", ")", ":", "&", "|", "^" ] )* >
> }
>
> <DEFAULT> SKIP : {
> <<_WHITESPACE>>
> }
>
> // * Query ::= ( Clause )*
> // * Clause ::= ["+", "-"] [<TERM> ":"] ( <TERM> | "(" Query ")" )
>
> int Conjunction() :
> int ret = CONJ_NONE;
> }
> {
>
> <AND> { ret = CONJ_AND; }
> | <OR> { ret = CONJ_OR; }
> ]
> { return ret; }
> }
>
> int Modifiers() :
> int ret = MOD_NONE;
> }
> {
>
> <PLUS> { ret = MOD_REQ; }
> | <MINUS> { ret = MOD_NOT; }
> | <NOT> { ret = MOD_NOT; }
> ]
> { return ret; }
> }
>
> Query Query(String field) :
> {
> Vector clauses = new Vector();
> Query q;
> int conj, mods;
> }
> {
> mods=Modifiers() q=Clause(field)
> { addClause(clauses, CONJ_NONE, mods, q); }
>
> conj=Conjunction() mods=Modifiers() q=Clause(field)
> { addClause(clauses, conj, mods, q); }
> )*
> {
> BooleanQuery query = new BooleanQuery();
> for (int i = 0; i < clauses.size(); i++)
> query.add((BooleanClause)clauses.elementAt(i));
> return query;
> }
> }
>
> Query Clause(String field) : {
> Query q;
> Token fieldToken=null;
> }
> {
> [
> LOOKAHEAD(2)
> fieldToken=<TERM> <COLON> { field = fieldToken.image; }
> ]
>
> (
> q=Term(field)
> | <LPAREN> q=Query(field) <RPAREN>
> )
> {
> return q;
> }
> }
>
>
> Query Term(String field) :
> Token term, boost=null;
> Query q;
> }
> {
>
> (term=<TERM>|term=<NUMBER>) [ <CARAT> boost=<NUMBER> ]
> { q = getFieldQuery(field, analyzer, term.image); }
> | term=<QUOTED>
> { q = getFieldQuery(field, analyzer,
> term.image.substring(1,
> term.image.length()-1)); }
> )
>
> if (boost != null) {
> float f = (float) 1.0;
> try
> f = Float.parseFloat(boost.image);
> }
> catch (Exception ignored) { }
>
> if (q instanceof TermQuery)
> ((TermQuery) q).setBoost(f);
> else if (q instanceof PhraseQuery)
> ((PhraseQuery) q).setBoost(f);
> }
> return q;
> }
> }
>
>
>
> ----- END
>
>
> ----- BEGIN TestQueryParser.java
>
> package com.lucene.queryParser;
>
> import java.io.*;
> import junit.framework.*;
>
> import com.lucene.*;
> import com.lucene.queryParser.*;
> import com.lucene.search.*;
> import com.lucene.analysis.*;
> import com.lucene.analysis.Token;
>
> public class TestQueryParser extends TestCase {
>
> public TestQueryParser(String name) {
> super(name);
> }
>
> public static Analyzer qpAnalyzer = new QPTestAnalyzer();
>
> public static class QPTestFilter extends TokenFilter {
>
> /**
> * Filter which discards the token 'stop' and which expands the
> * token 'phrase' into 'phrase1 phrase2'
> */
> public QPTestFilter(TokenStream in) {
> input = in;
> }
>
> boolean inPhrase = false;
> int savedStart=0, savedEnd=0;
>
> public Token next() throws IOException {
> if (inPhrase) {
> inPhrase = false;
> return new Token("phrase2", savedStart, savedEnd);
> }
> else
> for (Token token = input.next(); token != null; token =
> input.next())
> if (token.termText().equals("phrase")) {
> inPhrase = true;
> savedStart = token.startOffset();
> savedEnd = token.endOffset();
> return new Token("phrase1", savedStart, savedEnd);
> }
> else if (!token.termText().equals("stop"))
> return token;
> return null;
> }
> }
>
> public static class QPTestAnalyzer extends Analyzer {
>
> public QPTestAnalyzer() {
> }
>
> /** Filters LowerCaseTokenizer with StopFilter. */
> public final TokenStream tokenStream(Reader reader) {
> return new QPTestFilter(new LowerCaseTokenizer(reader));
> }
> }
>
> /**
> * initialize this TemplateTester by creating a WebMacro instance
> * and a default Context.
> */
> public void init () throws Exception
> {
> }
>
> public void assertQueryEquals(String query, Analyzer a, String result)
> throws Exception {
> if (a == null)
> a = new SimpleAnalyzer();
> QueryParser qp = new QueryParser("field", a);
> Query q = qp.parse(query);
> String s = q.toString("field");
> if (!s.equals(result)) {
> System.err.println("Query /" + query + "/ yielded /" + s
> + "/, expecting /" + result + "/");
> assert(false);
> }
> }
>
> public void testSimple() throws Exception {
> assertQueryEquals("term term term", null, "(term term term)");
> assertQueryEquals("term term1 term2", null, "(term term term)");
> assertQueryEquals("term 1.0 1 2", null, "(term)");
>
> assertQueryEquals("a AND b", null, "(+a +b)");
> assertQueryEquals("a AND NOT b", null, "(+a -b)");
> assertQueryEquals("a AND -b", null, "(+a -b)");
> assertQueryEquals("a AND !b", null, "(+a -b)");
> assertQueryEquals("a && b", null, "(+a +b)");
> assertQueryEquals("a&&b", null, "(+a +b)");
> assertQueryEquals("a && ! b", null, "(+a -b)");
>
> assertQueryEquals("a OR b", null, "(a b)");
> assertQueryEquals("a || b", null, "(a b)");
> assertQueryEquals("a OR !b", null, "(a -b)");
> assertQueryEquals("a OR ! b", null, "(a -b)");
> assertQueryEquals("a OR -b", null, "(a -b)");
>
> assertQueryEquals("+term -term term", null, "(+term -term term)");
> assertQueryEquals("foo:term AND field:anotherTerm", null,
> "(+foo:term +anotherterm)");
> assertQueryEquals("term AND \"phrase phrase\"", null,
> "(+term +\"phrase phrase\")");
>
> assertQueryEquals("germ term^2.0", null, "(germ term^2.0)");
> assertQueryEquals("term^2.0", null, "(term^2.0)");
> }
>
> public void testQPA() throws Exception {
> assertQueryEquals("term term term", qpAnalyzer, "(term term term)");
> assertQueryEquals("term +stop term", qpAnalyzer, "(term term)");
> assertQueryEquals("term -stop term", qpAnalyzer, "(term term)");
> assertQueryEquals("drop AND stop AND roll", qpAnalyzer,
> "(+drop +roll)");
> assertQueryEquals("term phrase term", qpAnalyzer,
> "(term \"phrase1 phrase2\" term)");
> assertQueryEquals("term AND NOT phrase term", qpAnalyzer,
> "(+term -\"phrase1 phrase2\" term)");
> assertQueryEquals("stop", qpAnalyzer, "()");
> }
> }
>
> ----- END
>
> _______________________________________________
> Lucene-dev mailing list
> [EMAIL PROTECTED]
> http://lists.sourceforge.net/lists/listinfo/lucene-dev
>
_______________________________________________
Lucene-dev mailing list
[EMAIL PROTECTED]
http://lists.sourceforge.net/lists/listinfo/lucene-dev