dnaber 2004/11/13 07:11:26 Modified: src/java/org/apache/lucene/queryParser QueryParser.java QueryParser.jj . CHANGES.txt Log: make QueryParser work with analyzers that return more than one token per position PR: 23307 Submitted by: Pierrick Brihaye Revision Changes Path 1.21 +52 -11 jakarta-lucene/src/java/org/apache/lucene/queryParser/QueryParser.java Index: QueryParser.java =================================================================== RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/queryParser/QueryParser.java,v retrieving revision 1.20 retrieving revision 1.21 diff -u -r1.20 -r1.21 --- QueryParser.java 15 Oct 2004 19:47:43 -0000 1.20 +++ QueryParser.java 13 Nov 2004 15:11:26 -0000 1.21 @@ -292,10 +292,11 @@ // Use the analyzer to get all the tokens, and then build a TermQuery, // PhraseQuery, or nothing based on the term count - TokenStream source = analyzer.tokenStream(field, - new StringReader(queryText)); + TokenStream source = analyzer.tokenStream(field, new StringReader(queryText)); Vector v = new Vector(); org.apache.lucene.analysis.Token t; + int positionCount = 0; + boolean severalTokensAtSamePosition = false; while (true) { try { @@ -306,7 +307,11 @@ } if (t == null) break; - v.addElement(t.termText()); + v.addElement(t); + if (t.getPositionIncrement() == 1) + positionCount++; + else + severalTokensAtSamePosition = true; } try { source.close(); @@ -317,15 +322,48 @@ if (v.size() == 0) return null; - else if (v.size() == 1) - return new TermQuery(new Term(field, (String) v.elementAt(0))); - else { - PhraseQuery q = new PhraseQuery(); - q.setSlop(phraseSlop); - for (int i=0; i<v.size(); i++) { - q.add(new Term(field, (String) v.elementAt(i))); + else if (v.size() == 1) { + t = (org.apache.lucene.analysis.Token) v.elementAt(0); + return new TermQuery(new Term(field, t.termText())); + } else { + if (severalTokensAtSamePosition) { + if (positionCount == 1) { + // no phrase query: + BooleanQuery q = new BooleanQuery(); + for (int i = 0; i < v.size(); i++) { + t = (org.apache.lucene.analysis.Token) v.elementAt(i); + TermQuery currentQuery = new TermQuery( + new Term(field, t.termText())); + q.add(currentQuery, BooleanClause.Occur.SHOULD); + } + return q; + } + else { + // phrase query: + MultiPhraseQuery mpq = new MultiPhraseQuery(); + List multiTerms = new ArrayList(); + for (int i = 0; i < v.size(); i++) { + t = (org.apache.lucene.analysis.Token) v.elementAt(i); + if (t.getPositionIncrement() == 1 && multiTerms.size() > 0) { + mpq.add((Term[])multiTerms.toArray(new Term[0])); + multiTerms.clear(); + } + multiTerms.add(new Term(field, t.termText())); + } + mpq.add((Term[])multiTerms.toArray(new Term[0])); + return mpq; + } + } + else { + PhraseQuery q = new PhraseQuery(); + q.setSlop(phraseSlop); + for (int i = 0; i < v.size(); i++) { + q.add(new Term(field, ((org.apache.lucene.analysis.Token) + v.elementAt(i)).termText())); + + } + return q; } - return q; } } @@ -356,6 +394,9 @@ if (query instanceof PhraseQuery) { ((PhraseQuery) query).setSlop(slop); + } + if (query instanceof MultiPhraseQuery) { + ((MultiPhraseQuery) query).setSlop(slop); } return query; 1.55 +52 -11 jakarta-lucene/src/java/org/apache/lucene/queryParser/QueryParser.jj Index: QueryParser.jj =================================================================== RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/queryParser/QueryParser.jj,v retrieving revision 1.54 retrieving revision 1.55 diff -u -r1.54 -r1.55 --- QueryParser.jj 15 Oct 2004 19:47:43 -0000 1.54 +++ QueryParser.jj 13 Nov 2004 15:11:26 -0000 1.55 @@ -315,10 +315,11 @@ // Use the analyzer to get all the tokens, and then build a TermQuery, // PhraseQuery, or nothing based on the term count - TokenStream source = analyzer.tokenStream(field, - new StringReader(queryText)); + TokenStream source = analyzer.tokenStream(field, new StringReader(queryText)); Vector v = new Vector(); org.apache.lucene.analysis.Token t; + int positionCount = 0; + boolean severalTokensAtSamePosition = false; while (true) { try { @@ -329,7 +330,11 @@ } if (t == null) break; - v.addElement(t.termText()); + v.addElement(t); + if (t.getPositionIncrement() == 1) + positionCount++; + else + severalTokensAtSamePosition = true; } try { source.close(); @@ -340,15 +345,48 @@ if (v.size() == 0) return null; - else if (v.size() == 1) - return new TermQuery(new Term(field, (String) v.elementAt(0))); - else { - PhraseQuery q = new PhraseQuery(); - q.setSlop(phraseSlop); - for (int i=0; i<v.size(); i++) { - q.add(new Term(field, (String) v.elementAt(i))); + else if (v.size() == 1) { + t = (org.apache.lucene.analysis.Token) v.elementAt(0); + return new TermQuery(new Term(field, t.termText())); + } else { + if (severalTokensAtSamePosition) { + if (positionCount == 1) { + // no phrase query: + BooleanQuery q = new BooleanQuery(); + for (int i = 0; i < v.size(); i++) { + t = (org.apache.lucene.analysis.Token) v.elementAt(i); + TermQuery currentQuery = new TermQuery( + new Term(field, t.termText())); + q.add(currentQuery, BooleanClause.Occur.SHOULD); + } + return q; + } + else { + // phrase query: + MultiPhraseQuery mpq = new MultiPhraseQuery(); + List multiTerms = new ArrayList(); + for (int i = 0; i < v.size(); i++) { + t = (org.apache.lucene.analysis.Token) v.elementAt(i); + if (t.getPositionIncrement() == 1 && multiTerms.size() > 0) { + mpq.add((Term[])multiTerms.toArray(new Term[0])); + multiTerms.clear(); + } + multiTerms.add(new Term(field, t.termText())); + } + mpq.add((Term[])multiTerms.toArray(new Term[0])); + return mpq; + } + } + else { + PhraseQuery q = new PhraseQuery(); + q.setSlop(phraseSlop); + for (int i = 0; i < v.size(); i++) { + q.add(new Term(field, ((org.apache.lucene.analysis.Token) + v.elementAt(i)).termText())); + + } + return q; } - return q; } } @@ -379,6 +417,9 @@ if (query instanceof PhraseQuery) { ((PhraseQuery) query).setSlop(slop); + } + if (query instanceof MultiPhraseQuery) { + ((MultiPhraseQuery) query).setSlop(slop); } return query; 1.121 +9 -2 jakarta-lucene/CHANGES.txt Index: CHANGES.txt =================================================================== RCS file: /home/cvs/jakarta-lucene/CHANGES.txt,v retrieving revision 1.120 retrieving revision 1.121 diff -u -r1.120 -r1.121 --- CHANGES.txt 7 Nov 2004 23:31:16 -0000 1.120 +++ CHANGES.txt 13 Nov 2004 15:11:26 -0000 1.121 @@ -12,7 +12,7 @@ 2. FuzzyQuery now takes an additional parameter that specifies the minimum similarity that is required for a term to match the query. The QueryParser syntax for this is term~x, where x is a floating - point number between 0 and 1 (a bigger number means that a higher + point number >= 0 and < 1 (a bigger number means that a higher similarity is required). Furthermore, a prefix can be specified for FuzzyQuerys so that only those terms are considered similar that start with this prefix. This can speed up FuzzyQuery greatly. @@ -112,6 +112,13 @@ 24. Optimize fuzzy queries so the standard fuzzy queries with a prefix of 0 now run 20-50% faster (Patch #31882). (Jonathan Hager via Daniel Naber) + +25. QueryParser now correctly works with Analyzers that can return more + than one token per position. For example, a query "+fast +car" + would be parsed as "+fast +(car automobile)" if the Analyzer + returns "car" and "automobile" at the same position whenever it + finds "car" (Patch #23307). + (Pierrick Brihaye, Daniel Naber) 1.4.1
--------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]