Author: alexparvulescu Date: Mon Dec 9 20:23:43 2013 New Revision: 1549667
URL: http://svn.apache.org/r1549667 Log: OAK-1270 Revisit full-text queries in case of multiple tokens - introduced a MultiPhraseQuery for the multiple token case - fixed a bug in the tokenizer Modified: jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndex.java jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexQueryTest.java jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexTest.java Modified: jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndex.java URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndex.java?rev=1549667&r1=1549666&r2=1549667&view=diff ============================================================================== --- jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndex.java (original) +++ jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndex.java Mon Dec 9 20:23:43 2013 @@ -73,10 +73,13 @@ import org.apache.lucene.index.Directory import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.MultiFields; import org.apache.lucene.index.Term; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.search.MultiPhraseQuery; import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.PrefixQuery; import org.apache.lucene.search.Query; @@ -87,6 +90,9 @@ import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.WildcardQuery; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.CompiledAutomaton; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -416,7 +422,7 @@ public class LuceneIndex implements Full // when using the LowCostLuceneIndexProvider // which is used for testing } else { - qs.add(getFullTextQuery(ft, analyzer)); + qs.add(getFullTextQuery(ft, analyzer, reader)); } if (nonFullTextConstraints) { addNonFullTextConstraints(qs, filter, reader); @@ -582,7 +588,7 @@ public class LuceneIndex implements Full qs.add(bq); } - static Query getFullTextQuery(FullTextExpression ft, final Analyzer analyzer) { + static Query getFullTextQuery(FullTextExpression ft, final Analyzer analyzer, final IndexReader reader) { // a reference to the query, so it can be set in the visitor // (a "non-local return") final AtomicReference<Query> result = new AtomicReference<Query>(); @@ -592,7 +598,7 @@ public class LuceneIndex implements Full public boolean visit(FullTextOr or) { BooleanQuery q = new BooleanQuery(); for (FullTextExpression e : or.list) { - Query x = getFullTextQuery(e, analyzer); + Query x = getFullTextQuery(e, analyzer, reader); q.add(x, SHOULD); } result.set(q); @@ -603,7 +609,7 @@ public class LuceneIndex implements Full public boolean visit(FullTextAnd and) { BooleanQuery q = new BooleanQuery(); for (FullTextExpression e : and.list) { - Query x = getFullTextQuery(e, analyzer); + Query x = getFullTextQuery(e, analyzer, reader); // Lucene can't deal with "must(must_not(x))" if (x instanceof BooleanQuery) { BooleanQuery bq = (BooleanQuery) x; @@ -625,7 +631,7 @@ public class LuceneIndex implements Full // do not add constraints on child nodes properties p = "*"; } - Query q = tokenToQuery(term.getText(), analyzer); + Query q = tokenToQuery(term.getText(), analyzer, reader); if (q == null) { return false; } @@ -646,7 +652,7 @@ public class LuceneIndex implements Full return result.get(); } - static Query tokenToQuery(String text, Analyzer analyzer) { + static Query tokenToQuery(String text, Analyzer analyzer, IndexReader reader) { if (analyzer == null) { return null; } @@ -657,29 +663,76 @@ public class LuceneIndex implements Full // TODO what should be returned in the case there are no tokens? return new BooleanQuery(); } - if (tokens.size() == 1) { - text = tokens.iterator().next(); - boolean hasFulltextToken = false; - for (char c : fulltextTokens) { - if (text.indexOf(c) != -1) { - hasFulltextToken = true; - break; + String token = tokens.iterator().next(); + if (hasFulltextToken(token)) { + return new WildcardQuery(newFulltextTerm(token)); + } else { + return new TermQuery(newFulltextTerm(token)); + } + } else { + if (hasFulltextToken(tokens)) { + MultiPhraseQuery mpq = new MultiPhraseQuery(); + for(String token: tokens){ + if (hasFulltextToken(token)) { + Term[] terms = extractMatchingTokens(reader, token); + if (terms != null && terms.length > 0) { + mpq.add(terms); + } + } else { + mpq.add(newFulltextTerm(token)); + } } + return mpq; + } else { + PhraseQuery pq = new PhraseQuery(); + for (String t : tokens) { + pq.add(newFulltextTerm(t)); + } + return pq; } + } + } - if (hasFulltextToken) { - return new WildcardQuery(newFulltextTerm(text)); - } else { - return new TermQuery(newFulltextTerm(text)); + private static Term[] extractMatchingTokens(IndexReader reader, String token) { + if (reader == null) { + // getPlan call + return null; + } + + try { + List<Term> terms = new ArrayList<Term>(); + Terms t = MultiFields.getTerms(reader, FieldNames.FULLTEXT); + Automaton a = WildcardQuery.toAutomaton(newFulltextTerm(token)); + CompiledAutomaton ca = new CompiledAutomaton(a); + TermsEnum te = ca.getTermsEnum(t); + BytesRef text; + while ((text = te.next()) != null) { + terms.add(newFulltextTerm(text.utf8ToString())); } - } else { - PhraseQuery pq = new PhraseQuery(); - for (String t : tokens) { - pq.add(newFulltextTerm(t)); + return terms.toArray(new Term[terms.size()]); + } catch (IOException e) { + LOG.error("Building fulltext query failed", e.getMessage()); + return null; + } + } + + private static boolean hasFulltextToken(List<String> tokens) { + for (String token : tokens) { + if (hasFulltextToken(token)) { + return true; + } + } + return false; + } + + private static boolean hasFulltextToken(String token) { + for (char c : fulltextTokens) { + if (token.indexOf(c) != -1) { + return true; } - return pq; } + return false; } private static char[] fulltextTokens = new char[] { '*', '?' }; @@ -727,6 +780,7 @@ public class LuceneIndex implements Full poz = end; if (hasFulltextToken) { token.append(term); + hasFulltextToken = false; } else { if (token.length() > 0) { tokens.add(token.toString()); Modified: jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexQueryTest.java URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexQueryTest.java?rev=1549667&r1=1549666&r2=1549667&view=diff ============================================================================== --- jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexQueryTest.java (original) +++ jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexQueryTest.java Mon Dec 9 20:23:43 2013 @@ -148,6 +148,21 @@ public class LuceneIndexQueryTest extend } @Test + public void containsDash() throws Exception { + Tree test = root.getTree("/").addChild("test"); + test.addChild("a").setProperty("name", "hello-wor"); + test.addChild("b").setProperty("name", "hello-world"); + test.addChild("c").setProperty("name", "hello"); + root.commit(); + + assertQuery("/jcr:root//*[jcr:contains(., 'hello-wor*')]", "xpath", + ImmutableList.of("/test/a", "/test/b")); + assertQuery("/jcr:root//*[jcr:contains(., '*hello-wor*')]", "xpath", + ImmutableList.of("/test/a", "/test/b")); + + } + + @Test public void containsPath() throws Exception { Tree test = root.getTree("/").addChild("test"); Modified: jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexTest.java URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexTest.java?rev=1549667&r1=1549666&r2=1549667&view=diff ============================================================================== --- jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexTest.java (original) +++ jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexTest.java Mon Dec 9 20:23:43 2013 @@ -162,6 +162,15 @@ public class LuceneIndexTest { LuceneIndex.tokenize("first. second", analyzer)); assertEquals(ImmutableList.of("first", "second"), LuceneIndex.tokenize("first.second", analyzer)); + + assertEquals(ImmutableList.of("hello", "world"), + LuceneIndex.tokenize("hello-world", analyzer)); + assertEquals(ImmutableList.of("hello", "wor*"), + LuceneIndex.tokenize("hello-wor*", analyzer)); + assertEquals(ImmutableList.of("*llo", "world"), + LuceneIndex.tokenize("*llo-world", analyzer)); + assertEquals(ImmutableList.of("*llo", "wor*"), + LuceneIndex.tokenize("*llo-wor*", analyzer)); } }
