I’m able to reproduce this problem with a smaller, self-contained example consisting of the two classes below. This suggests a bug, and unless someone has clearer direction on this implementation I’m planning to file it as one.
package org.lexevs.lucene.prototype;

import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.MMapDirectory;

public class SmallTestIndexBuilder {

    public enum Code { C1234, C23432, C4234, C2308, C8958 }

    public void init() {
        try {
            Path path = Paths.get("/Users/m029206/Desktop/index");
            Directory dir = new MMapDirectory(path);
            Analyzer analyzer = new StandardAnalyzer(new CharArraySet(0, true));
            IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
            IndexWriter writer = new IndexWriter(dir, iwc);
            createCodingSchemeIndex(writer);
            writer.commit();
            writer.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    // Two blocks per entity code, one for each coding scheme.
    private void createCodingSchemeIndex(IndexWriter writer) throws IOException {
        for (Code c : Code.values()) {
            writer.addDocuments(createBlockJoin(c.name(), "TestScheme"));
            writer.addDocuments(createBlockJoin(c.name(), "TestSchemeToo"));
        }
    }

    // Builds one block: twelve child documents followed by their parent document.
    private List<Document> createBlockJoin(String code, String schemeName) {
        String[] values = { "Blood", "Mud", "Suds", "coagulant", "hepa", "hematoma",
                "normal", "abnormal", "notfound", "red blood cells", "Blood", "Blood" };
        List<Document> list = new ArrayList<Document>();
        for (String value : values) {
            Document doc = new Document();
            doc.add(new TextField("propertyValue", value, Field.Store.YES));
            list.add(doc);
        }
        Document par = new Document();
        par.add(new TextField("codingSchemeName", schemeName, Field.Store.YES));
        par.add(new TextField("parentDoc", "yes", Field.Store.YES));
        par.add(new TextField("entityCode", code, Field.Store.YES));
        list.add(par);
        return list;
    }

    public static void main(String[] args) {
        new SmallTestIndexBuilder().init();
    }
}


package org.lexevs.lucene.prototype;

import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryWrapperFilter;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.grouping.GroupDocs;
import org.apache.lucene.search.grouping.TopGroups;
import org.apache.lucene.search.join.BitDocIdSetCachingWrapperFilter;
import org.apache.lucene.search.join.BitDocIdSetFilter;
import org.apache.lucene.search.join.ScoreMode;
import org.apache.lucene.search.join.ToParentBlockJoinCollector;
import org.apache.lucene.search.join.ToParentBlockJoinIndexSearcher;
import org.apache.lucene.search.join.ToParentBlockJoinQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.MMapDirectory;

public class BlockJoinTestQuery {

    public void run() {
        Path path = Paths.get("/Users/m029206/Desktop/index");
        try {
            Directory index = new MMapDirectory(path);
            IndexReader reader = DirectoryReader.open(index);
            IndexSearcher searcher = new ToParentBlockJoinIndexSearcher(reader);
            ToParentBlockJoinCollector collector =
                    new ToParentBlockJoinCollector(Sort.RELEVANCE, 2, true, true);

            // Parent filter: matches the parent documents of the TestScheme blocks.
            BitDocIdSetFilter codingScheme = new BitDocIdSetCachingWrapperFilter(
                    new QueryWrapperFilter(new QueryParser("codingSchemeName",
                            new StandardAnalyzer(new CharArraySet(0, true))).parse("TestScheme")));

            // Child query against the propertyValue field.
            Query query = new QueryParser(null, new StandardAnalyzer(new CharArraySet(0, true)))
                    .createBooleanQuery("propertyValue", "Blood", Occur.MUST);

            ToParentBlockJoinQuery termJoinQuery =
                    new ToParentBlockJoinQuery(query, codingScheme, ScoreMode.Avg);

            searcher.search(termJoinQuery, collector);
            TopGroups<Integer> topGroups =
                    collector.getTopGroups(termJoinQuery, null, 0, 10, 0, true);

            for (GroupDocs<Integer> result : topGroups.groups) {
                Document parent = searcher.doc(result.groupValue);
                System.out.println("entityCode: " + parent.get("entityCode"));
            }
        } catch (IOException e) {
            e.printStackTrace();
        } catch (ParseException e) {
            e.printStackTrace();
        }
    }

    public static void main(String[] args) {
        new BlockJoinTestQuery().run();
    }
}
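As an aside, a quick way to check for the missing-parent condition I describe in the 6/23 note below is to walk the stored fields of this small index and confirm that every run of child documents is closed by a document carrying parentDoc=yes. This is only a rough sketch against the test index built above (same path and field names; the ParentBlockChecker class name is just for illustration, and it assumes an index with no deletions), not code from our project:

package org.lexevs.lucene.prototype;

import java.nio.file.Paths;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.store.MMapDirectory;

public class ParentBlockChecker {
    public static void main(String[] args) throws Exception {
        try (DirectoryReader reader = DirectoryReader.open(
                new MMapDirectory(Paths.get("/Users/m029206/Desktop/index")))) {
            int childrenSinceParent = 0;
            for (int i = 0; i < reader.maxDoc(); i++) {
                Document doc = reader.document(i);
                if ("yes".equals(doc.get("parentDoc"))) {
                    // A parent closes the block of children written just before it.
                    System.out.println("parent at doc " + i + " closes a block of "
                            + childrenSinceParent + " children (entityCode=" + doc.get("entityCode") + ")");
                    childrenSinceParent = 0;
                } else {
                    childrenSinceParent++;
                }
            }
            if (childrenSinceParent > 0) {
                System.out.println("WARNING: index ends with " + childrenSinceParent
                        + " child docs that are not followed by a parent");
            }
        }
    }
}

Against the index produced by SmallTestIndexBuilder this should report ten blocks of twelve children each; a trailing warning would point at a block whose parent never made it into the index.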
On 6/23/15, 4:17 PM, "Bauer, Herbert S. (Scott)" <bauer.sc...@mayo.edu> wrote:

>I’m guessing this issue may be related to the SOLR error described here:
>https://issues.apache.org/jira/browse/SOLR-7606. I can find at least one
>group of documents with a missing parent in my generated index. This
>doesn’t explain why I didn’t see a similar issue in 4.10.4. I can see
>that the BitSet implementation isn’t the issue but the filtered bit set
>inside it may be causing the problem given a missing parent. I have to
>say I’m a little concerned about the lack of feedback on this list. Is
>there another forum that is a little more active on this subject or is
>the block join implementation just not used or supported that much?
>
>On 6/22/15, 2:21 PM, "Bauer, Herbert S. (Scott)" <bauer.sc...@mayo.edu>
>wrote:
>
>>Well it’s clear that this is just giving a return value of
>>Integer.MAX_VALUE for the parentDoc. Given the recent changes noted here:
>>https://issues.apache.org/jira/browse/LUCENE-6021 where FixedBitSet now
>>returns Integer.MAX_VALUE instead of -1 I wonder if a bug wasn’t
>>introduced to the BlockJoinScorer.nextDoc method. Unfortunately I have
>>yet to come up with an example to make this fail on a smaller test index.
>>The child document in question does have a parent, which is doc #4823684,
>>so I’m confused as to how the NO_MORE_DOCS value would be applied. Is
>>there something obvious I’m missing here?
>>
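To spell out the LUCENE-6021 point in the 6/22 note just above: as I read that change, the 5.x bit set implementations report "no further set bit" from nextSetBit as DocIdSetIterator.NO_MORE_DOCS (Integer.MAX_VALUE) rather than the old -1, which is exactly the value showing up as the bogus parentDoc. A tiny standalone sketch of that contract, not code from my index (the NextSetBitDemo class name is made up):

import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.FixedBitSet;

public class NextSetBitDemo {
    public static void main(String[] args) {
        FixedBitSet parentBits = new FixedBitSet(8); // bits 0..7, all clear
        parentBits.set(2);                           // pretend doc 2 is the only parent
        // First set bit at or after 0 -> 2
        System.out.println(parentBits.nextSetBit(0));
        // No set bit at or after 3: in 5.x this is DocIdSetIterator.NO_MORE_DOCS
        // (Integer.MAX_VALUE = 2147483647), where the 4.x FixedBitSet returned -1.
        System.out.println(parentBits.nextSetBit(3) == DocIdSetIterator.NO_MORE_DOCS);
    }
}

If that reading is right, a child block whose parent document never made it into the index would naturally drive parentBits.nextSetBit(nextChildDoc) to 2147483647, which matches the symptom in the original post below.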
>>
>>On 6/5/15, 12:05 PM, "Bauer, Herbert S. (Scott)" <bauer.sc...@mayo.edu>
>>wrote:
>>
>>>One correction, it looks like the parentBits call has 4823680 passed to
>>>it to generate the erroneous docId.
>>>
>>>On 6/5/15, 10:34 AM, "Bauer, Herbert S. (Scott)" <bauer.sc...@mayo.edu>
>>>wrote:
>>>
>>>>I should mention that this worked in 4.10.4 using a very similar code
>>>>base. -scott
>>>>
>>>>On 6/4/15, 4:51 PM, "Bauer, Herbert S. (Scott)" <bauer.sc...@mayo.edu>
>>>>wrote:
>>>>
>>>>>I’m working with Lucene 5.1 to try to make use of the relational
>>>>>structure of the block join index and query mechanisms. I’m querying
>>>>>with the following code:
>>>>>
>>>>>IndexReader reader = DirectoryReader.open(index);
>>>>>
>>>>>ToParentBlockJoinIndexSearcher searcher =
>>>>>    new ToParentBlockJoinIndexSearcher(reader);
>>>>>
>>>>>ToParentBlockJoinCollector collector =
>>>>>    new ToParentBlockJoinCollector(Sort.RELEVANCE, 2, true, true);
>>>>>
>>>>>BitDocIdSetFilter codingScheme = new BitDocIdSetCachingWrapperFilter(
>>>>>    new QueryWrapperFilter(new QueryParser("codingSchemeName",
>>>>>        new StandardAnalyzer(new CharArraySet(0, true)))
>>>>>            .parse(scheme.getCodingSchemeName())));
>>>>>
>>>>>Query query = new QueryParser(null,
>>>>>    new StandardAnalyzer(new CharArraySet(0, true)))
>>>>>        .createBooleanQuery("propertyValue", term.getTerm(), Occur.MUST);
>>>>>
>>>>>ToParentBlockJoinQuery termJoinQuery = new ToParentBlockJoinQuery(
>>>>>    query, codingScheme, ScoreMode.Avg);
>>>>>
>>>>>searcher.search(termJoinQuery, collector);
>>>>>
>>>>>To try to get parent values, but it fails on the final line with the
>>>>>following stack trace:
>>>>>
>>>>>Exception in thread "main" java.lang.IllegalStateException: child query
>>>>>must only match non-parent docs, but parent docID=2147483647 matched
>>>>>childScorer=class org.apache.lucene.search.TermScorer
>>>>>  at org.apache.lucene.search.join.ToParentBlockJoinQuery$BlockJoinScorer.nextDoc(ToParentBlockJoinQuery.java:330)
>>>>>  at org.apache.lucene.search.join.ToParentBlockJoinIndexSearcher.search(ToParentBlockJoinIndexSearcher.java:63)
>>>>>  at org.apache.lucene.search.IndexSearcher.search(IndexSearcher.java:428)
>>>>>  at org.lexevs.lucene.prototype.LuceneQueryTrial.luceneToParentJoinQuery(LuceneQueryTrial.java:78)
>>>>>  at org.lexevs.lucene.prototype.LuceneQueryTrial.main(LuceneQueryTrial.java:327)
>>>>>
>>>>>I build indexes up to about 36Gb using code similar to the following:
>>>>>
>>>>>List<Document> list = new ArrayList<Document>();
>>>>>//need a static
>>>>>int staticCount = count;
>>>>>ParentDocObject parent = builder.generateParentDoc(cs.getCodingSchemeName(),
>>>>>    cs.getVersion(), cs.getURI(), "description");
>>>>>if (cs.codingSchemeName.equals(CodingScheme.THESSCHEME.codingSchemeName)) {
>>>>>  //One per coding Scheme
>>>>>  int numberOfProperties = 12;
>>>>>  if (!thesExactMatchDone) {
>>>>>    ChildDocObject child1 = builder.generateChildDocWithSalt(parent, SearchTerms.BLOOD.getTerm());
>>>>>    Document doc1 = builder.mapToDocumentExactMatch(child1);
>>>>>    list.add(doc1);
>>>>>    count++;
>>>>>    numberOfProperties--;
>>>>>    ChildDocObject child = builder.generateChildDocWithSalt(parent, SearchTerms.CHAR.term);
>>>>>    Document doc = builder.mapToDocumentExactMatch(child);
>>>>>    count++;
>>>>>    list.add(doc);
>>>>>    numberOfProperties--;
>>>>>    thesExactMatchDone = true;
>>>>>  }
>>>>>  while (numberOfProperties > 0) {
>>>>>    if (count % 547 == 0) {
>>>>>      ChildDocObject child = builder.generateChildDocWithSalt(parent,
>>>>>          builder.randomTextGenerator(builder.randomNumberGenerator(), SearchTerms.BLOOD.getTerm()));
>>>>>      Document doc = builder.mapToDocument(child);
>>>>>      list.add(doc);
>>>>>      count++; numberOfProperties--;
>>>>>    } else if (count % 233 == 0) {
>>>>>      ChildDocObject child = builder.generateChildDocWithSalt(parent,
>>>>>          builder.randomTextGenerator(builder.randomNumberGenerator(), SearchTerms.CHAR.getTerm()));
>>>>>      Document doc = builder.mapToDocument(child);
>>>>>      list.add(doc);
>>>>>      count++; numberOfProperties--;
>>>>>    } else if (count % 71 == 0) {
>>>>>      ChildDocObject child = builder.generateChildDocWithSalt(parent,
>>>>>          builder.randomTextGenerator(builder.randomNumberGenerator(), SearchTerms.ARTICLE.getTerm()));
>>>>>      Document doc = builder.mapToDocument(child);
>>>>>      list.add(doc);
>>>>>      count++; numberOfProperties--;
>>>>>    } else if (count % 2237 == 0) {
>>>>>      ChildDocObject child = builder.generateChildDocWithSalt(parent,
>>>>>          builder.randomTextGenerator(builder.randomNumberGenerator(), SearchTerms.LUNG_CANCER.getTerm()));
>>>>>      Document doc = builder.mapToDocument(child);
>>>>>      list.add(doc);
>>>>>      count++; numberOfProperties--;
>>>>>    } else if (count % 5077 == 0) {
>>>>>      ChildDocObject child = builder.generateChildDocWithSalt(parent,
>>>>>          builder.randomTextGenerator(builder.randomNumberGenerator(), SearchTerms.LIVER_CARCINOMA.getTerm()));
>>>>>      Document doc = builder.mapToDocument(child);
>>>>>      list.add(doc);
>>>>>      count++; numberOfProperties--;
>>>>>    } else if (count % 2371 == 0) {
>>>>>      ChildDocObject child = builder.generateChildDocWithSalt(parent,
>>>>>          builder.randomTextGeneratorStartsWith(builder.randomNumberGenerator(), SearchTerms.BLOOD.getTerm()));
>>>>>      Document doc = builder.mapToDocumentExactMatch(child);
>>>>>      list.add(doc);
>>>>>      count++; numberOfProperties--;
>>>>>    } else if (count % 79 == 0) {
>>>>>      ChildDocObject child = builder.generateChildDocWithSalt(parent,
>>>>>          builder.randomTextGeneratorStartsWith(builder.randomNumberGenerator(), SearchTerms.ARTICLE.getTerm()));
>>>>>      Document doc = builder.mapToDocumentExactMatch(child);
>>>>>      list.add(doc);
>>>>>      count++; numberOfProperties--;
>>>>>    } else if (count % 3581 == 0) {
>>>>>      ChildDocObject child = builder.generateChildDocWithSalt(parent,
>>>>>          builder.randomTextGeneratorStartsWith(builder.randomNumberGenerator(), SearchTerms.LUNG_CANCER.getTerm()));
>>>>>      Document doc = builder.mapToDocumentExactMatch(child);
>>>>>      list.add(doc);
>>>>>      count++; numberOfProperties--;
>>>>>    } else if (count % 23 == 0) {
>>>>>      ChildDocObject child = builder.generateChildDocWithSalt(parent,
>>>>>          builder.randomTextGeneratorStartsWith(builder.randomNumberGenerator(), SearchTerms.CHAR.getTerm()));
>>>>>      Document doc = builder.mapToDocumentExactMatch(child);
>>>>>      list.add(doc);
>>>>>      count++; numberOfProperties--;
>>>>>    } else {
>>>>>      ChildDocObject child = builder.generateChildDoc(parent);
>>>>>      Document doc = builder.mapToDocument(child);
>>>>>      list.add(doc);
>>>>>      count++;
>>>>>      numberOfProperties--;
>>>>>    }
>>>>>  }
>>>>>}
>>>>>
>>>>>Document par = builder.mapToDocument(parent);
>>>>>list.add(par);
>>>>>writer.addDocuments(list);
>>>>>}
>>>>>
>>>>>Which works pretty well until I scale it up using several instances of
>>>>>this.
>>>>>When the nextChildDoc document retrieved gets to id 5874902 the line
>>>>>in ToParentBlockJoinQuery
>>>>>
>>>>>    parentDoc = parentBits.nextSetBit(nextChildDoc);
>>>>>
>>>>>gives the value 2147483647 to the parentDoc, which is not a document id
>>>>>in my index if I understand Lucene and Luke correctly, since my index
>>>>>has only 42716877 documents.
>>>>>
>>>>>Can someone shed some light on this exception?
>>>>>
>>>>>Thanks,
>>>>>
>>>>>Scott Bauer
>>>>>
>>>>
>>>>---------------------------------------------------------------------
>>>>To unsubscribe, e-mail: java-user-unsubscr...@lucene.apache.org
>>>>For additional commands, e-mail: java-user-h...@lucene.apache.org
>>>>
>>>
>>
>