I’m able to reproduce this problem with a smaller, self-contained example consisting of the two classes below. This suggests a bug, and unless someone has clearer direction on this implementation I’m planning to file it as one.
package org.lexevs.lucene.prototype;

import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.MMapDirectory;

public class SmallTestIndexBuilder {

    public enum Code { C1234, C23432, C4234, C2308, C8958 }

    public void init() {
        try {
            Path path = Paths.get("/Users/m029206/Desktop/index");
            Directory dir = new MMapDirectory(path);
            Analyzer analyzer = new StandardAnalyzer(new CharArraySet(0, true));
            IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
            IndexWriter writer = new IndexWriter(dir, iwc);
            createCodingSchemeIndex(writer);
            writer.commit();
            writer.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    // Two blocks per entity code, one for each coding scheme.
    private void createCodingSchemeIndex(IndexWriter writer) throws IOException {
        for (Code c : Code.values()) {
            writer.addDocuments(createBlockJoin(c.name(), "TestScheme"));
            writer.addDocuments(createBlockJoin(c.name(), "TestSchemeToo"));
        }
    }

    // Builds one block: twelve child documents followed by their parent document.
    private List<Document> createBlockJoin(String code, String schemeName) {
        String[] values = { "Blood", "Mud", "Suds", "coagulant", "hepa", "hematoma",
                "normal", "abnormal", "notfound", "red blood cells", "Blood", "Blood" };
        List<Document> list = new ArrayList<Document>();
        for (String value : values) {
            Document doc = new Document();
            doc.add(new TextField("propertyValue", value, Field.Store.YES));
            list.add(doc);
        }
        Document par = new Document();
        par.add(new TextField("codingSchemeName", schemeName, Field.Store.YES));
        par.add(new TextField("parentDoc", "yes", Field.Store.YES));
        par.add(new TextField("entityCode", code, Field.Store.YES));
        list.add(par);
        return list;
    }

    public static void main(String[] args) {
        new SmallTestIndexBuilder().init();
    }
}


package org.lexevs.lucene.prototype;

import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryWrapperFilter;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.grouping.GroupDocs;
import org.apache.lucene.search.grouping.TopGroups;
import org.apache.lucene.search.join.BitDocIdSetCachingWrapperFilter;
import org.apache.lucene.search.join.BitDocIdSetFilter;
import org.apache.lucene.search.join.ScoreMode;
import org.apache.lucene.search.join.ToParentBlockJoinCollector;
import org.apache.lucene.search.join.ToParentBlockJoinIndexSearcher;
import org.apache.lucene.search.join.ToParentBlockJoinQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.MMapDirectory;

public class BlockJoinTestQuery {

    public void run() {
        Path path = Paths.get("/Users/m029206/Desktop/index");
        try {
            Directory index = new MMapDirectory(path);
            IndexReader reader = DirectoryReader.open(index);
            IndexSearcher searcher = new ToParentBlockJoinIndexSearcher(reader);
            ToParentBlockJoinCollector collector =
                    new ToParentBlockJoinCollector(Sort.RELEVANCE, 2, true, true);

            // Parent filter: matches the parent documents of the TestScheme blocks.
            BitDocIdSetFilter codingScheme = new BitDocIdSetCachingWrapperFilter(
                    new QueryWrapperFilter(new QueryParser("codingSchemeName",
                            new StandardAnalyzer(new CharArraySet(0, true))).parse("TestScheme")));

            // Child query against the propertyValue field.
            Query query = new QueryParser(null, new StandardAnalyzer(new CharArraySet(0, true)))
                    .createBooleanQuery("propertyValue", "Blood", Occur.MUST);

            ToParentBlockJoinQuery termJoinQuery =
                    new ToParentBlockJoinQuery(query, codingScheme, ScoreMode.Avg);

            searcher.search(termJoinQuery, collector);
            TopGroups<Integer> topGroups =
                    collector.getTopGroups(termJoinQuery, null, 0, 10, 0, true);

            for (GroupDocs<Integer> result : topGroups.groups) {
                Document parent = searcher.doc(result.groupValue);
                System.out.println("entityCode: " + parent.get("entityCode"));
            }
        } catch (IOException e) {
            e.printStackTrace();
        } catch (ParseException e) {
            e.printStackTrace();
        }
    }

    public static void main(String[] args) {
        new BlockJoinTestQuery().run();
    }
}
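As an aside, a quick way to check for the missing-parent condition I describe in the 6/23 note below is to walk the stored fields of this small index and confirm that every run of child documents is closed by a document carrying parentDoc=yes. This is only a rough sketch against the test index built above (same path and field names; the ParentBlockChecker class name is just for illustration, and it assumes an index with no deletions), not code from our project:

package org.lexevs.lucene.prototype;

import java.nio.file.Paths;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.store.MMapDirectory;

public class ParentBlockChecker {
    public static void main(String[] args) throws Exception {
        try (DirectoryReader reader = DirectoryReader.open(
                new MMapDirectory(Paths.get("/Users/m029206/Desktop/index")))) {
            int childrenSinceParent = 0;
            for (int i = 0; i < reader.maxDoc(); i++) {
                Document doc = reader.document(i);
                if ("yes".equals(doc.get("parentDoc"))) {
                    // A parent closes the block of children written just before it.
                    System.out.println("parent at doc " + i + " closes a block of "
                            + childrenSinceParent + " children (entityCode=" + doc.get("entityCode") + ")");
                    childrenSinceParent = 0;
                } else {
                    childrenSinceParent++;
                }
            }
            if (childrenSinceParent > 0) {
                System.out.println("WARNING: index ends with " + childrenSinceParent
                        + " child docs that are not followed by a parent");
            }
        }
    }
}

Against the index produced by SmallTestIndexBuilder this should report ten blocks of twelve children each; a trailing warning would point at a block whose parent never made it into the index.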
On 6/23/15, 4:17 PM, "Bauer, Herbert S. (Scott)" <bauer.sc...@mayo.edu> wrote:

>I’m guessing this issue may be related to the SOLR error described here:
>https://issues.apache.org/jira/browse/SOLR-7606. I can find at least one
>group of documents with a missing parent in my generated index. This
>doesn’t explain why I didn’t see a similar issue in 4.10.4. I can see
>that the BitSet implementation isn’t the issue but the filtered bit set
>inside it may be causing the problem given a missing parent. I have to
>say I’m a little concerned about the lack of feedback on this list. Is
>there another forum that is a little more active on this subject or is
>the block join implementation just not used or supported that much?
>
>On 6/22/15, 2:21 PM, "Bauer, Herbert S. (Scott)" <bauer.sc...@mayo.edu>
>wrote:
>
>>Well it’s clear that this is just giving a return value of
>>Integer.MAX_VALUE for the parentDoc. Given the recent changes noted here:
>>https://issues.apache.org/jira/browse/LUCENE-6021 where FixedBitSet now
>>returns Integer.MAX_VALUE instead of -1 I wonder if a bug wasn’t
>>introduced to the BlockJoinScorer.nextDoc method. Unfortunately I have
>>yet to come up with an example to make this fail on a smaller test index.
>>The child document in question does have a parent, which is doc #4823684,
>>so I’m confused as to how the NO_MORE_DOCS value would be applied. Is
>>there something obvious I’m missing here?
>>
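To spell out the LUCENE-6021 point in the 6/22 note just above: as I read that change, the 5.x bit set implementations report "no further set bit" from nextSetBit as DocIdSetIterator.NO_MORE_DOCS (Integer.MAX_VALUE) rather than the old -1, which is exactly the value showing up as the bogus parentDoc. A tiny standalone sketch of that contract, not code from my index (the NextSetBitDemo class name is made up):

import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.FixedBitSet;

public class NextSetBitDemo {
    public static void main(String[] args) {
        FixedBitSet parentBits = new FixedBitSet(8); // bits 0..7, all clear
        parentBits.set(2);                           // pretend doc 2 is the only parent
        // First set bit at or after 0 -> 2
        System.out.println(parentBits.nextSetBit(0));
        // No set bit at or after 3: in 5.x this is DocIdSetIterator.NO_MORE_DOCS
        // (Integer.MAX_VALUE = 2147483647), where the 4.x FixedBitSet returned -1.
        System.out.println(parentBits.nextSetBit(3) == DocIdSetIterator.NO_MORE_DOCS);
    }
}

If that reading is right, a child block whose parent document never made it into the index would naturally drive parentBits.nextSetBit(nextChildDoc) to 2147483647, which matches the symptom in the original post below.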
>>
>>On 6/5/15, 12:05 PM, "Bauer, Herbert S. (Scott)" <bauer.sc...@mayo.edu>
>>wrote:
>>
>>>One correction, it looks like the parentBits call has 4823680 passed to
>>>it to generate the erroneous docId.
>>>
>>>On 6/5/15, 10:34 AM, "Bauer, Herbert S. (Scott)" <bauer.sc...@mayo.edu>
>>>wrote:
>>>
>>>>I should mention that this worked in 4.10.4 using a very similar code
>>>>base. -scott
>>>>
>>>>On 6/4/15, 4:51 PM, "Bauer, Herbert S. (Scott)" <bauer.sc...@mayo.edu>
>>>>wrote:
>>>>
>>>>>I’m working with Lucene 5.1 to try to make use of the relational
>>>>>structure of the block join index and query mechanisms. I’m querying
>>>>>with the following code:
>>>>>
>>>>>IndexReader reader = DirectoryReader.open(index);
>>>>>
>>>>>ToParentBlockJoinIndexSearcher searcher =
>>>>>    new ToParentBlockJoinIndexSearcher(reader);
>>>>>
>>>>>ToParentBlockJoinCollector collector =
>>>>>    new ToParentBlockJoinCollector(Sort.RELEVANCE, 2, true, true);
>>>>>
>>>>>BitDocIdSetFilter codingScheme = new BitDocIdSetCachingWrapperFilter(
>>>>>    new QueryWrapperFilter(new QueryParser("codingSchemeName",
>>>>>        new StandardAnalyzer(new CharArraySet(0, true)))
>>>>>            .parse(scheme.getCodingSchemeName())));
>>>>>
>>>>>Query query = new QueryParser(null,
>>>>>    new StandardAnalyzer(new CharArraySet(0, true)))
>>>>>        .createBooleanQuery("propertyValue", term.getTerm(), Occur.MUST);
>>>>>
>>>>>ToParentBlockJoinQuery termJoinQuery = new ToParentBlockJoinQuery(
>>>>>    query, codingScheme, ScoreMode.Avg);
>>>>>
>>>>>searcher.search(termJoinQuery, collector);
>>>>>
>>>>>To try to get parent values, but it fails on the final line with the
>>>>>following stack trace:
>>>>>
>>>>>Exception in thread "main" java.lang.IllegalStateException: child query
>>>>>must only match non-parent docs, but parent docID=2147483647 matched
>>>>>childScorer=class org.apache.lucene.search.TermScorer
>>>>>  at org.apache.lucene.search.join.ToParentBlockJoinQuery$BlockJoinScorer.nextDoc(ToParentBlockJoinQuery.java:330)
>>>>>  at org.apache.lucene.search.join.ToParentBlockJoinIndexSearcher.search(ToParentBlockJoinIndexSearcher.java:63)
>>>>>  at org.apache.lucene.search.IndexSearcher.search(IndexSearcher.java:428)
>>>>>  at org.lexevs.lucene.prototype.LuceneQueryTrial.luceneToParentJoinQuery(LuceneQueryTrial.java:78)
>>>>>  at org.lexevs.lucene.prototype.LuceneQueryTrial.main(LuceneQueryTrial.java:327)
>>>>>
>>>>>I build indexes up to about 36Gb using code similar to the following:
>>>>>
>>>>>List<Document> list = new ArrayList<Document>();
>>>>>//need a static
>>>>>int staticCount = count;
>>>>>ParentDocObject parent = builder.generateParentDoc(cs.getCodingSchemeName(),
>>>>>    cs.getVersion(), cs.getURI(), "description");
>>>>>if (cs.codingSchemeName.equals(CodingScheme.THESSCHEME.codingSchemeName)) {
>>>>>  //One per coding Scheme
>>>>>  int numberOfProperties = 12;
>>>>>  if (!thesExactMatchDone) {
>>>>>    ChildDocObject child1 = builder.generateChildDocWithSalt(parent, SearchTerms.BLOOD.getTerm());
>>>>>    Document doc1 = builder.mapToDocumentExactMatch(child1);
>>>>>    list.add(doc1);
>>>>>    count++;
>>>>>    numberOfProperties--;
>>>>>    ChildDocObject child = builder.generateChildDocWithSalt(parent, SearchTerms.CHAR.term);
>>>>>    Document doc = builder.mapToDocumentExactMatch(child);
>>>>>    count++;
>>>>>    list.add(doc);
>>>>>    numberOfProperties--;
>>>>>    thesExactMatchDone = true;
>>>>>  }
>>>>>  while (numberOfProperties > 0) {
>>>>>    if (count % 547 == 0) {
>>>>>      ChildDocObject child = builder.generateChildDocWithSalt(parent,
>>>>>          builder.randomTextGenerator(builder.randomNumberGenerator(), SearchTerms.BLOOD.getTerm()));
>>>>>      Document doc = builder.mapToDocument(child);
>>>>>      list.add(doc);
>>>>>      count++; numberOfProperties--;
>>>>>    } else if (count % 233 == 0) {
>>>>>      ChildDocObject child = builder.generateChildDocWithSalt(parent,
>>>>>          builder.randomTextGenerator(builder.randomNumberGenerator(), SearchTerms.CHAR.getTerm()));
>>>>>      Document doc = builder.mapToDocument(child);
>>>>>      list.add(doc);
>>>>>      count++; numberOfProperties--;
>>>>>    } else if (count % 71 == 0) {
>>>>>      ChildDocObject child = builder.generateChildDocWithSalt(parent,
>>>>>          builder.randomTextGenerator(builder.randomNumberGenerator(), SearchTerms.ARTICLE.getTerm()));
>>>>>      Document doc = builder.mapToDocument(child);
>>>>>      list.add(doc);
>>>>>      count++; numberOfProperties--;
>>>>>    } else if (count % 2237 == 0) {
>>>>>      ChildDocObject child = builder.generateChildDocWithSalt(parent,
>>>>>          builder.randomTextGenerator(builder.randomNumberGenerator(), SearchTerms.LUNG_CANCER.getTerm()));
>>>>>      Document doc = builder.mapToDocument(child);
>>>>>      list.add(doc);
>>>>>      count++; numberOfProperties--;
>>>>>    } else if (count % 5077 == 0) {
>>>>>      ChildDocObject child = builder.generateChildDocWithSalt(parent,
>>>>>          builder.randomTextGenerator(builder.randomNumberGenerator(), SearchTerms.LIVER_CARCINOMA.getTerm()));
>>>>>      Document doc = builder.mapToDocument(child);
>>>>>      list.add(doc);
>>>>>      count++; numberOfProperties--;
>>>>>    } else if (count % 2371 == 0) {
>>>>>      ChildDocObject child = builder.generateChildDocWithSalt(parent,
>>>>>          builder.randomTextGeneratorStartsWith(builder.randomNumberGenerator(), SearchTerms.BLOOD.getTerm()));
>>>>>      Document doc = builder.mapToDocumentExactMatch(child);
>>>>>      list.add(doc);
>>>>>      count++; numberOfProperties--;
>>>>>    } else if (count % 79 == 0) {
>>>>>      ChildDocObject child = builder.generateChildDocWithSalt(parent,
>>>>>          builder.randomTextGeneratorStartsWith(builder.randomNumberGenerator(), SearchTerms.ARTICLE.getTerm()));
>>>>>      Document doc = builder.mapToDocumentExactMatch(child);
>>>>>      list.add(doc);
>>>>>      count++; numberOfProperties--;
>>>>>    } else if (count % 3581 == 0) {
>>>>>      ChildDocObject child = builder.generateChildDocWithSalt(parent,
>>>>>          builder.randomTextGeneratorStartsWith(builder.randomNumberGenerator(), SearchTerms.LUNG_CANCER.getTerm()));
>>>>>      Document doc = builder.mapToDocumentExactMatch(child);
>>>>>      list.add(doc);
>>>>>      count++; numberOfProperties--;
>>>>>    } else if (count % 23 == 0) {
>>>>>      ChildDocObject child = builder.generateChildDocWithSalt(parent,
>>>>>          builder.randomTextGeneratorStartsWith(builder.randomNumberGenerator(), SearchTerms.CHAR.getTerm()));
>>>>>      Document doc = builder.mapToDocumentExactMatch(child);
>>>>>      list.add(doc);
>>>>>      count++; numberOfProperties--;
>>>>>    } else {
>>>>>      ChildDocObject child = builder.generateChildDoc(parent);
>>>>>      Document doc = builder.mapToDocument(child);
>>>>>      list.add(doc);
>>>>>      count++;
>>>>>      numberOfProperties--;
>>>>>    }
>>>>>  }
>>>>>}
>>>>>
>>>>>Document par = builder.mapToDocument(parent);
>>>>>list.add(par);
>>>>>writer.addDocuments(list);
>>>>>}
>>>>>
>>>>>Which works pretty well until I scale it up using several instances of
>>>>>this.
>>>>>When the nextChildDoc document retrieved gets to id 5874902 the line
>>>>>in ToParentBlockJoinQuery
>>>>>
>>>>>    parentDoc = parentBits.nextSetBit(nextChildDoc);
>>>>>
>>>>>gives the value 2147483647 to the parentDoc, which is not a document id
>>>>>in my index if I understand Lucene and Luke correctly, since my index
>>>>>has only 42716877 documents.
>>>>>
>>>>>Can someone shed some light on this exception?
>>>>>
>>>>>Thanks,
>>>>>
>>>>>Scott Bauer
>>>>>
>>>>
>>>>---------------------------------------------------------------------
>>>>To unsubscribe, e-mail: java-user-unsubscr...@lucene.apache.org
>>>>For additional commands, e-mail: java-user-h...@lucene.apache.org
>>>>
>>>
>>
>