jpountz commented on code in PR #972:
URL: https://github.com/apache/lucene/pull/972#discussion_r908131031


##########
lucene/core/src/java/org/apache/lucene/search/BlockMaxMaxscoreScorer.java:
##########
@@ -0,0 +1,314 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Comparator;
+import java.util.LinkedList;
+import java.util.List;
+
+/** Scorer implementing Block-Max Maxscore algorithm */
+public class BlockMaxMaxscoreScorer extends Scorer {
+  private final ScoreMode scoreMode;
+
+  // current doc ID of the leads
+  private int doc;
+
+  // doc id boundary that all scorers maxScore are valid
+  private int upTo = -1;
+
+  // heap of scorers ordered by doc ID
+  private final DisiPriorityQueue essentialsScorers;
+  // list of scorers ordered by maxScore
+  private final LinkedList<DisiWrapper> maxScoreSortedEssentialScorers;
+
+  private final DisiWrapper[] allScorers;
+
+  // sum of max scores of scorers in nonEssentialScorers list
+  private float nonEssentialMaxScoreSum;
+
+  private long cost;
+
+  private final MaxScoreSumPropagator maxScoreSumPropagator;
+
+  // scaled min competitive score
+  private float minCompetitiveScore = 0;
+
+  /**
+   * Constructs a Scorer
+   *
+   * @param weight The weight to be used.
+   * @param scorers The sub scorers this Scorer should iterate on for optional 
clauses
+   * @param scoreMode The scoreMode
+   */
+  public BlockMaxMaxscoreScorer(Weight weight, List<Scorer> scorers, ScoreMode 
scoreMode)
+      throws IOException {
+    super(weight);
+    assert scoreMode == ScoreMode.TOP_SCORES;
+
+    this.scoreMode = scoreMode;
+    this.doc = -1;
+
+    this.allScorers = new DisiWrapper[scorers.size()];
+    int i = 0;
+    this.essentialsScorers = new DisiPriorityQueue(scorers.size());
+    this.maxScoreSortedEssentialScorers = new LinkedList<>();
+
+    long cost = 0;
+    for (Scorer scorer : scorers) {
+      DisiWrapper w = new DisiWrapper(scorer);
+      cost += w.cost;
+      allScorers[i++] = w;
+    }
+
+    this.cost = cost;
+    maxScoreSumPropagator = new MaxScoreSumPropagator(scorers);
+  }
+
+  @Override
+  public DocIdSetIterator iterator() {
+    // twoPhaseIterator needed to honor scorer.setMinCompetitiveScore guarantee
+    return TwoPhaseIterator.asDocIdSetIterator(twoPhaseIterator());
+  }
+
+  @Override
+  public TwoPhaseIterator twoPhaseIterator() {
+    DocIdSetIterator approximation =
+        new DocIdSetIterator() {
+
+          @Override
+          public int docID() {
+            return doc;
+          }
+
+          @Override
+          public int nextDoc() throws IOException {
+            return advance(doc + 1);
+          }
+
+          @Override
+          public int advance(int target) throws IOException {
+            while (true) {
+
+              if (target > upTo) {
+                updateMaxScoresAndLists(target);
+              } else {
+                // minCompetitiveScore might have increased,
+                // move potentially no-longer-competitive scorers from 
essential to non-essential
+                // list
+                movePotentiallyNonCompetitiveScorers();
+              }
+
+              assert target <= upTo;
+
+              DisiWrapper top = essentialsScorers.top();
+
+              if (top == null) {
+                // all scorers in non-essential list, skip to next boundary or 
return no_more_docs
+                if (upTo == NO_MORE_DOCS) {
+                  return doc = NO_MORE_DOCS;
+                } else {
+                  target = upTo + 1;
+                }
+              } else {
+                // position all scorers in essential list to on or after target
+                while (top.doc < target) {
+                  top.doc = top.iterator.advance(target);
+                  top = essentialsScorers.updateTop();
+                }
+
+                if (top.doc == NO_MORE_DOCS) {
+                  return doc = NO_MORE_DOCS;
+                } else if (top.doc > upTo) {
+                  target = upTo + 1;
+                } else {
+                  float matchedMaxScoreSum = nonEssentialMaxScoreSum;

Review Comment:
   Other boolean queries sum up scores into a `double` to reduce the impact of 
rounding on accuracy, though this only helps when there are 3 clauses or more, 
otherwise summing up into a double doesn't yield better accuracy. Can you 
either add a comment that this would need to be changed if we were to support 
more than two clauses with this scorer, or replace it with a double?



##########
lucene/core/src/java/org/apache/lucene/search/BlockMaxMaxscoreScorer.java:
##########
@@ -0,0 +1,314 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Comparator;
+import java.util.LinkedList;
+import java.util.List;
+
+/** Scorer implementing Block-Max Maxscore algorithm */
+public class BlockMaxMaxscoreScorer extends Scorer {
+  private final ScoreMode scoreMode;
+
+  // current doc ID of the leads
+  private int doc;
+
+  // doc id boundary that all scorers maxScore are valid
+  private int upTo = -1;
+
+  // heap of scorers ordered by doc ID
+  private final DisiPriorityQueue essentialsScorers;
+  // list of scorers ordered by maxScore
+  private final LinkedList<DisiWrapper> maxScoreSortedEssentialScorers;
+
+  private final DisiWrapper[] allScorers;
+
+  // sum of max scores of scorers in nonEssentialScorers list
+  private float nonEssentialMaxScoreSum;
+
+  private long cost;
+
+  private final MaxScoreSumPropagator maxScoreSumPropagator;
+
+  // scaled min competitive score
+  private float minCompetitiveScore = 0;
+
+  /**
+   * Constructs a Scorer
+   *
+   * @param weight The weight to be used.
+   * @param scorers The sub scorers this Scorer should iterate on for optional 
clauses
+   * @param scoreMode The scoreMode
+   */
+  public BlockMaxMaxscoreScorer(Weight weight, List<Scorer> scorers, ScoreMode 
scoreMode)
+      throws IOException {
+    super(weight);
+    assert scoreMode == ScoreMode.TOP_SCORES;
+
+    this.scoreMode = scoreMode;
+    this.doc = -1;
+
+    this.allScorers = new DisiWrapper[scorers.size()];
+    int i = 0;
+    this.essentialsScorers = new DisiPriorityQueue(scorers.size());
+    this.maxScoreSortedEssentialScorers = new LinkedList<>();
+
+    long cost = 0;
+    for (Scorer scorer : scorers) {
+      DisiWrapper w = new DisiWrapper(scorer);
+      cost += w.cost;
+      allScorers[i++] = w;
+    }
+
+    this.cost = cost;
+    maxScoreSumPropagator = new MaxScoreSumPropagator(scorers);
+  }
+
+  @Override
+  public DocIdSetIterator iterator() {
+    // twoPhaseIterator needed to honor scorer.setMinCompetitiveScore guarantee
+    return TwoPhaseIterator.asDocIdSetIterator(twoPhaseIterator());
+  }
+
+  @Override
+  public TwoPhaseIterator twoPhaseIterator() {
+    DocIdSetIterator approximation =
+        new DocIdSetIterator() {
+
+          @Override
+          public int docID() {
+            return doc;
+          }
+
+          @Override
+          public int nextDoc() throws IOException {
+            return advance(doc + 1);
+          }
+
+          @Override
+          public int advance(int target) throws IOException {
+            while (true) {
+
+              if (target > upTo) {
+                updateMaxScoresAndLists(target);
+              } else {
+                // minCompetitiveScore might have increased,
+                // move potentially no-longer-competitive scorers from 
essential to non-essential
+                // list
+                movePotentiallyNonCompetitiveScorers();
+              }
+
+              assert target <= upTo;
+
+              DisiWrapper top = essentialsScorers.top();
+
+              if (top == null) {
+                // all scorers in non-essential list, skip to next boundary or 
return no_more_docs
+                if (upTo == NO_MORE_DOCS) {
+                  return doc = NO_MORE_DOCS;
+                } else {
+                  target = upTo + 1;
+                }
+              } else {
+                // position all scorers in essential list to on or after target
+                while (top.doc < target) {
+                  top.doc = top.iterator.advance(target);
+                  top = essentialsScorers.updateTop();
+                }
+
+                if (top.doc == NO_MORE_DOCS) {
+                  return doc = NO_MORE_DOCS;
+                } else if (top.doc > upTo) {
+                  target = upTo + 1;
+                } else {
+                  float matchedMaxScoreSum = nonEssentialMaxScoreSum;
+
+                  for (DisiWrapper w = essentialsScorers.topList(); w != null; 
w = w.next) {
+                    matchedMaxScoreSum += w.scorer.score();
+                  }
+
+                  if (matchedMaxScoreSum < minCompetitiveScore) {

Review Comment:
   This doesn't look correct to me. The overall score could still be above the 
`minCompetitiveScore` after summing up scores of the non-essential clauses? 
Should it be something like below:
   
   ```java
   if (matchedMaxScoreSum + nonEssentialMaxScoreSum < minCompetitiveScore) {
   ```



##########
lucene/core/src/java/org/apache/lucene/search/BlockMaxMaxscoreScorer.java:
##########
@@ -0,0 +1,314 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Comparator;
+import java.util.LinkedList;
+import java.util.List;
+
+/** Scorer implementing Block-Max Maxscore algorithm */
+public class BlockMaxMaxscoreScorer extends Scorer {
+  private final ScoreMode scoreMode;
+
+  // current doc ID of the leads
+  private int doc;
+
+  // doc id boundary that all scorers maxScore are valid
+  private int upTo = -1;
+
+  // heap of scorers ordered by doc ID
+  private final DisiPriorityQueue essentialsScorers;
+  // list of scorers ordered by maxScore
+  private final LinkedList<DisiWrapper> maxScoreSortedEssentialScorers;
+
+  private final DisiWrapper[] allScorers;
+
+  // sum of max scores of scorers in nonEssentialScorers list
+  private float nonEssentialMaxScoreSum;
+
+  private long cost;
+
+  private final MaxScoreSumPropagator maxScoreSumPropagator;
+
+  // scaled min competitive score
+  private float minCompetitiveScore = 0;
+
+  /**
+   * Constructs a Scorer
+   *
+   * @param weight The weight to be used.
+   * @param scorers The sub scorers this Scorer should iterate on for optional 
clauses
+   * @param scoreMode The scoreMode
+   */
+  public BlockMaxMaxscoreScorer(Weight weight, List<Scorer> scorers, ScoreMode 
scoreMode)
+      throws IOException {
+    super(weight);
+    assert scoreMode == ScoreMode.TOP_SCORES;
+
+    this.scoreMode = scoreMode;
+    this.doc = -1;
+
+    this.allScorers = new DisiWrapper[scorers.size()];
+    int i = 0;
+    this.essentialsScorers = new DisiPriorityQueue(scorers.size());
+    this.maxScoreSortedEssentialScorers = new LinkedList<>();
+
+    long cost = 0;
+    for (Scorer scorer : scorers) {
+      DisiWrapper w = new DisiWrapper(scorer);
+      cost += w.cost;
+      allScorers[i++] = w;
+    }
+
+    this.cost = cost;
+    maxScoreSumPropagator = new MaxScoreSumPropagator(scorers);
+  }
+
+  @Override
+  public DocIdSetIterator iterator() {
+    // twoPhaseIterator needed to honor scorer.setMinCompetitiveScore guarantee
+    return TwoPhaseIterator.asDocIdSetIterator(twoPhaseIterator());
+  }
+
+  @Override
+  public TwoPhaseIterator twoPhaseIterator() {
+    DocIdSetIterator approximation =
+        new DocIdSetIterator() {
+
+          @Override
+          public int docID() {
+            return doc;
+          }
+
+          @Override
+          public int nextDoc() throws IOException {
+            return advance(doc + 1);
+          }
+
+          @Override
+          public int advance(int target) throws IOException {
+            while (true) {
+
+              if (target > upTo) {
+                updateMaxScoresAndLists(target);
+              } else {
+                // minCompetitiveScore might have increased,
+                // move potentially no-longer-competitive scorers from 
essential to non-essential
+                // list
+                movePotentiallyNonCompetitiveScorers();
+              }
+
+              assert target <= upTo;
+
+              DisiWrapper top = essentialsScorers.top();
+
+              if (top == null) {
+                // all scorers in non-essential list, skip to next boundary or 
return no_more_docs
+                if (upTo == NO_MORE_DOCS) {
+                  return doc = NO_MORE_DOCS;
+                } else {
+                  target = upTo + 1;
+                }
+              } else {
+                // position all scorers in essential list to on or after target
+                while (top.doc < target) {
+                  top.doc = top.iterator.advance(target);
+                  top = essentialsScorers.updateTop();
+                }
+
+                if (top.doc == NO_MORE_DOCS) {
+                  return doc = NO_MORE_DOCS;
+                } else if (top.doc > upTo) {
+                  target = upTo + 1;
+                } else {
+                  float matchedMaxScoreSum = nonEssentialMaxScoreSum;
+
+                  for (DisiWrapper w = essentialsScorers.topList(); w != null; 
w = w.next) {
+                    matchedMaxScoreSum += w.scorer.score();
+                  }
+
+                  if (matchedMaxScoreSum < minCompetitiveScore) {
+                    // skip straight to next candidate doc from essential 
scorer
+                    int docId = top.doc;
+                    do {
+                      top.doc = top.iterator.nextDoc();
+                      top = essentialsScorers.updateTop();
+                    } while (top.doc == docId);
+
+                    target = top.doc;
+                  } else {
+                    return doc = top.doc;
+                  }
+                }
+              }
+            }
+          }
+
+          private void movePotentiallyNonCompetitiveScorers() {
+            while (maxScoreSortedEssentialScorers.size() > 0
+                && nonEssentialMaxScoreSum + 
maxScoreSortedEssentialScorers.get(0).maxScoreFloat
+                    < minCompetitiveScore) {
+              DisiWrapper nextLeastContributingScorer =
+                  maxScoreSortedEssentialScorers.removeFirst();
+              nonEssentialMaxScoreSum += 
nextLeastContributingScorer.maxScoreFloat;
+            }
+
+            // list adjusted
+            if (essentialsScorers.size() != 
maxScoreSortedEssentialScorers.size()) {
+              essentialsScorers.clear();
+              for (DisiWrapper w : maxScoreSortedEssentialScorers) {
+                essentialsScorers.add(w);
+              }
+            }
+          }
+
+          private void updateMaxScoresAndLists(int target) throws IOException {
+            assert target > upTo;
+            // Next candidate doc id is above interval boundary, or 
minCompetitive has increased.
+            // Find next interval boundary.
+            // Block boundary alignment strategy is adapted from "Optimizing 
Top-k Document
+            // Retrieval Strategies for Block-Max Indexes" by Dimopoulos, 
Nepomnyachiy and Suel.
+            // Find the block interval boundary that is the minimum of all 
participating scorer's
+            // block boundary. Then run BMM within each interval.
+            updateUpToAndMaxScore(target);
+            repartitionLists();
+          }
+
+          private void updateUpToAndMaxScore(int target) throws IOException {
+            // reset upTo
+            upTo = -1;
+            for (DisiWrapper w : allScorers) {
+              upTo = Math.max(w.scorer.advanceShallow(Math.max(w.doc, 
target)), upTo);
+            }
+            assert target <= upTo;
+
+            for (DisiWrapper w : allScorers) {
+              if (w.doc <= upTo) {
+                w.maxScoreFloat = w.scorer.getMaxScore(upTo);
+              } else {
+                // This scorer won't be able to contribute to match for 
target, setting its maxScore
+                // to 0 so it goes into nonEssentialList
+                w.maxScoreFloat = 0;
+              }
+            }
+          }
+
+          private void repartitionLists() {
+            essentialsScorers.clear();
+            maxScoreSortedEssentialScorers.clear();
+            Arrays.sort(allScorers, Comparator.comparingDouble(scorer -> 
scorer.maxScoreFloat));
+
+            // Re-partition the scorers into non-essential list and essential 
list, as defined in
+            // the "Optimizing Top-k Document Retrieval Strategies for 
Block-Max Indexes" paper.
+            nonEssentialMaxScoreSum = 0;
+            for (DisiWrapper w : allScorers) {
+              if (nonEssentialMaxScoreSum + w.maxScoreFloat < 
minCompetitiveScore) {
+                nonEssentialMaxScoreSum += w.maxScoreFloat;
+              } else {
+                maxScoreSortedEssentialScorers.add(w);
+                essentialsScorers.add(w);
+              }
+            }
+          }
+
+          @Override
+          public long cost() {
+            // fixed at initialization
+            return cost;
+          }
+        };
+
+    return new TwoPhaseIterator(approximation) {
+      @Override
+      public boolean matches() throws IOException {
+        return score() >= minCompetitiveScore;

Review Comment:
   Implementing `matches()` like that has the good side effect that we only 
pass good matches to the collector, but the bad side-effect that scores are 
computed twice for competitive hits. I wonder if we should try to cache the 
score so that it's not computed another time by the collector.



##########
lucene/core/src/java/org/apache/lucene/search/BlockMaxMaxscoreScorer.java:
##########
@@ -0,0 +1,314 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Comparator;
+import java.util.LinkedList;
+import java.util.List;
+
+/** Scorer implementing Block-Max Maxscore algorithm */
+public class BlockMaxMaxscoreScorer extends Scorer {
+  private final ScoreMode scoreMode;
+
+  // current doc ID of the leads
+  private int doc;
+
+  // doc id boundary that all scorers maxScore are valid
+  private int upTo = -1;
+
+  // heap of scorers ordered by doc ID
+  private final DisiPriorityQueue essentialsScorers;
+  // list of scorers ordered by maxScore
+  private final LinkedList<DisiWrapper> maxScoreSortedEssentialScorers;
+
+  private final DisiWrapper[] allScorers;
+
+  // sum of max scores of scorers in nonEssentialScorers list
+  private float nonEssentialMaxScoreSum;
+
+  private long cost;
+
+  private final MaxScoreSumPropagator maxScoreSumPropagator;
+
+  // scaled min competitive score
+  private float minCompetitiveScore = 0;
+
+  /**
+   * Constructs a Scorer
+   *
+   * @param weight The weight to be used.
+   * @param scorers The sub scorers this Scorer should iterate on for optional 
clauses
+   * @param scoreMode The scoreMode
+   */
+  public BlockMaxMaxscoreScorer(Weight weight, List<Scorer> scorers, ScoreMode 
scoreMode)
+      throws IOException {
+    super(weight);
+    assert scoreMode == ScoreMode.TOP_SCORES;
+
+    this.scoreMode = scoreMode;
+    this.doc = -1;
+
+    this.allScorers = new DisiWrapper[scorers.size()];
+    int i = 0;
+    this.essentialsScorers = new DisiPriorityQueue(scorers.size());
+    this.maxScoreSortedEssentialScorers = new LinkedList<>();
+
+    long cost = 0;
+    for (Scorer scorer : scorers) {
+      DisiWrapper w = new DisiWrapper(scorer);
+      cost += w.cost;
+      allScorers[i++] = w;
+    }
+
+    this.cost = cost;
+    maxScoreSumPropagator = new MaxScoreSumPropagator(scorers);
+  }
+
+  @Override
+  public DocIdSetIterator iterator() {
+    // twoPhaseIterator needed to honor scorer.setMinCompetitiveScore guarantee
+    return TwoPhaseIterator.asDocIdSetIterator(twoPhaseIterator());
+  }
+
+  @Override
+  public TwoPhaseIterator twoPhaseIterator() {
+    DocIdSetIterator approximation =
+        new DocIdSetIterator() {
+
+          @Override
+          public int docID() {
+            return doc;
+          }
+
+          @Override
+          public int nextDoc() throws IOException {
+            return advance(doc + 1);
+          }
+
+          @Override
+          public int advance(int target) throws IOException {
+            while (true) {
+
+              if (target > upTo) {
+                updateMaxScoresAndLists(target);
+              } else {
+                // minCompetitiveScore might have increased,
+                // move potentially no-longer-competitive scorers from 
essential to non-essential
+                // list
+                movePotentiallyNonCompetitiveScorers();
+              }
+
+              assert target <= upTo;
+
+              DisiWrapper top = essentialsScorers.top();
+
+              if (top == null) {
+                // all scorers in non-essential list, skip to next boundary or 
return no_more_docs
+                if (upTo == NO_MORE_DOCS) {
+                  return doc = NO_MORE_DOCS;
+                } else {
+                  target = upTo + 1;
+                }
+              } else {
+                // position all scorers in essential list to on or after target
+                while (top.doc < target) {
+                  top.doc = top.iterator.advance(target);
+                  top = essentialsScorers.updateTop();
+                }
+
+                if (top.doc == NO_MORE_DOCS) {
+                  return doc = NO_MORE_DOCS;
+                } else if (top.doc > upTo) {
+                  target = upTo + 1;
+                } else {
+                  float matchedMaxScoreSum = nonEssentialMaxScoreSum;
+
+                  for (DisiWrapper w = essentialsScorers.topList(); w != null; 
w = w.next) {
+                    matchedMaxScoreSum += w.scorer.score();
+                  }
+
+                  if (matchedMaxScoreSum < minCompetitiveScore) {
+                    // skip straight to next candidate doc from essential 
scorer
+                    int docId = top.doc;
+                    do {
+                      top.doc = top.iterator.nextDoc();
+                      top = essentialsScorers.updateTop();
+                    } while (top.doc == docId);
+
+                    target = top.doc;
+                  } else {
+                    return doc = top.doc;
+                  }
+                }
+              }
+            }
+          }
+
+          private void movePotentiallyNonCompetitiveScorers() {
+            while (maxScoreSortedEssentialScorers.size() > 0
+                && nonEssentialMaxScoreSum + 
maxScoreSortedEssentialScorers.get(0).maxScoreFloat
+                    < minCompetitiveScore) {
+              DisiWrapper nextLeastContributingScorer =
+                  maxScoreSortedEssentialScorers.removeFirst();
+              nonEssentialMaxScoreSum += 
nextLeastContributingScorer.maxScoreFloat;
+            }
+
+            // list adjusted
+            if (essentialsScorers.size() != 
maxScoreSortedEssentialScorers.size()) {
+              essentialsScorers.clear();
+              for (DisiWrapper w : maxScoreSortedEssentialScorers) {
+                essentialsScorers.add(w);
+              }
+            }
+          }
+
+          private void updateMaxScoresAndLists(int target) throws IOException {
+            assert target > upTo;
+            // Next candidate doc id is above interval boundary, or 
minCompetitive has increased.
+            // Find next interval boundary.
+            // Block boundary alignment strategy is adapted from "Optimizing 
Top-k Document
+            // Retrieval Strategies for Block-Max Indexes" by Dimopoulos, 
Nepomnyachiy and Suel.
+            // Find the block interval boundary that is the minimum of all 
participating scorer's
+            // block boundary. Then run BMM within each interval.
+            updateUpToAndMaxScore(target);
+            repartitionLists();
+          }
+
+          private void updateUpToAndMaxScore(int target) throws IOException {
+            // reset upTo
+            upTo = -1;
+            for (DisiWrapper w : allScorers) {
+              upTo = Math.max(w.scorer.advanceShallow(Math.max(w.doc, 
target)), upTo);
+            }

Review Comment:
   I would add a comment that taking the `max` is probably a good approach for 
2 clauses but might need more thinking if we want this scorer to perform well 
with more than 2 clauses.



##########
lucene/core/src/java/org/apache/lucene/search/BlockMaxMaxscoreScorer.java:
##########
@@ -0,0 +1,314 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Comparator;
+import java.util.LinkedList;
+import java.util.List;
+
+/** Scorer implementing Block-Max Maxscore algorithm */
+public class BlockMaxMaxscoreScorer extends Scorer {
+  private final ScoreMode scoreMode;

Review Comment:
   We don't seem to actually need the `scoreMode`, let's remove it and make 
sure `Boolean2ScorerSupplier` only uses this scorer for TOP_SCORES?



##########
lucene/core/src/java/org/apache/lucene/search/BlockMaxMaxscoreScorer.java:
##########
@@ -0,0 +1,314 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Comparator;
+import java.util.LinkedList;
+import java.util.List;
+
+/** Scorer implementing Block-Max Maxscore algorithm */
+public class BlockMaxMaxscoreScorer extends Scorer {
+  private final ScoreMode scoreMode;
+
+  // current doc ID of the leads
+  private int doc;
+
+  // doc id boundary that all scorers maxScore are valid
+  private int upTo = -1;
+
+  // heap of scorers ordered by doc ID
+  private final DisiPriorityQueue essentialsScorers;
+  // list of scorers ordered by maxScore
+  private final LinkedList<DisiWrapper> maxScoreSortedEssentialScorers;
+
+  private final DisiWrapper[] allScorers;
+
+  // sum of max scores of scorers in nonEssentialScorers list
+  private float nonEssentialMaxScoreSum;
+
+  private long cost;
+
+  private final MaxScoreSumPropagator maxScoreSumPropagator;
+
+  // scaled min competitive score
+  private float minCompetitiveScore = 0;
+
+  /**
+   * Constructs a Scorer
+   *
+   * @param weight The weight to be used.
+   * @param scorers The sub scorers this Scorer should iterate on for optional 
clauses
+   * @param scoreMode The scoreMode
+   */
+  public BlockMaxMaxscoreScorer(Weight weight, List<Scorer> scorers, ScoreMode 
scoreMode)
+      throws IOException {
+    super(weight);
+    assert scoreMode == ScoreMode.TOP_SCORES;
+
+    this.scoreMode = scoreMode;
+    this.doc = -1;
+
+    this.allScorers = new DisiWrapper[scorers.size()];
+    int i = 0;
+    this.essentialsScorers = new DisiPriorityQueue(scorers.size());
+    this.maxScoreSortedEssentialScorers = new LinkedList<>();
+
+    long cost = 0;
+    for (Scorer scorer : scorers) {
+      DisiWrapper w = new DisiWrapper(scorer);
+      cost += w.cost;
+      allScorers[i++] = w;
+    }
+
+    this.cost = cost;
+    maxScoreSumPropagator = new MaxScoreSumPropagator(scorers);
+  }
+
+  @Override
+  public DocIdSetIterator iterator() {
+    // twoPhaseIterator needed to honor scorer.setMinCompetitiveScore guarantee
+    return TwoPhaseIterator.asDocIdSetIterator(twoPhaseIterator());
+  }
+
+  @Override
+  public TwoPhaseIterator twoPhaseIterator() {
+    DocIdSetIterator approximation =
+        new DocIdSetIterator() {
+
+          @Override
+          public int docID() {
+            return doc;
+          }
+
+          @Override
+          public int nextDoc() throws IOException {
+            return advance(doc + 1);
+          }
+
+          @Override
+          public int advance(int target) throws IOException {
+            while (true) {
+
+              if (target > upTo) {
+                updateMaxScoresAndLists(target);
+              } else {
+                // minCompetitiveScore might have increased,
+                // move potentially no-longer-competitive scorers from 
essential to non-essential
+                // list
+                movePotentiallyNonCompetitiveScorers();
+              }
+
+              assert target <= upTo;
+
+              DisiWrapper top = essentialsScorers.top();
+
+              if (top == null) {
+                // all scorers in non-essential list, skip to next boundary or 
return no_more_docs
+                if (upTo == NO_MORE_DOCS) {
+                  return doc = NO_MORE_DOCS;
+                } else {
+                  target = upTo + 1;
+                }
+              } else {
+                // position all scorers in essential list to on or after target
+                while (top.doc < target) {
+                  top.doc = top.iterator.advance(target);
+                  top = essentialsScorers.updateTop();
+                }
+
+                if (top.doc == NO_MORE_DOCS) {
+                  return doc = NO_MORE_DOCS;
+                } else if (top.doc > upTo) {
+                  target = upTo + 1;
+                } else {
+                  float matchedMaxScoreSum = nonEssentialMaxScoreSum;
+
+                  for (DisiWrapper w = essentialsScorers.topList(); w != null; 
w = w.next) {
+                    matchedMaxScoreSum += w.scorer.score();
+                  }
+
+                  if (matchedMaxScoreSum < minCompetitiveScore) {
+                    // skip straight to next candidate doc from essential 
scorer
+                    int docId = top.doc;
+                    do {
+                      top.doc = top.iterator.nextDoc();
+                      top = essentialsScorers.updateTop();
+                    } while (top.doc == docId);
+
+                    target = top.doc;
+                  } else {
+                    return doc = top.doc;
+                  }
+                }
+              }
+            }
+          }
+
+          private void movePotentiallyNonCompetitiveScorers() {
+            while (maxScoreSortedEssentialScorers.size() > 0
+                && nonEssentialMaxScoreSum + 
maxScoreSortedEssentialScorers.get(0).maxScoreFloat
+                    < minCompetitiveScore) {
+              DisiWrapper nextLeastContributingScorer =
+                  maxScoreSortedEssentialScorers.removeFirst();
+              nonEssentialMaxScoreSum += 
nextLeastContributingScorer.maxScoreFloat;
+            }
+
+            // list adjusted
+            if (essentialsScorers.size() != 
maxScoreSortedEssentialScorers.size()) {
+              essentialsScorers.clear();
+              for (DisiWrapper w : maxScoreSortedEssentialScorers) {
+                essentialsScorers.add(w);
+              }
+            }
+          }
+
+          private void updateMaxScoresAndLists(int target) throws IOException {
+            assert target > upTo;
+            // Next candidate doc id is above interval boundary, or 
minCompetitive has increased.
+            // Find next interval boundary.
+            // Block boundary alignment strategy is adapted from "Optimizing 
Top-k Document
+            // Retrieval Strategies for Block-Max Indexes" by Dimopoulos, 
Nepomnyachiy and Suel.
+            // Find the block interval boundary that is the minimum of all 
participating scorer's
+            // block boundary. Then run BMM within each interval.
+            updateUpToAndMaxScore(target);
+            repartitionLists();
+          }
+
+          private void updateUpToAndMaxScore(int target) throws IOException {
+            // reset upTo
+            upTo = -1;
+            for (DisiWrapper w : allScorers) {
+              upTo = Math.max(w.scorer.advanceShallow(Math.max(w.doc, 
target)), upTo);
+            }
+            assert target <= upTo;
+
+            for (DisiWrapper w : allScorers) {
+              if (w.doc <= upTo) {
+                w.maxScoreFloat = w.scorer.getMaxScore(upTo);
+              } else {
+                // This scorer won't be able to contribute to match for 
target, setting its maxScore
+                // to 0 so it goes into nonEssentialList
+                w.maxScoreFloat = 0;
+              }
+            }
+          }
+
+          private void repartitionLists() {
+            essentialsScorers.clear();
+            maxScoreSortedEssentialScorers.clear();
+            Arrays.sort(allScorers, Comparator.comparingDouble(scorer -> 
scorer.maxScoreFloat));
+
+            // Re-partition the scorers into non-essential list and essential 
list, as defined in
+            // the "Optimizing Top-k Document Retrieval Strategies for 
Block-Max Indexes" paper.
+            nonEssentialMaxScoreSum = 0;
+            for (DisiWrapper w : allScorers) {
+              if (nonEssentialMaxScoreSum + w.maxScoreFloat < 
minCompetitiveScore) {
+                nonEssentialMaxScoreSum += w.maxScoreFloat;
+              } else {
+                maxScoreSortedEssentialScorers.add(w);
+                essentialsScorers.add(w);
+              }
+            }
+          }
+
+          @Override
+          public long cost() {
+            // fixed at initialization
+            return cost;
+          }
+        };
+
+    return new TwoPhaseIterator(approximation) {
+      @Override
+      public boolean matches() throws IOException {
+        return score() >= minCompetitiveScore;
+      }
+
+      @Override
+      public float matchCost() {
+        // maximum number of scorer that matches() might advance
+        return allScorers.length - essentialsScorers.size();

Review Comment:
   `matchCost` is expected to be fixed, we shouldn't try to adjust it based on 
the number of essential scorers, see e.g. how `ConjunctionDISI` uses it to sort 
two-phase iterators up-front.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org

Reply via email to