[GitHub] [lucene-solr] msokolov commented on a change in pull request #2063: LUCENE-9599 Make comparator aware of index sorting

GitBox Thu, 05 Nov 2020 05:40:38 -0800


msokolov commented on a change in pull request #2063:
URL: https://github.com/apache/lucene-solr/pull/2063#discussion_r518055212




##########
File path: 
lucene/core/src/java/org/apache/lucene/search/comparators/TermOrdValComparator.java
##########
@@ -0,0 +1,324 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.search.comparators;
+
+import org.apache.lucene.index.DocValues;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.SortedDocValues;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.search.FieldComparator;
+import org.apache.lucene.search.LeafFieldComparator;
+import org.apache.lucene.search.Scorable;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefBuilder;
+
+import java.io.IOException;
+
+/**
+ * Comparator that sorts by field's natural Term sort order using ordinals.
+ * This is functionally equivalent to
+ * {@link org.apache.lucene.search.comparators.TermValComparator},
+ * but it first resolves the string to their relative ordinal positions
+ * (using the index returned by
+ * {@link org.apache.lucene.index.LeafReader#getSortedDocValues(String)}),
+ * and does most comparisons using the ordinals.
+ * For medium to large results, this comparator will be much faster
+ * than {@link org.apache.lucene.search.comparators.TermValComparator}.
+ * For very small result sets it may be slower.
+ *
+ * The comparator provides an iterator that can efficiently skip
+ * documents when search sort is done according to the index sort.
+ */
+public class TermOrdValComparator extends FieldComparator<BytesRef> {
+  private final String field;
+  private final boolean reverse;
+  private final int[] ords; // ordinals for each slot
+  private final BytesRef[] values; // values for each slot
+  private final BytesRefBuilder[] tempBRs;
+  /* Which reader last copied a value into the slot. When
+     we compare two slots, we just compare-by-ord if the
+     readerGen is the same; else we must compare the
+     values (slower).*/
+  private final int[] readers;
+  private int currentReader = -1; // index of the current reader we are on
+  private final int missingSortCmp; // -1 – if missing values are sorted 
first, 1 – if sorted last
+  private final int missingOrd; // which ordinal to use for a missing value
+
+  private BytesRef topValue;
+  private boolean topSameReader;
+  private int topOrd;
+
+  private BytesRef bottomValue;
+  boolean bottomSameReader; // true if current bottom slot matches the current 
reader
+  int bottomSlot = -1; // bottom slot, or -1 if queue isn't full yet
+  int bottomOrd; // bottom ord (same as ords[bottomSlot] once bottomSlot is 
set), cached for faster comparison
+
+  protected boolean hitsThresholdReached;
+
+  public TermOrdValComparator(int numHits, String field, boolean 
sortMissingLast, boolean reverse) {
+    this.field = field;
+    this.reverse = reverse;
+    this.ords = new int[numHits];
+    this.values = new BytesRef[numHits];
+    tempBRs = new BytesRefBuilder[numHits];
+    readers = new int[numHits];
+    if (sortMissingLast) {
+      missingSortCmp = 1;
+      missingOrd = Integer.MAX_VALUE;
+    } else {
+      missingSortCmp = -1;
+      missingOrd = -1;
+    }
+  }
+
+  @Override
+  public int compare(int slot1, int slot2) {
+    if (readers[slot1] == readers[slot2]) {
+      return ords[slot1] - ords[slot2];
+    }
+    final BytesRef val1 = values[slot1];
+    final BytesRef val2 = values[slot2];
+    if (val1 == null) {
+      if (val2 == null) {
+        return 0;
+      }
+      return missingSortCmp;
+    } else if (val2 == null) {
+      return -missingSortCmp;
+    }
+    return val1.compareTo(val2);
+  }
+
+  @Override
+  public void setTopValue(BytesRef value) {
+    // null is accepted, this means the last doc of the prior search was 
missing this value
+    topValue = value;
+  }
+
+  @Override
+  public BytesRef value(int slot) {
+    return values[slot];
+  }
+
+  @Override
+  public LeafFieldComparator getLeafComparator(LeafReaderContext context) 
throws IOException {
+    return new TermOrdValLeafComparator(context);
+  }
+
+  @Override
+  public int compareValues(BytesRef val1, BytesRef val2) {
+    if (val1 == null) {
+      if (val2 == null) {
+        return 0;
+      }
+      return missingSortCmp;
+    } else if (val2 == null) {
+      return -missingSortCmp;
+    }
+    return val1.compareTo(val2);
+  }
+
+  /**
+   * Leaf comparator for {@link TermOrdValComparator} that provides skipping 
functionality when index is sorted
+   */
+  public class TermOrdValLeafComparator implements LeafFieldComparator {
+    private final SortedDocValues termsIndex;
+    private boolean indexSort = false; // true if a query sort is a part of 
the index sort
+    private DocIdSetIterator competitiveIterator;
+    private boolean collectedAllCompetitiveHits = false;
+    private boolean iteratorUpdated = false;
+
+    public TermOrdValLeafComparator(LeafReaderContext context) throws 
IOException {
+      termsIndex = getSortedDocValues(context, field);
+      currentReader++;
+      if (topValue != null) {
+        // Recompute topOrd/SameReader
+        int ord = termsIndex.lookupTerm(topValue);
+        if (ord >= 0) {
+          topSameReader = true;
+          topOrd = ord;
+        } else {
+          topSameReader = false;
+          topOrd = -ord-2;
+        }
+      } else {
+        topOrd = missingOrd;
+        topSameReader = true;
+      }
+      if (bottomSlot != -1) {
+        // Recompute bottomOrd/SameReader
+        setBottom(bottomSlot);
+      }
+      this.competitiveIterator = 
DocIdSetIterator.all(context.reader().maxDoc());
+    }
+
+    protected SortedDocValues getSortedDocValues(LeafReaderContext context, 
String field) throws IOException {
+      return DocValues.getSorted(context.reader(), field);
+    }
+
+    @Override
+    public void setBottom(final int slot) throws IOException {
+      bottomSlot = slot;
+      bottomValue = values[bottomSlot];
+      if (currentReader == readers[bottomSlot]) {
+        bottomOrd = ords[bottomSlot];
+        bottomSameReader = true;
+      } else {
+        if (bottomValue == null) {
+          // missingOrd is null for all segments
+          assert ords[bottomSlot] == missingOrd;
+          bottomOrd = missingOrd;
+          bottomSameReader = true;
+          readers[bottomSlot] = currentReader;
+        } else {
+          final int ord = termsIndex.lookupTerm(bottomValue);
+          if (ord < 0) {
+            bottomOrd = -ord - 2;
+            bottomSameReader = false;
+          } else {
+            bottomOrd = ord;
+            // exact value match
+            bottomSameReader = true;
+            readers[bottomSlot] = currentReader;
+            ords[bottomSlot] = bottomOrd;
+          }
+        }
+      }
+    }
+
+    @Override
+    public int compareBottom(int doc) throws IOException {
+      assert bottomSlot != -1;
+      int docOrd = getOrdForDoc(doc);
+      if (docOrd == -1) {
+        docOrd = missingOrd;
+      }
+      int result;
+      if (bottomSameReader) {
+        // ord is precisely comparable, even in the equal case
+        result = bottomOrd - docOrd;
+      } else if (bottomOrd >= docOrd) {
+        // the equals case always means bottom is > doc
+        // (because we set bottomOrd to the lower bound in setBottom):
+        result = 1;
+      } else {
+        result = -1;
+      }
+      // for the index sort case, if we encounter a first doc that is 
non-competitive,
+      // and the hits threshold is reached, we can update the iterator to skip 
the rest of docs
+      if (indexSort && (reverse ? result >= 0 : result <= 0)) {

Review comment:
       oh it looks like there was some new stuff here related to indexSort. I 
wonder if it would be possible to separate the change moving these classes to 
top level for the sake of review? If it's a pain, perhaps you could just 
comment on what the changes are here?

##########
File path: 
lucene/core/src/test/org/apache/lucene/search/TestFieldSortOptimizationSkipping.java
##########
@@ -485,4 +491,186 @@ public void testDocSort() throws IOException {
     dir.close();
   }
 
+  public void testNumericSortOptimizationIndexSort() throws IOException {

Review comment:
       Do we have a test for the case where the index sort has multiple Sorts 
in it and the query sort matches its first n components?

##########
File path: 
lucene/core/src/java/org/apache/lucene/search/comparators/TermOrdValComparator.java
##########
@@ -0,0 +1,324 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.search.comparators;
+
+import org.apache.lucene.index.DocValues;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.SortedDocValues;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.search.FieldComparator;
+import org.apache.lucene.search.LeafFieldComparator;
+import org.apache.lucene.search.Scorable;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefBuilder;
+
+import java.io.IOException;
+
+/**
+ * Comparator that sorts by field's natural Term sort order using ordinals.
+ * This is functionally equivalent to
+ * {@link org.apache.lucene.search.comparators.TermValComparator},
+ * but it first resolves the string to their relative ordinal positions
+ * (using the index returned by
+ * {@link org.apache.lucene.index.LeafReader#getSortedDocValues(String)}),
+ * and does most comparisons using the ordinals.
+ * For medium to large results, this comparator will be much faster
+ * than {@link org.apache.lucene.search.comparators.TermValComparator}.
+ * For very small result sets it may be slower.
+ *
+ * The comparator provides an iterator that can efficiently skip
+ * documents when search sort is done according to the index sort.
+ */
+public class TermOrdValComparator extends FieldComparator<BytesRef> {

Review comment:
       Oh I see reverse was added..

##########
File path: 
lucene/core/src/java/org/apache/lucene/search/comparators/TermOrdValComparator.java
##########
@@ -0,0 +1,324 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.search.comparators;
+
+import org.apache.lucene.index.DocValues;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.SortedDocValues;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.search.FieldComparator;
+import org.apache.lucene.search.LeafFieldComparator;
+import org.apache.lucene.search.Scorable;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefBuilder;
+
+import java.io.IOException;
+
+/**
+ * Comparator that sorts by field's natural Term sort order using ordinals.
+ * This is functionally equivalent to
+ * {@link org.apache.lucene.search.comparators.TermValComparator},
+ * but it first resolves the string to their relative ordinal positions
+ * (using the index returned by
+ * {@link org.apache.lucene.index.LeafReader#getSortedDocValues(String)}),
+ * and does most comparisons using the ordinals.
+ * For medium to large results, this comparator will be much faster
+ * than {@link org.apache.lucene.search.comparators.TermValComparator}.
+ * For very small result sets it may be slower.
+ *
+ * The comparator provides an iterator that can efficiently skip
+ * documents when search sort is done according to the index sort.
+ */
+public class TermOrdValComparator extends FieldComparator<BytesRef> {

Review comment:
       Is this just moved to top-level class, or were there changes? It's hard 
to tell in the PR




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org

[GitHub] [lucene-solr] msokolov commented on a change in pull request #2063: LUCENE-9599 Make comparator aware of index sorting

Reply via email to