jpountz commented on a change in pull request #404:
URL: https://github.com/apache/lucene/pull/404#discussion_r739310053



##########
File path: lucene/core/src/java/org/apache/lucene/util/IntroSorter.java
##########
@@ -20,66 +20,120 @@
  * {@link Sorter} implementation based on a variant of the quicksort algorithm 
called <a
  * href="http://en.wikipedia.org/wiki/Introsort";>introsort</a>: when the 
recursion level exceeds the
  * log of the length of the array to sort, it falls back to heapsort. This 
prevents quicksort from
- * running into its worst-case quadratic runtime. Small arrays are sorted with 
insertion sort.
+ * running into its worst-case quadratic runtime. Small ranges are sorted with 
insertion sort.
  *
  * @lucene.internal
  */
 public abstract class IntroSorter extends Sorter {
 
+  /** Below this size threshold, the partition selection is simplified to a 
single median. */
+  private static final int SINGLE_MEDIAN_THRESHOLD = 40;
+
   /** Create a new {@link IntroSorter}. */
   public IntroSorter() {}
 
   @Override
   public final void sort(int from, int to) {
     checkRange(from, to);
-    quicksort(from, to, 2 * MathUtil.log(to - from, 2));
+    sort(from, to, 2 * MathUtil.log(to - from, 2));
   }
 
-  void quicksort(int from, int to, int maxDepth) {
-    if (to - from < BINARY_SORT_THRESHOLD) {
-      binarySort(from, to);
-      return;
-    } else if (--maxDepth < 0) {
-      heapSort(from, to);
-      return;
-    }
-
-    final int mid = (from + to) >>> 1;
-
-    if (compare(from, mid) > 0) {
-      swap(from, mid);
-    }
-
-    if (compare(mid, to - 1) > 0) {
-      swap(mid, to - 1);
-      if (compare(from, mid) > 0) {
-        swap(from, mid);
+  /**
+   * Sorts between from (inclusive) and to (exclusive) with intro sort.
+   *
+   * <p>Sorts small ranges with insertion sort. Fallbacks to heap sort to 
avoid quadratic worst
+   * case. Selects the pivot with medians and partitions with the 
Bentley-McIlroy fast 3-ways
+   * algorithm (Engineering a Sort Function, Bentley-McIlroy).
+   */
+  void sort(int from, int to, int maxDepth) {
+    int size;
+
+    // Sort small ranges with insertion sort.
+    while ((size = to - from) > INSERTION_SORT_THRESHOLD) {
+
+      if (--maxDepth < 0) {
+        // Max recursion depth reached: fallback to heap sort.
+        heapSort(from, to);
+        return;
       }
-    }
-
-    int left = from + 1;
-    int right = to - 2;
 
-    setPivot(mid);
-    for (; ; ) {
-      while (comparePivot(right) < 0) {
-        --right;
+      // Pivot selection based on medians.
+      int last = to - 1;
+      int mid = (from + last) >>> 1;
+      int range = size >> 3;
+      int pivot;
+      if (size <= SINGLE_MEDIAN_THRESHOLD) {
+        // Select the pivot with a single median around the middle element.
+        // Do not take the median between [from, mid, last] because it hurts 
performance
+        // if the order is descending.
+        pivot = median(mid - range, mid, mid + range);

Review comment:
       In a 32 elements array, we'd be looking at indices 12, 16 and 20, so all 
rather close to the mid element. Should `range` be `size/4` instead of `size/8` 
in that case to be less subject to special distributions (ie. we'd be looking 
at indices 8, 16 and 24 in a 32-elements array)?

##########
File path: lucene/core/src/java/org/apache/lucene/util/IntroSorter.java
##########
@@ -20,66 +20,120 @@
  * {@link Sorter} implementation based on a variant of the quicksort algorithm 
called <a
  * href="http://en.wikipedia.org/wiki/Introsort";>introsort</a>: when the 
recursion level exceeds the
  * log of the length of the array to sort, it falls back to heapsort. This 
prevents quicksort from
- * running into its worst-case quadratic runtime. Small arrays are sorted with 
insertion sort.
+ * running into its worst-case quadratic runtime. Small ranges are sorted with 
insertion sort.
  *
  * @lucene.internal
  */
 public abstract class IntroSorter extends Sorter {
 
+  /** Below this size threshold, the partition selection is simplified to a 
single median. */
+  private static final int SINGLE_MEDIAN_THRESHOLD = 40;
+
   /** Create a new {@link IntroSorter}. */
   public IntroSorter() {}
 
   @Override
   public final void sort(int from, int to) {
     checkRange(from, to);
-    quicksort(from, to, 2 * MathUtil.log(to - from, 2));
+    sort(from, to, 2 * MathUtil.log(to - from, 2));
   }
 
-  void quicksort(int from, int to, int maxDepth) {
-    if (to - from < BINARY_SORT_THRESHOLD) {
-      binarySort(from, to);
-      return;
-    } else if (--maxDepth < 0) {
-      heapSort(from, to);
-      return;
-    }
-
-    final int mid = (from + to) >>> 1;
-
-    if (compare(from, mid) > 0) {
-      swap(from, mid);
-    }
-
-    if (compare(mid, to - 1) > 0) {
-      swap(mid, to - 1);
-      if (compare(from, mid) > 0) {
-        swap(from, mid);
+  /**
+   * Sorts between from (inclusive) and to (exclusive) with intro sort.
+   *
+   * <p>Sorts small ranges with insertion sort. Fallbacks to heap sort to 
avoid quadratic worst
+   * case. Selects the pivot with medians and partitions with the 
Bentley-McIlroy fast 3-ways
+   * algorithm (Engineering a Sort Function, Bentley-McIlroy).
+   */
+  void sort(int from, int to, int maxDepth) {
+    int size;
+
+    // Sort small ranges with insertion sort.
+    while ((size = to - from) > INSERTION_SORT_THRESHOLD) {
+
+      if (--maxDepth < 0) {
+        // Max recursion depth reached: fallback to heap sort.
+        heapSort(from, to);
+        return;
       }
-    }
-
-    int left = from + 1;
-    int right = to - 2;
 
-    setPivot(mid);
-    for (; ; ) {
-      while (comparePivot(right) < 0) {
-        --right;
+      // Pivot selection based on medians.
+      int last = to - 1;
+      int mid = (from + last) >>> 1;
+      int range = size >> 3;
+      int pivot;
+      if (size <= SINGLE_MEDIAN_THRESHOLD) {
+        // Select the pivot with a single median around the middle element.
+        // Do not take the median between [from, mid, last] because it hurts 
performance
+        // if the order is descending.

Review comment:
       For my understanding, why does it hurt performance? If the array is 
descending, then `mid` would be the actual median, which should be the 
best-case scenario?

##########
File path: lucene/core/src/java/org/apache/lucene/util/IntroSorter.java
##########
@@ -20,66 +20,120 @@
  * {@link Sorter} implementation based on a variant of the quicksort algorithm 
called <a
  * href="http://en.wikipedia.org/wiki/Introsort";>introsort</a>: when the 
recursion level exceeds the
  * log of the length of the array to sort, it falls back to heapsort. This 
prevents quicksort from
- * running into its worst-case quadratic runtime. Small arrays are sorted with 
insertion sort.
+ * running into its worst-case quadratic runtime. Small ranges are sorted with 
insertion sort.
  *
  * @lucene.internal
  */
 public abstract class IntroSorter extends Sorter {
 
+  /** Below this size threshold, the partition selection is simplified to a 
single median. */
+  private static final int SINGLE_MEDIAN_THRESHOLD = 40;
+
   /** Create a new {@link IntroSorter}. */
   public IntroSorter() {}
 
   @Override
   public final void sort(int from, int to) {
     checkRange(from, to);
-    quicksort(from, to, 2 * MathUtil.log(to - from, 2));
+    sort(from, to, 2 * MathUtil.log(to - from, 2));
   }
 
-  void quicksort(int from, int to, int maxDepth) {
-    if (to - from < BINARY_SORT_THRESHOLD) {
-      binarySort(from, to);
-      return;
-    } else if (--maxDepth < 0) {
-      heapSort(from, to);
-      return;
-    }
-
-    final int mid = (from + to) >>> 1;
-
-    if (compare(from, mid) > 0) {
-      swap(from, mid);
-    }
-
-    if (compare(mid, to - 1) > 0) {
-      swap(mid, to - 1);
-      if (compare(from, mid) > 0) {
-        swap(from, mid);
+  /**
+   * Sorts between from (inclusive) and to (exclusive) with intro sort.
+   *
+   * <p>Sorts small ranges with insertion sort. Fallbacks to heap sort to 
avoid quadratic worst
+   * case. Selects the pivot with medians and partitions with the 
Bentley-McIlroy fast 3-ways
+   * algorithm (Engineering a Sort Function, Bentley-McIlroy).
+   */
+  void sort(int from, int to, int maxDepth) {
+    int size;
+
+    // Sort small ranges with insertion sort.
+    while ((size = to - from) > INSERTION_SORT_THRESHOLD) {
+
+      if (--maxDepth < 0) {
+        // Max recursion depth reached: fallback to heap sort.
+        heapSort(from, to);
+        return;
       }
-    }
-
-    int left = from + 1;
-    int right = to - 2;
 
-    setPivot(mid);
-    for (; ; ) {
-      while (comparePivot(right) < 0) {
-        --right;
+      // Pivot selection based on medians.
+      int last = to - 1;
+      int mid = (from + last) >>> 1;
+      int range = size >> 3;
+      int pivot;
+      if (size <= SINGLE_MEDIAN_THRESHOLD) {
+        // Select the pivot with a single median around the middle element.
+        // Do not take the median between [from, mid, last] because it hurts 
performance
+        // if the order is descending.
+        pivot = median(mid - range, mid, mid + range);
+      } else {
+        // Select the pivot with the median of medians.
+        int doubleRange = range << 1;
+        int medianFirst = median(from, from + range, from + doubleRange);
+        int medianMiddle = median(mid - range, mid, mid + range);
+        int medianLast = median(last - doubleRange, last - range, last);
+        pivot = median(medianFirst, medianMiddle, medianLast);
       }
 
-      while (left < right && comparePivot(left) >= 0) {
-        ++left;
+      // Bentley-McIlroy 3-way partitioning.
+      setPivot(pivot);
+      swap(from, pivot);
+      int i = from;
+      int j = to;
+      int p = from + 1;
+      int q = last;
+      while (true) {
+        int leftCmp, rightCmp;
+        while ((leftCmp = comparePivot(++i)) > 0) {}
+        while ((rightCmp = comparePivot(--j)) < 0) {}
+        if (i >= j) {
+          if (i == j && rightCmp == 0) {
+            swap(i, p);
+          }
+          break;
+        }
+        swap(i, j);
+        if (rightCmp == 0) {
+          swap(i, p++);
+        }
+        if (leftCmp == 0) {
+          swap(j, q--);
+        }
+      }
+      i = j + 1;
+      for (int k = from; k < p; ) {
+        swap(k++, j--);
+      }
+      for (int k = last; k > q; ) {
+        swap(k--, i++);
       }
 
-      if (left < right) {
-        swap(left, right);
-        --right;
+      // Recursion on the smallest partition. Replace the tail recursion by a 
loop.

Review comment:
       should the second sentence be a TODO?

##########
File path: lucene/core/src/test/org/apache/lucene/util/SorterBenchmark.java
##########
@@ -0,0 +1,112 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.util;
+
+import java.util.Locale;
+import java.util.Random;
+import org.apache.lucene.util.BaseSortTestCase.Entry;
+import org.apache.lucene.util.BaseSortTestCase.Strategy;
+
+/**
+ * Benchmark for {@link Sorter} implementations.
+ *
+ * <p>Run the static {@link #main(String[])} method to start the benchmark.
+ */
+public class SorterBenchmark {

Review comment:
       I wonder if luceneutil would be a better home for this.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]



---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to