[GitHub] iverase commented on a change in pull request #556: LUCENE-8673: Use radix partitioning when merging dimensional points

GitBox Mon, 04 Feb 2019 06:42:54 -0800

iverase commented on a change in pull request #556: LUCENE-8673: Use radix 
partitioning when merging dimensional points
URL: https://github.com/apache/lucene-solr/pull/556#discussion_r253495525


 ##########
 File path: 
lucene/core/src/java/org/apache/lucene/util/bkd/BKDRadixSelector.java
 ##########
 @@ -0,0 +1,433 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.util.bkd;
+
+import java.io.IOException;
+import java.util.Arrays;
+
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.FutureArrays;
+import org.apache.lucene.util.IntroSelector;
+
+/**
+ * Offline radix selector for the BKD tree.
+ *
+ * @lucene.internal
+ */
+public final class BKDRadixSelector {
+  // number of buckets in each histogram (buckets are indexed by a byte masked with 0xff)
+  private static final int HISTOGRAM_SIZE = 256;
+  // we store one histogram per recursion level; allocated with bytesSorted rows
+  private final long[][] histogram;
+  // number of bytes in a single dimension value
+  private final int bytesPerDim;
+  // number of bytes to be sorted: bytesPerDim + Integer.BYTES
+  private final int bytesSorted;
+  // length in bytes of all packed dimension values: numDim * bytesPerDim
+  private final int packedBytesLength;
+  // marks when we move to sorting on heap; also sizes offlineBuffer and is passed to the offline reader
+  private final int maxPointsSortedOffHeap;
+  // reusable buffer handed to the offline reader
+  private final byte[] offlineBuffer;
+  // holder for partition points (bucket index for each sorted byte position)
+  private final int[] partitionBucket;
+  // holder for partition bytes (byte value for each sorted byte position)
+  private final byte[] partitionBytes;
+  // reusable on-heap selector
+  private final HeapSelector heapSelector;
+  // scratch object to move bytes around; its length is preset to the packed value length
+  private final BytesRef bytesRef1 = new BytesRef();
+  // scratch object to move bytes around
+  private final BytesRef bytesRef2 = new BytesRef();
+  // directory used to create new offline writers
+  private final Directory tempDir;
+  // prefix for temp files
+  private final String tempFileNamePrefix;
+
+
+
+  /**
+   * Sole constructor. Records the sizing parameters and pre-allocates the reusable
+   * offline buffer, the histograms, the partition holders and the on-heap selector.
+   *
+   * @param numDim number of dimensions; packedBytesLength is numDim * bytesPerDim
+   * @param bytesPerDim number of bytes in a single dimension value
+   * @param maxPointsSortedOffHeap number of points the reusable offline buffer is sized for
+   * @param tempDir directory kept for creating temporary offline writers
+   * @param tempFileNamePrefix prefix kept for naming temporary files
+   */
+  public BKDRadixSelector(int numDim, int bytesPerDim, int 
maxPointsSortedOffHeap, Directory tempDir, String tempFileNamePrefix) {
+    this.bytesPerDim = bytesPerDim;
+    this.packedBytesLength = numDim * bytesPerDim;
+    this.bytesSorted = bytesPerDim + Integer.BYTES; // dimension bytes followed by the docID bytes
+    this.maxPointsSortedOffHeap = maxPointsSortedOffHeap;
+    this.offlineBuffer = new byte[maxPointsSortedOffHeap * (packedBytesLength 
+ Integer.BYTES)]; // NOTE(review): size is an int multiplication and could overflow for large arguments; confirm callers bound it
+    this.partitionBucket = new int[bytesSorted];
+    this.partitionBytes =  new byte[bytesSorted];
+    this.histogram = new long[bytesSorted][HISTOGRAM_SIZE]; // one row per sorted byte position
+    this.bytesRef1.length = numDim * bytesPerDim; // scratch ref length preset to one packed value
+    this.heapSelector = new HeapSelector(numDim, bytesPerDim);
+    this.tempDir = tempDir;
+    this.tempFileNamePrefix = tempFileNamePrefix;
+  }
+
+  /**
+   * Partitions the input points around {@code partitionPoint} on dimension {@code dim}
+   * and returns the value of the dimension where the split happens.
+   *
+   * On-heap input is delegated to heapSelect. For offline input the histograms are
+   * reset and the common prefix is computed; the data is then either partitioned
+   * directly (every sorted byte is shared) or handed to buildHistogramAndPartition.
+   *
+   * @param points the points to partition; a HeapPointWriter or an OfflinePointWriter
+   * @param left writer passed along for the left partition
+   * @param right writer passed along for the right partition
+   * @param from first point to consider (inclusive)
+   * @param to end of the points to consider (exclusive)
+   * @param partitionPoint position of the split; must satisfy {@code from <= partitionPoint < to}
+   * @param dim index of the dimension to partition on
+   * @return the value of the dimension where the split happens
+   * @throws IllegalArgumentException if partitionPoint is outside [from, to)
+   * @throws IOException if the underlying point readers or writers fail
+   */
+  public byte[] select(PointWriter points, PointWriter left, PointWriter 
right, long from, long to, long partitionPoint, int dim) throws IOException {
+    checkArgs(from, to, partitionPoint);
+
+    //If the points are already on heap we just select on heap; Math.toIntExact throws if a position does not fit in an int
+    if (points instanceof HeapPointWriter) {
+      return heapSelect((HeapPointWriter) points, left, right, dim, 
Math.toIntExact(from), Math.toIntExact(to),  Math.toIntExact(partitionPoint), 
0);
+    }
+
+    //reset the histogram of every sorted byte position
+    for (int i = 0; i < bytesSorted; i++) {
+      Arrays.fill(histogram[i], 0);
+    }
+    OfflinePointWriter offlinePointWriter = (OfflinePointWriter) points;
+
+    //find the common prefix; this also fills in the histogram values for the shared bytes
+    int commonPrefix = findCommonPrefix(offlinePointWriter, from, to, dim);
+
+    //if all sorted bytes are equal we just partition the data
+    if (commonPrefix ==  bytesSorted) {
+      return partition(offlinePointWriter, left, right, from, to, 
partitionPoint, dim, null, commonPrefix - 1, partitionPoint);
+    }
+    //let's rock'n'roll: build the histogram and partition
+    return buildHistogramAndPartition(offlinePointWriter, null, left, right, 
from, to, partitionPoint, 0, commonPrefix, dim,0, 0);
+  }
+
+  void checkArgs(long from, long to, long middle) { // validates that from <= middle < to
+    if (middle < from) { // split position lies before the start of the range
+      throw new IllegalArgumentException("middle must be >= from");
+    }
+    if (middle >= to) { // split position lies at or past the exclusive end of the range
+      throw new IllegalArgumentException("middle must be < to");
+    }
+  }
+
+  private int findCommonPrefix(OfflinePointWriter points, long from, long to, 
int dim) throws IOException{
+    /* Find the common prefix: the number of leading sorted bytes (the bytes of
+     * dimension dim followed by the docID bytes) shared by every point in [from, to).
+     * The first point read is used as the reference; each later point can only
+     * shorten the prefix. Returns 0 as soon as the very first byte differs.
+     * NOTE(review): the result of reader.next() is not checked; presumably safe
+     * because select() calls checkArgs, which enforces from < to - confirm. */
+    byte[] commonPrefix = new byte[bytesSorted];
+    int commonPrefixPosition = bytesSorted;
+    try (OfflinePointReader reader = points.getReader(from, to - from, 
maxPointsSortedOffHeap, offlineBuffer)) {
+      reader.next();
+      reader.docValue(bytesRef1);
+      // copy the bytes of dimension dim to the start of the reference
+      System.arraycopy(bytesRef1.bytes, bytesRef1.offset + dim * bytesPerDim, 
commonPrefix, 0, bytesPerDim);
+      // copy docID: the Integer.BYTES bytes located right after the packed dimensions
+      System.arraycopy(bytesRef1.bytes, bytesRef1.offset + packedBytesLength, 
commonPrefix, bytesPerDim, Integer.BYTES);
+      for (long i =from + 1; i< to; i++) {
+        reader.next();
+        reader.docValue(bytesRef1);
+        int startIndex =  dim * bytesPerDim;
+        int endIndex  = (commonPrefixPosition > bytesPerDim) ? startIndex + 
bytesPerDim :  startIndex + commonPrefixPosition; // only the dimension part of the prefix is compared here
+        int j = FutureArrays.mismatch(commonPrefix, 0, endIndex - startIndex, 
bytesRef1.bytes, bytesRef1.offset + startIndex, bytesRef1.offset + endIndex);
+        if (j == 0) { // the first byte already differs: nothing is shared
+          return 0;
+        } else if (j == -1) { // all compared dimension bytes match
+          if (commonPrefixPosition > bytesPerDim) {
+            //tie-break on docID: the prefix reaches into the docID bytes, so compare those too
+            int k = FutureArrays.mismatch(commonPrefix, bytesPerDim, 
commonPrefixPosition, bytesRef1.bytes, bytesRef1.offset + packedBytesLength, 
bytesRef1.offset + packedBytesLength + commonPrefixPosition - bytesPerDim );
+            if (k != -1) {
+              commonPrefixPosition = bytesPerDim + k; // k is relative to the start of the docID bytes
+            }
+          }
+        } else { // a mismatch inside the dimension bytes shortens the prefix
+          commonPrefixPosition = j;
+        }
+      }
+    }
+
+    //build histogram up to the common prefix: all to - from points share one bucket per shared byte
+    for (int i=0; i < commonPrefixPosition; i++) {
+      partitionBucket[i] = commonPrefix[i] & 0xff;
+      partitionBytes[i] = commonPrefix[i];
+      histogram[i][partitionBucket[i]] = to - from;
+    }
+    return commonPrefixPosition;
+  }
+
+  private byte[] buildHistogramAndPartition(OfflinePointWriter points, 
OfflinePointWriter deltaPoints, PointWriter left, PointWriter right,
+                                            long from, long to, long 
partitionPoint, int iteration,  int commonPrefix, int dim, long leftCount, long 
rightCount) throws IOException {
+    //Choose the right set of points
+    OfflinePointWriter currentPoints;
+    long start;
+    long length;
+    if (deltaPoints == null) {
+      currentPoints =  points;
+      start = from;
+      length = to - from;
+    } else {
+      currentPoints =  deltaPoints;
+      start = 0;
+      length = deltaPoints.count;
+    }
 
 Review comment:
   I have changed the algorithm so we partition data as we recurse. It does 
simplify the code.

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


With regards,
Apache Git Services

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] iverase commented on a change in pull request #556: LUCENE-8673: Use radix partitioning when merging dimensional points

Reply via email to