[GitHub] [iceberg] rdblue commented on a change in pull request #1338: Add file stats range optimizations for DeleteFileIndex

GitBox Mon, 17 Aug 2020 15:20:41 -0700


rdblue commented on a change in pull request #1338:
URL: https://github.com/apache/iceberg/pull/1338#discussion_r471806073




##########
File path: core/src/main/java/org/apache/iceberg/DeleteFileIndex.java
##########
@@ -96,21 +100,101 @@ private StructLikeWrapper newWrapper(int specId) {
     Pair<Integer, StructLikeWrapper> partition = partition(file.specId(), 
file.partition());
     Pair<long[], DeleteFile[]> partitionDeletes = 
sortedDeletesByPartition.get(partition);
 
+    Stream<DeleteFile> matchingDeletes;
     if (partitionDeletes == null) {
-      return limitBySequenceNumber(sequenceNumber, globalSeqs, globalDeletes);
+      matchingDeletes = limitBySequenceNumber(sequenceNumber, globalSeqs, 
globalDeletes);
     } else if (globalDeletes == null) {
-      return limitBySequenceNumber(sequenceNumber, partitionDeletes.first(), 
partitionDeletes.second());
+      matchingDeletes = limitBySequenceNumber(sequenceNumber, 
partitionDeletes.first(), partitionDeletes.second());
     } else {
-      return Stream.concat(
-          Stream.of(limitBySequenceNumber(sequenceNumber, globalSeqs, 
globalDeletes)),
-          Stream.of(limitBySequenceNumber(sequenceNumber, 
partitionDeletes.first(), partitionDeletes.second()))
-      ).toArray(DeleteFile[]::new);
+      matchingDeletes = Stream.concat(
+          limitBySequenceNumber(sequenceNumber, globalSeqs, globalDeletes),
+          limitBySequenceNumber(sequenceNumber, partitionDeletes.first(), 
partitionDeletes.second()));
+    }
+
+    return matchingDeletes
+        .filter(deleteFile -> canContainDeletesForFile(file, deleteFile, 
schema))
+        .toArray(DeleteFile[]::new);
+  }
+
+  private static boolean canContainDeletesForFile(DataFile dataFile, 
DeleteFile deleteFile, Schema schema) {
+    switch (deleteFile.content()) {
+      case POSITION_DELETES:
+        // check that the delete file can contain the data file's file_path
+        Map<Integer, ByteBuffer> lowers = deleteFile.lowerBounds();
+        Map<Integer, ByteBuffer> uppers = deleteFile.upperBounds();
+        if (lowers == null || uppers == null) {
+          return true;
+        }
+
+        Type pathType = MetadataColumns.DELETE_FILE_PATH.type();
+        int pathId = MetadataColumns.DELETE_FILE_PATH.fieldId();
+        ByteBuffer lower = lowers.get(pathId);
+        if (lower != null &&
+            Comparators.charSequences().compare(dataFile.path(), 
Conversions.fromByteBuffer(pathType, lower)) < 0) {
+          return false;
+        }
+
+        ByteBuffer upper = uppers.get(pathId);
+        if (upper != null &&
+            Comparators.charSequences().compare(dataFile.path(), 
Conversions.fromByteBuffer(pathType, upper)) > 0) {
+          return false;
+        }
+
+        break;
+
+      case EQUALITY_DELETES:
+        if (dataFile.lowerBounds() == null || dataFile.upperBounds() == null ||
+            deleteFile.lowerBounds() == null || deleteFile.upperBounds() == 
null) {
+          return true;
+        }
+
+        Map<Integer, ByteBuffer> dataLowers = dataFile.lowerBounds();
+        Map<Integer, ByteBuffer> dataUppers = dataFile.upperBounds();
+        Map<Integer, ByteBuffer> deleteLowers = deleteFile.lowerBounds();
+        Map<Integer, ByteBuffer> deleteUppers = deleteFile.upperBounds();
+
+        for (int id : deleteFile.equalityFieldIds()) {
+          Type type = schema.findType(id);
+          if (!type.isPrimitiveType()) {
+            return true;
+          }
+
+          if (!rangesOverlap(type.asPrimitiveType(),
+              dataLowers.get(id), dataUppers.get(id), deleteLowers.get(id), 
deleteUppers.get(id))) {

Review comment:
       Fixed. I also added cases for null values, along with tests.

##########
File path: core/src/main/java/org/apache/iceberg/DeleteFileIndex.java
##########
@@ -96,21 +100,101 @@ private StructLikeWrapper newWrapper(int specId) {
     Pair<Integer, StructLikeWrapper> partition = partition(file.specId(), 
file.partition());
     Pair<long[], DeleteFile[]> partitionDeletes = 
sortedDeletesByPartition.get(partition);
 
+    Stream<DeleteFile> matchingDeletes;
     if (partitionDeletes == null) {
-      return limitBySequenceNumber(sequenceNumber, globalSeqs, globalDeletes);
+      matchingDeletes = limitBySequenceNumber(sequenceNumber, globalSeqs, 
globalDeletes);
     } else if (globalDeletes == null) {
-      return limitBySequenceNumber(sequenceNumber, partitionDeletes.first(), 
partitionDeletes.second());
+      matchingDeletes = limitBySequenceNumber(sequenceNumber, 
partitionDeletes.first(), partitionDeletes.second());
     } else {
-      return Stream.concat(
-          Stream.of(limitBySequenceNumber(sequenceNumber, globalSeqs, 
globalDeletes)),
-          Stream.of(limitBySequenceNumber(sequenceNumber, 
partitionDeletes.first(), partitionDeletes.second()))
-      ).toArray(DeleteFile[]::new);
+      matchingDeletes = Stream.concat(
+          limitBySequenceNumber(sequenceNumber, globalSeqs, globalDeletes),
+          limitBySequenceNumber(sequenceNumber, partitionDeletes.first(), 
partitionDeletes.second()));
+    }
+
+    return matchingDeletes
+        .filter(deleteFile -> canContainDeletesForFile(file, deleteFile, 
schema))
+        .toArray(DeleteFile[]::new);
+  }
+
+  private static boolean canContainDeletesForFile(DataFile dataFile, 
DeleteFile deleteFile, Schema schema) {
+    switch (deleteFile.content()) {
+      case POSITION_DELETES:
+        // check that the delete file can contain the data file's file_path
+        Map<Integer, ByteBuffer> lowers = deleteFile.lowerBounds();
+        Map<Integer, ByteBuffer> uppers = deleteFile.upperBounds();
+        if (lowers == null || uppers == null) {
+          return true;
+        }
+
+        Type pathType = MetadataColumns.DELETE_FILE_PATH.type();
+        int pathId = MetadataColumns.DELETE_FILE_PATH.fieldId();
+        ByteBuffer lower = lowers.get(pathId);
+        if (lower != null &&
+            Comparators.charSequences().compare(dataFile.path(), 
Conversions.fromByteBuffer(pathType, lower)) < 0) {
+          return false;
+        }
+
+        ByteBuffer upper = uppers.get(pathId);
+        if (upper != null &&
+            Comparators.charSequences().compare(dataFile.path(), 
Conversions.fromByteBuffer(pathType, upper)) > 0) {
+          return false;
+        }
+
+        break;
+
+      case EQUALITY_DELETES:
+        if (dataFile.lowerBounds() == null || dataFile.upperBounds() == null ||
+            deleteFile.lowerBounds() == null || deleteFile.upperBounds() == 
null) {
+          return true;
+        }
+
+        Map<Integer, ByteBuffer> dataLowers = dataFile.lowerBounds();
+        Map<Integer, ByteBuffer> dataUppers = dataFile.upperBounds();
+        Map<Integer, ByteBuffer> deleteLowers = deleteFile.lowerBounds();
+        Map<Integer, ByteBuffer> deleteUppers = deleteFile.upperBounds();
+
+        for (int id : deleteFile.equalityFieldIds()) {
+          Type type = schema.findType(id);
+          if (!type.isPrimitiveType()) {
+            return true;
+          }
+
+          if (!rangesOverlap(type.asPrimitiveType(),
+              dataLowers.get(id), dataUppers.get(id), deleteLowers.get(id), 
deleteUppers.get(id))) {
+            return false;
+          }
+        }
+        break;
+    }
+
+    return true;
+  }
+
+  private static <T> boolean rangesOverlap(Type.PrimitiveType type,
+                                           ByteBuffer dataLower, ByteBuffer 
dataUpper,
+                                           ByteBuffer deleteLower, ByteBuffer 
deleteUpper) {
+    Comparator<T> comparator = Comparators.forType(type);
+    T low = Conversions.fromByteBuffer(type, dataLower);
+    T high = Conversions.fromByteBuffer(type, dataUpper);
+
+    if (contains(comparator, low, high, Conversions.fromByteBuffer(type, 
deleteLower))) {
+      return true;
+    }
+
+    if (contains(comparator, low, high, Conversions.fromByteBuffer(type, 
deleteUpper))) {
+      return true;

Review comment:
       Fixed.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]



---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] [iceberg] rdblue commented on a change in pull request #1338: Add file stats range optimizations for DeleteFileIndex

Reply via email to