rdblue commented on a change in pull request #1338:
URL: https://github.com/apache/iceberg/pull/1338#discussion_r472410264



##########
File path: core/src/main/java/org/apache/iceberg/DeleteFileIndex.java
##########
@@ -96,21 +104,157 @@ private StructLikeWrapper newWrapper(int specId) {
     Pair<Integer, StructLikeWrapper> partition = partition(file.specId(), file.partition());
     Pair<long[], DeleteFile[]> partitionDeletes = sortedDeletesByPartition.get(partition);
 
+    Stream<DeleteFile> matchingDeletes;
     if (partitionDeletes == null) {
-      return limitBySequenceNumber(sequenceNumber, globalSeqs, globalDeletes);
+      matchingDeletes = limitBySequenceNumber(sequenceNumber, globalSeqs, globalDeletes);
     } else if (globalDeletes == null) {
-      return limitBySequenceNumber(sequenceNumber, partitionDeletes.first(), partitionDeletes.second());
+      matchingDeletes = limitBySequenceNumber(sequenceNumber, partitionDeletes.first(), partitionDeletes.second());
     } else {
-      return Stream.concat(
-          Stream.of(limitBySequenceNumber(sequenceNumber, globalSeqs, globalDeletes)),
-          Stream.of(limitBySequenceNumber(sequenceNumber, partitionDeletes.first(), partitionDeletes.second()))
-      ).toArray(DeleteFile[]::new);
+      matchingDeletes = Stream.concat(
+          limitBySequenceNumber(sequenceNumber, globalSeqs, globalDeletes),
+          limitBySequenceNumber(sequenceNumber, partitionDeletes.first(), partitionDeletes.second()));
+    }
+
+    return matchingDeletes
+        .filter(deleteFile -> canContainDeletesForFile(file, deleteFile, specsById.get(file.specId()).schema()))
+        .toArray(DeleteFile[]::new);
+  }
+
+  private static boolean canContainDeletesForFile(DataFile dataFile, DeleteFile deleteFile, Schema schema) {
+    switch (deleteFile.content()) {
+      case POSITION_DELETES:
+        return canContainPosDeletesForFile(dataFile, deleteFile);
+
+      case EQUALITY_DELETES:
+        return canContainEqDeletesForFile(dataFile, deleteFile, schema);
+    }
+
+    return true;
+  }
+
+  private static boolean canContainPosDeletesForFile(DataFile dataFile, DeleteFile deleteFile) {
+    // check that the delete file can contain the data file's file_path
+    Map<Integer, ByteBuffer> lowers = deleteFile.lowerBounds();
+    Map<Integer, ByteBuffer> uppers = deleteFile.upperBounds();
+    if (lowers == null || uppers == null) {
+      return true;
+    }
+
+    Type pathType = MetadataColumns.DELETE_FILE_PATH.type();
+    int pathId = MetadataColumns.DELETE_FILE_PATH.fieldId();
+    ByteBuffer lower = lowers.get(pathId);
+    if (lower != null &&
+        Comparators.charSequences().compare(dataFile.path(), Conversions.fromByteBuffer(pathType, lower)) < 0) {
+      return false;
     }
+
+    ByteBuffer upper = uppers.get(pathId);
+    if (upper != null &&
+        Comparators.charSequences().compare(dataFile.path(), Conversions.fromByteBuffer(pathType, upper)) > 0) {
+      return false;
+    }
+
+    return true;
+  }
+
+  @SuppressWarnings("checkstyle:CyclomaticComplexity")
+  private static boolean canContainEqDeletesForFile(DataFile dataFile, DeleteFile deleteFile, Schema schema) {
+    if (dataFile.lowerBounds() == null || dataFile.upperBounds() == null ||
+        deleteFile.lowerBounds() == null || deleteFile.upperBounds() == null) {
+      return true;
+    }
+
+    Map<Integer, ByteBuffer> dataLowers = dataFile.lowerBounds();
+    Map<Integer, ByteBuffer> dataUppers = dataFile.upperBounds();
+    Map<Integer, ByteBuffer> deleteLowers = deleteFile.lowerBounds();
+    Map<Integer, ByteBuffer> deleteUppers = deleteFile.upperBounds();
+
+    Map<Integer, Long> dataNullCounts = dataFile.nullValueCounts();
+    Map<Integer, Long> dataValueCounts = dataFile.valueCounts();
+    Map<Integer, Long> deleteNullCounts = deleteFile.nullValueCounts();
+    Map<Integer, Long> deleteValueCounts = deleteFile.valueCounts();
+
+    for (int id : deleteFile.equalityFieldIds()) {
+      Types.NestedField field = schema.findField(id);
+      if (!field.type().isPrimitiveType()) {
+        return true;
+      }
+
+      if (allNull(dataNullCounts, dataValueCounts, field) && allNonNull(deleteNullCounts, field)) {

Review comment:
       Added a comment for each case.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]



---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to