rdblue commented on a change in pull request #1338:
URL: https://github.com/apache/iceberg/pull/1338#discussion_r472360701



##########
File path: core/src/main/java/org/apache/iceberg/DeleteFileIndex.java
##########
@@ -96,21 +104,157 @@ private StructLikeWrapper newWrapper(int specId) {
     Pair<Integer, StructLikeWrapper> partition = partition(file.specId(), 
file.partition());
     Pair<long[], DeleteFile[]> partitionDeletes = 
sortedDeletesByPartition.get(partition);
 
+    Stream<DeleteFile> matchingDeletes;
     if (partitionDeletes == null) {
-      return limitBySequenceNumber(sequenceNumber, globalSeqs, globalDeletes);
+      matchingDeletes = limitBySequenceNumber(sequenceNumber, globalSeqs, 
globalDeletes);
     } else if (globalDeletes == null) {
-      return limitBySequenceNumber(sequenceNumber, partitionDeletes.first(), 
partitionDeletes.second());
+      matchingDeletes = limitBySequenceNumber(sequenceNumber, 
partitionDeletes.first(), partitionDeletes.second());
     } else {
-      return Stream.concat(
-          Stream.of(limitBySequenceNumber(sequenceNumber, globalSeqs, 
globalDeletes)),
-          Stream.of(limitBySequenceNumber(sequenceNumber, 
partitionDeletes.first(), partitionDeletes.second()))
-      ).toArray(DeleteFile[]::new);
+      matchingDeletes = Stream.concat(
+          limitBySequenceNumber(sequenceNumber, globalSeqs, globalDeletes),
+          limitBySequenceNumber(sequenceNumber, partitionDeletes.first(), 
partitionDeletes.second()));
+    }
+
+    return matchingDeletes
+        .filter(deleteFile -> canContainDeletesForFile(file, deleteFile, 
specsById.get(file.specId()).schema()))
+        .toArray(DeleteFile[]::new);
+  }
+
+  private static boolean canContainDeletesForFile(DataFile dataFile, 
DeleteFile deleteFile, Schema schema) {
+    switch (deleteFile.content()) {
+      case POSITION_DELETES:
+        return canContainPosDeletesForFile(dataFile, deleteFile);
+
+      case EQUALITY_DELETES:
+        return canContainEqDeletesForFile(dataFile, deleteFile, schema);
+    }
+
+    return true;
+  }
+
+  private static boolean canContainPosDeletesForFile(DataFile dataFile, 
DeleteFile deleteFile) {
+    // check that the delete file can contain the data file's file_path
+    Map<Integer, ByteBuffer> lowers = deleteFile.lowerBounds();
+    Map<Integer, ByteBuffer> uppers = deleteFile.upperBounds();
+    if (lowers == null || uppers == null) {
+      return true;
+    }
+
+    Type pathType = MetadataColumns.DELETE_FILE_PATH.type();
+    int pathId = MetadataColumns.DELETE_FILE_PATH.fieldId();
+    ByteBuffer lower = lowers.get(pathId);
+    if (lower != null &&
+        Comparators.charSequences().compare(dataFile.path(), 
Conversions.fromByteBuffer(pathType, lower)) < 0) {
+      return false;
     }
+
+    ByteBuffer upper = uppers.get(pathId);
+    if (upper != null &&
+        Comparators.charSequences().compare(dataFile.path(), 
Conversions.fromByteBuffer(pathType, upper)) > 0) {
+      return false;
+    }
+
+    return true;
+  }
+
+  @SuppressWarnings("checkstyle:CyclomaticComplexity")
+  private static boolean canContainEqDeletesForFile(DataFile dataFile, 
DeleteFile deleteFile, Schema schema) {
+    if (dataFile.lowerBounds() == null || dataFile.upperBounds() == null ||
+        deleteFile.lowerBounds() == null || deleteFile.upperBounds() == null) {
+      return true;
+    }
+
+    Map<Integer, ByteBuffer> dataLowers = dataFile.lowerBounds();
+    Map<Integer, ByteBuffer> dataUppers = dataFile.upperBounds();
+    Map<Integer, ByteBuffer> deleteLowers = deleteFile.lowerBounds();
+    Map<Integer, ByteBuffer> deleteUppers = deleteFile.upperBounds();
+
+    Map<Integer, Long> dataNullCounts = dataFile.nullValueCounts();
+    Map<Integer, Long> dataValueCounts = dataFile.valueCounts();
+    Map<Integer, Long> deleteNullCounts = deleteFile.nullValueCounts();
+    Map<Integer, Long> deleteValueCounts = deleteFile.valueCounts();
+
+    for (int id : deleteFile.equalityFieldIds()) {
+      Types.NestedField field = schema.findField(id);
+      if (!field.type().isPrimitiveType()) {
+        return true;
+      }
+
+      if (allNull(dataNullCounts, dataValueCounts, field) && 
allNonNull(deleteNullCounts, field)) {
+        return false;
+      }
+
+      if (allNull(deleteNullCounts, deleteValueCounts, field) && 
allNonNull(dataNullCounts, field)) {
+        return false;
+      }
+
+      ByteBuffer dataLower = dataLowers.get(id);
+      ByteBuffer dataUpper = dataUppers.get(id);
+      ByteBuffer deleteLower = deleteLowers.get(id);
+      ByteBuffer deleteUpper = deleteUppers.get(id);
+      if (dataLower == null || dataUpper == null || deleteLower == null || 
deleteUpper == null) {
+        return true;
+      }
+
+      if (!rangesOverlap(field.type().asPrimitiveType(), dataLower, dataUpper, 
deleteLower, deleteUpper)) {
+        return false;
+      }
+    }
+
+    return true;
+  }
+
+  private static <T> boolean rangesOverlap(Type.PrimitiveType type,
+                                           ByteBuffer dataLowerBuf, ByteBuffer 
dataUpperBuf,
+                                           ByteBuffer deleteLowerBuf, 
ByteBuffer deleteUpperBuf) {
+    Comparator<T> comparator = Comparators.forType(type);
+    T dataLower = Conversions.fromByteBuffer(type, dataLowerBuf);
+    T dataUpper = Conversions.fromByteBuffer(type, dataUpperBuf);
+    T deleteLower = Conversions.fromByteBuffer(type, deleteLowerBuf);
+    T deleteUpper = Conversions.fromByteBuffer(type, deleteUpperBuf);
+
+    return comparator.compare(deleteLower, dataUpper) <= 0 && 
comparator.compare(dataLower, deleteUpper) <= 0;
+  }
+
+  private static boolean allNonNull(Map<Integer, Long> nullValueCounts, 
Types.NestedField field) {
+    if (field.isRequired()) {
+      return true;
+    }
+
+    if (nullValueCounts == null) {
+      return false;
+    }
+
+    Long nullValueCount = nullValueCounts.get(field.fieldId());
+    if (nullValueCount == null) {
+      return false;
+    }
+
+    return nullValueCount <= 0;
+  }
+
+  private static boolean allNull(Map<Integer, Long> nullValueCounts, 
Map<Integer, Long> valueCounts,
+                                 Types.NestedField field) {
+    if (field.isRequired()) {
+      return false;
+    }
+
+    if (nullValueCounts == null || valueCounts == null) {
+      return false;
+    }
+
+    Long nullValueCount = nullValueCounts.get(field.fieldId());
+    Long valueCount = valueCounts.get(field.fieldId());
+    if (nullValueCount == null || valueCount == null) {
+      return true;

Review comment:
       Stats can be configured for each column, so that you can avoid keeping 
stats in table metadata when they will not be useful. If the table has 
column-level stats, but a column is configured to `none`, then the null value 
count and value count for that column would be null. That indicates that the 
values are unknown, not that the column contains only nulls.
   
   I think you're right that `true` is not the correct return value here. 
`allNull` should return `false` in this case, because we cannot know that all 
values are null if there are no counts.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]



---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to