This is an automated email from the ASF dual-hosted git repository.

russellspitzer pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/iceberg.git


The following commit(s) were added to refs/heads/master by this push:
     new d6e770e349 Core: DeleteWithFilter fails on HashCode Collision (#6680)
d6e770e349 is described below

commit d6e770e3491b75fa20c02336db89d269abc05070
Author: Russell Spitzer <[email protected]>
AuthorDate: Mon Jan 30 10:13:49 2023 -0600

    Core: DeleteWithFilter fails on HashCode Collision (#6680)
    
    Failure to copy partition data structs when evaluating residuals caused 
hashcode collisions to incorrect delete or not delete files. To fix this 
PartitionData is copied before being place inside the Residual Evaluator map.
---
 .../org/apache/iceberg/ManifestFilterManager.java  | 23 ++++----
 .../java/org/apache/iceberg/TestDeleteFiles.java   | 63 ++++++++++++++++++++++
 2 files changed, 76 insertions(+), 10 deletions(-)

diff --git a/core/src/main/java/org/apache/iceberg/ManifestFilterManager.java 
b/core/src/main/java/org/apache/iceberg/ManifestFilterManager.java
index 6275de1436..36fa222ae0 100644
--- a/core/src/main/java/org/apache/iceberg/ManifestFilterManager.java
+++ b/core/src/main/java/org/apache/iceberg/ManifestFilterManager.java
@@ -510,16 +510,19 @@ abstract class ManifestFilterManager<F extends 
ContentFile<F>> {
       // in other words, ResidualEvaluator returns a part of the expression 
that needs to be
       // evaluated
       // for rows in the given partition using metrics
-      return metricsEvaluators.computeIfAbsent(
-          file.partition(),
-          partition -> {
-            Expression residual = residualEvaluator.residualFor(partition);
-            InclusiveMetricsEvaluator inclusive =
-                new InclusiveMetricsEvaluator(tableSchema, residual, 
caseSensitive);
-            StrictMetricsEvaluator strict =
-                new StrictMetricsEvaluator(tableSchema, residual, 
caseSensitive);
-            return Pair.of(inclusive, strict);
-          });
+      PartitionData partition = (PartitionData) file.partition();
+      if (!metricsEvaluators.containsKey(partition)) {
+        Expression residual = residualEvaluator.residualFor(partition);
+        InclusiveMetricsEvaluator inclusive =
+            new InclusiveMetricsEvaluator(tableSchema, residual, 
caseSensitive);
+        StrictMetricsEvaluator strict =
+            new StrictMetricsEvaluator(tableSchema, residual, caseSensitive);
+
+        metricsEvaluators.put(
+            partition.copy(), // The partition may be a re-used container so a 
copy is required
+            Pair.of(inclusive, strict));
+      }
+      return metricsEvaluators.get(partition);
     }
   }
 }
diff --git a/core/src/test/java/org/apache/iceberg/TestDeleteFiles.java 
b/core/src/test/java/org/apache/iceberg/TestDeleteFiles.java
index 1ee3b663bc..c67a65bce8 100644
--- a/core/src/test/java/org/apache/iceberg/TestDeleteFiles.java
+++ b/core/src/test/java/org/apache/iceberg/TestDeleteFiles.java
@@ -20,12 +20,18 @@ package org.apache.iceberg;
 
 import java.nio.ByteBuffer;
 import java.nio.ByteOrder;
+import java.util.List;
+import java.util.stream.Collectors;
 import org.apache.iceberg.ManifestEntry.Status;
 import org.apache.iceberg.exceptions.ValidationException;
 import org.apache.iceberg.expressions.Expression;
 import org.apache.iceberg.expressions.Expressions;
+import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
 import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
 import org.apache.iceberg.relocated.com.google.common.collect.Iterables;
+import org.apache.iceberg.relocated.com.google.common.collect.Lists;
+import org.apache.iceberg.types.Types;
+import org.apache.iceberg.util.StructLikeWrapper;
 import org.junit.Assert;
 import org.junit.Assume;
 import org.junit.Test;
@@ -349,6 +355,63 @@ public class TestDeleteFiles extends TableTestBase {
         statuses(Status.EXISTING, Status.DELETED, Status.DELETED));
   }
 
+  @Test
+  public void testDeleteWithCollision() {
+    Schema schema = new Schema(Types.NestedField.of(0, false, "x", 
Types.StringType.get()));
+    PartitionSpec spec = 
PartitionSpec.builderFor(schema).identity("x").build();
+    Table collisionTable =
+        TestTables.create(tableDir, "hashcollision", schema, spec, 
formatVersion);
+
+    PartitionData partitionOne = new PartitionData(spec.partitionType());
+    partitionOne.set(0, "Aa");
+    PartitionData partitionTwo = new PartitionData(spec.partitionType());
+    partitionTwo.set(0, "BB");
+
+    Assert.assertEquals(
+        
StructLikeWrapper.forType(spec.partitionType()).set(partitionOne).hashCode(),
+        
StructLikeWrapper.forType(spec.partitionType()).set(partitionTwo).hashCode());
+
+    DataFile testFileOne =
+        DataFiles.builder(spec)
+            .withPartition(partitionOne)
+            .withPath("/g1.parquet")
+            .withFileSizeInBytes(100)
+            .withRecordCount(1)
+            .build();
+
+    DataFile testFileTwo =
+        DataFiles.builder(spec)
+            .withPartition(partitionTwo)
+            .withRecordCount(1)
+            .withFileSizeInBytes(100)
+            .withPath("/g2.parquet")
+            .build();
+
+    
collisionTable.newFastAppend().appendFile(testFileOne).appendFile(testFileTwo).commit();
+
+    List<StructLike> beforeDeletePartitions =
+        
Lists.newArrayList(collisionTable.newScan().planFiles().iterator()).stream()
+            .map(s -> ((PartitionData) s.partition()).copy())
+            .collect(Collectors.toList());
+
+    Assert.assertEquals(
+        "We should have both partitions",
+        ImmutableList.of(partitionOne, partitionTwo),
+        beforeDeletePartitions);
+
+    collisionTable.newDelete().deleteFromRowFilter(Expressions.equal("x", 
"BB")).commit();
+
+    List<StructLike> afterDeletePartitions =
+        
Lists.newArrayList(collisionTable.newScan().planFiles().iterator()).stream()
+            .map(s -> ((PartitionData) s.partition()).copy())
+            .collect(Collectors.toList());
+
+    Assert.assertEquals(
+        "We should have deleted partitionTwo",
+        ImmutableList.of(partitionOne),
+        afterDeletePartitions);
+  }
+
   private static ByteBuffer longToBuffer(long value) {
     return ByteBuffer.allocate(8).order(ByteOrder.LITTLE_ENDIAN).putLong(0, 
value);
   }

Reply via email to