This is an automated email from the ASF dual-hosted git repository.

etudenhoefner pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg.git


The following commit(s) were added to refs/heads/main by this push:
     new 9057bf32e9 Spark 3.4, 3.5, 4.1: Fix "AlreadyExistsException: Location already exists" at rewrite_table_path (#15616)
9057bf32e9 is described below

commit 9057bf32e94278262cc0084b0d043e757c007f12
Author: Bhargav Kumar Konidena <[email protected]>
AuthorDate: Fri Mar 20 15:04:02 2026 +0530

    Spark 3.4, 3.5, 4.1: Fix "AlreadyExistsException: Location already exists" at rewrite_table_path (#15616)
---
 .../spark/actions/RewriteTablePathSparkAction.java |  3 +-
 .../spark/actions/TestRewriteTablePathsAction.java | 73 ++++++++++++++++++++++
 .../spark/actions/RewriteTablePathSparkAction.java |  3 +-
 .../spark/actions/TestRewriteTablePathsAction.java | 73 ++++++++++++++++++++++
 .../spark/actions/RewriteTablePathSparkAction.java |  3 +-
 .../spark/actions/TestRewriteTablePathsAction.java | 73 ++++++++++++++++++++++
 6 files changed, 225 insertions(+), 3 deletions(-)

diff --git 
a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteTablePathSparkAction.java
 
b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteTablePathSparkAction.java
index 64b82f0986..aedb25e4a4 100644
--- 
a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteTablePathSparkAction.java
+++ 
b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteTablePathSparkAction.java
@@ -72,6 +72,7 @@ import 
org.apache.iceberg.relocated.com.google.common.collect.Lists;
 import org.apache.iceberg.relocated.com.google.common.collect.Sets;
 import org.apache.iceberg.spark.JobGroupInfo;
 import org.apache.iceberg.spark.source.SerializableTableWithSize;
+import org.apache.iceberg.util.DeleteFileSet;
 import org.apache.iceberg.util.Pair;
 import org.apache.iceberg.util.Tasks;
 import org.apache.spark.api.java.function.ForeachFunction;
@@ -317,7 +318,7 @@ public class RewriteTablePathSparkAction extends 
BaseSparkAction<RewriteTablePat
         rewriteManifestResult.toRewrite().stream()
             .filter(e -> e instanceof DeleteFile)
             .map(e -> (DeleteFile) e)
-            .collect(Collectors.toSet());
+            .collect(Collectors.toCollection(DeleteFileSet::create));
     rewritePositionDeletes(deleteFiles);
 
     ImmutableRewriteTablePath.Result.Builder builder =
diff --git 
a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteTablePathsAction.java
 
b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteTablePathsAction.java
index 5c36578d06..6044368a46 100644
--- 
a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteTablePathsAction.java
+++ 
b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteTablePathsAction.java
@@ -567,6 +567,79 @@ public class TestRewriteTablePathsAction extends TestBase {
         .isEmpty();
   }
 
+  /**
+   * Test for https://github.com/apache/iceberg/issues/14814
+   *
+   * <p>This test verifies that rewrite_table_path correctly deduplicates 
delete files when the same
+   * delete file appears in multiple manifests. Without the DeleteFileSet fix, 
this test would fail
+   * with AlreadyExistsException because DeleteFile objects don't override 
equals() and the same
+   * file would be processed multiple times.
+   *
+   * <p>The test creates a scenario where the same delete file is added to 
multiple snapshots,
+   * causing it to appear in multiple manifest entries. When these manifests 
are processed, the same
+   * delete file is returned as different object instances which need proper 
deduplication.
+   */
+  @TestTemplate
+  public void testPositionDeletesDeduplication() throws Exception {
+    // Format versions 3 and 4 use Deletion Vectors stored in Puffin files, 
which have different
+    // validation rules that prevent adding the same delete file multiple times
+    assumeThat(formatVersion)
+        .as("Format versions 3+ use DVs with different validation rules")
+        .isEqualTo(2);
+
+    Table tableWithPosDeletes =
+        createTableWithSnapshots(
+            
tableDir.toFile().toURI().toString().concat("tableWithDuplicateDeletes"),
+            2,
+            Map.of(TableProperties.DELETE_DEFAULT_FILE_FORMAT, "parquet"));
+
+    // Get a data file to create position deletes for
+    DataFile dataFile =
+        tableWithPosDeletes
+            .currentSnapshot()
+            .addedDataFiles(tableWithPosDeletes.io())
+            .iterator()
+            .next();
+
+    // Create a position delete file
+    List<Pair<CharSequence, Long>> deletes = 
Lists.newArrayList(Pair.of(dataFile.location(), 0L));
+    File deleteFile =
+        new File(
+            removePrefix(tableWithPosDeletes.location() + 
"/data/deeply/nested/deletes.parquet"));
+    DeleteFile positionDeletes =
+        FileHelpers.writeDeleteFile(
+                tableWithPosDeletes,
+                
tableWithPosDeletes.io().newOutputFile(deleteFile.toURI().toString()),
+                deletes,
+                formatVersion)
+            .first();
+
+    tableWithPosDeletes.newRowDelta().addDeletes(positionDeletes).commit();
+
+    // Add the SAME delete file AGAIN in a second snapshot - this creates a 
duplicate entry
+    // in a new manifest, which will cause duplicate DeleteFile objects when 
processing
+    tableWithPosDeletes.newRowDelta().addDeletes(positionDeletes).commit();
+
+    // This should NOT throw AlreadyExistsException - the fix uses 
DeleteFileSet to deduplicate
+    // Without the fix (using Collectors.toSet()), this would fail because:
+    // 1. Both manifests contain entries for the same delete file
+    // 2. Processing returns two different DeleteFile objects for the same file
+    // 3. HashSet doesn't deduplicate them (DeleteFile doesn't override 
equals())
+    // 4. rewritePositionDeletes tries to write the same file twice -> 
AlreadyExistsException
+    RewriteTablePath.Result result =
+        actions()
+            .rewriteTablePath(tableWithPosDeletes)
+            .stagingLocation(stagingLocation())
+            .rewriteLocationPrefix(tableWithPosDeletes.location(), 
targetTableLocation())
+            .execute();
+
+    // Verify the rewrite completed successfully - should have rewritten 
exactly 1 delete file
+    // (the duplicate should be deduplicated by DeleteFileSet)
+    assertThat(result.rewrittenDeleteFilePathsCount())
+        .as("Should have rewritten exactly 1 delete file after deduplication")
+        .isEqualTo(1);
+  }
+
   @TestTemplate
   public void testEqualityDeletes() throws Exception {
     Table sourceTable = createTableWithSnapshots(newTableLocation(), 1);
diff --git 
a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteTablePathSparkAction.java
 
b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteTablePathSparkAction.java
index 64b82f0986..aedb25e4a4 100644
--- 
a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteTablePathSparkAction.java
+++ 
b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteTablePathSparkAction.java
@@ -72,6 +72,7 @@ import 
org.apache.iceberg.relocated.com.google.common.collect.Lists;
 import org.apache.iceberg.relocated.com.google.common.collect.Sets;
 import org.apache.iceberg.spark.JobGroupInfo;
 import org.apache.iceberg.spark.source.SerializableTableWithSize;
+import org.apache.iceberg.util.DeleteFileSet;
 import org.apache.iceberg.util.Pair;
 import org.apache.iceberg.util.Tasks;
 import org.apache.spark.api.java.function.ForeachFunction;
@@ -317,7 +318,7 @@ public class RewriteTablePathSparkAction extends 
BaseSparkAction<RewriteTablePat
         rewriteManifestResult.toRewrite().stream()
             .filter(e -> e instanceof DeleteFile)
             .map(e -> (DeleteFile) e)
-            .collect(Collectors.toSet());
+            .collect(Collectors.toCollection(DeleteFileSet::create));
     rewritePositionDeletes(deleteFiles);
 
     ImmutableRewriteTablePath.Result.Builder builder =
diff --git 
a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteTablePathsAction.java
 
b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteTablePathsAction.java
index 5c36578d06..6044368a46 100644
--- 
a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteTablePathsAction.java
+++ 
b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteTablePathsAction.java
@@ -567,6 +567,79 @@ public class TestRewriteTablePathsAction extends TestBase {
         .isEmpty();
   }
 
+  /**
+   * Test for https://github.com/apache/iceberg/issues/14814
+   *
+   * <p>This test verifies that rewrite_table_path correctly deduplicates 
delete files when the same
+   * delete file appears in multiple manifests. Without the DeleteFileSet fix, 
this test would fail
+   * with AlreadyExistsException because DeleteFile objects don't override 
equals() and the same
+   * file would be processed multiple times.
+   *
+   * <p>The test creates a scenario where the same delete file is added to 
multiple snapshots,
+   * causing it to appear in multiple manifest entries. When these manifests 
are processed, the same
+   * delete file is returned as different object instances which need proper 
deduplication.
+   */
+  @TestTemplate
+  public void testPositionDeletesDeduplication() throws Exception {
+    // Format versions 3 and 4 use Deletion Vectors stored in Puffin files, 
which have different
+    // validation rules that prevent adding the same delete file multiple times
+    assumeThat(formatVersion)
+        .as("Format versions 3+ use DVs with different validation rules")
+        .isEqualTo(2);
+
+    Table tableWithPosDeletes =
+        createTableWithSnapshots(
+            
tableDir.toFile().toURI().toString().concat("tableWithDuplicateDeletes"),
+            2,
+            Map.of(TableProperties.DELETE_DEFAULT_FILE_FORMAT, "parquet"));
+
+    // Get a data file to create position deletes for
+    DataFile dataFile =
+        tableWithPosDeletes
+            .currentSnapshot()
+            .addedDataFiles(tableWithPosDeletes.io())
+            .iterator()
+            .next();
+
+    // Create a position delete file
+    List<Pair<CharSequence, Long>> deletes = 
Lists.newArrayList(Pair.of(dataFile.location(), 0L));
+    File deleteFile =
+        new File(
+            removePrefix(tableWithPosDeletes.location() + 
"/data/deeply/nested/deletes.parquet"));
+    DeleteFile positionDeletes =
+        FileHelpers.writeDeleteFile(
+                tableWithPosDeletes,
+                
tableWithPosDeletes.io().newOutputFile(deleteFile.toURI().toString()),
+                deletes,
+                formatVersion)
+            .first();
+
+    tableWithPosDeletes.newRowDelta().addDeletes(positionDeletes).commit();
+
+    // Add the SAME delete file AGAIN in a second snapshot - this creates a 
duplicate entry
+    // in a new manifest, which will cause duplicate DeleteFile objects when 
processing
+    tableWithPosDeletes.newRowDelta().addDeletes(positionDeletes).commit();
+
+    // This should NOT throw AlreadyExistsException - the fix uses 
DeleteFileSet to deduplicate
+    // Without the fix (using Collectors.toSet()), this would fail because:
+    // 1. Both manifests contain entries for the same delete file
+    // 2. Processing returns two different DeleteFile objects for the same file
+    // 3. HashSet doesn't deduplicate them (DeleteFile doesn't override 
equals())
+    // 4. rewritePositionDeletes tries to write the same file twice -> 
AlreadyExistsException
+    RewriteTablePath.Result result =
+        actions()
+            .rewriteTablePath(tableWithPosDeletes)
+            .stagingLocation(stagingLocation())
+            .rewriteLocationPrefix(tableWithPosDeletes.location(), 
targetTableLocation())
+            .execute();
+
+    // Verify the rewrite completed successfully - should have rewritten 
exactly 1 delete file
+    // (the duplicate should be deduplicated by DeleteFileSet)
+    assertThat(result.rewrittenDeleteFilePathsCount())
+        .as("Should have rewritten exactly 1 delete file after deduplication")
+        .isEqualTo(1);
+  }
+
   @TestTemplate
   public void testEqualityDeletes() throws Exception {
     Table sourceTable = createTableWithSnapshots(newTableLocation(), 1);
diff --git 
a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteTablePathSparkAction.java
 
b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteTablePathSparkAction.java
index 64b82f0986..aedb25e4a4 100644
--- 
a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteTablePathSparkAction.java
+++ 
b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteTablePathSparkAction.java
@@ -72,6 +72,7 @@ import 
org.apache.iceberg.relocated.com.google.common.collect.Lists;
 import org.apache.iceberg.relocated.com.google.common.collect.Sets;
 import org.apache.iceberg.spark.JobGroupInfo;
 import org.apache.iceberg.spark.source.SerializableTableWithSize;
+import org.apache.iceberg.util.DeleteFileSet;
 import org.apache.iceberg.util.Pair;
 import org.apache.iceberg.util.Tasks;
 import org.apache.spark.api.java.function.ForeachFunction;
@@ -317,7 +318,7 @@ public class RewriteTablePathSparkAction extends 
BaseSparkAction<RewriteTablePat
         rewriteManifestResult.toRewrite().stream()
             .filter(e -> e instanceof DeleteFile)
             .map(e -> (DeleteFile) e)
-            .collect(Collectors.toSet());
+            .collect(Collectors.toCollection(DeleteFileSet::create));
     rewritePositionDeletes(deleteFiles);
 
     ImmutableRewriteTablePath.Result.Builder builder =
diff --git 
a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteTablePathsAction.java
 
b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteTablePathsAction.java
index 5c36578d06..6044368a46 100644
--- 
a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteTablePathsAction.java
+++ 
b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteTablePathsAction.java
@@ -567,6 +567,79 @@ public class TestRewriteTablePathsAction extends TestBase {
         .isEmpty();
   }
 
+  /**
+   * Test for https://github.com/apache/iceberg/issues/14814
+   *
+   * <p>This test verifies that rewrite_table_path correctly deduplicates 
delete files when the same
+   * delete file appears in multiple manifests. Without the DeleteFileSet fix, 
this test would fail
+   * with AlreadyExistsException because DeleteFile objects don't override 
equals() and the same
+   * file would be processed multiple times.
+   *
+   * <p>The test creates a scenario where the same delete file is added to 
multiple snapshots,
+   * causing it to appear in multiple manifest entries. When these manifests 
are processed, the same
+   * delete file is returned as different object instances which need proper 
deduplication.
+   */
+  @TestTemplate
+  public void testPositionDeletesDeduplication() throws Exception {
+    // Format versions 3 and 4 use Deletion Vectors stored in Puffin files, 
which have different
+    // validation rules that prevent adding the same delete file multiple times
+    assumeThat(formatVersion)
+        .as("Format versions 3+ use DVs with different validation rules")
+        .isEqualTo(2);
+
+    Table tableWithPosDeletes =
+        createTableWithSnapshots(
+            
tableDir.toFile().toURI().toString().concat("tableWithDuplicateDeletes"),
+            2,
+            Map.of(TableProperties.DELETE_DEFAULT_FILE_FORMAT, "parquet"));
+
+    // Get a data file to create position deletes for
+    DataFile dataFile =
+        tableWithPosDeletes
+            .currentSnapshot()
+            .addedDataFiles(tableWithPosDeletes.io())
+            .iterator()
+            .next();
+
+    // Create a position delete file
+    List<Pair<CharSequence, Long>> deletes = 
Lists.newArrayList(Pair.of(dataFile.location(), 0L));
+    File deleteFile =
+        new File(
+            removePrefix(tableWithPosDeletes.location() + 
"/data/deeply/nested/deletes.parquet"));
+    DeleteFile positionDeletes =
+        FileHelpers.writeDeleteFile(
+                tableWithPosDeletes,
+                
tableWithPosDeletes.io().newOutputFile(deleteFile.toURI().toString()),
+                deletes,
+                formatVersion)
+            .first();
+
+    tableWithPosDeletes.newRowDelta().addDeletes(positionDeletes).commit();
+
+    // Add the SAME delete file AGAIN in a second snapshot - this creates a 
duplicate entry
+    // in a new manifest, which will cause duplicate DeleteFile objects when 
processing
+    tableWithPosDeletes.newRowDelta().addDeletes(positionDeletes).commit();
+
+    // This should NOT throw AlreadyExistsException - the fix uses 
DeleteFileSet to deduplicate
+    // Without the fix (using Collectors.toSet()), this would fail because:
+    // 1. Both manifests contain entries for the same delete file
+    // 2. Processing returns two different DeleteFile objects for the same file
+    // 3. HashSet doesn't deduplicate them (DeleteFile doesn't override 
equals())
+    // 4. rewritePositionDeletes tries to write the same file twice -> 
AlreadyExistsException
+    RewriteTablePath.Result result =
+        actions()
+            .rewriteTablePath(tableWithPosDeletes)
+            .stagingLocation(stagingLocation())
+            .rewriteLocationPrefix(tableWithPosDeletes.location(), 
targetTableLocation())
+            .execute();
+
+    // Verify the rewrite completed successfully - should have rewritten 
exactly 1 delete file
+    // (the duplicate should be deduplicated by DeleteFileSet)
+    assertThat(result.rewrittenDeleteFilePathsCount())
+        .as("Should have rewritten exactly 1 delete file after deduplication")
+        .isEqualTo(1);
+  }
+
   @TestTemplate
   public void testEqualityDeletes() throws Exception {
     Table sourceTable = createTableWithSnapshots(newTableLocation(), 1);

Reply via email to