This is an automated email from the ASF dual-hosted git repository.
etudenhoefner pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg.git
The following commit(s) were added to refs/heads/main by this push:
new 9057bf32e9 Spark 3.4, 3.5, 4.1: Fix "AlreadyExistsException: Location
already exists" at rewrite_table_path (#15616)
9057bf32e9 is described below
commit 9057bf32e94278262cc0084b0d043e757c007f12
Author: Bhargav Kumar Konidena <[email protected]>
AuthorDate: Fri Mar 20 15:04:02 2026 +0530
Spark 3.4, 3.5, 4.1: Fix "AlreadyExistsException: Location already exists"
at rewrite_table_path (#15616)
---
.../spark/actions/RewriteTablePathSparkAction.java | 3 +-
.../spark/actions/TestRewriteTablePathsAction.java | 73 ++++++++++++++++++++++
.../spark/actions/RewriteTablePathSparkAction.java | 3 +-
.../spark/actions/TestRewriteTablePathsAction.java | 73 ++++++++++++++++++++++
.../spark/actions/RewriteTablePathSparkAction.java | 3 +-
.../spark/actions/TestRewriteTablePathsAction.java | 73 ++++++++++++++++++++++
6 files changed, 225 insertions(+), 3 deletions(-)
diff --git
a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteTablePathSparkAction.java
b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteTablePathSparkAction.java
index 64b82f0986..aedb25e4a4 100644
---
a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteTablePathSparkAction.java
+++
b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteTablePathSparkAction.java
@@ -72,6 +72,7 @@ import
org.apache.iceberg.relocated.com.google.common.collect.Lists;
import org.apache.iceberg.relocated.com.google.common.collect.Sets;
import org.apache.iceberg.spark.JobGroupInfo;
import org.apache.iceberg.spark.source.SerializableTableWithSize;
+import org.apache.iceberg.util.DeleteFileSet;
import org.apache.iceberg.util.Pair;
import org.apache.iceberg.util.Tasks;
import org.apache.spark.api.java.function.ForeachFunction;
@@ -317,7 +318,7 @@ public class RewriteTablePathSparkAction extends
BaseSparkAction<RewriteTablePat
rewriteManifestResult.toRewrite().stream()
.filter(e -> e instanceof DeleteFile)
.map(e -> (DeleteFile) e)
- .collect(Collectors.toSet());
+ .collect(Collectors.toCollection(DeleteFileSet::create));
rewritePositionDeletes(deleteFiles);
ImmutableRewriteTablePath.Result.Builder builder =
diff --git
a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteTablePathsAction.java
b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteTablePathsAction.java
index 5c36578d06..6044368a46 100644
---
a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteTablePathsAction.java
+++
b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteTablePathsAction.java
@@ -567,6 +567,79 @@ public class TestRewriteTablePathsAction extends TestBase {
.isEmpty();
}
+ /**
+ * Test for https://github.com/apache/iceberg/issues/14814
+ *
+ * <p>This test verifies that rewrite_table_path correctly deduplicates
delete files when the same
+ * delete file appears in multiple manifests. Without the DeleteFileSet fix,
this test would fail
+ * with AlreadyExistsException because DeleteFile objects don't override
equals() and the same
+ * file would be processed multiple times.
+ *
+ * <p>The test creates a scenario where the same delete file is added to
multiple snapshots,
+ * causing it to appear in multiple manifest entries. When these manifests
are processed, the same
+ * delete file is returned as different object instances which need proper
deduplication.
+ */
+ @TestTemplate
+ public void testPositionDeletesDeduplication() throws Exception {
+ // Format versions 3 and 4 use Deletion Vectors stored in Puffin files,
which have different
+ // validation rules that prevent adding the same delete file multiple times
+ assumeThat(formatVersion)
+ .as("Format versions 3+ use DVs with different validation rules")
+ .isEqualTo(2);
+
+ Table tableWithPosDeletes =
+ createTableWithSnapshots(
+
tableDir.toFile().toURI().toString().concat("tableWithDuplicateDeletes"),
+ 2,
+ Map.of(TableProperties.DELETE_DEFAULT_FILE_FORMAT, "parquet"));
+
+ // Get a data file to create position deletes for
+ DataFile dataFile =
+ tableWithPosDeletes
+ .currentSnapshot()
+ .addedDataFiles(tableWithPosDeletes.io())
+ .iterator()
+ .next();
+
+ // Create a position delete file
+ List<Pair<CharSequence, Long>> deletes =
Lists.newArrayList(Pair.of(dataFile.location(), 0L));
+ File deleteFile =
+ new File(
+ removePrefix(tableWithPosDeletes.location() +
"/data/deeply/nested/deletes.parquet"));
+ DeleteFile positionDeletes =
+ FileHelpers.writeDeleteFile(
+ tableWithPosDeletes,
+
tableWithPosDeletes.io().newOutputFile(deleteFile.toURI().toString()),
+ deletes,
+ formatVersion)
+ .first();
+
+ tableWithPosDeletes.newRowDelta().addDeletes(positionDeletes).commit();
+
+ // Add the SAME delete file AGAIN in a second snapshot - this creates a
duplicate entry
+ // in a new manifest, which will cause duplicate DeleteFile objects when
processing
+ tableWithPosDeletes.newRowDelta().addDeletes(positionDeletes).commit();
+
+ // This should NOT throw AlreadyExistsException - the fix uses
DeleteFileSet to deduplicate
+ // Without the fix (using Collectors.toSet()), this would fail because:
+ // 1. Both manifests contain entries for the same delete file
+ // 2. Processing returns two different DeleteFile objects for the same file
+ // 3. HashSet doesn't deduplicate them (DeleteFile doesn't override
equals())
+ // 4. rewritePositionDeletes tries to write the same file twice ->
AlreadyExistsException
+ RewriteTablePath.Result result =
+ actions()
+ .rewriteTablePath(tableWithPosDeletes)
+ .stagingLocation(stagingLocation())
+ .rewriteLocationPrefix(tableWithPosDeletes.location(),
targetTableLocation())
+ .execute();
+
+ // Verify the rewrite completed successfully - should have rewritten
exactly 1 delete file
+ // (the duplicate should be deduplicated by DeleteFileSet)
+ assertThat(result.rewrittenDeleteFilePathsCount())
+ .as("Should have rewritten exactly 1 delete file after deduplication")
+ .isEqualTo(1);
+ }
+
@TestTemplate
public void testEqualityDeletes() throws Exception {
Table sourceTable = createTableWithSnapshots(newTableLocation(), 1);
diff --git
a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteTablePathSparkAction.java
b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteTablePathSparkAction.java
index 64b82f0986..aedb25e4a4 100644
---
a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteTablePathSparkAction.java
+++
b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteTablePathSparkAction.java
@@ -72,6 +72,7 @@ import
org.apache.iceberg.relocated.com.google.common.collect.Lists;
import org.apache.iceberg.relocated.com.google.common.collect.Sets;
import org.apache.iceberg.spark.JobGroupInfo;
import org.apache.iceberg.spark.source.SerializableTableWithSize;
+import org.apache.iceberg.util.DeleteFileSet;
import org.apache.iceberg.util.Pair;
import org.apache.iceberg.util.Tasks;
import org.apache.spark.api.java.function.ForeachFunction;
@@ -317,7 +318,7 @@ public class RewriteTablePathSparkAction extends
BaseSparkAction<RewriteTablePat
rewriteManifestResult.toRewrite().stream()
.filter(e -> e instanceof DeleteFile)
.map(e -> (DeleteFile) e)
- .collect(Collectors.toSet());
+ .collect(Collectors.toCollection(DeleteFileSet::create));
rewritePositionDeletes(deleteFiles);
ImmutableRewriteTablePath.Result.Builder builder =
diff --git
a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteTablePathsAction.java
b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteTablePathsAction.java
index 5c36578d06..6044368a46 100644
---
a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteTablePathsAction.java
+++
b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteTablePathsAction.java
@@ -567,6 +567,79 @@ public class TestRewriteTablePathsAction extends TestBase {
.isEmpty();
}
+ /**
+ * Test for https://github.com/apache/iceberg/issues/14814
+ *
+ * <p>This test verifies that rewrite_table_path correctly deduplicates
delete files when the same
+ * delete file appears in multiple manifests. Without the DeleteFileSet fix,
this test would fail
+ * with AlreadyExistsException because DeleteFile objects don't override
equals() and the same
+ * file would be processed multiple times.
+ *
+ * <p>The test creates a scenario where the same delete file is added to
multiple snapshots,
+ * causing it to appear in multiple manifest entries. When these manifests
are processed, the same
+ * delete file is returned as different object instances which need proper
deduplication.
+ */
+ @TestTemplate
+ public void testPositionDeletesDeduplication() throws Exception {
+ // Format versions 3 and 4 use Deletion Vectors stored in Puffin files,
which have different
+ // validation rules that prevent adding the same delete file multiple times
+ assumeThat(formatVersion)
+ .as("Format versions 3+ use DVs with different validation rules")
+ .isEqualTo(2);
+
+ Table tableWithPosDeletes =
+ createTableWithSnapshots(
+
tableDir.toFile().toURI().toString().concat("tableWithDuplicateDeletes"),
+ 2,
+ Map.of(TableProperties.DELETE_DEFAULT_FILE_FORMAT, "parquet"));
+
+ // Get a data file to create position deletes for
+ DataFile dataFile =
+ tableWithPosDeletes
+ .currentSnapshot()
+ .addedDataFiles(tableWithPosDeletes.io())
+ .iterator()
+ .next();
+
+ // Create a position delete file
+ List<Pair<CharSequence, Long>> deletes =
Lists.newArrayList(Pair.of(dataFile.location(), 0L));
+ File deleteFile =
+ new File(
+ removePrefix(tableWithPosDeletes.location() +
"/data/deeply/nested/deletes.parquet"));
+ DeleteFile positionDeletes =
+ FileHelpers.writeDeleteFile(
+ tableWithPosDeletes,
+
tableWithPosDeletes.io().newOutputFile(deleteFile.toURI().toString()),
+ deletes,
+ formatVersion)
+ .first();
+
+ tableWithPosDeletes.newRowDelta().addDeletes(positionDeletes).commit();
+
+ // Add the SAME delete file AGAIN in a second snapshot - this creates a
duplicate entry
+ // in a new manifest, which will cause duplicate DeleteFile objects when
processing
+ tableWithPosDeletes.newRowDelta().addDeletes(positionDeletes).commit();
+
+ // This should NOT throw AlreadyExistsException - the fix uses
DeleteFileSet to deduplicate
+ // Without the fix (using Collectors.toSet()), this would fail because:
+ // 1. Both manifests contain entries for the same delete file
+ // 2. Processing returns two different DeleteFile objects for the same file
+ // 3. HashSet doesn't deduplicate them (DeleteFile doesn't override
equals())
+ // 4. rewritePositionDeletes tries to write the same file twice ->
AlreadyExistsException
+ RewriteTablePath.Result result =
+ actions()
+ .rewriteTablePath(tableWithPosDeletes)
+ .stagingLocation(stagingLocation())
+ .rewriteLocationPrefix(tableWithPosDeletes.location(),
targetTableLocation())
+ .execute();
+
+ // Verify the rewrite completed successfully - should have rewritten
exactly 1 delete file
+ // (the duplicate should be deduplicated by DeleteFileSet)
+ assertThat(result.rewrittenDeleteFilePathsCount())
+ .as("Should have rewritten exactly 1 delete file after deduplication")
+ .isEqualTo(1);
+ }
+
@TestTemplate
public void testEqualityDeletes() throws Exception {
Table sourceTable = createTableWithSnapshots(newTableLocation(), 1);
diff --git
a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteTablePathSparkAction.java
b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteTablePathSparkAction.java
index 64b82f0986..aedb25e4a4 100644
---
a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteTablePathSparkAction.java
+++
b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteTablePathSparkAction.java
@@ -72,6 +72,7 @@ import
org.apache.iceberg.relocated.com.google.common.collect.Lists;
import org.apache.iceberg.relocated.com.google.common.collect.Sets;
import org.apache.iceberg.spark.JobGroupInfo;
import org.apache.iceberg.spark.source.SerializableTableWithSize;
+import org.apache.iceberg.util.DeleteFileSet;
import org.apache.iceberg.util.Pair;
import org.apache.iceberg.util.Tasks;
import org.apache.spark.api.java.function.ForeachFunction;
@@ -317,7 +318,7 @@ public class RewriteTablePathSparkAction extends
BaseSparkAction<RewriteTablePat
rewriteManifestResult.toRewrite().stream()
.filter(e -> e instanceof DeleteFile)
.map(e -> (DeleteFile) e)
- .collect(Collectors.toSet());
+ .collect(Collectors.toCollection(DeleteFileSet::create));
rewritePositionDeletes(deleteFiles);
ImmutableRewriteTablePath.Result.Builder builder =
diff --git
a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteTablePathsAction.java
b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteTablePathsAction.java
index 5c36578d06..6044368a46 100644
---
a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteTablePathsAction.java
+++
b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteTablePathsAction.java
@@ -567,6 +567,79 @@ public class TestRewriteTablePathsAction extends TestBase {
.isEmpty();
}
+ /**
+ * Test for https://github.com/apache/iceberg/issues/14814
+ *
+ * <p>This test verifies that rewrite_table_path correctly deduplicates
delete files when the same
+ * delete file appears in multiple manifests. Without the DeleteFileSet fix,
this test would fail
+ * with AlreadyExistsException because DeleteFile objects don't override
equals() and the same
+ * file would be processed multiple times.
+ *
+ * <p>The test creates a scenario where the same delete file is added to
multiple snapshots,
+ * causing it to appear in multiple manifest entries. When these manifests
are processed, the same
+ * delete file is returned as different object instances which need proper
deduplication.
+ */
+ @TestTemplate
+ public void testPositionDeletesDeduplication() throws Exception {
+ // Format versions 3 and 4 use Deletion Vectors stored in Puffin files,
which have different
+ // validation rules that prevent adding the same delete file multiple times
+ assumeThat(formatVersion)
+ .as("Format versions 3+ use DVs with different validation rules")
+ .isEqualTo(2);
+
+ Table tableWithPosDeletes =
+ createTableWithSnapshots(
+
tableDir.toFile().toURI().toString().concat("tableWithDuplicateDeletes"),
+ 2,
+ Map.of(TableProperties.DELETE_DEFAULT_FILE_FORMAT, "parquet"));
+
+ // Get a data file to create position deletes for
+ DataFile dataFile =
+ tableWithPosDeletes
+ .currentSnapshot()
+ .addedDataFiles(tableWithPosDeletes.io())
+ .iterator()
+ .next();
+
+ // Create a position delete file
+ List<Pair<CharSequence, Long>> deletes =
Lists.newArrayList(Pair.of(dataFile.location(), 0L));
+ File deleteFile =
+ new File(
+ removePrefix(tableWithPosDeletes.location() +
"/data/deeply/nested/deletes.parquet"));
+ DeleteFile positionDeletes =
+ FileHelpers.writeDeleteFile(
+ tableWithPosDeletes,
+
tableWithPosDeletes.io().newOutputFile(deleteFile.toURI().toString()),
+ deletes,
+ formatVersion)
+ .first();
+
+ tableWithPosDeletes.newRowDelta().addDeletes(positionDeletes).commit();
+
+ // Add the SAME delete file AGAIN in a second snapshot - this creates a
duplicate entry
+ // in a new manifest, which will cause duplicate DeleteFile objects when
processing
+ tableWithPosDeletes.newRowDelta().addDeletes(positionDeletes).commit();
+
+ // This should NOT throw AlreadyExistsException - the fix uses
DeleteFileSet to deduplicate
+ // Without the fix (using Collectors.toSet()), this would fail because:
+ // 1. Both manifests contain entries for the same delete file
+ // 2. Processing returns two different DeleteFile objects for the same file
+ // 3. HashSet doesn't deduplicate them (DeleteFile doesn't override
equals())
+ // 4. rewritePositionDeletes tries to write the same file twice ->
AlreadyExistsException
+ RewriteTablePath.Result result =
+ actions()
+ .rewriteTablePath(tableWithPosDeletes)
+ .stagingLocation(stagingLocation())
+ .rewriteLocationPrefix(tableWithPosDeletes.location(),
targetTableLocation())
+ .execute();
+
+ // Verify the rewrite completed successfully - should have rewritten
exactly 1 delete file
+ // (the duplicate should be deduplicated by DeleteFileSet)
+ assertThat(result.rewrittenDeleteFilePathsCount())
+ .as("Should have rewritten exactly 1 delete file after deduplication")
+ .isEqualTo(1);
+ }
+
@TestTemplate
public void testEqualityDeletes() throws Exception {
Table sourceTable = createTableWithSnapshots(newTableLocation(), 1);