This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-4.0 by this push:
new d5aabe3530a branch-4.0: [fix](iceberg)fix iceberg rewrite_data_file
fail when table had been updated. #61112 (#61137)
d5aabe3530a is described below
commit d5aabe3530a24d58cdba750612b40c1c78264c3d
Author: github-actions[bot]
<41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Tue Mar 10 11:51:59 2026 +0800
branch-4.0: [fix](iceberg)fix iceberg rewrite_data_file fail when table had
been updated. #61112 (#61137)
Cherry-picked from #61112
Co-authored-by: daidai <[email protected]>
---
.../create_preinstalled_scripts/iceberg/run27.sql | 40 ++++++++++++++
.../datasource/iceberg/IcebergTransaction.java | 6 +++
.../action/test_iceberg_rewrite_data_files.out | 18 +++++++
.../action/test_iceberg_rewrite_data_files.groovy | 63 ++++++++++++++++++++++
4 files changed, 127 insertions(+)
diff --git
a/docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/iceberg/run27.sql
b/docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/iceberg/run27.sql
new file mode 100644
index 00000000000..7c364fa7971
--- /dev/null
+++
b/docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/iceberg/run27.sql
@@ -0,0 +1,40 @@
+use demo.test_db;
+
+
+create table if not exists test_rewrite_data_with_update (
+ id INT,
+ name STRING
+)
+USING iceberg
+TBLPROPERTIES (
+ 'format-version' = '2',
+ 'write.delete.mode' = 'merge-on-read',
+ 'write.update.mode' = 'merge-on-read',
+ 'write.merge.mode' = 'merge-on-read'
+);
+
+
+INSERT INTO test_rewrite_data_with_update VALUES
+(1, 'a'),(2, 'b'),(3, 'c');
+
+update test_rewrite_data_with_update set name = "bb" where id = 1;
+
+
+
+create table if not exists test_rewrite_data_with_delete (
+ id INT,
+ name STRING
+)
+USING iceberg
+TBLPROPERTIES (
+ 'format-version' = '2',
+ 'write.delete.mode' = 'merge-on-read',
+ 'write.update.mode' = 'merge-on-read',
+ 'write.merge.mode' = 'merge-on-read'
+);
+
+
+INSERT INTO test_rewrite_data_with_delete VALUES
+(1, 'a'),(2, 'b'),(3, 'c');
+
+delete from test_rewrite_data_with_delete where id = 1;
\ No newline at end of file
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/IcebergTransaction.java
b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/IcebergTransaction.java
index b1bbcc920cf..d6b0f9896b1 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/IcebergTransaction.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/IcebergTransaction.java
@@ -67,6 +67,7 @@ public class IcebergTransaction implements Transaction {
private String branchName;
// Rewrite operation support
+ long startingSnapshotId = -1L; // Track the starting snapshot ID for
rewrite operations
private final List<DataFile> filesToDelete = Lists.newArrayList();
private final List<DataFile> filesToAdd = Lists.newArrayList();
private boolean isRewriteMode = false;
@@ -127,6 +128,9 @@ public class IcebergTransaction implements Transaction {
// create and start the iceberg transaction
this.table = IcebergUtils.getIcebergTable(dorisTable);
+ // Capture the starting snapshot ID for validation during
rewrite commit
+ this.startingSnapshotId = table.currentSnapshot().snapshotId();
+
// For rewrite operations, we work directly on the main table
// No branch information needed
this.transaction = table.newTransaction();
@@ -196,6 +200,8 @@ public class IcebergTransaction implements Transaction {
RewriteFiles rewriteFiles = transaction.newRewrite();
+ rewriteFiles = rewriteFiles.validateFromSnapshot(startingSnapshotId);
+
// For rewrite operations, we work directly on the main table
rewriteFiles =
rewriteFiles.scanManifestsWith(ops.getThreadPoolWithPreAuth());
diff --git
a/regression-test/data/external_table_p0/iceberg/action/test_iceberg_rewrite_data_files.out
b/regression-test/data/external_table_p0/iceberg/action/test_iceberg_rewrite_data_files.out
index 0c449ed9b44..6110e2e5aae 100644
---
a/regression-test/data/external_table_p0/iceberg/action/test_iceberg_rewrite_data_files.out
+++
b/regression-test/data/external_table_p0/iceberg/action/test_iceberg_rewrite_data_files.out
@@ -68,3 +68,21 @@
8 EAST 280.00 2024-01-02
9 EAST 380.00 2024-01-02
+-- !before_rewrite_update --
+1 bb
+2 b
+3 c
+
+-- !after_rewrite_update --
+1 bb
+2 b
+3 c
+
+-- !before_rewrite_delete --
+2 b
+3 c
+
+-- !after_rewrite_delete --
+2 b
+3 c
+
diff --git
a/regression-test/suites/external_table_p0/iceberg/action/test_iceberg_rewrite_data_files.groovy
b/regression-test/suites/external_table_p0/iceberg/action/test_iceberg_rewrite_data_files.groovy
index 3d43089bd2e..5fa7601b7e9 100644
---
a/regression-test/suites/external_table_p0/iceberg/action/test_iceberg_rewrite_data_files.groovy
+++
b/regression-test/suites/external_table_p0/iceberg/action/test_iceberg_rewrite_data_files.groovy
@@ -493,4 +493,67 @@ suite("test_iceberg_rewrite_data_files",
"p0,external,doris,external_docker,exte
logger.info("Specific partition rewrite test completed successfully")
+ //
=====================================================================================
+ // Test Case 4: Rewrite data files for merge-on-read update table
+ //
+ // Tables `test_rewrite_data_with_update` and
`test_rewrite_data_with_delete`
+ // are pre-created in Docker initialization (run27.sql) using Spark SQL
with
+ // format-version = 2 and merge-on-read enabled for delete/update/merge.
+ //
+ // This case verifies that executing rewrite_data_files on a table that has
+ // already performed UPDATE operations (implemented via delete + insert)
does
+ // not change the logical query results.
+ //
=====================================================================================
+ logger.info("Starting rewrite_data_files test for merge-on-read UPDATE
table")
+
+ def table_name_update = "test_rewrite_data_with_update"
+
+ // Verify data before rewrite: id = 1 should have been updated to 'bb'
+ qt_before_rewrite_update """SELECT id, name FROM ${table_name_update}
ORDER BY id"""
+
+ def rewriteResultUpdate = sql """
+ ALTER TABLE ${catalog_name}.${db_name}.${table_name_update}
+ EXECUTE rewrite_data_files(
+ "target-file-size-bytes" = "10485760",
+ "min-input-files" = "1"
+ )
+ """
+ logger.info("Rewrite data files result for update table:
${rewriteResultUpdate}")
+
+ // Verify data after rewrite (logical rows should remain the same)
+ qt_after_rewrite_update """SELECT id, name FROM ${table_name_update} ORDER
BY id"""
+
+ def totalUpdateRecords = sql """SELECT COUNT(*) FROM
${table_name_update}"""
+ assertTrue(totalUpdateRecords[0][0] == 3, "Update table should still have
3 logical records after rewrite")
+
+ //
=====================================================================================
+ // Test Case 5: Rewrite data files for merge-on-read delete table
+ //
+ // This case verifies that executing rewrite_data_files on a table that has
+ // already performed DELETE operations does not resurrect deleted rows and
+ // keeps the logical result set unchanged.
+ //
=====================================================================================
+ logger.info("Starting rewrite_data_files test for merge-on-read DELETE
table")
+
+ def table_name_delete = "test_rewrite_data_with_delete"
+
+ // Verify data before rewrite: row with id = 1 should have been deleted
+ qt_before_rewrite_delete """SELECT id, name FROM ${table_name_delete}
ORDER BY id"""
+
+ def rewriteResultDelete = sql """
+ ALTER TABLE ${catalog_name}.${db_name}.${table_name_delete}
+ EXECUTE rewrite_data_files(
+ "target-file-size-bytes" = "10485760",
+ "min-input-files" = "1"
+ )
+ """
+ logger.info("Rewrite data files result for delete table:
${rewriteResultDelete}")
+
+ // Verify data after rewrite (deleted rows should not reappear)
+ qt_after_rewrite_delete """SELECT id, name FROM ${table_name_delete} ORDER
BY id"""
+
+ def totalDeleteRecords = sql """SELECT COUNT(*) FROM
${table_name_delete}"""
+ assertTrue(totalDeleteRecords[0][0] == 2, "Delete table should still have
2 logical records after rewrite")
+
+ logger.info("Merge-on-read update/delete rewrite_data_files tests
completed successfully")
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]