This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-4.0 by this push:
     new d5aabe3530a branch-4.0: [fix](iceberg)fix iceberg rewrite_data_file 
fail when table had been updated. #61112 (#61137)
d5aabe3530a is described below

commit d5aabe3530a24d58cdba750612b40c1c78264c3d
Author: github-actions[bot] 
<41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Tue Mar 10 11:51:59 2026 +0800

    branch-4.0: [fix](iceberg)fix iceberg rewrite_data_file fail when table had 
been updated. #61112 (#61137)
    
    Cherry-picked from #61112
    
    Co-authored-by: daidai <[email protected]>
---
 .../create_preinstalled_scripts/iceberg/run27.sql  | 40 ++++++++++++++
 .../datasource/iceberg/IcebergTransaction.java     |  6 +++
 .../action/test_iceberg_rewrite_data_files.out     | 18 +++++++
 .../action/test_iceberg_rewrite_data_files.groovy  | 63 ++++++++++++++++++++++
 4 files changed, 127 insertions(+)

diff --git 
a/docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/iceberg/run27.sql
 
b/docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/iceberg/run27.sql
new file mode 100644
index 00000000000..7c364fa7971
--- /dev/null
+++ 
b/docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/iceberg/run27.sql
@@ -0,0 +1,40 @@
+use demo.test_db;
+
+
+create table if not exists test_rewrite_data_with_update (
+  id INT,
+  name STRING
+)
+USING iceberg
+TBLPROPERTIES (
+  'format-version' = '2',
+  'write.delete.mode' = 'merge-on-read',
+  'write.update.mode' = 'merge-on-read',
+  'write.merge.mode' = 'merge-on-read'
+);
+
+
+INSERT INTO test_rewrite_data_with_update VALUES
+(1, 'a'),(2, 'b'),(3, 'c');
+
+update test_rewrite_data_with_update set name = "bb"  where id = 1;
+
+
+
+create table if not exists test_rewrite_data_with_delete (
+  id INT,
+  name STRING
+)
+USING iceberg
+TBLPROPERTIES (
+  'format-version' = '2',
+  'write.delete.mode' = 'merge-on-read',
+  'write.update.mode' = 'merge-on-read',
+  'write.merge.mode' = 'merge-on-read'
+);
+
+
+INSERT INTO test_rewrite_data_with_delete VALUES
+(1, 'a'),(2, 'b'),(3, 'c');
+
+delete from test_rewrite_data_with_delete where id = 1;
\ No newline at end of file
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/IcebergTransaction.java
 
b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/IcebergTransaction.java
index b1bbcc920cf..d6b0f9896b1 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/IcebergTransaction.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/IcebergTransaction.java
@@ -67,6 +67,7 @@ public class IcebergTransaction implements Transaction {
     private String branchName;
 
     // Rewrite operation support
+    long startingSnapshotId = -1L; // Track the starting snapshot ID for 
rewrite operations
     private final List<DataFile> filesToDelete = Lists.newArrayList();
     private final List<DataFile> filesToAdd = Lists.newArrayList();
     private boolean isRewriteMode = false;
@@ -127,6 +128,9 @@ public class IcebergTransaction implements Transaction {
                 // create and start the iceberg transaction
                 this.table = IcebergUtils.getIcebergTable(dorisTable);
 
+                // Capture the starting snapshot ID for validation during 
rewrite commit
+                this.startingSnapshotId = table.currentSnapshot().snapshotId();
+
                 // For rewrite operations, we work directly on the main table
                 // No branch information needed
                 this.transaction = table.newTransaction();
@@ -196,6 +200,8 @@ public class IcebergTransaction implements Transaction {
 
         RewriteFiles rewriteFiles = transaction.newRewrite();
 
+        rewriteFiles = rewriteFiles.validateFromSnapshot(startingSnapshotId);
+
         // For rewrite operations, we work directly on the main table
         rewriteFiles = 
rewriteFiles.scanManifestsWith(ops.getThreadPoolWithPreAuth());
 
diff --git 
a/regression-test/data/external_table_p0/iceberg/action/test_iceberg_rewrite_data_files.out
 
b/regression-test/data/external_table_p0/iceberg/action/test_iceberg_rewrite_data_files.out
index 0c449ed9b44..6110e2e5aae 100644
--- 
a/regression-test/data/external_table_p0/iceberg/action/test_iceberg_rewrite_data_files.out
+++ 
b/regression-test/data/external_table_p0/iceberg/action/test_iceberg_rewrite_data_files.out
@@ -68,3 +68,21 @@
 8      EAST    280.00  2024-01-02
 9      EAST    380.00  2024-01-02
 
+-- !before_rewrite_update --
+1      bb
+2      b
+3      c
+
+-- !after_rewrite_update --
+1      bb
+2      b
+3      c
+
+-- !before_rewrite_delete --
+2      b
+3      c
+
+-- !after_rewrite_delete --
+2      b
+3      c
+
diff --git 
a/regression-test/suites/external_table_p0/iceberg/action/test_iceberg_rewrite_data_files.groovy
 
b/regression-test/suites/external_table_p0/iceberg/action/test_iceberg_rewrite_data_files.groovy
index 3d43089bd2e..5fa7601b7e9 100644
--- 
a/regression-test/suites/external_table_p0/iceberg/action/test_iceberg_rewrite_data_files.groovy
+++ 
b/regression-test/suites/external_table_p0/iceberg/action/test_iceberg_rewrite_data_files.groovy
@@ -493,4 +493,67 @@ suite("test_iceberg_rewrite_data_files", 
"p0,external,doris,external_docker,exte
     
     logger.info("Specific partition rewrite test completed successfully")
 
+    // 
=====================================================================================
+    // Test Case 4: Rewrite data files for merge-on-read update table
+    //
+    // Tables `test_rewrite_data_with_update` and 
`test_rewrite_data_with_delete`
+    // are pre-created in Docker initialization (run27.sql) using Spark SQL 
with
+    // format-version = 2 and merge-on-read enabled for delete/update/merge.
+    //
+    // This case verifies that executing rewrite_data_files on a table that has
+    // already performed UPDATE operations (implemented via delete + insert) 
does
+    // not change the logical query results.
+    // 
=====================================================================================
+    logger.info("Starting rewrite_data_files test for merge-on-read UPDATE 
table")
+
+    def table_name_update = "test_rewrite_data_with_update"
+
+    // Verify data before rewrite: id = 1 should have been updated to 'bb'
+    qt_before_rewrite_update """SELECT id, name FROM ${table_name_update} 
ORDER BY id"""
+
+    def rewriteResultUpdate = sql """
+        ALTER TABLE ${catalog_name}.${db_name}.${table_name_update}
+        EXECUTE rewrite_data_files(
+            "target-file-size-bytes" = "10485760",
+            "min-input-files" = "1"
+        )
+    """
+    logger.info("Rewrite data files result for update table: 
${rewriteResultUpdate}")
+
+    // Verify data after rewrite (logical rows should remain the same)
+    qt_after_rewrite_update """SELECT id, name FROM ${table_name_update} ORDER 
BY id"""
+
+    def totalUpdateRecords = sql """SELECT COUNT(*) FROM 
${table_name_update}"""
+    assertTrue(totalUpdateRecords[0][0] == 3, "Update table should still have 
3 logical records after rewrite")
+
+    // 
=====================================================================================
+    // Test Case 5: Rewrite data files for merge-on-read delete table
+    //
+    // This case verifies that executing rewrite_data_files on a table that has
+    // already performed DELETE operations does not resurrect deleted rows and
+    // keeps the logical result set unchanged.
+    // 
=====================================================================================
+    logger.info("Starting rewrite_data_files test for merge-on-read DELETE 
table")
+
+    def table_name_delete = "test_rewrite_data_with_delete"
+
+    // Verify data before rewrite: row with id = 1 should have been deleted
+    qt_before_rewrite_delete """SELECT id, name FROM ${table_name_delete} 
ORDER BY id"""
+
+    def rewriteResultDelete = sql """
+        ALTER TABLE ${catalog_name}.${db_name}.${table_name_delete}
+        EXECUTE rewrite_data_files(
+            "target-file-size-bytes" = "10485760",
+            "min-input-files" = "1"
+        )
+    """
+    logger.info("Rewrite data files result for delete table: 
${rewriteResultDelete}")
+
+    // Verify data after rewrite (deleted rows should not reappear)
+    qt_after_rewrite_delete """SELECT id, name FROM ${table_name_delete} ORDER 
BY id"""
+
+    def totalDeleteRecords = sql """SELECT COUNT(*) FROM 
${table_name_delete}"""
+    assertTrue(totalDeleteRecords[0][0] == 2, "Delete table should still have 
2 logical records after rewrite")
+
+    logger.info("Merge-on-read update/delete rewrite_data_files tests 
completed successfully")
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to