This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch branch-2.1
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.1 by this push:
new 83b74827aa2 branch-2.1: [fix](iceberg)Fix count(*) error with dangling
delete problem #44039 (#44101)
83b74827aa2 is described below
commit 83b74827aa24f7f55469945c4a003c2409000384
Author: github-actions[bot]
<41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Tue Nov 19 17:19:25 2024 +0800
branch-2.1: [fix](iceberg)Fix count(*) error with dangling delete problem
#44039 (#44101)
Cherry-picked from #44039
Co-authored-by: wuwenchi <[email protected]>
---
.../docker-compose/iceberg/entrypoint.sh.tpl | 12 ++++++++++--
.../{ => iceberg}/run01.sql | 0
.../{ => iceberg}/run02.sql | 0
.../{ => iceberg}/run03.sql | 0
.../{ => iceberg}/run04.sql | 0
.../{ => iceberg}/run05.sql | 0
.../create_preinstalled_scripts/iceberg/run06.sql | 21 +++++++++++++++++++++
.../{run06.sql => paimon/run01.sql} | 0
.../datasource/iceberg/source/IcebergScanNode.java | 10 ++++++----
.../iceberg/test_iceberg_optimize_count.out | 3 +++
.../iceberg/test_iceberg_optimize_count.groovy | 13 ++++++++++++-
11 files changed, 52 insertions(+), 7 deletions(-)
diff --git a/docker/thirdparties/docker-compose/iceberg/entrypoint.sh.tpl
b/docker/thirdparties/docker-compose/iceberg/entrypoint.sh.tpl
index 1af170ff91b..a4b27bdd6c0 100644
--- a/docker/thirdparties/docker-compose/iceberg/entrypoint.sh.tpl
+++ b/docker/thirdparties/docker-compose/iceberg/entrypoint.sh.tpl
@@ -25,9 +25,17 @@ start-thriftserver.sh --driver-java-options
"-Dderby.system.home=/tmp/derby"
-ls /mnt/scripts/create_preinstalled_scripts/*.sql | xargs -n 1 -I {} bash -c '
+ls /mnt/scripts/create_preinstalled_scripts/iceberg/*.sql | xargs -n 1 -I {}
bash -c '
START_TIME=$(date +%s)
- spark-sql --master spark://doris--spark-iceberg:7077 -f {}
+ spark-sql --master spark://doris--spark-iceberg:7077 --conf
spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions
-f {}
+ END_TIME=$(date +%s)
+ EXECUTION_TIME=$((END_TIME - START_TIME))
+ echo "Script: {} executed in $EXECUTION_TIME seconds"
+'
+
+ls /mnt/scripts/create_preinstalled_scripts/paimon/*.sql | xargs -n 1 -I {}
bash -c '
+ START_TIME=$(date +%s)
+ spark-sql --master spark://doris--spark-iceberg:7077 --conf
spark.sql.extensions=org.apache.paimon.spark.extensions.PaimonSparkSessionExtensions
-f {}
END_TIME=$(date +%s)
EXECUTION_TIME=$((END_TIME - START_TIME))
echo "Script: {} executed in $EXECUTION_TIME seconds"
diff --git
a/docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/run01.sql
b/docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/iceberg/run01.sql
similarity index 100%
rename from
docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/run01.sql
rename to
docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/iceberg/run01.sql
diff --git
a/docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/run02.sql
b/docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/iceberg/run02.sql
similarity index 100%
rename from
docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/run02.sql
rename to
docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/iceberg/run02.sql
diff --git
a/docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/run03.sql
b/docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/iceberg/run03.sql
similarity index 100%
rename from
docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/run03.sql
rename to
docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/iceberg/run03.sql
diff --git
a/docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/run04.sql
b/docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/iceberg/run04.sql
similarity index 100%
rename from
docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/run04.sql
rename to
docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/iceberg/run04.sql
diff --git
a/docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/run05.sql
b/docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/iceberg/run05.sql
similarity index 100%
rename from
docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/run05.sql
rename to
docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/iceberg/run05.sql
diff --git
a/docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/iceberg/run06.sql
b/docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/iceberg/run06.sql
new file mode 100644
index 00000000000..3ac97c50099
--- /dev/null
+++
b/docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/iceberg/run06.sql
@@ -0,0 +1,21 @@
+use demo.test_db;
+
+drop table if exists dangling_delete_after_write;
+create table dangling_delete_after_write (
+ id BIGINT NOT NULL,
+ val STRING)
+USING iceberg
+TBLPROPERTIES (
+ 'format' = 'iceberg/parquet',
+ 'format-version' = '2',
+ 'identifier-fields' = '[id]',
+ 'upsert-enabled' = 'true',
+ 'write.delete.mode' = 'merge-on-read',
+ 'write.parquet.compression-codec' = 'zstd',
+ 'write.update.mode' = 'merge-on-read',
+ 'write.upsert.enabled' = 'true');
+
+insert into dangling_delete_after_write values(1, 'abd');
+update dangling_delete_after_write set val = 'def' where id = 1;
+call demo.system.rewrite_data_files(table =>
'demo.test_db.dangling_delete_after_write', options => map('min-input-files',
'1'));
+insert into dangling_delete_after_write values(2, 'xyz');
\ No newline at end of file
diff --git
a/docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/run06.sql
b/docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/paimon/run01.sql
similarity index 100%
rename from
docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/run06.sql
rename to
docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/paimon/run01.sql
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergScanNode.java
b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergScanNode.java
index 56dda7b4fe2..f7b58158d1a 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergScanNode.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergScanNode.java
@@ -384,13 +384,15 @@ public class IcebergScanNode extends FileQueryScanNode {
return 0;
}
+ // `TOTAL_POSITION_DELETES` also needs to be 0,
+ // to prevent the 'dangling delete' problem after `rewrite_data_files`
+ // ref:
https://iceberg.apache.org/docs/nightly/spark-procedures/#rewrite_position_delete_files
Map<String, String> summary = snapshot.summary();
- if (summary.get(IcebergUtils.TOTAL_EQUALITY_DELETES).equals("0")) {
- return Long.parseLong(summary.get(IcebergUtils.TOTAL_RECORDS))
- -
Long.parseLong(summary.get(IcebergUtils.TOTAL_POSITION_DELETES));
- } else {
+ if (!summary.get(IcebergUtils.TOTAL_EQUALITY_DELETES).equals("0")
+ ||
!summary.get(IcebergUtils.TOTAL_POSITION_DELETES).equals("0")) {
return -1;
}
+ return Long.parseLong(summary.get(IcebergUtils.TOTAL_RECORDS));
}
@Override
diff --git
a/regression-test/data/external_table_p0/iceberg/test_iceberg_optimize_count.out
b/regression-test/data/external_table_p0/iceberg/test_iceberg_optimize_count.out
index f2e945f9cec..ec9129a00d2 100644
---
a/regression-test/data/external_table_p0/iceberg/test_iceberg_optimize_count.out
+++
b/regression-test/data/external_table_p0/iceberg/test_iceberg_optimize_count.out
@@ -23,3 +23,6 @@
-- !q08 --
1000
+-- !q09 --
+2
+
diff --git
a/regression-test/suites/external_table_p0/iceberg/test_iceberg_optimize_count.groovy
b/regression-test/suites/external_table_p0/iceberg/test_iceberg_optimize_count.groovy
index 927d442b8dd..4d74e1406e7 100644
---
a/regression-test/suites/external_table_p0/iceberg/test_iceberg_optimize_count.groovy
+++
b/regression-test/suites/external_table_p0/iceberg/test_iceberg_optimize_count.groovy
@@ -70,7 +70,7 @@ suite("test_iceberg_optimize_count",
"p0,external,doris,external_docker,external
}
explain {
sql("""${sqlstr4}""")
- contains """pushdown agg=COUNT (1000)"""
+ contains """pushdown agg=COUNT (-1)"""
}
// don't use push down count
@@ -98,6 +98,17 @@ suite("test_iceberg_optimize_count",
"p0,external,doris,external_docker,external
contains """pushdown agg=NONE"""
}
+ // There is a `dangling delete` after the rewrite
+ sql """ set enable_count_push_down_for_external_table=true; """
+ sqlstr5 = """ select count(*) from
${catalog_name}.test_db.dangling_delete_after_write; """
+
+ qt_q09 """${sqlstr5}"""
+
+ explain {
+ sql("""${sqlstr5}""")
+ contains """pushdown agg=COUNT (-1)"""
+ }
+
} finally {
sql """ set enable_count_push_down_for_external_table=true; """
sql """drop catalog if exists ${catalog_name}"""
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]