This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 1c272bc7c0c [fix](iceberg) only enable dynamic partition pruning for 
identity partitions in Iceberg (#58033)
1c272bc7c0c is described below

commit 1c272bc7c0c327852ecf7d1887b2d9db35e44a58
Author: Socrates <[email protected]>
AuthorDate: Thu Nov 20 15:58:15 2025 +0800

    [fix](iceberg) only enable dynamic partition pruning for identity 
partitions in Iceberg (#58033)
    
    ### What problem does this PR solve?
    Problem Summary:
    Currently, the dynamic partition pruning feature for Iceberg tables has
    the following issues:
    
    1. **Does not consider partition evolution**: When processing partition
    values, the code doesn't account for partition evolution scenarios where
    different files may use different partition specs (specId). This can
    lead to incorrect partition information being passed to BE.
    
    2. **Processes all partition transform types**: The current
    implementation processes all partition transform types (identity, day,
    bucket, truncate, etc.) and passes partition information to BE for
    dynamic partition pruning. However, for non-identity transforms (e.g.,
    `day(ts)`, `bucket(16, id)`, `truncate(3, name)`), the partition values
    are transformed values, not the original column values, which can cause
    incorrect filtering behavior.
---
 .../create_preinstalled_scripts/iceberg/run19.sql  |  87 +++++++++++++++++
 .../doris/datasource/iceberg/IcebergUtils.java     |  38 +++++++-
 .../datasource/iceberg/source/IcebergScanNode.java |  18 +++-
 ..._runtime_filter_partition_pruning_transform.out | 108 +++++++++++++++++++++
 ...ntime_filter_partition_pruning_transform.groovy |  98 +++++++++++++++++++
 .../test_iceberg_write_transform_partitions.groovy |   6 ++
 6 files changed, 350 insertions(+), 5 deletions(-)

diff --git 
a/docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/iceberg/run19.sql
 
b/docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/iceberg/run19.sql
index 42451e6c964..d019b89223c 100644
--- 
a/docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/iceberg/run19.sql
+++ 
b/docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/iceberg/run19.sql
@@ -364,6 +364,93 @@ VALUES (1, 'z', 0.00),
     (9, 'big', 9999999.99),
     (10, 'null', NULL);
 
+-- =============================================
+-- Time transform partition coverage (day/year/month/hour)
+-- =============================================
+
+-- Day transform by TIMESTAMP
+CREATE TABLE day_partitioned (
+    id BIGINT,
+    name STRING,
+    ts TIMESTAMP
+) USING ICEBERG PARTITIONED BY (day(ts));
+
+INSERT INTO day_partitioned VALUES
+    (1, 'd1', TIMESTAMP '2024-01-15 08:00:00'),
+    (2, 'd2', TIMESTAMP '2024-01-15 12:30:00'),
+    (3, 'd3', TIMESTAMP '2024-01-15 23:59:59'),
+    (4, 'd4', TIMESTAMP '2024-01-16 00:00:00'),
+    (5, 'd5', TIMESTAMP '2024-01-16 10:00:00'),
+    (6, 'd6', TIMESTAMP '2024-02-29 12:00:00'),
+    (7, 'd7', TIMESTAMP '2024-12-31 23:59:59'),
+    (8, 'null', NULL);
+
+-- Year transform by TIMESTAMP
+CREATE TABLE year_partitioned (
+    id BIGINT,
+    name STRING,
+    ts TIMESTAMP
+) USING ICEBERG PARTITIONED BY (year(ts));
+
+INSERT INTO year_partitioned VALUES
+    (1, 'y1', TIMESTAMP '2023-01-01 00:00:00'),
+    (2, 'y2', TIMESTAMP '2023-06-15 12:00:00'),
+    (3, 'y3', TIMESTAMP '2023-12-31 23:59:59'),
+    (4, 'y4', TIMESTAMP '2024-01-01 00:00:00'),
+    (5, 'y5', TIMESTAMP '2024-06-15 12:00:00'),
+    (6, 'y6', TIMESTAMP '2024-12-31 23:59:59'),
+    (7, 'y7', TIMESTAMP '2025-01-01 00:00:00'),
+    (8, 'null', NULL);
+
+-- Month transform by TIMESTAMP
+CREATE TABLE month_partitioned (
+    id BIGINT,
+    name STRING,
+    ts TIMESTAMP
+) USING ICEBERG PARTITIONED BY (month(ts));
+
+INSERT INTO month_partitioned VALUES
+    (1, 'm1', TIMESTAMP '2024-01-01 00:00:00'),
+    (2, 'm2', TIMESTAMP '2024-01-15 12:00:00'),
+    (3, 'm3', TIMESTAMP '2024-01-31 23:59:59'),
+    (4, 'm4', TIMESTAMP '2024-02-01 00:00:00'),
+    (5, 'm5', TIMESTAMP '2024-02-15 12:00:00'),
+    (6, 'm6', TIMESTAMP '2024-02-29 23:59:59'),
+    (7, 'm7', TIMESTAMP '2024-12-01 00:00:00'),
+    (8, 'm8', TIMESTAMP '2024-12-31 23:59:59'),
+    (9, 'null', NULL);
+
+-- Hour transform by TIMESTAMP
+CREATE TABLE hour_partitioned (
+    id BIGINT,
+    name STRING,
+    ts TIMESTAMP
+) USING ICEBERG PARTITIONED BY (hour(ts));
+
+INSERT INTO hour_partitioned VALUES
+    (1, 'h1', TIMESTAMP '2024-01-15 10:00:00'),
+    (2, 'h2', TIMESTAMP '2024-01-15 10:30:00'),
+    (3, 'h3', TIMESTAMP '2024-01-15 10:59:59'),
+    (4, 'h4', TIMESTAMP '2024-01-15 11:00:00'),
+    (5, 'h5', TIMESTAMP '2024-01-15 11:30:00'),
+    (6, 'h6', TIMESTAMP '2024-01-15 23:00:00'),
+    (7, 'h7', TIMESTAMP '2024-01-15 23:59:59'),
+    (8, 'h8', TIMESTAMP '2024-01-16 00:00:00'),
+    (9, 'null', NULL);
+
+-- Create _copy tables for write testing
+CREATE TABLE day_partitioned_copy AS
+SELECT * FROM day_partitioned;
+
+CREATE TABLE year_partitioned_copy AS
+SELECT * FROM year_partitioned;
+
+CREATE TABLE month_partitioned_copy AS
+SELECT * FROM month_partitioned;
+
+CREATE TABLE hour_partitioned_copy AS
+SELECT * FROM hour_partitioned;
+
 -- create table for testing query partitions with snapshot has been expired
 
 CREATE TABLE test_partitions_with_expired_snapshot (
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/IcebergUtils.java
 
b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/IcebergUtils.java
index f1c2b73b279..9587ca4f816 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/IcebergUtils.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/IcebergUtils.java
@@ -609,11 +609,47 @@ public class IcebergUtils {
         }
     }
 
-    public static Map<String, String> getPartitionInfoMap(PartitionData 
partitionData, String timeZone) {
+    /**
+     * Get partition info map for identity partitions only, considering 
partition
+     * evolution.
+     * For non-identity partitions (e.g., day, bucket, truncate), returns null 
to
+     * skip
+     * dynamic partition pruning.
+     *
+     * @param partitionData The partition data from the file
+     * @param partitionSpec The partition spec corresponding to the file's 
specId
+     *                      (required)
+     * @param timeZone      The time zone for timestamp serialization
+     * @return Map of partition field name to partition value string, or null 
if
+     *         there are non-identity partitions
+     */
+    public static Map<String, String> getPartitionInfoMap(PartitionData 
partitionData, PartitionSpec partitionSpec,
+            String timeZone) {
         Map<String, String> partitionInfoMap = new HashMap<>();
         List<NestedField> fields = 
partitionData.getPartitionType().asNestedType().fields();
+
+        // Check if all partition fields are identity transform
+        // If any field is not identity, return null to skip dynamic partition 
pruning
+        List<PartitionField> partitionFields = partitionSpec.fields();
+        Preconditions.checkArgument(fields.size() == partitionFields.size(),
+                "PartitionData fields size does not match PartitionSpec fields 
size");
+
         for (int i = 0; i < fields.size(); i++) {
             NestedField field = fields.get(i);
+            PartitionField partitionField = partitionFields.get(i);
+
+            // Only process identity transform partitions
+            // For other transforms (day, bucket, truncate, etc.), skip 
dynamic partition
+            // pruning
+            if (!partitionField.transform().isIdentity()) {
+                if (LOG.isDebugEnabled()) {
+                    LOG.debug(
+                            "Skip dynamic partition pruning for non-identity 
partition field: {} with transform: {}",
+                            field.name(), 
partitionField.transform().toString());
+                }
+                return null;
+            }
+
             Object value = partitionData.get(i);
             try {
                 String partitionString = serializePartitionValue(field.type(), 
value, timeZone);
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergScanNode.java
 
b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergScanNode.java
index 2ad596fa9a3..f5208397a0f 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergScanNode.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergScanNode.java
@@ -64,6 +64,7 @@ import org.apache.iceberg.FileScanTask;
 import org.apache.iceberg.ManifestFile;
 import org.apache.iceberg.MetadataColumns;
 import org.apache.iceberg.PartitionData;
+import org.apache.iceberg.PartitionSpec;
 import org.apache.iceberg.Snapshot;
 import org.apache.iceberg.Table;
 import org.apache.iceberg.TableScan;
@@ -382,10 +383,19 @@ public class IcebergScanNode extends FileQueryScanNode {
         if (isPartitionedTable) {
             PartitionData partitionData = (PartitionData) 
fileScanTask.file().partition();
             if (sessionVariable.isEnableRuntimeFilterPartitionPrune()) {
-                // If the partition data is not in the map, we need to 
calculate the partition
-                Map<String, String> partitionInfoMap = 
partitionMapInfos.computeIfAbsent(partitionData, k -> {
-                    return IcebergUtils.getPartitionInfoMap(partitionData, 
sessionVariable.getTimeZone());
-                });
+                // Get specId and corresponding PartitionSpec to handle 
partition evolution
+                int specId = fileScanTask.file().specId();
+                PartitionSpec partitionSpec = icebergTable.specs().get(specId);
+
+                Preconditions.checkNotNull(partitionSpec, "Partition spec with 
specId %s not found for table %s",
+                        specId, icebergTable.name());
+                Map<String, String> partitionInfoMap = 
partitionMapInfos.computeIfAbsent(
+                        partitionData, k -> {
+                            return 
IcebergUtils.getPartitionInfoMap(partitionData, partitionSpec,
+                                    sessionVariable.getTimeZone());
+                        });
+                // Only set partition values if all partitions are identity 
transform
+                // For non-identity partitions, getPartitionInfoMap returns 
null to skip dynamic partition pruning
                 if (partitionInfoMap != null) {
                     split.setIcebergPartitionValues(partitionInfoMap);
                 }
diff --git 
a/regression-test/data/external_table_p0/iceberg/test_iceberg_runtime_filter_partition_pruning_transform.out
 
b/regression-test/data/external_table_p0/iceberg/test_iceberg_runtime_filter_partition_pruning_transform.out
index 1c26361afa5..fb18cc11885 100644
--- 
a/regression-test/data/external_table_p0/iceberg/test_iceberg_runtime_filter_partition_pruning_transform.out
+++ 
b/regression-test/data/external_table_p0/iceberg/test_iceberg_runtime_filter_partition_pruning_transform.out
@@ -71,6 +71,60 @@
 -- !trunc_decimal_in --
 2
 
+-- !day_eq --
+1
+
+-- !day_in --
+2
+
+-- !day_range_ge --
+7
+
+-- !day_range_lt --
+3
+
+-- !day_range_between --
+3
+
+-- !day_range_multi --
+5
+
+-- !year_eq --
+1
+
+-- !year_in --
+2
+
+-- !year_range --
+3
+
+-- !year_range_cross --
+3
+
+-- !month_eq --
+1
+
+-- !month_in --
+2
+
+-- !month_range --
+3
+
+-- !month_range_multi --
+6
+
+-- !hour_eq --
+1
+
+-- !hour_in --
+2
+
+-- !hour_range --
+3
+
+-- !hour_range_multi --
+5
+
 -- !bucket_int_eq --
 1
 
@@ -143,3 +197,57 @@
 -- !trunc_decimal_in --
 2
 
+-- !day_eq --
+1
+
+-- !day_in --
+2
+
+-- !day_range_ge --
+7
+
+-- !day_range_lt --
+3
+
+-- !day_range_between --
+3
+
+-- !day_range_multi --
+5
+
+-- !year_eq --
+1
+
+-- !year_in --
+2
+
+-- !year_range --
+3
+
+-- !year_range_cross --
+3
+
+-- !month_eq --
+1
+
+-- !month_in --
+2
+
+-- !month_range --
+3
+
+-- !month_range_multi --
+6
+
+-- !hour_eq --
+1
+
+-- !hour_in --
+2
+
+-- !hour_range --
+3
+
+-- !hour_range_multi --
+5
+
diff --git 
a/regression-test/suites/external_table_p0/iceberg/test_iceberg_runtime_filter_partition_pruning_transform.groovy
 
b/regression-test/suites/external_table_p0/iceberg/test_iceberg_runtime_filter_partition_pruning_transform.groovy
index 63a723ebe9e..7a2dbcfab92 100644
--- 
a/regression-test/suites/external_table_p0/iceberg/test_iceberg_runtime_filter_partition_pruning_transform.groovy
+++ 
b/regression-test/suites/external_table_p0/iceberg/test_iceberg_runtime_filter_partition_pruning_transform.groovy
@@ -201,6 +201,104 @@ 
suite("test_iceberg_runtime_filter_partition_pruning_transform", "p0,external,do
                  group by partition_key having count(*) > 0
                  order by partition_key desc limit 2);
         """
+
+        // Time transform partitions (day/year/month/hour)
+        // Note: runtime filter partition pruning is only applied to identity 
+        // transforms; for time transforms (like bucket/truncate) pruning is 
+        // skipped, and these queries verify results remain correct.
+        
+        // Day transform tests
+        qt_day_eq """
+            select count(*) from day_partitioned where ts =
+                (select ts from day_partitioned
+                 group by ts having count(*) > 0
+                 order by ts desc limit 1);
+        """
+        qt_day_in """
+            select count(*) from day_partitioned where ts in
+                (select ts from day_partitioned
+                 group by ts having count(*) > 0
+                 order by ts desc limit 2);
+        """
+        qt_day_range_ge """
+            select count(*) from day_partitioned where ts >= '2024-01-15 
00:00:00';
+        """
+        qt_day_range_lt """
+            select count(*) from day_partitioned where ts < '2024-01-16 
00:00:00';
+        """
+        qt_day_range_between """
+            select count(*) from day_partitioned where ts >= '2024-01-15 
00:00:00' 
+                and ts < '2024-01-16 00:00:00';
+        """
+        qt_day_range_multi """
+            select count(*) from day_partitioned where ts >= '2024-01-15 
00:00:00' 
+                and ts < '2024-01-17 00:00:00';
+        """
+
+        // Year transform tests
+        qt_year_eq """
+            select count(*) from year_partitioned where ts =
+                (select ts from year_partitioned
+                 group by ts having count(*) > 0
+                 order by ts desc limit 1);
+        """
+        qt_year_in """
+            select count(*) from year_partitioned where ts in
+                (select ts from year_partitioned
+                 group by ts having count(*) > 0
+                 order by ts desc limit 2);
+        """
+        qt_year_range """
+            select count(*) from year_partitioned where ts >= '2024-01-01 
00:00:00' 
+                and ts < '2025-01-01 00:00:00';
+        """
+        qt_year_range_cross """
+            select count(*) from year_partitioned where ts >= '2023-06-01 
00:00:00' 
+                and ts < '2024-06-01 00:00:00';
+        """
+
+        // Month transform tests
+        qt_month_eq """
+            select count(*) from month_partitioned where ts =
+                (select ts from month_partitioned
+                 group by ts having count(*) > 0
+                 order by ts desc limit 1);
+        """
+        qt_month_in """
+            select count(*) from month_partitioned where ts in
+                (select ts from month_partitioned
+                 group by ts having count(*) > 0
+                 order by ts desc limit 2);
+        """
+        qt_month_range """
+            select count(*) from month_partitioned where ts >= '2024-01-01 
00:00:00' 
+                and ts < '2024-02-01 00:00:00';
+        """
+        qt_month_range_multi """
+            select count(*) from month_partitioned where ts >= '2024-01-01 
00:00:00' 
+                and ts < '2024-03-01 00:00:00';
+        """
+
+        // Hour transform tests
+        qt_hour_eq """
+            select count(*) from hour_partitioned where ts =
+                (select ts from hour_partitioned
+                 group by ts having count(*) > 0
+                 order by ts desc limit 1);
+        """
+        qt_hour_in """
+            select count(*) from hour_partitioned where ts in
+                (select ts from hour_partitioned
+                 group by ts having count(*) > 0
+                 order by ts desc limit 2);
+        """
+        qt_hour_range """
+            select count(*) from hour_partitioned where ts >= '2024-01-15 
10:00:00' 
+                and ts < '2024-01-15 11:00:00';
+        """
+        qt_hour_range_multi """
+            select count(*) from hour_partitioned where ts >= '2024-01-15 
10:00:00' 
+                and ts < '2024-01-15 12:00:00';
+        """
     }
     try {
         sql """ set time_zone = 'Asia/Shanghai'; """
diff --git 
a/regression-test/suites/external_table_p0/iceberg/write/test_iceberg_write_transform_partitions.groovy
 
b/regression-test/suites/external_table_p0/iceberg/write/test_iceberg_write_transform_partitions.groovy
index 2662de4dfda..2d8e265222e 100644
--- 
a/regression-test/suites/external_table_p0/iceberg/write/test_iceberg_write_transform_partitions.groovy
+++ 
b/regression-test/suites/external_table_p0/iceberg/write/test_iceberg_write_transform_partitions.groovy
@@ -64,6 +64,12 @@ suite("test_iceberg_write_transform_partitions", 
"p0,external,iceberg,external_d
         test_write_transform_partitions("truncate_int_10");
         test_write_transform_partitions("truncate_bigint_100");
         test_write_transform_partitions("truncate_decimal_10");
+        
+        // Time transform partitions (day/year/month/hour)
+        test_write_transform_partitions("day_partitioned");
+        test_write_transform_partitions("year_partitioned");
+        test_write_transform_partitions("month_partitioned");
+        test_write_transform_partitions("hour_partitioned");
     } finally {
         sql """ unset variable time_zone; """
     }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to