This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 1c272bc7c0c [fix](iceberg) only enable dynamic partition pruning for identity partitions in Iceberg (#58033)
1c272bc7c0c is described below
commit 1c272bc7c0c327852ecf7d1887b2d9db35e44a58
Author: Socrates <[email protected]>
AuthorDate: Thu Nov 20 15:58:15 2025 +0800
[fix](iceberg) only enable dynamic partition pruning for identity partitions in Iceberg (#58033)
### What problem does this PR solve?
Problem Summary:
Currently, the dynamic partition pruning feature for Iceberg tables has the following issues:

1. **Does not consider partition evolution**: When processing partition values, the code does not account for partition evolution, where different files in the same table may have been written with different partition specs (specId). This can lead to incorrect partition information being passed to BE.
2. **Processes all partition transform types**: The current implementation processes all partition transform types (identity, day, bucket, truncate, etc.) and passes the partition information to BE for dynamic partition pruning. However, for non-identity transforms (e.g., `day(ts)`, `bucket(16, id)`, `truncate(3, name)`), the partition values are transformed values, not the original column values, which can cause incorrect filtering. See the sketch after this list.
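To make the intended check concrete, here is a minimal, illustrative Java sketch, not the committed code: the helper name `canPruneWithPartitionValues` is hypothetical, while `Table.specs()`, `DataFile.specId()`, `PartitionSpec.fields()`, and `Transform.isIdentity()` are Iceberg Java APIs assumed to behave as documented.

```java
import java.util.Map;

import org.apache.iceberg.DataFile;
import org.apache.iceberg.PartitionField;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Table;

// Illustrative helper only: decide whether a file's partition values can be
// safely used for dynamic partition pruning.
final class IdentityPruneCheck {
    static boolean canPruneWithPartitionValues(Table table, DataFile file) {
        // Partition evolution: each data file records the specId it was written with,
        // so look up that spec instead of assuming the table's current one.
        Map<Integer, PartitionSpec> specs = table.specs();
        PartitionSpec spec = specs.get(file.specId());
        if (spec == null) {
            // Unknown spec: be conservative and skip pruning.
            return false;
        }
        for (PartitionField field : spec.fields()) {
            // day(ts), bucket(16, id), truncate(3, name), ... store transformed values,
            // not the original column values, so pruning on them could filter incorrectly.
            if (!field.transform().isIdentity()) {
                return false;
            }
        }
        return true;
    }
}
```

The actual change below follows the same idea: `IcebergScanNode` resolves the file's `PartitionSpec` from its `specId`, and `IcebergUtils.getPartitionInfoMap` returns `null` (skipping pruning) unless every partition field uses the identity transform.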
---
.../create_preinstalled_scripts/iceberg/run19.sql | 87 +++++++++++++++++
.../doris/datasource/iceberg/IcebergUtils.java | 38 +++++++-
.../datasource/iceberg/source/IcebergScanNode.java | 18 +++-
..._runtime_filter_partition_pruning_transform.out | 108 +++++++++++++++++++++
...ntime_filter_partition_pruning_transform.groovy | 98 +++++++++++++++++++
.../test_iceberg_write_transform_partitions.groovy | 6 ++
6 files changed, 350 insertions(+), 5 deletions(-)
diff --git a/docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/iceberg/run19.sql b/docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/iceberg/run19.sql
index 42451e6c964..d019b89223c 100644
--- a/docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/iceberg/run19.sql
+++ b/docker/thirdparties/docker-compose/iceberg/scripts/create_preinstalled_scripts/iceberg/run19.sql
@@ -364,6 +364,93 @@ VALUES (1, 'z', 0.00),
(9, 'big', 9999999.99),
(10, 'null', NULL);
+-- =============================================
+-- Time transform partition coverage (day/year/month/hour)
+-- =============================================
+
+-- Day transform by TIMESTAMP
+CREATE TABLE day_partitioned (
+ id BIGINT,
+ name STRING,
+ ts TIMESTAMP
+) USING ICEBERG PARTITIONED BY (day(ts));
+
+INSERT INTO day_partitioned VALUES
+ (1, 'd1', TIMESTAMP '2024-01-15 08:00:00'),
+ (2, 'd2', TIMESTAMP '2024-01-15 12:30:00'),
+ (3, 'd3', TIMESTAMP '2024-01-15 23:59:59'),
+ (4, 'd4', TIMESTAMP '2024-01-16 00:00:00'),
+ (5, 'd5', TIMESTAMP '2024-01-16 10:00:00'),
+ (6, 'd6', TIMESTAMP '2024-02-29 12:00:00'),
+ (7, 'd7', TIMESTAMP '2024-12-31 23:59:59'),
+ (8, 'null', NULL);
+
+-- Year transform by TIMESTAMP
+CREATE TABLE year_partitioned (
+ id BIGINT,
+ name STRING,
+ ts TIMESTAMP
+) USING ICEBERG PARTITIONED BY (year(ts));
+
+INSERT INTO year_partitioned VALUES
+ (1, 'y1', TIMESTAMP '2023-01-01 00:00:00'),
+ (2, 'y2', TIMESTAMP '2023-06-15 12:00:00'),
+ (3, 'y3', TIMESTAMP '2023-12-31 23:59:59'),
+ (4, 'y4', TIMESTAMP '2024-01-01 00:00:00'),
+ (5, 'y5', TIMESTAMP '2024-06-15 12:00:00'),
+ (6, 'y6', TIMESTAMP '2024-12-31 23:59:59'),
+ (7, 'y7', TIMESTAMP '2025-01-01 00:00:00'),
+ (8, 'null', NULL);
+
+-- Month transform by TIMESTAMP
+CREATE TABLE month_partitioned (
+ id BIGINT,
+ name STRING,
+ ts TIMESTAMP
+) USING ICEBERG PARTITIONED BY (month(ts));
+
+INSERT INTO month_partitioned VALUES
+ (1, 'm1', TIMESTAMP '2024-01-01 00:00:00'),
+ (2, 'm2', TIMESTAMP '2024-01-15 12:00:00'),
+ (3, 'm3', TIMESTAMP '2024-01-31 23:59:59'),
+ (4, 'm4', TIMESTAMP '2024-02-01 00:00:00'),
+ (5, 'm5', TIMESTAMP '2024-02-15 12:00:00'),
+ (6, 'm6', TIMESTAMP '2024-02-29 23:59:59'),
+ (7, 'm7', TIMESTAMP '2024-12-01 00:00:00'),
+ (8, 'm8', TIMESTAMP '2024-12-31 23:59:59'),
+ (9, 'null', NULL);
+
+-- Hour transform by TIMESTAMP
+CREATE TABLE hour_partitioned (
+ id BIGINT,
+ name STRING,
+ ts TIMESTAMP
+) USING ICEBERG PARTITIONED BY (hour(ts));
+
+INSERT INTO hour_partitioned VALUES
+ (1, 'h1', TIMESTAMP '2024-01-15 10:00:00'),
+ (2, 'h2', TIMESTAMP '2024-01-15 10:30:00'),
+ (3, 'h3', TIMESTAMP '2024-01-15 10:59:59'),
+ (4, 'h4', TIMESTAMP '2024-01-15 11:00:00'),
+ (5, 'h5', TIMESTAMP '2024-01-15 11:30:00'),
+ (6, 'h6', TIMESTAMP '2024-01-15 23:00:00'),
+ (7, 'h7', TIMESTAMP '2024-01-15 23:59:59'),
+ (8, 'h8', TIMESTAMP '2024-01-16 00:00:00'),
+ (9, 'null', NULL);
+
+-- Create _copy tables for write testing
+CREATE TABLE day_partitioned_copy AS
+SELECT * FROM day_partitioned;
+
+CREATE TABLE year_partitioned_copy AS
+SELECT * FROM year_partitioned;
+
+CREATE TABLE month_partitioned_copy AS
+SELECT * FROM month_partitioned;
+
+CREATE TABLE hour_partitioned_copy AS
+SELECT * FROM hour_partitioned;
+
-- create table for testing query partitions with snapshot has been expired
CREATE TABLE test_partitions_with_expired_snapshot (
diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/IcebergUtils.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/IcebergUtils.java
index f1c2b73b279..9587ca4f816 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/IcebergUtils.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/IcebergUtils.java
@@ -609,11 +609,47 @@ public class IcebergUtils {
}
}
-    public static Map<String, String> getPartitionInfoMap(PartitionData partitionData, String timeZone) {
+    /**
+     * Get partition info map for identity partitions only, considering partition evolution.
+     * For non-identity partitions (e.g., day, bucket, truncate), returns null to skip
+     * dynamic partition pruning.
+     *
+     * @param partitionData The partition data from the file
+     * @param partitionSpec The partition spec corresponding to the file's specId (required)
+     * @param timeZone      The time zone for timestamp serialization
+     * @return Map of partition field name to partition value string, or null if
+     *         there are non-identity partitions
+     */
+    public static Map<String, String> getPartitionInfoMap(PartitionData partitionData, PartitionSpec partitionSpec,
+            String timeZone) {
         Map<String, String> partitionInfoMap = new HashMap<>();
         List<NestedField> fields = partitionData.getPartitionType().asNestedType().fields();
+
+        // Check if all partition fields are identity transform
+        // If any field is not identity, return null to skip dynamic partition pruning
+        List<PartitionField> partitionFields = partitionSpec.fields();
+        Preconditions.checkArgument(fields.size() == partitionFields.size(),
+                "PartitionData fields size does not match PartitionSpec fields size");
+
         for (int i = 0; i < fields.size(); i++) {
             NestedField field = fields.get(i);
+            PartitionField partitionField = partitionFields.get(i);
+
+            // Only process identity transform partitions
+            // For other transforms (day, bucket, truncate, etc.), skip dynamic partition pruning
+            if (!partitionField.transform().isIdentity()) {
+                if (LOG.isDebugEnabled()) {
+                    LOG.debug(
+                            "Skip dynamic partition pruning for non-identity partition field: {} with transform: {}",
+                            field.name(), partitionField.transform().toString());
+                }
+                return null;
+            }
+
             Object value = partitionData.get(i);
             try {
                 String partitionString = serializePartitionValue(field.type(), value, timeZone);
diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergScanNode.java
index 2ad596fa9a3..f5208397a0f 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergScanNode.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/source/IcebergScanNode.java
@@ -64,6 +64,7 @@ import org.apache.iceberg.FileScanTask;
import org.apache.iceberg.ManifestFile;
import org.apache.iceberg.MetadataColumns;
import org.apache.iceberg.PartitionData;
+import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Snapshot;
import org.apache.iceberg.Table;
import org.apache.iceberg.TableScan;
@@ -382,10 +383,19 @@ public class IcebergScanNode extends FileQueryScanNode {
if (isPartitionedTable) {
             PartitionData partitionData = (PartitionData) fileScanTask.file().partition();
             if (sessionVariable.isEnableRuntimeFilterPartitionPrune()) {
-                // If the partition data is not in the map, we need to calculate the partition
-                Map<String, String> partitionInfoMap = partitionMapInfos.computeIfAbsent(partitionData, k -> {
-                    return IcebergUtils.getPartitionInfoMap(partitionData, sessionVariable.getTimeZone());
-                });
+                // Get specId and corresponding PartitionSpec to handle partition evolution
+                int specId = fileScanTask.file().specId();
+                PartitionSpec partitionSpec = icebergTable.specs().get(specId);
+
+                Preconditions.checkNotNull(partitionSpec, "Partition spec with specId %s not found for table %s",
+                        specId, icebergTable.name());
+                Map<String, String> partitionInfoMap = partitionMapInfos.computeIfAbsent(
+                        partitionData, k -> {
+                            return IcebergUtils.getPartitionInfoMap(partitionData, partitionSpec,
+                                    sessionVariable.getTimeZone());
+                        });
+                // Only set partition values if all partitions are identity transform
+                // For non-identity partitions, getPartitionInfoMap returns null to skip dynamic partition pruning
if (partitionInfoMap != null) {
split.setIcebergPartitionValues(partitionInfoMap);
}
diff --git a/regression-test/data/external_table_p0/iceberg/test_iceberg_runtime_filter_partition_pruning_transform.out b/regression-test/data/external_table_p0/iceberg/test_iceberg_runtime_filter_partition_pruning_transform.out
index 1c26361afa5..fb18cc11885 100644
--- a/regression-test/data/external_table_p0/iceberg/test_iceberg_runtime_filter_partition_pruning_transform.out
+++ b/regression-test/data/external_table_p0/iceberg/test_iceberg_runtime_filter_partition_pruning_transform.out
@@ -71,6 +71,60 @@
-- !trunc_decimal_in --
2
+-- !day_eq --
+1
+
+-- !day_in --
+2
+
+-- !day_range_ge --
+7
+
+-- !day_range_lt --
+3
+
+-- !day_range_between --
+3
+
+-- !day_range_multi --
+5
+
+-- !year_eq --
+1
+
+-- !year_in --
+2
+
+-- !year_range --
+3
+
+-- !year_range_cross --
+3
+
+-- !month_eq --
+1
+
+-- !month_in --
+2
+
+-- !month_range --
+3
+
+-- !month_range_multi --
+6
+
+-- !hour_eq --
+1
+
+-- !hour_in --
+2
+
+-- !hour_range --
+3
+
+-- !hour_range_multi --
+5
+
-- !bucket_int_eq --
1
@@ -143,3 +197,57 @@
-- !trunc_decimal_in --
2
+-- !day_eq --
+1
+
+-- !day_in --
+2
+
+-- !day_range_ge --
+7
+
+-- !day_range_lt --
+3
+
+-- !day_range_between --
+3
+
+-- !day_range_multi --
+5
+
+-- !year_eq --
+1
+
+-- !year_in --
+2
+
+-- !year_range --
+3
+
+-- !year_range_cross --
+3
+
+-- !month_eq --
+1
+
+-- !month_in --
+2
+
+-- !month_range --
+3
+
+-- !month_range_multi --
+6
+
+-- !hour_eq --
+1
+
+-- !hour_in --
+2
+
+-- !hour_range --
+3
+
+-- !hour_range_multi --
+5
+
diff --git a/regression-test/suites/external_table_p0/iceberg/test_iceberg_runtime_filter_partition_pruning_transform.groovy b/regression-test/suites/external_table_p0/iceberg/test_iceberg_runtime_filter_partition_pruning_transform.groovy
index 63a723ebe9e..7a2dbcfab92 100644
--- a/regression-test/suites/external_table_p0/iceberg/test_iceberg_runtime_filter_partition_pruning_transform.groovy
+++ b/regression-test/suites/external_table_p0/iceberg/test_iceberg_runtime_filter_partition_pruning_transform.groovy
@@ -201,6 +201,104 @@ suite("test_iceberg_runtime_filter_partition_pruning_transform", "p0,external,do
group by partition_key having count(*) > 0
order by partition_key desc limit 2);
"""
+
+    // Time transform partitions (day/year/month/hour)
+    // Note: Bucket and Truncate transforms are not supported for runtime filter partition pruning,
+    // but Day/Year/Month/Hour transforms are supported.
+
+    // Day transform tests
+    qt_day_eq """
+        select count(*) from day_partitioned where ts =
+            (select ts from day_partitioned
+            group by ts having count(*) > 0
+            order by ts desc limit 1);
+    """
+    qt_day_in """
+        select count(*) from day_partitioned where ts in
+            (select ts from day_partitioned
+            group by ts having count(*) > 0
+            order by ts desc limit 2);
+    """
+    qt_day_range_ge """
+        select count(*) from day_partitioned where ts >= '2024-01-15 00:00:00';
+    """
+    qt_day_range_lt """
+        select count(*) from day_partitioned where ts < '2024-01-16 00:00:00';
+    """
+    qt_day_range_between """
+        select count(*) from day_partitioned where ts >= '2024-01-15 00:00:00'
+            and ts < '2024-01-16 00:00:00';
+    """
+    qt_day_range_multi """
+        select count(*) from day_partitioned where ts >= '2024-01-15 00:00:00'
+            and ts < '2024-01-17 00:00:00';
+    """
+
+    // Year transform tests
+    qt_year_eq """
+        select count(*) from year_partitioned where ts =
+            (select ts from year_partitioned
+            group by ts having count(*) > 0
+            order by ts desc limit 1);
+    """
+    qt_year_in """
+        select count(*) from year_partitioned where ts in
+            (select ts from year_partitioned
+            group by ts having count(*) > 0
+            order by ts desc limit 2);
+    """
+    qt_year_range """
+        select count(*) from year_partitioned where ts >= '2024-01-01 00:00:00'
+            and ts < '2025-01-01 00:00:00';
+    """
+    qt_year_range_cross """
+        select count(*) from year_partitioned where ts >= '2023-06-01 00:00:00'
+            and ts < '2024-06-01 00:00:00';
+    """
+
+    // Month transform tests
+    qt_month_eq """
+        select count(*) from month_partitioned where ts =
+            (select ts from month_partitioned
+            group by ts having count(*) > 0
+            order by ts desc limit 1);
+    """
+    qt_month_in """
+        select count(*) from month_partitioned where ts in
+            (select ts from month_partitioned
+            group by ts having count(*) > 0
+            order by ts desc limit 2);
+    """
+    qt_month_range """
+        select count(*) from month_partitioned where ts >= '2024-01-01 00:00:00'
+            and ts < '2024-02-01 00:00:00';
+    """
+    qt_month_range_multi """
+        select count(*) from month_partitioned where ts >= '2024-01-01 00:00:00'
+            and ts < '2024-03-01 00:00:00';
+    """
+
+    // Hour transform tests
+    qt_hour_eq """
+        select count(*) from hour_partitioned where ts =
+            (select ts from hour_partitioned
+            group by ts having count(*) > 0
+            order by ts desc limit 1);
+    """
+    qt_hour_in """
+        select count(*) from hour_partitioned where ts in
+            (select ts from hour_partitioned
+            group by ts having count(*) > 0
+            order by ts desc limit 2);
+    """
+    qt_hour_range """
+        select count(*) from hour_partitioned where ts >= '2024-01-15 10:00:00'
+            and ts < '2024-01-15 11:00:00';
+    """
+    qt_hour_range_multi """
+        select count(*) from hour_partitioned where ts >= '2024-01-15 10:00:00'
+            and ts < '2024-01-15 12:00:00';
+    """
}
try {
sql """ set time_zone = 'Asia/Shanghai'; """
diff --git a/regression-test/suites/external_table_p0/iceberg/write/test_iceberg_write_transform_partitions.groovy b/regression-test/suites/external_table_p0/iceberg/write/test_iceberg_write_transform_partitions.groovy
index 2662de4dfda..2d8e265222e 100644
--- a/regression-test/suites/external_table_p0/iceberg/write/test_iceberg_write_transform_partitions.groovy
+++ b/regression-test/suites/external_table_p0/iceberg/write/test_iceberg_write_transform_partitions.groovy
@@ -64,6 +64,12 @@ suite("test_iceberg_write_transform_partitions", "p0,external,iceberg,external_d
test_write_transform_partitions("truncate_int_10");
test_write_transform_partitions("truncate_bigint_100");
test_write_transform_partitions("truncate_decimal_10");
+
+ // Time transform partitions (day/year/month/hour)
+ test_write_transform_partitions("day_partitioned");
+ test_write_transform_partitions("year_partitioned");
+ test_write_transform_partitions("month_partitioned");
+ test_write_transform_partitions("hour_partitioned");
} finally {
sql """ unset variable time_zone; """
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]