This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new ebe2d222d78 [fix](hudi) fix querying hudi table with timestamp key
(#53791)
ebe2d222d78 is described below
commit ebe2d222d7834804019c81e650b2f4228cc66984
Author: Socrates <[email protected]>
AuthorDate: Mon Jul 28 11:49:13 2025 +0800
[fix](hudi) fix querying hudi table with timestamp key (#53791)
### What problem does this PR solve?
Problem Summary:
When querying a Hudi table whose partition key is a timestamp column in
Doris, the query fails with the following error:
<img width="1280" height="455" alt="image"
src="https://github.com/user-attachments/assets/1ea05e75-9713-46b0-99e4-95584309a1a2"
/>
Issue Details:
- Hudi tables with timestamp partition columns (e.g., `2023-12-01T08:00`)
store their partition paths in HMS in URL-encoded form.
- Special characters in timestamps, such as `:`, are encoded as `%3A`, and
in some cases are double-encoded as `%253A`.
**The Solution:** The changes unescape partition paths with
`FileUtils.unescapePathName()` from the Hive common library
(`org.apache.hadoop.hive.common`), so that encoded partition values are
decoded correctly.
---
.../hudi/source/HudiCachedPartitionProcessor.java | 6 ++++++
.../hudi/source/HudiPartitionProcessor.java | 9 +++++----
.../hudi/test_hudi_partition_prune.out | Bin 5389 -> 5549 bytes
.../hudi/test_hudi_partition_prune.groovy | 10 +++++-----
4 files changed, 16 insertions(+), 9 deletions(-)
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiCachedPartitionProcessor.java
b/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiCachedPartitionProcessor.java
index 1db39c230a1..6356698c067 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiCachedPartitionProcessor.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiCachedPartitionProcessor.java
@@ -31,6 +31,7 @@ import org.apache.doris.datasource.hive.HMSExternalTable;
import com.github.benmanes.caffeine.cache.LoadingCache;
import com.google.common.base.Preconditions;
import com.google.common.collect.Maps;
+import org.apache.hadoop.hive.common.FileUtils;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
@@ -154,6 +155,11 @@ public class HudiCachedPartitionProcessor extends
HudiPartitionProcessor {
// we can still obtain the partition information through
the HMS API.
partitionNames = catalog.getClient()
.listPartitionNames(table.getRemoteDbName(),
table.getRemoteName());
+ // HMS stored Hudi partition paths may have double
encoding issue (e.g., %3A
+ // becomes %253A), need to unescape first here.
+ partitionNames = partitionNames.stream()
+ .map(FileUtils::unescapePathName)
+ .collect(Collectors.toList());
if (partitionNames.size() == 0) {
LOG.warn("Failed to get partitions from hms api,
switch it from hudi api.");
partitionNames = getAllPartitionNames(tableMetaClient);
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiPartitionProcessor.java
b/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiPartitionProcessor.java
index cb5e2993a56..b1e5bd4a82d 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiPartitionProcessor.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiPartitionProcessor.java
@@ -19,6 +19,7 @@ package org.apache.doris.datasource.hudi.source;
import org.apache.doris.datasource.ExternalTable;
+import org.apache.hadoop.hive.common.FileUtils;
import org.apache.hudi.common.config.HoodieMetadataConfig;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
@@ -98,8 +99,8 @@ public abstract class HudiPartitionProcessor {
} else {
partitionValue = partitionPath;
}
- // TODO: In hive, the specific characters like '=', '/' will
be url encoded
- return Collections.singletonList(partitionValue);
+ // In hive, the specific characters like '=', '/' will be url
encoded
+ return
Collections.singletonList(FileUtils.unescapePathName(partitionValue));
} else {
// If the partition column size is not equal to the partition
fragments size
// and the partition column size > 1, we do not know how to
map the partition
@@ -119,9 +120,9 @@ public abstract class HudiPartitionProcessor {
for (int i = 0; i < partitionFragments.length; i++) {
String prefix = partitionColumns.get(i) + "=";
if (partitionFragments[i].startsWith(prefix)) {
-
partitionValues.add(partitionFragments[i].substring(prefix.length()));
+
partitionValues.add(FileUtils.unescapePathName(partitionFragments[i].substring(prefix.length())));
} else {
- partitionValues.add(partitionFragments[i]);
+
partitionValues.add(FileUtils.unescapePathName(partitionFragments[i]));
}
}
return partitionValues;
diff --git
a/regression-test/data/external_table_p2/hudi/test_hudi_partition_prune.out
b/regression-test/data/external_table_p2/hudi/test_hudi_partition_prune.out
index fd3eafa0255..d3d4600a0e6 100644
Binary files
a/regression-test/data/external_table_p2/hudi/test_hudi_partition_prune.out and
b/regression-test/data/external_table_p2/hudi/test_hudi_partition_prune.out
differ
diff --git
a/regression-test/suites/external_table_p2/hudi/test_hudi_partition_prune.groovy
b/regression-test/suites/external_table_p2/hudi/test_hudi_partition_prune.groovy
index 063439d9a87..629923da306 100644
---
a/regression-test/suites/external_table_p2/hudi/test_hudi_partition_prune.groovy
+++
b/regression-test/suites/external_table_p2/hudi/test_hudi_partition_prune.groovy
@@ -320,11 +320,11 @@ suite("test_hudi_partition_prune",
"p2,external,hudi,external_remote,external_re
sql("${one_partition_date}")
contains "partition=1/2"
}
- // qt_one_partition_timestamp one_partition_timestamp
- // explain {
- // sql("${one_partition_timestamp}")
- // contains "partition=1/2"
- // }
+ qt_one_partition_timestamp one_partition_timestamp
+ explain {
+ sql("${one_partition_timestamp}")
+ contains "partition=1/2"
+ }
sql """drop catalog if exists ${catalog_name};"""
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]