This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new f3c415dfaec [fix](hudi) support reading hudi read optimized table with
orc format (#44995)
f3c415dfaec is described below
commit f3c415dfaec698a443b657229a56be06923ed522
Author: Socrates <[email protected]>
AuthorDate: Thu Dec 5 01:05:54 2024 +0800
[fix](hudi) support reading hudi read optimized table with orc format
(#44995)
### What problem does this PR solve?
Problem Summary:
When reading a Hudi read-optimized (RO) table, the scan falls back from
the JNI reader to the native reader. However, this fallback always set
the file format to Parquet and did not handle the case where the Hudi
table is stored in ORC format.
1. Support reading Hudi read-optimized tables stored in ORC format.
2. Fix the EXPLAIN output of HudiScanNode when force_jni_scanner=true.
3. Add test cases for timestamps under different time zones.
---
.../doris/datasource/hudi/source/HudiScanNode.java | 14 +++++++--
.../hudi/test_hudi_orc_tables.out | 15 ++++++++++
.../external_table_p2/hudi/test_hudi_timestamp.out | 31 ++++++++++++++++++--
...imestamp.groovy => test_hudi_orc_tables.groovy} | 33 ++++------------------
.../hudi/test_hudi_timestamp.groovy | 18 ++++++++++--
5 files changed, 76 insertions(+), 35 deletions(-)
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiScanNode.java
b/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiScanNode.java
index 28805aae63c..a73a2065d0f 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiScanNode.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiScanNode.java
@@ -25,6 +25,7 @@ import org.apache.doris.catalog.PartitionItem;
import org.apache.doris.catalog.Type;
import org.apache.doris.common.AnalysisException;
import org.apache.doris.common.UserException;
+import org.apache.doris.common.util.FileFormatUtils;
import org.apache.doris.common.util.LocationPath;
import org.apache.doris.datasource.ExternalTable;
import org.apache.doris.datasource.FileSplit;
@@ -247,8 +248,15 @@ public class HudiScanNode extends HiveScanNode {
&& !sessionVariable.isForceJniScanner()
&& hudiSplit.getHudiDeltaLogs().isEmpty()) {
// no logs, is read optimize table, fallback to use native
reader
- // TODO: support read orc hudi table in native reader
- rangeDesc.setFormatType(TFileFormatType.FORMAT_PARQUET);
+ String fileFormat =
FileFormatUtils.getFileFormatBySuffix(hudiSplit.getDataFilePath())
+ .orElse("Unknown");
+ if (fileFormat.equals("parquet")) {
+ rangeDesc.setFormatType(TFileFormatType.FORMAT_PARQUET);
+ } else if (fileFormat.equals("orc")) {
+ rangeDesc.setFormatType(TFileFormatType.FORMAT_ORC);
+ } else {
+ throw new RuntimeException("Unsupported file format: " +
fileFormat);
+ }
}
setHudiParams(rangeDesc, hudiSplit);
}
@@ -495,7 +503,7 @@ public class HudiScanNode extends HiveScanNode {
List<String> logs = fileSlice.getLogFiles().map(HoodieLogFile::getPath)
.map(StoragePath::toString)
.collect(Collectors.toList());
- if (logs.isEmpty()) {
+ if (logs.isEmpty() && !sessionVariable.isForceJniScanner()) {
noLogsSplitNum.incrementAndGet();
}
diff --git
a/regression-test/data/external_table_p2/hudi/test_hudi_orc_tables.out
b/regression-test/data/external_table_p2/hudi/test_hudi_orc_tables.out
new file mode 100644
index 00000000000..9e28074dc91
--- /dev/null
+++ b/regression-test/data/external_table_p2/hudi/test_hudi_orc_tables.out
@@ -0,0 +1,15 @@
+-- This file is automatically generated. You should know what you did if you
want to edit this
+-- !cow --
+20241204190011744 20241204190011744_0_6 20241204190011744_0_0
a99e363a-6c10-40f3-a675-9117506d1a43-0_0-38-94_20241204190011744.orc 1
A
+20241204190011744 20241204190011744_0_7 20241204190011744_2_0
a99e363a-6c10-40f3-a675-9117506d1a43-0_0-38-94_20241204190011744.orc 3
C
+20241204190011744 20241204190011744_0_8 20241204190011744_4_0
a99e363a-6c10-40f3-a675-9117506d1a43-0_0-38-94_20241204190011744.orc 5
E
+20241204190011744 20241204190011744_0_9 20241204190011744_1_0
a99e363a-6c10-40f3-a675-9117506d1a43-0_0-38-94_20241204190011744.orc 2
B
+20241204190011744 20241204190011744_0_10 20241204190011744_3_0
a99e363a-6c10-40f3-a675-9117506d1a43-0_0-38-94_20241204190011744.orc 4
D
+
+-- !mor --
+20241204190002046 20241204190002046_0_11 20241204190002046_0_0
b1e68412-01d6-467f-b4c2-b4b18ec71346-0_0-30-75_20241204190002046.orc 1
A
+20241204190002046 20241204190002046_0_12 20241204190002046_2_0
b1e68412-01d6-467f-b4c2-b4b18ec71346-0_0-30-75_20241204190002046.orc 3
C
+20241204190002046 20241204190002046_0_13 20241204190002046_4_0
b1e68412-01d6-467f-b4c2-b4b18ec71346-0_0-30-75_20241204190002046.orc 5
E
+20241204190002046 20241204190002046_0_14 20241204190002046_1_0
b1e68412-01d6-467f-b4c2-b4b18ec71346-0_0-30-75_20241204190002046.orc 2
B
+20241204190002046 20241204190002046_0_15 20241204190002046_3_0
b1e68412-01d6-467f-b4c2-b4b18ec71346-0_0-30-75_20241204190002046.orc 4
D
+
diff --git
a/regression-test/data/external_table_p2/hudi/test_hudi_timestamp.out
b/regression-test/data/external_table_p2/hudi/test_hudi_timestamp.out
index dc47ff86d90..9bdb0f7cb72 100644
--- a/regression-test/data/external_table_p2/hudi/test_hudi_timestamp.out
+++ b/regression-test/data/external_table_p2/hudi/test_hudi_timestamp.out
@@ -1,6 +1,31 @@
-- This file is automatically generated. You should know what you did if you
want to edit this
--- !timestamp --
+-- !timestamp1 --
20241115015956800 20241115015956800_0_2 1
eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 1
Alice 2024-10-25T08:00
-20241115015956800 20241115015956800_0_0 2
eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 2
Bob 2024-10-25T09:30:00
-20241115015956800 20241115015956800_0_1 3
eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 3
Charlie 2024-10-25T11:00:00
+20241115015956800 20241115015956800_0_0 2
eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 2
Bob 2024-10-25T09:30
+20241115015956800 20241115015956800_0_1 3
eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 3
Charlie 2024-10-25T11:00
+
+-- !timestamp2 --
+20241115015956800 20241115015956800_0_2 1
eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 1
Alice 2024-10-25T23:00
+20241115015956800 20241115015956800_0_0 2
eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 2
Bob 2024-10-26T00:30
+20241115015956800 20241115015956800_0_1 3
eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 3
Charlie 2024-10-26T02:00
+
+-- !timestamp3 --
+20241115015956800 20241115015956800_0_2 1
eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 1
Alice 2024-10-25T15:00
+20241115015956800 20241115015956800_0_0 2
eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 2
Bob 2024-10-25T16:30
+20241115015956800 20241115015956800_0_1 3
eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 3
Charlie 2024-10-25T18:00
+
+-- !timestamp1 --
+20241115015956800 20241115015956800_0_2 1
eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 1
Alice 2024-10-25T08:00
+20241115015956800 20241115015956800_0_0 2
eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 2
Bob 2024-10-25T09:30
+20241115015956800 20241115015956800_0_1 3
eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 3
Charlie 2024-10-25T11:00
+
+-- !timestamp2 --
+20241115015956800 20241115015956800_0_2 1
eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 1
Alice 2024-10-25T23:00
+20241115015956800 20241115015956800_0_0 2
eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 2
Bob 2024-10-26T00:30
+20241115015956800 20241115015956800_0_1 3
eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 3
Charlie 2024-10-26T02:00
+
+-- !timestamp3 --
+20241115015956800 20241115015956800_0_2 1
eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 1
Alice 2024-10-25T15:00
+20241115015956800 20241115015956800_0_0 2
eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 2
Bob 2024-10-25T16:30
+20241115015956800 20241115015956800_0_1 3
eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 3
Charlie 2024-10-25T18:00
diff --git
a/regression-test/suites/external_table_p2/hudi/test_hudi_timestamp.groovy
b/regression-test/suites/external_table_p2/hudi/test_hudi_orc_tables.groovy
similarity index 63%
copy from
regression-test/suites/external_table_p2/hudi/test_hudi_timestamp.groovy
copy to
regression-test/suites/external_table_p2/hudi/test_hudi_orc_tables.groovy
index 36309322558..43638a23881 100644
--- a/regression-test/suites/external_table_p2/hudi/test_hudi_timestamp.groovy
+++ b/regression-test/suites/external_table_p2/hudi/test_hudi_orc_tables.groovy
@@ -15,13 +15,13 @@
// specific language governing permissions and limitations
// under the License.
-suite("test_hudi_timestamp",
"p2,external,hudi,external_remote,external_remote_hudi") {
+suite("test_hudi_orc_tables",
"p2,external,hudi,external_remote,external_remote_hudi") {
String enabled = context.config.otherConfigs.get("enableExternalHudiTest")
if (enabled == null || !enabled.equalsIgnoreCase("true")) {
logger.info("disable hudi test")
}
- String catalog_name = "test_hudi_timestamp"
+ String catalog_name = "test_hudi_orc_tables"
String props = context.config.otherConfigs.get("hudiEmrCatalog")
sql """drop catalog if exists ${catalog_name};"""
sql """
@@ -33,30 +33,9 @@ suite("test_hudi_timestamp",
"p2,external,hudi,external_remote,external_remote_h
sql """ switch ${catalog_name};"""
sql """ use regression_hudi;"""
sql """ set enable_fallback_to_original_planner=false """
-
- // TODO: fix hudi timezone issue and enable this
- // qt_timestamp """ select * from hudi_table_with_timestamp order by id;
"""
+
+ qt_cow """ select * from orc_hudi_table_cow; """
+ qt_mor """ select * from orc_hudi_table_mor; """
sql """drop catalog if exists ${catalog_name};"""
-}
-
-// DROP TABLE IF EXISTS hudi_table_with_timestamp;
-
-// -- create table
-// CREATE TABLE hudi_table_with_timestamp (
-// id STRING,
-// name STRING,
-// event_time TIMESTAMP
-// ) USING HUDI
-// OPTIONS (
-// type = 'cow',
-// primaryKey = 'id',
-// preCombineField = 'event_time'
-// );
-
-// SET TIME ZONE 'America/Los_Angeles';
-
-// INSERT OVERWRITE hudi_table_with_timestamp VALUES
-// ('1', 'Alice', timestamp('2024-10-25 08:00:00')),
-// ('2', 'Bob', timestamp('2024-10-25 09:30:00')),
-// ('3', 'Charlie', timestamp('2024-10-25 11:00:00'));
+}
\ No newline at end of file
diff --git
a/regression-test/suites/external_table_p2/hudi/test_hudi_timestamp.groovy
b/regression-test/suites/external_table_p2/hudi/test_hudi_timestamp.groovy
index 36309322558..3d7bd40b2d5 100644
--- a/regression-test/suites/external_table_p2/hudi/test_hudi_timestamp.groovy
+++ b/regression-test/suites/external_table_p2/hudi/test_hudi_timestamp.groovy
@@ -34,8 +34,22 @@ suite("test_hudi_timestamp",
"p2,external,hudi,external_remote,external_remote_h
sql """ use regression_hudi;"""
sql """ set enable_fallback_to_original_planner=false """
- // TODO: fix hudi timezone issue and enable this
- // qt_timestamp """ select * from hudi_table_with_timestamp order by id;
"""
+ def test_timestamp_different_timezones = {
+ sql """set time_zone = 'America/Los_Angeles';"""
+ qt_timestamp1 """ select * from hudi_table_with_timestamp order by id;
"""
+ sql """set time_zone = 'Asia/Shanghai';"""
+ qt_timestamp2 """ select * from hudi_table_with_timestamp order by id;
"""
+ sql """set time_zone = 'UTC';"""
+ qt_timestamp3 """ select * from hudi_table_with_timestamp order by id;
"""
+ }
+
+ // test native reader
+ test_timestamp_different_timezones()
+ sql """ set force_jni_scanner = true; """
+ // test jni reader
+ test_timestamp_different_timezones()
+ sql """ set force_jni_scanner = false; """
+
sql """drop catalog if exists ${catalog_name};"""
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]