This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 052105b0f76 [fix](hudi)add hudiOrcReader for read hudi table & orc
format. (#52964)
052105b0f76 is described below
commit 052105b0f760c2881b4ef1b1726868054b5f1ca9
Author: daidai <[email protected]>
AuthorDate: Wed Jul 9 22:31:44 2025 +0800
[fix](hudi)add hudiOrcReader for read hudi table & orc format. (#52964)
### What problem does this PR solve?
Related PR: #51341
Problem Summary:
In PR #51341, HudiOrcReader was deleted; this PR reintroduces it so that
Hudi ORC tables can be read.
I encountered the error below when testing Spark-Hudi reading ORC, even
though the ORC file was in fact generated by Spark-Hudi itself.
```
java.lang.UnsupportedOperationException: Base file format is not currently
supported (ORC)
at
org.apache.hudi.HoodieBaseRelation.createBaseFileReader(HoodieBaseRelation.scala:574)
~[hudi-spark3.4-bundle_2.12-0.14.0-1.jar:0.14.0-1]
at
org.apache.hudi.BaseFileOnlyRelation.composeRDD(BaseFileOnlyRelation.scala:96)
~[hudi-spark3.4-bundle_2.12-0.14.0-1.jar:0.14.0-1]
at
org.apache.hudi.HoodieBaseRelation.buildScan(HoodieBaseRelation.scala:381)
~[hudi-spark3.4-bundle_2.12-0.14.0-1.jar:0.14.0-1]
at
org.apache.spark.sql.execution.datasources.DataSourceStrategy$.$anonfun$apply$4(DataSourceStrategy.scala:329)
~[spark-sql_2.12-3.4.2.jar:0.14.0-1]
```
---
be/src/vec/exec/format/table/hudi_reader.h | 32 +++++++++++++++++++++
be/src/vec/exec/scan/file_scanner.cpp | 28 +++++++++++++-----
be/test/vec/exec/vfile_scanner_exception_test.cpp | 2 +-
.../hudi/test_hudi_full_schema_change.out | Bin 34801 -> 69506 bytes
.../hudi/test_hudi_full_schema_change.groovy | 2 +-
5 files changed, 55 insertions(+), 9 deletions(-)
diff --git a/be/src/vec/exec/format/table/hudi_reader.h
b/be/src/vec/exec/format/table/hudi_reader.h
index add7d094ae2..751094018c9 100644
--- a/be/src/vec/exec/format/table/hudi_reader.h
+++ b/be/src/vec/exec/format/table/hudi_reader.h
@@ -57,5 +57,37 @@ public:
const VExprContextSPtrs* not_single_slot_filter_conjuncts,
const std::unordered_map<int, VExprContextSPtrs>*
slot_id_to_filter_conjuncts);
};
+
+class HudiOrcReader final : public HudiReader {
+public:
+ ENABLE_FACTORY_CREATOR(HudiOrcReader);
+ HudiOrcReader(std::unique_ptr<GenericReader> file_format_reader,
RuntimeProfile* profile,
+ RuntimeState* state, const TFileScanRangeParams& params,
+ const TFileRangeDesc& range, io::IOContext* io_ctx)
+ : HudiReader(std::move(file_format_reader), profile, state,
params, range, io_ctx) {};
+ ~HudiOrcReader() final = default;
+
+ Status init_reader(
+ const std::vector<std::string>& read_table_col_names,
+ const std::unordered_map<std::string, ColumnValueRangeType>*
+ table_col_name_to_value_range,
+ const VExprContextSPtrs& conjuncts, const TupleDescriptor*
tuple_descriptor,
+ const RowDescriptor* row_descriptor,
+ const VExprContextSPtrs* not_single_slot_filter_conjuncts,
+ const std::unordered_map<int, VExprContextSPtrs>*
slot_id_to_filter_conjuncts) {
+ auto* orc_reader = static_cast<OrcReader*>(_file_format_reader.get());
+ const orc::Type* orc_type_ptr = nullptr;
+ RETURN_IF_ERROR(orc_reader->get_file_type(&orc_type_ptr));
+ RETURN_IF_ERROR(gen_table_info_node_by_field_id(
+ _params, _range.table_format_params.hudi_params.schema_id,
tuple_descriptor,
+ orc_type_ptr));
+
+ return orc_reader->init_reader(&read_table_col_names,
table_col_name_to_value_range,
+ conjuncts, false, tuple_descriptor,
row_descriptor,
+ not_single_slot_filter_conjuncts,
+ slot_id_to_filter_conjuncts,
table_info_node_ptr);
+ }
+};
+
#include "common/compile_check_end.h"
} // namespace doris::vectorized
\ No newline at end of file
diff --git a/be/src/vec/exec/scan/file_scanner.cpp
b/be/src/vec/exec/scan/file_scanner.cpp
index 1459cac38b5..6502e151140 100644
--- a/be/src/vec/exec/scan/file_scanner.cpp
+++ b/be/src/vec/exec/scan/file_scanner.cpp
@@ -1077,12 +1077,16 @@ Status FileScanner::_get_next_reader() {
break;
}
default:
- return Status::InternalError("Not supported file format: {}",
_params->format_type);
+ return Status::NotSupported("Not supported create reader for file
format: {}.",
+ to_string(_params->format_type));
}
if (_cur_reader == nullptr) {
- return Status::InternalError("Failed to create reader for file
format: {}",
- _params->format_type);
+ return Status::NotSupported(
+ "Not supported create reader for table format: {} / file
format: {}.",
+ range.__isset.table_format_params ?
range.table_format_params.table_format_type
+ : "NotSet",
+ to_string(_params->format_type));
}
COUNTER_UPDATE(_file_counter, 1);
// The FileScanner for external table may try to open not exist files,
@@ -1238,6 +1242,16 @@ Status
FileScanner::_init_orc_reader(std::unique_ptr<OrcReader>&& orc_reader) {
&_slot_id_to_filter_conjuncts);
RETURN_IF_ERROR(paimon_reader->init_row_filters());
_cur_reader = std::move(paimon_reader);
+ } else if (range.__isset.table_format_params &&
+ range.table_format_params.table_format_type == "hudi") {
+ std::unique_ptr<HudiOrcReader> hudi_reader =
HudiOrcReader::create_unique(
+ std::move(orc_reader), _profile, _state, *_params, range,
_io_ctx.get());
+
+ init_status = hudi_reader->init_reader(
+ _file_col_names, _colname_to_value_range,
_push_down_conjuncts, _real_tuple_desc,
+ _default_val_row_desc.get(),
&_not_single_slot_filter_conjuncts,
+ &_slot_id_to_filter_conjuncts);
+ _cur_reader = std::move(hudi_reader);
} else if (range.__isset.table_format_params &&
range.table_format_params.table_format_type == "hive") {
std::unique_ptr<HiveOrcReader> hive_reader =
HiveOrcReader::create_unique(
@@ -1390,10 +1404,10 @@ Status FileScanner::read_one_line_from_range(const
TFileRangeDesc& range,
break;
}
default: {
- return Status::InternalError(
- "Failed to create one line reader for file format:
{},"
- "only support parquet and orc",
- _params->format_type);
+ return Status::NotSupported(
+ "Not support create lines reader for file format:
{},"
+ "only support parquet and orc.",
+ to_string(_params->format_type));
}
}
return Status::OK();
diff --git a/be/test/vec/exec/vfile_scanner_exception_test.cpp
b/be/test/vec/exec/vfile_scanner_exception_test.cpp
index 8fa37c26278..d1a493a0d97 100644
--- a/be/test/vec/exec/vfile_scanner_exception_test.cpp
+++ b/be/test/vec/exec/vfile_scanner_exception_test.cpp
@@ -299,7 +299,7 @@ TEST_F(VfileScannerExceptionTest, failure_case) {
auto st = scanner->get_block(&_runtime_state, block.get(), &eof);
ASSERT_FALSE(st.ok());
auto msg = st.to_string();
- auto pos = msg.find("Failed to create reader for");
+ auto pos = msg.find("Not supported create reader");
std::cout << "msg = " << msg << std::endl;
ASSERT_TRUE(pos != msg.npos);
WARN_IF_ERROR(scanner->close(&_runtime_state), "fail to close scanner");
diff --git
a/regression-test/data/external_table_p2/hudi/test_hudi_full_schema_change.out
b/regression-test/data/external_table_p2/hudi/test_hudi_full_schema_change.out
index 8a1e62f8249..e6f6b836635 100644
Binary files
a/regression-test/data/external_table_p2/hudi/test_hudi_full_schema_change.out
and
b/regression-test/data/external_table_p2/hudi/test_hudi_full_schema_change.out
differ
diff --git
a/regression-test/suites/external_table_p2/hudi/test_hudi_full_schema_change.groovy
b/regression-test/suites/external_table_p2/hudi/test_hudi_full_schema_change.groovy
index 1b427faf53b..173e6f64657 100644
---
a/regression-test/suites/external_table_p2/hudi/test_hudi_full_schema_change.groovy
+++
b/regression-test/suites/external_table_p2/hudi/test_hudi_full_schema_change.groovy
@@ -38,7 +38,7 @@ suite("test_hudi_full_schema_change",
"p2,external,hudi,external_remote,external
- def tables = ["hudi_full_schema_change_parquet"]
+ def tables =
["hudi_full_schema_change_parquet","hudi_full_schema_change_orc"]
for (String table: tables) {
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]