This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 052105b0f76 [fix](hudi) add HudiOrcReader to read Hudi tables in ORC format. (#52964)
052105b0f76 is described below

commit 052105b0f760c2881b4ef1b1726868054b5f1ca9
Author: daidai <[email protected]>
AuthorDate: Wed Jul 9 22:31:44 2025 +0800

    [fix](hudi) add HudiOrcReader to read Hudi tables in ORC format. (#52964)
    
    ### What problem does this PR solve?
    
    Related PR: #51341
    
    Problem Summary:
    PR #51341 removed HudiOrcReader; this PR reintroduces it so that Hudi
    tables whose base files are in ORC format can be read.
    Note that Spark-Hudi itself throws the error below when reading such a
    table, even though the ORC files were generated by Spark-Hudi (a write
    sketch follows the trace):
    
    ```
    java.lang.UnsupportedOperationException: Base file format is not currently supported (ORC)
            at org.apache.hudi.HoodieBaseRelation.createBaseFileReader(HoodieBaseRelation.scala:574) ~[hudi-spark3.4-bundle_2.12-0.14.0-1.jar:0.14.0-1]
            at org.apache.hudi.BaseFileOnlyRelation.composeRDD(BaseFileOnlyRelation.scala:96) ~[hudi-spark3.4-bundle_2.12-0.14.0-1.jar:0.14.0-1]
            at org.apache.hudi.HoodieBaseRelation.buildScan(HoodieBaseRelation.scala:381) ~[hudi-spark3.4-bundle_2.12-0.14.0-1.jar:0.14.0-1]
            at org.apache.spark.sql.execution.datasources.DataSourceStrategy$.$anonfun$apply$4(DataSourceStrategy.scala:329) ~[spark-sql_2.12-3.4.2.jar:0.14.0-1]
    ```
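
For context only (not part of this commit): the sketch below shows one way such a Hudi table with ORC base files might have been written from Spark. It assumes a Spark session with the hudi-spark bundle on the classpath and that Hudi's `hoodie.table.base.file.format` table option accepts `ORC`; the table name, columns, and path are hypothetical.

```
// Hypothetical illustration only: write a Hudi table whose base files are ORC.
// Assumes the hudi-spark bundle is on the classpath and that the
// "hoodie.table.base.file.format" option accepts "ORC"; names and paths are made up.
import org.apache.spark.sql.SparkSession

object HudiOrcWriteSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("hudi-orc-write-sketch").getOrCreate()
    import spark.implicits._

    // Toy rows standing in for the real table data.
    val df = Seq((1, "a"), (2, "b")).toDF("id", "name")

    df.write
      .format("hudi")
      .option("hoodie.table.name", "hudi_orc_demo")             // hypothetical table name
      .option("hoodie.datasource.write.recordkey.field", "id")  // record key column
      .option("hoodie.datasource.write.precombine.field", "id") // precombine column
      .option("hoodie.table.base.file.format", "ORC")           // write ORC base files
      .mode("overwrite")
      .save("/tmp/hudi_orc_demo")                               // hypothetical path

    spark.stop()
  }
}
```

Reading such a table back through Doris is the path this commit adds; as noted above, reading it back through Spark-Hudi itself raised the UnsupportedOperationException.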
---
 be/src/vec/exec/format/table/hudi_reader.h         |  32 +++++++++++++++++++++
 be/src/vec/exec/scan/file_scanner.cpp              |  28 +++++++++++++-----
 be/test/vec/exec/vfile_scanner_exception_test.cpp  |   2 +-
 .../hudi/test_hudi_full_schema_change.out          | Bin 34801 -> 69506 bytes
 .../hudi/test_hudi_full_schema_change.groovy       |   2 +-
 5 files changed, 55 insertions(+), 9 deletions(-)

diff --git a/be/src/vec/exec/format/table/hudi_reader.h b/be/src/vec/exec/format/table/hudi_reader.h
index add7d094ae2..751094018c9 100644
--- a/be/src/vec/exec/format/table/hudi_reader.h
+++ b/be/src/vec/exec/format/table/hudi_reader.h
@@ -57,5 +57,37 @@ public:
             const VExprContextSPtrs* not_single_slot_filter_conjuncts,
             const std::unordered_map<int, VExprContextSPtrs>* slot_id_to_filter_conjuncts);
 };
+
+class HudiOrcReader final : public HudiReader {
+public:
+    ENABLE_FACTORY_CREATOR(HudiOrcReader);
+    HudiOrcReader(std::unique_ptr<GenericReader> file_format_reader, RuntimeProfile* profile,
+                  RuntimeState* state, const TFileScanRangeParams& params,
+                  const TFileRangeDesc& range, io::IOContext* io_ctx)
+            : HudiReader(std::move(file_format_reader), profile, state, params, range, io_ctx) {};
+    ~HudiOrcReader() final = default;
+
+    Status init_reader(
+            const std::vector<std::string>& read_table_col_names,
+            const std::unordered_map<std::string, ColumnValueRangeType>*
+                    table_col_name_to_value_range,
+            const VExprContextSPtrs& conjuncts, const TupleDescriptor* tuple_descriptor,
+            const RowDescriptor* row_descriptor,
+            const VExprContextSPtrs* not_single_slot_filter_conjuncts,
+            const std::unordered_map<int, VExprContextSPtrs>* slot_id_to_filter_conjuncts) {
+        auto* orc_reader = static_cast<OrcReader*>(_file_format_reader.get());
+        const orc::Type* orc_type_ptr = nullptr;
+        RETURN_IF_ERROR(orc_reader->get_file_type(&orc_type_ptr));
+        RETURN_IF_ERROR(gen_table_info_node_by_field_id(
+                _params, _range.table_format_params.hudi_params.schema_id, tuple_descriptor,
+                orc_type_ptr));
+
+        return orc_reader->init_reader(&read_table_col_names, table_col_name_to_value_range,
+                                       conjuncts, false, tuple_descriptor, row_descriptor,
+                                       not_single_slot_filter_conjuncts,
+                                       slot_id_to_filter_conjuncts, table_info_node_ptr);
+    }
+};
+
 #include "common/compile_check_end.h"
 } // namespace doris::vectorized
\ No newline at end of file
diff --git a/be/src/vec/exec/scan/file_scanner.cpp b/be/src/vec/exec/scan/file_scanner.cpp
index 1459cac38b5..6502e151140 100644
--- a/be/src/vec/exec/scan/file_scanner.cpp
+++ b/be/src/vec/exec/scan/file_scanner.cpp
@@ -1077,12 +1077,16 @@ Status FileScanner::_get_next_reader() {
             break;
         }
         default:
-            return Status::InternalError("Not supported file format: {}", _params->format_type);
+            return Status::NotSupported("Not supported create reader for file format: {}.",
+                                        to_string(_params->format_type));
         }
 
         if (_cur_reader == nullptr) {
-            return Status::InternalError("Failed to create reader for  file format: {}",
-                                         _params->format_type);
+            return Status::NotSupported(
+                    "Not supported create reader for table format: {} / file format: {}.",
+                    range.__isset.table_format_params ? range.table_format_params.table_format_type
+                                                      : "NotSet",
+                    to_string(_params->format_type));
         }
         COUNTER_UPDATE(_file_counter, 1);
         // The FileScanner for external table may try to open not exist files,
@@ -1238,6 +1242,16 @@ Status FileScanner::_init_orc_reader(std::unique_ptr<OrcReader>&& orc_reader) {
                 &_slot_id_to_filter_conjuncts);
         RETURN_IF_ERROR(paimon_reader->init_row_filters());
         _cur_reader = std::move(paimon_reader);
+    } else if (range.__isset.table_format_params &&
+               range.table_format_params.table_format_type == "hudi") {
+        std::unique_ptr<HudiOrcReader> hudi_reader = HudiOrcReader::create_unique(
+                std::move(orc_reader), _profile, _state, *_params, range, _io_ctx.get());
+
+        init_status = hudi_reader->init_reader(
+                _file_col_names, _colname_to_value_range, _push_down_conjuncts, _real_tuple_desc,
+                _default_val_row_desc.get(), &_not_single_slot_filter_conjuncts,
+                &_slot_id_to_filter_conjuncts);
+        _cur_reader = std::move(hudi_reader);
     } else if (range.__isset.table_format_params &&
                range.table_format_params.table_format_type == "hive") {
         std::unique_ptr<HiveOrcReader> hive_reader = HiveOrcReader::create_unique(
@@ -1390,10 +1404,10 @@ Status FileScanner::read_one_line_from_range(const TFileRangeDesc& range,
                     break;
                 }
                 default: {
-                    return Status::InternalError(
-                            "Failed to create one line reader for file format: {},"
-                            "only support parquet and orc",
-                            _params->format_type);
+                    return Status::NotSupported(
+                            "Not support create lines reader for file format: {},"
+                            "only support parquet and orc.",
+                            to_string(_params->format_type));
                 }
                 }
                 return Status::OK();
diff --git a/be/test/vec/exec/vfile_scanner_exception_test.cpp b/be/test/vec/exec/vfile_scanner_exception_test.cpp
index 8fa37c26278..d1a493a0d97 100644
--- a/be/test/vec/exec/vfile_scanner_exception_test.cpp
+++ b/be/test/vec/exec/vfile_scanner_exception_test.cpp
@@ -299,7 +299,7 @@ TEST_F(VfileScannerExceptionTest, failure_case) {
     auto st = scanner->get_block(&_runtime_state, block.get(), &eof);
     ASSERT_FALSE(st.ok());
     auto msg = st.to_string();
-    auto pos = msg.find("Failed to create reader for");
+    auto pos = msg.find("Not supported create reader");
     std::cout << "msg = " << msg << std::endl;
     ASSERT_TRUE(pos != msg.npos);
     WARN_IF_ERROR(scanner->close(&_runtime_state), "fail to close scanner");
diff --git a/regression-test/data/external_table_p2/hudi/test_hudi_full_schema_change.out b/regression-test/data/external_table_p2/hudi/test_hudi_full_schema_change.out
index 8a1e62f8249..e6f6b836635 100644
Binary files a/regression-test/data/external_table_p2/hudi/test_hudi_full_schema_change.out and b/regression-test/data/external_table_p2/hudi/test_hudi_full_schema_change.out differ
diff --git a/regression-test/suites/external_table_p2/hudi/test_hudi_full_schema_change.groovy b/regression-test/suites/external_table_p2/hudi/test_hudi_full_schema_change.groovy
index 1b427faf53b..173e6f64657 100644
--- a/regression-test/suites/external_table_p2/hudi/test_hudi_full_schema_change.groovy
+++ b/regression-test/suites/external_table_p2/hudi/test_hudi_full_schema_change.groovy
@@ -38,7 +38,7 @@ suite("test_hudi_full_schema_change", "p2,external,hudi,external_remote,external
     
 
 
-    def tables  = ["hudi_full_schema_change_parquet"]
+    def tables  = ["hudi_full_schema_change_parquet","hudi_full_schema_change_orc"]
 
 
     for (String table: tables) {


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]
