This is an automated email from the ASF dual-hosted git repository.
kakachen pushed a commit to branch nested_column_prune
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/nested_column_prune by this
push:
new abaac93b380 Clean up the external table implementation's code.
abaac93b380 is described below
commit abaac93b3801ce22c1a54b593a1d884db87e4b06
Author: kakachen <[email protected]>
AuthorDate: Mon Nov 3 11:28:25 2025 +0800
Clean up the external table implementation's code.
---
be/src/vec/exec/format/orc/vorc_reader.cpp | 128 ++++++-----------
be/src/vec/exec/format/orc/vorc_reader.h | 6 +-
.../exec/format/parquet/vparquet_column_reader.cpp | 67 +--------
.../exec/format/parquet/vparquet_column_reader.h | 4 -
.../exec/format/parquet/vparquet_group_reader.cpp | 24 +---
.../hive/hive_reader_create_column_ids_test.cpp | 152 +++++++++------------
.../exec/format/table/hive/hive_reader_test.cpp | 4 +-
.../iceberg_reader_create_column_ids_test.cpp | 113 ++++++---------
.../format/table/iceberg/iceberg_reader_test.cpp | 8 +-
9 files changed, 168 insertions(+), 338 deletions(-)
diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp
b/be/src/vec/exec/format/orc/vorc_reader.cpp
index 11b75b820ca..1b6d4eadeb8 100644
--- a/be/src/vec/exec/format/orc/vorc_reader.cpp
+++ b/be/src/vec/exec/format/orc/vorc_reader.cpp
@@ -406,9 +406,6 @@ Status OrcReader::_init_read_columns() {
SCOPED_RAW_TIMER(&_statistics.init_column_time);
const auto& root_type = _reader->getType();
- // Save file root type for later use in type conversion
- _file_root_type = &root_type;
-
// Build column ID to file type mapping for all columns in file
_column_id_to_file_type.clear();
std::function<void(const orc::Type*, int)> build_column_id_map = [&](const
orc::Type* type,
@@ -1191,7 +1188,7 @@ Status OrcReader::set_fill_columns(
if (!_column_ids.empty()) {
std::list<uint64_t> column_ids_list(_column_ids.begin(),
_column_ids.end());
_row_reader_options.includeTypes(column_ids_list);
- } else {
+ } else { // If column_ids is empty, include all top-level columns to
be read.
_row_reader_options.include(_read_file_cols);
}
_row_reader_options.setEnableLazyDecoding(true);
@@ -1281,35 +1278,6 @@ Status OrcReader::set_fill_columns(
}
}
- // Helper function to recursively print ORC type structure
- std::function<void(const orc::Type*, int, const std::string&)>
print_orc_type_tree =
- [&](const orc::Type* type, int indent, const std::string&
name) {
- if (type) {
- std::string indent_str(indent * 2, ' ');
- VLOG_DEBUG << indent_str << "- " << name
- << " (type_id=" << type->getColumnId()
- << ", kind=" <<
static_cast<int>(type->getKind()) << ")";
- }
-
- if (type->getKind() == orc::TypeKind::STRUCT) {
- for (uint64_t i = 0; i < type->getSubtypeCount(); ++i)
{
- std::string field_name = type->getFieldName(i);
- print_orc_type_tree(type->getSubtype(i), indent +
1, field_name);
- }
- } else if (type->getKind() == orc::TypeKind::LIST) {
- print_orc_type_tree(type->getSubtype(0), indent + 1,
"element");
- } else if (type->getKind() == orc::TypeKind::MAP) {
- print_orc_type_tree(type->getSubtype(0), indent + 1,
"key");
- print_orc_type_tree(type->getSubtype(1), indent + 1,
"value");
- } else {
- // Primitive type, no children
- }
- };
-
- // VLOG_DEBUG << "\n[OrcReader] ========== ORC Type Structure
==========";
- // print_orc_type_tree(&selected_type, 0, "root");
- // VLOG_DEBUG << "[OrcReader]
==========================================\n";
-
_type_map.clear();
if (_is_acid) {
for (uint64_t i = 0; i < selected_type.getSubtypeCount(); ++i) {
@@ -1429,11 +1397,6 @@ Status OrcReader::_fill_missing_columns(
block->erase(result_column_id);
}
}
-
- // // no default column, fill with null
- // auto mutable_column =
block->get_by_name(kv.first).column->assume_mutable();
- // auto* nullable_column =
static_cast<vectorized::ColumnNullable*>(mutable_column.get());
- // nullable_column->insert_many_defaults(rows);
}
return Status::OK();
}
@@ -1542,22 +1505,22 @@ DataTypePtr OrcReader::convert_to_doris_type(const
orc::Type* orc_type) {
orc_type->getSubtype(1) == nullptr) {
// Try to find the complete type from _column_id_to_file_type
uint64_t column_id = orc_type->getColumnId();
- VLOG(1) << "[OrcReader] Detected incomplete MAP type: column_id="
<< column_id
- << ", subtype_count=" << orc_type->getSubtypeCount() << ",
subtype(0)="
- << (orc_type->getSubtypeCount() > 0 &&
orc_type->getSubtype(0) != nullptr
- ? "not null"
- : "null")
- << ", subtype(1)="
- << (orc_type->getSubtypeCount() > 1 &&
orc_type->getSubtype(1) != nullptr
- ? "not null"
- : "null");
+ VLOG_DEBUG << "[OrcReader] Detected incomplete MAP type:
column_id=" << column_id
+ << ", subtype_count=" << orc_type->getSubtypeCount() <<
", subtype(0)="
+ << (orc_type->getSubtypeCount() > 0 &&
orc_type->getSubtype(0) != nullptr
+ ? "not null"
+ : "null")
+ << ", subtype(1)="
+ << (orc_type->getSubtypeCount() > 1 &&
orc_type->getSubtype(1) != nullptr
+ ? "not null"
+ : "null");
auto it = _column_id_to_file_type.find(column_id);
if (it != _column_id_to_file_type.end() && it->second != nullptr) {
const orc::Type* complete_type = it->second;
- VLOG(1) << "[OrcReader] Found complete type in mapping:
column_id=" << column_id
- << ", complete_type_kind=" <<
static_cast<int>(complete_type->getKind())
- << ", complete_subtype_count=" <<
complete_type->getSubtypeCount();
+ VLOG_DEBUG << "[OrcReader] Found complete type in mapping:
column_id=" << column_id
+ << ", complete_type_kind=" <<
static_cast<int>(complete_type->getKind())
+ << ", complete_subtype_count=" <<
complete_type->getSubtypeCount();
// Print subtypes information
for (uint64_t i = 0; i < complete_type->getSubtypeCount();
++i) {
@@ -1573,7 +1536,8 @@ DataTypePtr OrcReader::convert_to_doris_type(const
orc::Type* orc_type) {
if (complete_type->getKind() == orc::TypeKind::MAP &&
complete_type->getSubtypeCount() == 2) {
- VLOG(1) << "[OrcReader] Using complete MAP type from file
schema for column_id="
+ VLOG_DEBUG
+ << "[OrcReader] Using complete MAP type from file
schema for column_id="
<< column_id;
// Get subtypes with extra validation
@@ -1925,18 +1889,18 @@ Status OrcReader::_fill_doris_data_column(const
std::string& col_name,
const orc::Type* orc_key_type = orc_column_type->getSubtype(0);
const orc::Type* orc_value_type = orc_column_type->getSubtype(1);
- VLOG(1) << "[OrcReader] MAP column '" << col_name
- << "': orc_key_type=" << (orc_key_type != nullptr ? "not null"
: "NULL")
- << ", orc_value_type=" << (orc_value_type != nullptr ? "not
null" : "NULL")
- << ", element_size=" << element_size;
+ VLOG_DEBUG << "[OrcReader] MAP column '" << col_name
+ << "': orc_key_type=" << (orc_key_type != nullptr ? "not
null" : "NULL")
+ << ", orc_value_type=" << (orc_value_type != nullptr ? "not
null" : "NULL")
+ << ", element_size=" << element_size;
// Handle incomplete MAP type - if key or value type is nullptr, try
to recover from mapping
bool key_is_missing = (orc_key_type == nullptr);
bool value_is_missing = (orc_value_type == nullptr);
if (key_is_missing || value_is_missing) {
- VLOG(1) << "[OrcReader] Detected incomplete MAP subtypes for
column '" << col_name
- << "', attempting to recover from mapping...";
+ VLOG_DEBUG << "[OrcReader] Detected incomplete MAP subtypes for
column '" << col_name
+ << "', attempting to recover from mapping...";
uint64_t column_id = orc_column_type->getColumnId();
auto it = _column_id_to_file_type.find(column_id);
@@ -1948,15 +1912,16 @@ Status OrcReader::_fill_doris_data_column(const
std::string& col_name,
orc_key_type = complete_map_type->getSubtype(0);
if (orc_key_type != nullptr) {
// key_is_missing = false;
- VLOG(1) << "[OrcReader] Recovered key type from
mapping for column '"
- << col_name << "'";
+ VLOG_DEBUG << "[OrcReader] Recovered key type from
mapping for column '"
+ << col_name << "'";
}
}
if (value_is_missing) {
orc_value_type = complete_map_type->getSubtype(1);
if (orc_value_type != nullptr) {
// value_is_missing = false;
- VLOG(1) << "[OrcReader] Recovered value type from
mapping for column '"
+ VLOG_DEBUG
+ << "[OrcReader] Recovered value type from
mapping for column '"
<< col_name << "'";
}
}
@@ -2072,7 +2037,6 @@ Status OrcReader::_fill_doris_data_column(const
std::string& col_name,
for (auto read_field : read_fields) {
orc::ColumnVectorBatch* orc_field =
orc_struct->fields[read_field.second];
-
const orc::Type* orc_type =
orc_column_type->getSubtype(read_field.second);
std::string field_name =
col_name + "." +
orc_column_type->getFieldName(read_field.second);
@@ -2099,7 +2063,6 @@ Status OrcReader::_orc_column_to_doris_column(
const orc::ColumnVectorBatch* cvb, size_t num_values) {
DataTypePtr resolved_type;
ColumnPtr resolved_column;
- converter::ColumnTypeConverter* converter_ptr = nullptr;
MutableColumnPtr data_column;
if (orc_column_type != nullptr) {
auto src_type = convert_to_doris_type(orc_column_type);
@@ -2129,9 +2092,9 @@ Status OrcReader::_orc_column_to_doris_column(
// reuse the cached converter
_converters[converter_key] = std::move(converter);
}
- converter_ptr = _converters[converter_key].get();
- resolved_column = converter_ptr->get_column(src_type, doris_column,
data_type);
- resolved_type = converter_ptr->get_type();
+ converter::ColumnTypeConverter* converter =
_converters[converter_key].get();
+ resolved_column = converter->get_column(src_type, doris_column,
data_type);
+ resolved_type = converter->get_type();
if (resolved_column->is_nullable()) {
SCOPED_RAW_TIMER(&_statistics.decode_null_map_time);
@@ -2158,17 +2121,20 @@ Status OrcReader::_orc_column_to_doris_column(
data_column = resolved_column->assume_mutable();
}
- RETURN_IF_ERROR((_fill_doris_data_column<is_filter>(
+ RETURN_IF_ERROR(_fill_doris_data_column<is_filter>(
col_name, data_column, remove_nullable(resolved_type),
root_node, orc_column_type,
- cvb, num_values)));
- // if (converter_ptr != nullptr) {
+ cvb, num_values));
// resolve schema change
auto converted_column = doris_column->assume_mutable();
- return converter_ptr->convert(resolved_column, converted_column);
- // }
+ return converter->convert(resolved_column, converted_column);
} else {
- reinterpret_cast<ColumnNullable*>(doris_column->assume_mutable().get())
- ->insert_many_defaults(num_values);
+ auto mutable_column = doris_column->assume_mutable();
+ if (mutable_column->is_nullable()) {
+ auto* nullable_column =
static_cast<ColumnNullable*>(mutable_column.get());
+ nullable_column->insert_many_defaults(num_values);
+ } else {
+ mutable_column->insert_many_defaults(num_values);
+ }
}
return Status::OK();
@@ -2272,10 +2238,10 @@ Status OrcReader::_get_next_block_impl(Block* block,
size_t* read_rows, bool* eo
if (orc_col_idx == _colname_to_idx.end()) {
return Status::InternalError("Wrong read column '{}' in orc
file", col_name);
}
- RETURN_IF_ERROR((_orc_column_to_doris_column<true>(
+ RETURN_IF_ERROR(_orc_column_to_doris_column<true>(
col_name, column_ptr, column_type,
_table_info_node_ptr->get_children_node(col_name),
_type_map[file_column_name],
- batch_vec[orc_col_idx->second], _batch->numElements)));
+ batch_vec[orc_col_idx->second], _batch->numElements));
}
RETURN_IF_ERROR(_fill_partition_columns(block, _batch->numElements,
@@ -2330,12 +2296,6 @@ Status OrcReader::_get_next_block_impl(Block* block,
size_t* read_rows, bool* eo
}
}
- //std::cout << "[OrcReader] _column_ids: ";
- //for (const auto& cid : _column_ids) {
- // std::cout << cid << " ";
- //}
- //std::cout << std::endl;
-
if (!_dict_cols_has_converted && !_dict_filter_cols.empty()) {
for (auto& dict_filter_cols : _dict_filter_cols) {
MutableColumnPtr dict_col_ptr = ColumnInt32::create();
@@ -2369,10 +2329,10 @@ Status OrcReader::_get_next_block_impl(Block* block,
size_t* read_rows, bool* eo
if (orc_col_idx == _colname_to_idx.end()) {
return Status::InternalError("Wrong read column '{}' in orc
file", col_name);
}
- RETURN_IF_ERROR((_orc_column_to_doris_column<false>(
+ RETURN_IF_ERROR(_orc_column_to_doris_column<false>(
col_name, column_ptr, column_type,
_table_info_node_ptr->get_children_node(col_name),
_type_map[file_column_name],
- batch_vec[orc_col_idx->second], _batch->numElements)));
+ batch_vec[orc_col_idx->second], _batch->numElements));
}
RETURN_IF_ERROR(_fill_partition_columns(block, _batch->numElements,
@@ -2535,10 +2495,10 @@ Status OrcReader::filter(orc::ColumnVectorBatch& data,
uint16_t* sel, uint16_t s
if (orc_col_idx == _colname_to_idx.end()) {
return Status::InternalError("Wrong read column '{}' in orc file",
table_col_name);
}
- RETURN_IF_ERROR((_orc_column_to_doris_column<false>(
+ RETURN_IF_ERROR(_orc_column_to_doris_column<false>(
table_col_name, column_ptr, column_type,
_table_info_node_ptr->get_children_node(table_col_name),
- _type_map[file_column_name], batch_vec[orc_col_idx->second],
data.numElements)));
+ _type_map[file_column_name], batch_vec[orc_col_idx->second],
data.numElements));
}
RETURN_IF_ERROR(
_fill_partition_columns(block, size,
_lazy_read_ctx.predicate_partition_columns));
diff --git a/be/src/vec/exec/format/orc/vorc_reader.h
b/be/src/vec/exec/format/orc/vorc_reader.h
index 341e40a9ce2..d1313bc4639 100644
--- a/be/src/vec/exec/format/orc/vorc_reader.h
+++ b/be/src/vec/exec/format/orc/vorc_reader.h
@@ -639,7 +639,6 @@ private:
size_t _batch_size;
int64_t _range_start_offset;
int64_t _range_size;
- // const std::string& _ctz;
std::string _ctz;
int32_t _offset_days = 0;
@@ -663,11 +662,8 @@ private:
// file column name to orc type
std::unordered_map<std::string, const orc::Type*> _type_map;
- // Optimized type conversion support
- // column ID to file original type mapping for handling partial column
selection
+ // Column ID to file original type mapping for handling incomplete MAP
type due to column pruning.
std::unordered_map<uint64_t, const orc::Type*> _column_id_to_file_type;
- // Keep reference to file root type for complete type information
- const orc::Type* _file_root_type = nullptr;
std::unique_ptr<ORCFileInputStream> _file_input_stream;
Statistics _statistics;
diff --git a/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp
b/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp
index a7455455198..589570c7ab0 100644
--- a/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp
+++ b/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp
@@ -111,23 +111,6 @@ Status ParquetColumnReader::create(io::FileReaderSPtr
file, FieldSchema* field,
size_t max_buf_size, const
tparquet::OffsetIndex* offset_index,
const std::set<uint64_t>& column_ids,
const std::set<uint64_t>&
filter_column_ids) {
- // if (!column_ids.empty()) {
- // uint64_t field_column_id = field->get_column_id();
- // VLOG_DEBUG << "[ParquetReader] Checking column_id: " <<
field_column_id
- // << ", column_ids.size(): " << column_ids.size();
- // if (column_ids.find(field_column_id) == column_ids.end()) {
- // VLOG_DEBUG << "[ParquetReader] Setting skip_reading=true for
column_id: "
- // << field_column_id;
- // auto skip_reader =
std::make_unique<SkipReadingReader>(row_ranges, ctz, io_ctx, field);
- // skip_reader->_filter_column_ids = filter_column_ids;
- // reader = std::move(skip_reader);
- // return Status::OK();
- // } else {
- // VLOG_DEBUG << "[ParquetReader] Column_id " << field_column_id
- // << " found in column_ids, skip_reading=false";
- // }
- // }
-
if (field->data_type->get_primitive_type() == TYPE_ARRAY) {
std::unique_ptr<ParquetColumnReader> element_reader;
RETURN_IF_ERROR(create(file, &field->children[0], row_group,
row_ranges, ctz, io_ctx,
@@ -177,7 +160,7 @@ Status ParquetColumnReader::create(io::FileReaderSPtr file,
FieldSchema* field,
child_readers.reserve(field->children.size());
int non_skip_reader_idx = -1;
for (int i = 0; i < field->children.size(); ++i) {
- auto& child = field->children[i]; // 去掉 const,保证 &child 类型为
FieldSchema*
+ auto& child = field->children[i];
std::unique_ptr<ParquetColumnReader> child_reader;
if (column_ids.empty() || column_ids.find(child.get_column_id())
!= column_ids.end()) {
RETURN_IF_ERROR(create(file, &child, row_group, row_ranges,
ctz, io_ctx,
@@ -185,32 +168,25 @@ Status ParquetColumnReader::create(io::FileReaderSPtr
file, FieldSchema* field,
filter_column_ids));
child_reader->set_nested_column();
child_readers[child.name] = std::move(child_reader);
- // 记录第一个非 SkippingReader
+ // Record the first non-SkippingReader
if (non_skip_reader_idx == -1) {
non_skip_reader_idx = i;
}
} else {
- // 不在 column_ids 的 child,创建 SkippingReader
auto skip_reader =
std::make_unique<SkipReadingReader>(row_ranges, ctz,
io_ctx, &child);
skip_reader->_filter_column_ids = filter_column_ids;
child_readers[child.name] = std::move(skip_reader);
}
}
- // 如果所有都是 SkipReadingReader,则强制第一个 child 调用 create
+ // If all children are SkipReadingReader, force the first child to
call create
if (non_skip_reader_idx == -1) {
- // for (int i = 0; i < field->children.size(); ++i) {
- // 只处理 SkipReadingReader(即刚才创建的)
- // if
(dynamic_cast<SkipReadingReader*>(child_readers[field->children[i].name].get())
!= nullptr) {
std::unique_ptr<ParquetColumnReader> child_reader;
RETURN_IF_ERROR(create(file, &field->children[0], row_group,
row_ranges, ctz, io_ctx,
child_reader, max_buf_size, nullptr,
column_ids,
filter_column_ids));
child_reader->set_nested_column();
child_readers[field->children[0].name] = std::move(child_reader);
- // break;
- // }
- // }
}
auto struct_reader = StructColumnReader::create_unique(row_ranges,
ctz, io_ctx);
RETURN_IF_ERROR(struct_reader->init(std::move(child_readers), field));
@@ -878,7 +854,6 @@ Status MapColumnReader::read_column_data(
value_rows += loop_rows;
}
DCHECK_EQ(key_rows, value_rows);
- // DCHECK_EQ(key_eof, value_eof);
*read_rows = key_rows;
*eof = key_eof;
@@ -948,10 +923,6 @@ Status StructColumnReader::read_column_data(
continue;
}
auto file_name = root_node->children_file_column_name(doris_name);
- // if (_child_readers.find(file_name) == _child_readers.end()) {
- // missing_column_idxs.push_back(i);
- // continue;
- // }
// Check if this is a SkipReadingReader - we should skip it when
choosing reference column
// because SkipReadingReader doesn't know the actual data size in
nested context
@@ -978,13 +949,6 @@ Status StructColumnReader::read_column_data(
batch_size, &field_rows, &field_eof, is_dict_filter));
*read_rows = field_rows;
*eof = field_eof;
- // Debug: print the first non-missing child column read info
- //std::cout << "[ParquetReader] struct '" << _field_schema->name
<< "' first non-missing child='"
- // << file_name << "' doris_name='" << doris_name << "'
field_rows=" << field_rows
- // << " field_eof=" << field_eof << " doris_field_size="
<< doris_field->size()
- // << " child_rep_levels=" <<
_child_readers[file_name]->get_rep_level().size()
- // << " child_def_levels=" <<
_child_readers[file_name]->get_def_level().size()
- // << std::endl;
/*
* Considering the issue in the `_read_nested_column` function
where data may span across pages, leading
* to missing definition and repetition levels, when filling the
null_map of the struct later, it is
@@ -1001,12 +965,6 @@ Status StructColumnReader::read_column_data(
doris_field, doris_type,
root_node->get_children_node(doris_name),
filter_map, *read_rows - field_rows, &loop_rows,
&field_eof,
is_dict_filter));
- // Debug: print each loop iteration for non-first child columns
- //std::cout << "[ParquetReader] struct '" <<
_field_schema->name << "' non-first child='"
- // << file_name << "' doris_name='" << doris_name <<
"' loop_rows=" << loop_rows
- // << " field_rows=" << field_rows << " target_rows="
<< *read_rows
- // << " field_eof=" << field_eof << "
doris_field_size=" << doris_field->size()
- // << std::endl;
field_rows += loop_rows;
}
DCHECK_EQ(*read_rows, field_rows);
@@ -1014,20 +972,11 @@ Status StructColumnReader::read_column_data(
}
}
- // After scanning children, print summary of missing/skip info
- //std::cout << "[ParquetReader] struct '" << _field_schema->name << "'
summary: not_missing_column_id="
- // << not_missing_column_id << " missing_columns_count=" <<
missing_column_idxs.size()
- // << " skip_reading_count=" << skip_reading_column_idxs.size()
<< std::endl;
-
int64_t missing_column_sz = -1;
if (not_missing_column_id == -1) {
// All queried columns are missing in the file (e.g., all added after
schema change)
- // We need to pick a column from _field_schema that exists in the file
to get RL/DL
- //std::cout << "[ParquetReader] All queried columns missing for struct
'" << _field_schema->name
- // << "', searching for reference column in _field_schema" <<
std::endl;
-
- // Find a column from _field_schema children that exists in the file
for RL/DL reference
+ // We need to pick a column from _field_schema children that exists in
the file for RL/DL reference
std::string reference_file_column_name;
std::unique_ptr<ParquetColumnReader>* reference_reader = nullptr;
@@ -1039,8 +988,6 @@ Status StructColumnReader::read_column_data(
if (!is_skip_reader) {
reference_file_column_name = child.name;
reference_reader = &(it->second);
- //std::cout << "[ParquetReader] Found reference column: "
<< reference_file_column_name
- // << " for struct '" << _field_schema->name <<
"'" << std::endl;
break;
}
}
@@ -1087,10 +1034,6 @@ Status StructColumnReader::read_column_data(
// Store this reference column name for
get_rep_level/get_def_level to use
_read_column_names.emplace_back(reference_file_column_name);
- //std::cout << "[ParquetReader] Read " << field_rows << " rows
from reference column '"
- // << reference_file_column_name << "',
temp_column.size=" << temp_column->size()
- // << " for struct '" << _field_schema->name << "'" <<
std::endl;
-
missing_column_sz = temp_column->size() -
not_missing_orig_column_size;
} else {
return Status::Corruption(
@@ -1139,8 +1082,6 @@ Status StructColumnReader::read_column_data(
}
if (null_map_ptr != nullptr) {
- //std::cout << "[ParquetReader] struct '" << _field_schema->name << "'
before fill_struct_null_map: rep_levels="
- // << this->get_rep_level().size() << " def_levels=" <<
this->get_def_level().size() << std::endl;
fill_struct_null_map(_field_schema, *null_map_ptr,
this->get_rep_level(),
this->get_def_level());
}
diff --git a/be/src/vec/exec/format/parquet/vparquet_column_reader.h
b/be/src/vec/exec/format/parquet/vparquet_column_reader.h
index 7b37f2e0e6c..893b3bc784a 100644
--- a/be/src/vec/exec/format/parquet/vparquet_column_reader.h
+++ b/be/src/vec/exec/format/parquet/vparquet_column_reader.h
@@ -412,8 +412,6 @@ public:
<< _field_schema->name
<< ". This indicates the SkipReadingReader was incorrectly
used as a reference "
"column.";
- // static std::vector<level_t> empty_levels;
- // return empty_levels;
__builtin_unreachable();
}
@@ -422,8 +420,6 @@ public:
<< _field_schema->name
<< ". This indicates the SkipReadingReader was incorrectly
used as a reference "
"column.";
- // static std::vector<level_t> empty_levels;
- // return empty_levels;
__builtin_unreachable();
}
diff --git a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp
b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp
index 19ddb8d1b99..947dcee3408 100644
--- a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp
+++ b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp
@@ -98,13 +98,7 @@ RowGroupReader::RowGroupReader(io::FileReaderSPtr
file_reader,
_state(state),
_obj_pool(new ObjectPool()),
_column_ids(column_ids),
- _filter_column_ids(filter_column_ids) {
- //std::cout << "[RowGroupReader] _column_ids: ";
- //for (const auto& col_id : _column_ids) {
- // std::cout << col_id << " ";
- //}
- //std::cout << std::endl;
-}
+ _filter_column_ids(filter_column_ids) {}
RowGroupReader::~RowGroupReader() {
_column_readers.clear();
@@ -438,14 +432,13 @@ Status RowGroupReader::_read_column_data(Block* block,
RETURN_IF_ERROR(_column_readers[read_col_name]->read_column_data(
column_ptr, column_type,
_table_info_node_ptr->get_children_node(read_col_name),
filter_map, batch_size - col_read_rows, &loop_rows,
&col_eof, is_dict_filter));
- // Debug: print loop_rows for this iteration
- //std::cout << "[RowGroupReader] column '" << read_col_name << "'
loop_rows=" << loop_rows
- // << " col_read_rows_so_far=" << col_read_rows <<
std::endl;
+ VLOG_DEBUG << "[RowGroupReader] column '" << read_col_name
+ << "' loop_rows=" << loop_rows << "
col_read_rows_so_far=" << col_read_rows
+ << std::endl;
col_read_rows += loop_rows;
}
- // Debug: print rows read for this column
- //std::cout << "[RowGroupReader] column '" << read_col_name << "'
read_rows=" << col_read_rows
- // << std::endl;
+ VLOG_DEBUG << "[RowGroupReader] column '" << read_col_name
+ << "' read_rows=" << col_read_rows << std::endl;
if (batch_read_rows > 0 && batch_read_rows != col_read_rows) {
LOG(WARNING) << "[RowGroupReader] Mismatched read rows among
parquet columns. "
"previous_batch_read_rows="
@@ -741,10 +734,6 @@ Status RowGroupReader::_fill_missing_columns(
block->erase(result_column_id);
}
}
-
- // auto mutable_column =
block->get_by_name(kv.first).column->assume_mutable();
- // auto* nullable_column =
assert_cast<vectorized::ColumnNullable*>(mutable_column.get());
- // nullable_column->insert_many_defaults(rows);
}
return Status::OK();
}
@@ -1135,7 +1124,6 @@ ParquetColumnReader::Statistics
RowGroupReader::statistics() {
}
return st;
}
-
#include "common/compile_check_end.h"
} // namespace doris::vectorized
diff --git
a/be/test/vec/exec/format/table/hive/hive_reader_create_column_ids_test.cpp
b/be/test/vec/exec/format/table/hive/hive_reader_create_column_ids_test.cpp
index d46acbdc134..eea63192857 100644
--- a/be/test/vec/exec/format/table/hive/hive_reader_create_column_ids_test.cpp
+++ b/be/test/vec/exec/format/table/hive/hive_reader_create_column_ids_test.cpp
@@ -60,7 +60,6 @@ struct ColumnAccessPathConfig {
std::string column_name;
std::vector<std::vector<std::string>> all_column_paths; // For
all_column_access_paths
std::vector<std::vector<std::string>> predicate_paths; // For
predicate_column_access_paths
- bool use_name_paths = true; // true=use field names, false=use indices
};
// ORC column IDs are assigned in a tree-increment order, root is 0 and
children increment sequentially.
@@ -738,12 +737,11 @@ protected:
}
// Helper function: Run Parquet test with different column ID extraction
methods
- void run_parquet_test_with_method(const std::vector<std::string>&
table_column_names,
- const
std::vector<ColumnAccessPathConfig>& access_configs,
- const std::set<uint64_t>&
expected_column_ids,
- const std::set<uint64_t>&
expected_filter_column_ids,
- bool use_top_level_method = false,
- bool should_skip_assertion = false) {
+ void run_parquet_test(const std::vector<std::string>& table_column_names,
+ const std::vector<ColumnAccessPathConfig>&
access_configs,
+ const std::set<uint64_t>& expected_column_ids,
+ const std::set<uint64_t>& expected_filter_column_ids,
+ bool use_top_level_method = false, bool
should_skip_assertion = false) {
std::string test_file =
"./be/test/exec/test_data/nested_user_profiles_parquet/"
"part-00000-64a7a390-1a03-4efc-ab51-557e9369a1f9-c000.snappy.parquet";
@@ -809,12 +807,11 @@ protected:
}
// Helper function: Run ORC test with different column ID extraction
methods
- void run_orc_test_with_method(const std::vector<std::string>&
table_column_names,
- const std::vector<ColumnAccessPathConfig>&
access_configs,
- const std::set<uint64_t>&
expected_column_ids,
- const std::set<uint64_t>&
expected_filter_column_ids,
- bool use_top_level_method = false,
- bool should_skip_assertion = false) {
+ void run_orc_test(const std::vector<std::string>& table_column_names,
+ const std::vector<ColumnAccessPathConfig>&
access_configs,
+ const std::set<uint64_t>& expected_column_ids,
+ const std::set<uint64_t>& expected_filter_column_ids,
+ bool use_top_level_method = false, bool
should_skip_assertion = false) {
std::string test_file =
"./be/test/exec/test_data/nested_user_profiles_orc/"
"part-00000-62614f23-05d1-4043-a533-b155ef52b720-c000.snappy.orc";
@@ -881,12 +878,8 @@ protected:
};
TEST_F(HiveReaderCreateColumnIdsTest, test_create_column_ids_1) {
- // Properties:
-
- // Configure access paths for profile column
ColumnAccessPathConfig access_config;
access_config.column_name = "profile";
- access_config.use_name_paths = true;
access_config.all_column_paths = {{"profile", "address", "coordinates",
"lat"},
{"profile", "address", "coordinates",
"lng"},
@@ -900,21 +893,20 @@ TEST_F(HiveReaderCreateColumnIdsTest,
test_create_column_ids_1) {
std::set<uint64_t> expected_column_ids = {2, 3, 4, 7, 8, 9, 10, 11, 15,
16, 18};
std::set<uint64_t> expected_filter_column_ids = {3, 4, 7, 8, 10, 11};
- run_parquet_test_with_method(table_column_names, {access_config},
expected_column_ids,
- expected_filter_column_ids);
- run_parquet_test_with_method(table_column_names, {access_config},
expected_column_ids,
- expected_filter_column_ids, true);
+ run_parquet_test(table_column_names, {access_config}, expected_column_ids,
+ expected_filter_column_ids);
+ run_parquet_test(table_column_names, {access_config}, expected_column_ids,
+ expected_filter_column_ids, true);
- run_orc_test_with_method(table_column_names, {access_config},
expected_column_ids,
- expected_filter_column_ids);
- run_orc_test_with_method(table_column_names, {access_config},
expected_column_ids,
- expected_filter_column_ids, true);
+ run_orc_test(table_column_names, {access_config}, expected_column_ids,
+ expected_filter_column_ids);
+ run_orc_test(table_column_names, {access_config}, expected_column_ids,
+ expected_filter_column_ids, true);
}
TEST_F(HiveReaderCreateColumnIdsTest, test_create_column_ids_2) {
- // Properties:
- // iceberg.schema:
{"type":"struct","schema-id":0,"fields":[{"id":1,"name":"id","required":true,"type":"long"},{"id":2,"name":"name","required":true,"type":"string"},{"id":3,"name":"profile","required":true,"type":{"type":"struct","fields":[{"id":4,"name":"address","required":false,"type":{"type":"struct","fields":[{"id":7,"name":"street","required":false,"type":"string"},{"id":8,"name":"city","required":false,"type":"string"},{"id":9,"name":"coordinates","required":false,"type":
[...]
- // ORC column IDs are assigned in a tree-increment order, root is 0 and
children increment sequentially.
+ // ORC column IDs are assigned in a tree-like incremental manner: the root
node is 0, and child nodes increase sequentially.
+ // Currently, Parquet uses a similar design.
// 0: struct (table/root)
// 1: id (int64)
// 2: name (string)
@@ -935,36 +927,32 @@ TEST_F(HiveReaderCreateColumnIdsTest,
test_create_column_ids_2) {
// 17: name (string)
// 18: level (int32)
- // 配置profile列的访问路径
ColumnAccessPathConfig access_config;
access_config.column_name = "profile";
- access_config.use_name_paths = true;
access_config.all_column_paths = {{"profile"}};
access_config.predicate_paths = {{"profile", "address", "coordinates",
"lat"},
{"profile", "contact", "email"}};
std::vector<std::string> table_column_names = {"name", "profile"};
- // column_ids should contain all necessary column IDs (set automatically
deduplicates)
std::set<uint64_t> expected_column_ids = {2, 3, 4, 5, 6, 7, 8, 9,
10,
11, 12, 13, 14, 15, 16, 17, 18};
std::set<uint64_t> expected_filter_column_ids = {3, 4, 7, 8, 10, 11};
- run_parquet_test_with_method(table_column_names, {access_config},
expected_column_ids,
- expected_filter_column_ids);
- run_parquet_test_with_method(table_column_names, {access_config},
expected_column_ids,
- expected_filter_column_ids, true);
+ run_parquet_test(table_column_names, {access_config}, expected_column_ids,
+ expected_filter_column_ids);
+ run_parquet_test(table_column_names, {access_config}, expected_column_ids,
+ expected_filter_column_ids, true);
- run_orc_test_with_method(table_column_names, {access_config},
expected_column_ids,
- expected_filter_column_ids);
- run_orc_test_with_method(table_column_names, {access_config},
expected_column_ids,
- expected_filter_column_ids, true);
+ run_orc_test(table_column_names, {access_config}, expected_column_ids,
+ expected_filter_column_ids);
+ run_orc_test(table_column_names, {access_config}, expected_column_ids,
+ expected_filter_column_ids, true);
}
TEST_F(HiveReaderCreateColumnIdsTest, test_create_column_ids_3) {
- // Properties:
- // iceberg.schema:
{"type":"struct","schema-id":0,"fields":[{"id":1,"name":"id","required":true,"type":"long"},{"id":2,"name":"name","required":true,"type":"string"},{"id":3,"name":"profile","required":true,"type":{"type":"struct","fields":[{"id":4,"name":"address","required":false,"type":{"type":"struct","fields":[{"id":7,"name":"street","required":false,"type":"string"},{"id":8,"name":"city","required":false,"type":"string"},{"id":9,"name":"coordinates","required":false,"type":
[...]
- // ORC column IDs are assigned in a tree-increment order, root is 0 and
children increment sequentially.
+ // ORC column IDs are assigned in a tree-like incremental manner: the root
node is 0, and child nodes increase sequentially.
+ // Currently, Parquet uses a similar design.
// 0: struct (table/root)
// 1: id (int64)
// 2: name (string)
@@ -985,58 +973,54 @@ TEST_F(HiveReaderCreateColumnIdsTest,
test_create_column_ids_3) {
// 17: name (string)
// 18: level (int32)
- // 配置profile列的访问路径
ColumnAccessPathConfig access_config;
access_config.column_name = "profile";
- access_config.use_name_paths = true;
access_config.all_column_paths = {{"profile", "contact"}, {"profile",
"address"}};
access_config.predicate_paths = {{"profile", "address", "coordinates"},
{"profile", "contact", "email"}};
std::vector<std::string> table_column_names = {"name", "profile"};
- // column_ids should contain all necessary column IDs (set automatically
deduplicates)
std::set<uint64_t> expected_column_ids = {2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14};
std::set<uint64_t> expected_filter_column_ids = {3, 4, 7, 8, 9, 10, 11};
- run_parquet_test_with_method(table_column_names, {access_config},
expected_column_ids,
- expected_filter_column_ids);
- run_parquet_test_with_method(table_column_names, {access_config},
expected_column_ids,
- expected_filter_column_ids, true);
+ run_parquet_test(table_column_names, {access_config}, expected_column_ids,
+ expected_filter_column_ids);
+ run_parquet_test(table_column_names, {access_config}, expected_column_ids,
+ expected_filter_column_ids, true);
- run_orc_test_with_method(table_column_names, {access_config},
expected_column_ids,
- expected_filter_column_ids);
- run_orc_test_with_method(table_column_names, {access_config},
expected_column_ids,
- expected_filter_column_ids, true);
+ run_orc_test(table_column_names, {access_config}, expected_column_ids,
+ expected_filter_column_ids);
+ run_orc_test(table_column_names, {access_config}, expected_column_ids,
+ expected_filter_column_ids, true);
}
TEST_F(HiveReaderCreateColumnIdsTest, test_create_column_ids_4) {
- // 配置profile列的访问路径
ColumnAccessPathConfig access_config;
access_config.column_name = "profile";
- access_config.use_name_paths = true;
access_config.all_column_paths = {};
access_config.predicate_paths = {};
std::vector<std::string> table_column_names = {"name", "profile"};
- // column_ids should contain all necessary column IDs (set automatically
deduplicates)
std::set<uint64_t> expected_column_ids = {2, 3, 4, 5, 6, 7, 8, 9,
10,
11, 12, 13, 14, 15, 16, 17, 18};
std::set<uint64_t> expected_filter_column_ids = {};
- run_parquet_test_with_method(table_column_names, {access_config},
expected_column_ids,
- expected_filter_column_ids);
- run_parquet_test_with_method(table_column_names, {access_config},
expected_column_ids,
- expected_filter_column_ids, true);
+ run_parquet_test(table_column_names, {access_config}, expected_column_ids,
+ expected_filter_column_ids);
+ run_parquet_test(table_column_names, {access_config}, expected_column_ids,
+ expected_filter_column_ids, true);
- run_orc_test_with_method(table_column_names, {access_config},
expected_column_ids,
- expected_filter_column_ids);
- run_orc_test_with_method(table_column_names, {access_config},
expected_column_ids,
- expected_filter_column_ids, true);
+ run_orc_test(table_column_names, {access_config}, expected_column_ids,
+ expected_filter_column_ids);
+ run_orc_test(table_column_names, {access_config}, expected_column_ids,
+ expected_filter_column_ids, true);
}
TEST_F(HiveReaderCreateColumnIdsTest, test_create_column_ids_5) {
+ // ORC column IDs are assigned in a tree-like incremental manner: the root
node is 0, and child nodes increase sequentially.
+ // Currently, Parquet uses a similar design.
// 19: tags (array)
// 20: element (string)
// 21: friends (array)
@@ -1058,7 +1042,6 @@ TEST_F(HiveReaderCreateColumnIdsTest,
test_create_column_ids_5) {
// Configure access paths for friends column
ColumnAccessPathConfig access_config;
access_config.column_name = "friends";
- access_config.use_name_paths = true;
access_config.all_column_paths = {{"friends", "*", "nickname"},
{"friends", "*",
"friendship_level"}};
@@ -1072,7 +1055,6 @@ TEST_F(HiveReaderCreateColumnIdsTest,
test_create_column_ids_5) {
// Configure access paths for recent_activity column
ColumnAccessPathConfig access_config;
access_config.column_name = "recent_activity";
- access_config.use_name_paths = true;
access_config.all_column_paths = {{"recent_activity", "*", "action"},
{"recent_activity", "*", "details",
"*", "value"}};
@@ -1083,22 +1065,23 @@ TEST_F(HiveReaderCreateColumnIdsTest,
test_create_column_ids_5) {
}
std::vector<std::string> table_column_names = {"name", "friends",
"recent_activity"};
- // column_ids should contain all necessary column IDs (set automatically
deduplicates)
std::set<uint64_t> expected_column_ids = {2, 21, 22, 24, 25, 26, 27, 28,
29, 30, 32};
std::set<uint64_t> expected_filter_column_ids = {21, 22, 24, 26, 27, 28};
- run_parquet_test_with_method(table_column_names, access_configs,
expected_column_ids,
- expected_filter_column_ids);
- run_parquet_test_with_method(table_column_names, access_configs,
expected_column_ids,
- expected_filter_column_ids, true);
+ run_parquet_test(table_column_names, access_configs, expected_column_ids,
+ expected_filter_column_ids);
+ run_parquet_test(table_column_names, access_configs, expected_column_ids,
+ expected_filter_column_ids, true);
- run_orc_test_with_method(table_column_names, access_configs,
expected_column_ids,
- expected_filter_column_ids);
- run_orc_test_with_method(table_column_names, access_configs,
expected_column_ids,
- expected_filter_column_ids, true);
+ run_orc_test(table_column_names, access_configs, expected_column_ids,
+ expected_filter_column_ids);
+ run_orc_test(table_column_names, access_configs, expected_column_ids,
+ expected_filter_column_ids, true);
}
TEST_F(HiveReaderCreateColumnIdsTest, test_create_column_ids_6) {
+ // ORC column IDs are assigned in a tree-like incremental manner: the root
node is 0, and child nodes increase sequentially.
+ // Currently, Parquet uses a similar design.
// 33: attributes (map)
// 34: key (string)
// 35: value (string)
@@ -1165,7 +1148,6 @@ TEST_F(HiveReaderCreateColumnIdsTest,
test_create_column_ids_6) {
// Configure access paths for complex_attributes column
ColumnAccessPathConfig access_config;
access_config.column_name = "complex_attributes";
- access_config.use_name_paths = true;
access_config.all_column_paths = {
{"complex_attributes", "*", "metadata", "version"},
@@ -1183,30 +1165,28 @@ TEST_F(HiveReaderCreateColumnIdsTest,
test_create_column_ids_6) {
{
std::vector<std::string> table_column_names = {"name",
"complex_attributes"};
// parquet values should access keys
- // column_ids should contain all necessary column IDs (set
automatically deduplicates)
std::set<uint64_t> expected_column_ids = {2, 36, 37, 38, 39, 40, 44,
45, 48, 49, 52, 53,
54, 61, 62, 63, 64, 65, 66,
67, 68, 69, 70, 71,
72, 73, 74, 75, 76, 77, 79,
80, 82, 83};
std::set<uint64_t> expected_filter_column_ids = {36, 37, 38, 39, 40};
- run_parquet_test_with_method(table_column_names, access_configs,
expected_column_ids,
- expected_filter_column_ids);
- run_parquet_test_with_method(table_column_names, access_configs,
expected_column_ids,
- expected_filter_column_ids, true);
+ run_parquet_test(table_column_names, access_configs,
expected_column_ids,
+ expected_filter_column_ids);
+ run_parquet_test(table_column_names, access_configs,
expected_column_ids,
+ expected_filter_column_ids, true);
}
{
std::vector<std::string> table_column_names = {"name",
"complex_attributes"};
// orc values should access keys because need to deduplicate by keys
- // column_ids should contain all necessary column IDs (set
automatically deduplicates)
std::set<uint64_t> expected_column_ids = {2, 36, 37, 38, 39, 40, 44,
45, 48, 49, 52, 53,
54, 61, 62, 63, 64, 65, 66,
67, 68, 69, 70, 71,
72, 73, 74, 75, 76, 77, 79,
80, 82, 83};
std::set<uint64_t> expected_filter_column_ids = {36, 37, 38, 39, 40};
- run_orc_test_with_method(table_column_names, access_configs,
expected_column_ids,
- expected_filter_column_ids);
- run_orc_test_with_method(table_column_names, access_configs,
expected_column_ids,
- expected_filter_column_ids, true);
+ run_orc_test(table_column_names, access_configs, expected_column_ids,
+ expected_filter_column_ids);
+ run_orc_test(table_column_names, access_configs, expected_column_ids,
+ expected_filter_column_ids, true);
}
}
diff --git a/be/test/vec/exec/format/table/hive/hive_reader_test.cpp
b/be/test/vec/exec/format/table/hive/hive_reader_test.cpp
index 16bde5c7bae..ca2cada932f 100644
--- a/be/test/vec/exec/format/table/hive/hive_reader_test.cpp
+++ b/be/test/vec/exec/format/table/hive/hive_reader_test.cpp
@@ -462,7 +462,7 @@ protected:
};
// Test reading real Hive Parquet file using HiveTableReader
-TEST_F(HiveReaderTest, ReadHiveParquetFile) {
+TEST_F(HiveReaderTest, read_hive_parquet_file) {
// Read only: name, profile.address.coordinates.lat,
profile.address.coordinates.lng, profile.contact.email
// Setup table descriptor for test columns with new schema:
/**
@@ -604,7 +604,7 @@ TEST_F(HiveReaderTest, ReadHiveParquetFile) {
}
// Test reading real Hive Orc file using HiveTableReader
-TEST_F(HiveReaderTest, ReadHiveOrcFile) {
+TEST_F(HiveReaderTest, read_hive_orc_file) {
// Read only: name, profile.address.coordinates.lat,
profile.address.coordinates.lng, profile.contact.email
// Setup table descriptor for test columns with new schema:
/**
diff --git
a/be/test/vec/exec/format/table/iceberg/iceberg_reader_create_column_ids_test.cpp
b/be/test/vec/exec/format/table/iceberg/iceberg_reader_create_column_ids_test.cpp
index b1152f159bd..d897f0e4762 100644
---
a/be/test/vec/exec/format/table/iceberg/iceberg_reader_create_column_ids_test.cpp
+++
b/be/test/vec/exec/format/table/iceberg/iceberg_reader_create_column_ids_test.cpp
@@ -60,10 +60,9 @@ struct ColumnAccessPathConfig {
std::string column_name;
std::vector<std::vector<std::string>> all_column_paths; // For
all_column_access_paths
std::vector<std::vector<std::string>> predicate_paths; // For
predicate_column_access_paths
- bool use_name_paths = true; // true=use field names, false=use indices
};
-// Iceberg 列ID分配 (基于 Protobuf Schema 定义)
+// Iceberg column ID assignment (based on Protobuf schema definition)
// 1: id (int64)
// 2: name (string)
// 3: profile (struct)
@@ -770,12 +769,11 @@ protected:
}
// Helper function: run Parquet test with different column ID extraction
methods
- void run_parquet_test_with_method(const std::vector<std::string>&
table_column_names,
- const
std::vector<ColumnAccessPathConfig>& access_configs,
- const std::set<uint64_t>&
expected_column_ids,
- const std::set<uint64_t>&
expected_filter_column_ids,
- bool use_top_level_method = false,
- bool should_skip_assertion = false) {
+ void run_parquet_test(const std::vector<std::string>& table_column_names,
+ const std::vector<ColumnAccessPathConfig>&
access_configs,
+ const std::set<uint64_t>& expected_column_ids,
+ const std::set<uint64_t>& expected_filter_column_ids,
+ bool use_top_level_method = false, bool
should_skip_assertion = false) {
std::string test_file =
"./be/test/exec/test_data/nested_user_profiles_iceberg_parquet/data/"
"00000-9-a7e0135f-d581-40e4-8d56-a929aded99e4-0-00001.parquet";
@@ -841,12 +839,11 @@ protected:
}
// Helper function: run Orc test with different column ID extraction
methods
- void run_orc_test_with_method(const std::vector<std::string>&
table_column_names,
- const std::vector<ColumnAccessPathConfig>&
access_configs,
- const std::set<uint64_t>&
expected_column_ids,
- const std::set<uint64_t>&
expected_filter_column_ids,
- bool use_top_level_method = false,
- bool should_skip_assertion = false) {
+ void run_orc_test(const std::vector<std::string>& table_column_names,
+ const std::vector<ColumnAccessPathConfig>&
access_configs,
+ const std::set<uint64_t>& expected_column_ids,
+ const std::set<uint64_t>& expected_filter_column_ids,
+ bool use_top_level_method = false, bool
should_skip_assertion = false) {
std::string test_file =
"./be/test/exec/test_data/nested_user_profiles_iceberg_orc/data/"
"00000-8-5a144c37-16a4-47c6-96db-0007175b5c90-0-00001.orc";
@@ -913,12 +910,8 @@ protected:
};
TEST_F(IcebergReaderCreateColumnIdsTest, test_create_column_ids_1) {
- // Properties:
-
- // Configure access paths for the profile column
ColumnAccessPathConfig access_config;
access_config.column_name = "profile";
- access_config.use_name_paths = true;
access_config.all_column_paths = {{"3", "9", "14", "15"},
{"3", "9", "14", "16"},
@@ -931,16 +924,15 @@ TEST_F(IcebergReaderCreateColumnIdsTest,
test_create_column_ids_1) {
std::set<uint64_t> expected_column_ids = {2, 3, 4, 7, 8, 9, 10, 11, 15,
16, 18};
std::set<uint64_t> expected_filter_column_ids = {3, 4, 7, 8, 10, 11};
- run_parquet_test_with_method(table_column_names, {access_config},
expected_column_ids,
- expected_filter_column_ids);
- run_orc_test_with_method(table_column_names, {access_config},
expected_column_ids,
- expected_filter_column_ids);
+ run_parquet_test(table_column_names, {access_config}, expected_column_ids,
+ expected_filter_column_ids);
+ run_orc_test(table_column_names, {access_config}, expected_column_ids,
+ expected_filter_column_ids);
}
TEST_F(IcebergReaderCreateColumnIdsTest, test_create_column_ids_2) {
- // Properties:
- // iceberg.schema:
{"type":"struct","schema-id":0,"fields":[{"id":1,"name":"id","required":true,"type":"long"},{"id":2,"name":"name","required":true,"type":"string"},{"id":3,"name":"profile","required":true,"type":{"type":"struct","fields":[{"id":4,"name":"address","required":false,"type":{"type":"struct","fields":[{"id":7,"name":"street","required":false,"type":"string"},{"id":8,"name":"city","required":false,"type":"string"},{"id":9,"name":"coordinates","required":false,"type":
[...]
- // ORC 列ID分配为树型递增,根节点为0,子节点依次递增。
+ // ORC column IDs are assigned in a tree-like incremental manner: the root
node is 0, and child nodes increase sequentially.
+ // Currently, Parquet uses a similar design.
// 0: struct (table/root)
// 1: id (int64)
// 2: name (string)
@@ -961,30 +953,26 @@ TEST_F(IcebergReaderCreateColumnIdsTest,
test_create_column_ids_2) {
// 17: name (string)
// 18: level (int32)
- // 配置profile列的访问路径
ColumnAccessPathConfig access_config;
access_config.column_name = "profile";
- access_config.use_name_paths = true;
access_config.all_column_paths = {{"3"}};
access_config.predicate_paths = {{"3", "9", "14", "15"}, {"3", "10",
"17"}};
std::vector<std::string> table_column_names = {"name", "profile"};
- // column_ids should contain all necessary column IDs (set automatically
deduplicates)
std::set<uint64_t> expected_column_ids = {2, 3, 4, 5, 6, 7, 8, 9,
10,
11, 12, 13, 14, 15, 16, 17, 18};
std::set<uint64_t> expected_filter_column_ids = {3, 4, 7, 8, 10, 11};
- run_parquet_test_with_method(table_column_names, {access_config},
expected_column_ids,
- expected_filter_column_ids);
- run_orc_test_with_method(table_column_names, {access_config},
expected_column_ids,
- expected_filter_column_ids);
+ run_parquet_test(table_column_names, {access_config}, expected_column_ids,
+ expected_filter_column_ids);
+ run_orc_test(table_column_names, {access_config}, expected_column_ids,
+ expected_filter_column_ids);
}
TEST_F(IcebergReaderCreateColumnIdsTest, test_create_column_ids_3) {
- // Properties:
- // iceberg.schema:
{"type":"struct","schema-id":0,"fields":[{"id":1,"name":"id","required":true,"type":"long"},{"id":2,"name":"name","required":true,"type":"string"},{"id":3,"name":"profile","required":true,"type":{"type":"struct","fields":[{"id":4,"name":"address","required":false,"type":{"type":"struct","fields":[{"id":7,"name":"street","required":false,"type":"string"},{"id":8,"name":"city","required":false,"type":"string"},{"id":9,"name":"coordinates","required":false,"type":
[...]
- // ORC 列ID分配为树型递增,根节点为0,子节点依次递增。
+ // ORC column IDs are assigned in a tree-like incremental manner: the root
node is 0, and child nodes increase sequentially.
+ // Currently, Parquet uses a similar design.
// 0: struct (table/root)
// 1: id (int64)
// 2: name (string)
@@ -1005,51 +993,43 @@ TEST_F(IcebergReaderCreateColumnIdsTest,
test_create_column_ids_3) {
// 17: name (string)
// 18: level (int32)
- // 配置profile列的访问路径
ColumnAccessPathConfig access_config;
access_config.column_name = "profile";
- access_config.use_name_paths = true;
access_config.all_column_paths = {{"3", "10"}, {"3", "9"}};
access_config.predicate_paths = {{"3", "9", "14"}, {"3", "10", "17"}};
- // access_config.all_column_paths = {{"profile", "contact"}, {"profile",
"address"}};
- // access_config.predicate_paths = {{"profile", "address", "coordinates"},
- // {"profile", "contact", "email"}};
-
std::vector<std::string> table_column_names = {"name", "profile"};
- // column_ids should contain all necessary column IDs (set automatically
deduplicates)
std::set<uint64_t> expected_column_ids = {2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14};
std::set<uint64_t> expected_filter_column_ids = {3, 4, 7, 8, 9, 10, 11};
- run_parquet_test_with_method(table_column_names, {access_config},
expected_column_ids,
- expected_filter_column_ids);
- run_orc_test_with_method(table_column_names, {access_config},
expected_column_ids,
- expected_filter_column_ids);
+ run_parquet_test(table_column_names, {access_config}, expected_column_ids,
+ expected_filter_column_ids);
+ run_orc_test(table_column_names, {access_config}, expected_column_ids,
+ expected_filter_column_ids);
}
TEST_F(IcebergReaderCreateColumnIdsTest, test_create_column_ids_4) {
- // 配置profile列的访问路径
ColumnAccessPathConfig access_config;
access_config.column_name = "profile";
- access_config.use_name_paths = true;
access_config.all_column_paths = {};
access_config.predicate_paths = {};
std::vector<std::string> table_column_names = {"name", "profile"};
- // column_ids should contain all necessary column IDs (set automatically
deduplicates)
std::set<uint64_t> expected_column_ids = {2, 3, 4, 5, 6, 7, 8, 9,
10,
11, 12, 13, 14, 15, 16, 17, 18};
std::set<uint64_t> expected_filter_column_ids = {};
- run_parquet_test_with_method(table_column_names, {access_config},
expected_column_ids,
- expected_filter_column_ids);
- run_orc_test_with_method(table_column_names, {access_config},
expected_column_ids,
- expected_filter_column_ids);
+ run_parquet_test(table_column_names, {access_config}, expected_column_ids,
+ expected_filter_column_ids);
+ run_orc_test(table_column_names, {access_config}, expected_column_ids,
+ expected_filter_column_ids);
}
TEST_F(IcebergReaderCreateColumnIdsTest, test_create_column_ids_5) {
+ // ORC column IDs are assigned in a tree-like incremental manner: the root
node is 0, and child nodes increase sequentially.
+ // Currently, Parquet uses a similar design.
// 19: tags (array)
// 20: element (string)
// 21: friends (array)
@@ -1068,10 +1048,8 @@ TEST_F(IcebergReaderCreateColumnIdsTest,
test_create_column_ids_5) {
std::vector<ColumnAccessPathConfig> access_configs;
{
- // 配置friends列的访问路径
ColumnAccessPathConfig access_config;
access_config.column_name = "friends";
- access_config.use_name_paths = true;
access_config.all_column_paths = {{"5", "*", "27"}, {"5", "*", "28"}};
access_config.predicate_paths = {
@@ -1081,10 +1059,8 @@ TEST_F(IcebergReaderCreateColumnIdsTest,
test_create_column_ids_5) {
}
{
- // 配置recent_activity列的访问路径
ColumnAccessPathConfig access_config;
access_config.column_name = "recent_activity";
- access_config.use_name_paths = true;
access_config.all_column_paths = {{"6", "*", "30"}, {"6", "*", "31",
"*", "34"}};
access_config.predicate_paths = {
@@ -1094,17 +1070,18 @@ TEST_F(IcebergReaderCreateColumnIdsTest,
test_create_column_ids_5) {
}
std::vector<std::string> table_column_names = {"name", "friends",
"recent_activity"};
- // column_ids should contain all necessary column IDs (set automatically
deduplicates)
std::set<uint64_t> expected_column_ids = {2, 21, 22, 24, 25, 26, 27, 28,
29, 30, 32};
std::set<uint64_t> expected_filter_column_ids = {21, 22, 24, 26, 27, 28};
- run_parquet_test_with_method(table_column_names, access_configs,
expected_column_ids,
- expected_filter_column_ids);
- run_orc_test_with_method(table_column_names, access_configs,
expected_column_ids,
- expected_filter_column_ids);
+ run_parquet_test(table_column_names, access_configs, expected_column_ids,
+ expected_filter_column_ids);
+ run_orc_test(table_column_names, access_configs, expected_column_ids,
+ expected_filter_column_ids);
}
TEST_F(IcebergReaderCreateColumnIdsTest, test_create_column_ids_6) {
+ // ORC column IDs are assigned in a tree-like incremental manner: the root
node is 0, and child nodes increase sequentially.
+ // Currently, Parquet uses a similar design.
// 33: attributes (map)
// 34: key (string)
// 35: value (string)
@@ -1168,10 +1145,8 @@ TEST_F(IcebergReaderCreateColumnIdsTest,
test_create_column_ids_6) {
std::vector<ColumnAccessPathConfig> access_configs;
{
- // 配置complex_attributes列的访问路径
ColumnAccessPathConfig access_config;
access_config.column_name = "complex_attributes";
- access_config.use_name_paths = true;
access_config.all_column_paths = {{"8", "*", "39", "43"},
{"8", "*", "40", "*", "50", "*",
"55"},
@@ -1184,27 +1159,25 @@ TEST_F(IcebergReaderCreateColumnIdsTest,
test_create_column_ids_6) {
{
std::vector<std::string> table_column_names = {"name",
"complex_attributes"};
// parquet values should access keys
- // column_ids should contain all necessary column IDs (set
automatically deduplicates)
std::set<uint64_t> expected_column_ids = {2, 36, 37, 38, 39, 40, 44,
45, 48, 49, 52, 53,
54, 61, 62, 63, 64, 65, 66,
67, 68, 69, 70, 71,
72, 73, 74, 75, 76, 77, 79,
80, 82, 83};
std::set<uint64_t> expected_filter_column_ids = {36, 37, 38, 39, 40};
- run_parquet_test_with_method(table_column_names, access_configs,
expected_column_ids,
- expected_filter_column_ids);
+ run_parquet_test(table_column_names, access_configs,
expected_column_ids,
+ expected_filter_column_ids);
}
{
std::vector<std::string> table_column_names = {"name",
"complex_attributes"};
// orc values should access keys because need to deduplicate by keys
- // column_ids should contain all necessary column IDs (set
automatically deduplicates)
std::set<uint64_t> expected_column_ids = {2, 36, 37, 38, 39, 40, 44,
45, 48, 49, 52, 53,
54, 61, 62, 63, 64, 65, 66,
67, 68, 69, 70, 71,
72, 73, 74, 75, 76, 77, 79,
80, 82, 83};
std::set<uint64_t> expected_filter_column_ids = {36, 37, 38, 39, 40};
- run_orc_test_with_method(table_column_names, access_configs,
expected_column_ids,
- expected_filter_column_ids);
+ run_orc_test(table_column_names, access_configs, expected_column_ids,
+ expected_filter_column_ids);
}
}
diff --git a/be/test/vec/exec/format/table/iceberg/iceberg_reader_test.cpp
b/be/test/vec/exec/format/table/iceberg/iceberg_reader_test.cpp
index 12d6bb0299f..b7dedcf834e 100644
--- a/be/test/vec/exec/format/table/iceberg/iceberg_reader_test.cpp
+++ b/be/test/vec/exec/format/table/iceberg/iceberg_reader_test.cpp
@@ -465,12 +465,10 @@ protected:
};
// Test reading real Iceberg Parquet file using IcebergTableReader
-TEST_F(IcebergReaderTest, ReadIcebergParquetFile) {
+TEST_F(IcebergReaderTest, read_iceberg_parquet_file) {
// Read only: name, profile.address.coordinates.lat,
profile.address.coordinates.lng, profile.contact.email
// Setup table descriptor for test columns with new schema:
/**
- Properties:
- iceberg.schema:
{"type":"struct","schema-id":0,"fields":[{"id":1,"name":"id","required":true,"type":"long"},{"id":2,"name":"name","required":true,"type":"string"},{"id":3,"name":"profile","required":true,"type":{"type":"struct","fields":[{"id":4,"name":"address","required":false,"type":{"type":"struct","fields":[{"id":7,"name":"street","required":false,"type":"string"},{"id":8,"name":"city","required":false,"type":"string"},{"id":9,"name":"coordinates","required":false,"type":{"type"
[...]
Schema:
message table {
required int64 id = 1;
@@ -603,12 +601,10 @@ TEST_F(IcebergReaderTest, ReadIcebergParquetFile) {
}
// Test reading real Iceberg Orc file using IcebergTableReader
-TEST_F(IcebergReaderTest, ReadIcebergOrcFile) {
+TEST_F(IcebergReaderTest, read_iceberg_orc_file) {
// Read only: name, profile.address.coordinates.lat,
profile.address.coordinates.lng, profile.contact.email
// Setup table descriptor for test columns with new schema:
/**
- Properties:
- iceberg.schema:
{"type":"struct","schema-id":0,"fields":[{"id":1,"name":"id","required":true,"type":"long"},{"id":2,"name":"name","required":true,"type":"string"},{"id":3,"name":"profile","required":true,"type":{"type":"struct","fields":[{"id":4,"name":"address","required":false,"type":{"type":"struct","fields":[{"id":7,"name":"street","required":false,"type":"string"},{"id":8,"name":"city","required":false,"type":"string"},{"id":9,"name":"coordinates","required":false,"type":{"type"
[...]
Schema:
message table {
required int64 id = 1;
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]