This is an automated email from the ASF dual-hosted git repository. huajianlan pushed a commit to branch nested_column_prune in repository https://gitbox.apache.org/repos/asf/doris.git
commit 6572e52de834374948f104d1f8501a8f0803a02e Author: kakachen <[email protected]> AuthorDate: Mon Nov 3 11:28:25 2025 +0800 Clean external table implementation's code. --- be/src/vec/exec/format/orc/vorc_reader.cpp | 134 ++++++------------ be/src/vec/exec/format/orc/vorc_reader.h | 6 +- .../exec/format/parquet/vparquet_column_reader.cpp | 67 +-------- .../exec/format/parquet/vparquet_column_reader.h | 4 - .../exec/format/parquet/vparquet_group_reader.cpp | 24 +--- .../hive/hive_reader_create_column_ids_test.cpp | 152 +++++++++------------ .../exec/format/table/hive/hive_reader_test.cpp | 4 +- .../iceberg_reader_create_column_ids_test.cpp | 113 ++++++--------- .../format/table/iceberg/iceberg_reader_test.cpp | 8 +- 9 files changed, 168 insertions(+), 344 deletions(-) diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp b/be/src/vec/exec/format/orc/vorc_reader.cpp index 11b75b820ca..62d56c0c8cd 100644 --- a/be/src/vec/exec/format/orc/vorc_reader.cpp +++ b/be/src/vec/exec/format/orc/vorc_reader.cpp @@ -406,9 +406,6 @@ Status OrcReader::_init_read_columns() { SCOPED_RAW_TIMER(&_statistics.init_column_time); const auto& root_type = _reader->getType(); - // Save file root type for later use in type conversion - _file_root_type = &root_type; - // Build column ID to file type mapping for all columns in file _column_id_to_file_type.clear(); std::function<void(const orc::Type*, int)> build_column_id_map = [&](const orc::Type* type, @@ -1191,7 +1188,7 @@ Status OrcReader::set_fill_columns( if (!_column_ids.empty()) { std::list<uint64_t> column_ids_list(_column_ids.begin(), _column_ids.end()); _row_reader_options.includeTypes(column_ids_list); - } else { + } else { // If column_ids is empty, include all top-level columns to be read. 
_row_reader_options.include(_read_file_cols); } _row_reader_options.setEnableLazyDecoding(true); @@ -1281,35 +1278,6 @@ Status OrcReader::set_fill_columns( } } - // Helper function to recursively print ORC type structure - std::function<void(const orc::Type*, int, const std::string&)> print_orc_type_tree = - [&](const orc::Type* type, int indent, const std::string& name) { - if (type) { - std::string indent_str(indent * 2, ' '); - VLOG_DEBUG << indent_str << "- " << name - << " (type_id=" << type->getColumnId() - << ", kind=" << static_cast<int>(type->getKind()) << ")"; - } - - if (type->getKind() == orc::TypeKind::STRUCT) { - for (uint64_t i = 0; i < type->getSubtypeCount(); ++i) { - std::string field_name = type->getFieldName(i); - print_orc_type_tree(type->getSubtype(i), indent + 1, field_name); - } - } else if (type->getKind() == orc::TypeKind::LIST) { - print_orc_type_tree(type->getSubtype(0), indent + 1, "element"); - } else if (type->getKind() == orc::TypeKind::MAP) { - print_orc_type_tree(type->getSubtype(0), indent + 1, "key"); - print_orc_type_tree(type->getSubtype(1), indent + 1, "value"); - } else { - // Primitive type, no children - } - }; - - // VLOG_DEBUG << "\n[OrcReader] ========== ORC Type Structure =========="; - // print_orc_type_tree(&selected_type, 0, "root"); - // VLOG_DEBUG << "[OrcReader] ==========================================\n"; - _type_map.clear(); if (_is_acid) { for (uint64_t i = 0; i < selected_type.getSubtypeCount(); ++i) { @@ -1429,11 +1397,6 @@ Status OrcReader::_fill_missing_columns( block->erase(result_column_id); } } - - // // no default column, fill with null - // auto mutable_column = block->get_by_name(kv.first).column->assume_mutable(); - // auto* nullable_column = static_cast<vectorized::ColumnNullable*>(mutable_column.get()); - // nullable_column->insert_many_defaults(rows); } return Status::OK(); } @@ -1542,22 +1505,22 @@ DataTypePtr OrcReader::convert_to_doris_type(const orc::Type* orc_type) { orc_type->getSubtype(1) 
== nullptr) { // Try to find the complete type from _column_id_to_file_type uint64_t column_id = orc_type->getColumnId(); - VLOG(1) << "[OrcReader] Detected incomplete MAP type: column_id=" << column_id - << ", subtype_count=" << orc_type->getSubtypeCount() << ", subtype(0)=" - << (orc_type->getSubtypeCount() > 0 && orc_type->getSubtype(0) != nullptr - ? "not null" - : "null") - << ", subtype(1)=" - << (orc_type->getSubtypeCount() > 1 && orc_type->getSubtype(1) != nullptr - ? "not null" - : "null"); + VLOG_DEBUG << "[OrcReader] Detected incomplete MAP type: column_id=" << column_id + << ", subtype_count=" << orc_type->getSubtypeCount() << ", subtype(0)=" + << (orc_type->getSubtypeCount() > 0 && orc_type->getSubtype(0) != nullptr + ? "not null" + : "null") + << ", subtype(1)=" + << (orc_type->getSubtypeCount() > 1 && orc_type->getSubtype(1) != nullptr + ? "not null" + : "null"); auto it = _column_id_to_file_type.find(column_id); if (it != _column_id_to_file_type.end() && it->second != nullptr) { const orc::Type* complete_type = it->second; - VLOG(1) << "[OrcReader] Found complete type in mapping: column_id=" << column_id - << ", complete_type_kind=" << static_cast<int>(complete_type->getKind()) - << ", complete_subtype_count=" << complete_type->getSubtypeCount(); + VLOG_DEBUG << "[OrcReader] Found complete type in mapping: column_id=" << column_id + << ", complete_type_kind=" << static_cast<int>(complete_type->getKind()) + << ", complete_subtype_count=" << complete_type->getSubtypeCount(); // Print subtypes information for (uint64_t i = 0; i < complete_type->getSubtypeCount(); ++i) { @@ -1573,7 +1536,8 @@ DataTypePtr OrcReader::convert_to_doris_type(const orc::Type* orc_type) { if (complete_type->getKind() == orc::TypeKind::MAP && complete_type->getSubtypeCount() == 2) { - VLOG(1) << "[OrcReader] Using complete MAP type from file schema for column_id=" + VLOG_DEBUG + << "[OrcReader] Using complete MAP type from file schema for column_id=" << column_id; // Get 
subtypes with extra validation @@ -1925,18 +1889,18 @@ Status OrcReader::_fill_doris_data_column(const std::string& col_name, const orc::Type* orc_key_type = orc_column_type->getSubtype(0); const orc::Type* orc_value_type = orc_column_type->getSubtype(1); - VLOG(1) << "[OrcReader] MAP column '" << col_name - << "': orc_key_type=" << (orc_key_type != nullptr ? "not null" : "NULL") - << ", orc_value_type=" << (orc_value_type != nullptr ? "not null" : "NULL") - << ", element_size=" << element_size; + VLOG_DEBUG << "[OrcReader] MAP column '" << col_name + << "': orc_key_type=" << (orc_key_type != nullptr ? "not null" : "NULL") + << ", orc_value_type=" << (orc_value_type != nullptr ? "not null" : "NULL") + << ", element_size=" << element_size; // Handle incomplete MAP type - if key or value type is nullptr, try to recover from mapping bool key_is_missing = (orc_key_type == nullptr); bool value_is_missing = (orc_value_type == nullptr); if (key_is_missing || value_is_missing) { - VLOG(1) << "[OrcReader] Detected incomplete MAP subtypes for column '" << col_name - << "', attempting to recover from mapping..."; + VLOG_DEBUG << "[OrcReader] Detected incomplete MAP subtypes for column '" << col_name + << "', attempting to recover from mapping..."; uint64_t column_id = orc_column_type->getColumnId(); auto it = _column_id_to_file_type.find(column_id); @@ -1948,15 +1912,16 @@ Status OrcReader::_fill_doris_data_column(const std::string& col_name, orc_key_type = complete_map_type->getSubtype(0); if (orc_key_type != nullptr) { // key_is_missing = false; - VLOG(1) << "[OrcReader] Recovered key type from mapping for column '" - << col_name << "'"; + VLOG_DEBUG << "[OrcReader] Recovered key type from mapping for column '" + << col_name << "'"; } } if (value_is_missing) { orc_value_type = complete_map_type->getSubtype(1); if (orc_value_type != nullptr) { // value_is_missing = false; - VLOG(1) << "[OrcReader] Recovered value type from mapping for column '" + VLOG_DEBUG + << "[OrcReader] 
Recovered value type from mapping for column '" << col_name << "'"; } } @@ -1971,9 +1936,6 @@ Status OrcReader::_fill_doris_data_column(const std::string& col_name, // Handle key column: if still missing, fill with default values if (key_is_missing) { - LOG(INFO) << "[OrcReader] Key type is missing for MAP column '" << col_name - << "', filling with default values (element_size=" << element_size << ")"; - // Fill key column with default values (nulls or empty values) auto mutable_key_column = doris_key_column->assume_mutable(); if (mutable_key_column->is_nullable()) { @@ -1991,9 +1953,6 @@ Status OrcReader::_fill_doris_data_column(const std::string& col_name, // Handle value column: if still missing, fill with default values if (value_is_missing) { - LOG(INFO) << "[OrcReader] Value type is missing for MAP column '" << col_name - << "', filling with default values (element_size=" << element_size << ")"; - // Fill value column with default values (nulls or empty values) auto mutable_value_column = doris_value_column->assume_mutable(); if (mutable_value_column->is_nullable()) { @@ -2072,7 +2031,6 @@ Status OrcReader::_fill_doris_data_column(const std::string& col_name, for (auto read_field : read_fields) { orc::ColumnVectorBatch* orc_field = orc_struct->fields[read_field.second]; - const orc::Type* orc_type = orc_column_type->getSubtype(read_field.second); std::string field_name = col_name + "." 
+ orc_column_type->getFieldName(read_field.second); @@ -2099,7 +2057,6 @@ Status OrcReader::_orc_column_to_doris_column( const orc::ColumnVectorBatch* cvb, size_t num_values) { DataTypePtr resolved_type; ColumnPtr resolved_column; - converter::ColumnTypeConverter* converter_ptr = nullptr; MutableColumnPtr data_column; if (orc_column_type != nullptr) { auto src_type = convert_to_doris_type(orc_column_type); @@ -2129,9 +2086,9 @@ Status OrcReader::_orc_column_to_doris_column( // reuse the cached converter _converters[converter_key] = std::move(converter); } - converter_ptr = _converters[converter_key].get(); - resolved_column = converter_ptr->get_column(src_type, doris_column, data_type); - resolved_type = converter_ptr->get_type(); + converter::ColumnTypeConverter* converter = _converters[converter_key].get(); + resolved_column = converter->get_column(src_type, doris_column, data_type); + resolved_type = converter->get_type(); if (resolved_column->is_nullable()) { SCOPED_RAW_TIMER(&_statistics.decode_null_map_time); @@ -2158,17 +2115,20 @@ Status OrcReader::_orc_column_to_doris_column( data_column = resolved_column->assume_mutable(); } - RETURN_IF_ERROR((_fill_doris_data_column<is_filter>( + RETURN_IF_ERROR(_fill_doris_data_column<is_filter>( col_name, data_column, remove_nullable(resolved_type), root_node, orc_column_type, - cvb, num_values))); - // if (converter_ptr != nullptr) { + cvb, num_values)); // resolve schema change auto converted_column = doris_column->assume_mutable(); - return converter_ptr->convert(resolved_column, converted_column); - // } + return converter->convert(resolved_column, converted_column); } else { - reinterpret_cast<ColumnNullable*>(doris_column->assume_mutable().get()) - ->insert_many_defaults(num_values); + auto mutable_column = doris_column->assume_mutable(); + if (mutable_column->is_nullable()) { + auto* nullable_column = static_cast<ColumnNullable*>(mutable_column.get()); + nullable_column->insert_many_defaults(num_values); + } 
else { + mutable_column->insert_many_defaults(num_values); + } } return Status::OK(); @@ -2272,10 +2232,10 @@ Status OrcReader::_get_next_block_impl(Block* block, size_t* read_rows, bool* eo if (orc_col_idx == _colname_to_idx.end()) { return Status::InternalError("Wrong read column '{}' in orc file", col_name); } - RETURN_IF_ERROR((_orc_column_to_doris_column<true>( + RETURN_IF_ERROR(_orc_column_to_doris_column<true>( col_name, column_ptr, column_type, _table_info_node_ptr->get_children_node(col_name), _type_map[file_column_name], - batch_vec[orc_col_idx->second], _batch->numElements))); + batch_vec[orc_col_idx->second], _batch->numElements)); } RETURN_IF_ERROR(_fill_partition_columns(block, _batch->numElements, @@ -2330,12 +2290,6 @@ Status OrcReader::_get_next_block_impl(Block* block, size_t* read_rows, bool* eo } } - //std::cout << "[OrcReader] _column_ids: "; - //for (const auto& cid : _column_ids) { - // std::cout << cid << " "; - //} - //std::cout << std::endl; - if (!_dict_cols_has_converted && !_dict_filter_cols.empty()) { for (auto& dict_filter_cols : _dict_filter_cols) { MutableColumnPtr dict_col_ptr = ColumnInt32::create(); @@ -2369,10 +2323,10 @@ Status OrcReader::_get_next_block_impl(Block* block, size_t* read_rows, bool* eo if (orc_col_idx == _colname_to_idx.end()) { return Status::InternalError("Wrong read column '{}' in orc file", col_name); } - RETURN_IF_ERROR((_orc_column_to_doris_column<false>( + RETURN_IF_ERROR(_orc_column_to_doris_column<false>( col_name, column_ptr, column_type, _table_info_node_ptr->get_children_node(col_name), _type_map[file_column_name], - batch_vec[orc_col_idx->second], _batch->numElements))); + batch_vec[orc_col_idx->second], _batch->numElements)); } RETURN_IF_ERROR(_fill_partition_columns(block, _batch->numElements, @@ -2535,10 +2489,10 @@ Status OrcReader::filter(orc::ColumnVectorBatch& data, uint16_t* sel, uint16_t s if (orc_col_idx == _colname_to_idx.end()) { return Status::InternalError("Wrong read column '{}' in orc 
file", table_col_name); } - RETURN_IF_ERROR((_orc_column_to_doris_column<false>( + RETURN_IF_ERROR(_orc_column_to_doris_column<false>( table_col_name, column_ptr, column_type, _table_info_node_ptr->get_children_node(table_col_name), - _type_map[file_column_name], batch_vec[orc_col_idx->second], data.numElements))); + _type_map[file_column_name], batch_vec[orc_col_idx->second], data.numElements)); } RETURN_IF_ERROR( _fill_partition_columns(block, size, _lazy_read_ctx.predicate_partition_columns)); diff --git a/be/src/vec/exec/format/orc/vorc_reader.h b/be/src/vec/exec/format/orc/vorc_reader.h index 341e40a9ce2..d1313bc4639 100644 --- a/be/src/vec/exec/format/orc/vorc_reader.h +++ b/be/src/vec/exec/format/orc/vorc_reader.h @@ -639,7 +639,6 @@ private: size_t _batch_size; int64_t _range_start_offset; int64_t _range_size; - // const std::string& _ctz; std::string _ctz; int32_t _offset_days = 0; @@ -663,11 +662,8 @@ private: // file column name to orc type std::unordered_map<std::string, const orc::Type*> _type_map; - // Optimized type conversion support - // column ID to file original type mapping for handling partial column selection + // Column ID to file original type mapping for handling incomplete MAP type due to column pruning. 
std::unordered_map<uint64_t, const orc::Type*> _column_id_to_file_type; - // Keep reference to file root type for complete type information - const orc::Type* _file_root_type = nullptr; std::unique_ptr<ORCFileInputStream> _file_input_stream; Statistics _statistics; diff --git a/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp index a7455455198..589570c7ab0 100644 --- a/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp @@ -111,23 +111,6 @@ Status ParquetColumnReader::create(io::FileReaderSPtr file, FieldSchema* field, size_t max_buf_size, const tparquet::OffsetIndex* offset_index, const std::set<uint64_t>& column_ids, const std::set<uint64_t>& filter_column_ids) { - // if (!column_ids.empty()) { - // uint64_t field_column_id = field->get_column_id(); - // VLOG_DEBUG << "[ParquetReader] Checking column_id: " << field_column_id - // << ", column_ids.size(): " << column_ids.size(); - // if (column_ids.find(field_column_id) == column_ids.end()) { - // VLOG_DEBUG << "[ParquetReader] Setting skip_reading=true for column_id: " - // << field_column_id; - // auto skip_reader = std::make_unique<SkipReadingReader>(row_ranges, ctz, io_ctx, field); - // skip_reader->_filter_column_ids = filter_column_ids; - // reader = std::move(skip_reader); - // return Status::OK(); - // } else { - // VLOG_DEBUG << "[ParquetReader] Column_id " << field_column_id - // << " found in column_ids, skip_reading=false"; - // } - // } - if (field->data_type->get_primitive_type() == TYPE_ARRAY) { std::unique_ptr<ParquetColumnReader> element_reader; RETURN_IF_ERROR(create(file, &field->children[0], row_group, row_ranges, ctz, io_ctx, @@ -177,7 +160,7 @@ Status ParquetColumnReader::create(io::FileReaderSPtr file, FieldSchema* field, child_readers.reserve(field->children.size()); int non_skip_reader_idx = -1; for (int i = 0; i < field->children.size(); ++i) { - auto& 
child = field->children[i]; // 去掉 const,保证 &child 类型为 FieldSchema* + auto& child = field->children[i]; std::unique_ptr<ParquetColumnReader> child_reader; if (column_ids.empty() || column_ids.find(child.get_column_id()) != column_ids.end()) { RETURN_IF_ERROR(create(file, &child, row_group, row_ranges, ctz, io_ctx, @@ -185,32 +168,25 @@ Status ParquetColumnReader::create(io::FileReaderSPtr file, FieldSchema* field, filter_column_ids)); child_reader->set_nested_column(); child_readers[child.name] = std::move(child_reader); - // 记录第一个非 SkippingReader + // Record the first non-SkippingReader if (non_skip_reader_idx == -1) { non_skip_reader_idx = i; } } else { - // 不在 column_ids 的 child,创建 SkippingReader auto skip_reader = std::make_unique<SkipReadingReader>(row_ranges, ctz, io_ctx, &child); skip_reader->_filter_column_ids = filter_column_ids; child_readers[child.name] = std::move(skip_reader); } } - // 如果所有都是 SkipReadingReader,则强制第一个 child 调用 create + // If all children are SkipReadingReader, force the first child to call create if (non_skip_reader_idx == -1) { - // for (int i = 0; i < field->children.size(); ++i) { - // 只处理 SkipReadingReader(即刚才创建的) - // if (dynamic_cast<SkipReadingReader*>(child_readers[field->children[i].name].get()) != nullptr) { std::unique_ptr<ParquetColumnReader> child_reader; RETURN_IF_ERROR(create(file, &field->children[0], row_group, row_ranges, ctz, io_ctx, child_reader, max_buf_size, nullptr, column_ids, filter_column_ids)); child_reader->set_nested_column(); child_readers[field->children[0].name] = std::move(child_reader); - // break; - // } - // } } auto struct_reader = StructColumnReader::create_unique(row_ranges, ctz, io_ctx); RETURN_IF_ERROR(struct_reader->init(std::move(child_readers), field)); @@ -878,7 +854,6 @@ Status MapColumnReader::read_column_data( value_rows += loop_rows; } DCHECK_EQ(key_rows, value_rows); - // DCHECK_EQ(key_eof, value_eof); *read_rows = key_rows; *eof = key_eof; @@ -948,10 +923,6 @@ Status 
StructColumnReader::read_column_data( continue; } auto file_name = root_node->children_file_column_name(doris_name); - // if (_child_readers.find(file_name) == _child_readers.end()) { - // missing_column_idxs.push_back(i); - // continue; - // } // Check if this is a SkipReadingReader - we should skip it when choosing reference column // because SkipReadingReader doesn't know the actual data size in nested context @@ -978,13 +949,6 @@ Status StructColumnReader::read_column_data( batch_size, &field_rows, &field_eof, is_dict_filter)); *read_rows = field_rows; *eof = field_eof; - // Debug: print the first non-missing child column read info - //std::cout << "[ParquetReader] struct '" << _field_schema->name << "' first non-missing child='" - // << file_name << "' doris_name='" << doris_name << "' field_rows=" << field_rows - // << " field_eof=" << field_eof << " doris_field_size=" << doris_field->size() - // << " child_rep_levels=" << _child_readers[file_name]->get_rep_level().size() - // << " child_def_levels=" << _child_readers[file_name]->get_def_level().size() - // << std::endl; /* * Considering the issue in the `_read_nested_column` function where data may span across pages, leading * to missing definition and repetition levels, when filling the null_map of the struct later, it is @@ -1001,12 +965,6 @@ Status StructColumnReader::read_column_data( doris_field, doris_type, root_node->get_children_node(doris_name), filter_map, *read_rows - field_rows, &loop_rows, &field_eof, is_dict_filter)); - // Debug: print each loop iteration for non-first child columns - //std::cout << "[ParquetReader] struct '" << _field_schema->name << "' non-first child='" - // << file_name << "' doris_name='" << doris_name << "' loop_rows=" << loop_rows - // << " field_rows=" << field_rows << " target_rows=" << *read_rows - // << " field_eof=" << field_eof << " doris_field_size=" << doris_field->size() - // << std::endl; field_rows += loop_rows; } DCHECK_EQ(*read_rows, field_rows); @@ -1014,20 
+972,11 @@ Status StructColumnReader::read_column_data( } } - // After scanning children, print summary of missing/skip info - //std::cout << "[ParquetReader] struct '" << _field_schema->name << "' summary: not_missing_column_id=" - // << not_missing_column_id << " missing_columns_count=" << missing_column_idxs.size() - // << " skip_reading_count=" << skip_reading_column_idxs.size() << std::endl; - int64_t missing_column_sz = -1; if (not_missing_column_id == -1) { // All queried columns are missing in the file (e.g., all added after schema change) - // We need to pick a column from _field_schema that exists in the file to get RL/DL - //std::cout << "[ParquetReader] All queried columns missing for struct '" << _field_schema->name - // << "', searching for reference column in _field_schema" << std::endl; - - // Find a column from _field_schema children that exists in the file for RL/DL reference + // We need to pick a column from _field_schema children that exists in the file for RL/DL reference std::string reference_file_column_name; std::unique_ptr<ParquetColumnReader>* reference_reader = nullptr; @@ -1039,8 +988,6 @@ Status StructColumnReader::read_column_data( if (!is_skip_reader) { reference_file_column_name = child.name; reference_reader = &(it->second); - //std::cout << "[ParquetReader] Found reference column: " << reference_file_column_name - // << " for struct '" << _field_schema->name << "'" << std::endl; break; } } @@ -1087,10 +1034,6 @@ Status StructColumnReader::read_column_data( // Store this reference column name for get_rep_level/get_def_level to use _read_column_names.emplace_back(reference_file_column_name); - //std::cout << "[ParquetReader] Read " << field_rows << " rows from reference column '" - // << reference_file_column_name << "', temp_column.size=" << temp_column->size() - // << " for struct '" << _field_schema->name << "'" << std::endl; - missing_column_sz = temp_column->size() - not_missing_orig_column_size; } else { return 
Status::Corruption( @@ -1139,8 +1082,6 @@ Status StructColumnReader::read_column_data( } if (null_map_ptr != nullptr) { - //std::cout << "[ParquetReader] struct '" << _field_schema->name << "' before fill_struct_null_map: rep_levels=" - // << this->get_rep_level().size() << " def_levels=" << this->get_def_level().size() << std::endl; fill_struct_null_map(_field_schema, *null_map_ptr, this->get_rep_level(), this->get_def_level()); } diff --git a/be/src/vec/exec/format/parquet/vparquet_column_reader.h b/be/src/vec/exec/format/parquet/vparquet_column_reader.h index 7b37f2e0e6c..893b3bc784a 100644 --- a/be/src/vec/exec/format/parquet/vparquet_column_reader.h +++ b/be/src/vec/exec/format/parquet/vparquet_column_reader.h @@ -412,8 +412,6 @@ public: << _field_schema->name << ". This indicates the SkipReadingReader was incorrectly used as a reference " "column."; - // static std::vector<level_t> empty_levels; - // return empty_levels; __builtin_unreachable(); } @@ -422,8 +420,6 @@ public: << _field_schema->name << ". 
This indicates the SkipReadingReader was incorrectly used as a reference " "column."; - // static std::vector<level_t> empty_levels; - // return empty_levels; __builtin_unreachable(); } diff --git a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp index 19ddb8d1b99..947dcee3408 100644 --- a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp @@ -98,13 +98,7 @@ RowGroupReader::RowGroupReader(io::FileReaderSPtr file_reader, _state(state), _obj_pool(new ObjectPool()), _column_ids(column_ids), - _filter_column_ids(filter_column_ids) { - //std::cout << "[RowGroupReader] _column_ids: "; - //for (const auto& col_id : _column_ids) { - // std::cout << col_id << " "; - //} - //std::cout << std::endl; -} + _filter_column_ids(filter_column_ids) {} RowGroupReader::~RowGroupReader() { _column_readers.clear(); @@ -438,14 +432,13 @@ Status RowGroupReader::_read_column_data(Block* block, RETURN_IF_ERROR(_column_readers[read_col_name]->read_column_data( column_ptr, column_type, _table_info_node_ptr->get_children_node(read_col_name), filter_map, batch_size - col_read_rows, &loop_rows, &col_eof, is_dict_filter)); - // Debug: print loop_rows for this iteration - //std::cout << "[RowGroupReader] column '" << read_col_name << "' loop_rows=" << loop_rows - // << " col_read_rows_so_far=" << col_read_rows << std::endl; + VLOG_DEBUG << "[RowGroupReader] column '" << read_col_name + << "' loop_rows=" << loop_rows << " col_read_rows_so_far=" << col_read_rows + << std::endl; col_read_rows += loop_rows; } - // Debug: print rows read for this column - //std::cout << "[RowGroupReader] column '" << read_col_name << "' read_rows=" << col_read_rows - // << std::endl; + VLOG_DEBUG << "[RowGroupReader] column '" << read_col_name + << "' read_rows=" << col_read_rows << std::endl; if (batch_read_rows > 0 && batch_read_rows != col_read_rows) { LOG(WARNING) << 
"[RowGroupReader] Mismatched read rows among parquet columns. " "previous_batch_read_rows=" @@ -741,10 +734,6 @@ Status RowGroupReader::_fill_missing_columns( block->erase(result_column_id); } } - - // auto mutable_column = block->get_by_name(kv.first).column->assume_mutable(); - // auto* nullable_column = assert_cast<vectorized::ColumnNullable*>(mutable_column.get()); - // nullable_column->insert_many_defaults(rows); } return Status::OK(); } @@ -1135,7 +1124,6 @@ ParquetColumnReader::Statistics RowGroupReader::statistics() { } return st; } - #include "common/compile_check_end.h" } // namespace doris::vectorized diff --git a/be/test/vec/exec/format/table/hive/hive_reader_create_column_ids_test.cpp b/be/test/vec/exec/format/table/hive/hive_reader_create_column_ids_test.cpp index d46acbdc134..eea63192857 100644 --- a/be/test/vec/exec/format/table/hive/hive_reader_create_column_ids_test.cpp +++ b/be/test/vec/exec/format/table/hive/hive_reader_create_column_ids_test.cpp @@ -60,7 +60,6 @@ struct ColumnAccessPathConfig { std::string column_name; std::vector<std::vector<std::string>> all_column_paths; // For all_column_access_paths std::vector<std::vector<std::string>> predicate_paths; // For predicate_column_access_paths - bool use_name_paths = true; // true=use field names, false=use indices }; // ORC column IDs are assigned in a tree-increment order, root is 0 and children increment sequentially. 
@@ -738,12 +737,11 @@ protected: } // Helper function: Run Parquet test with different column ID extraction methods - void run_parquet_test_with_method(const std::vector<std::string>& table_column_names, - const std::vector<ColumnAccessPathConfig>& access_configs, - const std::set<uint64_t>& expected_column_ids, - const std::set<uint64_t>& expected_filter_column_ids, - bool use_top_level_method = false, - bool should_skip_assertion = false) { + void run_parquet_test(const std::vector<std::string>& table_column_names, + const std::vector<ColumnAccessPathConfig>& access_configs, + const std::set<uint64_t>& expected_column_ids, + const std::set<uint64_t>& expected_filter_column_ids, + bool use_top_level_method = false, bool should_skip_assertion = false) { std::string test_file = "./be/test/exec/test_data/nested_user_profiles_parquet/" "part-00000-64a7a390-1a03-4efc-ab51-557e9369a1f9-c000.snappy.parquet"; @@ -809,12 +807,11 @@ protected: } // Helper function: Run ORC test with different column ID extraction methods - void run_orc_test_with_method(const std::vector<std::string>& table_column_names, - const std::vector<ColumnAccessPathConfig>& access_configs, - const std::set<uint64_t>& expected_column_ids, - const std::set<uint64_t>& expected_filter_column_ids, - bool use_top_level_method = false, - bool should_skip_assertion = false) { + void run_orc_test(const std::vector<std::string>& table_column_names, + const std::vector<ColumnAccessPathConfig>& access_configs, + const std::set<uint64_t>& expected_column_ids, + const std::set<uint64_t>& expected_filter_column_ids, + bool use_top_level_method = false, bool should_skip_assertion = false) { std::string test_file = "./be/test/exec/test_data/nested_user_profiles_orc/" "part-00000-62614f23-05d1-4043-a533-b155ef52b720-c000.snappy.orc"; @@ -881,12 +878,8 @@ protected: }; TEST_F(HiveReaderCreateColumnIdsTest, test_create_column_ids_1) { - // Properties: - - // Configure access paths for profile column 
ColumnAccessPathConfig access_config; access_config.column_name = "profile"; - access_config.use_name_paths = true; access_config.all_column_paths = {{"profile", "address", "coordinates", "lat"}, {"profile", "address", "coordinates", "lng"}, @@ -900,21 +893,20 @@ TEST_F(HiveReaderCreateColumnIdsTest, test_create_column_ids_1) { std::set<uint64_t> expected_column_ids = {2, 3, 4, 7, 8, 9, 10, 11, 15, 16, 18}; std::set<uint64_t> expected_filter_column_ids = {3, 4, 7, 8, 10, 11}; - run_parquet_test_with_method(table_column_names, {access_config}, expected_column_ids, - expected_filter_column_ids); - run_parquet_test_with_method(table_column_names, {access_config}, expected_column_ids, - expected_filter_column_ids, true); + run_parquet_test(table_column_names, {access_config}, expected_column_ids, + expected_filter_column_ids); + run_parquet_test(table_column_names, {access_config}, expected_column_ids, + expected_filter_column_ids, true); - run_orc_test_with_method(table_column_names, {access_config}, expected_column_ids, - expected_filter_column_ids); - run_orc_test_with_method(table_column_names, {access_config}, expected_column_ids, - expected_filter_column_ids, true); + run_orc_test(table_column_names, {access_config}, expected_column_ids, + expected_filter_column_ids); + run_orc_test(table_column_names, {access_config}, expected_column_ids, + expected_filter_column_ids, true); } TEST_F(HiveReaderCreateColumnIdsTest, test_create_column_ids_2) { - // Properties: - // iceberg.schema: {"type":"struct","schema-id":0,"fields":[{"id":1,"name":"id","required":true,"type":"long"},{"id":2,"name":"name","required":true,"type":"string"},{"id":3,"name":"profile","required":true,"type":{"type":"struct","fields":[{"id":4,"name":"address","required":false,"type":{"type":"struct","fields":[{"id":7,"name":"street","required":false,"type":"string"},{"id":8,"name":"city","required":false,"type":"string"},{"id":9,"name":"coordinates","required":false,"type": [...] 
- // ORC column IDs are assigned in a tree-increment order, root is 0 and children increment sequentially. + // ORC column IDs are assigned in a tree-like incremental manner: the root node is 0, and child nodes increase sequentially. + // Currently, Parquet uses a similar design. // 0: struct (table/root) // 1: id (int64) // 2: name (string) @@ -935,36 +927,32 @@ TEST_F(HiveReaderCreateColumnIdsTest, test_create_column_ids_2) { // 17: name (string) // 18: level (int32) - // 配置profile列的访问路径 ColumnAccessPathConfig access_config; access_config.column_name = "profile"; - access_config.use_name_paths = true; access_config.all_column_paths = {{"profile"}}; access_config.predicate_paths = {{"profile", "address", "coordinates", "lat"}, {"profile", "contact", "email"}}; std::vector<std::string> table_column_names = {"name", "profile"}; - // column_ids should contain all necessary column IDs (set automatically deduplicates) std::set<uint64_t> expected_column_ids = {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}; std::set<uint64_t> expected_filter_column_ids = {3, 4, 7, 8, 10, 11}; - run_parquet_test_with_method(table_column_names, {access_config}, expected_column_ids, - expected_filter_column_ids); - run_parquet_test_with_method(table_column_names, {access_config}, expected_column_ids, - expected_filter_column_ids, true); + run_parquet_test(table_column_names, {access_config}, expected_column_ids, + expected_filter_column_ids); + run_parquet_test(table_column_names, {access_config}, expected_column_ids, + expected_filter_column_ids, true); - run_orc_test_with_method(table_column_names, {access_config}, expected_column_ids, - expected_filter_column_ids); - run_orc_test_with_method(table_column_names, {access_config}, expected_column_ids, - expected_filter_column_ids, true); + run_orc_test(table_column_names, {access_config}, expected_column_ids, + expected_filter_column_ids); + run_orc_test(table_column_names, {access_config}, expected_column_ids, + 
expected_filter_column_ids, true); } TEST_F(HiveReaderCreateColumnIdsTest, test_create_column_ids_3) { - // Properties: - // iceberg.schema: {"type":"struct","schema-id":0,"fields":[{"id":1,"name":"id","required":true,"type":"long"},{"id":2,"name":"name","required":true,"type":"string"},{"id":3,"name":"profile","required":true,"type":{"type":"struct","fields":[{"id":4,"name":"address","required":false,"type":{"type":"struct","fields":[{"id":7,"name":"street","required":false,"type":"string"},{"id":8,"name":"city","required":false,"type":"string"},{"id":9,"name":"coordinates","required":false,"type": [...] - // ORC column IDs are assigned in a tree-increment order, root is 0 and children increment sequentially. + // ORC column IDs are assigned in a tree-like incremental manner: the root node is 0, and child nodes increase sequentially. + // Currently, Parquet uses a similar design. // 0: struct (table/root) // 1: id (int64) // 2: name (string) @@ -985,58 +973,54 @@ TEST_F(HiveReaderCreateColumnIdsTest, test_create_column_ids_3) { // 17: name (string) // 18: level (int32) - // 配置profile列的访问路径 ColumnAccessPathConfig access_config; access_config.column_name = "profile"; - access_config.use_name_paths = true; access_config.all_column_paths = {{"profile", "contact"}, {"profile", "address"}}; access_config.predicate_paths = {{"profile", "address", "coordinates"}, {"profile", "contact", "email"}}; std::vector<std::string> table_column_names = {"name", "profile"}; - // column_ids should contain all necessary column IDs (set automatically deduplicates) std::set<uint64_t> expected_column_ids = {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}; std::set<uint64_t> expected_filter_column_ids = {3, 4, 7, 8, 9, 10, 11}; - run_parquet_test_with_method(table_column_names, {access_config}, expected_column_ids, - expected_filter_column_ids); - run_parquet_test_with_method(table_column_names, {access_config}, expected_column_ids, - expected_filter_column_ids, true); + 
run_parquet_test(table_column_names, {access_config}, expected_column_ids, + expected_filter_column_ids); + run_parquet_test(table_column_names, {access_config}, expected_column_ids, + expected_filter_column_ids, true); - run_orc_test_with_method(table_column_names, {access_config}, expected_column_ids, - expected_filter_column_ids); - run_orc_test_with_method(table_column_names, {access_config}, expected_column_ids, - expected_filter_column_ids, true); + run_orc_test(table_column_names, {access_config}, expected_column_ids, + expected_filter_column_ids); + run_orc_test(table_column_names, {access_config}, expected_column_ids, + expected_filter_column_ids, true); } TEST_F(HiveReaderCreateColumnIdsTest, test_create_column_ids_4) { - // 配置profile列的访问路径 ColumnAccessPathConfig access_config; access_config.column_name = "profile"; - access_config.use_name_paths = true; access_config.all_column_paths = {}; access_config.predicate_paths = {}; std::vector<std::string> table_column_names = {"name", "profile"}; - // column_ids should contain all necessary column IDs (set automatically deduplicates) std::set<uint64_t> expected_column_ids = {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}; std::set<uint64_t> expected_filter_column_ids = {}; - run_parquet_test_with_method(table_column_names, {access_config}, expected_column_ids, - expected_filter_column_ids); - run_parquet_test_with_method(table_column_names, {access_config}, expected_column_ids, - expected_filter_column_ids, true); + run_parquet_test(table_column_names, {access_config}, expected_column_ids, + expected_filter_column_ids); + run_parquet_test(table_column_names, {access_config}, expected_column_ids, + expected_filter_column_ids, true); - run_orc_test_with_method(table_column_names, {access_config}, expected_column_ids, - expected_filter_column_ids); - run_orc_test_with_method(table_column_names, {access_config}, expected_column_ids, - expected_filter_column_ids, true); + 
run_orc_test(table_column_names, {access_config}, expected_column_ids, + expected_filter_column_ids); + run_orc_test(table_column_names, {access_config}, expected_column_ids, + expected_filter_column_ids, true); } TEST_F(HiveReaderCreateColumnIdsTest, test_create_column_ids_5) { + // ORC column IDs are assigned in a tree-like incremental manner: the root node is 0, and child nodes increase sequentially. + // Currently, Parquet uses a similar design. // 19: tags (array) // 20: element (string) // 21: friends (array) @@ -1058,7 +1042,6 @@ TEST_F(HiveReaderCreateColumnIdsTest, test_create_column_ids_5) { // Configure access paths for friends column ColumnAccessPathConfig access_config; access_config.column_name = "friends"; - access_config.use_name_paths = true; access_config.all_column_paths = {{"friends", "*", "nickname"}, {"friends", "*", "friendship_level"}}; @@ -1072,7 +1055,6 @@ TEST_F(HiveReaderCreateColumnIdsTest, test_create_column_ids_5) { // Configure access paths for recent_activity column ColumnAccessPathConfig access_config; access_config.column_name = "recent_activity"; - access_config.use_name_paths = true; access_config.all_column_paths = {{"recent_activity", "*", "action"}, {"recent_activity", "*", "details", "*", "value"}}; @@ -1083,22 +1065,23 @@ TEST_F(HiveReaderCreateColumnIdsTest, test_create_column_ids_5) { } std::vector<std::string> table_column_names = {"name", "friends", "recent_activity"}; - // column_ids should contain all necessary column IDs (set automatically deduplicates) std::set<uint64_t> expected_column_ids = {2, 21, 22, 24, 25, 26, 27, 28, 29, 30, 32}; std::set<uint64_t> expected_filter_column_ids = {21, 22, 24, 26, 27, 28}; - run_parquet_test_with_method(table_column_names, access_configs, expected_column_ids, - expected_filter_column_ids); - run_parquet_test_with_method(table_column_names, access_configs, expected_column_ids, - expected_filter_column_ids, true); + run_parquet_test(table_column_names, access_configs, 
expected_column_ids, + expected_filter_column_ids); + run_parquet_test(table_column_names, access_configs, expected_column_ids, + expected_filter_column_ids, true); - run_orc_test_with_method(table_column_names, access_configs, expected_column_ids, - expected_filter_column_ids); - run_orc_test_with_method(table_column_names, access_configs, expected_column_ids, - expected_filter_column_ids, true); + run_orc_test(table_column_names, access_configs, expected_column_ids, + expected_filter_column_ids); + run_orc_test(table_column_names, access_configs, expected_column_ids, + expected_filter_column_ids, true); } TEST_F(HiveReaderCreateColumnIdsTest, test_create_column_ids_6) { + // ORC column IDs are assigned in a tree-like incremental manner: the root node is 0, and child nodes increase sequentially. + // Currently, Parquet uses a similar design. // 33: attributes (map) // 34: key (string) // 35: value (string) @@ -1165,7 +1148,6 @@ TEST_F(HiveReaderCreateColumnIdsTest, test_create_column_ids_6) { // Configure access paths for complex_attributes column ColumnAccessPathConfig access_config; access_config.column_name = "complex_attributes"; - access_config.use_name_paths = true; access_config.all_column_paths = { {"complex_attributes", "*", "metadata", "version"}, @@ -1183,30 +1165,28 @@ TEST_F(HiveReaderCreateColumnIdsTest, test_create_column_ids_6) { { std::vector<std::string> table_column_names = {"name", "complex_attributes"}; // parquet values should access keys - // column_ids should contain all necessary column IDs (set automatically deduplicates) std::set<uint64_t> expected_column_ids = {2, 36, 37, 38, 39, 40, 44, 45, 48, 49, 52, 53, 54, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 79, 80, 82, 83}; std::set<uint64_t> expected_filter_column_ids = {36, 37, 38, 39, 40}; - run_parquet_test_with_method(table_column_names, access_configs, expected_column_ids, - expected_filter_column_ids); - run_parquet_test_with_method(table_column_names, 
access_configs, expected_column_ids, - expected_filter_column_ids, true); + run_parquet_test(table_column_names, access_configs, expected_column_ids, + expected_filter_column_ids); + run_parquet_test(table_column_names, access_configs, expected_column_ids, + expected_filter_column_ids, true); } { std::vector<std::string> table_column_names = {"name", "complex_attributes"}; // orc values should access keys because need to deduplicate by keys - // column_ids should contain all necessary column IDs (set automatically deduplicates) std::set<uint64_t> expected_column_ids = {2, 36, 37, 38, 39, 40, 44, 45, 48, 49, 52, 53, 54, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 79, 80, 82, 83}; std::set<uint64_t> expected_filter_column_ids = {36, 37, 38, 39, 40}; - run_orc_test_with_method(table_column_names, access_configs, expected_column_ids, - expected_filter_column_ids); - run_orc_test_with_method(table_column_names, access_configs, expected_column_ids, - expected_filter_column_ids, true); + run_orc_test(table_column_names, access_configs, expected_column_ids, + expected_filter_column_ids); + run_orc_test(table_column_names, access_configs, expected_column_ids, + expected_filter_column_ids, true); } } diff --git a/be/test/vec/exec/format/table/hive/hive_reader_test.cpp b/be/test/vec/exec/format/table/hive/hive_reader_test.cpp index 16bde5c7bae..ca2cada932f 100644 --- a/be/test/vec/exec/format/table/hive/hive_reader_test.cpp +++ b/be/test/vec/exec/format/table/hive/hive_reader_test.cpp @@ -462,7 +462,7 @@ protected: }; // Test reading real Hive Parquet file using HiveTableReader -TEST_F(HiveReaderTest, ReadHiveParquetFile) { +TEST_F(HiveReaderTest, read_hive_parquet_file) { // Read only: name, profile.address.coordinates.lat, profile.address.coordinates.lng, profile.contact.email // Setup table descriptor for test columns with new schema: /** @@ -604,7 +604,7 @@ TEST_F(HiveReaderTest, ReadHiveParquetFile) { } // Test reading real Hive Orc file using 
HiveTableReader -TEST_F(HiveReaderTest, ReadHiveOrcFile) { +TEST_F(HiveReaderTest, read_hive_orc_file) { // Read only: name, profile.address.coordinates.lat, profile.address.coordinates.lng, profile.contact.email // Setup table descriptor for test columns with new schema: /** diff --git a/be/test/vec/exec/format/table/iceberg/iceberg_reader_create_column_ids_test.cpp b/be/test/vec/exec/format/table/iceberg/iceberg_reader_create_column_ids_test.cpp index b1152f159bd..d897f0e4762 100644 --- a/be/test/vec/exec/format/table/iceberg/iceberg_reader_create_column_ids_test.cpp +++ b/be/test/vec/exec/format/table/iceberg/iceberg_reader_create_column_ids_test.cpp @@ -60,10 +60,9 @@ struct ColumnAccessPathConfig { std::string column_name; std::vector<std::vector<std::string>> all_column_paths; // For all_column_access_paths std::vector<std::vector<std::string>> predicate_paths; // For predicate_column_access_paths - bool use_name_paths = true; // true=use field names, false=use indices }; -// Iceberg column ID assignment (based on Protobuf schema definition) +// Iceberg column ID assignment (based on Protobuf schema definition) // 1: id (int64) // 2: name (string) // 3: profile (struct) @@ -770,12 +769,11 @@ protected: } // Helper function: run Parquet test with different column ID extraction methods - void run_parquet_test_with_method(const std::vector<std::string>& table_column_names, - const std::vector<ColumnAccessPathConfig>& access_configs, - const std::set<uint64_t>& expected_column_ids, - const std::set<uint64_t>& expected_filter_column_ids, - bool use_top_level_method = false, - bool should_skip_assertion = false) { + void run_parquet_test(const std::vector<std::string>& table_column_names, + const std::vector<ColumnAccessPathConfig>& access_configs, + const std::set<uint64_t>& expected_column_ids, + const std::set<uint64_t>& expected_filter_column_ids, + bool use_top_level_method = false, bool should_skip_assertion = false) { std::string test_file =
"./be/test/exec/test_data/nested_user_profiles_iceberg_parquet/data/" "00000-9-a7e0135f-d581-40e4-8d56-a929aded99e4-0-00001.parquet"; @@ -841,12 +839,11 @@ protected: } // Helper function: run Orc test with different column ID extraction methods - void run_orc_test_with_method(const std::vector<std::string>& table_column_names, - const std::vector<ColumnAccessPathConfig>& access_configs, - const std::set<uint64_t>& expected_column_ids, - const std::set<uint64_t>& expected_filter_column_ids, - bool use_top_level_method = false, - bool should_skip_assertion = false) { + void run_orc_test(const std::vector<std::string>& table_column_names, + const std::vector<ColumnAccessPathConfig>& access_configs, + const std::set<uint64_t>& expected_column_ids, + const std::set<uint64_t>& expected_filter_column_ids, + bool use_top_level_method = false, bool should_skip_assertion = false) { std::string test_file = "./be/test/exec/test_data/nested_user_profiles_iceberg_orc/data/" "00000-8-5a144c37-16a4-47c6-96db-0007175b5c90-0-00001.orc"; @@ -913,12 +910,8 @@ protected: }; TEST_F(IcebergReaderCreateColumnIdsTest, test_create_column_ids_1) { - // Properties: - - // Configure access paths for the profile column ColumnAccessPathConfig access_config; access_config.column_name = "profile"; - access_config.use_name_paths = true; access_config.all_column_paths = {{"3", "9", "14", "15"}, {"3", "9", "14", "16"}, @@ -931,16 +924,15 @@ TEST_F(IcebergReaderCreateColumnIdsTest, test_create_column_ids_1) { std::set<uint64_t> expected_column_ids = {2, 3, 4, 7, 8, 9, 10, 11, 15, 16, 18}; std::set<uint64_t> expected_filter_column_ids = {3, 4, 7, 8, 10, 11}; - run_parquet_test_with_method(table_column_names, {access_config}, expected_column_ids, - expected_filter_column_ids); - run_orc_test_with_method(table_column_names, {access_config}, expected_column_ids, - expected_filter_column_ids); + run_parquet_test(table_column_names, {access_config}, expected_column_ids, + expected_filter_column_ids); + 
run_orc_test(table_column_names, {access_config}, expected_column_ids, + expected_filter_column_ids); } TEST_F(IcebergReaderCreateColumnIdsTest, test_create_column_ids_2) { - // Properties: - // iceberg.schema: {"type":"struct","schema-id":0,"fields":[{"id":1,"name":"id","required":true,"type":"long"},{"id":2,"name":"name","required":true,"type":"string"},{"id":3,"name":"profile","required":true,"type":{"type":"struct","fields":[{"id":4,"name":"address","required":false,"type":{"type":"struct","fields":[{"id":7,"name":"street","required":false,"type":"string"},{"id":8,"name":"city","required":false,"type":"string"},{"id":9,"name":"coordinates","required":false,"type": [...] - // ORC 列ID分配为树型递增,根节点为0,子节点依次递增。 + // ORC column IDs are assigned in a tree-like incremental manner: the root node is 0, and child nodes increase sequentially. + // Currently, Parquet uses a similar design. // 0: struct (table/root) // 1: id (int64) // 2: name (string) @@ -961,30 +953,26 @@ TEST_F(IcebergReaderCreateColumnIdsTest, test_create_column_ids_2) { // 17: name (string) // 18: level (int32) - // 配置profile列的访问路径 ColumnAccessPathConfig access_config; access_config.column_name = "profile"; - access_config.use_name_paths = true; access_config.all_column_paths = {{"3"}}; access_config.predicate_paths = {{"3", "9", "14", "15"}, {"3", "10", "17"}}; std::vector<std::string> table_column_names = {"name", "profile"}; - // column_ids should contain all necessary column IDs (set automatically deduplicates) std::set<uint64_t> expected_column_ids = {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}; std::set<uint64_t> expected_filter_column_ids = {3, 4, 7, 8, 10, 11}; - run_parquet_test_with_method(table_column_names, {access_config}, expected_column_ids, - expected_filter_column_ids); - run_orc_test_with_method(table_column_names, {access_config}, expected_column_ids, - expected_filter_column_ids); + run_parquet_test(table_column_names, {access_config}, expected_column_ids, + 
expected_filter_column_ids); + run_orc_test(table_column_names, {access_config}, expected_column_ids, + expected_filter_column_ids); } TEST_F(IcebergReaderCreateColumnIdsTest, test_create_column_ids_3) { - // Properties: - // iceberg.schema: {"type":"struct","schema-id":0,"fields":[{"id":1,"name":"id","required":true,"type":"long"},{"id":2,"name":"name","required":true,"type":"string"},{"id":3,"name":"profile","required":true,"type":{"type":"struct","fields":[{"id":4,"name":"address","required":false,"type":{"type":"struct","fields":[{"id":7,"name":"street","required":false,"type":"string"},{"id":8,"name":"city","required":false,"type":"string"},{"id":9,"name":"coordinates","required":false,"type": [...] - // ORC 列ID分配为树型递增,根节点为0,子节点依次递增。 + // ORC column IDs are assigned in a tree-like incremental manner: the root node is 0, and child nodes increase sequentially. + // Currently, Parquet uses a similar design. // 0: struct (table/root) // 1: id (int64) // 2: name (string) @@ -1005,51 +993,43 @@ TEST_F(IcebergReaderCreateColumnIdsTest, test_create_column_ids_3) { // 17: name (string) // 18: level (int32) - // 配置profile列的访问路径 ColumnAccessPathConfig access_config; access_config.column_name = "profile"; - access_config.use_name_paths = true; access_config.all_column_paths = {{"3", "10"}, {"3", "9"}}; access_config.predicate_paths = {{"3", "9", "14"}, {"3", "10", "17"}}; - // access_config.all_column_paths = {{"profile", "contact"}, {"profile", "address"}}; - // access_config.predicate_paths = {{"profile", "address", "coordinates"}, - // {"profile", "contact", "email"}}; - std::vector<std::string> table_column_names = {"name", "profile"}; - // column_ids should contain all necessary column IDs (set automatically deduplicates) std::set<uint64_t> expected_column_ids = {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}; std::set<uint64_t> expected_filter_column_ids = {3, 4, 7, 8, 9, 10, 11}; - run_parquet_test_with_method(table_column_names, {access_config}, expected_column_ids, 
- expected_filter_column_ids); - run_orc_test_with_method(table_column_names, {access_config}, expected_column_ids, - expected_filter_column_ids); + run_parquet_test(table_column_names, {access_config}, expected_column_ids, + expected_filter_column_ids); + run_orc_test(table_column_names, {access_config}, expected_column_ids, + expected_filter_column_ids); } TEST_F(IcebergReaderCreateColumnIdsTest, test_create_column_ids_4) { - // 配置profile列的访问路径 ColumnAccessPathConfig access_config; access_config.column_name = "profile"; - access_config.use_name_paths = true; access_config.all_column_paths = {}; access_config.predicate_paths = {}; std::vector<std::string> table_column_names = {"name", "profile"}; - // column_ids should contain all necessary column IDs (set automatically deduplicates) std::set<uint64_t> expected_column_ids = {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}; std::set<uint64_t> expected_filter_column_ids = {}; - run_parquet_test_with_method(table_column_names, {access_config}, expected_column_ids, - expected_filter_column_ids); - run_orc_test_with_method(table_column_names, {access_config}, expected_column_ids, - expected_filter_column_ids); + run_parquet_test(table_column_names, {access_config}, expected_column_ids, + expected_filter_column_ids); + run_orc_test(table_column_names, {access_config}, expected_column_ids, + expected_filter_column_ids); } TEST_F(IcebergReaderCreateColumnIdsTest, test_create_column_ids_5) { + // ORC column IDs are assigned in a tree-like incremental manner: the root node is 0, and child nodes increase sequentially. + // Currently, Parquet uses a similar design. 
// 19: tags (array) // 20: element (string) // 21: friends (array) @@ -1068,10 +1048,8 @@ TEST_F(IcebergReaderCreateColumnIdsTest, test_create_column_ids_5) { std::vector<ColumnAccessPathConfig> access_configs; { - // 配置friends列的访问路径 ColumnAccessPathConfig access_config; access_config.column_name = "friends"; - access_config.use_name_paths = true; access_config.all_column_paths = {{"5", "*", "27"}, {"5", "*", "28"}}; access_config.predicate_paths = { @@ -1081,10 +1059,8 @@ TEST_F(IcebergReaderCreateColumnIdsTest, test_create_column_ids_5) { } { - // 配置recent_activity列的访问路径 ColumnAccessPathConfig access_config; access_config.column_name = "recent_activity"; - access_config.use_name_paths = true; access_config.all_column_paths = {{"6", "*", "30"}, {"6", "*", "31", "*", "34"}}; access_config.predicate_paths = { @@ -1094,17 +1070,18 @@ TEST_F(IcebergReaderCreateColumnIdsTest, test_create_column_ids_5) { } std::vector<std::string> table_column_names = {"name", "friends", "recent_activity"}; - // column_ids should contain all necessary column IDs (set automatically deduplicates) std::set<uint64_t> expected_column_ids = {2, 21, 22, 24, 25, 26, 27, 28, 29, 30, 32}; std::set<uint64_t> expected_filter_column_ids = {21, 22, 24, 26, 27, 28}; - run_parquet_test_with_method(table_column_names, access_configs, expected_column_ids, - expected_filter_column_ids); - run_orc_test_with_method(table_column_names, access_configs, expected_column_ids, - expected_filter_column_ids); + run_parquet_test(table_column_names, access_configs, expected_column_ids, + expected_filter_column_ids); + run_orc_test(table_column_names, access_configs, expected_column_ids, + expected_filter_column_ids); } TEST_F(IcebergReaderCreateColumnIdsTest, test_create_column_ids_6) { + // ORC column IDs are assigned in a tree-like incremental manner: the root node is 0, and child nodes increase sequentially. + // Currently, Parquet uses a similar design. 
// 33: attributes (map) // 34: key (string) // 35: value (string) @@ -1168,10 +1145,8 @@ TEST_F(IcebergReaderCreateColumnIdsTest, test_create_column_ids_6) { std::vector<ColumnAccessPathConfig> access_configs; { - // 配置complex_attributes列的访问路径 ColumnAccessPathConfig access_config; access_config.column_name = "complex_attributes"; - access_config.use_name_paths = true; access_config.all_column_paths = {{"8", "*", "39", "43"}, {"8", "*", "40", "*", "50", "*", "55"}, @@ -1184,27 +1159,25 @@ TEST_F(IcebergReaderCreateColumnIdsTest, test_create_column_ids_6) { { std::vector<std::string> table_column_names = {"name", "complex_attributes"}; // parquet values should access keys - // column_ids should contain all necessary column IDs (set automatically deduplicates) std::set<uint64_t> expected_column_ids = {2, 36, 37, 38, 39, 40, 44, 45, 48, 49, 52, 53, 54, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 79, 80, 82, 83}; std::set<uint64_t> expected_filter_column_ids = {36, 37, 38, 39, 40}; - run_parquet_test_with_method(table_column_names, access_configs, expected_column_ids, - expected_filter_column_ids); + run_parquet_test(table_column_names, access_configs, expected_column_ids, + expected_filter_column_ids); } { std::vector<std::string> table_column_names = {"name", "complex_attributes"}; // orc values should access keys because need to deduplicate by keys - // column_ids should contain all necessary column IDs (set automatically deduplicates) std::set<uint64_t> expected_column_ids = {2, 36, 37, 38, 39, 40, 44, 45, 48, 49, 52, 53, 54, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 79, 80, 82, 83}; std::set<uint64_t> expected_filter_column_ids = {36, 37, 38, 39, 40}; - run_orc_test_with_method(table_column_names, access_configs, expected_column_ids, - expected_filter_column_ids); + run_orc_test(table_column_names, access_configs, expected_column_ids, + expected_filter_column_ids); } } diff --git 
a/be/test/vec/exec/format/table/iceberg/iceberg_reader_test.cpp b/be/test/vec/exec/format/table/iceberg/iceberg_reader_test.cpp index 12d6bb0299f..b7dedcf834e 100644 --- a/be/test/vec/exec/format/table/iceberg/iceberg_reader_test.cpp +++ b/be/test/vec/exec/format/table/iceberg/iceberg_reader_test.cpp @@ -465,12 +465,10 @@ protected: }; // Test reading real Iceberg Parquet file using IcebergTableReader -TEST_F(IcebergReaderTest, ReadIcebergParquetFile) { +TEST_F(IcebergReaderTest, read_iceberg_parquet_file) { // Read only: name, profile.address.coordinates.lat, profile.address.coordinates.lng, profile.contact.email // Setup table descriptor for test columns with new schema: /** - Properties: - iceberg.schema: {"type":"struct","schema-id":0,"fields":[{"id":1,"name":"id","required":true,"type":"long"},{"id":2,"name":"name","required":true,"type":"string"},{"id":3,"name":"profile","required":true,"type":{"type":"struct","fields":[{"id":4,"name":"address","required":false,"type":{"type":"struct","fields":[{"id":7,"name":"street","required":false,"type":"string"},{"id":8,"name":"city","required":false,"type":"string"},{"id":9,"name":"coordinates","required":false,"type":{"type" [...] 
Schema: message table { required int64 id = 1; @@ -603,12 +601,10 @@ TEST_F(IcebergReaderTest, ReadIcebergParquetFile) { } // Test reading real Iceberg Orc file using IcebergTableReader -TEST_F(IcebergReaderTest, ReadIcebergOrcFile) { +TEST_F(IcebergReaderTest, read_iceberg_orc_file) { // Read only: name, profile.address.coordinates.lat, profile.address.coordinates.lng, profile.contact.email // Setup table descriptor for test columns with new schema: /** - Properties: - iceberg.schema: {"type":"struct","schema-id":0,"fields":[{"id":1,"name":"id","required":true,"type":"long"},{"id":2,"name":"name","required":true,"type":"string"},{"id":3,"name":"profile","required":true,"type":{"type":"struct","fields":[{"id":4,"name":"address","required":false,"type":{"type":"struct","fields":[{"id":7,"name":"street","required":false,"type":"string"},{"id":8,"name":"city","required":false,"type":"string"},{"id":9,"name":"coordinates","required":false,"type":{"type" [...] Schema: message table { required int64 id = 1; --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
