This is an automated email from the ASF dual-hosted git repository.
kakachen pushed a commit to branch nested_column_prune
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/nested_column_prune by this
push:
new abaac93b380 Clean up the external table implementation's code.
abaac93b380 is described below
commit abaac93b3801ce22c1a54b593a1d884db87e4b06
Author: kakachen <[email protected]>
AuthorDate: Mon Nov 3 11:28:25 2025 +0800
Clean up the external table implementation's code.
---
be/src/vec/exec/format/orc/vorc_reader.cpp | 128 ++++++-----------
be/src/vec/exec/format/orc/vorc_reader.h | 6 +-
.../exec/format/parquet/vparquet_column_reader.cpp | 67 +--------
.../exec/format/parquet/vparquet_column_reader.h | 4 -
.../exec/format/parquet/vparquet_group_reader.cpp | 24 +---
.../hive/hive_reader_create_column_ids_test.cpp | 152 +++++++++------------
.../exec/format/table/hive/hive_reader_test.cpp | 4 +-
.../iceberg_reader_create_column_ids_test.cpp | 113 ++++++---------
.../format/table/iceberg/iceberg_reader_test.cpp | 8 +-
9 files changed, 168 insertions(+), 338 deletions(-)
diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp
b/be/src/vec/exec/format/orc/vorc_reader.cpp
index 11b75b820ca..1b6d4eadeb8 100644
--- a/be/src/vec/exec/format/orc/vorc_reader.cpp
+++ b/be/src/vec/exec/format/orc/vorc_reader.cpp
@@ -406,9 +406,6 @@ Status OrcReader::_init_read_columns() {
SCOPED_RAW_TIMER(&_statistics.init_column_time);
const auto& root_type = _reader->getType();
- // Save file root type for later use in type conversion
- _file_root_type = &root_type;
-
// Build column ID to file type mapping for all columns in file
_column_id_to_file_type.clear();
std::function<void(const orc::Type*, int)> build_column_id_map = [&](const
orc::Type* type,
@@ -1191,7 +1188,7 @@ Status OrcReader::set_fill_columns(
if (!_column_ids.empty()) {
std::list<uint64_t> column_ids_list(_column_ids.begin(),
_column_ids.end());
_row_reader_options.includeTypes(column_ids_list);
- } else {
+ } else { // If column_ids is empty, include all top-level columns to
be read.
_row_reader_options.include(_read_file_cols);
}
_row_reader_options.setEnableLazyDecoding(true);
@@ -1281,35 +1278,6 @@ Status OrcReader::set_fill_columns(
}
}
- // Helper function to recursively print ORC type structure
- std::function<void(const orc::Type*, int, const std::string&)>
print_orc_type_tree =
- [&](const orc::Type* type, int indent, const std::string&
name) {
- if (type) {
- std::string indent_str(indent * 2, ' ');
- VLOG_DEBUG << indent_str << "- " << name
- << " (type_id=" << type->getColumnId()
- << ", kind=" <<
static_cast<int>(type->getKind()) << ")";
- }
-
- if (type->getKind() == orc::TypeKind::STRUCT) {
- for (uint64_t i = 0; i < type->getSubtypeCount(); ++i)
{
- std::string field_name = type->getFieldName(i);
- print_orc_type_tree(type->getSubtype(i), indent +
1, field_name);
- }
- } else if (type->getKind() == orc::TypeKind::LIST) {
- print_orc_type_tree(type->getSubtype(0), indent + 1,
"element");
- } else if (type->getKind() == orc::TypeKind::MAP) {
- print_orc_type_tree(type->getSubtype(0), indent + 1,
"key");
- print_orc_type_tree(type->getSubtype(1), indent + 1,
"value");
- } else {
- // Primitive type, no children
- }
- };
-
- // VLOG_DEBUG << "\n[OrcReader] ========== ORC Type Structure
==========";
- // print_orc_type_tree(&selected_type, 0, "root");
- // VLOG_DEBUG << "[OrcReader]
==========================================\n";
-
_type_map.clear();
if (_is_acid) {
for (uint64_t i = 0; i < selected_type.getSubtypeCount(); ++i) {
@@ -1429,11 +1397,6 @@ Status OrcReader::_fill_missing_columns(
block->erase(result_column_id);
}
}
-
- // // no default column, fill with null
- // auto mutable_column =
block->get_by_name(kv.first).column->assume_mutable();
- // auto* nullable_column =
static_cast<vectorized::ColumnNullable*>(mutable_column.get());
- // nullable_column->insert_many_defaults(rows);
}
return Status::OK();
}
@@ -1542,22 +1505,22 @@ DataTypePtr OrcReader::convert_to_doris_type(const
orc::Type* orc_type) {
orc_type->getSubtype(1) == nullptr) {
// Try to find the complete type from _column_id_to_file_type
uint64_t column_id = orc_type->getColumnId();
- VLOG(1) << "[OrcReader] Detected incomplete MAP type: column_id="
<< column_id
- << ", subtype_count=" << orc_type->getSubtypeCount() << ",
subtype(0)="
- << (orc_type->getSubtypeCount() > 0 &&
orc_type->getSubtype(0) != nullptr
- ? "not null"
- : "null")
- << ", subtype(1)="
- << (orc_type->getSubtypeCount() > 1 &&
orc_type->getSubtype(1) != nullptr
- ? "not null"
- : "null");
+ VLOG_DEBUG << "[OrcReader] Detected incomplete MAP type:
column_id=" << column_id
+ << ", subtype_count=" << orc_type->getSubtypeCount() <<
", subtype(0)="
+ << (orc_type->getSubtypeCount() > 0 &&
orc_type->getSubtype(0) != nullptr
+ ? "not null"
+ : "null")
+ << ", subtype(1)="
+ << (orc_type->getSubtypeCount() > 1 &&
orc_type->getSubtype(1) != nullptr
+ ? "not null"
+ : "null");
auto it = _column_id_to_file_type.find(column_id);
if (it != _column_id_to_file_type.end() && it->second != nullptr) {
const orc::Type* complete_type = it->second;
- VLOG(1) << "[OrcReader] Found complete type in mapping:
column_id=" << column_id
- << ", complete_type_kind=" <<
static_cast<int>(complete_type->getKind())
- << ", complete_subtype_count=" <<
complete_type->getSubtypeCount();
+ VLOG_DEBUG << "[OrcReader] Found complete type in mapping:
column_id=" << column_id
+ << ", complete_type_kind=" <<
static_cast<int>(complete_type->getKind())
+ << ", complete_subtype_count=" <<
complete_type->getSubtypeCount();
// Print subtypes information
for (uint64_t i = 0; i < complete_type->getSubtypeCount();
++i) {
@@ -1573,7 +1536,8 @@ DataTypePtr OrcReader::convert_to_doris_type(const
orc::Type* orc_type) {
if (complete_type->getKind() == orc::TypeKind::MAP &&
complete_type->getSubtypeCount() == 2) {
- VLOG(1) << "[OrcReader] Using complete MAP type from file
schema for column_id="
+ VLOG_DEBUG
+ << "[OrcReader] Using complete MAP type from file
schema for column_id="
<< column_id;
// Get subtypes with extra validation
@@ -1925,18 +1889,18 @@ Status OrcReader::_fill_doris_data_column(const
std::string& col_name,
const orc::Type* orc_key_type = orc_column_type->getSubtype(0);
const orc::Type* orc_value_type = orc_column_type->getSubtype(1);
- VLOG(1) << "[OrcReader] MAP column '" << col_name
- << "': orc_key_type=" << (orc_key_type != nullptr ? "not null"
: "NULL")
- << ", orc_value_type=" << (orc_value_type != nullptr ? "not
null" : "NULL")
- << ", element_size=" << element_size;
+ VLOG_DEBUG << "[OrcReader] MAP column '" << col_name
+ << "': orc_key_type=" << (orc_key_type != nullptr ? "not
null" : "NULL")
+ << ", orc_value_type=" << (orc_value_type != nullptr ? "not
null" : "NULL")
+ << ", element_size=" << element_size;
// Handle incomplete MAP type - if key or value type is nullptr, try
to recover from mapping
bool key_is_missing = (orc_key_type == nullptr);
bool value_is_missing = (orc_value_type == nullptr);
if (key_is_missing || value_is_missing) {
- VLOG(1) << "[OrcReader] Detected incomplete MAP subtypes for
column '" << col_name
- << "', attempting to recover from mapping...";
+ VLOG_DEBUG << "[OrcReader] Detected incomplete MAP subtypes for
column '" << col_name
+ << "', attempting to recover from mapping...";
uint64_t column_id = orc_column_type->getColumnId();
auto it = _column_id_to_file_type.find(column_id);
@@ -1948,15 +1912,16 @@ Status OrcReader::_fill_doris_data_column(const
std::string& col_name,
orc_key_type = complete_map_type->getSubtype(0);
if (orc_key_type != nullptr) {
// key_is_missing = false;
- VLOG(1) << "[OrcReader] Recovered key type from
mapping for column '"
- << col_name << "'";
+ VLOG_DEBUG << "[OrcReader] Recovered key type from
mapping for column '"
+ << col_name << "'";
}
}
if (value_is_missing) {
orc_value_type = complete_map_type->getSubtype(1);
if (orc_value_type != nullptr) {
// value_is_missing = false;
- VLOG(1) << "[OrcReader] Recovered value type from
mapping for column '"
+ VLOG_DEBUG
+ << "[OrcReader] Recovered value type from
mapping for column '"
<< col_name << "'";
}
}
@@ -2072,7 +2037,6 @@ Status OrcReader::_fill_doris_data_column(const
std::string& col_name,
for (auto read_field : read_fields) {
orc::ColumnVectorBatch* orc_field =
orc_struct->fields[read_field.second];
-
const orc::Type* orc_type =
orc_column_type->getSubtype(read_field.second);
std::string field_name =
col_name + "." +
orc_column_type->getFieldName(read_field.second);
@@ -2099,7 +2063,6 @@ Status OrcReader::_orc_column_to_doris_column(
const orc::ColumnVectorBatch* cvb, size_t num_values) {
DataTypePtr resolved_type;
ColumnPtr resolved_column;
- converter::ColumnTypeConverter* converter_ptr = nullptr;
MutableColumnPtr data_column;
if (orc_column_type != nullptr) {
auto src_type = convert_to_doris_type(orc_column_type);
@@ -2129,9 +2092,9 @@ Status OrcReader::_orc_column_to_doris_column(
// reuse the cached converter
_converters[converter_key] = std::move(converter);
}
- converter_ptr = _converters[converter_key].get();
- resolved_column = converter_ptr->get_column(src_type, doris_column,
data_type);
- resolved_type = converter_ptr->get_type();
+ converter::ColumnTypeConverter* converter =
_converters[converter_key].get();
+ resolved_column = converter->get_column(src_type, doris_column,
data_type);
+ resolved_type = converter->get_type();
if (resolved_column->is_nullable()) {
SCOPED_RAW_TIMER(&_statistics.decode_null_map_time);
@@ -2158,17 +2121,20 @@ Status OrcReader::_orc_column_to_doris_column(
data_column = resolved_column->assume_mutable();
}
- RETURN_IF_ERROR((_fill_doris_data_column<is_filter>(
+ RETURN_IF_ERROR(_fill_doris_data_column<is_filter>(
col_name, data_column, remove_nullable(resolved_type),
root_node, orc_column_type,
- cvb, num_values)));
- // if (converter_ptr != nullptr) {
+ cvb, num_values));
// resolve schema change
auto converted_column = doris_column->assume_mutable();
- return converter_ptr->convert(resolved_column, converted_column);
- // }
+ return converter->convert(resolved_column, converted_column);
} else {
- reinterpret_cast<ColumnNullable*>(doris_column->assume_mutable().get())
- ->insert_many_defaults(num_values);
+ auto mutable_column = doris_column->assume_mutable();
+ if (mutable_column->is_nullable()) {
+ auto* nullable_column =
static_cast<ColumnNullable*>(mutable_column.get());
+ nullable_column->insert_many_defaults(num_values);
+ } else {
+ mutable_column->insert_many_defaults(num_values);
+ }
}
return Status::OK();
@@ -2272,10 +2238,10 @@ Status OrcReader::_get_next_block_impl(Block* block,
size_t* read_rows, bool* eo
if (orc_col_idx == _colname_to_idx.end()) {
return Status::InternalError("Wrong read column '{}' in orc
file", col_name);
}
- RETURN_IF_ERROR((_orc_column_to_doris_column<true>(
+ RETURN_IF_ERROR(_orc_column_to_doris_column<true>(
col_name, column_ptr, column_type,
_table_info_node_ptr->get_children_node(col_name),
_type_map[file_column_name],
- batch_vec[orc_col_idx->second], _batch->numElements)));
+ batch_vec[orc_col_idx->second], _batch->numElements));
}
RETURN_IF_ERROR(_fill_partition_columns(block, _batch->numElements,
@@ -2330,12 +2296,6 @@ Status OrcReader::_get_next_block_impl(Block* block,
size_t* read_rows, bool* eo
}
}
- //std::cout << "[OrcReader] _column_ids: ";
- //for (const auto& cid : _column_ids) {
- // std::cout << cid << " ";
- //}
- //std::cout << std::endl;
-
if (!_dict_cols_has_converted && !_dict_filter_cols.empty()) {
for (auto& dict_filter_cols : _dict_filter_cols) {
MutableColumnPtr dict_col_ptr = ColumnInt32::create();
@@ -2369,10 +2329,10 @@ Status OrcReader::_get_next_block_impl(Block* block,
size_t* read_rows, bool* eo
if (orc_col_idx == _colname_to_idx.end()) {
return Status::InternalError("Wrong read column '{}' in orc
file", col_name);
}
- RETURN_IF_ERROR((_orc_column_to_doris_column<false>(
+ RETURN_IF_ERROR(_orc_column_to_doris_column<false>(
col_name, column_ptr, column_type,
_table_info_node_ptr->get_children_node(col_name),
_type_map[file_column_name],
- batch_vec[orc_col_idx->second], _batch->numElements)));
+ batch_vec[orc_col_idx->second], _batch->numElements));
}
RETURN_IF_ERROR(_fill_partition_columns(block, _batch->numElements,
@@ -2535,10 +2495,10 @@ Status OrcReader::filter(orc::ColumnVectorBatch& data,
uint16_t* sel, uint16_t s
if (orc_col_idx == _colname_to_idx.end()) {
return Status::InternalError("Wrong read column '{}' in orc file",
table_col_name);
}
- RETURN_IF_ERROR((_orc_column_to_doris_column<false>(
+ RETURN_IF_ERROR(_orc_column_to_doris_column<false>(
table_col_name, column_ptr, column_type,
_table_info_node_ptr->get_children_node(table_col_name),
- _type_map[file_column_name], batch_vec[orc_col_idx->second],
data.numElements)));
+ _type_map[file_column_name], batch_vec[orc_col_idx->second],
data.numElements));
}
RETURN_IF_ERROR(
_fill_partition_columns(block, size,
_lazy_read_ctx.predicate_partition_columns));
diff --git a/be/src/vec/exec/format/orc/vorc_reader.h
b/be/src/vec/exec/format/orc/vorc_reader.h
index 341e40a9ce2..d1313bc4639 100644
--- a/be/src/vec/exec/format/orc/vorc_reader.h
+++ b/be/src/vec/exec/format/orc/vorc_reader.h
@@ -639,7 +639,6 @@ private:
size_t _batch_size;
int64_t _range_start_offset;
int64_t _range_size;
- // const std::string& _ctz;
std::string _ctz;
int32_t _offset_days = 0;
@@ -663,11 +662,8 @@ private:
// file column name to orc type
std::unordered_map<std::string, const orc::Type*> _type_map;
- // Optimized type conversion support
- // column ID to file original type mapping for handling partial column
selection
+ // Column ID to file original type mapping for handling incomplete MAP
type due to column pruning.
std::unordered_map<uint64_t, const orc::Type*> _column_id_to_file_type;
- // Keep reference to file root type for complete type information
- const orc::Type* _file_root_type = nullptr;
std::unique_ptr<ORCFileInputStream> _file_input_stream;
Statistics _statistics;
diff --git a/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp
b/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp
index a7455455198..589570c7ab0 100644
--- a/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp
+++ b/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp
@@ -111,23 +111,6 @@ Status ParquetColumnReader::create(io::FileReaderSPtr
file, FieldSchema* field,
size_t max_buf_size, const
tparquet::OffsetIndex* offset_index,
const std::set<uint64_t>& column_ids,
const std::set<uint64_t>&
filter_column_ids) {
- // if (!column_ids.empty()) {
- // uint64_t field_column_id = field->get_column_id();
- // VLOG_DEBUG << "[ParquetReader] Checking column_id: " <<
field_column_id
- // << ", column_ids.size(): " << column_ids.size();
- // if (column_ids.find(field_column_id) == column_ids.end()) {
- // VLOG_DEBUG << "[ParquetReader] Setting skip_reading=true for
column_id: "
- // << field_column_id;
- // auto skip_reader =
std::make_unique<SkipReadingReader>(row_ranges, ctz, io_ctx, field);
- // skip_reader->_filter_column_ids = filter_column_ids;
- // reader = std::move(skip_reader);
- // return Status::OK();
- // } else {
- // VLOG_DEBUG << "[ParquetReader] Column_id " << field_column_id
- // << " found in column_ids, skip_reading=false";
- // }
- // }
-
if (field->data_type->get_primitive_type() == TYPE_ARRAY) {
std::unique_ptr<ParquetColumnReader> element_reader;
RETURN_IF_ERROR(create(file, &field->children[0], row_group,
row_ranges, ctz, io_ctx,
@@ -177,7 +160,7 @@ Status ParquetColumnReader::create(io::FileReaderSPtr file,
FieldSchema* field,
child_readers.reserve(field->children.size());
int non_skip_reader_idx = -1;
for (int i = 0; i < field->children.size(); ++i) {
- auto& child = field->children[i]; // 去掉 const,保证 &child 类型为
FieldSchema*
+ auto& child = field->children[i];
std::unique_ptr<ParquetColumnReader> child_reader;
if (column_ids.empty() || column_ids.find(child.get_column_id())
!= column_ids.end()) {
RETURN_IF_ERROR(create(file, &child, row_group, row_ranges,
ctz, io_ctx,
@@ -185,32 +168,25 @@ Status ParquetColumnReader::create(io::FileReaderSPtr
file, FieldSchema* field,
filter_column_ids));
child_reader->set_nested_column();
child_readers[child.name] = std::move(child_reader);
- // 记录第一个非 SkippingReader
+ // Record the first non-SkippingReader
if (non_skip_reader_idx == -1) {
non_skip_reader_idx = i;
}
} else {
- // 不在 column_ids 的 child,创建 SkippingReader
auto skip_reader =
std::make_unique<SkipReadingReader>(row_ranges, ctz,
io_ctx, &child);
skip_reader->_filter_column_ids = filter_column_ids;
child_readers[child.name] = std::move(skip_reader);
}
}
- // 如果所有都是 SkipReadingReader,则强制第一个 child 调用 create
+ // If all children are SkipReadingReader, force the first child to
call create
if (non_skip_reader_idx == -1) {
- // for (int i = 0; i < field->children.size(); ++i) {
- // 只处理 SkipReadingReader(即刚才创建的)
- // if
(dynamic_cast<SkipReadingReader*>(child_readers[field->children[i].name].get())
!= nullptr) {
std::unique_ptr<ParquetColumnReader> child_reader;
RETURN_IF_ERROR(create(file, &field->children[0], row_group,
row_ranges, ctz, io_ctx,
child_reader, max_buf_size, nullptr,
column_ids,
filter_column_ids));
child_reader->set_nested_column();
child_readers[field->children[0].name] = std::move(child_reader);
- // break;
- // }
- // }
}
auto struct_reader = StructColumnReader::create_unique(row_ranges,
ctz, io_ctx);
RETURN_IF_ERROR(struct_reader->init(std::move(child_readers), field));
@@ -878,7 +854,6 @@ Status MapColumnReader::read_column_data(
value_rows += loop_rows;
}
DCHECK_EQ(key_rows, value_rows);
- // DCHECK_EQ(key_eof, value_eof);
*read_rows = key_rows;
*eof = key_eof;
@@ -948,10 +923,6 @@ Status StructColumnReader::read_column_data(
continue;
}
auto file_name = root_node->children_file_column_name(doris_name);
- // if (_child_readers.find(file_name) == _child_readers.end()) {
- // missing_column_idxs.push_back(i);
- // continue;
- // }
// Check if this is a SkipReadingReader - we should skip it when
choosing reference column
// because SkipReadingReader doesn't know the actual data size in
nested context
@@ -978,13 +949,6 @@ Status StructColumnReader::read_column_data(
batch_size, &field_rows, &field_eof, is_dict_filter));
*read_rows = field_rows;
*eof = field_eof;
- // Debug: print the first non-missing child column read info
- //std::cout << "[ParquetReader] struct '" << _field_schema->name
<< "' first non-missing child='"
- // << file_name << "' doris_name='" << doris_name << "'
field_rows=" << field_rows
- // << " field_eof=" << field_eof << " doris_field_size="
<< doris_field->size()
- // << " child_rep_levels=" <<
_child_readers[file_name]->get_rep_level().size()
- // << " child_def_levels=" <<
_child_readers[file_name]->get_def_level().size()
- // << std::endl;
/*
* Considering the issue in the `_read_nested_column` function
where data may span across pages, leading
* to missing definition and repetition levels, when filling the
null_map of the struct later, it is
@@ -1001,12 +965,6 @@ Status StructColumnReader::read_column_data(
doris_field, doris_type,
root_node->get_children_node(doris_name),
filter_map, *read_rows - field_rows, &loop_rows,
&field_eof,
is_dict_filter));
- // Debug: print each loop iteration for non-first child columns
- //std::cout << "[ParquetReader] struct '" <<
_field_schema->name << "' non-first child='"
- // << file_name << "' doris_name='" << doris_name <<
"' loop_rows=" << loop_rows
- // << " field_rows=" << field_rows << " target_rows="
<< *read_rows
- // << " field_eof=" << field_eof << "
doris_field_size=" << doris_field->size()
- // << std::endl;
field_rows += loop_rows;
}
DCHECK_EQ(*read_rows, field_rows);
@@ -1014,20 +972,11 @@ Status StructColumnReader::read_column_data(
}
}
- // After scanning children, print summary of missing/skip info
- //std::cout << "[ParquetReader] struct '" << _field_schema->name << "'
summary: not_missing_column_id="
- // << not_missing_column_id << " missing_columns_count=" <<
missing_column_idxs.size()
- // << " skip_reading_count=" << skip_reading_column_idxs.size()
<< std::endl;
-
int64_t missing_column_sz = -1;
if (not_missing_column_id == -1) {
// All queried columns are missing in the file (e.g., all added after
schema change)
- // We need to pick a column from _field_schema that exists in the file
to get RL/DL
- //std::cout << "[ParquetReader] All queried columns missing for struct
'" << _field_schema->name
- // << "', searching for reference column in _field_schema" <<
std::endl;
-
- // Find a column from _field_schema children that exists in the file
for RL/DL reference
+ // We need to pick a column from _field_schema children that exists in
the file for RL/DL reference
std::string reference_file_column_name;
std::unique_ptr<ParquetColumnReader>* reference_reader = nullptr;
@@ -1039,8 +988,6 @@ Status StructColumnReader::read_column_data(
if (!is_skip_reader) {
reference_file_column_name = child.name;
reference_reader = &(it->second);
- //std::cout << "[ParquetReader] Found reference column: "
<< reference_file_column_name
- // << " for struct '" << _field_schema->name <<
"'" << std::endl;
break;
}
}
@@ -1087,10 +1034,6 @@ Status StructColumnReader::read_column_data(
// Store this reference column name for
get_rep_level/get_def_level to use
_read_column_names.emplace_back(reference_file_column_name);
- //std::cout << "[ParquetReader] Read " << field_rows << " rows
from reference column '"
- // << reference_file_column_name << "',
temp_column.size=" << temp_column->size()
- // << " for struct '" << _field_schema->name << "'" <<
std::endl;
-
missing_column_sz = temp_column->size() -
not_missing_orig_column_size;
} else {
return Status::Corruption(
@@ -1139,8 +1082,6 @@ Status StructColumnReader::read_column_data(
}
if (null_map_ptr != nullptr) {
- //std::cout << "[ParquetReader] struct '" << _field_schema->name << "'
before fill_struct_null_map: rep_levels="
- // << this->get_rep_level().size() << " def_levels=" <<
this->get_def_level().size() << std::endl;
fill_struct_null_map(_field_schema, *null_map_ptr,
this->get_rep_level(),
this->get_def_level());
}
diff --git a/be/src/vec/exec/format/parquet/vparquet_column_reader.h
b/be/src/vec/exec/format/parquet/vparquet_column_reader.h
index 7b37f2e0e6c..893b3bc784a 100644
--- a/be/src/vec/exec/format/parquet/vparquet_column_reader.h
+++ b/be/src/vec/exec/format/parquet/vparquet_column_reader.h
@@ -412,8 +412,6 @@ public:
<< _field_schema->name
<< ". This indicates the SkipReadingReader was incorrectly
used as a reference "
"column.";
- // static std::vector<level_t> empty_levels;
- // return empty_levels;
__builtin_unreachable();
}
@@ -422,8 +420,6 @@ public:
<< _field_schema->name
<< ". This indicates the SkipReadingReader was incorrectly
used as a reference "
"column.";
- // static std::vector<level_t> empty_levels;
- // return empty_levels;
__builtin_unreachable();
}
diff --git a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp
b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp
index 19ddb8d1b99..947dcee3408 100644
--- a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp
+++ b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp
@@ -98,13 +98,7 @@ RowGroupReader::RowGroupReader(io::FileReaderSPtr
file_reader,
_state(state),
_obj_pool(new ObjectPool()),
_column_ids(column_ids),
- _filter_column_ids(filter_column_ids) {
- //std::cout << "[RowGroupReader] _column_ids: ";
- //for (const auto& col_id : _column_ids) {
- // std::cout << col_id << " ";
- //}
- //std::cout << std::endl;
-}
+ _filter_column_ids(filter_column_ids) {}
RowGroupReader::~RowGroupReader() {
_column_readers.clear();
@@ -438,14 +432,13 @@ Status RowGroupReader::_read_column_data(Block* block,
RETURN_IF_ERROR(_column_readers[read_col_name]->read_column_data(
column_ptr, column_type,
_table_info_node_ptr->get_children_node(read_col_name),
filter_map, batch_size - col_read_rows, &loop_rows,
&col_eof, is_dict_filter));
- // Debug: print loop_rows for this iteration
- //std::cout << "[RowGroupReader] column '" << read_col_name << "'
loop_rows=" << loop_rows
- // << " col_read_rows_so_far=" << col_read_rows <<
std::endl;
+ VLOG_DEBUG << "[RowGroupReader] column '" << read_col_name
+ << "' loop_rows=" << loop_rows << "
col_read_rows_so_far=" << col_read_rows
+ << std::endl;
col_read_rows += loop_rows;
}
- // Debug: print rows read for this column
- //std::cout << "[RowGroupReader] column '" << read_col_name << "'
read_rows=" << col_read_rows
- // << std::endl;
+ VLOG_DEBUG << "[RowGroupReader] column '" << read_col_name
+ << "' read_rows=" << col_read_rows << std::endl;
if (batch_read_rows > 0 && batch_read_rows != col_read_rows) {
LOG(WARNING) << "[RowGroupReader] Mismatched read rows among
parquet columns. "
"previous_batch_read_rows="
@@ -741,10 +734,6 @@ Status RowGroupReader::_fill_missing_columns(
block->erase(result_column_id);
}
}
-
- // auto mutable_column =
block->get_by_name(kv.first).column->assume_mutable();
- // auto* nullable_column =
assert_cast<vectorized::ColumnNullable*>(mutable_column.get());
- // nullable_column->insert_many_defaults(rows);
}
return Status::OK();
}
@@ -1135,7 +1124,6 @@ ParquetColumnReader::Statistics
RowGroupReader::statistics() {
}
return st;
}
-
#include "common/compile_check_end.h"
} // namespace doris::vectorized
diff --git
a/be/test/vec/exec/format/table/hive/hive_reader_create_column_ids_test.cpp
b/be/test/vec/exec/format/table/hive/hive_reader_create_column_ids_test.cpp
index d46acbdc134..eea63192857 100644
--- a/be/test/vec/exec/format/table/hive/hive_reader_create_column_ids_test.cpp
+++ b/be/test/vec/exec/format/table/hive/hive_reader_create_column_ids_test.cpp
@@ -60,7 +60,6 @@ struct ColumnAccessPathConfig {
std::string column_name;
std::vector<std::vector<std::string>> all_column_paths; // For
all_column_access_paths
std::vector<std::vector<std::string>> predicate_paths; // For
predicate_column_access_paths
- bool use_name_paths = true; // true=use field names, false=use indices
};
// ORC column IDs are assigned in a tree-increment order, root is 0 and
children increment sequentially.
@@ -738,12 +737,11 @@ protected:
}
// Helper function: Run Parquet test with different column ID extraction
methods
- void run_parquet_test_with_method(const std::vector<std::string>&
table_column_names,
- const
std::vector<ColumnAccessPathConfig>& access_configs,
- const std::set<uint64_t>&
expected_column_ids,
- const std::set<uint64_t>&
expected_filter_column_ids,
- bool use_top_level_method = false,
- bool should_skip_assertion = false) {
+ void run_parquet_test(const std::vector<std::string>& table_column_names,
+ const std::vector<ColumnAccessPathConfig>&
access_configs,
+ const std::set<uint64_t>& expected_column_ids,
+ const std::set<uint64_t>& expected_filter_column_ids,
+ bool use_top_level_method = false, bool
should_skip_assertion = false) {
std::string test_file =
"./be/test/exec/test_data/nested_user_profiles_parquet/"
"part-00000-64a7a390-1a03-4efc-ab51-557e9369a1f9-c000.snappy.parquet";
@@ -809,12 +807,11 @@ protected:
}
// Helper function: Run ORC test with different column ID extraction
methods
- void run_orc_test_with_method(const std::vector<std::string>&
table_column_names,
- const std::vector<ColumnAccessPathConfig>&
access_configs,
- const std::set<uint64_t>&
expected_column_ids,
- const std::set<uint64_t>&
expected_filter_column_ids,
- bool use_top_level_method = false,
- bool should_skip_assertion = false) {
+ void run_orc_test(const std::vector<std::string>& table_column_names,
+ const std::vector<ColumnAccessPathConfig>&
access_configs,
+ const std::set<uint64_t>& expected_column_ids,
+ const std::set<uint64_t>& expected_filter_column_ids,
+ bool use_top_level_method = false, bool
should_skip_assertion = false) {
std::string test_file =
"./be/test/exec/test_data/nested_user_profiles_orc/"
"part-00000-62614f23-05d1-4043-a533-b155ef52b720-c000.snappy.orc";
@@ -881,12 +878,8 @@ protected:
};
TEST_F(HiveReaderCreateColumnIdsTest, test_create_column_ids_1) {
- // Properties:
-
- // Configure access paths for profile column
ColumnAccessPathConfig access_config;
access_config.column_name = "profile";
- access_config.use_name_paths = true;
access_config.all_column_paths = {{"profile", "address", "coordinates",
"lat"},
{"profile", "address", "coordinates",
"lng"},
@@ -900,21 +893,20 @@ TEST_F(HiveReaderCreateColumnIdsTest,
test_create_column_ids_1) {
std::set<uint64_t> expected_column_ids = {2, 3, 4, 7, 8, 9, 10, 11, 15,
16, 18};
std::set<uint64_t> expected_filter_column_ids = {3, 4, 7, 8, 10, 11};
- run_parquet_test_with_method(table_column_names, {access_config},
expected_column_ids,
- expected_filter_column_ids);
- run_parquet_test_with_method(table_column_names, {access_config},
expected_column_ids,
- expected_filter_column_ids, true);
+ run_parquet_test(table_column_names, {access_config}, expected_column_ids,
+ expected_filter_column_ids);
+ run_parquet_test(table_column_names, {access_config}, expected_column_ids,
+ expected_filter_column_ids, true);
- run_orc_test_with_method(table_column_names, {access_config},
expected_column_ids,
- expected_filter_column_ids);
- run_orc_test_with_method(table_column_names, {access_config},
expected_column_ids,
- expected_filter_column_ids, true);
+ run_orc_test(table_column_names, {access_config}, expected_column_ids,
+ expected_filter_column_ids);
+ run_orc_test(table_column_names, {access_config}, expected_column_ids,
+ expected_filter_column_ids, true);
}
TEST_F(HiveReaderCreateColumnIdsTest, test_create_column_ids_2) {
- // Properties:
- // iceberg.schema:
{"type":"struct","schema-id":0,"fields":[{"id":1,"name":"id","required":true,"type":"long"},{"id":2,"name":"name","required":true,"type":"string"},{"id":3,"name":"profile","required":true,"type":{"type":"struct","fields":[{"id":4,"name":"address","required":false,"type":{"type":"struct","fields":[{"id":7,"name":"street","required":false,"type":"string"},{"id":8,"name":"city","required":false,"type":"string"},{"id":9,"name":"coordinates","required":false,"type":
[...]
- // ORC column IDs are assigned in a tree-increment order, root is 0 and
children increment sequentially.
+ // ORC column IDs are assigned in a tree-like incremental manner: the root
node is 0, and child nodes increase sequentially.
+ // Currently, Parquet uses a similar design.
// 0: struct (table/root)
// 1: id (int64)
// 2: name (string)
@@ -935,36 +927,32 @@ TEST_F(HiveReaderCreateColumnIdsTest,
test_create_column_ids_2) {
// 17: name (string)
// 18: level (int32)
- // 配置profile列的访问路径
ColumnAccessPathConfig access_config;
access_config.column_name = "profile";
- access_config.use_name_paths = true;
access_config.all_column_paths = {{"profile"}};
access_config.predicate_paths = {{"profile", "address", "coordinates",
"lat"},
{"profile", "contact", "email"}};
std::vector<std::string> table_column_names = {"name", "profile"};
- // column_ids should contain all necessary column IDs (set automatically
deduplicates)
std::set<uint64_t> expected_column_ids = {2, 3, 4, 5, 6, 7, 8, 9,
10,
11, 12, 13, 14, 15, 16, 17, 18};
std::set<uint64_t> expected_filter_column_ids = {3, 4, 7, 8, 10, 11};
- run_parquet_test_with_method(table_column_names, {access_config},
expected_column_ids,
- expected_filter_column_ids);
- run_parquet_test_with_method(table_column_names, {access_config},
expected_column_ids,
- expected_filter_column_ids, true);
+ run_parquet_test(table_column_names, {access_config}, expected_column_ids,
+ expected_filter_column_ids);
+ run_parquet_test(table_column_names, {access_config}, expected_column_ids,
+ expected_filter_column_ids, true);
- run_orc_test_with_method(table_column_names, {access_config},
expected_column_ids,
- expected_filter_column_ids);
- run_orc_test_with_method(table_column_names, {access_config},
expected_column_ids,
- expected_filter_column_ids, true);
+ run_orc_test(table_column_names, {access_config}, expected_column_ids,
+ expected_filter_column_ids);
+ run_orc_test(table_column_names, {access_config}, expected_column_ids,
+ expected_filter_column_ids, true);
}
TEST_F(HiveReaderCreateColumnIdsTest, test_create_column_ids_3) {
- // Properties:
- // iceberg.schema:
{"type":"struct","schema-id":0,"fields":[{"id":1,"name":"id","required":true,"type":"long"},{"id":2,"name":"name","required":true,"type":"string"},{"id":3,"name":"profile","required":true,"type":{"type":"struct","fields":[{"id":4,"name":"address","required":false,"type":{"type":"struct","fields":[{"id":7,"name":"street","required":false,"type":"string"},{"id":8,"name":"city","required":false,"type":"string"},{"id":9,"name":"coordinates","required":false,"type":
[...]
- // ORC column IDs are assigned in a tree-increment order, root is 0 and
children increment sequentially.
+ // ORC column IDs are assigned in a tree-like incremental manner: the root
node is 0, and child nodes increase sequentially.
+ // Currently, Parquet uses a similar design.
// 0: struct (table/root)
// 1: id (int64)
// 2: name (string)
@@ -985,58 +973,54 @@ TEST_F(HiveReaderCreateColumnIdsTest,
test_create_column_ids_3) {
// 17: name (string)
// 18: level (int32)
- // 配置profile列的访问路径
ColumnAccessPathConfig access_config;
access_config.column_name = "profile";
- access_config.use_name_paths = true;
access_config.all_column_paths = {{"profile", "contact"}, {"profile",
"address"}};
access_config.predicate_paths = {{"profile", "address", "coordinates"},
{"profile", "contact", "email"}};
std::vector<std::string> table_column_names = {"name", "profile"};
- // column_ids should contain all necessary column IDs (set automatically
deduplicates)
std::set<uint64_t> expected_column_ids = {2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14};
std::set<uint64_t> expected_filter_column_ids = {3, 4, 7, 8, 9, 10, 11};
- run_parquet_test_with_method(table_column_names, {access_config},
expected_column_ids,
- expected_filter_column_ids);
- run_parquet_test_with_method(table_column_names, {access_config},
expected_column_ids,
- expected_filter_column_ids, true);
+ run_parquet_test(table_column_names, {access_config}, expected_column_ids,
+ expected_filter_column_ids);
+ run_parquet_test(table_column_names, {access_config}, expected_column_ids,
+ expected_filter_column_ids, true);
- run_orc_test_with_method(table_column_names, {access_config},
expected_column_ids,
- expected_filter_column_ids);
- run_orc_test_with_method(table_column_names, {access_config},
expected_column_ids,
- expected_filter_column_ids, true);
+ run_orc_test(table_column_names, {access_config}, expected_column_ids,
+ expected_filter_column_ids);
+ run_orc_test(table_column_names, {access_config}, expected_column_ids,
+ expected_filter_column_ids, true);
}
TEST_F(HiveReaderCreateColumnIdsTest, test_create_column_ids_4) {
- // 配置profile列的访问路径
ColumnAccessPathConfig access_config;
access_config.column_name = "profile";
- access_config.use_name_paths = true;
access_config.all_column_paths = {};
access_config.predicate_paths = {};
std::vector<std::string> table_column_names = {"name", "profile"};
- // column_ids should contain all necessary column IDs (set automatically
deduplicates)
std::set<uint64_t> expected_column_ids = {2, 3, 4, 5, 6, 7, 8, 9,
10,
11, 12, 13, 14, 15, 16, 17, 18};
std::set<uint64_t> expected_filter_column_ids = {};
- run_parquet_test_with_method(table_column_names, {access_config},
expected_column_ids,
- expected_filter_column_ids);
- run_parquet_test_with_method(table_column_names, {access_config},
expected_column_ids,
- expected_filter_column_ids, true);
+ run_parquet_test(table_column_names, {access_config}, expected_column_ids,
+ expected_filter_column_ids);
+ run_parquet_test(table_column_names, {access_config}, expected_column_ids,
+ expected_filter_column_ids, true);
- run_orc_test_with_method(table_column_names, {access_config},
expected_column_ids,
- expected_filter_column_ids);
- run_orc_test_with_method(table_column_names, {access_config},
expected_column_ids,
- expected_filter_column_ids, true);
+ run_orc_test(table_column_names, {access_config}, expected_column_ids,
+ expected_filter_column_ids);
+ run_orc_test(table_column_names, {access_config}, expected_column_ids,
+ expected_filter_column_ids, true);
}
TEST_F(HiveReaderCreateColumnIdsTest, test_create_column_ids_5) {
+ // ORC column IDs are assigned in a tree-like incremental manner: the root
node is 0, and child nodes increase sequentially.
+ // Currently, Parquet uses a similar design.
// 19: tags (array)
// 20: element (string)
// 21: friends (array)
@@ -1058,7 +1042,6 @@ TEST_F(HiveReaderCreateColumnIdsTest,
test_create_column_ids_5) {
// Configure access paths for friends column
ColumnAccessPathConfig access_config;
access_config.column_name = "friends";
- access_config.use_name_paths = true;
access_config.all_column_paths = {{"friends", "*", "nickname"},
{"friends", "*",
"friendship_level"}};
@@ -1072,7 +1055,6 @@ TEST_F(HiveReaderCreateColumnIdsTest,
test_create_column_ids_5) {
// Configure access paths for recent_activity column
ColumnAccessPathConfig access_config;
access_config.column_name = "recent_activity";
- access_config.use_name_paths = true;
access_config.all_column_paths = {{"recent_activity", "*", "action"},
{"recent_activity", "*", "details",
"*", "value"}};
@@ -1083,22 +1065,23 @@ TEST_F(HiveReaderCreateColumnIdsTest,
test_create_column_ids_5) {
}
std::vector<std::string> table_column_names = {"name", "friends",
"recent_activity"};
- // column_ids should contain all necessary column IDs (set automatically
deduplicates)
std::set<uint64_t> expected_column_ids = {2, 21, 22, 24, 25, 26, 27, 28,
29, 30, 32};
std::set<uint64_t> expected_filter_column_ids = {21, 22, 24, 26, 27, 28};
- run_parquet_test_with_method(table_column_names, access_configs,
expected_column_ids,
- expected_filter_column_ids);
- run_parquet_test_with_method(table_column_names, access_configs,
expected_column_ids,
- expected_filter_column_ids, true);
+ run_parquet_test(table_column_names, access_configs, expected_column_ids,
+ expected_filter_column_ids);
+ run_parquet_test(table_column_names, access_configs, expected_column_ids,
+ expected_filter_column_ids, true);
- run_orc_test_with_method(table_column_names, access_configs,
expected_column_ids,
- expected_filter_column_ids);
- run_orc_test_with_method(table_column_names, access_configs,
expected_column_ids,
- expected_filter_column_ids, true);
+ run_orc_test(table_column_names, access_configs, expected_column_ids,
+ expected_filter_column_ids);
+ run_orc_test(table_column_names, access_configs, expected_column_ids,
+ expected_filter_column_ids, true);
}
TEST_F(HiveReaderCreateColumnIdsTest, test_create_column_ids_6) {
+ // ORC column IDs are assigned in a tree-like incremental manner: the root
node is 0, and child nodes increase sequentially.
+ // Currently, Parquet uses a similar design.
// 33: attributes (map)
// 34: key (string)
// 35: value (string)
@@ -1165,7 +1148,6 @@ TEST_F(HiveReaderCreateColumnIdsTest,
test_create_column_ids_6) {
// Configure access paths for complex_attributes column
ColumnAccessPathConfig access_config;
access_config.column_name = "complex_attributes";
- access_config.use_name_paths = true;
access_config.all_column_paths = {
{"complex_attributes", "*", "metadata", "version"},
@@ -1183,30 +1165,28 @@ TEST_F(HiveReaderCreateColumnIdsTest,
test_create_column_ids_6) {
{
std::vector<std::string> table_column_names = {"name",
"complex_attributes"};
// parquet values should access keys
- // column_ids should contain all necessary column IDs (set
automatically deduplicates)
std::set<uint64_t> expected_column_ids = {2, 36, 37, 38, 39, 40, 44,
45, 48, 49, 52, 53,
54, 61, 62, 63, 64, 65, 66,
67, 68, 69, 70, 71,
72, 73, 74, 75, 76, 77, 79,
80, 82, 83};
std::set<uint64_t> expected_filter_column_ids = {36, 37, 38, 39, 40};
- run_parquet_test_with_method(table_column_names, access_configs,
expected_column_ids,
- expected_filter_column_ids);
- run_parquet_test_with_method(table_column_names, access_configs,
expected_column_ids,
- expected_filter_column_ids, true);
+ run_parquet_test(table_column_names, access_configs,
expected_column_ids,
+ expected_filter_column_ids);
+ run_parquet_test(table_column_names, access_configs,
expected_column_ids,
+ expected_filter_column_ids, true);
}
{
std::vector<std::string> table_column_names = {"name",
"complex_attributes"};
// orc values should access keys because need to deduplicate by keys
- // column_ids should contain all necessary column IDs (set
automatically deduplicates)
std::set<uint64_t> expected_column_ids = {2, 36, 37, 38, 39, 40, 44,
45, 48, 49, 52, 53,
54, 61, 62, 63, 64, 65, 66,
67, 68, 69, 70, 71,
72, 73, 74, 75, 76, 77, 79,
80, 82, 83};
std::set<uint64_t> expected_filter_column_ids = {36, 37, 38, 39, 40};
- run_orc_test_with_method(table_column_names, access_configs,
expected_column_ids,
- expected_filter_column_ids);
- run_orc_test_with_method(table_column_names, access_configs,
expected_column_ids,
- expected_filter_column_ids, true);
+ run_orc_test(table_column_names, access_configs, expected_column_ids,
+ expected_filter_column_ids);
+ run_orc_test(table_column_names, access_configs, expected_column_ids,
+ expected_filter_column_ids, true);
}
}
diff --git a/be/test/vec/exec/format/table/hive/hive_reader_test.cpp
b/be/test/vec/exec/format/table/hive/hive_reader_test.cpp
index 16bde5c7bae..ca2cada932f 100644
--- a/be/test/vec/exec/format/table/hive/hive_reader_test.cpp
+++ b/be/test/vec/exec/format/table/hive/hive_reader_test.cpp
@@ -462,7 +462,7 @@ protected:
};
// Test reading real Hive Parquet file using HiveTableReader
-TEST_F(HiveReaderTest, ReadHiveParquetFile) {
+TEST_F(HiveReaderTest, read_hive_parquet_file) {
// Read only: name, profile.address.coordinates.lat,
profile.address.coordinates.lng, profile.contact.email
// Setup table descriptor for test columns with new schema:
/**
@@ -604,7 +604,7 @@ TEST_F(HiveReaderTest, ReadHiveParquetFile) {
}
// Test reading real Hive Orc file using HiveTableReader
-TEST_F(HiveReaderTest, ReadHiveOrcFile) {
+TEST_F(HiveReaderTest, read_hive_orc_file) {
// Read only: name, profile.address.coordinates.lat,
profile.address.coordinates.lng, profile.contact.email
// Setup table descriptor for test columns with new schema:
/**
diff --git
a/be/test/vec/exec/format/table/iceberg/iceberg_reader_create_column_ids_test.cpp
b/be/test/vec/exec/format/table/iceberg/iceberg_reader_create_column_ids_test.cpp
index b1152f159bd..d897f0e4762 100644
---
a/be/test/vec/exec/format/table/iceberg/iceberg_reader_create_column_ids_test.cpp
+++
b/be/test/vec/exec/format/table/iceberg/iceberg_reader_create_column_ids_test.cpp
@@ -60,10 +60,9 @@ struct ColumnAccessPathConfig {
std::string column_name;
std::vector<std::vector<std::string>> all_column_paths; // For
all_column_access_paths
std::vector<std::vector<std::string>> predicate_paths; // For
predicate_column_access_paths
- bool use_name_paths = true; // true=use field names, false=use indices
};
-// Iceberg 列ID分配 (基于 Protobuf Schema 定义)
+// Iceberg column ID assignment (based on Protobuf schema definition)
// 1: id (int64)
// 2: name (string)
// 3: profile (struct)
@@ -770,12 +769,11 @@ protected:
}
// Helper function: run Parquet test with different column ID extraction
methods
- void run_parquet_test_with_method(const std::vector<std::string>&
table_column_names,
- const
std::vector<ColumnAccessPathConfig>& access_configs,
- const std::set<uint64_t>&
expected_column_ids,
- const std::set<uint64_t>&
expected_filter_column_ids,
- bool use_top_level_method = false,
- bool should_skip_assertion = false) {
+ void run_parquet_test(const std::vector<std::string>& table_column_names,
+ const std::vector<ColumnAccessPathConfig>&
access_configs,
+ const std::set<uint64_t>& expected_column_ids,
+ const std::set<uint64_t>& expected_filter_column_ids,
+ bool use_top_level_method = false, bool
should_skip_assertion = false) {
std::string test_file =
"./be/test/exec/test_data/nested_user_profiles_iceberg_parquet/data/"
"00000-9-a7e0135f-d581-40e4-8d56-a929aded99e4-0-00001.parquet";
@@ -841,12 +839,11 @@ protected:
}
// Helper function: run Orc test with different column ID extraction
methods
- void run_orc_test_with_method(const std::vector<std::string>&
table_column_names,
- const std::vector<ColumnAccessPathConfig>&
access_configs,
- const std::set<uint64_t>&
expected_column_ids,
- const std::set<uint64_t>&
expected_filter_column_ids,
- bool use_top_level_method = false,
- bool should_skip_assertion = false) {
+ void run_orc_test(const std::vector<std::string>& table_column_names,
+ const std::vector<ColumnAccessPathConfig>&
access_configs,
+ const std::set<uint64_t>& expected_column_ids,
+ const std::set<uint64_t>& expected_filter_column_ids,
+ bool use_top_level_method = false, bool
should_skip_assertion = false) {
std::string test_file =
"./be/test/exec/test_data/nested_user_profiles_iceberg_orc/data/"
"00000-8-5a144c37-16a4-47c6-96db-0007175b5c90-0-00001.orc";
@@ -913,12 +910,8 @@ protected:
};
TEST_F(IcebergReaderCreateColumnIdsTest, test_create_column_ids_1) {
- // Properties:
-
- // Configure access paths for the profile column
ColumnAccessPathConfig access_config;
access_config.column_name = "profile";
- access_config.use_name_paths = true;
access_config.all_column_paths = {{"3", "9", "14", "15"},
{"3", "9", "14", "16"},
@@ -931,16 +924,15 @@ TEST_F(IcebergReaderCreateColumnIdsTest,
test_create_column_ids_1) {
std::set<uint64_t> expected_column_ids = {2, 3, 4, 7, 8, 9, 10, 11, 15,
16, 18};
std::set<uint64_t> expected_filter_column_ids = {3, 4, 7, 8, 10, 11};
- run_parquet_test_with_method(table_column_names, {access_config},
expected_column_ids,
- expected_filter_column_ids);
- run_orc_test_with_method(table_column_names, {access_config},
expected_column_ids,
- expected_filter_column_ids);
+ run_parquet_test(table_column_names, {access_config}, expected_column_ids,
+ expected_filter_column_ids);
+ run_orc_test(table_column_names, {access_config}, expected_column_ids,
+ expected_filter_column_ids);
}
TEST_F(IcebergReaderCreateColumnIdsTest, test_create_column_ids_2) {
- // Properties:
- // iceberg.schema:
{"type":"struct","schema-id":0,"fields":[{"id":1,"name":"id","required":true,"type":"long"},{"id":2,"name":"name","required":true,"type":"string"},{"id":3,"name":"profile","required":true,"type":{"type":"struct","fields":[{"id":4,"name":"address","required":false,"type":{"type":"struct","fields":[{"id":7,"name":"street","required":false,"type":"string"},{"id":8,"name":"city","required":false,"type":"string"},{"id":9,"name":"coordinates","required":false,"type":
[...]
- // ORC 列ID分配为树型递增,根节点为0,子节点依次递增。
+ // ORC column IDs are assigned in a tree-like incremental manner: the root
node is 0, and child nodes increase sequentially.
+ // Currently, Parquet uses a similar design.
// 0: struct (table/root)
// 1: id (int64)
// 2: name (string)
@@ -961,30 +953,26 @@ TEST_F(IcebergReaderCreateColumnIdsTest,
test_create_column_ids_2) {
// 17: name (string)
// 18: level (int32)
- // 配置profile列的访问路径
ColumnAccessPathConfig access_config;
access_config.column_name = "profile";
- access_config.use_name_paths = true;
access_config.all_column_paths = {{"3"}};
access_config.predicate_paths = {{"3", "9", "14", "15"}, {"3", "10",
"17"}};
std::vector<std::string> table_column_names = {"name", "profile"};
- // column_ids should contain all necessary column IDs (set automatically
deduplicates)
std::set<uint64_t> expected_column_ids = {2, 3, 4, 5, 6, 7, 8, 9,
10,
11, 12, 13, 14, 15, 16, 17, 18};
std::set<uint64_t> expected_filter_column_ids = {3, 4, 7, 8, 10, 11};
- run_parquet_test_with_method(table_column_names, {access_config},
expected_column_ids,
- expected_filter_column_ids);
- run_orc_test_with_method(table_column_names, {access_config},
expected_column_ids,
- expected_filter_column_ids);
+ run_parquet_test(table_column_names, {access_config}, expected_column_ids,
+ expected_filter_column_ids);
+ run_orc_test(table_column_names, {access_config}, expected_column_ids,
+ expected_filter_column_ids);
}
TEST_F(IcebergReaderCreateColumnIdsTest, test_create_column_ids_3) {
- // Properties:
- // iceberg.schema:
{"type":"struct","schema-id":0,"fields":[{"id":1,"name":"id","required":true,"type":"long"},{"id":2,"name":"name","required":true,"type":"string"},{"id":3,"name":"profile","required":true,"type":{"type":"struct","fields":[{"id":4,"name":"address","required":false,"type":{"type":"struct","fields":[{"id":7,"name":"street","required":false,"type":"string"},{"id":8,"name":"city","required":false,"type":"string"},{"id":9,"name":"coordinates","required":false,"type":
[...]
- // ORC 列ID分配为树型递增,根节点为0,子节点依次递增。
+ // ORC column IDs are assigned in a tree-like incremental manner: the root
node is 0, and child nodes increase sequentially.
+ // Currently, Parquet uses a similar design.
// 0: struct (table/root)
// 1: id (int64)
// 2: name (string)
@@ -1005,51 +993,43 @@ TEST_F(IcebergReaderCreateColumnIdsTest,
test_create_column_ids_3) {
// 17: name (string)
// 18: level (int32)
- // 配置profile列的访问路径
ColumnAccessPathConfig access_config;
access_config.column_name = "profile";
- access_config.use_name_paths = true;
access_config.all_column_paths = {{"3", "10"}, {"3", "9"}};
access_config.predicate_paths = {{"3", "9", "14"}, {"3", "10", "17"}};
- // access_config.all_column_paths = {{"profile", "contact"}, {"profile",
"address"}};
- // access_config.predicate_paths = {{"profile", "address", "coordinates"},
- // {"profile", "contact", "email"}};
-
std::vector<std::string> table_column_names = {"name", "profile"};
- // column_ids should contain all necessary column IDs (set automatically
deduplicates)
std::set<uint64_t> expected_column_ids = {2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14};
std::set<uint64_t> expected_filter_column_ids = {3, 4, 7, 8, 9, 10, 11};
- run_parquet_test_with_method(table_column_names, {access_config},
expected_column_ids,
- expected_filter_column_ids);
- run_orc_test_with_method(table_column_names, {access_config},
expected_column_ids,
- expected_filter_column_ids);
+ run_parquet_test(table_column_names, {access_config}, expected_column_ids,
+ expected_filter_column_ids);
+ run_orc_test(table_column_names, {access_config}, expected_column_ids,
+ expected_filter_column_ids);
}
TEST_F(IcebergReaderCreateColumnIdsTest, test_create_column_ids_4) {
- // 配置profile列的访问路径
ColumnAccessPathConfig access_config;
access_config.column_name = "profile";
- access_config.use_name_paths = true;
access_config.all_column_paths = {};
access_config.predicate_paths = {};
std::vector<std::string> table_column_names = {"name", "profile"};
- // column_ids should contain all necessary column IDs (set automatically
deduplicates)
std::set<uint64_t> expected_column_ids = {2, 3, 4, 5, 6, 7, 8, 9,
10,
11, 12, 13, 14, 15, 16, 17, 18};
std::set<uint64_t> expected_filter_column_ids = {};
- run_parquet_test_with_method(table_column_names, {access_config},
expected_column_ids,
- expected_filter_column_ids);
- run_orc_test_with_method(table_column_names, {access_config},
expected_column_ids,
- expected_filter_column_ids);
+ run_parquet_test(table_column_names, {access_config}, expected_column_ids,
+ expected_filter_column_ids);
+ run_orc_test(table_column_names, {access_config}, expected_column_ids,
+ expected_filter_column_ids);
}
TEST_F(IcebergReaderCreateColumnIdsTest, test_create_column_ids_5) {
+ // ORC column IDs are assigned in a tree-like incremental manner: the root
node is 0, and child nodes increase sequentially.
+ // Currently, Parquet uses a similar design.
// 19: tags (array)
// 20: element (string)
// 21: friends (array)
@@ -1068,10 +1048,8 @@ TEST_F(IcebergReaderCreateColumnIdsTest,
test_create_column_ids_5) {
std::vector<ColumnAccessPathConfig> access_configs;
{
- // 配置friends列的访问路径
ColumnAccessPathConfig access_config;
access_config.column_name = "friends";
- access_config.use_name_paths = true;
access_config.all_column_paths = {{"5", "*", "27"}, {"5", "*", "28"}};
access_config.predicate_paths = {
@@ -1081,10 +1059,8 @@ TEST_F(IcebergReaderCreateColumnIdsTest,
test_create_column_ids_5) {
}
{
- // 配置recent_activity列的访问路径
ColumnAccessPathConfig access_config;
access_config.column_name = "recent_activity";
- access_config.use_name_paths = true;
access_config.all_column_paths = {{"6", "*", "30"}, {"6", "*", "31",
"*", "34"}};
access_config.predicate_paths = {
@@ -1094,17 +1070,18 @@ TEST_F(IcebergReaderCreateColumnIdsTest,
test_create_column_ids_5) {
}
std::vector<std::string> table_column_names = {"name", "friends",
"recent_activity"};
- // column_ids should contain all necessary column IDs (set automatically
deduplicates)
std::set<uint64_t> expected_column_ids = {2, 21, 22, 24, 25, 26, 27, 28,
29, 30, 32};
std::set<uint64_t> expected_filter_column_ids = {21, 22, 24, 26, 27, 28};
- run_parquet_test_with_method(table_column_names, access_configs,
expected_column_ids,
- expected_filter_column_ids);
- run_orc_test_with_method(table_column_names, access_configs,
expected_column_ids,
- expected_filter_column_ids);
+ run_parquet_test(table_column_names, access_configs, expected_column_ids,
+ expected_filter_column_ids);
+ run_orc_test(table_column_names, access_configs, expected_column_ids,
+ expected_filter_column_ids);
}
TEST_F(IcebergReaderCreateColumnIdsTest, test_create_column_ids_6) {
+ // ORC column IDs are assigned in a tree-like incremental manner: the root
node is 0, and child nodes increase sequentially.
+ // Currently, Parquet uses a similar design.
// 33: attributes (map)
// 34: key (string)
// 35: value (string)
@@ -1168,10 +1145,8 @@ TEST_F(IcebergReaderCreateColumnIdsTest,
test_create_column_ids_6) {
std::vector<ColumnAccessPathConfig> access_configs;
{
- // 配置complex_attributes列的访问路径
ColumnAccessPathConfig access_config;
access_config.column_name = "complex_attributes";
- access_config.use_name_paths = true;
access_config.all_column_paths = {{"8", "*", "39", "43"},
{"8", "*", "40", "*", "50", "*",
"55"},
@@ -1184,27 +1159,25 @@ TEST_F(IcebergReaderCreateColumnIdsTest,
test_create_column_ids_6) {
{
std::vector<std::string> table_column_names = {"name",
"complex_attributes"};
// parquet values should access keys
- // column_ids should contain all necessary column IDs (set
automatically deduplicates)
std::set<uint64_t> expected_column_ids = {2, 36, 37, 38, 39, 40, 44,
45, 48, 49, 52, 53,
54, 61, 62, 63, 64, 65, 66,
67, 68, 69, 70, 71,
72, 73, 74, 75, 76, 77, 79,
80, 82, 83};
std::set<uint64_t> expected_filter_column_ids = {36, 37, 38, 39, 40};
- run_parquet_test_with_method(table_column_names, access_configs,
expected_column_ids,
- expected_filter_column_ids);
+ run_parquet_test(table_column_names, access_configs,
expected_column_ids,
+ expected_filter_column_ids);
}
{
std::vector<std::string> table_column_names = {"name",
"complex_attributes"};
// orc values should access keys because need to deduplicate by keys
- // column_ids should contain all necessary column IDs (set
automatically deduplicates)
std::set<uint64_t> expected_column_ids = {2, 36, 37, 38, 39, 40, 44,
45, 48, 49, 52, 53,
54, 61, 62, 63, 64, 65, 66,
67, 68, 69, 70, 71,
72, 73, 74, 75, 76, 77, 79,
80, 82, 83};
std::set<uint64_t> expected_filter_column_ids = {36, 37, 38, 39, 40};
- run_orc_test_with_method(table_column_names, access_configs,
expected_column_ids,
- expected_filter_column_ids);
+ run_orc_test(table_column_names, access_configs, expected_column_ids,
+ expected_filter_column_ids);
}
}
diff --git a/be/test/vec/exec/format/table/iceberg/iceberg_reader_test.cpp
b/be/test/vec/exec/format/table/iceberg/iceberg_reader_test.cpp
index 12d6bb0299f..b7dedcf834e 100644
--- a/be/test/vec/exec/format/table/iceberg/iceberg_reader_test.cpp
+++ b/be/test/vec/exec/format/table/iceberg/iceberg_reader_test.cpp
@@ -465,12 +465,10 @@ protected:
};
// Test reading real Iceberg Parquet file using IcebergTableReader
-TEST_F(IcebergReaderTest, ReadIcebergParquetFile) {
+TEST_F(IcebergReaderTest, read_iceberg_parquet_file) {
// Read only: name, profile.address.coordinates.lat,
profile.address.coordinates.lng, profile.contact.email
// Setup table descriptor for test columns with new schema:
/**
- Properties:
- iceberg.schema:
{"type":"struct","schema-id":0,"fields":[{"id":1,"name":"id","required":true,"type":"long"},{"id":2,"name":"name","required":true,"type":"string"},{"id":3,"name":"profile","required":true,"type":{"type":"struct","fields":[{"id":4,"name":"address","required":false,"type":{"type":"struct","fields":[{"id":7,"name":"street","required":false,"type":"string"},{"id":8,"name":"city","required":false,"type":"string"},{"id":9,"name":"coordinates","required":false,"type":{"type"
[...]
Schema:
message table {
required int64 id = 1;
@@ -603,12 +601,10 @@ TEST_F(IcebergReaderTest, ReadIcebergParquetFile) {
}
// Test reading real Iceberg Orc file using IcebergTableReader
-TEST_F(IcebergReaderTest, ReadIcebergOrcFile) {
+TEST_F(IcebergReaderTest, read_iceberg_orc_file) {
// Read only: name, profile.address.coordinates.lat,
profile.address.coordinates.lng, profile.contact.email
// Setup table descriptor for test columns with new schema:
/**
- Properties:
- iceberg.schema:
{"type":"struct","schema-id":0,"fields":[{"id":1,"name":"id","required":true,"type":"long"},{"id":2,"name":"name","required":true,"type":"string"},{"id":3,"name":"profile","required":true,"type":{"type":"struct","fields":[{"id":4,"name":"address","required":false,"type":{"type":"struct","fields":[{"id":7,"name":"street","required":false,"type":"string"},{"id":8,"name":"city","required":false,"type":"string"},{"id":9,"name":"coordinates","required":false,"type":{"type"
[...]
Schema:
message table {
required int64 id = 1;
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]