This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-4.0 by this push:
new fa16c50c7be branch-4.0: [Enhancement](parquet-orc)add column size
check for debug & stable. #59780 (#60424)
fa16c50c7be is described below
commit fa16c50c7be81f4a853b3acd1ac7ee37a504658a
Author: github-actions[bot]
<41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Tue Feb 3 10:45:23 2026 +0800
branch-4.0: [Enhancement](parquet-orc)add column size check for debug &
stable. #59780 (#60424)
Cherry-picked from #59780
Co-authored-by: daidai <[email protected]>
---
be/src/vec/exec/format/orc/vorc_reader.cpp | 58 ++++++++++++++++++++++
.../exec/format/parquet/vparquet_column_reader.cpp | 11 +++-
.../exec/format/parquet/vparquet_group_reader.cpp | 52 +++++++++++++++++++
3 files changed, 119 insertions(+), 2 deletions(-)
diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp
b/be/src/vec/exec/format/orc/vorc_reader.cpp
index 373101d6ae2..4169aacc4ba 100644
--- a/be/src/vec/exec/format/orc/vorc_reader.cpp
+++ b/be/src/vec/exec/format/orc/vorc_reader.cpp
@@ -2272,6 +2272,9 @@ Status OrcReader::_get_next_block_impl(Block* block,
size_t* read_rows, bool* eo
col_name, column_ptr, column_type,
_table_info_node_ptr->get_children_node(col_name),
_type_map[file_column_name],
batch_vec[orc_col_idx->second], _batch->numElements));
+#ifndef NDEBUG
+ column_ptr->sanity_check();
+#endif
}
RETURN_IF_ERROR(_fill_partition_columns(block, _batch->numElements,
@@ -2288,8 +2291,25 @@ Status OrcReader::_get_next_block_impl(Block* block,
size_t* read_rows, bool* eo
return Status::OK();
}
{
+#ifndef NDEBUG
+ for (auto col : *block) {
+ col.column->sanity_check();
+
+ DCHECK(block->rows() == col.column->size())
+ << absl::Substitute("block rows = $0 , column rows =
$1, col name = $2",
+ block->rows(), col.column->size(),
col.name);
+ }
+#endif
SCOPED_RAW_TIMER(&_statistics.predicate_filter_time);
_execute_filter_position_delete_rowids(*_filter);
+#ifndef NDEBUG
+ for (auto col : *block) {
+ col.column->sanity_check();
+ DCHECK(block->rows() == col.column->size())
+ << absl::Substitute("block rows = $0 , column rows =
$1, col name = $2",
+ block->rows(), col.column->size(),
col.name);
+ }
+#endif
{
SCOPED_RAW_TIMER(&_statistics.decode_null_map_time);
RETURN_IF_CATCH_EXCEPTION(
@@ -2298,6 +2318,14 @@ Status OrcReader::_get_next_block_impl(Block* block,
size_t* read_rows, bool* eo
Block::erase_useless_column(block, column_to_keep);
RETURN_IF_ERROR(_convert_dict_cols_to_string_cols(block,
&batch_vec));
*read_rows = block->rows();
+#ifndef NDEBUG
+ for (auto col : *block) {
+ col.column->sanity_check();
+ DCHECK(block->rows() == col.column->size())
+ << absl::Substitute("block rows = $0 , column rows =
$1, col name = $2",
+ block->rows(), col.column->size(),
col.name);
+ }
+#endif
}
} else {
uint64_t rr;
@@ -2369,6 +2397,9 @@ Status OrcReader::_get_next_block_impl(Block* block,
size_t* read_rows, bool* eo
col_name, column_ptr, column_type,
_table_info_node_ptr->get_children_node(col_name),
_type_map[file_column_name],
batch_vec[orc_col_idx->second], _batch->numElements));
+#ifndef NDEBUG
+ column_ptr->sanity_check();
+#endif
}
RETURN_IF_ERROR(_fill_partition_columns(block, _batch->numElements,
@@ -2385,6 +2416,14 @@ Status OrcReader::_get_next_block_impl(Block* block,
size_t* read_rows, bool* eo
return Status::OK();
}
+#ifndef NDEBUG
+ for (auto col : *block) {
+ col.column->sanity_check();
+ DCHECK(block->rows() == col.column->size())
+ << absl::Substitute("block rows = $0 , column rows = $1,
col name = $2",
+ block->rows(), col.column->size(),
col.name);
+ }
+#endif
{
SCOPED_RAW_TIMER(&_statistics.predicate_filter_time);
_build_delete_row_filter(block, _batch->numElements);
@@ -2438,7 +2477,23 @@ Status OrcReader::_get_next_block_impl(Block* block,
size_t* read_rows, bool* eo
Block::erase_useless_column(block, column_to_keep);
}
}
+#ifndef NDEBUG
+ for (auto col : *block) {
+ col.column->sanity_check();
+ DCHECK(block->rows() == col.column->size())
+ << absl::Substitute("block rows = $0 , column rows = $1,
col name = $2",
+ block->rows(), col.column->size(),
col.name);
+ }
+#endif
RETURN_IF_ERROR(_convert_dict_cols_to_string_cols(block, &batch_vec));
+#ifndef NDEBUG
+ for (auto col : *block) {
+ col.column->sanity_check();
+ DCHECK(block->rows() == col.column->size())
+ << absl::Substitute("block rows = $0 , column rows = $1,
col name = $2",
+ block->rows(), col.column->size(),
col.name);
+ }
+#endif
*read_rows = block->rows();
}
return Status::OK();
@@ -2548,6 +2603,9 @@ Status OrcReader::filter(orc::ColumnVectorBatch& data,
uint16_t* sel, uint16_t s
table_col_name, column_ptr, column_type,
_table_info_node_ptr->get_children_node(table_col_name),
_type_map[file_column_name], batch_vec[orc_col_idx->second],
data.numElements));
+#ifndef NDEBUG
+ column_ptr->sanity_check();
+#endif
}
RETURN_IF_ERROR(
_fill_partition_columns(block, size,
_lazy_read_ctx.predicate_partition_columns));
diff --git a/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp
b/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp
index 0364fe496b0..30962632662 100644
--- a/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp
+++ b/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp
@@ -677,7 +677,9 @@ Status ArrayColumnReader::read_column_data(
fill_array_offset(_field_schema, offsets_data, null_map_ptr,
_element_reader->get_rep_level(),
_element_reader->get_def_level());
DCHECK_EQ(element_column->size(), offsets_data.back());
-
+#ifndef NDEBUG
+ doris_column->sanity_check();
+#endif
return Status::OK();
}
@@ -753,7 +755,9 @@ Status MapColumnReader::read_column_data(
fill_array_offset(_field_schema, map.get_offsets(), null_map_ptr,
_key_reader->get_rep_level(),
_key_reader->get_def_level());
DCHECK_EQ(key_column->size(), map.get_offsets().back());
-
+#ifndef NDEBUG
+ doris_column->sanity_check();
+#endif
return Status::OK();
}
@@ -977,6 +981,9 @@ Status StructColumnReader::read_column_data(
fill_struct_null_map(_field_schema, *null_map_ptr,
this->get_rep_level(),
this->get_def_level());
}
+#ifndef NDEBUG
+ doris_column->sanity_check();
+#endif
return Status::OK();
}
diff --git a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp
b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp
index a9bed4c27cd..5de4a01a88d 100644
--- a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp
+++ b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp
@@ -328,9 +328,26 @@ Status RowGroupReader::next_batch(Block* block, size_t
batch_size, size_t* read_
RETURN_IF_ERROR(_fill_missing_columns(block, *read_rows,
_lazy_read_ctx.missing_columns));
RETURN_IF_ERROR(_fill_row_id_columns(block, *read_rows, false));
+#ifndef NDEBUG
+ for (auto col : *block) {
+ col.column->sanity_check();
+ DCHECK(block->rows() == col.column->size())
+ << absl::Substitute("block rows = $0 , column rows = $1,
col name = $2",
+ block->rows(), col.column->size(),
col.name);
+ }
+#endif
+
if (block->rows() == 0) {
_convert_dict_cols_to_string_cols(block);
*read_rows = block->rows();
+#ifndef NDEBUG
+ for (auto col : *block) {
+ col.column->sanity_check();
+ DCHECK(block->rows() == col.column->size())
+ << absl::Substitute("block rows = $0 , column rows =
$1, col name = $2",
+ block->rows(), col.column->size(),
col.name);
+ }
+#endif
return Status::OK();
}
{
@@ -374,6 +391,14 @@ Status RowGroupReader::next_batch(Block* block, size_t
batch_size, size_t* read_
}
_convert_dict_cols_to_string_cols(block);
}
+#ifndef NDEBUG
+ for (auto col : *block) {
+ col.column->sanity_check();
+ DCHECK(block->rows() == col.column->size())
+ << absl::Substitute("block rows = $0 , column rows = $1,
col name = $2",
+ block->rows(), col.column->size(),
col.name);
+ }
+#endif
*read_rows = block->rows();
return Status::OK();
}
@@ -442,6 +467,10 @@ Status RowGroupReader::_read_column_data(Block* block,
return Status::Corruption("Can't read the same number of rows
among parquet columns");
}
batch_read_rows = col_read_rows;
+
+#ifndef NDEBUG
+ column_ptr->sanity_check();
+#endif
if (col_eof) {
has_eof = true;
}
@@ -486,6 +515,18 @@ Status RowGroupReader::_do_lazy_read(Block* block, size_t
batch_size, size_t* re
RETURN_IF_ERROR(_build_pos_delete_filter(pre_read_rows));
+#ifndef NDEBUG
+ for (auto col : *block) {
+ if (col.column->size() == 0) { // lazy read column.
+ continue;
+ }
+ col.column->sanity_check();
+ DCHECK(pre_read_rows == col.column->size())
+ << absl::Substitute("pre_read_rows = $0 , column rows =
$1, col name = $2",
+ pre_read_rows, col.column->size(),
col.name);
+ }
+#endif
+
bool can_filter_all = false;
{
SCOPED_RAW_TIMER(&_predicate_filter_time);
@@ -632,6 +673,14 @@ Status RowGroupReader::_do_lazy_read(Block* block, size_t
batch_size, size_t* re
*batch_eof = pre_eof;
RETURN_IF_ERROR(_fill_partition_columns(block, column_size,
_lazy_read_ctx.partition_columns));
RETURN_IF_ERROR(_fill_missing_columns(block, column_size,
_lazy_read_ctx.missing_columns));
+#ifndef NDEBUG
+ for (auto col : *block) {
+ col.column->sanity_check();
+ DCHECK(block->rows() == col.column->size())
+ << absl::Substitute("block rows = $0 , column rows = $1, col
name = $2",
+ block->rows(), col.column->size(),
col.name);
+ }
+#endif
return Status::OK();
}
@@ -903,6 +952,9 @@ Status RowGroupReader::_rewrite_dict_predicates() {
bool has_dict = false;
RETURN_IF_ERROR(_column_readers[dict_filter_col_name]->read_dict_values_to_column(
dict_value_column, &has_dict));
+#ifndef NDEBUG
+ dict_value_column->sanity_check();
+#endif
size_t dict_value_column_size = dict_value_column->size();
DCHECK(has_dict);
// 2. Build a temp block from the dict string column, then execute
conjuncts and filter block.
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]