This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-4.0 by this push:
     new fa16c50c7be branch-4.0: [Enhancement](parquet-orc) add column size 
check for debug & stable. #59780 (#60424)
fa16c50c7be is described below

commit fa16c50c7be81f4a853b3acd1ac7ee37a504658a
Author: github-actions[bot] 
<41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Tue Feb 3 10:45:23 2026 +0800

    branch-4.0: [Enhancement](parquet-orc) add column size check for debug & 
stable. #59780 (#60424)
    
    Cherry-picked from #59780
    
    Co-authored-by: daidai <[email protected]>
---
 be/src/vec/exec/format/orc/vorc_reader.cpp         | 58 ++++++++++++++++++++++
 .../exec/format/parquet/vparquet_column_reader.cpp | 11 +++-
 .../exec/format/parquet/vparquet_group_reader.cpp  | 52 +++++++++++++++++++
 3 files changed, 119 insertions(+), 2 deletions(-)

diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp 
b/be/src/vec/exec/format/orc/vorc_reader.cpp
index 373101d6ae2..4169aacc4ba 100644
--- a/be/src/vec/exec/format/orc/vorc_reader.cpp
+++ b/be/src/vec/exec/format/orc/vorc_reader.cpp
@@ -2272,6 +2272,9 @@ Status OrcReader::_get_next_block_impl(Block* block, 
size_t* read_rows, bool* eo
                     col_name, column_ptr, column_type,
                     _table_info_node_ptr->get_children_node(col_name), 
_type_map[file_column_name],
                     batch_vec[orc_col_idx->second], _batch->numElements));
+#ifndef NDEBUG
+            column_ptr->sanity_check();
+#endif
         }
 
         RETURN_IF_ERROR(_fill_partition_columns(block, _batch->numElements,
@@ -2288,8 +2291,25 @@ Status OrcReader::_get_next_block_impl(Block* block, 
size_t* read_rows, bool* eo
             return Status::OK();
         }
         {
+#ifndef NDEBUG
+            for (auto col : *block) {
+                col.column->sanity_check();
+
+                DCHECK(block->rows() == col.column->size())
+                        << absl::Substitute("block rows = $0 , column rows = 
$1, col name = $2",
+                                            block->rows(), col.column->size(), 
col.name);
+            }
+#endif
             SCOPED_RAW_TIMER(&_statistics.predicate_filter_time);
             _execute_filter_position_delete_rowids(*_filter);
+#ifndef NDEBUG
+            for (auto col : *block) {
+                col.column->sanity_check();
+                DCHECK(block->rows() == col.column->size())
+                        << absl::Substitute("block rows = $0 , column rows = 
$1, col name = $2",
+                                            block->rows(), col.column->size(), 
col.name);
+            }
+#endif
             {
                 SCOPED_RAW_TIMER(&_statistics.decode_null_map_time);
                 RETURN_IF_CATCH_EXCEPTION(
@@ -2298,6 +2318,14 @@ Status OrcReader::_get_next_block_impl(Block* block, 
size_t* read_rows, bool* eo
             Block::erase_useless_column(block, column_to_keep);
             RETURN_IF_ERROR(_convert_dict_cols_to_string_cols(block, 
&batch_vec));
             *read_rows = block->rows();
+#ifndef NDEBUG
+            for (auto col : *block) {
+                col.column->sanity_check();
+                DCHECK(block->rows() == col.column->size())
+                        << absl::Substitute("block rows = $0 , column rows = 
$1, col name = $2",
+                                            block->rows(), col.column->size(), 
col.name);
+            }
+#endif
         }
     } else {
         uint64_t rr;
@@ -2369,6 +2397,9 @@ Status OrcReader::_get_next_block_impl(Block* block, 
size_t* read_rows, bool* eo
                     col_name, column_ptr, column_type,
                     _table_info_node_ptr->get_children_node(col_name), 
_type_map[file_column_name],
                     batch_vec[orc_col_idx->second], _batch->numElements));
+#ifndef NDEBUG
+            column_ptr->sanity_check();
+#endif
         }
 
         RETURN_IF_ERROR(_fill_partition_columns(block, _batch->numElements,
@@ -2385,6 +2416,14 @@ Status OrcReader::_get_next_block_impl(Block* block, 
size_t* read_rows, bool* eo
             return Status::OK();
         }
 
+#ifndef NDEBUG
+        for (auto col : *block) {
+            col.column->sanity_check();
+            DCHECK(block->rows() == col.column->size())
+                    << absl::Substitute("block rows = $0 , column rows = $1, 
col name = $2",
+                                        block->rows(), col.column->size(), 
col.name);
+        }
+#endif
         {
             SCOPED_RAW_TIMER(&_statistics.predicate_filter_time);
             _build_delete_row_filter(block, _batch->numElements);
@@ -2438,7 +2477,23 @@ Status OrcReader::_get_next_block_impl(Block* block, 
size_t* read_rows, bool* eo
                 Block::erase_useless_column(block, column_to_keep);
             }
         }
+#ifndef NDEBUG
+        for (auto col : *block) {
+            col.column->sanity_check();
+            DCHECK(block->rows() == col.column->size())
+                    << absl::Substitute("block rows = $0 , column rows = $1, 
col name = $2",
+                                        block->rows(), col.column->size(), 
col.name);
+        }
+#endif
         RETURN_IF_ERROR(_convert_dict_cols_to_string_cols(block, &batch_vec));
+#ifndef NDEBUG
+        for (auto col : *block) {
+            col.column->sanity_check();
+            DCHECK(block->rows() == col.column->size())
+                    << absl::Substitute("block rows = $0 , column rows = $1, 
col name = $2",
+                                        block->rows(), col.column->size(), 
col.name);
+        }
+#endif
         *read_rows = block->rows();
     }
     return Status::OK();
@@ -2548,6 +2603,9 @@ Status OrcReader::filter(orc::ColumnVectorBatch& data, 
uint16_t* sel, uint16_t s
                 table_col_name, column_ptr, column_type,
                 _table_info_node_ptr->get_children_node(table_col_name),
                 _type_map[file_column_name], batch_vec[orc_col_idx->second], 
data.numElements));
+#ifndef NDEBUG
+        column_ptr->sanity_check();
+#endif
     }
     RETURN_IF_ERROR(
             _fill_partition_columns(block, size, 
_lazy_read_ctx.predicate_partition_columns));
diff --git a/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp 
b/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp
index 0364fe496b0..30962632662 100644
--- a/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp
+++ b/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp
@@ -677,7 +677,9 @@ Status ArrayColumnReader::read_column_data(
     fill_array_offset(_field_schema, offsets_data, null_map_ptr, 
_element_reader->get_rep_level(),
                       _element_reader->get_def_level());
     DCHECK_EQ(element_column->size(), offsets_data.back());
-
+#ifndef NDEBUG
+    doris_column->sanity_check();
+#endif
     return Status::OK();
 }
 
@@ -753,7 +755,9 @@ Status MapColumnReader::read_column_data(
     fill_array_offset(_field_schema, map.get_offsets(), null_map_ptr, 
_key_reader->get_rep_level(),
                       _key_reader->get_def_level());
     DCHECK_EQ(key_column->size(), map.get_offsets().back());
-
+#ifndef NDEBUG
+    doris_column->sanity_check();
+#endif
     return Status::OK();
 }
 
@@ -977,6 +981,9 @@ Status StructColumnReader::read_column_data(
         fill_struct_null_map(_field_schema, *null_map_ptr, 
this->get_rep_level(),
                              this->get_def_level());
     }
+#ifndef NDEBUG
+    doris_column->sanity_check();
+#endif
     return Status::OK();
 }
 
diff --git a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp 
b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp
index a9bed4c27cd..5de4a01a88d 100644
--- a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp
+++ b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp
@@ -328,9 +328,26 @@ Status RowGroupReader::next_batch(Block* block, size_t 
batch_size, size_t* read_
         RETURN_IF_ERROR(_fill_missing_columns(block, *read_rows, 
_lazy_read_ctx.missing_columns));
         RETURN_IF_ERROR(_fill_row_id_columns(block, *read_rows, false));
 
+#ifndef NDEBUG
+        for (auto col : *block) {
+            col.column->sanity_check();
+            DCHECK(block->rows() == col.column->size())
+                    << absl::Substitute("block rows = $0 , column rows = $1, 
col name = $2",
+                                        block->rows(), col.column->size(), 
col.name);
+        }
+#endif
+
         if (block->rows() == 0) {
             _convert_dict_cols_to_string_cols(block);
             *read_rows = block->rows();
+#ifndef NDEBUG
+            for (auto col : *block) {
+                col.column->sanity_check();
+                DCHECK(block->rows() == col.column->size())
+                        << absl::Substitute("block rows = $0 , column rows = 
$1, col name = $2",
+                                            block->rows(), col.column->size(), 
col.name);
+            }
+#endif
             return Status::OK();
         }
         {
@@ -374,6 +391,14 @@ Status RowGroupReader::next_batch(Block* block, size_t 
batch_size, size_t* read_
             }
             _convert_dict_cols_to_string_cols(block);
         }
+#ifndef NDEBUG
+        for (auto col : *block) {
+            col.column->sanity_check();
+            DCHECK(block->rows() == col.column->size())
+                    << absl::Substitute("block rows = $0 , column rows = $1, 
col name = $2",
+                                        block->rows(), col.column->size(), 
col.name);
+        }
+#endif
         *read_rows = block->rows();
         return Status::OK();
     }
@@ -442,6 +467,10 @@ Status RowGroupReader::_read_column_data(Block* block,
             return Status::Corruption("Can't read the same number of rows 
among parquet columns");
         }
         batch_read_rows = col_read_rows;
+
+#ifndef NDEBUG
+        column_ptr->sanity_check();
+#endif
         if (col_eof) {
             has_eof = true;
         }
@@ -486,6 +515,18 @@ Status RowGroupReader::_do_lazy_read(Block* block, size_t 
batch_size, size_t* re
 
         RETURN_IF_ERROR(_build_pos_delete_filter(pre_read_rows));
 
+#ifndef NDEBUG
+        for (auto col : *block) {
+            if (col.column->size() == 0) { // lazy read column.
+                continue;
+            }
+            col.column->sanity_check();
+            DCHECK(pre_read_rows == col.column->size())
+                    << absl::Substitute("pre_read_rows = $0 , column rows = 
$1, col name = $2",
+                                        pre_read_rows, col.column->size(), 
col.name);
+        }
+#endif
+
         bool can_filter_all = false;
         {
             SCOPED_RAW_TIMER(&_predicate_filter_time);
@@ -632,6 +673,14 @@ Status RowGroupReader::_do_lazy_read(Block* block, size_t 
batch_size, size_t* re
     *batch_eof = pre_eof;
     RETURN_IF_ERROR(_fill_partition_columns(block, column_size, 
_lazy_read_ctx.partition_columns));
     RETURN_IF_ERROR(_fill_missing_columns(block, column_size, 
_lazy_read_ctx.missing_columns));
+#ifndef NDEBUG
+    for (auto col : *block) {
+        col.column->sanity_check();
+        DCHECK(block->rows() == col.column->size())
+                << absl::Substitute("block rows = $0 , column rows = $1, col 
name = $2",
+                                    block->rows(), col.column->size(), 
col.name);
+    }
+#endif
     return Status::OK();
 }
 
@@ -903,6 +952,9 @@ Status RowGroupReader::_rewrite_dict_predicates() {
         bool has_dict = false;
         
RETURN_IF_ERROR(_column_readers[dict_filter_col_name]->read_dict_values_to_column(
                 dict_value_column, &has_dict));
+#ifndef NDEBUG
+        dict_value_column->sanity_check();
+#endif
         size_t dict_value_column_size = dict_value_column->size();
         DCHECK(has_dict);
         // 2. Build a temp block from the dict string column, then execute 
conjuncts and filter block.


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to