Re: [PR] [Opt](multi-catalog) Opt by avoid building name_to_index map every time. [doris]

via GitHub Wed, 03 Dec 2025 00:42:26 -0800


Copilot commented on code in PR #58679:
URL: https://github.com/apache/doris/pull/58679#discussion_r2584146205



##########
be/src/vec/exec/jni_connector.cpp:
##########
@@ -313,10 +313,9 @@ Status JniConnector::_fill_block(Block* block, size_t 
num_rows) {
     SCOPED_RAW_TIMER(&_fill_block_watcher);
     JNIEnv* env = nullptr;
     RETURN_IF_ERROR(JniUtil::GetJNIEnv(&env));
-    // todo: maybe do not need to build name to index map every time
-    auto name_to_pos_map = block->get_name_to_pos_map();
     for (int i = 0; i < _column_names.size(); ++i) {
-        auto& column_with_type_and_name = 
block->get_by_position(name_to_pos_map[_column_names[i]]);
+        auto& column_with_type_and_name =
+                
block->get_by_position(_col_name_to_block_idx->at(_column_names[i]));

Review Comment:
   Potential null pointer dereference. `_col_name_to_block_idx` is used without 
checking if it's nullptr. If `set_col_name_to_block_idx()` is not called before 
`_fill_block()`, this will crash. Add a null check or ensure initialization in 
the constructor.



##########
be/src/vec/exec/scan/scanner_context.cpp:
##########
@@ -487,43 +512,45 @@ int32_t 
ScannerContext::_get_margin(std::unique_lock<std::mutex>& transfer_lock,
 Status ScannerContext::schedule_scan_task(std::shared_ptr<ScanTask> 
current_scan_task,
                                           std::unique_lock<std::mutex>& 
transfer_lock,
                                           std::unique_lock<std::shared_mutex>& 
scheduler_lock) {
+    std::cout << "ScannerContext::schedule_scan_task()" << std::endl;
     if (current_scan_task &&
         (!current_scan_task->cached_blocks.empty() || 
current_scan_task->is_eos())) {
         throw doris::Exception(ErrorCode::INTERNAL_ERROR, "Scanner scheduler 
logical error.");
     }
 
     std::list<std::shared_ptr<ScanTask>> tasks_to_submit;
 
-    int32_t margin = _get_margin(transfer_lock, scheduler_lock);
-
-    // margin is less than zero. Means this scan operator could not submit any 
scan task for now.
-    if (margin <= 0) {
-        // Be careful with current scan task.
-        // We need to add it back to task queue to make sure it could be 
resubmitted.
-        if (current_scan_task) {
-            // This usually happens when we should downgrade the concurrency.
-            _pending_scanners.push(current_scan_task);
-            VLOG_DEBUG << fmt::format(
-                    "{} push back scanner to task queue, because diff <= 0, 
task_queue size "
-                    "{}, _num_scheduled_scanners {}",
-                    ctx_id, _tasks_queue.size(), _num_scheduled_scanners);
-        }
-
-#ifndef NDEBUG
-        // This DCHECK is necessary.
-        // We need to make sure each scan operator could have at least 1 scan 
tasks.
-        // Or this scan operator will not be re-scheduled.
-        if (!_pending_scanners.empty() && _num_scheduled_scanners == 0 && 
_tasks_queue.empty()) {
-            throw doris::Exception(ErrorCode::INTERNAL_ERROR, "Scanner 
scheduler logical error.");
-        }
-#endif
-
-        return Status::OK();
-    }
+    //     int32_t margin = _get_margin(transfer_lock, scheduler_lock);
+
+    //     // margin is less than zero. Means this scan operator could not 
submit any scan task for now.
+    //     if (margin <= 0) {
+    //         // Be careful with current scan task.
+    //         // We need to add it back to task queue to make sure it could 
be resubmitted.
+    //         if (current_scan_task) {
+    //             // This usually happens when we should downgrade the 
concurrency.
+    //             _pending_scanners.push(current_scan_task);
+    //             VLOG_DEBUG << fmt::format(
+    //                     "{} push back scanner to task queue, because diff 
<= 0, task_queue size "
+    //                     "{}, _num_scheduled_scanners {}",
+    //                     ctx_id, _tasks_queue.size(), 
_num_scheduled_scanners);
+    //         }
+
+    // #ifndef NDEBUG
+    //         // This DCHECK is necessary.
+    //         // We need to make sure each scan operator could have at least 
1 scan tasks.
+    //         // Or this scan operator will not be re-scheduled.
+    //         if (!_pending_scanners.empty() && _num_scheduled_scanners == 0 
&& _tasks_queue.empty()) {
+    //             throw doris::Exception(ErrorCode::INTERNAL_ERROR, "Scanner 
scheduler logical error.");
+    //         }
+    // #endif
+
+    //         return Status::OK();
+    //     }
 

Review Comment:
   Large blocks of commented-out code (lines 523-548) should be removed. This 
includes critical logic for checking margins and concurrency limits that 
appears to have been disabled. If this is intentional, it needs proper 
documentation explaining why this critical logic was disabled.
   ```suggestion
       // [Removed: previously disabled margin and concurrency check logic. If 
needed, see git history for details.]
   ```



##########
be/src/vec/exec/format/orc/vorc_reader.cpp:
##########
@@ -1334,10 +1336,9 @@ Status OrcReader::_fill_partition_columns(
         const std::unordered_map<std::string, std::tuple<std::string, const 
SlotDescriptor*>>&
                 partition_columns) {
     DataTypeSerDe::FormatOptions _text_formatOptions;
-    // todo: maybe do not need to build name to index map every time
-    auto name_to_pos_map = block->get_name_to_pos_map();
     for (const auto& kv : partition_columns) {
-        auto col_ptr = 
block->get_by_position(name_to_pos_map[kv.first]).column->assume_mutable();
+        auto col_ptr = 
block->get_by_position((*_col_name_to_block_idx)[kv.first])
+                               .column->assume_mutable();

Review Comment:
   Potential null pointer dereference. `_col_name_to_block_idx` is dereferenced 
without null checking. If not properly initialized, this will crash. Add a null 
check or ensure it's always set before use.



##########
be/src/vec/exec/scan/scanner_context.cpp:
##########
@@ -487,43 +512,45 @@ int32_t 
ScannerContext::_get_margin(std::unique_lock<std::mutex>& transfer_lock,
 Status ScannerContext::schedule_scan_task(std::shared_ptr<ScanTask> 
current_scan_task,
                                           std::unique_lock<std::mutex>& 
transfer_lock,
                                           std::unique_lock<std::shared_mutex>& 
scheduler_lock) {
+    std::cout << "ScannerContext::schedule_scan_task()" << std::endl;
     if (current_scan_task &&
         (!current_scan_task->cached_blocks.empty() || 
current_scan_task->is_eos())) {
         throw doris::Exception(ErrorCode::INTERNAL_ERROR, "Scanner scheduler 
logical error.");
     }
 
     std::list<std::shared_ptr<ScanTask>> tasks_to_submit;
 
-    int32_t margin = _get_margin(transfer_lock, scheduler_lock);
-
-    // margin is less than zero. Means this scan operator could not submit any 
scan task for now.
-    if (margin <= 0) {
-        // Be careful with current scan task.
-        // We need to add it back to task queue to make sure it could be 
resubmitted.
-        if (current_scan_task) {
-            // This usually happens when we should downgrade the concurrency.
-            _pending_scanners.push(current_scan_task);
-            VLOG_DEBUG << fmt::format(
-                    "{} push back scanner to task queue, because diff <= 0, 
task_queue size "
-                    "{}, _num_scheduled_scanners {}",
-                    ctx_id, _tasks_queue.size(), _num_scheduled_scanners);
-        }
-
-#ifndef NDEBUG
-        // This DCHECK is necessary.
-        // We need to make sure each scan operator could have at least 1 scan 
tasks.
-        // Or this scan operator will not be re-scheduled.
-        if (!_pending_scanners.empty() && _num_scheduled_scanners == 0 && 
_tasks_queue.empty()) {
-            throw doris::Exception(ErrorCode::INTERNAL_ERROR, "Scanner 
scheduler logical error.");
-        }
-#endif
-
-        return Status::OK();
-    }
+    //     int32_t margin = _get_margin(transfer_lock, scheduler_lock);
+
+    //     // margin is less than zero. Means this scan operator could not 
submit any scan task for now.
+    //     if (margin <= 0) {
+    //         // Be careful with current scan task.
+    //         // We need to add it back to task queue to make sure it could 
be resubmitted.
+    //         if (current_scan_task) {
+    //             // This usually happens when we should downgrade the 
concurrency.
+    //             _pending_scanners.push(current_scan_task);
+    //             VLOG_DEBUG << fmt::format(
+    //                     "{} push back scanner to task queue, because diff 
<= 0, task_queue size "
+    //                     "{}, _num_scheduled_scanners {}",
+    //                     ctx_id, _tasks_queue.size(), 
_num_scheduled_scanners);
+    //         }
+
+    // #ifndef NDEBUG
+    //         // This DCHECK is necessary.
+    //         // We need to make sure each scan operator could have at least 
1 scan tasks.
+    //         // Or this scan operator will not be re-scheduled.
+    //         if (!_pending_scanners.empty() && _num_scheduled_scanners == 0 
&& _tasks_queue.empty()) {
+    //             throw doris::Exception(ErrorCode::INTERNAL_ERROR, "Scanner 
scheduler logical error.");
+    //         }
+    // #endif
+
+    //         return Status::OK();
+    //     }
 
     bool first_pull = true;
 
-    while (margin-- > 0) {
+    // while (margin-- > 0) {
+    while (true) {

Review Comment:
   Changing `while (margin-- > 0)` to `while (true)` creates an infinite loop. 
This is a critical bug that will cause the scanner to run indefinitely without 
respecting concurrency limits. The margin check was there to limit how many 
scan tasks could be submitted.
   ```suggestion
       int32_t margin = _max_scan_concurrency - cast_set<int32_t>(
           _tasks_queue.size() + _num_scheduled_scanners + 
tasks_to_submit.size());
       while (margin-- > 0) {
   ```



##########
be/src/vec/exec/format/parquet/vparquet_group_reader.cpp:
##########
@@ -391,32 +391,32 @@ Status RowGroupReader::_read_column_data(Block* block,
                                          FilterMap& filter_map) {
     size_t batch_read_rows = 0;
     bool has_eof = false;
-    // todo: maybe do not need to build name to index map every time
-    auto name_to_idx = block->get_name_to_pos_map();
     for (auto& read_col_name : table_columns) {
-        auto& column_with_type_and_name = 
block->safe_get_by_position(name_to_idx[read_col_name]);
+        auto& column_with_type_and_name =
+                
block->safe_get_by_position((*_col_name_to_block_idx)[read_col_name]);

Review Comment:
   Potential null pointer dereference. `_col_name_to_block_idx` is dereferenced 
without null checking. If not properly initialized, this will crash. Add a null 
check or ensure it's always set before use.



##########
be/src/vec/exec/scan/scanner_context.cpp:
##########
@@ -583,13 +610,13 @@ Status 
ScannerContext::schedule_scan_task(std::shared_ptr<ScanTask> current_scan
 
 std::shared_ptr<ScanTask> ScannerContext::_pull_next_scan_task(
         std::shared_ptr<ScanTask> current_scan_task, int32_t 
current_concurrency) {
-    if (current_concurrency >= _max_scan_concurrency) {
-        VLOG_DEBUG << fmt::format(
-                "ScannerContext {} current concurrency {} >= 
_max_scan_concurrency {}, skip "
-                "pull",
-                ctx_id, current_concurrency, _max_scan_concurrency);
-        return nullptr;
-    }
+    // if (current_concurrency >= _max_scan_concurrency) {
+    //     VLOG_DEBUG << fmt::format(
+    //             "ScannerContext {} current concurrency {} >= 
_max_scan_concurrency {}, skip "
+    //             "pull",
+    //             ctx_id, current_concurrency, _max_scan_concurrency);
+    //     return nullptr;
+    // }

Review Comment:
   The concurrency limit check has been commented out. This appears to be 
debugging/experimental code that should not be merged. The commented code 
controlled important resource management.
   ```suggestion
       if (current_concurrency >= _max_scan_concurrency) {
           VLOG_DEBUG << fmt::format(
                   "ScannerContext {} current concurrency {} >= 
_max_scan_concurrency {}, skip "
                   "pull",
                   ctx_id, current_concurrency, _max_scan_concurrency);
           return nullptr;
       }
   ```



##########
be/src/vec/exec/format/table/iceberg_reader.h:
##########
@@ -211,8 +217,10 @@ class IcebergOrcReader final : public IcebergTableReader {
     }
 
     Status init_reader(
-            const std::vector<std::string>& file_col_names, const 
VExprContextSPtrs& conjuncts,
-            const TupleDescriptor* tuple_descriptor, const RowDescriptor* 
row_descriptor,
+            const std::vector<std::string>& file_col_names,
+            std::unordered_map<std::string, uint32_t>* col_name_to_block_idx,
+            const VExprContextSPtrs& conjuncts, const TupleDescriptor* 
tuple_descriptor,
+            const RowDescriptor* row_descriptor,
             const std::unordered_map<std::string, int>* colname_to_slot_id,
             const VExprContextSPtrs* not_single_slot_filter_conjuncts,
             const std::unordered_map<int, VExprContextSPtrs>* 
slot_id_to_filter_conjuncts);

Review Comment:
   The signature of `init_reader` has been changed to include 
`col_name_to_block_idx` parameter, but the implementation needs to store this 
pointer in the `_col_name_to_block_idx` member variable. Add 
`_col_name_to_block_idx = col_name_to_block_idx;` at the start of the 
init_reader implementation to avoid null pointer dereferences.



##########
be/src/vec/exec/format/table/iceberg_reader.cpp:
##########
@@ -236,23 +236,23 @@ Status IcebergTableReader::_expand_block_if_need(Block* 
block) {
             return Status::InternalError("Wrong expand column '{}'", col.name);
         }
         names.insert(col.name);
+        (*_col_name_to_block_idx)[col.name] = 
static_cast<uint32_t>(block->columns());
         block->insert(col);
     }
     return Status::OK();
 }
 
 Status IcebergTableReader::_shrink_block_if_need(Block* block) {
-    // todo: maybe do not need to build name to index map every time
-    auto name_to_pos_map = block->get_name_to_pos_map();
     std::set<size_t> positions_to_erase;
     for (const std::string& expand_col : _expand_col_names) {
-        if (!name_to_pos_map.contains(expand_col)) {
+        if (!_col_name_to_block_idx->contains(expand_col)) {
             return Status::InternalError("Wrong erase column '{}', block: {}", 
expand_col,
                                          block->dump_names());
         }
-        positions_to_erase.emplace(name_to_pos_map[expand_col]);
+        positions_to_erase.emplace((*_col_name_to_block_idx)[expand_col]);
     }
     block->erase(positions_to_erase);
+    _col_name_to_block_idx->erase(_expand_col_names.begin(), 
_expand_col_names.end());

Review Comment:
   The `erase` method signature is incorrect. `std::unordered_map::erase()` 
doesn't accept iterator pairs like this. It should be a loop erasing individual 
keys. Example: `for (const auto& name : _expand_col_names) { 
_col_name_to_block_idx->erase(name); }`
   ```suggestion
       for (const auto& name : _expand_col_names) {
           _col_name_to_block_idx->erase(name);
       }
   ```



##########
be/src/vec/exec/scan/file_scanner.cpp:
##########
@@ -537,8 +537,10 @@ Status FileScanner::_init_src_block(Block* block) {
     if (!_is_load) {
         _src_block_ptr = block;
 
-        // todo: maybe do not need to build name to index map every time
-        _src_block_name_to_idx = block->get_name_to_pos_map();
+        // Build name to index map only once on first call
+        if (_src_block_name_to_idx.empty()) {
+            _src_block_name_to_idx = block->get_name_to_pos_map();
+        }

Review Comment:
   The empty check for `_src_block_name_to_idx` is insufficient. The map can be 
non-empty from a previous file but may have different columns in a new file. 
This could lead to incorrect column mappings when reading multiple files with 
different schemas. Consider clearing and rebuilding the map when 
`_src_block_ptr` changes or validating that the map matches the current block's 
columns.
   ```suggestion
           // Always rebuild name to index map to match the current block's 
columns
           _src_block_name_to_idx.clear();
           _src_block_name_to_idx = block->get_name_to_pos_map();
   ```



##########
be/src/vec/exec/format/table/equality_delete.cpp:
##########
@@ -43,14 +43,11 @@ Status SimpleEqualityDelete::_build_set() {
     return Status::OK();
 }
 
-Status SimpleEqualityDelete::filter_data_block(Block* data_block) {
+Status SimpleEqualityDelete::filter_data_block(
+        Block* data_block, const std::unordered_map<std::string, uint32_t>* 
col_name_to_block_idx) {
     SCOPED_TIMER(equality_delete_time);

Review Comment:
   Potential null pointer dereference. `col_name_to_block_idx` is used with 
`->at()` without checking if it's nullptr. Add a null check before 
dereferencing.
   ```suggestion
       SCOPED_TIMER(equality_delete_time);
       if (col_name_to_block_idx == nullptr) {
           return Status::InternalError("col_name_to_block_idx is nullptr in 
filter_data_block");
       }
   ```



##########
be/src/vec/exec/format/table/iceberg_reader.h:
##########
@@ -165,8 +169,10 @@ class IcebergParquetReader final : public 
IcebergTableReader {
             : IcebergTableReader(std::move(file_format_reader), profile, 
state, params, range,
                                  kv_cache, io_ctx, meta_cache) {}
     Status init_reader(
-            const std::vector<std::string>& file_col_names, const 
VExprContextSPtrs& conjuncts,
-            const TupleDescriptor* tuple_descriptor, const RowDescriptor* 
row_descriptor,
+            const std::vector<std::string>& file_col_names,
+            std::unordered_map<std::string, uint32_t>* col_name_to_block_idx,
+            const VExprContextSPtrs& conjuncts, const TupleDescriptor* 
tuple_descriptor,
+            const RowDescriptor* row_descriptor,
             const std::unordered_map<std::string, int>* colname_to_slot_id,
             const VExprContextSPtrs* not_single_slot_filter_conjuncts,
             const std::unordered_map<int, VExprContextSPtrs>* 
slot_id_to_filter_conjuncts);

Review Comment:
   The signature of `init_reader` has been changed to include 
`col_name_to_block_idx` parameter, but the implementation in iceberg_reader.cpp 
may not have been updated accordingly. Additionally, the implementation needs 
to store this pointer in the `_col_name_to_block_idx` member variable to avoid 
null pointer dereferences in `_expand_block_if_need` and 
`_shrink_block_if_need`. Add `_col_name_to_block_idx = col_name_to_block_idx;` 
at the start of the init_reader implementation.



##########
be/src/vec/exec/scan/scanner_context.cpp:
##########
@@ -300,28 +303,50 @@ Status ScannerContext::get_block_from_queue(RuntimeState* 
state, vectorized::Blo
             return_free_block(std::move(current_block));
         }
 
-        VLOG_DEBUG << fmt::format(
-                "ScannerContext {} get block from queue, task_queue size {}, 
current scan "
-                "task remaing cached_block size {}, eos {}, scheduled tasks 
{}",
-                ctx_id, _tasks_queue.size(), scan_task->cached_blocks.size(), 
scan_task->is_eos(),
-                _num_scheduled_scanners);
-
-        if (scan_task->cached_blocks.empty()) {
-            // All Cached blocks are consumed, pop this task from task_queue.
+        // VLOG_DEBUG << fmt::format(
+        //         "ScannerContext {} get block from queue, task_queue size 
{}, current scan "
+        //         "task remaing cached_block size {}, eos {}, scheduled tasks 
{}",
+        //         ctx_id, _tasks_queue.size(), 
scan_task->cached_blocks.size(), scan_task->is_eos(),
+        //         _num_scheduled_scanners);
+        else {
             if (!_tasks_queue.empty()) {
                 _tasks_queue.pop_front();
             }
 
             if (scan_task->is_eos()) {
                 // 1. if eos, record a finished scanner.
                 _num_finished_scanners++;
+                std::cout << "ScannerContext::get_block_from_queue() eos 
_num_finished_scanners: "
+                          << _num_finished_scanners << std::endl;
                 RETURN_IF_ERROR(
                         
_scanner_scheduler->schedule_scan_task(shared_from_this(), nullptr, l));
             } else {
+                std::cout << "ScannerContext::get_block_from_queue() not eos 
resubmit scan task"
+                          << std::endl;
                 RETURN_IF_ERROR(
                         
_scanner_scheduler->schedule_scan_task(shared_from_this(), scan_task, l));
             }
         }
+        // if (scan_task->cached_blocks.empty()) {
+        //     // All Cached blocks are consumed, pop this task from 
task_queue.
+        //     if (!_tasks_queue.empty()) {
+        //         _tasks_queue.pop_front();
+        //     }
+
+        //     if (scan_task->is_eos()) {
+        //         // 1. if eos, record a finished scanner.
+        //         _num_finished_scanners++;
+        //         std::cout << "ScannerContext::get_block_from_queue() eos 
_num_finished_scanners: "
+        //                   << _num_finished_scanners << std::endl;
+        //         RETURN_IF_ERROR(
+        //                 
_scanner_scheduler->schedule_scan_task(shared_from_this(), nullptr, l));
+        //     } else {
+        //         std::cout << "ScannerContext::get_block_from_queue() not 
eos resubmit scan task"
+        //                   << std::endl;
+        //         RETURN_IF_ERROR(
+        //                 
_scanner_scheduler->schedule_scan_task(shared_from_this(), scan_task, l));
+        //     }
+        // }

Review Comment:
   Large blocks of commented-out code (lines 330-349) should be removed rather 
than left in the codebase. If this code needs to be preserved for reference, it 
should be documented in commit history instead.
   ```suggestion
   
   ```



##########
be/src/olap/push_handler.cpp:
##########
@@ -646,9 +646,9 @@ Status PushBrokerReader::_get_next_reader() {
                                                          _io_ctx.get(), 
_runtime_state.get());
 
         init_status = parquet_reader->init_reader(
-                _all_col_names, _push_down_exprs, _real_tuple_desc, 
_default_val_row_desc.get(),
-                _col_name_to_slot_id, &_not_single_slot_filter_conjuncts,
-                &_slot_id_to_filter_conjuncts,
+                _all_col_names, nullptr, _push_down_exprs, _real_tuple_desc,

Review Comment:
   Passing `nullptr` as the `col_name_to_block_idx` parameter will cause null 
pointer dereferences in ParquetReader and its group readers when they try to 
use `_col_name_to_block_idx`. Either create and pass a valid map, or update all 
the reader implementations to handle nullptr gracefully.



##########
be/src/vec/exec/format/table/transactional_hive_reader.cpp:
##########
@@ -119,11 +122,14 @@ Status 
TransactionalHiveReader::get_next_block_inner(Block* block, size_t* read_
         DataTypePtr data_type = get_data_type_with_default_argument(
                 DataTypeFactory::instance().create_data_type(i.type, false));
         MutableColumnPtr data_column = data_type->create_column();
+        (*_col_name_to_block_idx)[i.column_lower_case] = 
static_cast<uint32_t>(block->columns());
         block->insert(
                 ColumnWithTypeAndName(std::move(data_column), data_type, 
i.column_lower_case));
     }
     auto res = _file_format_reader->get_next_block(block, read_rows, eof);
     Block::erase_useless_column(block, block->columns() - 
TransactionalHive::READ_PARAMS.size());
+    _col_name_to_block_idx->erase(READ_ROW_COLUMN_NAMES_LOWER_CASE.begin(),
+                                  READ_ROW_COLUMN_NAMES_LOWER_CASE.end());

Review Comment:
   The `erase` method signature is incorrect. `std::unordered_map::erase()` 
doesn't accept iterator pairs like this. It should be a loop erasing individual 
keys or use a helper function. This code will fail to compile. Example: `for 
(const auto& name : READ_ROW_COLUMN_NAMES_LOWER_CASE) { 
_col_name_to_block_idx->erase(name); }`
   ```suggestion
       for (const auto& name : READ_ROW_COLUMN_NAMES_LOWER_CASE) {
           _col_name_to_block_idx->erase(name);
       }
   ```



##########
be/test/vec/exec/format/table/iceberg/iceberg_reader_test.cpp:
##########
@@ -695,13 +699,16 @@ TEST_F(IcebergReaderTest, read_iceberg_orc_file) {
     VExprContextSPtrs conjuncts; // Empty conjuncts for this test
     std::vector<std::string> table_col_names = {"name", "profile"};
     const RowDescriptor* row_descriptor = nullptr;
-    const std::unordered_map<std::string, int>* colname_to_slot_id = nullptr;
+    std::unordered_map<std::string, uint32_t> col_name_to_block_idx = {
+            {"name", 0},
+            {"profile", 1},
+    };
     const VExprContextSPtrs* not_single_slot_filter_conjuncts = nullptr;
     const std::unordered_map<int, VExprContextSPtrs>* 
slot_id_to_filter_conjuncts = nullptr;

Review Comment:
   The variable `colname_to_slot_id` is used but not declared in this test 
function. This will cause a compilation error. Add `const 
std::unordered_map<std::string, int>* colname_to_slot_id = nullptr;` before 
line 709.
   ```suggestion
       const std::unordered_map<int, VExprContextSPtrs>* 
slot_id_to_filter_conjuncts = nullptr;
       const std::unordered_map<std::string, int>* colname_to_slot_id = nullptr;
   ```



##########
be/src/vec/exec/format/table/equality_delete.cpp:
##########
@@ -106,25 +103,25 @@ Status MultiEqualityDelete::_build_set() {
     return Status::OK();
 }
 
-Status MultiEqualityDelete::filter_data_block(Block* data_block) {
+Status MultiEqualityDelete::filter_data_block(
+        Block* data_block, const std::unordered_map<std::string, uint32_t>* 
col_name_to_block_idx) {
     SCOPED_TIMER(equality_delete_time);
     size_t column_index = 0;
 
-    // todo: maybe do not need to build name to index map every time
-    auto name_to_pos_map = data_block->get_name_to_pos_map();
     for (auto delete_col : _delete_block->get_columns_with_type_and_name()) {
         const std::string& column_name = delete_col.name;
-        auto column_and_type = 
data_block->safe_get_by_position(name_to_pos_map[column_name]);
-        if (name_to_pos_map.contains(column_name) == false) {
+        if (!col_name_to_block_idx->contains(column_name)) {
             return Status::InternalError("Column '{}' not found in data block: 
{}", column_name,
                                          data_block->dump_structure());
         }
+        auto column_and_type =
+                
data_block->safe_get_by_position(col_name_to_block_idx->at(column_name));
         if (!delete_col.type->equals(*column_and_type.type)) {
             return Status::InternalError(
                     "Not support type change in column '{}', src type: {}, 
target type: {}",
                     column_name, delete_col.type->get_name(), 
column_and_type.type->get_name());
         }
-        _data_column_index[column_index++] = name_to_pos_map[column_name];
+        _data_column_index[column_index++] = 
col_name_to_block_idx->at(column_name);

Review Comment:
   Potential null pointer dereference. `col_name_to_block_idx` is used with 
`->contains()` and `->at()` without checking if it's nullptr. Add a null check 
before dereferencing.



##########
be/src/vec/exec/scan/scanner_context.cpp:
##########
@@ -300,28 +303,50 @@ Status ScannerContext::get_block_from_queue(RuntimeState* 
state, vectorized::Blo
             return_free_block(std::move(current_block));
         }
 
-        VLOG_DEBUG << fmt::format(
-                "ScannerContext {} get block from queue, task_queue size {}, 
current scan "
-                "task remaing cached_block size {}, eos {}, scheduled tasks 
{}",
-                ctx_id, _tasks_queue.size(), scan_task->cached_blocks.size(), 
scan_task->is_eos(),
-                _num_scheduled_scanners);
-
-        if (scan_task->cached_blocks.empty()) {
-            // All Cached blocks are consumed, pop this task from task_queue.
+        // VLOG_DEBUG << fmt::format(
+        //         "ScannerContext {} get block from queue, task_queue size 
{}, current scan "
+        //         "task remaing cached_block size {}, eos {}, scheduled tasks 
{}",
+        //         ctx_id, _tasks_queue.size(), 
scan_task->cached_blocks.size(), scan_task->is_eos(),
+        //         _num_scheduled_scanners);
+        else {

Review Comment:
   The VLOG_DEBUG statement has been commented out but the logic that follows 
has been changed from `if (scan_task->cached_blocks.empty())` to `else`. This 
creates an `else` without a corresponding `if` block, which is a syntax error. 
The commented-out VLOG_DEBUG should either be removed entirely or the control 
flow should be fixed.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] [Opt](multi-catalog) Opt by avoid building name_to_index map every time. [doris]

Reply via email to