This is an automated email from the ASF dual-hosted git repository.

mrhhsg pushed a commit to branch nested_column_prune
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/nested_column_prune by this 
push:
     new c56e05f0e77 [opt](olap) Support column pruning for struct, array, and 
map types (#57195)
c56e05f0e77 is described below

commit c56e05f0e7754e4e41c10805eceb771f27de6ce2
Author: Jerry Hu <[email protected]>
AuthorDate: Tue Oct 21 17:55:26 2025 +0800

    [opt](olap) Support column pruning for struct, array, and map types (#57195)
    
    ### What problem does this PR solve?
    
    Issue Number: close #xxx
    
    Related PR: #xxx
    
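    Problem Summary:
    
    Add column pruning for nested types (struct, array, map) in the OLAP
    read path. Per-column access paths (`all_access_paths` and
    `predicate_access_paths`, keyed by column unique id) are carried from the
    reader parameters through `RowsetReaderContext` and the storage read
    options to `Segment::new_column_iterator`, which hands them to the column
    iterator via `set_access_paths`. Struct and map iterators mark
    sub-columns that no access path refers to as `SKIP_READING`; a skipped
    iterator does not initialize or seek, and only resizes the destination
    column instead of reading data.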
    
    ### Release note
    
    None
    
    ### Check List (For Author)
    
    - Test <!-- At least one of them must be included. -->
        - [ ] Regression test
        - [ ] Unit Test
        - [ ] Manual test (add detailed scripts or steps below)
        - [ ] No need to test or manual test. Explain why:
            - [ ] This is a refactor/code format and no logic has been changed.
            - [ ] Previous test can cover this change.
            - [ ] No code files have been changed.
            - [ ] Other reason <!-- Add your reason?  -->
    
    - Behavior changed:
        - [ ] No.
        - [ ] Yes. <!-- Explain the behavior change -->
    
    - Does this need documentation?
        - [ ] No.
        - [ ] Yes. <!-- Add document PR link here. eg: https://github.com/apache/doris-website/pull/1214 -->
    
    ### Check List (For Reviewer who merge this PR)
    
    - [ ] Confirm the release note
    - [ ] Confirm test cases
    - [ ] Confirm document
    - [ ] Add branch pick label <!-- Add branch pick label that this PR
    should merge into -->
---
 be/src/olap/iterators.h                         |   3 +
 be/src/olap/rowset/beta_rowset_reader.cpp       |   4 +
 be/src/olap/rowset/rowset_reader_context.h      |   3 +
 be/src/olap/rowset/segment_v2/column_reader.cpp | 307 +++++++++++++++++++++++-
 be/src/olap/rowset/segment_v2/column_reader.h   |  29 ++-
 be/src/olap/rowset/segment_v2/segment.cpp       |  11 +
 be/src/olap/tablet_reader.cpp                   |   3 +
 be/src/olap/tablet_reader.h                     |   4 +
 be/src/runtime/descriptors.cpp                  |   5 +
 be/src/runtime/descriptors.h                    |   6 +
 be/src/vec/columns/column_map.cpp               |   1 +
 be/src/vec/exec/scan/olap_scanner.cpp           |  13 +
 12 files changed, 378 insertions(+), 11 deletions(-)

diff --git a/be/src/olap/iterators.h b/be/src/olap/iterators.h
index 5d2b319dd23..f84a25c3720 100644
--- a/be/src/olap/iterators.h
+++ b/be/src/olap/iterators.h
@@ -127,6 +127,9 @@ public:
     std::map<ColumnId, size_t> vir_cid_to_idx_in_block;
     std::map<size_t, vectorized::DataTypePtr> vir_col_idx_to_type;
 
+    std::map<int32_t, TColumnAccessPaths> all_access_paths;
+    std::map<int32_t, TColumnAccessPaths> predicate_access_paths;
+
     std::shared_ptr<vectorized::ScoreRuntime> score_runtime;
     CollectionStatisticsPtr collection_statistics;
 };
diff --git a/be/src/olap/rowset/beta_rowset_reader.cpp 
b/be/src/olap/rowset/beta_rowset_reader.cpp
index 2a2eea96ceb..e5d75fbb240 100644
--- a/be/src/olap/rowset/beta_rowset_reader.cpp
+++ b/be/src/olap/rowset/beta_rowset_reader.cpp
@@ -103,6 +103,10 @@ Status 
BetaRowsetReader::get_segment_iterators(RowsetReaderContext* read_context
     _read_options.remaining_conjunct_roots = 
_read_context->remaining_conjunct_roots;
     _read_options.common_expr_ctxs_push_down = 
_read_context->common_expr_ctxs_push_down;
     _read_options.virtual_column_exprs = _read_context->virtual_column_exprs;
+
+    _read_options.all_access_paths = _read_context->all_access_paths;
+    _read_options.predicate_access_paths = 
_read_context->predicate_access_paths;
+
     _read_options.ann_topn_runtime = _read_context->ann_topn_runtime;
     _read_options.vir_cid_to_idx_in_block = 
_read_context->vir_cid_to_idx_in_block;
     _read_options.vir_col_idx_to_type = _read_context->vir_col_idx_to_type;
diff --git a/be/src/olap/rowset/rowset_reader_context.h 
b/be/src/olap/rowset/rowset_reader_context.h
index c6e8dc718c7..7caec564af9 100644
--- a/be/src/olap/rowset/rowset_reader_context.h
+++ b/be/src/olap/rowset/rowset_reader_context.h
@@ -90,6 +90,9 @@ struct RowsetReaderContext {
     std::map<ColumnId, size_t> vir_cid_to_idx_in_block;
     std::map<size_t, vectorized::DataTypePtr> vir_col_idx_to_type;
 
+    std::map<int32_t, TColumnAccessPaths> all_access_paths;
+    std::map<int32_t, TColumnAccessPaths> predicate_access_paths;
+
     std::shared_ptr<vectorized::ScoreRuntime> score_runtime;
     CollectionStatisticsPtr collection_statistics;
     std::shared_ptr<segment_v2::AnnTopNRuntime> ann_topn_runtime;
diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp 
b/be/src/olap/rowset/segment_v2/column_reader.cpp
index 10ced03dc61..41702a463a1 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/column_reader.cpp
@@ -18,6 +18,7 @@
 #include "olap/rowset/segment_v2/column_reader.h"
 
 #include <assert.h>
+#include <gen_cpp/Descriptors_types.h>
 #include <gen_cpp/segment_v2.pb.h>
 
 #include <algorithm>
@@ -72,7 +73,6 @@
 #include "vec/columns/column_map.h"
 #include "vec/columns/column_nullable.h"
 #include "vec/columns/column_struct.h"
-#include "vec/columns/column_variant.h"
 #include "vec/columns/column_vector.h"
 #include "vec/common/assert_cast.h"
 #include "vec/common/schema_util.h"
@@ -805,11 +805,12 @@ Status ColumnReader::new_iterator(ColumnIteratorUPtr* 
iterator, const TabletColu
         *iterator = std::make_unique<EmptyFileColumnIterator>();
         return Status::OK();
     }
-    if (is_scalar_type((FieldType)_meta_type)) {
+    if (is_scalar_type(_meta_type)) {
         *iterator = std::make_unique<FileColumnIterator>(shared_from_this());
+        (*iterator)->set_column_name(tablet_column ? tablet_column->name() : 
"");
         return Status::OK();
     } else {
-        auto type = (FieldType)_meta_type;
+        auto type = _meta_type;
         switch (type) {
         case FieldType::OLAP_FIELD_TYPE_AGG_STATE: {
             return new_agg_state_iterator(iterator);
@@ -843,6 +844,8 @@ Status ColumnReader::new_array_iterator(ColumnIteratorUPtr* 
iterator,
                                     ? &tablet_column->get_sub_column(0)
                                     : nullptr));
 
+    item_iterator->set_column_name(tablet_column ? 
tablet_column->get_sub_column(0).name() : "");
+
     ColumnIteratorUPtr offset_iterator;
     RETURN_IF_ERROR(_sub_readers[1]->new_iterator(&offset_iterator, nullptr));
     auto* file_iter = 
static_cast<FileColumnIterator*>(offset_iterator.release());
@@ -866,11 +869,13 @@ Status ColumnReader::new_map_iterator(ColumnIteratorUPtr* 
iterator,
             &key_iterator, tablet_column && tablet_column->get_subtype_count() 
> 1
                                    ? &tablet_column->get_sub_column(0)
                                    : nullptr));
+    key_iterator->set_column_name(tablet_column ? 
tablet_column->get_sub_column(0).name() : "");
     ColumnIteratorUPtr val_iterator;
     RETURN_IF_ERROR(_sub_readers[1]->new_iterator(
             &val_iterator, tablet_column && tablet_column->get_subtype_count() 
> 1
                                    ? &tablet_column->get_sub_column(1)
                                    : nullptr));
+    val_iterator->set_column_name(tablet_column ? 
tablet_column->get_sub_column(1).name() : "");
     ColumnIteratorUPtr offsets_iterator;
     RETURN_IF_ERROR(_sub_readers[2]->new_iterator(&offsets_iterator, nullptr));
     auto* file_iter = 
static_cast<FileColumnIterator*>(offsets_iterator.release());
@@ -898,6 +903,8 @@ Status 
ColumnReader::new_struct_iterator(ColumnIteratorUPtr* iterator,
         ColumnIteratorUPtr sub_column_iterator;
         RETURN_IF_ERROR(_sub_readers[i]->new_iterator(
                 &sub_column_iterator, tablet_column ? 
&tablet_column->get_sub_column(i) : nullptr));
+        sub_column_iterator->set_column_name(tablet_column ? 
tablet_column->get_sub_column(i).name()
+                                                           : "");
         sub_column_iterators.emplace_back(std::move(sub_column_iterator));
     }
 
@@ -934,6 +941,10 @@ 
MapFileColumnIterator::MapFileColumnIterator(std::shared_ptr<ColumnReader> reade
 }
 
 Status MapFileColumnIterator::init(const ColumnIteratorOptions& opts) {
+    if (_reading_flag == ReadingFlag::SKIP_READING) {
+        LOG(INFO) << "Map column iterator column " << _column_name << " skip 
reading.";
+        return Status::OK();
+    }
     RETURN_IF_ERROR(_key_iterator->init(opts));
     RETURN_IF_ERROR(_val_iterator->init(opts));
     RETURN_IF_ERROR(_offsets_iterator->init(opts));
@@ -944,6 +955,11 @@ Status MapFileColumnIterator::init(const 
ColumnIteratorOptions& opts) {
 }
 
 Status MapFileColumnIterator::seek_to_ordinal(ordinal_t ord) {
+    if (_reading_flag == ReadingFlag::SKIP_READING) {
+        DLOG(INFO) << "Map column iterator column " << _column_name << " skip 
reading.";
+        return Status::OK();
+    }
+
     if (_map_reader->is_nullable()) {
         RETURN_IF_ERROR(_null_iterator->seek_to_ordinal(ord));
     }
@@ -958,10 +974,16 @@ Status MapFileColumnIterator::seek_to_ordinal(ordinal_t 
ord) {
 
 Status MapFileColumnIterator::next_batch(size_t* n, 
vectorized::MutableColumnPtr& dst,
                                          bool* has_null) {
-    const auto* column_map = 
vectorized::check_and_get_column<vectorized::ColumnMap>(
+    if (_reading_flag == ReadingFlag::SKIP_READING) {
+        DLOG(INFO) << "Map column iterator column " << _column_name << " skip 
reading.";
+        dst->resize(*n);
+        return Status::OK();
+    }
+
+    auto& column_map = assert_cast<vectorized::ColumnMap&>(
             dst->is_nullable() ? 
static_cast<vectorized::ColumnNullable&>(*dst).get_nested_column()
                                : *dst);
-    auto column_offsets_ptr = 
column_map->get_offsets_column().assume_mutable();
+    auto column_offsets_ptr = column_map.get_offsets_column().assume_mutable();
     bool offsets_has_null = false;
     ssize_t start = column_offsets_ptr->size();
     RETURN_IF_ERROR(_offsets_iterator->next_batch(n, column_offsets_ptr, 
&offsets_has_null));
@@ -974,8 +996,8 @@ Status MapFileColumnIterator::next_batch(size_t* n, 
vectorized::MutableColumnPtr
     DCHECK(column_offsets.get_data().back() >= column_offsets.get_data()[start 
- 1]);
     size_t num_items =
             column_offsets.get_data().back() - column_offsets.get_data()[start 
- 1]; // -1 is valid
-    auto key_ptr = column_map->get_keys().assume_mutable();
-    auto val_ptr = column_map->get_values().assume_mutable();
+    auto key_ptr = column_map.get_keys().assume_mutable();
+    auto val_ptr = column_map.get_values().assume_mutable();
 
     if (num_items > 0) {
         size_t num_read = num_items;
@@ -984,6 +1006,9 @@ Status MapFileColumnIterator::next_batch(size_t* n, 
vectorized::MutableColumnPtr
         RETURN_IF_ERROR(_key_iterator->next_batch(&num_read, key_ptr, 
&key_has_null));
         RETURN_IF_ERROR(_val_iterator->next_batch(&num_read, val_ptr, 
&val_has_null));
         DCHECK(num_read == num_items);
+
+        column_map.get_keys_ptr() = std::move(key_ptr);
+        column_map.get_values_ptr() = std::move(val_ptr);
     }
 
     if (dst->is_nullable()) {
@@ -1009,6 +1034,12 @@ Status MapFileColumnIterator::next_batch(size_t* n, 
vectorized::MutableColumnPtr
 
 Status MapFileColumnIterator::read_by_rowids(const rowid_t* rowids, const 
size_t count,
                                              vectorized::MutableColumnPtr& 
dst) {
+    if (_reading_flag == ReadingFlag::SKIP_READING) {
+        DLOG(INFO) << "File column iterator column " << _column_name << " skip 
reading.";
+        dst->resize(count);
+        return Status::OK();
+    }
+
     for (size_t i = 0; i < count; ++i) {
         RETURN_IF_ERROR(seek_to_ordinal(rowids[i]));
         size_t num_read = 1;
@@ -1018,6 +1049,87 @@ Status MapFileColumnIterator::read_by_rowids(const 
rowid_t* rowids, const size_t
     return Status::OK();
 }
 
+Status MapFileColumnIterator::set_access_paths(const TColumnAccessPaths& 
all_access_paths,
+                                               const TColumnAccessPaths& 
predicate_access_paths) {
+    if (all_access_paths.name_access_paths.empty()) {
+        return Status::OK();
+    }
+
+    if (!predicate_access_paths.name_access_paths.empty()) {
+        _reading_flag = ReadingFlag::READING_FOR_PREDICATE;
+        LOG(INFO) << "Map column iterator set sub-column " << _column_name
+                  << " to READING_FOR_PREDICATE";
+    }
+
+    auto get_sub_access_paths = [&](const TColumnAccessPaths& access_paths) -> 
TColumnAccessPaths {
+        TColumnAccessPaths sub_access_paths = access_paths;
+        for (auto it = sub_access_paths.name_access_paths.begin();
+             it != sub_access_paths.name_access_paths.end();) {
+            TColumnNameAccessPath& name_path = *it;
+            if (name_path.path.size() > 1) {
+                name_path.path.erase(name_path.path.begin());
+                ++it;
+            } else {
+                it = sub_access_paths.name_access_paths.erase(it);
+            }
+        }
+        return sub_access_paths;
+    };
+
+    auto sub_all_access_paths = get_sub_access_paths(all_access_paths);
+    auto sub_predicate_access_paths = 
get_sub_access_paths(predicate_access_paths);
+
+    if (sub_all_access_paths.name_access_paths.empty()) {
+        return Status::OK();
+    }
+
+    TColumnAccessPaths key_all_access_paths;
+    TColumnAccessPaths val_all_access_paths;
+    TColumnAccessPaths key_predicate_access_paths;
+    TColumnAccessPaths val_predicate_access_paths;
+
+    for (const auto& paths : sub_all_access_paths.name_access_paths) {
+        if (paths.path[0] == "*") {
+            key_all_access_paths.name_access_paths.push_back(paths);
+            val_all_access_paths.name_access_paths.push_back(paths);
+        } else if (paths.path[0] == "KEYS") {
+            key_all_access_paths.name_access_paths.push_back(paths);
+        } else if (paths.path[0] == "VALUES") {
+            val_all_access_paths.name_access_paths.push_back(paths);
+        }
+    }
+    const auto need_read_keys = 
!key_all_access_paths.name_access_paths.empty();
+    const auto need_read_values = 
!val_all_access_paths.name_access_paths.empty();
+
+    for (const auto& paths : sub_predicate_access_paths.name_access_paths) {
+        if (paths.path[0] == "*") {
+            key_predicate_access_paths.name_access_paths.push_back(paths);
+            val_predicate_access_paths.name_access_paths.push_back(paths);
+        } else if (paths.path[0] == "KEYS") {
+            key_predicate_access_paths.name_access_paths.push_back(paths);
+        } else if (paths.path[0] == "VALUES") {
+            val_predicate_access_paths.name_access_paths.push_back(paths);
+        }
+    }
+
+    if (need_read_keys) {
+        RETURN_IF_ERROR(
+                _key_iterator->set_access_paths(key_all_access_paths, 
key_predicate_access_paths));
+    } else {
+        _key_iterator->set_reading_flag(ReadingFlag::SKIP_READING);
+        LOG(INFO) << "Map column iterator set key column to SKIP_READING";
+    }
+
+    if (need_read_values) {
+        RETURN_IF_ERROR(
+                _val_iterator->set_access_paths(val_all_access_paths, 
val_predicate_access_paths));
+    } else {
+        _val_iterator->set_reading_flag(ReadingFlag::SKIP_READING);
+        LOG(INFO) << "Map column iterator set value column to SKIP_READING";
+    }
+    return Status::OK();
+}
+
 
////////////////////////////////////////////////////////////////////////////////
 
 StructFileColumnIterator::StructFileColumnIterator(
@@ -1030,6 +1142,11 @@ StructFileColumnIterator::StructFileColumnIterator(
 }
 
 Status StructFileColumnIterator::init(const ColumnIteratorOptions& opts) {
+    if (_reading_flag == ReadingFlag::SKIP_READING) {
+        LOG(INFO) << "Struct column iterator column " << _column_name << " 
skip reading.";
+        return Status::OK();
+    }
+
     for (auto& column_iterator : _sub_column_iterators) {
         RETURN_IF_ERROR(column_iterator->init(opts));
     }
@@ -1041,16 +1158,23 @@ Status StructFileColumnIterator::init(const 
ColumnIteratorOptions& opts) {
 
 Status StructFileColumnIterator::next_batch(size_t* n, 
vectorized::MutableColumnPtr& dst,
                                             bool* has_null) {
-    const auto* column_struct = 
vectorized::check_and_get_column<vectorized::ColumnStruct>(
+    if (_reading_flag == ReadingFlag::SKIP_READING) {
+        DLOG(INFO) << "Struct column iterator column " << _column_name << " 
skip reading.";
+        dst->resize(*n);
+        return Status::OK();
+    }
+
+    auto& column_struct = assert_cast<vectorized::ColumnStruct&>(
             dst->is_nullable() ? 
static_cast<vectorized::ColumnNullable&>(*dst).get_nested_column()
                                : *dst);
-    for (size_t i = 0; i < column_struct->tuple_size(); i++) {
+    for (size_t i = 0; i < column_struct.tuple_size(); i++) {
         size_t num_read = *n;
-        auto sub_column_ptr = column_struct->get_column(i).assume_mutable();
+        auto sub_column_ptr = column_struct.get_column(i).assume_mutable();
         bool column_has_null = false;
         RETURN_IF_ERROR(
                 _sub_column_iterators[i]->next_batch(&num_read, 
sub_column_ptr, &column_has_null));
         DCHECK(num_read == *n);
+        column_struct.get_column_ptr(i) = std::move(sub_column_ptr);
     }
 
     if (dst->is_nullable()) {
@@ -1076,6 +1200,11 @@ Status StructFileColumnIterator::next_batch(size_t* n, 
vectorized::MutableColumn
 }
 
 Status StructFileColumnIterator::seek_to_ordinal(ordinal_t ord) {
+    if (_reading_flag == ReadingFlag::SKIP_READING) {
+        DLOG(INFO) << "Struct column iterator column " << _column_name << " 
skip reading.";
+        return Status::OK();
+    }
+
     for (auto& column_iterator : _sub_column_iterators) {
         RETURN_IF_ERROR(column_iterator->seek_to_ordinal(ord));
     }
@@ -1087,6 +1216,12 @@ Status 
StructFileColumnIterator::seek_to_ordinal(ordinal_t ord) {
 
 Status StructFileColumnIterator::read_by_rowids(const rowid_t* rowids, const 
size_t count,
                                                 vectorized::MutableColumnPtr& 
dst) {
+    if (_reading_flag == ReadingFlag::SKIP_READING) {
+        DLOG(INFO) << "Struct column iterator column " << _column_name << " 
skip reading.";
+        dst->resize(count);
+        return Status::OK();
+    }
+
     for (size_t i = 0; i < count; ++i) {
         RETURN_IF_ERROR(seek_to_ordinal(rowids[i]));
         size_t num_read = 1;
@@ -1096,6 +1231,74 @@ Status StructFileColumnIterator::read_by_rowids(const 
rowid_t* rowids, const siz
     return Status::OK();
 }
 
+Status StructFileColumnIterator::set_access_paths(
+        const TColumnAccessPaths& all_access_paths,
+        const TColumnAccessPaths& predicate_access_paths) {
+    if (all_access_paths.name_access_paths.empty()) {
+        return Status::OK();
+    }
+
+    if (!predicate_access_paths.name_access_paths.empty()) {
+        _reading_flag = ReadingFlag::READING_FOR_PREDICATE;
+        LOG(INFO) << "Struct column iterator set sub-column " << _column_name
+                  << " to READING_FOR_PREDICATE";
+    }
+
+    auto get_sub_access_paths = [&](const TColumnAccessPaths& access_paths) -> 
TColumnAccessPaths {
+        TColumnAccessPaths sub_access_paths = access_paths;
+        for (auto it = sub_access_paths.name_access_paths.begin();
+             it != sub_access_paths.name_access_paths.end();) {
+            TColumnNameAccessPath& name_path = *it;
+            if (name_path.path.size() > 1) {
+                name_path.path.erase(name_path.path.begin());
+                ++it;
+            } else {
+                it = sub_access_paths.name_access_paths.erase(it);
+            }
+        }
+        return sub_access_paths;
+    };
+
+    auto sub_all_access_paths = get_sub_access_paths(all_access_paths);
+    auto sub_predicate_access_paths = 
get_sub_access_paths(predicate_access_paths);
+
+    const auto no_sub_column_to_skip = 
sub_all_access_paths.name_access_paths.empty();
+    const auto no_predicate_sub_column = 
sub_predicate_access_paths.name_access_paths.empty();
+
+    for (auto& sub_iterator : _sub_column_iterators) {
+        const auto& name = sub_iterator->column_name();
+        bool need_to_read = no_sub_column_to_skip;
+        TColumnAccessPaths sub_all_access_paths_of_this;
+        if (!need_to_read) {
+            for (const auto& paths : sub_all_access_paths.name_access_paths) {
+                if (paths.path[0] == name) {
+                    
sub_all_access_paths_of_this.name_access_paths.push_back(paths);
+                }
+            }
+            need_to_read = 
!sub_all_access_paths_of_this.name_access_paths.empty();
+        }
+
+        if (!need_to_read) {
+            sub_iterator->set_reading_flag(ReadingFlag::SKIP_READING);
+            LOG(INFO) << "Struct column iterator set sub-column " << name << " 
to SKIP_READING";
+            continue;
+        }
+
+        TColumnAccessPaths sub_predicate_access_paths_of_this;
+
+        if (!no_predicate_sub_column) {
+            for (const auto& paths : 
sub_predicate_access_paths.name_access_paths) {
+                if (paths.path[0] == name) {
+                    
sub_predicate_access_paths_of_this.name_access_paths.push_back(paths);
+                }
+            }
+        }
+        
RETURN_IF_ERROR(sub_iterator->set_access_paths(sub_all_access_paths_of_this,
+                                                       
sub_predicate_access_paths_of_this));
+    }
+    return Status::OK();
+}
+
 
////////////////////////////////////////////////////////////////////////////////
 Status OffsetFileColumnIterator::init(const ColumnIteratorOptions& opts) {
     RETURN_IF_ERROR(_offset_iterator->init(opts));
@@ -1167,6 +1370,11 @@ 
ArrayFileColumnIterator::ArrayFileColumnIterator(std::shared_ptr<ColumnReader> r
 }
 
 Status ArrayFileColumnIterator::init(const ColumnIteratorOptions& opts) {
+    if (_reading_flag == ReadingFlag::SKIP_READING) {
+        LOG(INFO) << "Array column iterator column " << _column_name << " skip 
readking.";
+        return Status::OK();
+    }
+
     RETURN_IF_ERROR(_offset_iterator->init(opts));
     RETURN_IF_ERROR(_item_iterator->init(opts));
     if (_array_reader->is_nullable()) {
@@ -1184,6 +1392,11 @@ Status 
ArrayFileColumnIterator::_seek_by_offsets(ordinal_t ord) {
 }
 
 Status ArrayFileColumnIterator::seek_to_ordinal(ordinal_t ord) {
+    if (_reading_flag == ReadingFlag::SKIP_READING) {
+        DLOG(INFO) << "Array column iterator column " << _column_name << " 
skip reading.";
+        return Status::OK();
+    }
+
     RETURN_IF_ERROR(_offset_iterator->seek_to_ordinal(ord));
     if (_array_reader->is_nullable()) {
         RETURN_IF_ERROR(_null_iterator->seek_to_ordinal(ord));
@@ -1193,6 +1406,12 @@ Status 
ArrayFileColumnIterator::seek_to_ordinal(ordinal_t ord) {
 
 Status ArrayFileColumnIterator::next_batch(size_t* n, 
vectorized::MutableColumnPtr& dst,
                                            bool* has_null) {
+    if (_reading_flag == ReadingFlag::SKIP_READING) {
+        DLOG(INFO) << "Array column iterator column " << _column_name << " 
skip reading.";
+        dst->resize(*n);
+        return Status::OK();
+    }
+
     const auto* column_array = 
vectorized::check_and_get_column<vectorized::ColumnArray>(
             dst->is_nullable() ? 
static_cast<vectorized::ColumnNullable&>(*dst).get_nested_column()
                                : *dst);
@@ -1241,6 +1460,12 @@ Status ArrayFileColumnIterator::next_batch(size_t* n, 
vectorized::MutableColumnP
 
 Status ArrayFileColumnIterator::read_by_rowids(const rowid_t* rowids, const 
size_t count,
                                                vectorized::MutableColumnPtr& 
dst) {
+    if (_reading_flag == ReadingFlag::SKIP_READING) {
+        DLOG(INFO) << "Array column iterator column " << _column_name << " 
skip reading.";
+        dst->resize(count);
+        return Status::OK();
+    }
+
     for (size_t i = 0; i < count; ++i) {
         // TODO(cambyzju): now read array one by one, need optimize later
         RETURN_IF_ERROR(seek_to_ordinal(rowids[i]));
@@ -1251,11 +1476,56 @@ Status ArrayFileColumnIterator::read_by_rowids(const 
rowid_t* rowids, const size
     return Status::OK();
 }
 
+Status ArrayFileColumnIterator::set_access_paths(const TColumnAccessPaths& 
all_access_paths,
+                                                 const TColumnAccessPaths& 
predicate_access_paths) {
+    if (all_access_paths.name_access_paths.empty()) {
+        return Status::OK();
+    }
+
+    if (!predicate_access_paths.name_access_paths.empty()) {
+        _reading_flag = ReadingFlag::READING_FOR_PREDICATE;
+        LOG(INFO) << "Array column iterator set sub-column " << _column_name
+                  << " to READING_FOR_PREDICATE";
+    }
+
+    auto get_sub_access_paths = [&](const TColumnAccessPaths& access_paths) -> 
TColumnAccessPaths {
+        TColumnAccessPaths sub_access_paths = access_paths;
+        for (auto it = sub_access_paths.name_access_paths.begin();
+             it != sub_access_paths.name_access_paths.end();) {
+            TColumnNameAccessPath& name_path = *it;
+            if (name_path.path.size() > 1) {
+                name_path.path.erase(name_path.path.begin());
+                ++it;
+            } else {
+                it = sub_access_paths.name_access_paths.erase(it);
+            }
+        }
+        return sub_access_paths;
+    };
+
+    auto sub_all_access_paths = get_sub_access_paths(all_access_paths);
+    auto sub_predicate_access_paths = 
get_sub_access_paths(predicate_access_paths);
+
+    const auto no_sub_column_to_skip = 
sub_all_access_paths.name_access_paths.empty();
+    const auto no_predicate_sub_column = 
sub_predicate_access_paths.name_access_paths.empty();
+
+    if (!no_sub_column_to_skip || !no_predicate_sub_column) {
+        RETURN_IF_ERROR(
+                _item_iterator->set_access_paths(sub_all_access_paths, 
sub_predicate_access_paths));
+    }
+    return Status::OK();
+}
+
 
////////////////////////////////////////////////////////////////////////////////
 
 FileColumnIterator::FileColumnIterator(std::shared_ptr<ColumnReader> reader) : 
_reader(reader) {}
 
 Status FileColumnIterator::init(const ColumnIteratorOptions& opts) {
+    if (_reading_flag == ReadingFlag::SKIP_READING) {
+        LOG(INFO) << "File column iterator column " << _column_name << " skip 
reading.";
+        return Status::OK();
+    }
+
     _opts = opts;
     if (!_opts.use_page_cache) {
         _reader->disable_index_meta_cache();
@@ -1287,6 +1557,11 @@ Status FileColumnIterator::init(const 
ColumnIteratorOptions& opts) {
 FileColumnIterator::~FileColumnIterator() = default;
 
 Status FileColumnIterator::seek_to_ordinal(ordinal_t ord) {
+    if (_reading_flag == ReadingFlag::SKIP_READING) {
+        DLOG(INFO) << "File column iterator column " << _column_name << " skip 
reading.";
+        return Status::OK();
+    }
+
     // if current page contains this row, we don't need to seek
     if (!_page || !_page.contains(ord) || !_page_iter.valid()) {
         RETURN_IF_ERROR(_reader->seek_at_or_before(ord, &_page_iter, _opts));
@@ -1337,6 +1612,12 @@ Status 
FileColumnIterator::next_batch_of_zone_map(size_t* n, vectorized::Mutable
 
 Status FileColumnIterator::next_batch(size_t* n, vectorized::MutableColumnPtr& 
dst,
                                       bool* has_null) {
+    if (_reading_flag == ReadingFlag::SKIP_READING) {
+        DLOG(INFO) << "File column iterator column " << _column_name << " skip 
reading.";
+        dst->resize(*n);
+        return Status::OK();
+    }
+
     size_t curr_size = dst->byte_size();
     dst->reserve(*n);
     size_t remaining = *n;
@@ -1393,6 +1674,12 @@ Status FileColumnIterator::next_batch(size_t* n, 
vectorized::MutableColumnPtr& d
 
 Status FileColumnIterator::read_by_rowids(const rowid_t* rowids, const size_t 
count,
                                           vectorized::MutableColumnPtr& dst) {
+    if (_reading_flag == ReadingFlag::SKIP_READING) {
+        DLOG(INFO) << "File column iterator column " << _column_name << " skip 
reading.";
+        dst->resize(count);
+        return Status::OK();
+    }
+
     size_t remaining = count;
     size_t total_read_count = 0;
     size_t nrows_to_read = 0;
diff --git a/be/src/olap/rowset/segment_v2/column_reader.h 
b/be/src/olap/rowset/segment_v2/column_reader.h
index 91aadffbfce..062f8625678 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.h
+++ b/be/src/olap/rowset/segment_v2/column_reader.h
@@ -17,6 +17,7 @@
 
 #pragma once
 
+#include <gen_cpp/Descriptors_types.h>
 #include <gen_cpp/segment_v2.pb.h>
 #include <sys/types.h>
 
@@ -271,7 +272,6 @@ private:
     Status _calculate_row_ranges(const std::vector<uint32_t>& page_indexes, 
RowRanges* row_ranges,
                                  const ColumnIteratorOptions& iter_opts);
 
-private:
     int64_t _meta_length;
     FieldType _meta_type;
     FieldType _meta_children_column_type;
@@ -366,8 +366,26 @@ public:
 
     virtual bool is_all_dict_encoding() const { return false; }
 
+    virtual Status set_access_paths(const TColumnAccessPaths& all_access_paths,
+                                    const TColumnAccessPaths& 
predicate_access_paths) {
+        if (!predicate_access_paths.name_access_paths.empty()) {
+            _reading_flag = ReadingFlag::READING_FOR_PREDICATE;
+        }
+        return Status::OK();
+    }
+
+    void set_column_name(const std::string& column_name) { _column_name = 
column_name; }
+
+    const std::string& column_name() const { return _column_name; }
+
+    enum class ReadingFlag { NORMAL_READING, READING_FOR_PREDICATE, 
SKIP_READING };
+    void set_reading_flag(ReadingFlag flag) { _reading_flag = flag; }
+
 protected:
     ColumnIteratorOptions _opts;
+
+    ReadingFlag _reading_flag {ReadingFlag::NORMAL_READING};
+    std::string _column_name;
 };
 
 // This iterator is used to read column data from file
@@ -504,6 +522,9 @@ public:
         return _offsets_iterator->get_current_ordinal();
     }
 
+    Status set_access_paths(const TColumnAccessPaths& all_access_paths,
+                            const TColumnAccessPaths& predicate_access_paths) 
override;
+
 private:
     std::shared_ptr<ColumnReader> _map_reader = nullptr;
     ColumnIteratorUPtr _null_iterator;
@@ -533,6 +554,9 @@ public:
         return _sub_column_iterators[0]->get_current_ordinal();
     }
 
+    Status set_access_paths(const TColumnAccessPaths& all_access_paths,
+                            const TColumnAccessPaths& predicate_access_paths) 
override;
+
 private:
     std::shared_ptr<ColumnReader> _struct_reader = nullptr;
     ColumnIteratorUPtr _null_iterator;
@@ -561,6 +585,9 @@ public:
         return _offset_iterator->get_current_ordinal();
     }
 
+    Status set_access_paths(const TColumnAccessPaths& all_access_paths,
+                            const TColumnAccessPaths& predicate_access_paths) 
override;
+
 private:
     std::shared_ptr<ColumnReader> _array_reader = nullptr;
     std::unique_ptr<OffsetFileColumnIterator> _offset_iterator;
diff --git a/be/src/olap/rowset/segment_v2/segment.cpp 
b/be/src/olap/rowset/segment_v2/segment.cpp
index 0675c357ccb..60062a772e7 100644
--- a/be/src/olap/rowset/segment_v2/segment.cpp
+++ b/be/src/olap/rowset/segment_v2/segment.cpp
@@ -17,6 +17,7 @@
 
 #include "olap/rowset/segment_v2/segment.h"
 
+#include <gen_cpp/Descriptors_types.h>
 #include <gen_cpp/PlanNodes_types.h>
 #include <gen_cpp/olap_file.pb.h>
 #include <gen_cpp/segment_v2.pb.h>
@@ -765,6 +766,16 @@ Status Segment::new_column_iterator(const TabletColumn& tablet_column,
                                                sparse_column_cache_ptr));
     } else {
         RETURN_IF_ERROR(reader->new_iterator(iter, &tablet_column, opt));
+        if (opt->all_access_paths.contains(unique_id) ||
+            opt->predicate_access_paths.contains(unique_id)) {
+            const auto& all_access_paths = opt->all_access_paths.contains(unique_id)
+                                                   ? opt->all_access_paths.at(unique_id)
+                                                   : TColumnAccessPaths {};
+            const auto& predicate_access_paths = opt->predicate_access_paths.contains(unique_id)
+                                                         ? opt->predicate_access_paths.at(unique_id)
+                                                         : TColumnAccessPaths {};
+            RETURN_IF_ERROR((*iter)->set_access_paths(all_access_paths, predicate_access_paths));
+        }
     }
 
     if (config::enable_column_type_check && !tablet_column.has_path_info() &&
diff --git a/be/src/olap/tablet_reader.cpp b/be/src/olap/tablet_reader.cpp
index 09a35d45618..e81e2653a99 100644
--- a/be/src/olap/tablet_reader.cpp
+++ b/be/src/olap/tablet_reader.cpp
@@ -272,6 +272,9 @@ Status TabletReader::_capture_rs_readers(const ReaderParams& read_params) {
     _reader_context.vir_col_idx_to_type = read_params.vir_col_idx_to_type;
     _reader_context.ann_topn_runtime = read_params.ann_topn_runtime;
 
+    _reader_context.all_access_paths = read_params.all_access_paths;
+    _reader_context.predicate_access_paths = read_params.predicate_access_paths;
+
     return Status::OK();
 }
 
diff --git a/be/src/olap/tablet_reader.h b/be/src/olap/tablet_reader.h
index 81fb03ef7b2..504b9cd14b5 100644
--- a/be/src/olap/tablet_reader.h
+++ b/be/src/olap/tablet_reader.h
@@ -17,6 +17,7 @@
 
 #pragma once
 
+#include <gen_cpp/Descriptors_types.h>
 #include <gen_cpp/PaloInternalService_types.h>
 #include <gen_cpp/PlanNodes_types.h>
 #include <stddef.h>
@@ -146,6 +147,9 @@ public:
         // slots that cast may be eliminated in storage layer
         std::map<std::string, vectorized::DataTypePtr> target_cast_type_for_variants;
 
+        std::map<int32_t, TColumnAccessPaths> all_access_paths;
+        std::map<int32_t, TColumnAccessPaths> predicate_access_paths;
+
         std::vector<RowSetSplits> rs_splits;
         // For unique key table with merge-on-write
         DeleteBitmap* delete_bitmap = nullptr;
diff --git a/be/src/runtime/descriptors.cpp b/be/src/runtime/descriptors.cpp
index 7ccc69ed9ac..a17f0926752 100644
--- a/be/src/runtime/descriptors.cpp
+++ b/be/src/runtime/descriptors.cpp
@@ -63,6 +63,11 @@ SlotDescriptor::SlotDescriptor(const TSlotDescriptor& tdesc)
           _is_materialized(tdesc.isMaterialized && tdesc.need_materialize),
           _is_key(tdesc.is_key),
           _column_paths(tdesc.column_paths),
+          _all_access_paths(tdesc.__isset.all_access_paths ? tdesc.all_access_paths
+                                                           : TColumnAccessPaths {}),
+          _predicate_access_paths(tdesc.__isset.predicate_access_paths
+                                          ? tdesc.predicate_access_paths
+                                          : TColumnAccessPaths {}),
           _is_auto_increment(tdesc.__isset.is_auto_increment ? tdesc.is_auto_increment : false),
           _col_default_value(tdesc.__isset.col_default_value ? tdesc.col_default_value : "") {
     if (tdesc.__isset.virtual_column_expr) {
diff --git a/be/src/runtime/descriptors.h b/be/src/runtime/descriptors.h
index 0d86844b226..7ff16f6f5e2 100644
--- a/be/src/runtime/descriptors.h
+++ b/be/src/runtime/descriptors.h
@@ -82,6 +82,9 @@ public:
     bool is_key() const { return _is_key; }
     const std::vector<std::string>& column_paths() const { return _column_paths; };
 
+    const TColumnAccessPaths& all_access_paths() const { return _all_access_paths; }
+    const TColumnAccessPaths& predicate_access_paths() const { return _predicate_access_paths; }
+
     bool is_auto_increment() const { return _is_auto_increment; }
 
     bool is_skip_bitmap_col() const { return _col_name == SKIP_BITMAP_COL; }
@@ -128,6 +131,9 @@ private:
     const bool _is_key;
     const std::vector<std::string> _column_paths;
 
+    const TColumnAccessPaths _all_access_paths;
+    const TColumnAccessPaths _predicate_access_paths;
+
     const bool _is_auto_increment;
     const std::string _col_default_value;
 
diff --git a/be/src/vec/columns/column_map.cpp b/be/src/vec/columns/column_map.cpp
index 8686febfc5f..c8b907d5de2 100644
--- a/be/src/vec/columns/column_map.cpp
+++ b/be/src/vec/columns/column_map.cpp
@@ -59,6 +59,7 @@ ColumnMap::ColumnMap(MutableColumnPtr&& keys, MutableColumnPtr&& values, Mutable
 
         /// This will also prevent possible overflow in offset.
         if (keys_column->size() != last_offset) {
+            DCHECK(0);
             throw doris::Exception(
                     doris::ErrorCode::INTERNAL_ERROR,
                     "offsets_column size {} has data inconsistent with key_column {}", last_offset,
diff --git a/be/src/vec/exec/scan/olap_scanner.cpp b/be/src/vec/exec/scan/olap_scanner.cpp
index bbd557d2834..b736ecce044 100644
--- a/be/src/vec/exec/scan/olap_scanner.cpp
+++ b/be/src/vec/exec/scan/olap_scanner.cpp
@@ -83,6 +83,8 @@ OlapScanner::OlapScanner(pipeline::ScanLocalStateBase* parent, OlapScanner::Para
                                  .function_filters {},
                                  .delete_predicates {},
                                  .target_cast_type_for_variants {},
+                                 .all_access_paths {},
+                                 .predicate_access_paths {},
                                  .rs_splits {},
                                  .return_columns {},
                                  .output_columns {},
@@ -539,6 +541,17 @@ Status OlapScanner::_init_return_columns() {
                     _vir_col_idx_to_type[idx_in_block]->get_name());
         }
 
+        const auto& column = tablet_schema->column(index);
+        if (!slot->all_access_paths().name_access_paths.empty()) {
+            _tablet_reader_params.all_access_paths.insert(
+                    {column.unique_id(), slot->all_access_paths()});
+        }
+
+        if (!slot->predicate_access_paths().name_access_paths.empty()) {
+            _tablet_reader_params.predicate_access_paths.insert(
+                    {column.unique_id(), slot->predicate_access_paths()});
+        }
+
         _return_columns.push_back(index);
         if (slot->is_nullable() && !tablet_schema->column(index).is_nullable()) {
             _tablet_columns_convert_to_null_set.emplace(index);


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]


Reply via email to