[doris] 01/04: [enhancement](invert index) read columns by index reduce seek time (#24735)

kxiao Tue, 17 Oct 2023 04:58:26 -0700

This is an automated email from the ASF dual-hosted git repository.

kxiao pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/doris.git


commit 9c483f1ceba861236624eeab8d7a13250e9b8466
Author: zzzxl <[email protected]>
AuthorDate: Tue Oct 17 10:34:33 2023 +0800

    [enhancement](invert index) read columns by index reduce seek time (#24735)
---
 be/src/olap/rowset/segment_v2/binary_dict_page.cpp |  1 +
 be/src/olap/rowset/segment_v2/segment_iterator.cpp | 41 +++++++++++++---------
 be/src/vec/columns/column_fixed_length_object.h    | 17 +++++++++
 3 files changed, 43 insertions(+), 16 deletions(-)

diff --git a/be/src/olap/rowset/segment_v2/binary_dict_page.cpp b/be/src/olap/rowset/segment_v2/binary_dict_page.cpp
index f075a5a2083..4f3ffff2961 100644
--- a/be/src/olap/rowset/segment_v2/binary_dict_page.cpp
+++ b/be/src/olap/rowset/segment_v2/binary_dict_page.cpp
@@ -284,6 +284,7 @@ Status BinaryDictPageDecoder::next_batch(size_t* n, 
vectorized::MutableColumnPtr
 Status BinaryDictPageDecoder::read_by_rowids(const rowid_t* rowids, ordinal_t 
page_first_ordinal,
                                              size_t* n, 
vectorized::MutableColumnPtr& dst) {
     if (_encoding_type == PLAIN_ENCODING) {
+        dst = dst->convert_to_predicate_column_if_dictionary();
         return _data_page_decoder->read_by_rowids(rowids, page_first_ordinal, 
n, dst);
     }
     DCHECK(_parsed);
diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
index 6bdc48b4723..548de6a15dd 100644
--- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp
+++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
@@ -1588,28 +1588,19 @@ void 
SegmentIterator::_output_non_pred_columns(vectorized::Block* block) {
 Status SegmentIterator::_read_columns_by_index(uint32_t nrows_read_limit, 
uint32_t& nrows_read,
                                                bool set_block_rowid) {
     SCOPED_RAW_TIMER(&_opts.stats->first_read_ns);
+
     do {
-        uint32_t range_from;
-        uint32_t range_to;
+        uint32_t range_from = 0;
+        uint32_t range_to = 0;
         bool has_next_range =
                 _range_iter->next_range(nrows_read_limit - nrows_read, 
&range_from, &range_to);
         if (!has_next_range) {
             break;
         }
-        if (_cur_rowid == 0 || _cur_rowid != range_from) {
-            _cur_rowid = range_from;
-            _opts.stats->block_first_read_seek_num += 1;
-            if (_opts.runtime_state && _opts.runtime_state->enable_profile()) {
-                SCOPED_RAW_TIMER(&_opts.stats->block_first_read_seek_ns);
-                RETURN_IF_ERROR(_seek_columns(_first_read_column_ids, 
_cur_rowid));
-            } else {
-                RETURN_IF_ERROR(_seek_columns(_first_read_column_ids, 
_cur_rowid));
-            }
-        }
+
         size_t rows_to_read = range_to - range_from;
-        RETURN_IF_ERROR(
-                _read_columns(_first_read_column_ids, _current_return_columns, 
rows_to_read));
-        _cur_rowid += rows_to_read;
+        _cur_rowid = range_to;
+
         if (set_block_rowid) {
             // Here use std::iota is better performance than for-loop, maybe 
for-loop is not vectorized
             auto start = _block_rowids.data() + nrows_read;
@@ -1621,8 +1612,26 @@ Status SegmentIterator::_read_columns_by_index(uint32_t 
nrows_read_limit, uint32
         }
 
         _split_row_ranges.emplace_back(std::pair {range_from, range_to});
-        // if _opts.read_orderby_key_reverse is true, only read one range for 
fast reverse purpose
     } while (nrows_read < nrows_read_limit && !_opts.read_orderby_key_reverse);
+
+    for (auto cid : _first_read_column_ids) {
+        auto& column = _current_return_columns[cid];
+        if (_prune_column(cid, column, true, nrows_read)) {
+            continue;
+        }
+        for (auto& range : _split_row_ranges) {
+            size_t nrows = range.second - range.first;
+
+            
RETURN_IF_ERROR(_column_iterators[cid]->seek_to_ordinal(range.first));
+            size_t rows_read = nrows;
+            RETURN_IF_ERROR(_column_iterators[cid]->next_batch(&rows_read, 
column));
+            if (rows_read != nrows) {
+                return Status::Error<ErrorCode::INTERNAL_ERROR>("nrows({}) != 
rows_read({})", nrows,
+                                                                rows_read);
+            }
+        }
+    }
+
     return Status::OK();
 }
 
diff --git a/be/src/vec/columns/column_fixed_length_object.h b/be/src/vec/columns/column_fixed_length_object.h
index 323fabd6256..42be6835c66 100644
--- a/be/src/vec/columns/column_fixed_length_object.h
+++ b/be/src/vec/columns/column_fixed_length_object.h
@@ -284,6 +284,23 @@ public:
         memcpy(_data.data() + old_size, data + begin_offset, total_mem_size);
     }
 
+    // Appends `num` binary values to this column, one `_item_size`-byte slot per
+    // value.
+    //
+    // Value i is the `len_array[i]` bytes starting at
+    // `data_array + start_offset_array[i]`; it is copied to the beginning of slot
+    // `old_count + i`, where `old_count` is the item count before the call.
+    // Does nothing when `num` is 0.
+    //
+    // NOTE(review): every `len_array[i]` is assumed to be <= `_item_size`; this
+    // is not checked here -- confirm against callers.
+    void insert_many_binary_data(char* data_array, uint32_t* len_array,
+                                 uint32_t* start_offset_array, size_t num) override {
+        if (UNLIKELY(num == 0)) {
+            return;
+        }
+
+        size_t old_count = _item_count;
+        resize(old_count + num);
+        // `dst` walks the newly added slots and must advance by exactly one slot
+        // per value. Adding `i * _item_size` to the running pointer would compound
+        // the offsets (0, 1, 3, 6, ... slots), misplacing values and writing past
+        // the region that was just resized.
+        auto dst = _data.data() + old_count * _item_size;
+        for (size_t i = 0; i < num; i++, dst += _item_size) {
+            auto src = data_array + start_offset_array[i];
+            uint32_t len = len_array[i];
+            memcpy(dst, src, len);
+        }
+    }
+
 protected:
     size_t _item_size;
     size_t _item_count;


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[doris] 01/04: [enhancement](invert index) read columns by index reduce seek time (#24735)

Reply via email to