This is an automated email from the ASF dual-hosted git repository. kxiao pushed a commit to branch branch-2.0 in repository https://gitbox.apache.org/repos/asf/doris.git
commit 9c483f1ceba861236624eeab8d7a13250e9b8466 Author: zzzxl <[email protected]> AuthorDate: Tue Oct 17 10:34:33 2023 +0800 [enhancement](invert index) read columns by index reduce seek time (#24735) --- be/src/olap/rowset/segment_v2/binary_dict_page.cpp | 1 + be/src/olap/rowset/segment_v2/segment_iterator.cpp | 41 +++++++++++++--------- be/src/vec/columns/column_fixed_length_object.h | 17 +++++++++ 3 files changed, 43 insertions(+), 16 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/binary_dict_page.cpp b/be/src/olap/rowset/segment_v2/binary_dict_page.cpp index f075a5a2083..4f3ffff2961 100644 --- a/be/src/olap/rowset/segment_v2/binary_dict_page.cpp +++ b/be/src/olap/rowset/segment_v2/binary_dict_page.cpp @@ -284,6 +284,7 @@ Status BinaryDictPageDecoder::next_batch(size_t* n, vectorized::MutableColumnPtr Status BinaryDictPageDecoder::read_by_rowids(const rowid_t* rowids, ordinal_t page_first_ordinal, size_t* n, vectorized::MutableColumnPtr& dst) { if (_encoding_type == PLAIN_ENCODING) { + dst = dst->convert_to_predicate_column_if_dictionary(); return _data_page_decoder->read_by_rowids(rowids, page_first_ordinal, n, dst); } DCHECK(_parsed); diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp b/be/src/olap/rowset/segment_v2/segment_iterator.cpp index 6bdc48b4723..548de6a15dd 100644 --- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp +++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp @@ -1588,28 +1588,19 @@ void SegmentIterator::_output_non_pred_columns(vectorized::Block* block) { Status SegmentIterator::_read_columns_by_index(uint32_t nrows_read_limit, uint32_t& nrows_read, bool set_block_rowid) { SCOPED_RAW_TIMER(&_opts.stats->first_read_ns); + do { - uint32_t range_from; - uint32_t range_to; + uint32_t range_from = 0; + uint32_t range_to = 0; bool has_next_range = _range_iter->next_range(nrows_read_limit - nrows_read, &range_from, &range_to); if (!has_next_range) { break; } - if (_cur_rowid == 0 || _cur_rowid != range_from) { - _cur_rowid = range_from; - _opts.stats->block_first_read_seek_num += 1; - if (_opts.runtime_state && _opts.runtime_state->enable_profile()) { - SCOPED_RAW_TIMER(&_opts.stats->block_first_read_seek_ns); - RETURN_IF_ERROR(_seek_columns(_first_read_column_ids, _cur_rowid)); - } else { - RETURN_IF_ERROR(_seek_columns(_first_read_column_ids, _cur_rowid)); - } - } + size_t rows_to_read = range_to - range_from; - RETURN_IF_ERROR( - _read_columns(_first_read_column_ids, _current_return_columns, rows_to_read)); - _cur_rowid += rows_to_read; + _cur_rowid = range_to; + if (set_block_rowid) { // Here use std::iota is better performance than for-loop, maybe for-loop is not vectorized auto start = _block_rowids.data() + nrows_read; @@ -1621,8 +1612,26 @@ Status SegmentIterator::_read_columns_by_index(uint32_t nrows_read_limit, uint32 } _split_row_ranges.emplace_back(std::pair {range_from, range_to}); - // if _opts.read_orderby_key_reverse is true, only read one range for fast reverse purpose } while (nrows_read < nrows_read_limit && !_opts.read_orderby_key_reverse); + + for (auto cid : _first_read_column_ids) { + auto& column = _current_return_columns[cid]; + if (_prune_column(cid, column, true, nrows_read)) { + continue; + } + for (auto& range : _split_row_ranges) { + size_t nrows = range.second - range.first; + + RETURN_IF_ERROR(_column_iterators[cid]->seek_to_ordinal(range.first)); + size_t rows_read = nrows; + RETURN_IF_ERROR(_column_iterators[cid]->next_batch(&rows_read, column)); + if (rows_read != nrows) { + return Status::Error<ErrorCode::INTERNAL_ERROR>("nrows({}) != rows_read({})", nrows, + rows_read); + } + } + } + return Status::OK(); } diff --git a/be/src/vec/columns/column_fixed_length_object.h b/be/src/vec/columns/column_fixed_length_object.h index 323fabd6256..42be6835c66 100644 --- a/be/src/vec/columns/column_fixed_length_object.h +++ b/be/src/vec/columns/column_fixed_length_object.h @@ -284,6 +284,23 @@ public: memcpy(_data.data() + old_size, data + begin_offset, total_mem_size); } + void insert_many_binary_data(char* data_array, uint32_t* len_array, + uint32_t* start_offset_array, size_t num) override { + if (UNLIKELY(num == 0)) { + return; + } + + size_t old_count = _item_count; + resize(old_count + num); + auto dst = _data.data() + old_count * _item_size; + for (size_t i = 0; i < num; i++) { + auto src = data_array + start_offset_array[i]; + uint32_t len = len_array[i]; + dst += i * _item_size; + memcpy(dst, src, len); + } + } + protected: size_t _item_size; size_t _item_count; --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
