This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 28abeef72b [performace](colddata) opt cold data read performance
(#21141)
28abeef72b is described below
commit 28abeef72b12340784d11dceb7474f91214469d2
Author: yiguolei <[email protected]>
AuthorDate: Mon Jun 26 10:39:20 2023 +0800
[performace](colddata) opt cold data read performance (#21141)
In #10370, we tried to optimize string predicate evaluation by rewriting the
predicate using dictionary values. But this requires checking whether the
string column is fully dict-encoded, so we added logic that reads the last
page of the string column to check it.
However, this performs badly on cold data because it has to load the
column's ordinal index and zone map index. Consider, for example, select
* from table where pk_col=1. When the query condition is on the primary key,
the result may be just a few rows, but it may have 100 columns, so it costs a
lot of time to load these indices. We can see that a lot of time is spent in
block_init_time.
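This change therefore performs the last-page check only for columns that
appear in a predicate or a delete condition.
In my test on a table with 50 string columns, queried by primary key,
the first read time dropped from 220ms to 40ms.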
---
be/src/olap/rowset/segment_v2/column_reader.cpp | 8 +++++++-
be/src/olap/rowset/segment_v2/column_reader.h | 1 +
be/src/olap/rowset/segment_v2/segment_iterator.cpp | 16 ++++++++++++++++
3 files changed, 24 insertions(+), 1 deletion(-)
diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp
b/be/src/olap/rowset/segment_v2/column_reader.cpp
index 609c9fdc28..093eb365b0 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/column_reader.cpp
@@ -925,7 +925,13 @@ Status FileColumnIterator::init(const
ColumnIteratorOptions& opts) {
opts.io_ctx.reader_type == ReaderType::READER_QUERY &&
_reader->encoding_info()->encoding() == DICT_ENCODING) {
auto dict_encoding_type = _reader->get_dict_encoding_type();
- if (dict_encoding_type == ColumnReader::UNKNOWN_DICT_ENCODING) {
+ // Only if the column is a predicate column, then we need check the
all dict encoding flag
+ // because we could rewrite the predicate to accelerate query speed.
But if it is not a
+ // predicate column, then it is useless. And it has a bad impact on
cold read(first time read)
+ // because it will load the column's ordinal index and zonemap index
and maybe other indices.
+ // it has bad impact on primary key query. For example, select * from
table where pk = 1, and
+ // the table has 2000 columns.
+ if (dict_encoding_type == ColumnReader::UNKNOWN_DICT_ENCODING &&
opts.is_predicate_column) {
seek_to_ordinal(_reader->num_rows() - 1);
_is_all_dict_encoding = _page.is_dict_encoding;
_reader->set_dict_encoding_type(_is_all_dict_encoding
diff --git a/be/src/olap/rowset/segment_v2/column_reader.h
b/be/src/olap/rowset/segment_v2/column_reader.h
index 18f5aad760..bee0bcfb91 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.h
+++ b/be/src/olap/rowset/segment_v2/column_reader.h
@@ -90,6 +90,7 @@ struct ColumnIteratorOptions {
// INDEX_PAGE including index_page, dict_page and short_key_page
PageTypePB type;
io::IOContext io_ctx;
+ bool is_predicate_column = false;
void sanity_check() const {
CHECK_NOTNULL(file_reader);
diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp
b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
index 53d89e6395..c153713680 100644
--- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp
+++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
@@ -983,6 +983,19 @@ Status SegmentIterator::_init_return_column_iterators() {
new RowIdColumnIterator(_opts.tablet_id, _opts.rowset_id,
_segment->id()));
continue;
}
+ std::set<ColumnId> del_cond_id_set;
+ _opts.delete_condition_predicates->get_all_column_ids(del_cond_id_set);
+ std::vector<bool> tmp_is_pred_column;
+ tmp_is_pred_column.resize(_schema->columns().size(), false);
+ for (auto predicate : _col_predicates) {
+ auto cid = predicate->column_id();
+ tmp_is_pred_column[cid] = true;
+ }
+ // handle delete_condition
+ for (auto cid : del_cond_id_set) {
+ tmp_is_pred_column[cid] = true;
+ }
+
int32_t unique_id = _opts.tablet_schema->column(cid).unique_id();
if (_column_iterators.count(unique_id) < 1) {
RETURN_IF_ERROR(_segment->new_column_iterator(_opts.tablet_schema->column(cid),
@@ -992,6 +1005,9 @@ Status SegmentIterator::_init_return_column_iterators() {
iter_opts.use_page_cache = _opts.use_page_cache;
iter_opts.file_reader = _file_reader.get();
iter_opts.io_ctx = _opts.io_ctx;
+ // If the col is predicate column, then should read the last page
to check
+ // if the column is full dict encoding
+ iter_opts.is_predicate_column = tmp_is_pred_column[cid];
RETURN_IF_ERROR(_column_iterators[unique_id]->init(iter_opts));
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]