This is an automated email from the ASF dual-hosted git repository.
kxiao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 0cece561f9 [refactor](segment iterator) remove std::map in iterator
use std::vector instead and not rely on unique id to identify position (#23505)
0cece561f9 is described below
commit 0cece561f9367ab3cb1f1d0fa8f854d9f00f9a01
Author: lihangyu <[email protected]>
AuthorDate: Tue Aug 29 16:43:32 2023 +0800
[refactor](segment iterator) remove std::map in iterator use std::vector
instead and not rely on unique id to identify position (#23505)
---
be/src/olap/rowset/segment_v2/segment_iterator.cpp | 123 +++++++++------------
be/src/olap/rowset/segment_v2/segment_iterator.h | 12 +-
2 files changed, 59 insertions(+), 76 deletions(-)
diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp
b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
index 87b8014117..4a7e96476f 100644
--- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp
+++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
@@ -23,10 +23,12 @@
#include <algorithm>
#include <boost/iterator/iterator_facade.hpp>
+#include <iterator>
#include <memory>
#include <numeric>
#include <set>
#include <utility>
+#include <vector>
// IWYU pragma: no_include <opentelemetry/common/threadlocal.h>
#include "common/compiler_util.h" // IWYU pragma: keep
@@ -198,6 +200,9 @@ private:
SegmentIterator::SegmentIterator(std::shared_ptr<Segment> segment, SchemaSPtr
schema)
: _segment(std::move(segment)),
_schema(schema),
+ _column_iterators(_schema->num_columns()),
+ _bitmap_index_iterators(_schema->num_columns()),
+ _inverted_index_iterators(_schema->num_columns()),
_cur_rowid(0),
_lazy_materialization_read(false),
_lazy_inited(false),
@@ -368,16 +373,15 @@ Status SegmentIterator::_prepare_seek(const
StorageReadOptions::KeyRange& key_ra
// create used column iterator
for (auto cid : _seek_schema->column_ids()) {
- int32_t unique_id = _opts.tablet_schema->column(cid).unique_id();
- if (_column_iterators.count(unique_id) < 1) {
+ if (_column_iterators[cid] == nullptr) {
RETURN_IF_ERROR(_segment->new_column_iterator(_opts.tablet_schema->column(cid),
-
&_column_iterators[unique_id]));
+
&_column_iterators[cid]));
ColumnIteratorOptions iter_opts;
iter_opts.stats = _opts.stats;
iter_opts.use_page_cache = _opts.use_page_cache;
iter_opts.file_reader = _file_reader.get();
iter_opts.io_ctx = _opts.io_ctx;
- RETURN_IF_ERROR(_column_iterators[unique_id]->init(iter_opts));
+ RETURN_IF_ERROR(_column_iterators[cid]->init(iter_opts));
}
}
@@ -440,8 +444,7 @@ Status
SegmentIterator::_get_row_ranges_from_conditions(RowRanges* condition_row
// get row ranges by bf index of this column,
RowRanges column_bf_row_ranges = RowRanges::create_single(num_rows());
DCHECK(_opts.col_id_to_predicates.count(cid) > 0);
- uint32_t unique_cid = _schema->unique_id(cid);
-
RETURN_IF_ERROR(_column_iterators[unique_cid]->get_row_ranges_by_bloom_filter(
+ RETURN_IF_ERROR(_column_iterators[cid]->get_row_ranges_by_bloom_filter(
_opts.col_id_to_predicates.at(cid).get(),
&column_bf_row_ranges));
RowRanges::ranges_intersection(bf_row_ranges, column_bf_row_ranges,
&bf_row_ranges);
}
@@ -456,7 +459,7 @@ Status
SegmentIterator::_get_row_ranges_from_conditions(RowRanges* condition_row
// get row ranges by zone map of this column,
RowRanges column_row_ranges = RowRanges::create_single(num_rows());
DCHECK(_opts.col_id_to_predicates.count(cid) > 0);
-
RETURN_IF_ERROR(_column_iterators[_schema->unique_id(cid)]->get_row_ranges_by_zone_map(
+ RETURN_IF_ERROR(_column_iterators[cid]->get_row_ranges_by_zone_map(
_opts.col_id_to_predicates.at(cid).get(),
_opts.del_predicates_for_zone_map.count(cid) > 0
? &(_opts.del_predicates_for_zone_map.at(cid))
@@ -472,14 +475,14 @@ Status
SegmentIterator::_get_row_ranges_from_conditions(RowRanges* condition_row
auto query_ctx = _opts.runtime_state->get_query_ctx();
runtime_predicate =
query_ctx->get_runtime_predicate().get_predictate();
if (runtime_predicate) {
- int32_t cid =
_opts.tablet_schema->column(runtime_predicate->column_id()).unique_id();
AndBlockColumnPredicate and_predicate;
auto single_predicate = new
SingleColumnBlockPredicate(runtime_predicate.get());
and_predicate.add_column_predicate(single_predicate);
RowRanges column_rp_row_ranges =
RowRanges::create_single(num_rows());
-
RETURN_IF_ERROR(_column_iterators[_schema->unique_id(cid)]->get_row_ranges_by_zone_map(
- &and_predicate, nullptr, &column_rp_row_ranges));
+ RETURN_IF_ERROR(
+
_column_iterators[runtime_predicate->column_id()]->get_row_ranges_by_zone_map(
+ &and_predicate, nullptr, &column_rp_row_ranges));
// intersect different columns's row ranges to get final row
ranges by zone map
RowRanges::ranges_intersection(zone_map_row_ranges,
column_rp_row_ranges,
@@ -499,8 +502,7 @@ Status
SegmentIterator::_get_row_ranges_from_conditions(RowRanges* condition_row
for (auto cid : cids) {
RowRanges tmp_row_ranges = RowRanges::create_single(num_rows());
DCHECK(_opts.col_id_to_predicates.count(cid) > 0);
- uint32_t unique_cid = _schema->unique_id(cid);
-
RETURN_IF_ERROR(_column_iterators[unique_cid]->get_row_ranges_by_dict(
+ RETURN_IF_ERROR(_column_iterators[cid]->get_row_ranges_by_dict(
_opts.col_id_to_predicates.at(cid).get(),
&tmp_row_ranges));
RowRanges::ranges_intersection(dict_row_ranges, tmp_row_ranges,
&dict_row_ranges);
}
@@ -523,19 +525,18 @@ Status SegmentIterator::_apply_bitmap_index() {
std::vector<ColumnPredicate*> remaining_predicates;
for (auto pred : _col_predicates) {
- int32_t unique_id = _schema->unique_id(pred->column_id());
- if (_bitmap_index_iterators.count(unique_id) < 1 ||
- _bitmap_index_iterators[unique_id] == nullptr || pred->type() ==
PredicateType::BF) {
+ auto cid = pred->column_id();
+ if (_bitmap_index_iterators[cid] == nullptr || pred->type() ==
PredicateType::BF) {
// no bitmap index for this column
remaining_predicates.push_back(pred);
} else {
-
RETURN_IF_ERROR(pred->evaluate(_bitmap_index_iterators[unique_id].get(),
- _segment->num_rows(),
&_row_bitmap));
+ RETURN_IF_ERROR(pred->evaluate(_bitmap_index_iterators[cid].get(),
_segment->num_rows(),
+ &_row_bitmap));
auto column_name = _schema->column(pred->column_id())->name();
if (_check_column_pred_all_push_down(column_name) &&
!pred->predicate_params()->marked_by_runtime_filter) {
- _need_read_data_indices[unique_id] = false;
+ _need_read_data_indices[cid] = false;
}
if (_row_bitmap.isEmpty()) {
@@ -657,9 +658,7 @@ bool
SegmentIterator::_can_filter_by_preds_except_leafnode_of_andnode() {
}
bool SegmentIterator::_check_apply_by_bitmap_index(ColumnPredicate* pred) {
- int32_t unique_id = _schema->unique_id(pred->column_id());
- if (_bitmap_index_iterators.count(unique_id) < 1 ||
- _bitmap_index_iterators[unique_id] == nullptr) {
+ if (_bitmap_index_iterators[pred->column_id()] == nullptr) {
// no bitmap index for this column
return false;
}
@@ -667,9 +666,7 @@ bool
SegmentIterator::_check_apply_by_bitmap_index(ColumnPredicate* pred) {
}
bool SegmentIterator::_check_apply_by_inverted_index(ColumnPredicate* pred,
bool pred_in_compound) {
- int32_t unique_id = _schema->unique_id(pred->column_id());
- if (_inverted_index_iterators.count(unique_id) < 1 ||
- _inverted_index_iterators[unique_id] == nullptr) {
+ if (_inverted_index_iterators[pred->column_id()] == nullptr) {
//this column without inverted index
return false;
}
@@ -689,7 +686,7 @@ bool
SegmentIterator::_check_apply_by_inverted_index(ColumnPredicate* pred, bool
return false;
}
- bool handle_by_fulltext = _column_has_fulltext_index(unique_id);
+ bool handle_by_fulltext = _column_has_fulltext_index(pred->column_id());
if (handle_by_fulltext) {
// when predicate in compound condition which except leafNode of
andNode,
// only can apply match query for fulltext index,
@@ -705,9 +702,8 @@ bool
SegmentIterator::_check_apply_by_inverted_index(ColumnPredicate* pred, bool
Status SegmentIterator::_apply_bitmap_index_except_leafnode_of_andnode(
ColumnPredicate* pred, roaring::Roaring* output_result) {
- int32_t unique_id = _schema->unique_id(pred->column_id());
- RETURN_IF_ERROR(pred->evaluate(_bitmap_index_iterators[unique_id].get(),
_segment->num_rows(),
- output_result));
+
RETURN_IF_ERROR(pred->evaluate(_bitmap_index_iterators[pred->column_id()].get(),
+ _segment->num_rows(), output_result));
return Status::OK();
}
@@ -716,9 +712,8 @@ Status
SegmentIterator::_apply_inverted_index_except_leafnode_of_andnode(
if (_opts.runtime_state &&
!_opts.runtime_state->query_options().enable_inverted_index_query) {
return Status::OK();
}
- int32_t unique_id = _schema->unique_id(pred->column_id());
- RETURN_IF_ERROR(pred->evaluate(*_schema,
_inverted_index_iterators[unique_id].get(), num_rows(),
- output_result));
+ RETURN_IF_ERROR(pred->evaluate(*_schema,
_inverted_index_iterators[pred->column_id()].get(),
+ num_rows(), output_result));
return Status::OK();
}
@@ -745,8 +740,7 @@ Status
SegmentIterator::_apply_index_except_leafnode_of_andnode() {
continue;
}
- int32_t unique_id = _schema->unique_id(pred->column_id());
- bool need_remaining_after_evaluate =
_column_has_fulltext_index(unique_id) &&
+ bool need_remaining_after_evaluate =
_column_has_fulltext_index(pred->column_id()) &&
PredicateTypeTraits::is_equal_or_list(pred_type);
if (!res.ok()) {
if (_downgrade_without_index(res, need_remaining_after_evaluate)) {
@@ -771,8 +765,7 @@ Status
SegmentIterator::_apply_index_except_leafnode_of_andnode() {
_check_column_pred_all_push_down(column_name, true,
pred->type() ==
PredicateType::MATCH) &&
!pred->predicate_params()->marked_by_runtime_filter) {
- int32_t unique_id = _schema->unique_id(pred->column_id());
- _need_read_data_indices[unique_id] = false;
+ _need_read_data_indices[pred->column_id()] = false;
}
}
@@ -823,12 +816,10 @@ std::string
SegmentIterator::_gen_predicate_result_sign(ColumnPredicateInfo* pre
return pred_result_sign;
}
-bool SegmentIterator::_column_has_fulltext_index(int32_t unique_id) {
- bool has_fulltext_index =
- _inverted_index_iterators.count(unique_id) > 0 &&
- _inverted_index_iterators[unique_id] != nullptr &&
-
_inverted_index_iterators[unique_id]->get_inverted_index_reader_type() ==
- InvertedIndexReaderType::FULLTEXT;
+bool SegmentIterator::_column_has_fulltext_index(int32_t cid) {
+ bool has_fulltext_index = _inverted_index_iterators[cid] != nullptr &&
+
_inverted_index_iterators[cid]->get_inverted_index_reader_type() ==
+ InvertedIndexReaderType::FULLTEXT;
return has_fulltext_index;
}
@@ -852,11 +843,10 @@ Status
SegmentIterator::_apply_inverted_index_on_column_predicate(
if (!_check_apply_by_inverted_index(pred)) {
remaining_predicates.emplace_back(pred);
} else {
- int32_t unique_id = _schema->unique_id(pred->column_id());
- bool need_remaining_after_evaluate =
_column_has_fulltext_index(unique_id) &&
+ bool need_remaining_after_evaluate =
_column_has_fulltext_index(pred->column_id()) &&
PredicateTypeTraits::is_equal_or_list(pred->type());
roaring::Roaring bitmap = _row_bitmap;
- Status res = pred->evaluate(*_schema,
_inverted_index_iterators[unique_id].get(),
+ Status res = pred->evaluate(*_schema,
_inverted_index_iterators[pred->column_id()].get(),
num_rows(), &bitmap);
if (!res.ok()) {
if (_downgrade_without_index(res, need_remaining_after_evaluate)) {
@@ -891,7 +881,7 @@ Status
SegmentIterator::_apply_inverted_index_on_column_predicate(
if (_check_column_pred_all_push_down(column_name, false,
pred->type() ==
PredicateType::MATCH) &&
!pred->predicate_params()->marked_by_runtime_filter) {
- _need_read_data_indices[unique_id] = false;
+ _need_read_data_indices[pred->column_id()] = false;
}
}
return Status::OK();
@@ -901,8 +891,7 @@ Status
SegmentIterator::_apply_inverted_index_on_block_column_predicate(
ColumnId column_id, MutilColumnBlockPredicate* pred,
std::set<const ColumnPredicate*>& no_need_to_pass_column_predicate_set,
bool* continue_apply) {
- auto unique_id = _schema->unique_id(column_id);
- bool handle_by_fulltext = _column_has_fulltext_index(unique_id);
+ bool handle_by_fulltext = _column_has_fulltext_index(column_id);
std::set<const ColumnPredicate*> predicate_set {};
pred->get_all_column_predicate(predicate_set);
@@ -912,20 +901,19 @@ Status
SegmentIterator::_apply_inverted_index_on_block_column_predicate(
//2. There are multiple predicates for this column.
//3. All the predicates are range predicate.
//4. if it's under fulltext parser type, we need to skip inverted index
evaluate.
- if (_inverted_index_iterators.count(unique_id) > 0 &&
- _inverted_index_iterators[unique_id] != nullptr &&
predicate_set.size() > 1 &&
+ if (_inverted_index_iterators[column_id] != nullptr &&
predicate_set.size() > 1 &&
all_predicates_are_range_predicate(predicate_set) &&
!handle_by_fulltext) {
roaring::Roaring output_result = _row_bitmap;
std::string column_name = _schema->column(column_id)->name();
- auto res = pred->evaluate(column_name,
_inverted_index_iterators[unique_id].get(),
+ auto res = pred->evaluate(column_name,
_inverted_index_iterators[column_id].get(),
num_rows(), &output_result);
if (res.ok()) {
if (_check_column_pred_all_push_down(column_name) &&
!all_predicates_are_marked_by_runtime_filter(predicate_set)) {
- _need_read_data_indices[unique_id] = false;
+ _need_read_data_indices[column_id] = false;
}
no_need_to_pass_column_predicate_set.insert(predicate_set.begin(),
predicate_set.end());
_row_bitmap &= output_result;
@@ -957,7 +945,7 @@ bool SegmentIterator::_need_read_data(ColumnId cid) {
return true;
}
int32_t unique_id = _opts.tablet_schema->column(cid).unique_id();
- if (_need_read_data_indices.count(unique_id) > 0 &&
!_need_read_data_indices[unique_id] &&
+ if (_need_read_data_indices.count(cid) > 0 &&
!_need_read_data_indices[cid] &&
_output_columns.count(unique_id) < 1) {
VLOG_DEBUG << "SegmentIterator no need read data for column: "
<< _opts.tablet_schema->column_by_uid(unique_id).name();
@@ -1010,7 +998,7 @@ Status SegmentIterator::_init_return_column_iterators() {
for (auto cid : _schema->column_ids()) {
if (_schema->column(cid)->name() == BeConsts::ROWID_COL) {
- _column_iterators[_schema->column(cid)->unique_id()].reset(
+ _column_iterators[cid].reset(
new RowIdColumnIterator(_opts.tablet_id, _opts.rowset_id,
_segment->id()));
continue;
}
@@ -1027,10 +1015,9 @@ Status SegmentIterator::_init_return_column_iterators() {
tmp_is_pred_column[cid] = true;
}
- int32_t unique_id = _opts.tablet_schema->column(cid).unique_id();
- if (_column_iterators.count(unique_id) < 1) {
+ if (_column_iterators[cid] == nullptr) {
RETURN_IF_ERROR(_segment->new_column_iterator(_opts.tablet_schema->column(cid),
-
&_column_iterators[unique_id]));
+
&_column_iterators[cid]));
ColumnIteratorOptions iter_opts;
iter_opts.stats = _opts.stats;
iter_opts.use_page_cache = _opts.use_page_cache;
@@ -1039,7 +1026,7 @@ Status SegmentIterator::_init_return_column_iterators() {
// If the col is predicate column, then should read the last page
to check
// if the column is full dict encoding
iter_opts.is_predicate_column = tmp_is_pred_column[cid];
- RETURN_IF_ERROR(_column_iterators[unique_id]->init(iter_opts));
+ RETURN_IF_ERROR(_column_iterators[cid]->init(iter_opts));
}
}
return Status::OK();
@@ -1050,10 +1037,9 @@ Status SegmentIterator::_init_bitmap_index_iterators() {
return Status::OK();
}
for (auto cid : _schema->column_ids()) {
- int32_t unique_id = _opts.tablet_schema->column(cid).unique_id();
- if (_bitmap_index_iterators.count(unique_id) < 1) {
- RETURN_IF_ERROR(_segment->new_bitmap_index_iterator(
- _opts.tablet_schema->column(cid),
&_bitmap_index_iterators[unique_id]));
+ if (_bitmap_index_iterators[cid] == nullptr) {
+
RETURN_IF_ERROR(_segment->new_bitmap_index_iterator(_opts.tablet_schema->column(cid),
+
&_bitmap_index_iterators[cid]));
}
}
return Status::OK();
@@ -1064,11 +1050,10 @@ Status
SegmentIterator::_init_inverted_index_iterators() {
return Status::OK();
}
for (auto cid : _schema->column_ids()) {
- int32_t unique_id = _opts.tablet_schema->column(cid).unique_id();
- if (_inverted_index_iterators.count(unique_id) < 1) {
+ if (_inverted_index_iterators[cid] == nullptr) {
RETURN_IF_ERROR(_segment->new_inverted_index_iterator(
_opts.tablet_schema->column(cid),
_opts.tablet_schema->get_inverted_index(cid),
- _opts.stats, &_inverted_index_iterators[unique_id]));
+ _opts.stats, &_inverted_index_iterators[cid]));
}
}
return Status::OK();
@@ -1241,7 +1226,7 @@ Status SegmentIterator::_seek_columns(const
std::vector<ColumnId>& column_ids, r
if (!_need_read_data(cid)) {
continue;
}
-
RETURN_IF_ERROR(_column_iterators[_schema->unique_id(cid)]->seek_to_ordinal(pos));
+ RETURN_IF_ERROR(_column_iterators[cid]->seek_to_ordinal(pos));
}
return Status::OK();
}
@@ -1460,7 +1445,7 @@ bool
SegmentIterator::_can_evaluated_by_vectorized(ColumnPredicate* predicate) {
field_type == FieldType::OLAP_FIELD_TYPE_STRING) {
return config::enable_low_cardinality_optimize &&
_opts.io_ctx.reader_type == ReaderType::READER_QUERY &&
-
_column_iterators[_schema->unique_id(cid)]->is_all_dict_encoding();
+ _column_iterators[cid]->is_all_dict_encoding();
} else if (field_type == FieldType::OLAP_FIELD_TYPE_DECIMAL) {
return false;
}
@@ -1519,7 +1504,7 @@ Status SegmentIterator::_read_columns(const
std::vector<ColumnId>& column_ids,
if (_prune_column(cid, column, true, rows_read)) {
continue;
}
-
RETURN_IF_ERROR(_column_iterators[_schema->unique_id(cid)]->next_batch(&rows_read,
column));
+ RETURN_IF_ERROR(_column_iterators[cid]->next_batch(&rows_read,
column));
if (nrows != rows_read) {
return Status::Error<ErrorCode::INTERNAL_ERROR>("nrows({}) !=
rows_read({})", nrows,
rows_read);
@@ -1733,8 +1718,8 @@ Status
SegmentIterator::_read_columns_by_rowids(std::vector<ColumnId>& read_colu
if (_prune_column(cid, (*mutable_columns)[cid], true, select_size)) {
continue;
}
-
RETURN_IF_ERROR(_column_iterators[_schema->unique_id(cid)]->read_by_rowids(
- rowids.data(), select_size, _current_return_columns[cid]));
+ RETURN_IF_ERROR(_column_iterators[cid]->read_by_rowids(rowids.data(),
select_size,
+
_current_return_columns[cid]));
}
return Status::OK();
diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.h
b/be/src/olap/rowset/segment_v2/segment_iterator.h
index a25cf1b5bf..a44d0b6eba 100644
--- a/be/src/olap/rowset/segment_v2/segment_iterator.h
+++ b/be/src/olap/rowset/segment_v2/segment_iterator.h
@@ -187,7 +187,7 @@ private:
ColumnPredicate* pred, roaring::Roaring* output_result);
[[nodiscard]] Status _apply_inverted_index_except_leafnode_of_andnode(
ColumnPredicate* pred, roaring::Roaring* output_result);
- bool _column_has_fulltext_index(int32_t unique_id);
+ bool _column_has_fulltext_index(int32_t cid);
bool _downgrade_without_index(Status res, bool need_remaining = false);
inline bool _inverted_index_not_support_pred_type(const PredicateType&
type);
bool _can_filter_by_preds_except_leafnode_of_andnode();
@@ -331,12 +331,10 @@ private:
std::shared_ptr<Segment> _segment;
SchemaSPtr _schema;
- // _column_iterators_map.size() == _schema.num_columns()
- // map<unique_id, ColumnIterator*>
_column_iterators_map/_bitmap_index_iterators;
- // can use _schema get unique_id by cid
- std::map<int32_t, std::unique_ptr<ColumnIterator>> _column_iterators;
- std::map<int32_t, std::unique_ptr<BitmapIndexIterator>>
_bitmap_index_iterators;
- std::map<int32_t, std::unique_ptr<InvertedIndexIterator>>
_inverted_index_iterators;
+ // vector idx -> column iterator
+ std::vector<std::unique_ptr<ColumnIterator>> _column_iterators;
+ std::vector<std::unique_ptr<BitmapIndexIterator>> _bitmap_index_iterators;
+ std::vector<std::unique_ptr<InvertedIndexIterator>>
_inverted_index_iterators;
// after init(), `_row_bitmap` contains all rowid to scan
roaring::Roaring _row_bitmap;
// "column_name+operator+value-> <in_compound_query, rowid_result>
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]