This is an automated email from the ASF dual-hosted git repository.
gabriellee pushed a commit to branch opt_perf
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/opt_perf by this push:
new ca59a30738 [Improvement](string) Optimize scanning for string (#12945)
ca59a30738 is described below
commit ca59a30738bac95f8fc29d430e119d3293f93482
Author: Gabriel <[email protected]>
AuthorDate: Sat Sep 24 21:47:22 2022 +0800
[Improvement](string) Optimize scanning for string (#12945)
---
be/src/olap/rowset/segment_v2/binary_plain_page.h | 27 +++++++++---
be/src/olap/rowset/segment_v2/segment_iterator.cpp | 13 ++++--
be/src/vec/columns/column_string.h | 49 ++++++++++++++++++++++
be/src/vec/columns/predicate_column.h | 2 +-
4 files changed, 81 insertions(+), 10 deletions(-)
diff --git a/be/src/olap/rowset/segment_v2/binary_plain_page.h
b/be/src/olap/rowset/segment_v2/binary_plain_page.h
index 1242fd9b75..659df55fee 100644
--- a/be/src/olap/rowset/segment_v2/binary_plain_page.h
+++ b/be/src/olap/rowset/segment_v2/binary_plain_page.h
@@ -248,12 +248,14 @@ public:
return Status::OK();
}
const size_t max_fetch = std::min(*n, static_cast<size_t>(_num_elems -
_cur_idx));
-
uint32_t len_array[max_fetch];
uint32_t start_offset_array[max_fetch];
- for (int i = 0; i < max_fetch; i++, _cur_idx++) {
- const uint32_t start_offset = offset(_cur_idx);
- uint32_t len = offset(_cur_idx + 1) - start_offset;
+
+ uint32_t last_offset = guarded_offset(_cur_idx);
+ for (int i = 0; i < max_fetch - 1; i++, _cur_idx++) {
+ const uint32_t start_offset = last_offset;
+ last_offset = guarded_offset(_cur_idx + 1);
+ uint32_t len = last_offset - start_offset;
len_array[i] = len;
start_offset_array[i] = start_offset;
if constexpr (Type == OLAP_FIELD_TYPE_OBJECT) {
@@ -262,6 +264,14 @@ public:
}
}
}
+ _cur_idx++;
+ len_array[max_fetch - 1] = offset(_cur_idx) - last_offset;
+ start_offset_array[max_fetch - 1] = last_offset;
+ if constexpr (Type == OLAP_FIELD_TYPE_OBJECT) {
+ if (_options.need_check_bitmap) {
+ RETURN_IF_ERROR(BitmapTypeCode::validate(*(_data.data +
last_offset)));
+ }
+ }
dst->insert_many_binary_data(_data.mutable_data(), len_array,
start_offset_array,
max_fetch);
@@ -340,13 +350,20 @@ public:
}
private:
+ static constexpr size_t SIZE_OF_INT32 = sizeof(uint32_t);
// Return the offset within '_data' where the string value with index
'idx' can be found.
uint32_t offset(size_t idx) const {
if (idx >= _num_elems) { // no stored entry for idx at or past the element count: fall back to _offsets_pos, the position where the offset table itself begins
return _offsets_pos;
}
const uint8_t* p =
- reinterpret_cast<const uint8_t*>(&_data[_offsets_pos + idx *
sizeof(uint32_t)]);
+ reinterpret_cast<const uint8_t*>(&_data[_offsets_pos + idx *
SIZE_OF_INT32]);
+ return decode_fixed32_le(p); // entry idx is the 4-byte value at _offsets_pos + idx * SIZE_OF_INT32, decoded by decode_fixed32_le
+ }
+
+ uint32_t guarded_offset(size_t idx) const { // Same table lookup as offset(), minus the `idx >= _num_elems` fallback. NOTE(review): despite the name, nothing is checked here - presumably the caller is expected to guarantee idx < _num_elems; confirm at each call site.
+ const uint8_t* p =
+ reinterpret_cast<const uint8_t*>(&_data[_offsets_pos + idx *
SIZE_OF_INT32]);
return decode_fixed32_le(p);
}
diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp
b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
index 98f2cfae27..e6435e8be1 100644
--- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp
+++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
@@ -1150,8 +1150,11 @@ Status SegmentIterator::next_batch(vectorized::Block*
block) {
}
if (!_lazy_materialization_read) {
- Status ret = _output_column_by_sel_idx(block,
_first_read_column_ids, sel_rowid_idx,
- selected_size);
+ Status ret = Status::OK();
+ if (selected_size > 0) {
+ ret = _output_column_by_sel_idx(block, _first_read_column_ids,
sel_rowid_idx,
+ selected_size);
+ }
if (!ret.ok()) {
return ret;
}
@@ -1176,8 +1179,10 @@ Status SegmentIterator::next_batch(vectorized::Block*
block) {
// when lazy materialization enables, _first_read_column_ids =
distinct(_short_cir_pred_column_ids + _vec_pred_column_ids)
// see _vec_init_lazy_materialization
// todo(wb) need to tell input columnids from output columnids
- RETURN_IF_ERROR(_output_column_by_sel_idx(block,
_first_read_column_ids, sel_rowid_idx,
- selected_size));
+ if (selected_size > 0) {
+ RETURN_IF_ERROR(_output_column_by_sel_idx(block,
_first_read_column_ids, sel_rowid_idx,
+ selected_size));
+ }
}
// shrink char_type suffix zero data
diff --git a/be/src/vec/columns/column_string.h
b/be/src/vec/columns/column_string.h
index 469bbdc6df..4dbaab3db4 100644
--- a/be/src/vec/columns/column_string.h
+++ b/be/src/vec/columns/column_string.h
@@ -207,6 +207,55 @@ public:
}
}
+ void insert_many_continuous_strings(const StringRef* strings, size_t num) { // Appends `num` strings to this column; source strings that lie back-to-back in memory are merged into ranges so each range is copied with a single memcpy.
+ DCHECK_NE(num, 0); // strings[0] is read unconditionally below, so an empty batch is not supported
+ offsets.reserve(offsets.size() + num); // room for every new offset up front - presumably what makes push_back_without_reserve below safe; confirm
+ std::vector<const char*> start_points(1); // first byte of each contiguous source range
+ auto& head = strings[0];
+ start_points[0] = head.data;
+ size_t new_size = head.size; // total number of bytes that will be appended to chars
+ const char* cursor = head.data + new_size; // one past the end of the range currently being extended
+ std::vector<const char*> end_points; // one-past-last byte of each range, parallel to start_points
+ // Pass 1: record the new end offsets and discover the contiguous source ranges.
+ const size_t old_size = chars.size();
+ size_t offset = old_size; // running end position inside chars
+ offset += new_size;
+ offsets.push_back(offset);
+ if (num == 1) {
+ end_points.push_back(cursor); // a single string forms a single range
+ } else {
+ for (size_t i = 1; i < num; i++) {
+ auto& str = strings[i];
+ if (cursor != str.data) { // str does not start where the previous string ended: close the current range and open a new one
+ end_points.push_back(cursor);
+ start_points.push_back(str.data);
+ cursor = str.data;
+ }
+ size_t sz = str.size;
+ offset += sz;
+ new_size += sz;
+ cursor += sz;
+ offsets.push_back_without_reserve(offset);
+ }
+ end_points.push_back(cursor); // close the last open range
+ }
+ DCHECK_EQ(end_points.size(), start_points.size());
+ // Pass 2: grow chars once, then copy range by range.
+ chars.resize(old_size + new_size);
+
+ size_t num_range = start_points.size();
+ Char* data = chars.data();
+
+ offset = old_size; // reused as the write position inside chars
+ for (size_t i = 0; i < num_range; i++) {
+ uint32_t len = end_points[i] - start_points[i]; // NOTE(review): the pointer difference is narrowed to 32 bits - confirm a merged range can never reach 4 GiB
+ if (len) {
+ memcpy(data + offset, start_points[i], len);
+ offset += len;
+ }
+ }
+ }
+
void insert_many_dict_data(const int32_t* data_array, size_t start_index,
const StringRef* dict,
size_t num, uint32_t /*dict_num*/) override {
size_t offset_size = offsets.size();
diff --git a/be/src/vec/columns/predicate_column.h
b/be/src/vec/columns/predicate_column.h
index d5ad52b6ac..01a90c9eb9 100644
--- a/be/src/vec/columns/predicate_column.h
+++ b/be/src/vec/columns/predicate_column.h
@@ -97,7 +97,7 @@ private:
refs[i].data = sv.ptr;
refs[i].size = sv.len;
}
- res_ptr->insert_many_strings(refs, sel_size);
+ res_ptr->insert_many_continuous_strings(refs, sel_size);
}
void insert_decimal_to_res_column(const uint16_t* sel, size_t sel_size,
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]