This is an automated email from the ASF dual-hosted git repository.
lihaopeng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 62f765b7f5 [improvement](scan) speed up inserting strings into
ColumnString (#13397)
62f765b7f5 is described below
commit 62f765b7f530c5af2bb292dab820e0b8077b64d1
Author: Jerry Hu <[email protected]>
AuthorDate: Wed Nov 2 22:19:02 2022 +0800
[improvement](scan) speed up inserting strings into ColumnString (#13397)
---
be/src/olap/rowset/segment_v2/binary_plain_page.h | 14 ++-
be/src/vec/columns/column.h | 12 ++-
be/src/vec/columns/column_complex.h | 11 +++
be/src/vec/columns/column_dictionary.h | 17 ++--
be/src/vec/columns/column_jsonb.h | 26 ++++++
be/src/vec/columns/column_nullable.h | 9 ++
be/src/vec/columns/column_string.h | 101 +++++++++++-----------
be/src/vec/columns/predicate_column.h | 30 ++++++-
8 files changed, 150 insertions(+), 70 deletions(-)
diff --git a/be/src/olap/rowset/segment_v2/binary_plain_page.h
b/be/src/olap/rowset/segment_v2/binary_plain_page.h
index 659df55fee..96cfc392a5 100644
--- a/be/src/olap/rowset/segment_v2/binary_plain_page.h
+++ b/be/src/olap/rowset/segment_v2/binary_plain_page.h
@@ -248,16 +248,14 @@ public:
return Status::OK();
}
const size_t max_fetch = std::min(*n, static_cast<size_t>(_num_elems -
_cur_idx));
- uint32_t len_array[max_fetch];
- uint32_t start_offset_array[max_fetch];
uint32_t last_offset = guarded_offset(_cur_idx);
+ uint32_t offsets[max_fetch + 1];
+ offsets[0] = last_offset;
for (int i = 0; i < max_fetch - 1; i++, _cur_idx++) {
const uint32_t start_offset = last_offset;
last_offset = guarded_offset(_cur_idx + 1);
- uint32_t len = last_offset - start_offset;
- len_array[i] = len;
- start_offset_array[i] = start_offset;
+ offsets[i + 1] = last_offset;
if constexpr (Type == OLAP_FIELD_TYPE_OBJECT) {
if (_options.need_check_bitmap) {
RETURN_IF_ERROR(BitmapTypeCode::validate(*(_data.data +
start_offset)));
@@ -265,15 +263,13 @@ public:
}
}
_cur_idx++;
- len_array[max_fetch - 1] = offset(_cur_idx) - last_offset;
- start_offset_array[max_fetch - 1] = last_offset;
+ offsets[max_fetch] = offset(_cur_idx);
if constexpr (Type == OLAP_FIELD_TYPE_OBJECT) {
if (_options.need_check_bitmap) {
RETURN_IF_ERROR(BitmapTypeCode::validate(*(_data.data +
last_offset)));
}
}
- dst->insert_many_binary_data(_data.mutable_data(), len_array,
start_offset_array,
- max_fetch);
+ dst->insert_many_continuous_binary_data(_data.data, offsets,
max_fetch);
*n = max_fetch;
return Status::OK();
diff --git a/be/src/vec/columns/column.h b/be/src/vec/columns/column.h
index ec794bf747..4b89a002af 100644
--- a/be/src/vec/columns/column.h
+++ b/be/src/vec/columns/column.h
@@ -243,6 +243,14 @@ public:
LOG(FATAL) << "Method insert_many_binary_data is not supported for "
<< get_name();
}
+ /// Insert `num` binary values that sit back to back in one continuous buffer.
+ /// As the overrides in this change use it, value i occupies the bytes
+ /// [data + offsets[i], data + offsets[i + 1]), so `offsets` is read up to
+ /// index `num`. Implementations may copy the whole byte range in one go.
+ /// This base version only logs a fatal "not supported" message.
+ virtual void insert_many_continuous_binary_data(const char* data, const
uint32_t* offsets,
+ const size_t num) {
+ LOG(FATAL) << "Method insert_many_continuous_binary_data is not
supported for "
+ << get_name();
+ }
+
virtual void insert_many_strings(const StringRef* strings, size_t num) {
LOG(FATAL) << "Method insert_many_binary_data is not supported for "
<< get_name();
}
@@ -271,10 +279,6 @@ public:
}
}
- virtual void insert_elements(void* elements, size_t num) {
- LOG(FATAL) << "Method insert_elements is not supported for " <<
get_name();
- }
-
/** Removes last n elements.
* Is used to support exception-safety of several operations.
* For example, sometimes insertion should be reverted if we catch an
exception during operation processing.
diff --git a/be/src/vec/columns/column_complex.h
b/be/src/vec/columns/column_complex.h
index 260729f736..ec15c65df7 100644
--- a/be/src/vec/columns/column_complex.h
+++ b/be/src/vec/columns/column_complex.h
@@ -79,6 +79,17 @@ public:
}
}
+    /// Appends `num` values stored back to back in `data`: value i spans
+    /// [offsets[i], offsets[i + 1]) and is handed to insert_binary_data().
+    void insert_many_continuous_binary_data(const char* data, const uint32_t* offsets,
+                                            const size_t num) override {
+        if (UNLIKELY(num == 0)) {
+            return;
+        }
+
+        // Walk the boundary array pairwise: bound[0] is where the current value
+        // starts, bound[1] is where it ends.
+        const uint32_t* bound = offsets;
+        for (size_t remaining = num; remaining != 0; --remaining, ++bound) {
+            const uint32_t begin = bound[0];
+            const uint32_t end = bound[1];
+            insert_binary_data(data + begin, end - begin);
+        }
+    }
+
void insert_many_binary_data(char* data_array, uint32_t* len_array,
uint32_t* start_offset_array, size_t num)
override {
for (size_t i = 0; i < num; i++) {
diff --git a/be/src/vec/columns/column_dictionary.h
b/be/src/vec/columns/column_dictionary.h
index e226976f0e..b8976e77c5 100644
--- a/be/src/vec/columns/column_dictionary.h
+++ b/be/src/vec/columns/column_dictionary.h
@@ -192,14 +192,17 @@ public:
Status filter_by_selector(const uint16_t* sel, size_t sel_size, IColumn*
col_ptr) override {
auto* res_col = reinterpret_cast<vectorized::ColumnString*>(col_ptr);
- res_col->get_offsets().reserve(sel_size);
- res_col->get_chars().reserve(_dict.avg_str_len() * sel_size);
- for (size_t i = 0; i < sel_size; i++) {
- uint16_t n = sel[i];
- auto& code = reinterpret_cast<T&>(_codes[n]);
- auto value = _dict.get_value(code);
- res_col->insert_data_without_reserve(value.ptr, value.len);
+ // Resolve every selected row to its dictionary string first and total the
+ // byte lengths, so the result column can be reserved once before the bulk
+ // insert.
+ // NOTE(review): `strings` is a variable-length array sized by sel_size (a
+ // compiler extension in C++); it lives on the stack and is empty when
+ // sel_size is 0.
+ StringRef strings[sel_size];
+ size_t length = 0;
+ for (size_t i = 0; i != sel_size; ++i) {
+ auto& value = _dict.get_value(_codes[sel[i]]);
+ strings[i].data = value.ptr;
+ strings[i].size = value.len;
+ length += value.len;
}
+ // Reserve on top of what res_col already holds; the insert call below does
+ // no reserving of its own.
+ res_col->get_offsets().reserve(sel_size +
res_col->get_offsets().size());
+ res_col->get_chars().reserve(length + res_col->get_chars().size());
+ res_col->insert_many_strings_without_reserve(strings, sel_size);
return Status::OK();
}
diff --git a/be/src/vec/columns/column_jsonb.h
b/be/src/vec/columns/column_jsonb.h
index 58789d0783..66e17d0e68 100644
--- a/be/src/vec/columns/column_jsonb.h
+++ b/be/src/vec/columns/column_jsonb.h
@@ -145,6 +145,32 @@ public:
offsets.push_back(new_size);
}
+    /// Appends `num` values stored back to back in `data`: value i spans
+    /// [offsets_[i], offsets_[i + 1]). Each value is copied into `chars`
+    /// followed by a single 0 byte, and the position just past that byte is
+    /// pushed onto `offsets`.
+    void insert_many_continuous_binary_data(const char* data, const uint32_t* offsets_,
+                                            const size_t num) override {
+        if (UNLIKELY(num == 0)) {
+            return;
+        }
+
+        const size_t old_size = chars.size();
+        // Payload bytes plus one terminator byte per value.
+        const size_t new_size = offsets_[num] - offsets_[0] + num * sizeof(char);
+        chars.resize(new_size + old_size);
+
+        auto* const data_ptr = chars.data();
+        size_t offset = old_size;
+        for (size_t row = 0; row < num; ++row) {
+            const uint32_t begin = offsets_[row];
+            const uint32_t len = offsets_[row + 1] - begin;
+            if (LIKELY(len != 0)) {
+                memcpy(data_ptr + offset, data + begin, len);
+            }
+            offset += len;
+            // Terminator byte, then record where the next value will start.
+            data_ptr[offset++] = 0;
+            offsets.push_back(offset);
+        }
+        DCHECK(offset == chars.size());
+    }
+
void insert_many_binary_data(char* data_array, uint32_t* len_array,
uint32_t* start_offset_array, size_t num)
override {
size_t new_size = 0;
diff --git a/be/src/vec/columns/column_nullable.h
b/be/src/vec/columns/column_nullable.h
index 95c11d447b..acc0ba611c 100644
--- a/be/src/vec/columns/column_nullable.h
+++ b/be/src/vec/columns/column_nullable.h
@@ -124,6 +124,15 @@ public:
dict_num);
}
+    /// Appends `num` values from a continuous buffer: the null map is extended
+    /// with `num` zero flags (presumably "not null" - confirm against the null
+    /// map convention), then the buffer is forwarded to the nested column.
+    void insert_many_continuous_binary_data(const char* data, const uint32_t* offsets,
+                                            const size_t num) override {
+        if (UNLIKELY(num == 0)) {
+            return;
+        }
+        auto&& null_map = get_null_map_column();
+        null_map.fill(0, num);
+        auto&& nested = get_nested_column();
+        nested.insert_many_continuous_binary_data(data, offsets, num);
+    }
+
void insert_many_binary_data(char* data_array, uint32_t* len_array,
uint32_t* start_offset_array, size_t num)
override {
get_null_map_column().fill(0, num);
diff --git a/be/src/vec/columns/column_string.h
b/be/src/vec/columns/column_string.h
index cd70e228b6..26a734fb08 100644
--- a/be/src/vec/columns/column_string.h
+++ b/be/src/vec/columns/column_string.h
@@ -164,6 +164,58 @@ public:
offsets.push_back_without_reserve(new_size);
}
+    /// Appends `num` strings to this column without growing the buffers itself.
+    /// The caller must already have reserved `chars` for the total byte length
+    /// of `strings` and `offsets` for `num` more entries, both on top of what
+    /// the column currently holds.
+    /// Neighbouring strings that are adjacent in memory are copied with a
+    /// single memcpy.
+    void insert_many_strings_without_reserve(const StringRef* strings, size_t num) {
+        // Nothing to append; also avoids reading strings[0] from an empty array.
+        if (UNLIKELY(num == 0)) {
+            return;
+        }
+
+        size_t offset = chars.size();
+        // New bytes go after the existing payload, not at the start of `chars`,
+        // so rows already stored in the column are left intact and the bytes
+        // line up with the offsets pushed below.
+        Char* data = chars.data() + offset;
+        size_t length = 0;
+
+        // [ptr, ptr + length) is the current run of source bytes that are
+        // contiguous in memory and still waiting to be copied.
+        const char* ptr = strings[0].data;
+        for (size_t i = 0; i != num; i++) {
+            uint32_t len = strings[i].size;
+            length += len;
+            offset += len;
+            offsets.push_back(offset);
+
+            // Keep extending the run while the next string starts exactly where
+            // this one ends.
+            if (i != num - 1 && strings[i].data + len == strings[i + 1].data) {
+                continue;
+            }
+            // An empty run has nothing to copy (and its pointer may be null).
+            if (length != 0) {
+                memcpy(data, ptr, length);
+                data += length;
+            }
+            if (LIKELY(i != num - 1)) {
+                ptr = strings[i + 1].data;
+                length = 0;
+            }
+        }
+        // Publish the bytes written above as part of the column.
+        chars.resize(offset);
+    }
+
+    /// Appends `num` values stored back to back in `data`: value i spans
+    /// [offsets_[i], offsets_[i + 1]). The whole byte range is copied into
+    /// `chars` with one memcpy, then the incoming boundaries are rebased onto
+    /// the column's previous end and stored in `offsets`.
+    void insert_many_continuous_binary_data(const char* data, const uint32_t* offsets_,
+                                            const size_t num) override {
+        static_assert(sizeof(offsets_[0]) == sizeof(*offsets.data()));
+        if (UNLIKELY(num == 0)) {
+            return;
+        }
+
+        const auto begin_offset = offsets_[0];
+        const auto old_size = chars.size();
+        const auto payload_bytes = offsets_[num] - begin_offset;
+        if (LIKELY(payload_bytes > 0)) {
+            chars.resize(payload_bytes + old_size);
+            memcpy(chars.data() + old_size, data + begin_offset, payload_bytes);
+        }
+
+        // Read the previous end offset before `offsets` is grown.
+        auto tail_offset = offsets.back();
+        DCHECK(tail_offset == old_size);
+        const auto old_rows = offsets.size();
+        offsets.resize(old_rows + num);
+
+        // Each new entry is the incoming end boundary shifted so that
+        // begin_offset maps onto the column's previous end.
+        auto* out = &offsets[old_rows];
+        const uint32_t* in = offsets_ + 1;
+        for (const uint32_t* const in_end = in + num; in != in_end; ++in, ++out) {
+            *out = tail_offset + *in - begin_offset;
+        }
+        DCHECK(chars.size() == offsets.back());
+    }
+
void insert_many_binary_data(char* data_array, uint32_t* len_array,
uint32_t* start_offset_array, size_t num)
override {
size_t new_size = 0;
@@ -207,55 +259,6 @@ public:
}
}
- void insert_many_continuous_strings(const StringRef* strings, size_t num) {
- DCHECK_NE(num, 0);
- offsets.reserve(offsets.size() + num);
- std::vector<const char*> start_points(1);
- auto& head = strings[0];
- start_points[0] = head.data;
- size_t new_size = head.size;
- const char* cursor = head.data + new_size;
- std::vector<const char*> end_points;
-
- const size_t old_size = chars.size();
- size_t offset = old_size;
- offset += new_size;
- offsets.push_back(offset);
- if (num == 1) {
- end_points.push_back(cursor);
- } else {
- for (size_t i = 1; i < num; i++) {
- auto& str = strings[i];
- if (cursor != str.data) {
- end_points.push_back(cursor);
- start_points.push_back(str.data);
- cursor = str.data;
- }
- size_t sz = str.size;
- offset += sz;
- new_size += sz;
- cursor += sz;
- offsets.push_back_without_reserve(offset);
- }
- end_points.push_back(cursor);
- }
- DCHECK_EQ(end_points.size(), start_points.size());
-
- chars.resize(old_size + new_size);
-
- size_t num_range = start_points.size();
- Char* data = chars.data();
-
- offset = old_size;
- for (size_t i = 0; i < num_range; i++) {
- uint32_t len = end_points[i] - start_points[i];
- if (len) {
- memcpy(data + offset, start_points[i], len);
- offset += len;
- }
- }
- }
-
void insert_many_dict_data(const int32_t* data_array, size_t start_index,
const StringRef* dict,
size_t num, uint32_t /*dict_num*/) override {
size_t offset_size = offsets.size();
diff --git a/be/src/vec/columns/predicate_column.h
b/be/src/vec/columns/predicate_column.h
index 128b33bf3e..23c165adcc 100644
--- a/be/src/vec/columns/predicate_column.h
+++ b/be/src/vec/columns/predicate_column.h
@@ -91,13 +91,17 @@ private:
void insert_string_to_res_column(const uint16_t* sel, size_t sel_size,
vectorized::ColumnString* res_ptr) {
StringRef refs[sel_size];
+ // Total byte length of the selected strings, used to reserve res_ptr's
+ // character buffer once before the bulk insert.
+ size_t length = 0;
for (size_t i = 0; i < sel_size; i++) {
uint16_t n = sel[i];
auto& sv = reinterpret_cast<StringValue&>(data[n]);
refs[i].data = sv.ptr;
refs[i].size = sv.len;
+ length += sv.len;
}
- res_ptr->insert_many_continuous_strings(refs, sel_size);
+ // Reserve on top of what res_ptr already holds; the insert call below does
+ // no reserving of its own.
+ res_ptr->get_offsets().reserve(sel_size +
res_ptr->get_offsets().size());
+ res_ptr->get_chars().reserve(length + res_ptr->get_chars().size());
+ res_ptr->insert_many_strings_without_reserve(refs, sel_size);
}
void insert_decimal_to_res_column(const uint16_t* sel, size_t sel_size,
@@ -256,6 +260,30 @@ public:
}
}
+    /// Appends `num` values stored back to back in `data_`: value i spans
+    /// [offsets[i], offsets[i + 1]). Only does work when T is StringValue: the
+    /// whole byte range is copied into one allocation from `_pool` (created on
+    /// first use) and each new element points into that copy.
+    void insert_many_continuous_binary_data(const char* data_, const uint32_t* offsets,
+                                            const size_t num) override {
+        if (UNLIKELY(num == 0)) {
+            return;
+        }
+        if constexpr (std::is_same_v<T, StringValue>) {
+            if (!_pool) {
+                _pool.reset(new MemPool());
+            }
+
+            const uint32_t first_offset = offsets[0];
+            const auto total_mem_size = offsets[num] - first_offset;
+            char* destination = reinterpret_cast<char*>(_pool->allocate(total_mem_size));
+            memcpy(destination, data_ + first_offset, total_mem_size);
+
+            const size_t org_elem_num = data.size();
+            data.resize(org_elem_num + num);
+            auto* data_ptr = &data[org_elem_num];
+
+            // Point each new element at its slice of the copied bytes.
+            for (size_t row = 0; row < num; ++row) {
+                auto& slot = data_ptr[row];
+                const uint32_t begin = offsets[row];
+                slot.ptr = destination + begin - first_offset;
+                slot.len = offsets[row + 1] - begin;
+            }
+            DCHECK(data_ptr[num - 1].ptr + data_ptr[num - 1].len == destination + total_mem_size);
+        }
+    }
+
void insert_many_binary_data(char* data_array, uint32_t* len_array,
uint32_t* start_offset_array, size_t num)
override {
if (num == 0) {
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]