This is an automated email from the ASF dual-hosted git repository.
gabriellee pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 3aa6d044df1 [refactor](column) Simplify column functions (#44272)
3aa6d044df1 is described below
commit 3aa6d044df16bf93570a6586c91e3fb986406aee
Author: Gabriel <[email protected]>
AuthorDate: Thu Nov 21 10:18:58 2024 +0800
[refactor](column) Simplify column functions (#44272)
`insert_many_binary_data` has the same logics as `insert_many_strings`.
This PR unify those 2 functions.
---
be/src/olap/rowset/segment_v2/binary_plain_page.h | 19 +++++++-----------
be/src/olap/rowset/segment_v2/segment_iterator.cpp | 6 ++++--
be/src/vec/columns/column.h | 6 ------
be/src/vec/columns/column_complex.h | 5 ++---
be/src/vec/columns/column_fixed_length_object.h | 7 ++-----
be/src/vec/columns/column_nullable.cpp | 11 ++++++++---
be/src/vec/columns/column_nullable.h | 6 ------
be/src/vec/columns/column_object.h | 6 ------
be/src/vec/columns/column_string.h | 23 ----------------------
be/src/vec/columns/predicate_column.h | 19 ++++++++----------
10 files changed, 31 insertions(+), 77 deletions(-)
diff --git a/be/src/olap/rowset/segment_v2/binary_plain_page.h
b/be/src/olap/rowset/segment_v2/binary_plain_page.h
index e043164ef28..3b3c6ad3fea 100644
--- a/be/src/olap/rowset/segment_v2/binary_plain_page.h
+++ b/be/src/olap/rowset/segment_v2/binary_plain_page.h
@@ -266,8 +266,7 @@ public:
auto total = *n;
size_t read_count = 0;
- _len_array.resize(total);
- _start_offset_array.resize(total);
+ _binary_data.resize(total);
for (size_t i = 0; i < total; ++i) {
ordinal_t ord = rowids[i] - page_first_ordinal;
if (UNLIKELY(ord >= _num_elems)) {
@@ -275,14 +274,13 @@ public:
}
const uint32_t start_offset = offset(ord);
- _start_offset_array[read_count] = start_offset;
- _len_array[read_count] = offset(ord + 1) - start_offset;
+ _binary_data[read_count].data = _data.mutable_data() +
start_offset;
+ _binary_data[read_count].size = offset(ord + 1) - start_offset;
read_count++;
}
if (LIKELY(read_count > 0)) {
- dst->insert_many_binary_data(_data.mutable_data(),
_len_array.data(),
- _start_offset_array.data(),
read_count);
+ dst->insert_many_strings(_binary_data.data(), read_count);
}
*n = read_count;
@@ -342,13 +340,11 @@ private:
if (idx >= _num_elems) {
return _offsets_pos;
}
- const uint8_t* p =
- reinterpret_cast<const uint8_t*>(&_data[_offsets_pos + idx *
SIZE_OF_INT32]);
- return decode_fixed32_le(p);
+ return guarded_offset(idx);
}
uint32_t guarded_offset(size_t idx) const {
- const uint8_t* p =
+ const auto* p =
reinterpret_cast<const uint8_t*>(&_data[_offsets_pos + idx *
SIZE_OF_INT32]);
return decode_fixed32_le(p);
}
@@ -361,8 +357,7 @@ private:
uint32_t _offsets_pos;
std::vector<uint32_t> _offsets;
- std::vector<uint32_t> _len_array;
- std::vector<uint32_t> _start_offset_array;
+ std::vector<StringRef> _binary_data;
// Index of the currently seeked element in the page.
uint32_t _cur_idx;
diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp
b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
index 2c7aa5a0ce1..96b0bea2ae8 100644
--- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp
+++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
@@ -2264,8 +2264,10 @@ Status
SegmentIterator::_next_batch_internal(vectorized::Block* block) {
size_t rows = block->rows();
for (const auto& entry : *block) {
if (entry.column->size() != rows) {
- throw doris::Exception(ErrorCode::INTERNAL_ERROR, "unmatched size
{}, expected {}",
- entry.column->size(), rows);
+ throw doris::Exception(ErrorCode::INTERNAL_ERROR,
+ "unmatched size {}, expected {}, column:
{}, type: {}",
+ entry.column->size(), rows,
entry.column->get_name(),
+ entry.type->get_name());
}
}
#endif
diff --git a/be/src/vec/columns/column.h b/be/src/vec/columns/column.h
index ce155aefad2..e92c246741f 100644
--- a/be/src/vec/columns/column.h
+++ b/be/src/vec/columns/column.h
@@ -249,12 +249,6 @@ public:
"Method insert_many_dict_data is not supported
for " + get_name());
}
- virtual void insert_many_binary_data(char* data_array, uint32_t* len_array,
- uint32_t* start_offset_array, size_t
num) {
- throw doris::Exception(ErrorCode::NOT_IMPLEMENTED_ERROR,
- "Method insert_many_binary_data is not
supported for " + get_name());
- }
-
/// Insert binary data into column from a continuous buffer, the
implementation maybe copy all binary data
/// in one single time.
virtual void insert_many_continuous_binary_data(const char* data, const
uint32_t* offsets,
diff --git a/be/src/vec/columns/column_complex.h
b/be/src/vec/columns/column_complex.h
index 1ff074b1129..24b6b7ddbd7 100644
--- a/be/src/vec/columns/column_complex.h
+++ b/be/src/vec/columns/column_complex.h
@@ -98,10 +98,9 @@ public:
}
}
- void insert_many_binary_data(char* data_array, uint32_t* len_array,
- uint32_t* start_offset_array, size_t num)
override {
+ void insert_many_strings(const StringRef* strings, size_t num) override {
for (size_t i = 0; i < num; i++) {
- insert_binary_data(data_array + start_offset_array[i],
len_array[i]);
+ insert_binary_data(strings[i].data, strings[i].size);
}
}
diff --git a/be/src/vec/columns/column_fixed_length_object.h
b/be/src/vec/columns/column_fixed_length_object.h
index ea3437dbb01..947d859ce23 100644
--- a/be/src/vec/columns/column_fixed_length_object.h
+++ b/be/src/vec/columns/column_fixed_length_object.h
@@ -284,8 +284,7 @@ public:
memcpy(_data.data() + old_size, data + begin_offset, total_mem_size);
}
- void insert_many_binary_data(char* data_array, uint32_t* len_array,
- uint32_t* start_offset_array, size_t num)
override {
+ void insert_many_strings(const StringRef* strings, size_t num) override {
if (UNLIKELY(num == 0)) {
return;
}
@@ -294,10 +293,8 @@ public:
resize(old_count + num);
auto* dst = _data.data() + old_count * _item_size;
for (size_t i = 0; i < num; i++) {
- auto* src = data_array + start_offset_array[i];
- uint32_t len = len_array[i];
dst += i * _item_size;
- memcpy(dst, src, len);
+ memcpy(dst, strings[i].data, strings[i].size);
}
}
diff --git a/be/src/vec/columns/column_nullable.cpp
b/be/src/vec/columns/column_nullable.cpp
index 493aabdae31..5e34ad4d8d4 100644
--- a/be/src/vec/columns/column_nullable.cpp
+++ b/be/src/vec/columns/column_nullable.cpp
@@ -185,16 +185,21 @@ void ColumnNullable::insert_data(const char* pos, size_t
length) {
}
void ColumnNullable::insert_many_strings(const StringRef* strings, size_t num)
{
+ auto not_null_count = 0;
for (size_t i = 0; i != num; ++i) {
if (strings[i].data == nullptr) {
- nested_column->insert_default();
+ _push_false_to_nullmap(not_null_count);
+ not_null_count = 0;
get_null_map_data().push_back(1);
_has_null = true;
} else {
- nested_column->insert_data(strings[i].data, strings[i].size);
- _push_false_to_nullmap(1);
+ not_null_count++;
}
}
+ if (not_null_count) {
+ _push_false_to_nullmap(not_null_count);
+ }
+ nested_column->insert_many_strings(strings, num);
}
void ColumnNullable::insert_many_from(const IColumn& src, size_t position,
size_t length) {
diff --git a/be/src/vec/columns/column_nullable.h
b/be/src/vec/columns/column_nullable.h
index 0e5e104dce9..2b87aa982ca 100644
--- a/be/src/vec/columns/column_nullable.h
+++ b/be/src/vec/columns/column_nullable.h
@@ -245,12 +245,6 @@ public:
get_nested_column().insert_many_continuous_binary_data(data, offsets,
num);
}
- void insert_many_binary_data(char* data_array, uint32_t* len_array,
- uint32_t* start_offset_array, size_t num)
override {
- _push_false_to_nullmap(num);
- get_nested_column().insert_many_binary_data(data_array, len_array,
start_offset_array, num);
- }
-
void insert_default() override {
get_nested_column().insert_default();
get_null_map_data().push_back(1);
diff --git a/be/src/vec/columns/column_object.h
b/be/src/vec/columns/column_object.h
index 244692e9ddd..1c8f38056c9 100644
--- a/be/src/vec/columns/column_object.h
+++ b/be/src/vec/columns/column_object.h
@@ -465,12 +465,6 @@ public:
"insert_many_dict_data" + get_name());
}
- void insert_many_binary_data(char* data_array, uint32_t* len_array,
- uint32_t* start_offset_array, size_t num)
override {
- throw doris::Exception(ErrorCode::NOT_IMPLEMENTED_ERROR,
- "insert_many_binary_data" + get_name());
- }
-
void insert_many_continuous_binary_data(const char* data, const uint32_t*
offsets,
const size_t num) override {
throw doris::Exception(ErrorCode::NOT_IMPLEMENTED_ERROR,
diff --git a/be/src/vec/columns/column_string.h
b/be/src/vec/columns/column_string.h
index a72a3ec5cdc..f116d4ce1f1 100644
--- a/be/src/vec/columns/column_string.h
+++ b/be/src/vec/columns/column_string.h
@@ -269,29 +269,6 @@ public:
DCHECK(chars.size() == offsets.back());
}
- void insert_many_binary_data(char* data_array, uint32_t* len_array,
- uint32_t* start_offset_array, size_t num)
override {
- size_t new_size = 0;
- for (size_t i = 0; i < num; i++) {
- new_size += len_array[i];
- }
-
- const size_t old_size = chars.size();
- check_chars_length(old_size + new_size, offsets.size() + num);
- chars.resize(old_size + new_size);
-
- Char* data = chars.data();
- size_t offset = old_size;
- for (size_t i = 0; i < num; i++) {
- uint32_t len = len_array[i];
- uint32_t start_offset = start_offset_array[i];
- // memcpy will deal len == 0, not do it here
- memcpy(data + offset, data_array + start_offset, len);
- offset += len;
- offsets.push_back(offset);
- }
- }
-
void insert_many_strings(const StringRef* strings, size_t num) override {
size_t new_size = 0;
for (size_t i = 0; i < num; i++) {
diff --git a/be/src/vec/columns/predicate_column.h
b/be/src/vec/columns/predicate_column.h
index c9ceec230f7..c2c6456d862 100644
--- a/be/src/vec/columns/predicate_column.h
+++ b/be/src/vec/columns/predicate_column.h
@@ -269,8 +269,7 @@ public:
}
}
- void insert_many_binary_data(char* data_array, uint32_t* len_array,
- uint32_t* start_offset_array, size_t num)
override {
+ void insert_many_strings(const StringRef* strings, size_t num) override {
if (num == 0) {
return;
}
@@ -281,26 +280,25 @@ public:
size_t total_mem_size = 0;
for (size_t i = 0; i < num; i++) {
- total_mem_size += len_array[i];
+ total_mem_size += strings[i].size;
}
char* destination = _arena->alloc(total_mem_size);
char* org_dst = destination;
size_t org_elem_num = data.size();
data.resize(org_elem_num + num);
- uint32_t fragment_start_offset = start_offset_array[0];
+ uint32_t fragment_start_offset = 0;
size_t fragment_len = 0;
for (size_t i = 0; i < num; i++) {
data[org_elem_num + i].data = destination + fragment_len;
- data[org_elem_num + i].size = len_array[i];
- fragment_len += len_array[i];
+ data[org_elem_num + i].size = strings[i].size;
+ fragment_len += strings[i].size;
// Compute the largest continuous memcpy block and copy them.
// If this is the last element in data array, then should copy
the current memory block.
- if (i == num - 1 ||
- start_offset_array[i + 1] != start_offset_array[i] +
len_array[i]) {
- memcpy(destination, data_array + fragment_start_offset,
fragment_len);
+ if (i == num - 1 || strings[i + 1].data != strings[i].data +
strings[i].size) {
+ memcpy(destination, strings[fragment_start_offset].data,
fragment_len);
destination += fragment_len;
- fragment_start_offset = (i == num - 1 ? 0 :
start_offset_array[i + 1]);
+ fragment_start_offset = i == num - 1 ? 0 : i + 1;
fragment_len = 0;
}
}
@@ -309,7 +307,6 @@ public:
} else {
throw doris::Exception(ErrorCode::INTERNAL_ERROR,
"Method insert_many_binary_data is not
supported");
- __builtin_unreachable();
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]