This is an automated email from the ASF dual-hosted git repository.
wangbo pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-doris.git
The following commit(s) were added to refs/heads/master by this push:
new 802fcbb (#8162)refactor binary dict
802fcbb is described below
commit 802fcbbb056a0c62be2160461ebb5c5e6e11f576
Author: zuochunwei <[email protected]>
AuthorDate: Tue Feb 22 11:23:54 2022 +0800
(#8162)refactor binary dict
Co-authored-by: zuochunwei <[email protected]>
---
be/src/olap/rowset/segment_v2/binary_dict_page.cpp | 14 +++++------
be/src/olap/rowset/segment_v2/binary_dict_page.h | 5 ++--
be/src/olap/rowset/segment_v2/binary_plain_page.h | 16 ++++++++++++
be/src/olap/rowset/segment_v2/column_reader.cpp | 16 +++---------
be/src/olap/rowset/segment_v2/column_reader.h | 3 +--
be/src/vec/columns/column.h | 3 +--
be/src/vec/columns/column_nullable.h | 5 ++--
be/src/vec/columns/column_string.h | 9 +++----
be/src/vec/columns/predicate_column.h | 29 +++++++++-------------
.../rowset/segment_v2/binary_dict_page_test.cpp | 24 +++++-------------
be/test/tools/benchmark_tool.cpp | 13 +++-------
11 files changed, 56 insertions(+), 81 deletions(-)
diff --git a/be/src/olap/rowset/segment_v2/binary_dict_page.cpp
b/be/src/olap/rowset/segment_v2/binary_dict_page.cpp
index 413b082..974679b 100644
--- a/be/src/olap/rowset/segment_v2/binary_dict_page.cpp
+++ b/be/src/olap/rowset/segment_v2/binary_dict_page.cpp
@@ -209,7 +209,7 @@ Status BinaryDictPageDecoder::init() {
TypeInfo* type_info = get_scalar_type_info(OLAP_FIELD_TYPE_INT);
RETURN_IF_ERROR(ColumnVectorBatch::create(0, false, type_info,
nullptr, &_batch));
- _data_page_decoder.reset(new
BitShufflePageDecoder<OLAP_FIELD_TYPE_INT>(_data, _options));
+ _data_page_decoder.reset(_bit_shuffle_ptr = new
BitShufflePageDecoder<OLAP_FIELD_TYPE_INT>(_data, _options));
} else if (_encoding_type == PLAIN_ENCODING) {
DCHECK_EQ(_encoding_type, PLAIN_ENCODING);
_data_page_decoder.reset(new BinaryPlainPageDecoder(_data, _options));
@@ -233,11 +233,9 @@ bool BinaryDictPageDecoder::is_dict_encoding() const {
return _encoding_type == DICT_ENCODING;
}
-void BinaryDictPageDecoder::set_dict_decoder(PageDecoder* dict_decoder,
uint32_t* start_offset_array, uint32_t* len_array) {
+void BinaryDictPageDecoder::set_dict_decoder(PageDecoder* dict_decoder,
StringRef* dict_word_info) {
_dict_decoder = (BinaryPlainPageDecoder*)dict_decoder;
- _bit_shuffle_ptr =
reinterpret_cast<BitShufflePageDecoder<OLAP_FIELD_TYPE_INT>*>(_data_page_decoder.get());
- _start_offset_array = start_offset_array;
- _len_array = len_array;
+ _dict_word_info = dict_word_info;
};
Status BinaryDictPageDecoder::next_batch(size_t* n,
vectorized::MutableColumnPtr &dst) {
@@ -259,8 +257,8 @@ Status BinaryDictPageDecoder::next_batch(size_t* n,
vectorized::MutableColumnPtr
const int32_t* data_array = reinterpret_cast<const
int32_t*>(_bit_shuffle_ptr->_chunk.data);
size_t start_index = _bit_shuffle_ptr->_cur_index;
- dst->insert_many_dict_data(data_array, start_index, _start_offset_array,
_len_array,
- _dict_decoder->_data.mutable_data(), max_fetch);
+ dst->insert_many_dict_data(data_array, start_index, _dict_word_info,
max_fetch);
+
_bit_shuffle_ptr->_cur_index += max_fetch;
return Status::OK();
@@ -291,7 +289,7 @@ Status BinaryDictPageDecoder::next_batch(size_t* n,
ColumnBlockView* dst) {
for (int i = 0; i < len; ++i) {
int32_t codeword = *reinterpret_cast<const
int32_t*>(column_block.cell_ptr(i));
// get the string from the dict decoder
- *out = Slice(&_dict_decoder->_data[_start_offset_array[codeword]],
_len_array[codeword]);
+ *out = Slice(_dict_word_info[codeword].data,
_dict_word_info[codeword].size);
mem_len[i] = out->size;
out++;
}
diff --git a/be/src/olap/rowset/segment_v2/binary_dict_page.h
b/be/src/olap/rowset/segment_v2/binary_dict_page.h
index 15f11aa..54754be 100644
--- a/be/src/olap/rowset/segment_v2/binary_dict_page.h
+++ b/be/src/olap/rowset/segment_v2/binary_dict_page.h
@@ -115,7 +115,7 @@ public:
bool is_dict_encoding() const;
- void set_dict_decoder(PageDecoder* dict_decoder, uint32_t*
start_offset_array = nullptr, uint32_t* len_array = nullptr);
+ void set_dict_decoder(PageDecoder* dict_decoder, StringRef*
dict_word_info);
~BinaryDictPageDecoder();
@@ -130,8 +130,7 @@ private:
// use as data buf.
std::unique_ptr<ColumnVectorBatch> _batch;
- uint32_t* _start_offset_array = nullptr;
- uint32_t* _len_array = nullptr;
+ StringRef* _dict_word_info = nullptr;
};
} // namespace segment_v2
diff --git a/be/src/olap/rowset/segment_v2/binary_plain_page.h
b/be/src/olap/rowset/segment_v2/binary_plain_page.h
index 20e0c98..2060bd5 100644
--- a/be/src/olap/rowset/segment_v2/binary_plain_page.h
+++ b/be/src/olap/rowset/segment_v2/binary_plain_page.h
@@ -268,6 +268,22 @@ public:
return Slice(&_data[start_offset], len);
}
+    // Fills dict_word_info[0 .. _num_elems) with the start pointer and byte
+    // length of every word stored in this page.
+    //
+    // dict_word_info: caller-provided array with room for at least _num_elems
+    //                 entries. The pointers written reference bytes inside
+    //                 _data; they are not copies.
+    //
+    // Nothing is written when the page holds no elements.
+    void get_dict_word_info(StringRef* dict_word_info) {
+        // Empty page: the tail write below indexes _num_elems - 1, which would
+        // land outside the caller's array, so bail out first.
+        if (_num_elems == 0) {
+            return;
+        }
+
+        char* data_begin = (char*)&_data[0];
+        // Table of 32-bit offsets, one per word, starting at _offsets_pos.
+        char* offset_ptr = (char*)&_data[_offsets_pos];
+
+        for (uint32_t i = 0; i < _num_elems; ++i) {
+            dict_word_info[i].data = data_begin + decode_fixed32_le((uint8_t*)offset_ptr);
+            offset_ptr += sizeof(uint32_t);
+        }
+
+        // A word's length is the gap to the start of the following word.
+        for (uint32_t i = 0; i + 1 < _num_elems; ++i) {
+            dict_word_info[i].size = (char*)dict_word_info[i + 1].data - (char*)dict_word_info[i].data;
+        }
+
+        // The last word runs up to the start of the offset table.
+        dict_word_info[_num_elems - 1].size = (data_begin + _offsets_pos) - (char*)dict_word_info[_num_elems - 1].data;
+    }
+
private:
// Return the offset within '_data' where the string value with index
'idx' can be found.
uint32_t offset(size_t idx) const {
diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp
b/be/src/olap/rowset/segment_v2/column_reader.cpp
index 9b14ff4..93a9151 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/column_reader.cpp
@@ -672,23 +672,15 @@ Status FileColumnIterator::_read_data_page(const
OrdinalPageIndexIterator& iter)
&_dict_page_handle,
&dict_data, &dict_footer));
// ignore dict_footer.dict_page_footer().encoding() due to only
// PLAIN_ENCODING is supported for dict page right now
- _dict_decoder.reset(new BinaryPlainPageDecoder(dict_data));
+ _dict_decoder =
std::make_unique<BinaryPlainPageDecoder>(dict_data);
RETURN_IF_ERROR(_dict_decoder->init());
auto* pd_decoder =
(BinaryPlainPageDecoder*)_dict_decoder.get();
- _dict_start_offset_array.reset(new
uint32_t[pd_decoder->_num_elems]);
- _dict_len_array.reset(new uint32_t[pd_decoder->_num_elems]);
-
- // todo(wb) padding dict value for SIMD comparison
- for (int i = 0; i < pd_decoder->_num_elems; i++) {
- const uint32_t start_offset = pd_decoder->offset(i);
- uint32_t len = pd_decoder->offset(i + 1) - start_offset;
- _dict_start_offset_array[i] = start_offset;
- _dict_len_array[i] = len;
- }
+ _dict_word_info.reset(new StringRef[pd_decoder->_num_elems]);
+ pd_decoder->get_dict_word_info(_dict_word_info.get());
}
- dict_page_decoder->set_dict_decoder(_dict_decoder.get(),
_dict_start_offset_array.get(), _dict_len_array.get());
+ dict_page_decoder->set_dict_decoder(_dict_decoder.get(),
_dict_word_info.get());
}
}
return Status::OK();
diff --git a/be/src/olap/rowset/segment_v2/column_reader.h
b/be/src/olap/rowset/segment_v2/column_reader.h
index db77577..e1cb2aa 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.h
+++ b/be/src/olap/rowset/segment_v2/column_reader.h
@@ -304,8 +304,7 @@ private:
// current value ordinal
ordinal_t _current_ordinal = 0;
- std::unique_ptr<uint32_t[]> _dict_start_offset_array;
- std::unique_ptr<uint32_t[]> _dict_len_array;
+ std::unique_ptr<StringRef[]> _dict_word_info;
};
class ArrayFileColumnIterator final : public ColumnIterator {
diff --git a/be/src/vec/columns/column.h b/be/src/vec/columns/column.h
index 10b1e00..cb04a1c 100644
--- a/be/src/vec/columns/column.h
+++ b/be/src/vec/columns/column.h
@@ -174,8 +174,7 @@ public:
LOG(FATAL) << "Method insert_many_fix_len_data is not supported for " <<
get_name();
}
- virtual void insert_many_dict_data(const int32_t* data_array, size_t
start_index, const uint32_t* start_offset_array,
- const uint32_t* len_array, char* dict_data, size_t num) {
+ virtual void insert_many_dict_data(const int32_t* data_array, size_t
start_index, const StringRef* dict, size_t num) {
LOG(FATAL) << "Method insert_many_dict_data is not supported for " <<
get_name();
}
diff --git a/be/src/vec/columns/column_nullable.h
b/be/src/vec/columns/column_nullable.h
index 0de1ea9..21c67a5 100644
--- a/be/src/vec/columns/column_nullable.h
+++ b/be/src/vec/columns/column_nullable.h
@@ -96,10 +96,9 @@ public:
get_nested_column().insert_many_fix_len_data(pos, num);
}
- void insert_many_dict_data(const int32_t* data_array, size_t start_index,
const uint32_t* start_offset_array,
- const uint32_t* len_array, char* dict_data, size_t num) override {
+ void insert_many_dict_data(const int32_t* data_array, size_t start_index,
const StringRef* dict, size_t num) override {
get_null_map_column().fill(0, num);
- get_nested_column().insert_many_dict_data(data_array, start_index,
start_offset_array, len_array, dict_data, num);
+ get_nested_column().insert_many_dict_data(data_array, start_index,
dict, num);
}
void insert_many_binary_data(char* data_array, uint32_t* len_array,
uint32_t* start_offset_array, size_t num) override {
diff --git a/be/src/vec/columns/column_string.h
b/be/src/vec/columns/column_string.h
index 591fbe2..236216c 100644
--- a/be/src/vec/columns/column_string.h
+++ b/be/src/vec/columns/column_string.h
@@ -165,13 +165,10 @@ public:
}
};
- void insert_many_dict_data (const int32_t* data_array, size_t start_index,
const uint32_t* start_offset_array,
- const uint32_t* len_array, char* dict_data, size_t num) override {
- for (int i = 0; i < num; i++, start_index++) {
+ void insert_many_dict_data(const int32_t* data_array, size_t start_index,
const StringRef* dict, size_t num) override {
+ for (size_t end_index = start_index+num; start_index < end_index;
++start_index) {
int32_t codeword = data_array[start_index];
- uint32_t start_offset = start_offset_array[codeword];
- uint32_t str_len = len_array[codeword];
- insert_data(dict_data + start_offset, str_len);
+ insert_data(dict[codeword].data, dict[codeword].size);
}
}
diff --git a/be/src/vec/columns/predicate_column.h
b/be/src/vec/columns/predicate_column.h
index 890d9a2..69a89fd 100644
--- a/be/src/vec/columns/predicate_column.h
+++ b/be/src/vec/columns/predicate_column.h
@@ -169,12 +169,12 @@ public:
LOG(FATAL) << "update_hash_with_value not supported in
PredicateColumnType";
}
- void insert_string_value(char* data_ptr, size_t length) {
- StringValue sv(data_ptr, length);
+ void insert_string_value(const char* data_ptr, size_t length) {
+ StringValue sv((char*)data_ptr, length);
data.push_back_without_reserve(sv);
}
- void insert_decimal_value(char* data_ptr, size_t length) {
+ void insert_decimal_value(const char* data_ptr, size_t length) {
decimal12_t dc12_value;
dc12_value.integer = *(int64_t*)(data_ptr);
dc12_value.fraction = *(int32_t*)(data_ptr + sizeof(int64_t));
@@ -182,27 +182,26 @@ public:
}
// used for int128
- void insert_in_copy_way(char* data_ptr, size_t length) {
+ void insert_in_copy_way(const char* data_ptr, size_t length) {
T val {};
memcpy(&val, data_ptr, sizeof(val));
data.push_back_without_reserve(val);
}
- void insert_default_type(char* data_ptr, size_t length) {
+ void insert_default_type(const char* data_ptr, size_t length) {
T* val = (T*)data_ptr;
data.push_back_without_reserve(*val);
}
void insert_data(const char* data_ptr, size_t length) override {
- char* ch = const_cast<char*>(data_ptr);
if constexpr (std::is_same_v<T, StringValue>) {
- insert_string_value(ch, length);
+ insert_string_value(data_ptr, length);
} else if constexpr (std::is_same_v<T, decimal12_t>) {
- insert_decimal_value(ch, length);
+ insert_decimal_value(data_ptr, length);
} else if constexpr (std::is_same_v<T, doris::vectorized::Int128>) {
- insert_in_copy_way(ch, length);
+ insert_in_copy_way(data_ptr, length);
} else {
- insert_default_type(ch, length);
+ insert_default_type(data_ptr, length);
}
}
@@ -218,15 +217,11 @@ public:
}
}
- void insert_many_dict_data(const int32_t* data_array, size_t start_index,
- const uint32_t* start_offset_array, const
uint32_t* len_array,
- char* dict_data, size_t num) override {
+ void insert_many_dict_data(const int32_t* data_array, size_t start_index,
const StringRef* dict, size_t num) override {
if constexpr (std::is_same_v<T, StringValue>) {
- for (int i = 0; i < num; i++, start_index++) {
+ for (size_t end_index = start_index+num; start_index < end_index;
++start_index) {
int32_t codeword = data_array[start_index];
- uint32_t start_offset = start_offset_array[codeword];
- uint32_t str_len = len_array[codeword];
- insert_string_value(dict_data + start_offset, str_len);
+ insert_string_value(dict[codeword].data, dict[codeword].size);
}
}
}
diff --git a/be/test/olap/rowset/segment_v2/binary_dict_page_test.cpp
b/be/test/olap/rowset/segment_v2/binary_dict_page_test.cpp
index a65b404..de48637 100644
--- a/be/test/olap/rowset/segment_v2/binary_dict_page_test.cpp
+++ b/be/test/olap/rowset/segment_v2/binary_dict_page_test.cpp
@@ -73,20 +73,14 @@ public:
// because every slice is unique
ASSERT_EQ(slices.size(), dict_page_decoder->count());
- uint32_t dict_start_offset_array[dict_page_decoder->_num_elems];
- uint32_t dict_len_array[dict_page_decoder->_num_elems];
- for (int i = 0; i < dict_page_decoder->_num_elems; i++) {
- const uint32_t start_offset = dict_page_decoder->offset(i);
- uint32_t len = dict_page_decoder->offset(i + 1) - start_offset;
- dict_start_offset_array[i] = start_offset;
- dict_len_array[i] = len;
- }
+ StringRef dict_word_info[dict_page_decoder->_num_elems];
+ dict_page_decoder->get_dict_word_info(dict_word_info);
// decode
PageDecoderOptions decoder_options;
BinaryDictPageDecoder page_decoder(s.slice(), decoder_options);
- page_decoder.set_dict_decoder(dict_page_decoder.get(),
dict_start_offset_array, dict_len_array);
+ page_decoder.set_dict_decoder(dict_page_decoder.get(), dict_word_info);
status = page_decoder.init();
ASSERT_TRUE(status.ok());
@@ -177,21 +171,15 @@ public:
status = dict_page_decoder->init();
ASSERT_TRUE(status.ok());
- uint32_t dict_start_offset_array[dict_page_decoder->_num_elems];
- uint32_t dict_len_array[dict_page_decoder->_num_elems];
- for (int i = 0; i < dict_page_decoder->_num_elems; i++) {
- const uint32_t start_offset = dict_page_decoder->offset(i);
- uint32_t len = dict_page_decoder->offset(i + 1) - start_offset;
- dict_start_offset_array[i] = start_offset;
- dict_len_array[i] = len;
- }
+ StringRef dict_word_info[dict_page_decoder->_num_elems];
+ dict_page_decoder->get_dict_word_info(dict_word_info);
// decode
PageDecoderOptions decoder_options;
BinaryDictPageDecoder page_decoder(results[slice_index].slice(),
decoder_options);
status = page_decoder.init();
- page_decoder.set_dict_decoder(dict_page_decoder.get(),
dict_start_offset_array, dict_len_array);
+ page_decoder.set_dict_decoder(dict_page_decoder.get(),
dict_word_info);
ASSERT_TRUE(status.ok());
//check values
diff --git a/be/test/tools/benchmark_tool.cpp b/be/test/tools/benchmark_tool.cpp
index 7a1f708..e06cc3d 100644
--- a/be/test/tools/benchmark_tool.cpp
+++ b/be/test/tools/benchmark_tool.cpp
@@ -174,22 +174,15 @@ public:
new BinaryPlainPageDecoder(dict_slice.slice(),
dict_decoder_options));
dict_page_decoder->init();
- uint32_t dict_start_offset_array[dict_page_decoder->_num_elems];
- uint32_t dict_len_array[dict_page_decoder->_num_elems];
- for (int i = 0; i < dict_page_decoder->_num_elems; i++) {
- const uint32_t start_offset = dict_page_decoder->offset(i);
- uint32_t len = dict_page_decoder->offset(i + 1) - start_offset;
- dict_start_offset_array[i] = start_offset;
- dict_len_array[i] = len;
- }
+ StringRef dict_word_info[dict_page_decoder->_num_elems];
+ dict_page_decoder->get_dict_word_info(dict_word_info);
// decode
PageDecoderOptions decoder_options;
BinaryDictPageDecoder page_decoder(src.slice(), decoder_options);
page_decoder.init();
- page_decoder.set_dict_decoder(dict_page_decoder.get(),
dict_start_offset_array,
- dict_len_array);
+ page_decoder.set_dict_decoder(dict_page_decoder.get(),
dict_word_info);
//check values
size_t num = page_start_ids[slice_index + 1] -
page_start_ids[slice_index];
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]