This is an automated email from the ASF dual-hosted git repository.
lihaopeng pushed a commit to branch vectorized
in repository https://gitbox.apache.org/repos/asf/incubator-doris.git
The following commit(s) were added to refs/heads/vectorized by this push:
new ee90c94 [Vectorization] Support SegmentIterator vectorization (#7613)
ee90c94 is described below
commit ee90c948f113a72c74d6bf56ae2a39e644017c9b
Author: wangbo <[email protected]>
AuthorDate: Fri Jan 14 11:44:27 2022 +0800
[Vectorization] Support SegmentIterator vectorization (#7613)
---
be/src/olap/column_predicate.h | 2 +
be/src/olap/comparison_predicate.cpp | 2 +-
be/src/olap/in_list_predicate.cpp | 39 +++
be/src/olap/in_list_predicate.h | 7 +-
be/src/olap/rowset/segment_v2/binary_dict_page.cpp | 21 +-
be/src/olap/rowset/segment_v2/binary_plain_page.h | 34 +-
be/src/olap/rowset/segment_v2/bitshuffle_page.h | 36 +-
be/src/olap/rowset/segment_v2/segment_iterator.cpp | 362 ++++++++++++++++++++-
be/src/olap/rowset/segment_v2/segment_iterator.h | 29 +-
be/src/olap/schema.cpp | 66 ++++
be/src/olap/schema.h | 4 +
be/src/vec/columns/column.h | 13 +-
be/src/vec/columns/column_complex.h | 2 +
be/src/vec/columns/column_nullable.cpp | 15 +-
be/src/vec/columns/column_nullable.h | 6 +
be/src/vec/columns/column_vector.h | 26 ++
be/src/vec/columns/predicate_column.h | 27 +-
17 files changed, 661 insertions(+), 30 deletions(-)
diff --git a/be/src/olap/column_predicate.h b/be/src/olap/column_predicate.h
index 10b8a91..6b1aa23 100644
--- a/be/src/olap/column_predicate.h
+++ b/be/src/olap/column_predicate.h
@@ -69,6 +69,8 @@ public:
virtual void evaluate_vec(vectorized::IColumn& column, uint16_t size,
bool* flags) const {};
uint32_t column_id() const { return _column_id; }
+ virtual bool is_in_predicate() { return false; }
+
protected:
uint32_t _column_id;
bool _opposite;
diff --git a/be/src/olap/comparison_predicate.cpp
b/be/src/olap/comparison_predicate.cpp
index 598e7f3..a154a04 100644
--- a/be/src/olap/comparison_predicate.cpp
+++ b/be/src/olap/comparison_predicate.cpp
@@ -188,7 +188,7 @@ COMPARISON_PRED_COLUMN_EVALUATE(GreaterEqualPredicate, >=)
void CLASS<type>::evaluate_vec(vectorized::IColumn& column, uint16_t size,
bool* flags) const { \
if (column.is_nullable()) {
\
auto* nullable_column =
vectorized::check_and_get_column<vectorized::ColumnNullable>(column);
\
- auto& data_array = reinterpret_cast<const
vectorized::ColumnVector<type>&>(nullable_column->get_nested_column()).get_data();
\
+ auto& data_array = reinterpret_cast<const
vectorized::PredicateColumnType<type>&>(nullable_column->get_nested_column()).get_data();
\
auto& null_bitmap = reinterpret_cast<const
vectorized::ColumnVector<uint8_t>&>(*(nullable_column->get_null_map_column_ptr())).get_data();
\
for (uint16_t i = 0; i < size; i++) {
\
flags[i] = (data_array[i] OP _value) && (!null_bitmap[i]);
\
diff --git a/be/src/olap/in_list_predicate.cpp
b/be/src/olap/in_list_predicate.cpp
index c167a17..a17e157 100644
--- a/be/src/olap/in_list_predicate.cpp
+++ b/be/src/olap/in_list_predicate.cpp
@@ -20,6 +20,8 @@
#include "olap/field.h"
#include "runtime/string_value.hpp"
#include "runtime/vectorized_row_batch.h"
+#include "vec/columns/predicate_column.h"
+#include "vec/columns/column_nullable.h"
namespace doris {
@@ -115,6 +117,43 @@ IN_LIST_PRED_EVALUATE(NotInListPredicate, ==)
IN_LIST_PRED_COLUMN_BLOCK_EVALUATE(InListPredicate, !=)
IN_LIST_PRED_COLUMN_BLOCK_EVALUATE(NotInListPredicate, ==)
+// Defines CLASS<type>::evaluate(IColumn&, sel, size): the selection-vector form of
+// the IN / NOT IN check.
+//   column - either a ColumnNullable whose nested column is a
+//            PredicateColumnType<type>, or (non-nullable branch) a column that is
+//            reinterpret_cast to PredicateColumnType<type> without a check.
+//   sel    - in/out: row indexes to test; surviving indexes are compacted to the
+//            front of the array.
+//   size   - in/out: number of entries in sel before / after filtering.
+// A row survives when `_values.find(value) OP _values.end()` holds, inverted when
+// _opposite is set. In the nullable branch a row whose null-map entry is set
+// evaluates to false before the _opposite inversion is applied.
+// The macro is instantiated below with `!=` for InListPredicate and `==` for
+// NotInListPredicate.
+#define IN_LIST_PRED_COLUMN_EVALUATE(CLASS, OP)
\
+ template <class type>
\
+ void CLASS<type>::evaluate(vectorized::IColumn& column, uint16_t* sel,
uint16_t* size) const { \
+ uint16_t new_size = 0;
\
+ if (column.is_nullable()) {
\
+ auto* nullable_column =
\
+
vectorized::check_and_get_column<vectorized::ColumnNullable>(column);
\
+ auto& null_bitmap = reinterpret_cast<const
vectorized::ColumnVector<uint8_t>&>(*( \
+ nullable_column->get_null_map_column_ptr())).get_data();
\
+ auto* nest_column_vector = vectorized::check_and_get_column
\
+
<vectorized::PredicateColumnType<type>>(nullable_column->get_nested_column());
\
+ auto& data_array = nest_column_vector->get_data();
\
+ for (uint16_t i = 0; i < *size; i++) {
\
+ uint16_t idx = sel[i];
\
+ sel[new_size] = idx;
\
+ const type& cell_value = reinterpret_cast<const
type&>(data_array[idx]); \
+ bool ret = !null_bitmap[idx] && (_values.find(cell_value) OP
_values.end()); \
+ new_size += _opposite ? !ret : ret;
\
+ }
\
+ *size = new_size;
\
+ } else {
\
+ auto& number_column =
reinterpret_cast<vectorized::PredicateColumnType<type>&>(column);\
+ auto& data_array = number_column.get_data();
\
+ for (uint16_t i = 0; i < *size; i++) {
\
+ uint16_t idx = sel[i];
\
+ sel[new_size] = idx;
\
+ const type& cell_value = reinterpret_cast<const
type&>(data_array[idx]); \
+ auto result = (_values.find(cell_value) OP _values.end());
\
+ new_size += _opposite ? !result : result;
\
+ }
\
+ }
\
+ *size = new_size;
\
+ }
+
+IN_LIST_PRED_COLUMN_EVALUATE(InListPredicate, !=)
+IN_LIST_PRED_COLUMN_EVALUATE(NotInListPredicate, ==)
+
#define IN_LIST_PRED_COLUMN_BLOCK_EVALUATE_OR(CLASS, OP)
\
template <class type>
\
void CLASS<type>::evaluate_or(ColumnBlock* block, uint16_t* sel, uint16_t
size, bool* flags) \
diff --git a/be/src/olap/in_list_predicate.h b/be/src/olap/in_list_predicate.h
index cf9bf61..7cd237b 100644
--- a/be/src/olap/in_list_predicate.h
+++ b/be/src/olap/in_list_predicate.h
@@ -76,6 +76,8 @@ namespace doris {
class VectorizedRowBatch;
+// todo(wb) support evaluate_and,evaluate_or
+
#define IN_LIST_PRED_CLASS_DEFINE(CLASS)
\
template <class type>
\
class CLASS : public ColumnPredicate {
\
@@ -90,7 +92,10 @@ class VectorizedRowBatch;
virtual Status evaluate(const Schema& schema,
\
const std::vector<BitmapIndexIterator*>&
iterators, \
uint32_t num_rows, roaring::Roaring* bitmap)
const override; \
-
\
+ void evaluate(vectorized::IColumn& column, uint16_t* sel, uint16_t*
size) const override; \
+ void evaluate_and(vectorized::IColumn& column, uint16_t* sel, uint16_t
size, bool* flags) const override {} \
+ void evaluate_or(vectorized::IColumn& column, uint16_t* sel, uint16_t
size, bool* flags) const override {} \
+ bool is_in_predicate() override { return true; }
\
private:
\
phmap::flat_hash_set<type> _values;
\
};
diff --git a/be/src/olap/rowset/segment_v2/binary_dict_page.cpp
b/be/src/olap/rowset/segment_v2/binary_dict_page.cpp
index 7825dcc..edd64db 100644
--- a/be/src/olap/rowset/segment_v2/binary_dict_page.cpp
+++ b/be/src/olap/rowset/segment_v2/binary_dict_page.cpp
@@ -257,15 +257,26 @@ Status BinaryDictPageDecoder::next_batch(size_t* n,
vectorized::MutableColumnPtr
const int32_t* data_array = reinterpret_cast<const
int32_t*>(_bit_shuffle_ptr->_chunk.data);
size_t start_index = _bit_shuffle_ptr->_cur_index;
- // todo(wb) support nullable
- if (dst->is_predicate_column()) {
+ auto* dst_col_ptr = dst.get();
+ if (dst->is_nullable()) {
+ auto nullable_column =
assert_cast<vectorized::ColumnNullable*>(dst.get());
+ dst_col_ptr = nullable_column->get_nested_column_ptr().get();
+
+ // fill null bitmap here, not null;
+ // todo(wb) using SIMD speed up here
+ for (int i = 0; i < max_fetch; i++) {
+ nullable_column->get_null_map_data().push_back(0);
+ }
+ }
+
+ if (dst_col_ptr->is_predicate_column()) {
// cast columnptr to columnstringvalue just for avoid virtual function
call overhead
- vectorized::ColumnStringValue& string_value_vector =
reinterpret_cast<vectorized::ColumnStringValue&>(*dst);
+ auto* string_value_column_ptr =
reinterpret_cast<vectorized::ColumnStringValue*>(dst_col_ptr);
for (int i = 0; i < max_fetch; i++, start_index++) {
int32_t codeword = data_array[start_index];
uint32_t start_offset = _start_offset_array[codeword];
uint32_t str_len = _len_array[codeword];
-
string_value_vector.insert_data(&_dict_decoder->_data[start_offset], str_len);
+
string_value_column_ptr->insert_data(&_dict_decoder->_data[start_offset],
str_len);
}
} else {
// todo(wb) research whether using batch memcpy to insert
columnString can have better performance when data set is big
@@ -273,7 +284,7 @@ Status BinaryDictPageDecoder::next_batch(size_t* n,
vectorized::MutableColumnPtr
int32_t codeword = data_array[start_index];
const uint32_t start_offset = _start_offset_array[codeword];
const uint32_t str_len = _len_array[codeword];
- dst->insert_data(&_dict_decoder->_data[start_offset], str_len);
+ dst_col_ptr->insert_data(&_dict_decoder->_data[start_offset],
str_len);
}
}
_bit_shuffle_ptr->_cur_index += max_fetch;
diff --git a/be/src/olap/rowset/segment_v2/binary_plain_page.h
b/be/src/olap/rowset/segment_v2/binary_plain_page.h
index 25ce9e0..3c55d2b 100644
--- a/be/src/olap/rowset/segment_v2/binary_plain_page.h
+++ b/be/src/olap/rowset/segment_v2/binary_plain_page.h
@@ -38,6 +38,8 @@
#include "runtime/mem_pool.h"
#include "util/coding.h"
#include "util/faststring.h"
+#include "vec/columns/column_complex.h"
+#include "vec/columns/column_nullable.h"
namespace doris {
namespace segment_v2 {
@@ -235,20 +237,46 @@ public:
}
const size_t max_fetch = std::min(*n, static_cast<size_t>(_num_elems -
_cur_idx));
- if (dst->is_predicate_column()) {
+ auto* dst_col_ptr = dst.get();
+ if (dst->is_nullable()) {
+ auto nullable_column =
assert_cast<vectorized::ColumnNullable*>(dst.get());
+ dst_col_ptr = nullable_column->get_nested_column_ptr().get();
+ // fill null bitmap here, not null;
+ for (int i = 0; i < max_fetch; i++) {
+ nullable_column->get_null_map_data().push_back(0);
+ }
+ }
+
+ if (dst_col_ptr->is_bitmap()) {
+ auto& bitmap_column =
reinterpret_cast<vectorized::ColumnBitmap&>(*dst_col_ptr);
+ for (size_t i = 0; i < max_fetch; i++, _cur_idx++) {
+ const uint32_t start_offset = offset(_cur_idx);
+ uint32_t len = offset(_cur_idx + 1) - start_offset;
+
+ bitmap_column.insert_default();
+ BitmapValue* pvalue =
&bitmap_column.get_element(bitmap_column.size() - 1);
+ if (len != 0) {
+ BitmapValue value;
+ value.deserialize(&_data[start_offset]);
+ *pvalue = std::move(value);
+ } else {
+ *pvalue =
std::move(*reinterpret_cast<BitmapValue*>(const_cast<char*>(&_data[start_offset])));
+ }
+ }
+ } else if (dst_col_ptr->is_predicate_column()) {
// todo(wb) padding sv here for better comparison performance
for (size_t i = 0; i < max_fetch; i++, _cur_idx++) {
const uint32_t start_offset = offset(_cur_idx);
uint32_t len = offset(_cur_idx + 1) - start_offset;
StringValue sv(const_cast<char*>(&_data[start_offset]), len);
- dst->insert_data(reinterpret_cast<char*>(&sv), 0);
+ dst_col_ptr->insert_data(reinterpret_cast<char*>(&sv), 0);
}
} else {
for (size_t i = 0; i < max_fetch; i++, _cur_idx++) {
// todo(wb) need more test case and then improve here
const uint32_t start_offset = offset(_cur_idx);
uint32_t len = offset(_cur_idx + 1) - start_offset;
- dst->insert_data(&_data[start_offset], len);
+ dst_col_ptr->insert_data(&_data[start_offset], len);
}
}
diff --git a/be/src/olap/rowset/segment_v2/bitshuffle_page.h
b/be/src/olap/rowset/segment_v2/bitshuffle_page.h
index 5afbb7d..83eebae 100644
--- a/be/src/olap/rowset/segment_v2/bitshuffle_page.h
+++ b/be/src/olap/rowset/segment_v2/bitshuffle_page.h
@@ -37,6 +37,8 @@
#include "util/coding.h"
#include "util/faststring.h"
#include "util/slice.h"
+#include "vec/runtime/vdatetime_value.h"
+#include "vec/columns/column_nullable.h"
namespace doris {
namespace segment_v2 {
@@ -359,17 +361,28 @@ public:
int begin = _cur_index;
int end = _cur_index + max_fetch;
-
+
+ auto* dst_col_ptr = dst.get();
+ if (dst->is_nullable()) {
+ auto nullable_column =
assert_cast<vectorized::ColumnNullable*>(dst.get());
+ dst_col_ptr = nullable_column->get_nested_column_ptr().get();
+
+ // fill null bitmap here, not null;
+ for (int j = begin; j < end; j++) {
+ nullable_column->get_null_map_data().push_back(0);
+ }
+ }
+
// todo(wb) Try to eliminate type judgment in pagedecoder
- if (dst->is_column_decimal()) { // decimal non-predicate column
+ if (dst_col_ptr->is_column_decimal()) { // decimal non-predicate column
for (; begin < end; begin++) {
const char* cur_ptr = (const char*)&_chunk.data[begin *
SIZE_OF_TYPE];
int64_t int_value = *(int64_t*)(cur_ptr);
int32_t frac_value = *(int32_t*)(cur_ptr + sizeof(int64_t));
DecimalV2Value data(int_value, frac_value);
- dst->insert_data(reinterpret_cast<char*>(&data), 0);
+ dst_col_ptr->insert_data(reinterpret_cast<char*>(&data), 0);
}
- } else if (dst->is_date_type()) {
+ } else if (dst_col_ptr->is_date_type()) {
for (; begin < end; begin++) {
const char* cur_ptr = (const char*)&_chunk.data[begin *
SIZE_OF_TYPE];
uint64_t value = 0;
@@ -378,14 +391,21 @@ public:
value |= *(unsigned char*)(cur_ptr + 1);
value <<= 8;
value |= *(unsigned char*)(cur_ptr);
- DateTimeValue date;
+ vectorized::VecDateTimeValue date;
date.from_olap_date(value);
- dst->insert_data(reinterpret_cast<char*>(&date), 0);
+ dst_col_ptr->insert_data(reinterpret_cast<char*>(&date), 0);
+ }
+ } else if (dst_col_ptr->is_datetime_type()) {
+ for (; begin < end; begin++) {
+ const char* cur_ptr = (const char*)&_chunk.data[begin *
SIZE_OF_TYPE];
+ uint64_t value = *reinterpret_cast<const uint64_t*>(cur_ptr);
+ vectorized::VecDateTimeValue date(value);
+ dst_col_ptr->insert_data(reinterpret_cast<char*>(&date), 0);
}
} else {
- // todo(wb) need performance test here, may be batch memory copy?
+ // todo(wb) batch insert here
for (; begin < end; begin++) {
- dst->insert_data((const char*)&_chunk.data[begin *
SIZE_OF_TYPE], 0);
+ dst_col_ptr->insert_data((const char*)&_chunk.data[begin *
SIZE_OF_TYPE], 0);
}
}
diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp
b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
index d19f56e..86a89ac 100644
--- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp
+++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
@@ -30,6 +30,7 @@
#include "olap/rowset/segment_v2/segment.h"
#include "olap/short_key_index.h"
#include "util/doris_metrics.h"
+#include "olap/in_list_predicate.h"
using strings::Substitute;
@@ -120,7 +121,7 @@ Status SegmentIterator::init(const StorageReadOptions&
opts) {
return Status::OK();
}
-Status SegmentIterator::_init() {
+Status SegmentIterator::_init(bool is_vec) {
DorisMetrics::instance()->segment_read_total->increment(1);
// get file handle from file descriptor of segment
fs::BlockManager* block_mgr =
fs::fs_util::block_manager(_segment->_path_desc.storage_medium);
@@ -133,7 +134,11 @@ Status SegmentIterator::_init() {
RETURN_IF_ERROR(_get_row_ranges_by_keys());
}
RETURN_IF_ERROR(_get_row_ranges_by_column_conditions());
- _init_lazy_materialization();
+ if (is_vec) {
+ _vec_init_lazy_materialization();
+ } else {
+ _init_lazy_materialization();
+ }
_range_iter.reset(new BitmapRangeIterator(_row_bitmap));
return Status::OK();
}
@@ -581,9 +586,358 @@ Status SegmentIterator::next_batch(RowBlockV2* block) {
return Status::OK();
}
+/* ---------------------- for vectorization implementation
---------------------- */
+
+// todo(wb) need a UT here
+// Plans the vectorized read path for this segment iterator:
+//   * marks in _is_pred_column the columns referenced by _col_predicates;
+//   * routes every predicate either to _short_cir_eval_predicate (VARCHAR / CHAR /
+//     DECIMAL / DATE columns and IN-list predicates) or, wrapped in a
+//     SingleColumnBlockPredicate, to _pre_eval_block_predicate;
+//   * fills _vec_pred_column_ids, _short_cir_pred_column_ids,
+//     _non_predicate_columns and _first_read_column_ids;
+//   * sets _lazy_materialization_read only when predicate and non-predicate columns
+//     both exist and not every predicate could be routed to the vectorized set;
+//   * builds _schema_block_id_map (schema column id -> position in the output Block).
+void SegmentIterator::_vec_init_lazy_materialization() {
+ _is_pred_column.resize(_schema.columns().size(), false);
+
+ std::set<ColumnId> pred_column_ids; // including short_cir_pred_col_id_set
and vec_pred_col_id_set
+ _is_all_column_basic_type = true;
+ bool is_predicate_column_exists = false;
+ bool is_non_predicate_column_exists = false;
+
+ if (!_col_predicates.empty()) {
+ is_predicate_column_exists = true;
+
+ std::set<ColumnId> short_cir_pred_col_id_set; // using set for
distinct cid
+ std::set<ColumnId> vec_pred_col_id_set;
+
+ for (auto predicate : _col_predicates) {
+ auto cid = predicate->column_id();
+ FieldType type = _schema.column(cid)->type();
+ _is_pred_column[cid] = true;
+ pred_column_ids.insert(cid);
+
+ // for date type which can not be executed in a vectorized way,
using short circuit execution
+ if (type == OLAP_FIELD_TYPE_VARCHAR || type ==
OLAP_FIELD_TYPE_CHAR || type == OLAP_FIELD_TYPE_DECIMAL
+ || type == OLAP_FIELD_TYPE_DATE ||
predicate->is_in_predicate()) {
+ short_cir_pred_col_id_set.insert(cid);
+ _short_cir_eval_predicate.push_back(predicate);
+ _is_all_column_basic_type = false;
+ } else {
+ vec_pred_col_id_set.insert(predicate->column_id());
+ if (_pre_eval_block_predicate == nullptr) {
+ _pre_eval_block_predicate = new AndBlockColumnPredicate();
+ }
+
reinterpret_cast<MutilColumnBlockPredicate*>(_pre_eval_block_predicate)->add_column_predicate(new
SingleColumnBlockPredicate(predicate));
+ }
+ }
+
+ std::set<ColumnId> del_cond_id_set;
+
_opts.delete_condition_predicates.get()->get_all_column_ids(del_cond_id_set);
+ short_cir_pred_col_id_set.insert(del_cond_id_set.begin(),
del_cond_id_set.end());
+ pred_column_ids.insert(del_cond_id_set.begin(), del_cond_id_set.end());
+ // NOTE(review): delete-condition column ids join the predicate sets above, but
+ // _is_pred_column[cid] is not set for them, so a column used only by a delete
+ // condition is also pushed into _non_predicate_columns below. Delete conditions
+ // are also only collected inside this `!_col_predicates.empty()` branch, and they
+ // do not clear _is_all_column_basic_type. Confirm this is intended.
+
+ if (_schema.column_ids().size() > pred_column_ids.size()) {
+ for (auto cid : _schema.column_ids()) {
+ if (!_is_pred_column[cid]) {
+ _non_predicate_columns.push_back(cid);
+ is_non_predicate_column_exists = true;
+ }
+ }
+ }
+
+ _vec_pred_column_ids.assign(vec_pred_col_id_set.cbegin(),
vec_pred_col_id_set.cend());
+ _short_cir_pred_column_ids.assign(short_cir_pred_col_id_set.cbegin(),
short_cir_pred_col_id_set.cend());
+ } else {
+ _is_all_column_basic_type = false;
+ is_non_predicate_column_exists = true;
+ for (auto cid : _schema.column_ids()) {
+ _non_predicate_columns.push_back(cid);
+ }
+ }
+
+ // note(wb) in following cases we disable lazy materialization
+ // case 1: when all column is basic type(is_all_column_basic_type = true)
+ // because we think `seek and read` cost > read page cost, lazy
materialize may cause more `seek and read`, so disable it
+ // case 2: all column is predicate column
+ // case 3: all column is not predicate column
+ // todo(wb) need further research more lazy materialization rule, such as
get more info from `statistics` for better decision
+ if (_is_all_column_basic_type) {
+ std::set<ColumnId> pred_set(_vec_pred_column_ids.begin(),
_vec_pred_column_ids.end());
+ std::set<ColumnId> non_pred_set(_non_predicate_columns.begin(),
_non_predicate_columns.end());
+
+ // when _is_all_column_basic_type = true, _first_read_column_ids
should keep the same order with _schema.column_ids which stands for return
column order
+ for (int i = 0; i < _schema.num_column_ids(); i++) {
+ auto cid = _schema.column_ids()[i];
+ if (pred_set.find(cid) != pred_set.end()) {
+ _first_read_column_ids.push_back(cid);
+ } else if (non_pred_set.find(cid) != non_pred_set.end()) {
+ _first_read_column_ids.push_back(cid);
+ _is_pred_column[cid] = true; // in this case, non-predicate
column should also be filtered by sel idx, so we regard it as pred columns
+ }
+ }
+
+ } else if (is_predicate_column_exists && !is_non_predicate_column_exists) {
+ _first_read_column_ids.assign(pred_column_ids.cbegin(),
pred_column_ids.cend());
+ } else if (!is_predicate_column_exists && is_non_predicate_column_exists) {
+ for (auto cid : _non_predicate_columns) {
+ _first_read_column_ids.push_back(cid);
+ }
+ } else {
+ _lazy_materialization_read = true;
+ _first_read_column_ids.assign(pred_column_ids.cbegin(),
pred_column_ids.cend());
+ }
+
+ // make _schema_block_id_map
+ _schema_block_id_map.resize(_schema.columns().size());
+ for (int i = 0; i < _schema.num_column_ids(); i++) {
+ auto cid = _schema.column_ids()[i];
+ _schema_block_id_map[cid] = i;
+ }
+
+}
+
+// Reads the next `nrows` rows of each column listed in `column_ids` through its
+// column iterator into the slot of `column_block` that has the same column id.
+// Stops at, and returns, the first iterator error; otherwise returns OK.
+Status SegmentIterator::_read_columns(const std::vector<ColumnId>& column_ids,
+                                      vectorized::MutableColumns& column_block, size_t nrows) {
+    for (size_t pos = 0; pos < column_ids.size(); ++pos) {
+        const auto cid = column_ids[pos];
+        // next_batch() takes the requested row count by pointer; the debug check
+        // below expects it to come back unchanged
+        size_t rows_read = nrows;
+        RETURN_IF_ERROR(_column_iterators[cid]->next_batch(&rows_read, column_block[cid]));
+        DCHECK_EQ(nrows, rows_read);
+    }
+    return Status::OK();
+}
+
+// Prepares `block` and `current_columns` for one round of next_batch().
+//   block           - output block. When its memory is reused, columns beyond the
+//                     schema's return columns are erased and the remaining column
+//                     data is cleared; otherwise one (column-less) entry per return
+//                     column is inserted with the matching data type and name.
+//   current_columns - per-column-id working columns. Predicate columns are cleared
+//                     in place; non-predicate columns are either taken over from the
+//                     reused block or freshly created from the schema data type.
+void SegmentIterator::_init_current_block(vectorized::Block* block,
+        std::vector<vectorized::MutableColumnPtr>& current_columns) {
+    bool is_block_mem_reuse = block->mem_reuse();
+    if (is_block_mem_reuse) {
+        // Erase trailing columns from the back. The index is kept unsigned and the
+        // loop stops *before* reaching column_to_keep: the former
+        // `int i >= size_t column_to_keep` test converted i == -1 to SIZE_MAX and
+        // therefore never terminated when column_to_keep was 0.
+        const size_t column_to_keep = _schema.num_column_ids();
+        for (size_t i = block->columns(); i > column_to_keep; --i) {
+            block->erase(i - 1);
+        }
+        block->clear_column_data();
+    } else { // pre fill output block here
+        for (size_t i = 0; i < _schema.num_column_ids(); i++) {
+            auto cid = _schema.column_ids()[i];
+            auto* column_desc = _schema.columns()[cid];
+            auto data_type = Schema::get_data_type_ptr(column_desc->type());
+            if (column_desc->is_nullable()) {
+                block->insert({nullptr,
+                               std::make_shared<vectorized::DataTypeNullable>(std::move(data_type)),
+                               column_desc->name()});
+            } else {
+                block->insert({nullptr, std::move(data_type), column_desc->name()});
+            }
+        }
+    }
+
+    for (size_t i = 0; i < _schema.num_column_ids(); i++) {
+        auto cid = _schema.column_ids()[i];
+        if (_is_pred_column[cid]) { // todo(wb) maybe we can release it after output block
+            current_columns[cid]->clear();
+        } else { // non-predicate column
+            auto& column_desc = _schema.columns()[cid];
+            if (is_block_mem_reuse) {
+                // take the (already cleared) column back from the reused block
+                current_columns[cid] = std::move(*block->get_by_position(i).column).mutate();
+            } else {
+                auto data_type = Schema::get_data_type_ptr(column_desc->type());
+                if (column_desc->is_nullable()) {
+                    current_columns[cid] = doris::vectorized::ColumnNullable::create(
+                            std::move(data_type->create_column()),
+                            doris::vectorized::ColumnUInt8::create());
+                } else {
+                    current_columns[cid] = data_type->create_column();
+                }
+            }
+            // the page decoders branch on these flags to convert the stored date formats
+            if (column_desc->type() == OLAP_FIELD_TYPE_DATE) {
+                current_columns[cid]->set_date_type();
+            } else if (column_desc->type() == OLAP_FIELD_TYPE_DATETIME) {
+                current_columns[cid]->set_datetime_type();
+            }
+        }
+    }
+}
+
+// Moves every non-predicate working column out of _current_return_columns into its
+// position in `block` (looked up through _schema_block_id_map).
+// `is_block_mem_reuse` is accepted but not used by this function.
+void SegmentIterator::_output_non_pred_columns(vectorized::Block* block, bool is_block_mem_reuse) {
+    for (size_t k = 0; k < _non_predicate_columns.size(); ++k) {
+        const auto cid = _non_predicate_columns[k];
+        const auto block_pos = _schema_block_id_map[cid];
+        block->replace_by_position(block_pos, std::move(_current_return_columns[cid]));
+    }
+}
+
+// Writes the rows picked by the selection vector into `block` for every column id in
+// `columnIds`.
+//   sel_rowid_idx / select_size - selection vector and its length.
+//   is_block_mem_reuse          - true: filter into the column already held by the
+//                                 block; false: replace the block column with the
+//                                 column returned by filter_by_selector().
+void SegmentIterator::_output_column_by_sel_idx(vectorized::Block* block, std::vector<ColumnId> columnIds,
+        uint16_t* sel_rowid_idx, uint16_t select_size, bool is_block_mem_reuse) {
+    for (auto cid : columnIds) {
+        auto& src_column = _current_return_columns[cid];
+        const auto block_pos = _schema_block_id_map[cid];
+        if (!is_block_mem_reuse) {
+            block->replace_by_position(
+                    block_pos,
+                    src_column->get_ptr()->filter_by_selector(sel_rowid_idx, select_size));
+            continue;
+        }
+        src_column->filter_by_selector(sel_rowid_idx, select_size,
+                                       &block->get_by_position(block_pos).column);
+    }
+}
+
+
+
+// Reads _first_read_column_ids range by range from _range_iter until
+// `nrows_read_limit` rows were read or the iterator has no further range.
+//   nrows_read      - in/out: incremented by the number of rows read.
+//   set_block_rowid - when true, the row id of every row read is stored in
+//                     _block_rowids at the row's position within the batch.
+// Returns the first seek/read error, otherwise OK.
+Status SegmentIterator::_read_columns_by_index(uint32_t nrows_read_limit, uint32_t& nrows_read,
+                                               bool set_block_rowid) {
+    while (true) {
+        uint32_t first_rid;
+        uint32_t end_rid;
+        if (!_range_iter->next_range(nrows_read_limit - nrows_read, &first_rid, &end_rid)) {
+            break;
+        }
+        // seek unless the range starts exactly where the previous read stopped
+        // (a current row id of 0 always seeks)
+        if (_cur_rowid != first_rid || _cur_rowid == 0) {
+            _cur_rowid = first_rid;
+            RETURN_IF_ERROR(_seek_columns(_first_read_column_ids, _cur_rowid));
+        }
+        const size_t rows_to_read = end_rid - first_rid;
+        RETURN_IF_ERROR(
+                _read_columns(_first_read_column_ids, _current_return_columns, rows_to_read));
+        _cur_rowid += rows_to_read;
+        if (!set_block_rowid) {
+            nrows_read += rows_to_read;
+        } else {
+            for (uint32_t rid = first_rid; rid < end_rid; ++rid) {
+                _block_rowids[nrows_read++] = rid;
+            }
+        }
+        if (nrows_read >= nrows_read_limit) {
+            break;
+        }
+    }
+    return Status::OK();
+}
+
+// Runs the vectorized predicates over the rows just read and builds the selection
+// vector.
+//   sel_rowid_idx - out: indexes (0-based within the batch) of the rows that passed.
+//   selected_size - in: number of rows read; out: number of rows that passed.
+// Without vectorized predicate columns every row is selected and selected_size is
+// left unchanged.
+void SegmentIterator::_evaluate_vectorization_predicate(uint16_t* sel_rowid_idx,
+                                                        uint16_t& selected_size) {
+    uint16_t kept = 0;
+    if (_vec_pred_column_ids.empty()) {
+        for (uint32_t row = 0; row < selected_size; ++row) {
+            sel_rowid_idx[kept++] = row;
+        }
+        return;
+    }
+
+    const uint16_t rows_before = selected_size;
+    // flags start as "pass"; evaluate_vec() receives them together with the columns
+    bool ret_flags[selected_size];
+    memset(ret_flags, 1, selected_size);
+    _pre_eval_block_predicate->evaluate_vec(_current_return_columns, selected_size, ret_flags);
+
+    for (uint32_t row = 0; row < selected_size; ++row) {
+        if (!ret_flags[row]) {
+            continue;
+        }
+        sel_rowid_idx[kept++] = row;
+    }
+
+    _opts.stats->rows_vec_cond_filtered += rows_before - kept;
+    selected_size = kept;
+}
+
+// Applies the short-circuit predicates, then the delete conditions, to the current
+// selection vector. Each evaluate() call receives the selection vector and the
+// pointer to its size, so later predicates only see rows kept by earlier ones.
+// Does nothing when there is no short-circuit predicate column.
+void SegmentIterator::_evaluate_short_circuit_predicate(uint16_t* vec_sel_rowid_idx,
+                                                        uint16_t* selected_size_ptr) {
+    if (_short_cir_pred_column_ids.empty()) {
+        return;
+    }
+
+    for (size_t k = 0; k < _short_cir_eval_predicate.size(); ++k) {
+        auto pred = _short_cir_eval_predicate[k];
+        auto& target_column = _current_return_columns[pred->column_id()];
+        pred->evaluate(*target_column, vec_sel_rowid_idx, selected_size_ptr);
+    }
+
+    // evaluate delete condition
+    _opts.delete_condition_predicates->evaluate(_current_return_columns, vec_sel_rowid_idx,
+                                                selected_size_ptr);
+}
+
+// Reads `read_column_ids` for the rows that survived predicate evaluation.
+//   rowid_vector    - row id of every row in the current batch.
+//   sel_rowid_idx   - selection vector: indexes into rowid_vector of surviving rows.
+//   select_size     - number of entries in sel_rowid_idx.
+//   mutable_columns - destination columns, indexed by column id.
+// Neighbouring selected entries whose row ids are consecutive are merged into one
+// range, so each range costs a single seek plus a single read.
+// NOTE(review): _seek_columns() and _read_columns() return Status, but both results
+// are discarded here and this function returns void, so a seek/read failure is not
+// reported to the caller.
+void SegmentIterator::_read_columns_by_rowids(std::vector<ColumnId>&
read_column_ids, std::vector<rowid_t>& rowid_vector,
+ uint16_t* sel_rowid_idx, size_t select_size,
vectorized::MutableColumns* mutable_columns) {
+ size_t start_idx = 0;
+ while (start_idx < select_size) {
+ size_t end_idx = start_idx + 1;
+ while (end_idx < select_size && (rowid_vector[sel_rowid_idx[end_idx -
1]] == rowid_vector[sel_rowid_idx[end_idx]] - 1)) {
+ end_idx++;
+ }
+ size_t range = end_idx - start_idx;
+ _seek_columns(read_column_ids, rowid_vector[sel_rowid_idx[start_idx]]);
+ _read_columns(read_column_ids, *mutable_columns, range);
+ start_idx += range;
+ }
+}
+
Status SegmentIterator::next_batch(vectorized::Block* block) {
- //TODO
- return Status::NotSupported("not implement now");
+    // Vectorized read of the next batch (at most _opts.block_row_max rows) into `block`.
+    // Returns EndOfFile once no row is left, the first init/read error otherwise, or OK.
+    // The first call performs the delayed initialization and allocates the predicate
+    // working columns.
+    bool is_mem_reuse = block->mem_reuse();
+    SCOPED_RAW_TIMER(&_opts.stats->block_load_ns);
+    if (UNLIKELY(!_inited)) {
+        RETURN_IF_ERROR(_init(true));
+        _inited = true;
+        if (_vec_pred_column_ids.size() > 0 || _short_cir_pred_column_ids.size() > 0) {
+            // resize, not reserve: _read_columns_by_index() stores row ids with
+            // operator[], which requires the elements to exist
+            _block_rowids.resize(_opts.block_row_max);
+        }
+        _current_return_columns.resize(_schema.columns().size());
+        for (size_t i = 0; i < _schema.num_column_ids(); i++) {
+            auto cid = _schema.column_ids()[i];
+            if (_is_pred_column[cid]) {
+                auto& column_desc = _schema.columns()[cid];
+                _current_return_columns[cid] = Schema::get_predicate_column_nullable_ptr(
+                        column_desc->type(), column_desc->is_nullable());
+                _current_return_columns[cid]->reserve(_opts.block_row_max);
+            }
+        }
+    }
+
+    _init_current_block(block, _current_return_columns);
+
+    uint32_t nrows_read = 0;
+    uint32_t nrows_read_limit = _opts.block_row_max;
+    // propagate seek/read failures instead of continuing with partially read columns
+    RETURN_IF_ERROR(
+            _read_columns_by_index(nrows_read_limit, nrows_read, _col_predicates.size() > 0));
+
+    _opts.stats->blocks_load += 1;
+    _opts.stats->raw_rows_read += nrows_read;
+
+    if (nrows_read == 0) {
+        for (int i = 0; i < _schema.num_column_ids(); i++) {
+            auto cid = _schema.column_ids()[i];
+            // todo(wb) abstract make column where
+            if (!_is_pred_column[cid]) { // non-predicate
+                block->replace_by_position(i, std::move(_current_return_columns[cid]));
+            } else { // predicate
+                if (!is_mem_reuse) {
+                    auto* column_desc = _schema.columns()[cid];
+                    auto data_type = Schema::get_data_type_ptr(column_desc->type());
+                    block->replace_by_position(i, data_type->create_column());
+                }
+            }
+            // not sure whether block is clear before enter segmentIter, so clear it here.
+            if (is_mem_reuse) {
+                block->clear_column_data();
+            }
+        }
+        return Status::EndOfFile("no more data in segment");
+    }
+
+    // when no predicate(include delete condition) is provided, output column directly
+    if (_vec_pred_column_ids.size() == 0 && _short_cir_pred_column_ids.size() == 0) {
+        _output_non_pred_columns(block, is_mem_reuse);
+    } else { // need predicate evaluation
+        uint16_t selected_size = nrows_read;
+        uint16_t sel_rowid_idx[selected_size];
+
+        // step 1: evaluate vectorization predicate
+        _evaluate_vectorization_predicate(sel_rowid_idx, selected_size);
+
+        // When predicate columns and non-predicate columns are all basic types, lazy
+        // materialization is eliminated, so output the block directly after the
+        // vectorized evaluation
+        if (_is_all_column_basic_type) {
+            _output_column_by_sel_idx(block, _first_read_column_ids, sel_rowid_idx,
+                                      selected_size, is_mem_reuse);
+            return Status::OK();
+        }
+
+        // step 2: evaluate short circuit predicate
+        // todo(wb) research whether need to read short predicate after vectorization evaluation
+        //          to reduce cost of read short circuit columns.
+        //          In SSB test, it make no difference; So need more scenarios to test
+        _evaluate_short_circuit_predicate(sel_rowid_idx, &selected_size);
+
+        // step 3: read non_predicate column
+        if (_non_predicate_columns.size() != 0) {
+            _read_columns_by_rowids(_non_predicate_columns, _block_rowids, sel_rowid_idx,
+                                    selected_size, &_current_return_columns);
+        }
+
+        // step 4: output columns
+        // 4.1 output non-predicate column
+        _output_non_pred_columns(block, is_mem_reuse);
+
+        // 4.2 output short circuit predicate column
+        _output_column_by_sel_idx(block, _short_cir_pred_column_ids, sel_rowid_idx,
+                                  selected_size, is_mem_reuse);
+        // 4.3 output vectorization predicate column
+        _output_column_by_sel_idx(block, _vec_pred_column_ids, sel_rowid_idx, selected_size,
+                                  is_mem_reuse);
+    }
+
+    return Status::OK();
}
} // namespace segment_v2
diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.h
b/be/src/olap/rowset/segment_v2/segment_iterator.h
index fabefd5..c06077e 100644
--- a/be/src/olap/rowset/segment_v2/segment_iterator.h
+++ b/be/src/olap/rowset/segment_v2/segment_iterator.h
@@ -60,7 +60,7 @@ public:
uint64_t data_id() const { return _segment->id(); }
private:
- Status _init();
+ Status _init(bool is_vec = false);
Status _init_return_column_iterators();
Status _init_bitmap_index_iterators();
@@ -78,6 +78,7 @@ private:
Status _apply_bitmap_index();
void _init_lazy_materialization();
+ void _vec_init_lazy_materialization();
uint32_t segment_id() const { return _segment->id(); }
uint32_t num_rows() const { return _segment->num_rows(); }
@@ -87,6 +88,17 @@ private:
Status _read_columns(const std::vector<ColumnId>& column_ids, RowBlockV2*
block,
size_t row_offset, size_t nrows);
+ // for vectorization implementation
+ Status _read_columns(const std::vector<ColumnId>& column_ids,
vectorized::MutableColumns& column_block, size_t nrows);
+ Status _read_columns_by_index(uint32_t nrows_read_limit, uint32_t&
nrows_read, bool set_block_rowid);
+ void _init_current_block(vectorized::Block* block,
std::vector<vectorized::MutableColumnPtr>& non_pred_vector);
+ void _evaluate_vectorization_predicate(uint16_t* sel_rowid_idx, uint16_t&
selected_size);
+ void _evaluate_short_circuit_predicate(uint16_t* sel_rowid_idx, uint16_t*
selected_size);
+ void _output_non_pred_columns(vectorized::Block* block, bool
is_block_mem_reuse);
+ void _output_column_by_sel_idx(vectorized::Block* block,
std::vector<ColumnId> columnids, uint16_t* sel_rowid_idx, uint16_t select_size,
bool is_block_mem_reuse);
+ void _read_columns_by_rowids(std::vector<ColumnId>& read_column_ids,
std::vector<rowid_t>& rowid_vector,
+ uint16_t* sel_rowid_idx, size_t select_size,
vectorized::MutableColumns* mutable_columns);
+
private:
class BitmapRangeIterator;
@@ -116,6 +128,21 @@ private:
// could be a local variable of next_batch(), kept here to reuse vector
memory
std::vector<rowid_t> _block_rowids;
+ // fields for vectorization execution
+ bool _is_all_column_basic_type;
+ std::vector<ColumnId> _vec_pred_column_ids; // keep columnId of columns
for vectorized predicate evaluation
+ std::vector<ColumnId> _short_cir_pred_column_ids; // keep columnId of
columns for short circuit predicate evaluation
+ vector<bool> _is_pred_column; // columns hold by segmentIter
+ vectorized::MutableColumns _current_return_columns;
+ AndBlockColumnPredicate* _pre_eval_block_predicate = nullptr;
+ std::vector<ColumnPredicate*> _short_cir_eval_predicate;
+ // when lazy materialization is enabled, segmentIter needs to read data at
least twice
+ // first, read predicate columns by various index
+ // second, read non-predicate columns
+ // so we need a field to stand for columns first time to read
+ vector<ColumnId> _first_read_column_ids;
+ vector<int> _schema_block_id_map; // map from schema column id to column
idx in Block
+
// the actual init process is delayed to the first call to next_batch()
bool _inited;
diff --git a/be/src/olap/schema.cpp b/be/src/olap/schema.cpp
index d588b58..f5a0900 100644
--- a/be/src/olap/schema.cpp
+++ b/be/src/olap/schema.cpp
@@ -18,6 +18,11 @@
#include "olap/schema.h"
#include "olap/row_block2.h"
+#include "vec/columns/column_nullable.h"
+#include "vec/columns/column_complex.h"
+#include "vec/columns/predicate_column.h"
+#include "vec/core/types.h"
+#include "olap/uint24.h"
namespace doris {
@@ -105,6 +110,9 @@ Schema::~Schema() {
vectorized::DataTypePtr Schema::get_data_type_ptr(FieldType type) {
switch (type) {
+ case OLAP_FIELD_TYPE_BOOL:
+ return std::make_shared<vectorized::DataTypeUInt8>();
+
case OLAP_FIELD_TYPE_TINYINT:
return std::make_shared<vectorized::DataTypeInt8>();
@@ -135,7 +143,11 @@ vectorized::DataTypePtr
Schema::get_data_type_ptr(FieldType type) {
case OLAP_FIELD_TYPE_CHAR:
case OLAP_FIELD_TYPE_VARCHAR:
case OLAP_FIELD_TYPE_HLL:
+ case OLAP_FIELD_TYPE_STRING:
return std::make_shared<vectorized::DataTypeString>();
+
+    // Bitmap columns. Every label in this switch over FieldType is an OLAP_FIELD_TYPE_*
+    // enumerator, so the bitmap case uses the matching one as well.
+    // NOTE(review): the previous label, TYPE_OBJECT, looks like a PrimitiveType enumerator;
+    // confirm OLAP_FIELD_TYPE_OBJECT against the FieldType definition.
+    case OLAP_FIELD_TYPE_OBJECT:
+        return std::make_shared<vectorized::DataTypeBitMap>();
case OLAP_FIELD_TYPE_DECIMAL:
return
std::make_shared<vectorized::DataTypeDecimal<vectorized::Decimal128>>(27, 9);
@@ -147,4 +159,58 @@ vectorized::DataTypePtr
Schema::get_data_type_ptr(FieldType type) {
return nullptr;
}
+// Creates the predicate column for `type` via get_predicate_column_ptr(). When `is_null`
+// is true the column is wrapped in a ColumnNullable together with a newly created
+// ColumnUInt8; otherwise the predicate column is returned as is.
+vectorized::IColumn::MutablePtr Schema::get_predicate_column_nullable_ptr(FieldType type,
+                                                                          bool is_null) {
+    auto nested_column = Schema::get_predicate_column_ptr(type);
+    if (!is_null) {
+        return nested_column;
+    }
+    return doris::vectorized::ColumnNullable::create(std::move(nested_column),
+                                                     doris::vectorized::ColumnUInt8::create());
+}
+
+// Maps a storage FieldType to a newly created PredicateColumnType<...> column.
+// Types without a case hit DCHECK(false); the function then returns nullptr.
+vectorized::IColumn::MutablePtr Schema::get_predicate_column_ptr(FieldType type) {
+    using doris::vectorized::PredicateColumnType;
+
+    switch (type) {
+    case OLAP_FIELD_TYPE_BOOL:
+        return PredicateColumnType<bool>::create();
+
+    case OLAP_FIELD_TYPE_TINYINT:
+        return PredicateColumnType<doris::vectorized::Int8>::create();
+
+    case OLAP_FIELD_TYPE_SMALLINT:
+        return PredicateColumnType<doris::vectorized::Int16>::create();
+
+    case OLAP_FIELD_TYPE_INT:
+        return PredicateColumnType<doris::vectorized::Int32>::create();
+
+    case OLAP_FIELD_TYPE_FLOAT:
+        return PredicateColumnType<doris::vectorized::Float32>::create();
+
+    case OLAP_FIELD_TYPE_DOUBLE:
+        return PredicateColumnType<doris::vectorized::Float64>::create();
+
+    case OLAP_FIELD_TYPE_BIGINT:
+        return PredicateColumnType<doris::vectorized::Int64>::create();
+
+    case OLAP_FIELD_TYPE_LARGEINT:
+        return PredicateColumnType<doris::vectorized::Int128>::create();
+
+    case OLAP_FIELD_TYPE_DATE:
+        return PredicateColumnType<uint24_t>::create();
+
+    case OLAP_FIELD_TYPE_DATETIME:
+        return PredicateColumnType<uint64_t>::create();
+
+    case OLAP_FIELD_TYPE_CHAR:
+    case OLAP_FIELD_TYPE_VARCHAR:
+    case OLAP_FIELD_TYPE_STRING:
+        return PredicateColumnType<StringValue>::create();
+
+    case OLAP_FIELD_TYPE_DECIMAL:
+        return PredicateColumnType<decimal12_t>::create();
+
+    default:
+        DCHECK(false);
+        break;
+    }
+    // Keeps compilers from complaining about a missing return after the switch.
+    return nullptr;
+}
+
} // namespace doris
diff --git a/be/src/olap/schema.h b/be/src/olap/schema.h
index 3fd90bf..2596f97 100644
--- a/be/src/olap/schema.h
+++ b/be/src/olap/schema.h
@@ -102,6 +102,10 @@ public:
static vectorized::DataTypePtr get_data_type_ptr(FieldType type);
+ static vectorized::IColumn::MutablePtr get_predicate_column_ptr(FieldType
type);
+
+ static vectorized::IColumn::MutablePtr
get_predicate_column_nullable_ptr(FieldType type, bool is_null = false);
+
const std::vector<Field*>& columns() const { return _cols; }
const Field* column(ColumnId cid) const { return _cols[cid]; }
diff --git a/be/src/vec/columns/column.h b/be/src/vec/columns/column.h
index 5e41028..a869a65 100644
--- a/be/src/vec/columns/column.h
+++ b/be/src/vec/columns/column.h
@@ -215,7 +215,7 @@ public:
* used by lazy materialization to filter column by selected rowids
*/
virtual Ptr filter_by_selector(const uint16_t* sel, size_t sel_size, Ptr*
ptr = nullptr) {
- return nullptr;
+ LOG(FATAL) << "column not support filter_by_selector";
};
/// Permutes elements using specified permutation. Is used in sortings.
@@ -333,6 +333,8 @@ public:
/// True if column contains something nullable inside. It's true for
ColumnNullable, can be true or false for ColumnConst, etc.
virtual bool is_nullable() const { return false; }
+ virtual bool is_bitmap() const { return false; }
+
// true iff column has null element
virtual bool has_null() const { return false; }
@@ -414,10 +416,15 @@ public:
// only used in ColumnNullable replace_column_data
virtual void replace_column_data_default(size_t self_row = 0) = 0;
- bool is_date_type() { return is_date; }
+ virtual bool is_date_type() { return is_date; }
+ virtual bool is_datetime_type() { return is_date_time; }
+
+ virtual void set_date_type() { is_date = true; }
+ virtual void set_datetime_type() { is_date_time = true; }
- // todo(wb): a temporary implemention, need refactor here
+ // todo(wb): a temporary implementation; this needs to be re-abstracted
bool is_date = false;
+ bool is_date_time = false;
protected:
/// Template is to devirtualize calls to insert_from method.
diff --git a/be/src/vec/columns/column_complex.h
b/be/src/vec/columns/column_complex.h
index 63a120a..296f94b 100644
--- a/be/src/vec/columns/column_complex.h
+++ b/be/src/vec/columns/column_complex.h
@@ -45,6 +45,8 @@ public:
bool is_numeric() const override { return false; }
+ bool is_bitmap() const override { return std::is_same_v<T, BitmapValue>; }
+
size_t size() const override { return data.size(); }
StringRef get_data_at(size_t n) const override {
diff --git a/be/src/vec/columns/column_nullable.cpp
b/be/src/vec/columns/column_nullable.cpp
index bee6909..bf4bb44 100644
--- a/be/src/vec/columns/column_nullable.cpp
+++ b/be/src/vec/columns/column_nullable.cpp
@@ -177,9 +177,18 @@ ColumnPtr ColumnNullable::filter(const Filter& filt,
ssize_t result_size_hint) c
}
ColumnPtr ColumnNullable::filter_by_selector(const uint16_t* sel, size_t
sel_size, ColumnPtr* ptr) {
- ColumnPtr filtered_data = get_nested_column().filter_by_selector(sel,
sel_size, ptr);
- ColumnPtr filtered_null_map =
get_null_map_column().filter_by_selector(sel, sel_size, ptr);
- return ColumnNullable::create(filtered_data, filtered_null_map);
+ if (ptr != nullptr) {
+ const ColumnNullable* nullable_col_ptr = reinterpret_cast<const
ColumnNullable*>((*ptr).get());
+ ColumnPtr nest_col_ptr = nullable_col_ptr->nested_column;
+ ColumnPtr null_map_ptr = nullable_col_ptr->null_map;
+ get_nested_column().filter_by_selector(sel, sel_size, &nest_col_ptr);
+ get_null_map_column().filter_by_selector(sel, sel_size, &null_map_ptr);
+ return *ptr;
+ } else {
+ ColumnPtr filtered_data = get_nested_column().filter_by_selector(sel,
sel_size);
+ ColumnPtr filtered_null_map =
get_null_map_column().filter_by_selector(sel, sel_size);
+ return ColumnNullable::create(filtered_data, filtered_null_map);
+ }
}
ColumnPtr ColumnNullable::permute(const Permutation& perm, size_t limit) const
{
diff --git a/be/src/vec/columns/column_nullable.h
b/be/src/vec/columns/column_nullable.h
index 8f47777..fb6aa8f 100644
--- a/be/src/vec/columns/column_nullable.h
+++ b/be/src/vec/columns/column_nullable.h
@@ -130,7 +130,13 @@ public:
return false;
}
+ bool is_date_type() override { return get_nested_column().is_date_type(); }
+ bool is_datetime_type() override { return
get_nested_column().is_datetime_type(); }
+ void set_date_type() override { get_nested_column().set_date_type(); }
+ void set_datetime_type() override {
get_nested_column().set_datetime_type(); }
+
bool is_nullable() const override { return true; }
+ bool is_bitmap() const override { return get_nested_column().is_bitmap(); }
bool is_column_decimal() const override { return
get_nested_column().is_column_decimal(); }
bool is_column_string() const override { return
get_nested_column().is_column_string(); }
bool is_fixed_and_contiguous() const override { return false; }
diff --git a/be/src/vec/columns/column_vector.h
b/be/src/vec/columns/column_vector.h
index 5323f60..6626229 100644
--- a/be/src/vec/columns/column_vector.h
+++ b/be/src/vec/columns/column_vector.h
@@ -119,6 +119,13 @@ private:
/// Sugar constructor.
ColumnVector(std::initializer_list<T> il) : data {il} {}
+    // For each row index in sel[0, sel_size), passes the address of that element of
+    // `data` (as a char*) to res_ptr->insert_data() with a length argument of 0.
+    void insert_res_column(const uint16_t* sel, size_t sel_size,
+                           vectorized::ColumnVector<T>* res_ptr) {
+        const uint16_t* const sel_end = sel + sel_size;
+        for (const uint16_t* row = sel; row != sel_end; ++row) {
+            res_ptr->insert_data(reinterpret_cast<char*>(&data[*row]), 0);
+        }
+    }
+
public:
bool is_numeric() const override { return IsNumber<T>; }
@@ -195,6 +202,25 @@ public:
ColumnPtr filter(const IColumn::Filter& filt, ssize_t result_size_hint)
const override;
+    // note(wb): for now this method is only used by the storage layer.
+    // Copies the rows listed in sel[0, sel_size) out of this column:
+    //  - ptr == nullptr: the rows are inserted into a newly created ColumnVector<T>,
+    //    which is returned;
+    //  - ptr != nullptr: the rows are inserted into the column *ptr refers to, which is
+    //    treated as a ColumnVector<T>, and that column is returned.
+    ColumnPtr filter_by_selector(const uint16_t* sel, size_t sel_size,
+                                 ColumnPtr* ptr = nullptr) override {
+        if (ptr != nullptr) {
+            auto dst = (*std::move(*ptr)).assume_mutable();
+            if (sel_size == 0) {
+                return dst;
+            }
+            insert_res_column(sel, sel_size,
+                              reinterpret_cast<vectorized::ColumnVector<T>*>(dst.get()));
+            return *ptr;
+        }
+
+        auto fresh = vectorized::ColumnVector<T>::create();
+        if (sel_size != 0) {
+            insert_res_column(sel, sel_size, fresh.get());
+        }
+        return fresh;
+    }
+
ColumnPtr permute(const IColumn::Permutation& perm, size_t limit) const
override;
// ColumnPtr index(const IColumn & indexes, size_t limit) const
override;
diff --git a/be/src/vec/columns/predicate_column.h
b/be/src/vec/columns/predicate_column.h
index c8f8627..8095cff 100644
--- a/be/src/vec/columns/predicate_column.h
+++ b/be/src/vec/columns/predicate_column.h
@@ -97,6 +97,14 @@ private:
}
}
+    // For each row index in sel[0, sel_size), passes the address of that element of
+    // `data` (as a char*) to res_ptr->insert_data() with a length argument of 0.
+    void insert_byte_to_res_column(const uint16_t* sel, size_t sel_size,
+                                   vectorized::IColumn* res_ptr) {
+        const uint16_t* const sel_end = sel + sel_size;
+        for (const uint16_t* row = sel; row != sel_end; ++row) {
+            res_ptr->insert_data(reinterpret_cast<char*>(&data[*row]), 0);
+        }
+    }
+
template <typename Y>
ColumnPtr filter_default_type_by_selector(const uint16_t* sel, size_t
sel_size, ColumnPtr* ptr = nullptr) {
static_assert(std::is_same_v<T, Y>);
@@ -371,6 +379,23 @@ public:
return *ptr;
}
+    // Copies the bool values at the row indices sel[0, sel_size) into a result column.
+    //  - ptr == nullptr: a new ColumnVector<UInt8> is created, filled and returned
+    //    (it stays empty when sel_size == 0).
+    //  - ptr != nullptr: the values are inserted into the column *ptr refers to and
+    //    *ptr is returned.
+    ColumnPtr filter_bool_by_selector(const uint16_t* sel, size_t sel_size,
+                                      ColumnPtr* ptr = nullptr) {
+        if (ptr == nullptr) {
+            auto res = vectorized::ColumnVector<vectorized::UInt8>::create();
+            if (sel_size != 0) {
+                res->reserve(sel_size);
+                insert_byte_to_res_column(sel, sel_size, res.get());
+            }
+            // Return the column built above: there is no caller-supplied column on this
+            // path, so `*ptr` must not be dereferenced.
+            return res;
+        }
+
+        if (sel_size != 0) {
+            MutableColumnPtr ptr_res = (*std::move(*ptr)).assume_mutable();
+            insert_byte_to_res_column(sel, sel_size, ptr_res.get());
+        }
+        return *ptr;
+    }
+
//todo(wb) need refactor this method, using return status to check
unexpect args instead of LOG(FATAL)
ColumnPtr filter_by_selector(const uint16_t* sel, size_t sel_size,
ColumnPtr* ptr = nullptr) override {
if constexpr (std::is_same_v<T, StringValue>) {
@@ -396,7 +421,7 @@ public:
} else if constexpr (std::is_same_v<T, doris::vectorized::Int128>) {
return
filter_default_type_by_selector<doris::vectorized::Int128>(sel, sel_size, ptr);
} else if (std::is_same_v<T, bool>) {
- LOG(FATAL) << "bool will be support later";
+ return filter_bool_by_selector(sel, sel_size, ptr);
} else {
LOG(FATAL) << "unexpected type in predicate column";
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]