This is an automated email from the ASF dual-hosted git repository. morningman pushed a commit to branch dev-1.0.1 in repository https://gitbox.apache.org/repos/asf/incubator-doris.git
commit ac8a008814d8ec54b6e0fff3fd08f24355534126 Author: HappenLee <[email protected]> AuthorDate: Sun Apr 3 19:50:25 2022 +0800 [Bug][Vectorized] Fix core bug of segment vectorized (#8800) * [Bug][Vectorized] Fix core bug of segment vectorized 1. Read table with delete condition 2. Read table with default value HLL/Bitmap Column * refactor some code Co-authored-by: lihaopeng <[email protected]> --- be/src/olap/rowset/segment_v2/column_reader.cpp | 79 ++++++++++++++-------- be/src/olap/rowset/segment_v2/segment_iterator.cpp | 20 ++++-- be/src/olap/rowset/segment_v2/segment_iterator.h | 4 ++ be/src/vec/core/block.cpp | 32 +++++---- be/src/vec/core/block.h | 18 ++++- 5 files changed, 98 insertions(+), 55 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp b/be/src/olap/rowset/segment_v2/column_reader.cpp index 69fc02d8c6..a67793463f 100644 --- a/be/src/olap/rowset/segment_v2/column_reader.cpp +++ b/be/src/olap/rowset/segment_v2/column_reader.cpp @@ -772,39 +772,58 @@ Status DefaultValueColumnIterator::next_batch(size_t* n, ColumnBlockView* dst, b void DefaultValueColumnIterator::insert_default_data(vectorized::MutableColumnPtr &dst, size_t n) { vectorized::Int128 int128; - char* data_ptr = (char*)&int128; + char* data_ptr = (char *) &int128; size_t data_len = sizeof(int128); - auto type = _type_info->type(); - if (type == OLAP_FIELD_TYPE_DATE) { - assert(_type_size == sizeof(FieldTypeTraits<OLAP_FIELD_TYPE_DATE>::CppType)); //uint24_t - std::string str = FieldTypeTraits<OLAP_FIELD_TYPE_DATE>::to_string(_mem_value); - - vectorized::VecDateTimeValue value; - value.from_date_str(str.c_str(), str.length()); - value.cast_to_date(); - //TODO: here is int128 = int64 - int128 = binary_cast<vectorized::VecDateTimeValue, vectorized::Int64>(value); - } else if (type == OLAP_FIELD_TYPE_DATETIME) { - assert(_type_size == sizeof(FieldTypeTraits<OLAP_FIELD_TYPE_DATETIME>::CppType)); //int64_t - std::string str = FieldTypeTraits<OLAP_FIELD_TYPE_DATETIME>::to_string(_mem_value); - - vectorized::VecDateTimeValue value; - value.from_date_str(str.c_str(), str.length()); - value.to_datetime(); - - int128 = binary_cast<vectorized::VecDateTimeValue, vectorized::Int64>(value); - } else if (type == OLAP_FIELD_TYPE_DECIMAL) { - assert(_type_size == sizeof(FieldTypeTraits<OLAP_FIELD_TYPE_DECIMAL>::CppType)); //decimal12_t - decimal12_t* d = (decimal12_t*)_mem_value; - int128 = DecimalV2Value(d->integer, d->fraction).value(); - } else { - data_ptr = (char*)_mem_value; - data_len = _type_size; - } + auto insert_column_data = [&]() { + for (size_t i = 0; i < n; ++i) { + dst->insert_data(data_ptr, data_len); + } + }; + + switch (_type_info->type()) { + case OLAP_FIELD_TYPE_OBJECT: + case OLAP_FIELD_TYPE_HLL:{ + dst->insert_many_defaults(n); + break; + } + + case OLAP_FIELD_TYPE_DATE: { + assert(_type_size == sizeof(FieldTypeTraits<OLAP_FIELD_TYPE_DATE>::CppType)); //uint24_t + std::string str = FieldTypeTraits<OLAP_FIELD_TYPE_DATE>::to_string(_mem_value); + + vectorized::VecDateTimeValue value; + value.from_date_str(str.c_str(), str.length()); + value.cast_to_date(); + //TODO: here is int128 = int64, here rely on the logic of little endian + int128 = binary_cast<vectorized::VecDateTimeValue, vectorized::Int64>(value); + insert_column_data(); + break; + } + case OLAP_FIELD_TYPE_DATETIME: { + assert(_type_size == sizeof(FieldTypeTraits<OLAP_FIELD_TYPE_DATETIME>::CppType)); //int64_t + std::string str = FieldTypeTraits<OLAP_FIELD_TYPE_DATETIME>::to_string(_mem_value); - for (size_t i = 0; i < n; ++i) { - dst->insert_data(data_ptr, data_len); + vectorized::VecDateTimeValue value; + value.from_date_str(str.c_str(), str.length()); + value.to_datetime(); + + int128 = binary_cast<vectorized::VecDateTimeValue, vectorized::Int64>(value); + insert_column_data(); + break; + } + case OLAP_FIELD_TYPE_DECIMAL: { + assert(_type_size == sizeof(FieldTypeTraits<OLAP_FIELD_TYPE_DECIMAL>::CppType)); //decimal12_t + decimal12_t *d = (decimal12_t *) _mem_value; + int128 = DecimalV2Value(d->integer, d->fraction).value(); + insert_column_data(); + break; + } + default: { + data_ptr = (char *) _mem_value; + data_len = _type_size; + insert_column_data(); + } } } diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp b/be/src/olap/rowset/segment_v2/segment_iterator.cpp index 549977d0db..487bcd0a53 100644 --- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp +++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp @@ -138,6 +138,7 @@ Status SegmentIterator::_init(bool is_vec) { RETURN_IF_ERROR(_get_row_ranges_by_column_conditions()); if (is_vec) { _vec_init_lazy_materialization(); + _vec_init_char_column_id(); } else { _init_lazy_materialization(); } @@ -713,6 +714,17 @@ void SegmentIterator::_vec_init_lazy_materialization() { } } +void SegmentIterator::_vec_init_char_column_id() { + for (size_t i = 0; i < _schema.num_column_ids(); i++) { + auto cid = _schema.column_id(i); + auto column_desc = _schema.column(cid); + + if (column_desc->type() == OLAP_FIELD_TYPE_CHAR) { + _char_type_idx.emplace_back(i); + } + } +} + Status SegmentIterator::_read_columns(const std::vector<ColumnId>& column_ids, vectorized::MutableColumns& column_block, size_t nrows) { for (auto cid : column_ids) { @@ -726,8 +738,6 @@ Status SegmentIterator::_read_columns(const std::vector<ColumnId>& column_ids, void SegmentIterator::_init_current_block( vectorized::Block* block, std::vector<vectorized::MutableColumnPtr>& current_columns) { - _char_type_idx.clear(); - bool is_block_mem_reuse = block->mem_reuse(); if (is_block_mem_reuse) { block->clear_column_data(_schema.num_column_ids()); @@ -750,11 +760,7 @@ void SegmentIterator::_init_current_block( auto cid = _schema.column_id(i); auto column_desc = _schema.column(cid); - if (column_desc->type() == OLAP_FIELD_TYPE_CHAR) { - _char_type_idx.emplace_back(i); - } - - if (_is_pred_column[cid]) { //todo(wb) maybe we can relase it after output block + if (_is_pred_column[cid]) { //todo(wb) maybe we can release it after output block current_columns[cid]->clear(); } else { // non-predicate column if (is_block_mem_reuse) { diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.h b/be/src/olap/rowset/segment_v2/segment_iterator.h index 5a3f8809c1..7f2d11e0b6 100644 --- a/be/src/olap/rowset/segment_v2/segment_iterator.h +++ b/be/src/olap/rowset/segment_v2/segment_iterator.h @@ -80,6 +80,10 @@ private: void _init_lazy_materialization(); void _vec_init_lazy_materialization(); + // TODO: Fix Me + // CHAR type in storge layer padding the 0 in length. But query engine need ignore the padding 0. + // so segment iterator need to shrink char column before output it. only use in vec query engine. + void _vec_init_char_column_id(); uint32_t segment_id() const { return _segment->id(); } uint32_t num_rows() const { return _segment->num_rows(); } diff --git a/be/src/vec/core/block.cpp b/be/src/vec/core/block.cpp index eb69fc2eae..33c73205a5 100644 --- a/be/src/vec/core/block.cpp +++ b/be/src/vec/core/block.cpp @@ -946,22 +946,24 @@ std::unique_ptr<Block> Block::create_same_struct_block(size_t size) const { return temp_block; } -void Block::shrink_char_type_column_suffix_zero(std::vector<size_t> char_type_idx) { +void Block::shrink_char_type_column_suffix_zero(const std::vector<size_t>& char_type_idx) { for (auto idx : char_type_idx) { - if (this->get_by_position(idx).column->is_nullable()) { - this->get_by_position(idx).column = ColumnNullable::create( - reinterpret_cast<const ColumnString*>( - reinterpret_cast<const ColumnNullable*>( - this->get_by_position(idx).column.get()) - ->get_nested_column_ptr() - .get()) - ->get_shinked_column(), - reinterpret_cast<const ColumnNullable*>(this->get_by_position(idx).column.get()) - ->get_null_map_column_ptr()); - } else { - this->get_by_position(idx).column = - reinterpret_cast<const ColumnString*>(this->get_by_position(idx).column.get()) - ->get_shinked_column(); + if (idx < data.size()) { + if (this->get_by_position(idx).column->is_nullable()) { + this->get_by_position(idx).column = ColumnNullable::create( + reinterpret_cast<const ColumnString *>( + reinterpret_cast<const ColumnNullable *>( + this->get_by_position(idx).column.get()) + ->get_nested_column_ptr() + .get()) + ->get_shinked_column(), + reinterpret_cast<const ColumnNullable *>(this->get_by_position(idx).column.get()) + ->get_null_map_column_ptr()); + } else { + this->get_by_position(idx).column = + reinterpret_cast<const ColumnString *>(this->get_by_position(idx).column.get()) + ->get_shinked_column(); + } } } } diff --git a/be/src/vec/core/block.h b/be/src/vec/core/block.h index b0c8238a3e..6ef105cf3b 100644 --- a/be/src/vec/core/block.h +++ b/be/src/vec/core/block.h @@ -97,8 +97,20 @@ public: ColumnWithTypeAndName& get_by_position(size_t position) { return data[position]; } const ColumnWithTypeAndName& get_by_position(size_t position) const { return data[position]; } - Status copy_column_data_to_block(bool is_block_mem_reuse, doris::vectorized::IColumn* input_col_ptr, - uint16_t* sel_rowid_idx, uint16_t select_size, int block_cid, size_t batch_size) { + Status copy_column_data_to_block(bool is_block_mem_reuse, + doris::vectorized::IColumn* input_col_ptr, + uint16_t* sel_rowid_idx, uint16_t select_size, int block_cid, + size_t batch_size) { + // Only the additional deleted filter condition need to materialize column be at the end of the block + // We should not to materialize the column of query engine do not need. So here just return OK. + // Eg: + // `delete from table where a = 10;` + // `select b from table;` + // a column only effective in segment iterator, the block from query engine only contain the b column. + // so the `block_cid >= data.size()` is true + if (block_cid >= data.size()) + return Status::OK(); + if (is_block_mem_reuse) { auto* raw_res_ptr = this->get_by_position(block_cid).column.get(); const_cast<doris::vectorized::IColumn*>(raw_res_ptr)->reserve(batch_size); @@ -284,7 +296,7 @@ public: doris::Tuple* deep_copy_tuple(const TupleDescriptor&, MemPool*, int, int, bool padding_char = false); - void shrink_char_type_column_suffix_zero(std::vector<size_t> char_type_idx); + void shrink_char_type_column_suffix_zero(const std::vector<size_t>& char_type_idx); private: void erase_impl(size_t position); --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
