This is an automated email from the ASF dual-hosted git repository. morningman pushed a commit to branch dev-1.0.1 in repository https://gitbox.apache.org/repos/asf/incubator-doris.git
commit 9861c3c028165fe913bf1f804e8a538b6ffa3800 Author: Pxl <[email protected]> AuthorDate: Thu Mar 24 09:12:42 2022 +0800 [fix][vectorized] fix core on get_predicate_column_ptr && fix double copy on _read_columns_by_rowids (#8581) --- be/src/olap/bloom_filter_predicate.h | 2 +- be/src/olap/rowset/segment_v2/segment_iterator.cpp | 83 +++++++++++----------- be/src/olap/rowset/segment_v2/segment_iterator.h | 29 ++++++-- be/src/runtime/primitive_type.h | 22 ++++++ be/src/vec/columns/column_nullable.h | 15 ++-- be/src/vec/columns/column_string.h | 22 ++++-- be/src/vec/columns/predicate_column.h | 6 +- be/src/vec/core/block.cpp | 24 +++++++ be/src/vec/core/block.h | 2 + be/src/vec/core/types.h | 24 ++++++- 10 files changed, 166 insertions(+), 63 deletions(-) diff --git a/be/src/olap/bloom_filter_predicate.h b/be/src/olap/bloom_filter_predicate.h index c86e991..3b49cb0 100644 --- a/be/src/olap/bloom_filter_predicate.h +++ b/be/src/olap/bloom_filter_predicate.h @@ -111,7 +111,7 @@ template <PrimitiveType type> void BloomFilterColumnPredicate<type>::evaluate(vectorized::IColumn& column, uint16_t* sel, uint16_t* size) const { uint16_t new_size = 0; - using T = typename PrimitiveTypeTraits<type>::CppType; + using T = typename PredicatePrimitiveTypeTraits<type>::PredicateFieldType; if (column.is_nullable()) { auto* nullable_col = vectorized::check_and_get_column<vectorized::ColumnNullable>(column); diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp b/be/src/olap/rowset/segment_v2/segment_iterator.cpp index 1ea0193..da1c219 100644 --- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp +++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp @@ -25,6 +25,7 @@ #include "olap/column_predicate.h" #include "olap/fs/fs_util.h" #include "olap/in_list_predicate.h" +#include "olap/olap_common.h" #include "olap/row.h" #include "olap/row_block2.h" #include "olap/row_cursor.h" @@ -614,9 +615,9 @@ void SegmentIterator::_vec_init_lazy_materialization() { _is_pred_column[cid] = true; pred_column_ids.insert(cid); - if (type == OLAP_FIELD_TYPE_VARCHAR || type == OLAP_FIELD_TYPE_CHAR - || type == OLAP_FIELD_TYPE_STRING || predicate->is_in_predicate() - || predicate->is_bloom_filter_predicate()) { + if (type == OLAP_FIELD_TYPE_VARCHAR || type == OLAP_FIELD_TYPE_CHAR || + type == OLAP_FIELD_TYPE_STRING || predicate->is_in_predicate() || + predicate->is_bloom_filter_predicate()) { short_cir_pred_col_id_set.insert(cid); _short_cir_eval_predicate.push_back(predicate); _is_all_column_basic_type = false; @@ -640,7 +641,7 @@ void SegmentIterator::_vec_init_lazy_materialization() { _is_pred_column[cid] = true; } } - + if (_schema.column_ids().size() > pred_column_ids.size()) { for (auto cid : _schema.column_ids()) { if (!_is_pred_column[cid]) { @@ -716,6 +717,8 @@ Status SegmentIterator::_read_columns(const std::vector<ColumnId>& column_ids, void SegmentIterator::_init_current_block( vectorized::Block* block, std::vector<vectorized::MutableColumnPtr>& current_columns) { + _char_type_idx.clear(); + bool is_block_mem_reuse = block->mem_reuse(); if (is_block_mem_reuse) { block->clear_column_data(_schema.num_column_ids()); @@ -736,10 +739,15 @@ void SegmentIterator::_init_current_block( for (size_t i = 0; i < _schema.num_column_ids(); i++) { auto cid = _schema.column_id(i); + auto column_desc = _schema.column(cid); + + if (column_desc->type() == OLAP_FIELD_TYPE_CHAR) { + _char_type_idx.emplace_back(i); + } + if (_is_pred_column[cid]) { //todo(wb) maybe we can relase it after output block current_columns[cid]->clear(); } else { // non-predicate column - auto column_desc = _schema.column(cid); if (is_block_mem_reuse) { current_columns[cid] = std::move(*block->get_by_position(i).column).mutate(); } else { @@ -768,19 +776,6 @@ void SegmentIterator::_output_non_pred_columns(vectorized::Block* block, bool is } } -Status SegmentIterator::_output_column_by_sel_idx(vectorized::Block* block, - const std::vector<ColumnId>& columnIds, - uint16_t* sel_rowid_idx, uint16_t select_size, - bool is_block_mem_reuse) { - for (auto cid : columnIds) { - int block_cid = _schema_block_id_map[cid]; - RETURN_IF_ERROR(block->copy_column_data_to_block( - is_block_mem_reuse, _current_return_columns[cid].get(), sel_rowid_idx, select_size, - block_cid, _opts.block_row_max)); - } - return Status::OK(); -} - Status SegmentIterator::_read_columns_by_index(uint32_t nrows_read_limit, uint32_t& nrows_read, bool set_block_rowid) { do { @@ -962,34 +957,40 @@ Status SegmentIterator::next_batch(vectorized::Block* block) { // When predicate column and no-predicate column are both basic type, lazy materialization is eliminate // So output block directly after vectorization evaluation if (_is_all_column_basic_type) { - return _output_column_by_sel_idx(block, _first_read_column_ids, sel_rowid_idx, - selected_size, is_mem_reuse); - } + RETURN_IF_ERROR(_output_column_by_sel_idx(block, _first_read_column_ids, sel_rowid_idx, + selected_size, is_mem_reuse)); + } else { + // step 2: evaluate short ciruit predicate + // todo(wb) research whether need to read short predicate after vectorization evaluation + // to reduce cost of read short circuit columns. + // In SSB test, it make no difference; So need more scenarios to test + _evaluate_short_circuit_predicate(sel_rowid_idx, &selected_size); + + // step3: read non_predicate column + if (!_non_predicate_columns.empty()) { + _read_columns_by_rowids(_non_predicate_columns, _block_rowids, sel_rowid_idx, + selected_size, &_current_return_columns); + } - // step 2: evaluate short ciruit predicate - // todo(wb) research whether need to read short predicate after vectorization evaluation - // to reduce cost of read short circuit columns. - // In SSB test, it make no difference; So need more scenarios to test - _evaluate_short_circuit_predicate(sel_rowid_idx, &selected_size); + // step4: output columns + // 4.1 output non-predicate column + _output_non_pred_columns(block, is_mem_reuse); - // step3: read non_predicate column - if (!_non_predicate_columns.empty()) { - _read_columns_by_rowids(_non_predicate_columns, _block_rowids, sel_rowid_idx, - selected_size, &_current_return_columns); - } + // 4.2 get union of short_cir_pred and vec_pred + std::set<ColumnId> pred_column_ids; + pred_column_ids.insert(_short_cir_pred_column_ids.begin(), + _short_cir_pred_column_ids.end()); + pred_column_ids.insert(_vec_pred_column_ids.begin(), _vec_pred_column_ids.end()); - // step4: output columns - // 4.1 output non-predicate column - _output_non_pred_columns(block, is_mem_reuse); - - // 4.2 output short circuit predicate column - RETURN_IF_ERROR(_output_column_by_sel_idx(block, _short_cir_pred_column_ids, sel_rowid_idx, - selected_size, is_mem_reuse)); - // 4.3 output vectorizatioin predicate column - return _output_column_by_sel_idx(block, _vec_pred_column_ids, sel_rowid_idx, selected_size, - is_mem_reuse); + // 4.3 output short circuit and predicate column + RETURN_IF_ERROR(_output_column_by_sel_idx(block, pred_column_ids, sel_rowid_idx, + selected_size, is_mem_reuse)); + } } + // shink char_type suffix zero data + block->shrink_char_type_column_suffix_zero(_char_type_idx); + return Status::OK(); } diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.h b/be/src/olap/rowset/segment_v2/segment_iterator.h index ba82be3..5a3f880 100644 --- a/be/src/olap/rowset/segment_v2/segment_iterator.h +++ b/be/src/olap/rowset/segment_v2/segment_iterator.h @@ -22,6 +22,7 @@ #include <vector> #include "common/status.h" +#include "olap/olap_common.h" #include "olap/olap_cond.h" #include "olap/rowset/segment_v2/common.h" #include "olap/rowset/segment_v2/row_ranges.h" @@ -98,9 +99,22 @@ private: void _evaluate_vectorization_predicate(uint16_t* sel_rowid_idx, uint16_t& selected_size); void _evaluate_short_circuit_predicate(uint16_t* sel_rowid_idx, uint16_t* selected_size); void _output_non_pred_columns(vectorized::Block* block, bool is_block_mem_reuse); - Status _output_column_by_sel_idx(vectorized::Block* block, const std::vector<ColumnId>& columnids, uint16_t* sel_rowid_idx, uint16_t select_size, bool is_block_mem_reuse); - void _read_columns_by_rowids(std::vector<ColumnId>& read_column_ids, std::vector<rowid_t>& rowid_vector, - uint16_t* sel_rowid_idx, size_t select_size, vectorized::MutableColumns* mutable_columns); + void _read_columns_by_rowids(std::vector<ColumnId>& read_column_ids, + std::vector<rowid_t>& rowid_vector, uint16_t* sel_rowid_idx, + size_t select_size, vectorized::MutableColumns* mutable_columns); + + template <class Container> + Status _output_column_by_sel_idx(vectorized::Block* block, const Container& column_ids, + uint16_t* sel_rowid_idx, uint16_t select_size, + bool is_block_mem_reuse) { + for (auto cid : column_ids) { + int block_cid = _schema_block_id_map[cid]; + RETURN_IF_ERROR(block->copy_column_data_to_block( + is_block_mem_reuse, _current_return_columns[cid].get(), sel_rowid_idx, + select_size, block_cid, _opts.block_row_max)); + } + return Status::OK(); + } private: class BitmapRangeIterator; @@ -136,7 +150,7 @@ private: _vec_pred_column_ids; // keep columnId of columns for vectorized predicate evaluation std::vector<ColumnId> _short_cir_pred_column_ids; // keep columnId of columns for short circuit predicate evaluation - vector<bool> _is_pred_column; // columns hold by segmentIter + std::vector<bool> _is_pred_column; // columns hold by segmentIter vectorized::MutableColumns _current_return_columns; std::unique_ptr<AndBlockColumnPredicate> _pre_eval_block_predicate; std::vector<ColumnPredicate*> _short_cir_eval_predicate; @@ -144,8 +158,8 @@ private: // first, read predicate columns by various index // second, read non-predicate columns // so we need a field to stand for columns first time to read - vector<ColumnId> _first_read_column_ids; - vector<int> _schema_block_id_map; // map from schema column id to column idx in Block + std::vector<ColumnId> _first_read_column_ids; + std::vector<int> _schema_block_id_map; // map from schema column id to column idx in Block // the actual init process is delayed to the first call to next_batch() bool _inited; @@ -163,6 +177,9 @@ private: // block for file to read std::unique_ptr<fs::ReadableBlock> _rblock; + + // char_type columns cid + std::vector<size_t> _char_type_idx; }; } // namespace segment_v2 diff --git a/be/src/runtime/primitive_type.h b/be/src/runtime/primitive_type.h index 224957f..03d13b2 100644 --- a/be/src/runtime/primitive_type.h +++ b/be/src/runtime/primitive_type.h @@ -32,6 +32,7 @@ #include "vec/columns/column_decimal.h" #include "vec/columns/column_string.h" #include "vec/columns/columns_number.h" +#include "vec/common/string_ref.h" #include "vec/core/types.h" #include "vec/runtime/vdatetime_value.h" @@ -350,6 +351,27 @@ struct PrimitiveTypeTraits<TYPE_STRING> { using ColumnType = vectorized::ColumnString; }; +// only for adapt get_predicate_column_ptr +template <PrimitiveType type> +struct PredicatePrimitiveTypeTraits { + using PredicateFieldType = typename PrimitiveTypeTraits<type>::CppType; +}; + +template <> +struct PredicatePrimitiveTypeTraits<TYPE_DECIMALV2> { + using PredicateFieldType = decimal12_t; +}; + +template <> +struct PredicatePrimitiveTypeTraits<TYPE_DATE> { + using PredicateFieldType = uint24_t; +}; + +template <> +struct PredicatePrimitiveTypeTraits<TYPE_DATETIME> { + using PredicateFieldType = uint64_t; +}; + } // namespace doris #endif diff --git a/be/src/vec/columns/column_nullable.h b/be/src/vec/columns/column_nullable.h index 5076358..8badf6e 100644 --- a/be/src/vec/columns/column_nullable.h +++ b/be/src/vec/columns/column_nullable.h @@ -86,7 +86,8 @@ public: StringRef serialize_value_into_arena(size_t n, Arena& arena, char const*& begin) const override; const char* deserialize_and_insert_from_arena(const char* pos) override; void insert_range_from(const IColumn& src, size_t start, size_t length) override; - void insert_indices_from(const IColumn& src, const int* indices_begin, const int* indices_end) override; + void insert_indices_from(const IColumn& src, const int* indices_begin, + const int* indices_end) override; void insert(const Field& x) override; void insert_from(const IColumn& src, size_t n) override; @@ -98,14 +99,16 @@ public: get_null_map_column().fill(0, num); get_nested_column().insert_many_fix_len_data(pos, num); } - + void insert_many_dict_data(const int32_t* data_array, size_t start_index, const StringRef* dict, size_t data_num, uint32_t dict_num) override { get_null_map_column().fill(0, data_num); - get_nested_column().insert_many_dict_data(data_array, start_index, dict, data_num, dict_num); + get_nested_column().insert_many_dict_data(data_array, start_index, dict, data_num, + dict_num); } - - void insert_many_binary_data(char* data_array, uint32_t* len_array, uint32_t* start_offset_array, size_t num) override { + + void insert_many_binary_data(char* data_array, uint32_t* len_array, + uint32_t* start_offset_array, size_t num) override { get_null_map_column().fill(0, num); get_nested_column().insert_many_binary_data(data_array, len_array, start_offset_array, num); } @@ -187,6 +190,8 @@ public: /// Return the column that represents the byte map. const ColumnPtr& get_null_map_column_ptr() const { return null_map; } + MutableColumnPtr get_null_map_column_ptr() { return null_map->assume_mutable(); } + ColumnUInt8& get_null_map_column() { return assert_cast<ColumnUInt8&>(*null_map); } const ColumnUInt8& get_null_map_column() const { return assert_cast<const ColumnUInt8&>(*null_map); diff --git a/be/src/vec/columns/column_string.h b/be/src/vec/columns/column_string.h index a7975df..47cb68c 100644 --- a/be/src/vec/columns/column_string.h +++ b/be/src/vec/columns/column_string.h @@ -157,17 +157,18 @@ public: offsets.push_back(new_size); } - void insert_many_binary_data(char* data_array, uint32_t* len_array, uint32_t* start_offset_array, size_t num) override { + void insert_many_binary_data(char* data_array, uint32_t* len_array, + uint32_t* start_offset_array, size_t num) override { for (size_t i = 0; i < num; i++) { uint32_t len = len_array[i]; uint32_t start_offset = start_offset_array[i]; insert_data(data_array + start_offset, len); } }; - + void insert_many_dict_data(const int32_t* data_array, size_t start_index, const StringRef* dict, size_t num, uint32_t /*dict_num*/) override { - for (size_t end_index = start_index+num; start_index < end_index; ++start_index) { + for (size_t end_index = start_index + num; start_index < end_index; ++start_index) { int32_t codeword = data_array[start_index]; insert_data(dict[codeword].data, dict[codeword].size); } @@ -203,7 +204,8 @@ public: void insert_range_from(const IColumn& src, size_t start, size_t length) override; - void insert_indices_from(const IColumn& src, const int* indices_begin, const int* indices_end) override; + void insert_indices_from(const IColumn& src, const int* indices_begin, + const int* indices_end) override; ColumnPtr filter(const Filter& filt, ssize_t result_size_hint) const override; @@ -226,7 +228,7 @@ public: const size_t old_size = offsets.size(); const size_t new_size = old_size + length; - const auto num = offsets.back() + 1; + const auto num = offsets.back() + 1; offsets.resize_fill(new_size, num); for (size_t i = old_size, j = 0; i < new_size; i++, j++) { offsets[i] += j; @@ -315,6 +317,16 @@ public: chars.emplace_back(0); } + + MutableColumnPtr get_shinked_column() const { + auto shrinked_column = ColumnString::create(); + for (int i = 0; i < size(); i++) { + StringRef str = get_data_at(i); + reinterpret_cast<ColumnString*>(shrinked_column.get()) + ->insert_data(str.data, strnlen(str.data, str.size)); + } + return shrinked_column; + } }; } // namespace doris::vectorized diff --git a/be/src/vec/columns/predicate_column.h b/be/src/vec/columns/predicate_column.h index 2598cc4..f41c239 100644 --- a/be/src/vec/columns/predicate_column.h +++ b/be/src/vec/columns/predicate_column.h @@ -220,7 +220,7 @@ public: void insert_many_dict_data(const int32_t* data_array, size_t start_index, const StringRef* dict, size_t num, uint32_t /*dict_num*/) override { if constexpr (std::is_same_v<T, StringValue>) { - for (size_t end_index = start_index+num; start_index < end_index; ++start_index) { + for (size_t end_index = start_index + num; start_index < end_index; ++start_index) { int32_t codeword = data_array[start_index]; insert_string_value(dict[codeword].data, dict[codeword].size); } @@ -255,9 +255,7 @@ public: void reserve(size_t n) override { data.reserve(n); } - [[noreturn]] const char* get_family_name() const override { - LOG(FATAL) << "get_family_name not supported in PredicateColumnType"; - } + const char* get_family_name() const override { return TypeName<T>::get(); } [[noreturn]] MutableColumnPtr clone_resized(size_t size) const override { LOG(FATAL) << "clone_resized not supported in PredicateColumnType"; diff --git a/be/src/vec/core/block.cpp b/be/src/vec/core/block.cpp index 23022d3..eb69fc2 100644 --- a/be/src/vec/core/block.cpp +++ b/be/src/vec/core/block.cpp @@ -23,6 +23,7 @@ #include <fmt/format.h> #include <snappy.h> +#include <cstring> #include <iomanip> #include <iterator> #include <memory> @@ -32,13 +33,17 @@ #include "runtime/row_batch.h" #include "runtime/tuple.h" #include "runtime/tuple_row.h" +#include "udf/udf.h" +#include "vec/columns/column.h" #include "vec/columns/column_const.h" #include "vec/columns/column_nullable.h" +#include "vec/columns/column_string.h" #include "vec/columns/column_vector.h" #include "vec/columns/columns_common.h" #include "vec/columns/columns_number.h" #include "vec/common/assert_cast.h" #include "vec/common/exception.h" +#include "vec/common/string_ref.h" #include "vec/common/typeid_cast.h" #include "vec/data_types/data_type_bitmap.h" #include "vec/data_types/data_type_date.h" @@ -941,4 +946,23 @@ std::unique_ptr<Block> Block::create_same_struct_block(size_t size) const { return temp_block; } +void Block::shrink_char_type_column_suffix_zero(std::vector<size_t> char_type_idx) { + for (auto idx : char_type_idx) { + if (this->get_by_position(idx).column->is_nullable()) { + this->get_by_position(idx).column = ColumnNullable::create( + reinterpret_cast<const ColumnString*>( + reinterpret_cast<const ColumnNullable*>( + this->get_by_position(idx).column.get()) + ->get_nested_column_ptr() + .get()) + ->get_shinked_column(), + reinterpret_cast<const ColumnNullable*>(this->get_by_position(idx).column.get()) + ->get_null_map_column_ptr()); + } else { + this->get_by_position(idx).column = + reinterpret_cast<const ColumnString*>(this->get_by_position(idx).column.get()) + ->get_shinked_column(); + } + } +} } // namespace doris::vectorized diff --git a/be/src/vec/core/block.h b/be/src/vec/core/block.h index c660bdb..b0c8238 100644 --- a/be/src/vec/core/block.h +++ b/be/src/vec/core/block.h @@ -284,6 +284,8 @@ public: doris::Tuple* deep_copy_tuple(const TupleDescriptor&, MemPool*, int, int, bool padding_char = false); + void shrink_char_type_column_suffix_zero(std::vector<size_t> char_type_idx); + private: void erase_impl(size_t position); void initialize_index_by_name(); diff --git a/be/src/vec/core/types.h b/be/src/vec/core/types.h index a5a39b6..0d70204 100644 --- a/be/src/vec/core/types.h +++ b/be/src/vec/core/types.h @@ -25,9 +25,13 @@ #include <string> #include <vector> +#include "gen_cpp/data.pb.h" +#include "olap/decimal12.h" +#include "olap/hll.h" +#include "olap/uint24.h" +#include "udf/udf.h" #include "util/binary_cast.hpp" #include "util/bitmap_value.h" -#include "olap/hll.h" namespace doris::vectorized { @@ -119,6 +123,24 @@ inline constexpr bool IsNumber<Float64> = true; template <typename T> struct TypeName; +// only used at predicate_column +template <> +struct TypeName<bool> { + static const char* get() { return "bool"; } +}; +template <> +struct TypeName<decimal12_t> { + static const char* get() { return "decimal12_t"; } +}; +template <> +struct TypeName<uint24_t> { + static const char* get() { return "uint24_t"; } +}; +template <> +struct TypeName<StringValue> { + static const char* get() { return "SringValue"; } +}; + template <> struct TypeName<UInt8> { static const char* get() { return "UInt8"; } --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
