This is an automated email from the ASF dual-hosted git repository. morningman pushed a commit to branch dev-1.0.1 in repository https://gitbox.apache.org/repos/asf/incubator-doris.git
commit 2265374d28c2411079e1b3a844b86e5859e41de1 Author: HappenLee <[email protected]> AuthorDate: Mon May 23 15:13:57 2022 +0800 [vec][opt] opt hash join build resize hash table before insert data (#9735) Co-authored-by: lihaopeng <[email protected]> --- be/src/vec/common/hash_table/hash_table.h | 6 ++++++ be/src/vec/exec/join/vhash_join_node.cpp | 10 +++++++--- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/be/src/vec/common/hash_table/hash_table.h b/be/src/vec/common/hash_table/hash_table.h index f0a94b77f3..c55d806699 100644 --- a/be/src/vec/common/hash_table/hash_table.h +++ b/be/src/vec/common/hash_table/hash_table.h @@ -731,6 +731,12 @@ protected: } public: + void expanse_for_add_elem(size_t num_elem) { + if (add_elem_size_overflow(num_elem)) { + resize(grower.buf_size() + num_elem); + } + } + /// Insert a value. In the case of any more complex values, it is better to use the `emplace` function. std::pair<LookupResult, bool> ALWAYS_INLINE insert(const value_type& x) { std::pair<LookupResult, bool> res; diff --git a/be/src/vec/exec/join/vhash_join_node.cpp b/be/src/vec/exec/join/vhash_join_node.cpp index 7ec9f23439..8a8691a51d 100644 --- a/be/src/vec/exec/join/vhash_join_node.cpp +++ b/be/src/vec/exec/join/vhash_join_node.cpp @@ -67,6 +67,10 @@ struct ProcessHashTableBuild { KeyGetter key_getter(_build_raw_ptrs, _join_node->_build_key_sz, nullptr); SCOPED_TIMER(_join_node->_build_table_insert_timer); + // only not build_unique, we need expanse hash table before insert data + if constexpr (!build_unique) { + hash_table_ctx.hash_table.expanse_for_add_elem(_rows); + } hash_table_ctx.hash_table.reset_resize_timer(); vector<int>& inserted_rows = _join_node->_inserted_rows[&_acquired_block]; @@ -980,8 +984,8 @@ Status HashJoinNode::_hash_table_build(RuntimeState* state) { if (block.rows() != 0) { mutable_block.merge(block); } // make one block for each 4 gigabytes - constexpr static auto BUILD_BLOCK_MAX_SIZE = 4 * 1024UL * 1024UL * 1024UL; - if (_mem_used - last_mem_used > BUILD_BLOCK_MAX_SIZE) { + constexpr static auto BUILD_BLOCK_MAX_SIZE = 4 * 1024UL * 1024UL * 1024UL; + if (UNLIKELY(_mem_used - last_mem_used > BUILD_BLOCK_MAX_SIZE)) { _build_blocks.emplace_back(mutable_block.to_block()); // TODO:: Rethink may we should do the proess after we recevie all build blocks ? // which is better. @@ -1099,7 +1103,7 @@ Status HashJoinNode::extract_probe_join_column(Block& block, NullMap& null_map, Status HashJoinNode::_process_build_block(RuntimeState* state, Block& block, uint8_t offset) { SCOPED_TIMER(_build_table_timer); size_t rows = block.rows(); - if (rows == 0) { + if (UNLIKELY(rows == 0)) { return Status::OK(); } COUNTER_UPDATE(_build_rows_counter, rows); --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
