This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch dev-1.1.1
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/dev-1.1.1 by this push:
new 7e44079212 [improvement 1.1.1]fix_vec_hash_table_resize_dev1.1.1 (#10968)
7e44079212 is described below
commit 7e44079212188157871659f93e0eae3f326a36a8
Author: Xinyi Zou <[email protected]>
AuthorDate: Mon Jul 18 14:37:04 2022 +0800
[improvement 1.1.1]fix_vec_hash_table_resize_dev1.1.1 (#10968)
---
be/CMakeLists.txt | 9 +++++++++
be/src/vec/common/hash_table/hash_table.h | 31 +++++++++++++++++++++++++------
be/src/vec/exec/join/vhash_join_node.cpp | 1 +
build.sh | 6 ++++++
run-be-ut.sh | 1 +
5 files changed, 42 insertions(+), 6 deletions(-)
diff --git a/be/CMakeLists.txt b/be/CMakeLists.txt
index d2368f4b7b..6118cae793 100644
--- a/be/CMakeLists.txt
+++ b/be/CMakeLists.txt
@@ -415,6 +415,15 @@ if (WITH_LZO)
set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -DDORIS_WITH_LZO")
endif()
+# `STRICT_MEMORY_USE=ON` expects BE to use less memory, and gives priority to ensuring stability
+# when the cluster memory is limited.
+# TODO In the future, expect a dynamic soft memory limit, combined with real-time memory usage of the cluster,
+# to control the main memory consumers, including HashTable, LRU Cache elimination strategy,
+# ChunkAllocator cache strategy, Disk IO buffer cache strategy, etc.
+if (STRICT_MEMORY_USE)
+ set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -DSTRICT_MEMORY_USE")
+endif()
+
if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -faligned-new")
endif()
diff --git a/be/src/vec/common/hash_table/hash_table.h b/be/src/vec/common/hash_table/hash_table.h
index 920c819694..a60e71e42b 100644
--- a/be/src/vec/common/hash_table/hash_table.h
+++ b/be/src/vec/common/hash_table/hash_table.h
@@ -244,11 +244,22 @@ template <size_t initial_size_degree = 10>
struct HashTableGrower {
/// The state of this structure is enough to get the buffer size of the hash table.
doris::vectorized::UInt8 size_degree = initial_size_degree;
+ doris::vectorized::Int64 double_grow_degree = 31; // 2GB
/// The size of the hash table in the cells.
size_t buf_size() const { return 1ULL << size_degree; }
+#ifndef STRICT_MEMORY_USE
size_t max_fill() const { return 1ULL << (size_degree - 1); }
+#else
+ // When the capacity reaches 2G cells (size_degree >= double_grow_degree), grow only once 75% of the capacity is filled.
+ size_t max_fill() const {
+ return size_degree < double_grow_degree
+ ? 1ULL << (size_degree - 1)
+ : (1ULL << size_degree) - (1ULL << (size_degree - 2));
+ }
+#endif
+
size_t mask() const { return buf_size() - 1; }
/// From the hash value, get the cell number in the hash table.
@@ -268,12 +279,20 @@ struct HashTableGrower {
/// Set the buffer size by the number of elements in the hash table. Used when deserializing a hash table.
void set(size_t num_elems) {
- size_degree =
- num_elems <= 1
- ? initial_size_degree
- : ((initial_size_degree >
static_cast<size_t>(log2(num_elems - 1)) + 2)
- ? initial_size_degree
- : (static_cast<size_t>(log2(num_elems - 1))
+ 2));
+#ifndef STRICT_MEMORY_USE
+ size_t fill_capacity = static_cast<size_t>(log2(num_elems - 1)) + 2;
+#else
+ size_t fill_capacity = static_cast<size_t>(log2(num_elems - 1)) + 1;
+ fill_capacity =
+ fill_capacity < double_grow_degree
+ ? fill_capacity + 1
+ : (num_elems < (1ULL << fill_capacity) - (1ULL <<
(fill_capacity - 2))
+ ? fill_capacity
+ : fill_capacity + 1);
+#endif
+ size_degree = num_elems <= 1 ? initial_size_degree
+ : (initial_size_degree > fill_capacity ?
initial_size_degree
+ :
fill_capacity);
}
void set_buf_size(size_t buf_size_) {
diff --git a/be/src/vec/exec/join/vhash_join_node.cpp b/be/src/vec/exec/join/vhash_join_node.cpp
index 0309b4fa94..efab21aaf1 100644
--- a/be/src/vec/exec/join/vhash_join_node.cpp
+++ b/be/src/vec/exec/join/vhash_join_node.cpp
@@ -69,6 +69,7 @@ struct ProcessHashTableBuild {
SCOPED_TIMER(_join_node->_build_table_insert_timer);
// only not build_unique, we need expanse hash table before insert data
if constexpr (!build_unique) {
+ // _rows contains null rows, which will cause the hash table resize to be large.
hash_table_ctx.hash_table.expanse_for_add_elem(_rows);
}
hash_table_ctx.hash_table.reset_resize_timer();
diff --git a/build.sh b/build.sh
index 7d1071fd19..4f53378a8e 100755
--- a/build.sh
+++ b/build.sh
@@ -215,6 +215,10 @@ if [[ -z ${STRIP_DEBUG_INFO} ]]; then
STRIP_DEBUG_INFO=OFF
fi
+if [[ -z ${STRICT_MEMORY_USE} ]]; then
+ STRICT_MEMORY_USE=OFF
+fi
+
echo "Get params:
BUILD_BE -- $BUILD_BE
BUILD_FE -- $BUILD_FE
@@ -231,6 +235,7 @@ echo "Get params:
BUILD_META_TOOL -- $BUILD_META_TOOL
USE_LLD -- $USE_LLD
STRIP_DEBUG_INFO -- $STRIP_DEBUG_INFO
+ STRICT_MEMORY_USE -- $STRICT_MEMORY_USE
"
# Clean and build generated code
@@ -267,6 +272,7 @@ if [ ${BUILD_BE} -eq 1 ] ; then
-DBUILD_META_TOOL=${BUILD_META_TOOL} \
-DUSE_LLD=${USE_LLD} \
-DSTRIP_DEBUG_INFO=${STRIP_DEBUG_INFO} \
+ -DSTRICT_MEMORY_USE=${STRICT_MEMORY_USE} \
-DUSE_AVX2=${USE_AVX2} \
-DGLIBC_COMPATIBILITY=${GLIBC_COMPATIBILITY} ../
${BUILD_SYSTEM} -j ${PARALLEL}
diff --git a/run-be-ut.sh b/run-be-ut.sh
index af8ae5bac7..930c013918 100755
--- a/run-be-ut.sh
+++ b/run-be-ut.sh
@@ -142,6 +142,7 @@ ${CMAKE_CMD} -G "${GENERATOR}" \
-DBUILD_META_TOOL=OFF \
-DWITH_MYSQL=OFF \
-DWITH_KERBEROS=OFF \
+ -DSTRICT_MEMORY_USE=OFF \
${CMAKE_USE_CCACHE} ../
${BUILD_SYSTEM} -j ${PARALLEL} $RUN_FILE
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]