This is an automated email from the ASF dual-hosted git repository.
gabriellee pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-3.0 by this push:
new 8bdbd0da564 [Improvement](bloom filter) Forbid small bloom filter
(#38349) (#39387) (#40256)
8bdbd0da564 is described below
commit 8bdbd0da5645ec74f0df1e16684515940f5be425
Author: Gabriel <[email protected]>
AuthorDate: Tue Sep 3 09:46:38 2024 +0800
[Improvement](bloom filter) Forbid small bloom filter (#38349) (#39387)
(#40256)
A bloom filter has an expected filter ratio when there is enough data. This PR
forbids bloom filters that are too small, which have a large bias in filter ratio.
---
be/src/exprs/bloom_filter_func.h | 20 ++++++++++++++++----
be/src/exprs/runtime_filter.cpp | 3 +++
be/src/exprs/runtime_filter.h | 1 +
.../java/org/apache/doris/qe/SessionVariable.java | 3 ++-
4 files changed, 22 insertions(+), 5 deletions(-)
diff --git a/be/src/exprs/bloom_filter_func.h b/be/src/exprs/bloom_filter_func.h
index eb13490cec8..95d50642448 100644
--- a/be/src/exprs/bloom_filter_func.h
+++ b/be/src/exprs/bloom_filter_func.h
@@ -100,8 +100,12 @@ public:
virtual ~BloomFilterFuncBase() = default;
void init_params(const RuntimeFilterParams* params) {
- _bloom_filter_length = params->bloom_filter_size;
+ _bloom_filter_length =
+ params->runtime_bloom_filter_min_size > 0
+ ? std::max(params->bloom_filter_size,
params->runtime_bloom_filter_min_size)
+ : params->bloom_filter_size;
_build_bf_exactly = params->build_bf_exactly;
+ _runtime_bloom_filter_min_size = params->runtime_bloom_filter_min_size;
_null_aware = params->null_aware;
_bloom_filter_size_calculated_by_ndv =
params->bloom_filter_size_calculated_by_ndv;
}
@@ -124,9 +128,16 @@ public:
// if FE do use ndv stat to predict the bf size, BE only use the
row count. FE have more
// exactly row count stat. which one is min is more correctly.
if (_bloom_filter_size_calculated_by_ndv) {
- _bloom_filter_length = std::min(be_calculate_size,
_bloom_filter_length);
+ _bloom_filter_length =
+ _runtime_bloom_filter_min_size > 0
+ ? std::max(_runtime_bloom_filter_min_size,
+ std::min(be_calculate_size,
_bloom_filter_length))
+ : std::min(be_calculate_size,
_bloom_filter_length);
} else {
- _bloom_filter_length = be_calculate_size;
+ _bloom_filter_length =
+ _runtime_bloom_filter_min_size > 0
+ ? std::max(_runtime_bloom_filter_min_size,
be_calculate_size)
+ : be_calculate_size;
}
}
return init_with_fixed_length(_bloom_filter_length);
@@ -222,8 +233,9 @@ protected:
// bloom filter size
int32_t _bloom_filter_alloced;
std::shared_ptr<BloomFilterAdaptor> _bloom_filter;
- bool _inited {};
+ bool _inited = false;
int64_t _bloom_filter_length;
+ int64_t _runtime_bloom_filter_min_size;
bool _build_bf_exactly = false;
bool _bloom_filter_size_calculated_by_ndv = false;
};
diff --git a/be/src/exprs/runtime_filter.cpp b/be/src/exprs/runtime_filter.cpp
index 67ae2cecd69..8d7281e9426 100644
--- a/be/src/exprs/runtime_filter.cpp
+++ b/be/src/exprs/runtime_filter.cpp
@@ -1288,6 +1288,9 @@ Status IRuntimeFilter::init_with_desc(const
TRuntimeFilterDesc* desc, const TQue
params.filter_type = _runtime_filter_type;
params.column_return_type = build_ctx->root()->type().type;
params.max_in_num = options->runtime_filter_max_in_num;
+ params.runtime_bloom_filter_min_size =
options->__isset.runtime_bloom_filter_min_size
+ ?
options->runtime_bloom_filter_min_size
+ : 0;
// We build runtime filter by exact distinct count iff three conditions
are met:
// 1. Only 1 join key
// 2. Do not have remote target (e.g. do not need to merge), or broadcast
join
diff --git a/be/src/exprs/runtime_filter.h b/be/src/exprs/runtime_filter.h
index df0dd9c8c6e..f199e173e84 100644
--- a/be/src/exprs/runtime_filter.h
+++ b/be/src/exprs/runtime_filter.h
@@ -127,6 +127,7 @@ struct RuntimeFilterParams {
// used in bloom filter
int64_t bloom_filter_size;
int32_t max_in_num;
+ int64_t runtime_bloom_filter_min_size;
int32_t filter_id;
bool bitmap_filter_not_in;
bool build_bf_exactly;
diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
index 60aff519443..f391cb24097 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
@@ -1075,7 +1075,7 @@ public class SessionVariable implements Serializable,
Writable {
private int runtimeBloomFilterSize = 2097152;
@VariableMgr.VarAttr(name = RUNTIME_BLOOM_FILTER_MIN_SIZE, needForward =
true)
- private int runtimeBloomFilterMinSize = 2048;
+ private int runtimeBloomFilterMinSize = 1048576;
@VariableMgr.VarAttr(name = RUNTIME_BLOOM_FILTER_MAX_SIZE, needForward =
true)
private int runtimeBloomFilterMaxSize = 16777216;
@@ -3576,6 +3576,7 @@ public class SessionVariable implements Serializable,
Writable {
tResult.setRuntimeFilterWaitTimeMs(runtimeFilterWaitTimeMs);
tResult.setRuntimeFilterMaxInNum(runtimeFilterMaxInNum);
+ tResult.setRuntimeBloomFilterMinSize(runtimeBloomFilterMinSize);
tResult.setRuntimeFilterWaitInfinitely(runtimeFilterWaitInfinitely);
if (cpuResourceLimit > 0) {
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]