This is an automated email from the ASF dual-hosted git repository.

gabriellee pushed a commit to branch branch-3.0.2-tmp
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-3.0.2-tmp by this push:
     new 0c252781152 [Improvement](bloom filter) Forbid small bloom filter 
(#38349) (#39387)
0c252781152 is described below

commit 0c25278115211ed8ee77b60284a53386075104d5
Author: Gabriel <[email protected]>
AuthorDate: Mon Aug 19 10:50:54 2024 +0800

    [Improvement](bloom filter) Forbid small bloom filter (#38349) (#39387)
    
    A bloom filter achieves its expected filter ratio only when there is enough
    data. This PR forbids bloom filters that are too small, because their actual
    filter ratio deviates significantly from the expected one.
---
 be/src/exprs/bloom_filter_func.h                     | 20 ++++++++++++++++----
 be/src/exprs/runtime_filter.cpp                      |  3 +++
 be/src/exprs/runtime_filter.h                        |  1 +
 .../java/org/apache/doris/qe/SessionVariable.java    |  3 ++-
 gensrc/thrift/PaloInternalService.thrift             |  1 +
 5 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/be/src/exprs/bloom_filter_func.h b/be/src/exprs/bloom_filter_func.h
index a831395a5ea..e88f692a23d 100644
--- a/be/src/exprs/bloom_filter_func.h
+++ b/be/src/exprs/bloom_filter_func.h
@@ -100,8 +100,12 @@ public:
     virtual ~BloomFilterFuncBase() = default;
 
     void init_params(const RuntimeFilterParams* params) {
-        _bloom_filter_length = params->bloom_filter_size;
+        _bloom_filter_length =
+                params->runtime_bloom_filter_min_size > 0
+                        ? std::max(params->bloom_filter_size, 
params->runtime_bloom_filter_min_size)
+                        : params->bloom_filter_size;
         _build_bf_exactly = params->build_bf_exactly;
+        _runtime_bloom_filter_min_size = params->runtime_bloom_filter_min_size;
         _null_aware = params->null_aware;
         _bloom_filter_size_calculated_by_ndv = 
params->bloom_filter_size_calculated_by_ndv;
     }
@@ -124,9 +128,16 @@ public:
             // if FE do use ndv stat to predict the bf size, BE only use the 
row count. FE have more
             // exactly row count stat. which one is min is more correctly.
             if (_bloom_filter_size_calculated_by_ndv) {
-                _bloom_filter_length = std::min(be_calculate_size, 
_bloom_filter_length);
+                _bloom_filter_length =
+                        _runtime_bloom_filter_min_size > 0
+                                ? std::max(_runtime_bloom_filter_min_size,
+                                           std::min(be_calculate_size, 
_bloom_filter_length))
+                                : std::min(be_calculate_size, 
_bloom_filter_length);
             } else {
-                _bloom_filter_length = be_calculate_size;
+                _bloom_filter_length =
+                        _runtime_bloom_filter_min_size > 0
+                                ? std::max(_runtime_bloom_filter_min_size, 
be_calculate_size)
+                                : be_calculate_size;
             }
         }
         return init_with_fixed_length(_bloom_filter_length);
@@ -221,8 +232,9 @@ protected:
     // bloom filter size
     int32_t _bloom_filter_alloced;
     std::shared_ptr<BloomFilterAdaptor> _bloom_filter;
-    bool _inited {};
+    bool _inited = false;
     int64_t _bloom_filter_length;
+    int64_t _runtime_bloom_filter_min_size;
     bool _build_bf_exactly = false;
     bool _bloom_filter_size_calculated_by_ndv = false;
 };
diff --git a/be/src/exprs/runtime_filter.cpp b/be/src/exprs/runtime_filter.cpp
index f6fcb9ba213..b95adecef2c 100644
--- a/be/src/exprs/runtime_filter.cpp
+++ b/be/src/exprs/runtime_filter.cpp
@@ -1288,6 +1288,9 @@ Status IRuntimeFilter::init_with_desc(const 
TRuntimeFilterDesc* desc, const TQue
     params.filter_type = _runtime_filter_type;
     params.column_return_type = build_ctx->root()->type().type;
     params.max_in_num = options->runtime_filter_max_in_num;
+    params.runtime_bloom_filter_min_size = 
options->__isset.runtime_bloom_filter_min_size
+                                                   ? 
options->runtime_bloom_filter_min_size
+                                                   : 0;
     // We build runtime filter by exact distinct count iff three conditions 
are met:
     // 1. Only 1 join key
     // 2. Do not have remote target (e.g. do not need to merge), or broadcast 
join
diff --git a/be/src/exprs/runtime_filter.h b/be/src/exprs/runtime_filter.h
index df0dd9c8c6e..f199e173e84 100644
--- a/be/src/exprs/runtime_filter.h
+++ b/be/src/exprs/runtime_filter.h
@@ -127,6 +127,7 @@ struct RuntimeFilterParams {
     // used in bloom filter
     int64_t bloom_filter_size;
     int32_t max_in_num;
+    int64_t runtime_bloom_filter_min_size;
     int32_t filter_id;
     bool bitmap_filter_not_in;
     bool build_bf_exactly;
diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java 
b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
index 1aae9c67f74..2f8d9b64466 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
@@ -1067,7 +1067,7 @@ public class SessionVariable implements Serializable, 
Writable {
     private int runtimeBloomFilterSize = 2097152;
 
     @VariableMgr.VarAttr(name = RUNTIME_BLOOM_FILTER_MIN_SIZE, needForward = 
true)
-    private int runtimeBloomFilterMinSize = 2048;
+    private int runtimeBloomFilterMinSize = 1048576;
 
     @VariableMgr.VarAttr(name = RUNTIME_BLOOM_FILTER_MAX_SIZE, needForward = 
true)
     private int runtimeBloomFilterMaxSize = 16777216;
@@ -3541,6 +3541,7 @@ public class SessionVariable implements Serializable, 
Writable {
 
         tResult.setRuntimeFilterWaitTimeMs(runtimeFilterWaitTimeMs);
         tResult.setRuntimeFilterMaxInNum(runtimeFilterMaxInNum);
+        tResult.setRuntimeBloomFilterMinSize(runtimeBloomFilterMinSize);
         tResult.setRuntimeFilterWaitInfinitely(runtimeFilterWaitInfinitely);
 
         if (cpuResourceLimit > 0) {
diff --git a/gensrc/thrift/PaloInternalService.thrift 
b/gensrc/thrift/PaloInternalService.thrift
index b201882d113..5d84b7766eb 100644
--- a/gensrc/thrift/PaloInternalService.thrift
+++ b/gensrc/thrift/PaloInternalService.thrift
@@ -321,6 +321,7 @@ struct TQueryOptions {
 
   120: optional bool enable_fallback_on_missing_inverted_index = true;
 
+  122: optional i32 runtime_bloom_filter_min_size = 1048576;
   // For cloud, to control if the content would be written into file cache
   1000: optional bool disable_file_cache = false
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to