This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 8a274d7851 [feature-wip](new-scan) refactor some interface about 
predicate push down in scan node (#12527)
8a274d7851 is described below

commit 8a274d7851b2007c03d31a60b8fedbb329dc8f82
Author: Mingyu Chen <[email protected]>
AuthorDate: Tue Sep 13 10:25:13 2022 +0800

    [feature-wip](new-scan) refactor some interface about predicate push down 
in scan node (#12527)
    
    This PR introduces a new enum type `PushDownType`:
    ```
    enum class PushDownType {
            // The predicate can not be pushed down to data source
            UNACCEPTABLE,
            // The predicate can be pushed down to data source
            // and the data source can fully evaluate it
            ACCEPTABLE,
            // The predicate can be pushed down to data source
            // but the data source can not fully evaluate it.
            PARTIAL_ACCEPTABLE
        };
    ```
    
    Derived classes of VScanNode can override the following methods to determine 
whether to accept
    a binary/in/bloom filter/is null predicate:
    
    ```
    PushDownType _should_push_down_binary_predicate();
    PushDownType _should_push_down_in_predicate();
    PushDownType _should_push_down_function_filter();
    PushDownType _should_push_down_bloom_filter();
    PushDownType _should_push_down_is_null_predicate();
    ```
---
 be/src/vec/exec/scan/new_olap_scan_node.cpp |  77 ++-------
 be/src/vec/exec/scan/new_olap_scan_node.h   |  14 +-
 be/src/vec/exec/scan/vscan_node.cpp         | 259 ++++++++++++++++++----------
 be/src/vec/exec/scan/vscan_node.h           |  59 +++++--
 4 files changed, 224 insertions(+), 185 deletions(-)

diff --git a/be/src/vec/exec/scan/new_olap_scan_node.cpp 
b/be/src/vec/exec/scan/new_olap_scan_node.cpp
index d4b1b6a7d7..973e6c23ee 100644
--- a/be/src/vec/exec/scan/new_olap_scan_node.cpp
+++ b/be/src/vec/exec/scan/new_olap_scan_node.cpp
@@ -125,7 +125,7 @@ static std::string olap_filters_to_string(const 
std::vector<doris::TCondition>&
     filters_string += "[";
     for (auto it = filters.cbegin(); it != filters.cend(); it++) {
         if (it != filters.cbegin()) {
-            filters_string += ",";
+            filters_string += ", ";
         }
         filters_string += olap_filter_to_string(*it);
     }
@@ -181,6 +181,12 @@ Status NewOlapScanNode::_build_key_ranges_and_filters() {
         }
     }
 
+    // Append value ranges in "_not_in_value_ranges"
+    for (auto& range : _not_in_value_ranges) {
+        std::visit([&](auto&& the_range) { 
the_range.to_in_condition(_olap_filters, false); },
+                   range);
+    }
+
     _runtime_profile->add_info_string("PushDownPredicates", 
olap_filters_to_string(_olap_filters));
     _runtime_profile->add_info_string("KeyRanges", _scan_keys.debug_string());
     VLOG_CRITICAL << _scan_keys.debug_string();
@@ -188,67 +194,12 @@ Status NewOlapScanNode::_build_key_ranges_and_filters() {
     return Status::OK();
 }
 
-bool NewOlapScanNode::_should_push_down_binary_predicate(
-        VectorizedFnCall* fn_call, VExprContext* expr_ctx, StringRef* 
constant_val,
-        int* slot_ref_child, const std::function<bool(const std::string&)>& 
fn_checker) {
-    if (!fn_checker(fn_call->fn().name.function_name)) {
-        return false;
-    }
-
-    const auto& children = fn_call->children();
-    DCHECK(children.size() == 2);
-    for (size_t i = 0; i < children.size(); i++) {
-        if (VExpr::expr_without_cast(children[i])->node_type() != 
TExprNodeType::SLOT_REF) {
-            // not a slot ref(column)
-            continue;
-        }
-        if (!children[1 - i]->is_constant()) {
-            // only handle constant value
-            return false;
-        } else {
-            if (const ColumnConst* const_column = 
check_and_get_column<ColumnConst>(
-                        children[1 - i]->get_const_col(expr_ctx)->column_ptr)) 
{
-                *slot_ref_child = i;
-                *constant_val = const_column->get_data_at(0);
-            } else {
-                return false;
-            }
-        }
-    }
-    return true;
-}
-
-bool NewOlapScanNode::_should_push_down_in_predicate(VInPredicate* pred, 
VExprContext* expr_ctx,
-                                                     bool is_not_in) {
-    if (pred->is_not_in() != is_not_in) {
-        return false;
-    }
-    InState* state = reinterpret_cast<InState*>(
-            expr_ctx->fn_context(pred->fn_context_index())
-                    ->get_function_state(FunctionContext::FRAGMENT_LOCAL));
-    HybridSetBase* set = state->hybrid_set.get();
-
-    // if there are too many elements in InPredicate, exceed the limit,
-    // we will not push any condition of this column to storage engine.
-    // because too many conditions pushed down to storage engine may even
-    // slow down the query process.
-    // ATTN: This is just an experience value. You may need to try
-    // different thresholds to improve performance.
-    if (set->size() > _max_pushdown_conditions_per_column) {
-        VLOG_NOTICE << "Predicate value num " << set->size() << " exceed limit 
"
-                    << _max_pushdown_conditions_per_column;
-        return false;
-    }
-    return true;
-}
-
-bool NewOlapScanNode::_should_push_down_function_filter(VectorizedFnCall* 
fn_call,
-                                                        VExprContext* expr_ctx,
-                                                        StringVal* 
constant_str,
-                                                        
doris_udf::FunctionContext** fn_ctx) {
+VScanNode::PushDownType NewOlapScanNode::_should_push_down_function_filter(
+        VectorizedFnCall* fn_call, VExprContext* expr_ctx, StringVal* 
constant_str,
+        doris_udf::FunctionContext** fn_ctx) {
     // Now only `like` function filters is supported to push down
     if (fn_call->fn().name.function_name != "like") {
-        return false;
+        return PushDownType::UNACCEPTABLE;
     }
 
     const auto& children = fn_call->children();
@@ -262,19 +213,19 @@ bool 
NewOlapScanNode::_should_push_down_function_filter(VectorizedFnCall* fn_cal
         }
         if (!children[1 - i]->is_constant()) {
             // only handle constant value
-            return false;
+            return PushDownType::UNACCEPTABLE;
         } else {
             DCHECK(children[1 - i]->type().is_string_type());
             if (const ColumnConst* const_column = 
check_and_get_column<ColumnConst>(
                         children[1 - i]->get_const_col(expr_ctx)->column_ptr)) 
{
                 *constant_str = const_column->get_data_at(0).to_string_val();
             } else {
-                return false;
+                return PushDownType::UNACCEPTABLE;
             }
         }
     }
     *fn_ctx = func_cxt;
-    return true;
+    return PushDownType::ACCEPTABLE;
 }
 
 // PlanFragmentExecutor will call this method to set scan range
diff --git a/be/src/vec/exec/scan/new_olap_scan_node.h 
b/be/src/vec/exec/scan/new_olap_scan_node.h
index a721737585..a922b009b9 100644
--- a/be/src/vec/exec/scan/new_olap_scan_node.h
+++ b/be/src/vec/exec/scan/new_olap_scan_node.h
@@ -38,17 +38,13 @@ protected:
     Status _process_conjuncts() override;
     bool _is_key_column(const std::string& col_name) override;
 
-    bool _should_push_down_binary_predicate(
-            VectorizedFnCall* fn_call, VExprContext* expr_ctx, StringRef* 
constant_val,
-            int* slot_ref_child,
-            const std::function<bool(const std::string&)>& fn_checker) 
override;
+    PushDownType _should_push_down_function_filter(VectorizedFnCall* fn_call,
+                                                   VExprContext* expr_ctx, 
StringVal* constant_str,
+                                                   
doris_udf::FunctionContext** fn_ctx) override;
 
-    bool _should_push_down_in_predicate(VInPredicate* in_pred, VExprContext* 
expr_ctx,
-                                        bool is_not_in) override;
+    PushDownType _should_push_down_bloom_filter() override { return 
PushDownType::ACCEPTABLE; }
 
-    bool _should_push_down_function_filter(VectorizedFnCall* fn_call, 
VExprContext* expr_ctx,
-                                           StringVal* constant_str,
-                                           doris_udf::FunctionContext** 
fn_ctx) override;
+    PushDownType _should_push_down_is_null_predicate() override { return 
PushDownType::ACCEPTABLE; }
 
     Status _init_scanners(std::list<VScanner*>* scanners) override;
 
diff --git a/be/src/vec/exec/scan/vscan_node.cpp 
b/be/src/vec/exec/scan/vscan_node.cpp
index 0dd48d58b4..23661f7c5b 100644
--- a/be/src/vec/exec/scan/vscan_node.cpp
+++ b/be/src/vec/exec/scan/vscan_node.cpp
@@ -30,11 +30,11 @@
 
 namespace doris::vectorized {
 
-#define RETURN_IF_PUSH_DOWN(stmt) \
-    if (!push_down) {             \
-        stmt;                     \
-    } else {                      \
-        return;                   \
+#define RETURN_IF_PUSH_DOWN(stmt)            \
+    if (pdt == PushDownType::UNACCEPTABLE) { \
+        stmt;                                \
+    } else {                                 \
+        return;                              \
     }
 
 static bool ignore_cast(SlotDescriptor* slot, VExpr* expr) {
@@ -403,43 +403,44 @@ VExpr* VScanNode::_normalize_predicate(VExpr* 
conjunct_expr_root) {
     if (conjunct_expr_root != nullptr) {
         if (is_leaf(conjunct_expr_root)) {
             auto impl = conjunct_expr_root->get_impl();
+            // If impl is not null, which means this a conjuncts from runtime 
filter.
             VExpr* cur_expr = impl ? const_cast<VExpr*>(impl) : 
conjunct_expr_root;
             SlotDescriptor* slot;
             ColumnValueRangeType* range = nullptr;
-            bool push_down = false;
-            _eval_const_conjuncts(cur_expr, *(_vconjunct_ctx_ptr.get()), 
&push_down);
-            if (!push_down &&
+            PushDownType pdt = PushDownType::UNACCEPTABLE;
+            _eval_const_conjuncts(cur_expr, *(_vconjunct_ctx_ptr.get()), &pdt);
+            if (pdt == PushDownType::UNACCEPTABLE &&
                 (_is_predicate_acting_on_slot(cur_expr, in_predicate_checker, 
&slot, &range) ||
                  _is_predicate_acting_on_slot(cur_expr, eq_predicate_checker, 
&slot, &range))) {
                 std::visit(
                         [&](auto& value_range) {
                             RETURN_IF_PUSH_DOWN(_normalize_in_and_eq_predicate(
                                     cur_expr, *(_vconjunct_ctx_ptr.get()), 
slot, value_range,
-                                    &push_down));
+                                    &pdt));
                             
RETURN_IF_PUSH_DOWN(_normalize_not_in_and_not_eq_predicate(
                                     cur_expr, *(_vconjunct_ctx_ptr.get()), 
slot, value_range,
-                                    &push_down));
+                                    &pdt));
                             RETURN_IF_PUSH_DOWN(_normalize_is_null_predicate(
                                     cur_expr, *(_vconjunct_ctx_ptr.get()), 
slot, value_range,
-                                    &push_down));
+                                    &pdt));
                             
RETURN_IF_PUSH_DOWN(_normalize_noneq_binary_predicate(
                                     cur_expr, *(_vconjunct_ctx_ptr.get()), 
slot, value_range,
-                                    &push_down));
+                                    &pdt));
                             if (_is_key_column(slot->col_name())) {
                                 RETURN_IF_PUSH_DOWN(_normalize_bloom_filter(
-                                        cur_expr, *(_vconjunct_ctx_ptr.get()), 
slot, &push_down));
+                                        cur_expr, *(_vconjunct_ctx_ptr.get()), 
slot, &pdt));
                                 if (_state->enable_function_pushdown()) {
                                     
RETURN_IF_PUSH_DOWN(_normalize_function_filters(
-                                            cur_expr, 
*(_vconjunct_ctx_ptr.get()), slot,
-                                            &push_down));
+                                            cur_expr, 
*(_vconjunct_ctx_ptr.get()), slot, &pdt));
                                 }
                             }
                         },
                         *range);
             }
-            if (push_down && _is_key_column(slot->col_name())) {
+            if (pdt == PushDownType::ACCEPTABLE && 
_is_key_column(slot->col_name())) {
                 return nullptr;
             } else {
+                // for PARTIAL_ACCEPTABLE and UNACCEPTABLE, do not remove expr 
from the tree
                 return conjunct_expr_root;
             }
         } else {
@@ -464,17 +465,20 @@ VExpr* VScanNode::_normalize_predicate(VExpr* 
conjunct_expr_root) {
 }
 
 Status VScanNode::_normalize_bloom_filter(VExpr* expr, VExprContext* expr_ctx, 
SlotDescriptor* slot,
-                                          bool* push_down) {
+                                          PushDownType* pdt) {
     if (TExprNodeType::BLOOM_PRED == expr->node_type()) {
         DCHECK(expr->children().size() == 1);
-        _bloom_filters_push_down.emplace_back(slot->col_name(), 
expr->get_bloom_filter_func());
-        *push_down = true;
+        PushDownType temp_pdt = _should_push_down_bloom_filter();
+        if (temp_pdt != PushDownType::UNACCEPTABLE) {
+            _bloom_filters_push_down.emplace_back(slot->col_name(), 
expr->get_bloom_filter_func());
+            *pdt = temp_pdt;
+        }
     }
     return Status::OK();
 }
 
 Status VScanNode::_normalize_function_filters(VExpr* expr, VExprContext* 
expr_ctx,
-                                              SlotDescriptor* slot, bool* 
push_down) {
+                                              SlotDescriptor* slot, 
PushDownType* pdt) {
     bool opposite = false;
     VExpr* fn_expr = expr;
     if (TExprNodeType::COMPOUND_PRED == expr->node_type() &&
@@ -486,11 +490,12 @@ Status VScanNode::_normalize_function_filters(VExpr* 
expr, VExprContext* expr_ct
     if (TExprNodeType::FUNCTION_CALL == fn_expr->node_type()) {
         doris_udf::FunctionContext* fn_ctx = nullptr;
         StringVal val;
-        if 
(_should_push_down_function_filter(reinterpret_cast<VectorizedFnCall*>(fn_expr),
-                                              expr_ctx, &val, &fn_ctx)) {
+        PushDownType temp_pdt = _should_push_down_function_filter(
+                reinterpret_cast<VectorizedFnCall*>(fn_expr), expr_ctx, &val, 
&fn_ctx);
+        if (temp_pdt != PushDownType::UNACCEPTABLE) {
             std::string col = slot->col_name();
             _push_down_functions.emplace_back(opposite, col, fn_ctx, val);
-            *push_down = true;
+            *pdt = temp_pdt;
         }
     }
     return Status::OK();
@@ -523,14 +528,14 @@ bool VScanNode::_is_predicate_acting_on_slot(
     return true;
 }
 
-void VScanNode::_eval_const_conjuncts(VExpr* vexpr, VExprContext* expr_ctx, 
bool* push_down) {
+void VScanNode::_eval_const_conjuncts(VExpr* vexpr, VExprContext* expr_ctx, 
PushDownType* pdt) {
     char* constant_val = nullptr;
     if (vexpr->is_constant()) {
         if (const ColumnConst* const_column =
                     
check_and_get_column<ColumnConst>(vexpr->get_const_col(expr_ctx)->column_ptr)) {
             constant_val = 
const_cast<char*>(const_column->get_data_at(0).data);
             if (constant_val == nullptr || 
*reinterpret_cast<bool*>(constant_val) == false) {
-                *push_down = true;
+                *pdt = PushDownType::ACCEPTABLE;
                 _eos = true;
             }
         } else {
@@ -543,14 +548,14 @@ void VScanNode::_eval_const_conjuncts(VExpr* vexpr, 
VExprContext* expr_ctx, bool
 template <PrimitiveType T>
 Status VScanNode::_normalize_in_and_eq_predicate(VExpr* expr, VExprContext* 
expr_ctx,
                                                  SlotDescriptor* slot, 
ColumnValueRange<T>& range,
-                                                 bool* push_down) {
+                                                 PushDownType* pdt) {
     auto temp_range = 
ColumnValueRange<T>::create_empty_column_value_range(slot->type().precision,
                                                                            
slot->type().scale);
-    bool effect = false;
     // 1. Normalize in conjuncts like 'where col in (v1, v2, v3)'
     if (TExprNodeType::IN_PRED == expr->node_type()) {
         VInPredicate* pred = static_cast<VInPredicate*>(expr);
-        if (!_should_push_down_in_predicate(pred, expr_ctx, false)) {
+        PushDownType temp_pdt = _should_push_down_in_predicate(pred, expr_ctx, 
false);
+        if (temp_pdt == PushDownType::UNACCEPTABLE) {
             return Status::OK();
         }
 
@@ -573,43 +578,45 @@ Status VScanNode::_normalize_in_and_eq_predicate(VExpr* 
expr, VExprContext* expr
                                                       fn_name, 
!state->hybrid_set->is_date_v2()));
             iter->next();
         }
-
         range.intersection(temp_range);
-        effect = true;
+        *pdt = temp_pdt;
     } else if (TExprNodeType::BINARY_PRED == expr->node_type()) {
         DCHECK(expr->children().size() == 2);
         auto eq_checker = [](const std::string& fn_name) { return fn_name == 
"eq"; };
 
         StringRef value;
         int slot_ref_child = -1;
-        if 
(_should_push_down_binary_predicate(reinterpret_cast<VectorizedFnCall*>(expr), 
expr_ctx,
-                                               &value, &slot_ref_child, 
eq_checker)) {
-            DCHECK(slot_ref_child >= 0);
-            // where A = nullptr should return empty result set
-            auto fn_name = std::string("");
-            if (value.data != nullptr) {
-                if constexpr (T == TYPE_CHAR || T == TYPE_VARCHAR || T == 
TYPE_STRING ||
-                              T == TYPE_HLL) {
-                    auto val = StringValue(value.data, value.size);
-                    RETURN_IF_ERROR(_change_value_range<true>(
-                            temp_range, reinterpret_cast<void*>(&val),
-                            ColumnValueRange<T>::add_fixed_value_range, 
fn_name));
-                } else {
-                    RETURN_IF_ERROR(_change_value_range<true>(
-                            temp_range, 
reinterpret_cast<void*>(const_cast<char*>(value.data)),
-                            ColumnValueRange<T>::add_fixed_value_range, 
fn_name));
-                }
-                range.intersection(temp_range);
-                effect = true;
+
+        PushDownType temp_pdt =
+                
_should_push_down_binary_predicate(reinterpret_cast<VectorizedFnCall*>(expr),
+                                                   expr_ctx, &value, 
&slot_ref_child, eq_checker);
+        if (temp_pdt == PushDownType::UNACCEPTABLE) {
+            return Status::OK();
+        }
+        DCHECK(slot_ref_child >= 0);
+        // where A = nullptr should return empty result set
+        auto fn_name = std::string("");
+        if (value.data != nullptr) {
+            if constexpr (T == TYPE_CHAR || T == TYPE_VARCHAR || T == 
TYPE_STRING ||
+                          T == TYPE_HLL) {
+                auto val = StringValue(value.data, value.size);
+                RETURN_IF_ERROR(_change_value_range<true>(
+                        temp_range, reinterpret_cast<void*>(&val),
+                        ColumnValueRange<T>::add_fixed_value_range, fn_name));
+            } else {
+                RETURN_IF_ERROR(_change_value_range<true>(
+                        temp_range, 
reinterpret_cast<void*>(const_cast<char*>(value.data)),
+                        ColumnValueRange<T>::add_fixed_value_range, fn_name));
             }
+            range.intersection(temp_range);
         }
+        *pdt = temp_pdt;
     }
 
     // exceed limit, no conditions will be pushed down to storage engine.
     if (range.get_fixed_value_size() > _max_pushdown_conditions_per_column) {
         range.set_whole_value_range();
-    } else {
-        *push_down = effect;
+        *pdt = PushDownType::UNACCEPTABLE;
     }
     return Status::OK();
 }
@@ -618,14 +625,15 @@ template <PrimitiveType T>
 Status VScanNode::_normalize_not_in_and_not_eq_predicate(VExpr* expr, 
VExprContext* expr_ctx,
                                                          SlotDescriptor* slot,
                                                          ColumnValueRange<T>& 
range,
-                                                         bool* push_down) {
+                                                         PushDownType* pdt) {
     bool is_fixed_range = range.is_fixed_value_range();
     auto not_in_range = 
ColumnValueRange<T>::create_empty_column_value_range(range.column_name());
-    bool effect = false;
+    PushDownType temp_pdt = PushDownType::UNACCEPTABLE;
     // 1. Normalize in conjuncts like 'where col in (v1, v2, v3)'
     if (TExprNodeType::IN_PRED == expr->node_type()) {
         VInPredicate* pred = static_cast<VInPredicate*>(expr);
-        if (!_should_push_down_in_predicate(pred, expr_ctx, true)) {
+        if ((temp_pdt = _should_push_down_in_predicate(pred, expr_ctx, true)) 
==
+            PushDownType::UNACCEPTABLE) {
             return Status::OK();
         }
 
@@ -652,55 +660,56 @@ Status 
VScanNode::_normalize_not_in_and_not_eq_predicate(VExpr* expr, VExprConte
             }
             iter->next();
         }
-        effect = true;
     } else if (TExprNodeType::BINARY_PRED == expr->node_type()) {
         DCHECK(expr->children().size() == 2);
 
         auto ne_checker = [](const std::string& fn_name) { return fn_name == 
"ne"; };
         StringRef value;
         int slot_ref_child = -1;
-        if 
(_should_push_down_binary_predicate(reinterpret_cast<VectorizedFnCall*>(expr), 
expr_ctx,
-                                               &value, &slot_ref_child, 
ne_checker)) {
-            DCHECK(slot_ref_child >= 0);
-            // where A = nullptr should return empty result set
-            if (value.data != nullptr) {
-                auto fn_name = std::string("");
-                if constexpr (T == TYPE_CHAR || T == TYPE_VARCHAR || T == 
TYPE_STRING ||
-                              T == TYPE_HLL) {
-                    auto val = StringValue(value.data, value.size);
-                    if (is_fixed_range) {
-                        RETURN_IF_ERROR(_change_value_range<true>(
-                                range, reinterpret_cast<void*>(&val),
-                                ColumnValueRange<T>::remove_fixed_value_range, 
fn_name));
-                    } else {
-                        RETURN_IF_ERROR(_change_value_range<true>(
-                                not_in_range, reinterpret_cast<void*>(&val),
-                                ColumnValueRange<T>::add_fixed_value_range, 
fn_name));
-                    }
+        if ((temp_pdt = _should_push_down_binary_predicate(
+                     reinterpret_cast<VectorizedFnCall*>(expr), expr_ctx, 
&value, &slot_ref_child,
+                     ne_checker)) == PushDownType::UNACCEPTABLE) {
+            return Status::OK();
+        }
+
+        DCHECK(slot_ref_child >= 0);
+        // where A = nullptr should return empty result set
+        if (value.data != nullptr) {
+            auto fn_name = std::string("");
+            if constexpr (T == TYPE_CHAR || T == TYPE_VARCHAR || T == 
TYPE_STRING ||
+                          T == TYPE_HLL) {
+                auto val = StringValue(value.data, value.size);
+                if (is_fixed_range) {
+                    RETURN_IF_ERROR(_change_value_range<true>(
+                            range, reinterpret_cast<void*>(&val),
+                            ColumnValueRange<T>::remove_fixed_value_range, 
fn_name));
                 } else {
-                    if (is_fixed_range) {
-                        RETURN_IF_ERROR(_change_value_range<true>(
-                                range, 
reinterpret_cast<void*>(const_cast<char*>(value.data)),
-                                ColumnValueRange<T>::remove_fixed_value_range, 
fn_name));
-                    } else {
-                        RETURN_IF_ERROR(_change_value_range<true>(
-                                not_in_range,
-                                
reinterpret_cast<void*>(const_cast<char*>(value.data)),
-                                ColumnValueRange<T>::add_fixed_value_range, 
fn_name));
-                    }
+                    RETURN_IF_ERROR(_change_value_range<true>(
+                            not_in_range, reinterpret_cast<void*>(&val),
+                            ColumnValueRange<T>::add_fixed_value_range, 
fn_name));
+                }
+            } else {
+                if (is_fixed_range) {
+                    RETURN_IF_ERROR(_change_value_range<true>(
+                            range, 
reinterpret_cast<void*>(const_cast<char*>(value.data)),
+                            ColumnValueRange<T>::remove_fixed_value_range, 
fn_name));
+                } else {
+                    RETURN_IF_ERROR(_change_value_range<true>(
+                            not_in_range, 
reinterpret_cast<void*>(const_cast<char*>(value.data)),
+                            ColumnValueRange<T>::add_fixed_value_range, 
fn_name));
                 }
-                effect = true;
             }
         }
+    } else {
+        return Status::OK();
     }
 
     if (is_fixed_range ||
         not_in_range.get_fixed_value_size() <= 
_max_pushdown_conditions_per_column) {
         if (!is_fixed_range) {
-            // push down not in condition to storage engine
-            not_in_range.to_in_condition(_olap_filters, false);
+            _not_in_value_ranges.push_back(not_in_range);
         }
-        *push_down = effect;
+        *pdt = temp_pdt;
     }
     return Status::OK();
 }
@@ -708,21 +717,26 @@ Status 
VScanNode::_normalize_not_in_and_not_eq_predicate(VExpr* expr, VExprConte
 template <PrimitiveType T>
 Status VScanNode::_normalize_is_null_predicate(VExpr* expr, VExprContext* 
expr_ctx,
                                                SlotDescriptor* slot, 
ColumnValueRange<T>& range,
-                                               bool* push_down) {
+                                               PushDownType* pdt) {
+    PushDownType temp_pdt = _should_push_down_is_null_predicate();
+    if (temp_pdt == PushDownType::UNACCEPTABLE) {
+        return Status::OK();
+    }
+
     if (TExprNodeType::FUNCTION_CALL == expr->node_type()) {
         if (reinterpret_cast<VectorizedFnCall*>(expr)->fn().name.function_name 
== "is_null_pred") {
             auto temp_range = 
ColumnValueRange<T>::create_empty_column_value_range(
                     slot->type().precision, slot->type().scale);
             temp_range.set_contain_null(true);
             range.intersection(temp_range);
-            *push_down = true;
+            *pdt = temp_pdt;
         } else if 
(reinterpret_cast<VectorizedFnCall*>(expr)->fn().name.function_name ==
                    "is_not_null_pred") {
             auto temp_range = 
ColumnValueRange<T>::create_empty_column_value_range(
                     slot->type().precision, slot->type().scale);
             temp_range.set_contain_null(false);
             range.intersection(temp_range);
-            *push_down = true;
+            *pdt = temp_pdt;
         }
     }
     return Status::OK();
@@ -731,7 +745,7 @@ Status VScanNode::_normalize_is_null_predicate(VExpr* expr, 
VExprContext* expr_c
 template <PrimitiveType T>
 Status VScanNode::_normalize_noneq_binary_predicate(VExpr* expr, VExprContext* 
expr_ctx,
                                                     SlotDescriptor* slot,
-                                                    ColumnValueRange<T>& 
range, bool* push_down) {
+                                                    ColumnValueRange<T>& 
range, PushDownType* pdt) {
     if (TExprNodeType::BINARY_PRED == expr->node_type()) {
         DCHECK(expr->children().size() == 2);
 
@@ -740,15 +754,16 @@ Status 
VScanNode::_normalize_noneq_binary_predicate(VExpr* expr, VExprContext* e
         };
         StringRef value;
         int slot_ref_child = -1;
-        if 
(_should_push_down_binary_predicate(reinterpret_cast<VectorizedFnCall*>(expr), 
expr_ctx,
-                                               &value, &slot_ref_child, 
noneq_checker)) {
+        PushDownType temp_pdt = _should_push_down_binary_predicate(
+                reinterpret_cast<VectorizedFnCall*>(expr), expr_ctx, &value, 
&slot_ref_child,
+                noneq_checker);
+        if (temp_pdt != PushDownType::UNACCEPTABLE) {
             DCHECK(slot_ref_child >= 0);
             const std::string& fn_name =
                     
reinterpret_cast<VectorizedFnCall*>(expr)->fn().name.function_name;
 
             // where A = nullptr should return empty result set
             if (value.data != nullptr) {
-                *push_down = true;
                 if constexpr (T == TYPE_CHAR || T == TYPE_VARCHAR || T == 
TYPE_STRING ||
                               T == TYPE_HLL) {
                     auto val = StringValue(value.data, value.size);
@@ -760,6 +775,7 @@ Status VScanNode::_normalize_noneq_binary_predicate(VExpr* 
expr, VExprContext* e
                             range, 
reinterpret_cast<void*>(const_cast<char*>(value.data)),
                             ColumnValueRange<T>::add_value_range, fn_name, 
true, slot_ref_child));
                 }
+                *pdt = temp_pdt;
             }
         }
     }
@@ -906,4 +922,59 @@ Status VScanNode::clone_vconjunct_ctx(VExprContext** 
_vconjunct_ctx) {
     return Status::OK();
 }
 
+VScanNode::PushDownType VScanNode::_should_push_down_binary_predicate(
+        VectorizedFnCall* fn_call, VExprContext* expr_ctx, StringRef* 
constant_val,
+        int* slot_ref_child, const std::function<bool(const std::string&)>& 
fn_checker) {
+    if (!fn_checker(fn_call->fn().name.function_name)) {
+        return PushDownType::UNACCEPTABLE;
+    }
+
+    const auto& children = fn_call->children();
+    DCHECK(children.size() == 2);
+    for (size_t i = 0; i < children.size(); i++) {
+        if (VExpr::expr_without_cast(children[i])->node_type() != 
TExprNodeType::SLOT_REF) {
+            // not a slot ref(column)
+            continue;
+        }
+        if (!children[1 - i]->is_constant()) {
+            // only handle constant value
+            return PushDownType::UNACCEPTABLE;
+        } else {
+            if (const ColumnConst* const_column = 
check_and_get_column<ColumnConst>(
+                        children[1 - i]->get_const_col(expr_ctx)->column_ptr)) 
{
+                *slot_ref_child = i;
+                *constant_val = const_column->get_data_at(0);
+            } else {
+                return PushDownType::UNACCEPTABLE;
+            }
+        }
+    }
+    return PushDownType::ACCEPTABLE;
+}
+
+VScanNode::PushDownType 
VScanNode::_should_push_down_in_predicate(VInPredicate* pred,
+                                                                  
VExprContext* expr_ctx,
+                                                                  bool 
is_not_in) {
+    if (pred->is_not_in() != is_not_in) {
+        return PushDownType::UNACCEPTABLE;
+    }
+    InState* state = reinterpret_cast<InState*>(
+            expr_ctx->fn_context(pred->fn_context_index())
+                    ->get_function_state(FunctionContext::FRAGMENT_LOCAL));
+    HybridSetBase* set = state->hybrid_set.get();
+
+    // if there are too many elements in InPredicate, exceed the limit,
+    // we will not push any condition of this column to storage engine.
+    // because too many conditions pushed down to storage engine may even
+    // slow down the query process.
+    // ATTN: This is just an experience value. You may need to try
+    // different thresholds to improve performance.
+    if (set->size() > _max_pushdown_conditions_per_column) {
+        VLOG_NOTICE << "Predicate value num " << set->size() << " exceed limit 
"
+                    << _max_pushdown_conditions_per_column;
+        return PushDownType::UNACCEPTABLE;
+    }
+    return PushDownType::ACCEPTABLE;
+}
+
 } // namespace doris::vectorized
diff --git a/be/src/vec/exec/scan/vscan_node.h 
b/be/src/vec/exec/scan/vscan_node.h
index a37074948a..33386ffdab 100644
--- a/be/src/vec/exec/scan/vscan_node.h
+++ b/be/src/vec/exec/scan/vscan_node.h
@@ -73,6 +73,17 @@ public:
     const TupleDescriptor* input_tuple_desc() const { return 
_input_tuple_desc; }
     const TupleDescriptor* output_tuple_desc() const { return 
_output_tuple_desc; }
 
+    enum class PushDownType { // Answer returned by the _should_push_down_*() hooks
+        // The predicate can not be pushed down to the data source.
+        UNACCEPTABLE,
+        // The predicate can be pushed down to the data source,
+        // and the data source can fully evaluate it.
+        ACCEPTABLE,
+        // The predicate can be pushed down to the data source,
+        // but the data source can not fully evaluate it.
+        PARTIAL_ACCEPTABLE
+    };
+
 protected:
     // Different data sources register different profiles by implementing this 
method
     virtual Status _init_profile();
@@ -105,21 +116,24 @@ protected:
     //      2. in/not in predicate
     //      3. function predicate
     //  TODO: these interfaces should be change to become more common.
-    virtual bool _should_push_down_binary_predicate(
+    virtual PushDownType _should_push_down_binary_predicate(
             VectorizedFnCall* fn_call, VExprContext* expr_ctx, StringRef* 
constant_val,
-            int* slot_ref_child, const std::function<bool(const 
std::string&)>& fn_checker) {
-        return false;
-    }
+            int* slot_ref_child, const std::function<bool(const 
std::string&)>& fn_checker);
 
-    virtual bool _should_push_down_in_predicate(VInPredicate* in_pred, 
VExprContext* expr_ctx,
-                                                bool is_not_in) {
-        return false;
+    virtual PushDownType _should_push_down_in_predicate(VInPredicate* in_pred,
+                                                        VExprContext* 
expr_ctx, bool is_not_in);
+
+    virtual PushDownType _should_push_down_function_filter(VectorizedFnCall* 
fn_call,
+                                                           VExprContext* 
expr_ctx,
+                                                           StringVal* 
constant_str,
+                                                           
doris_udf::FunctionContext** fn_ctx) {
+        return PushDownType::UNACCEPTABLE;
     }
 
-    virtual bool _should_push_down_function_filter(VectorizedFnCall* fn_call,
-                                                   VExprContext* expr_ctx, 
StringVal* constant_str,
-                                                   
doris_udf::FunctionContext** fn_ctx) {
-        return false;
+    virtual PushDownType _should_push_down_bloom_filter() { return 
PushDownType::UNACCEPTABLE; }
+
+    virtual PushDownType _should_push_down_is_null_predicate() {
+        return PushDownType::UNACCEPTABLE;
     }
 
     // Return true if it is a key column.
@@ -175,11 +189,18 @@ protected:
     std::vector<FunctionFilter> _push_down_functions;
 
     // slot id -> ColumnValueRange
-    // Parsed from conjunts
+    // Parsed from conjuncts
     phmap::flat_hash_map<int, std::pair<SlotDescriptor*, ColumnValueRangeType>>
             _slot_id_to_value_range;
     // column -> ColumnValueRange
     std::map<std::string, ColumnValueRangeType> _colname_to_value_range;
+    // We use _colname_to_value_range to store a column and its corresponding 
value ranges.
+    // But if a col has a value range, e.g. 1 < col < 10, which is 
"!is_fixed_range",
+    // in this case we can not merge "1 < col < 10" with "col not in (2)".
+    // So we have to save "col not in (2)" to another structure: 
"_not_in_value_ranges".
+    // When the data source tries to use the value ranges, it should use both 
ranges in
+    // "_colname_to_value_range" and in "_not_in_value_ranges".
+    std::vector<ColumnValueRangeType> _not_in_value_ranges;
 
     bool _need_agg_finalize = true;
 
@@ -233,13 +254,13 @@ private:
 
     Status _normalize_conjuncts();
     VExpr* _normalize_predicate(VExpr* conjunct_expr_root);
-    void _eval_const_conjuncts(VExpr* vexpr, VExprContext* expr_ctx, bool* 
push_down);
+    void _eval_const_conjuncts(VExpr* vexpr, VExprContext* expr_ctx, 
PushDownType* pdt);
 
     Status _normalize_bloom_filter(VExpr* expr, VExprContext* expr_ctx, 
SlotDescriptor* slot,
-                                   bool* push_down);
+                                   PushDownType* pdt);
 
     Status _normalize_function_filters(VExpr* expr, VExprContext* expr_ctx, 
SlotDescriptor* slot,
-                                       bool* push_down);
+                                       PushDownType* pdt);
 
     bool _is_predicate_acting_on_slot(VExpr* expr,
                                       const std::function<bool(const 
std::vector<VExpr*>&,
@@ -249,21 +270,21 @@ private:
     template <PrimitiveType T>
     Status _normalize_in_and_eq_predicate(vectorized::VExpr* expr, 
VExprContext* expr_ctx,
                                           SlotDescriptor* slot, 
ColumnValueRange<T>& range,
-                                          bool* push_down);
+                                          PushDownType* pdt);
     template <PrimitiveType T>
     Status _normalize_not_in_and_not_eq_predicate(vectorized::VExpr* expr, 
VExprContext* expr_ctx,
                                                   SlotDescriptor* slot, 
ColumnValueRange<T>& range,
-                                                  bool* push_down);
+                                                  PushDownType* pdt);
 
     template <PrimitiveType T>
     Status _normalize_noneq_binary_predicate(vectorized::VExpr* expr, 
VExprContext* expr_ctx,
                                              SlotDescriptor* slot, 
ColumnValueRange<T>& range,
-                                             bool* push_down);
+                                             PushDownType* pdt);
 
     template <PrimitiveType T>
     Status _normalize_is_null_predicate(vectorized::VExpr* expr, VExprContext* 
expr_ctx,
                                         SlotDescriptor* slot, 
ColumnValueRange<T>& range,
-                                        bool* push_down);
+                                        PushDownType* pdt);
 
     template <bool IsFixed, PrimitiveType PrimitiveType, typename 
ChangeFixedValueRangeFunc>
     static Status _change_value_range(ColumnValueRange<PrimitiveType>& range, 
void* value,


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to