This is an automated email from the ASF dual-hosted git repository.
panxiaolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new d5e95188554 [Bug](scan) Preserve IN_LIST runtime filter predicates
when key range is a scope range (#62027)
d5e95188554 is described below
commit d5e951885544bcde7a08e49b336395677a304876
Author: Pxl <[email protected]>
AuthorDate: Fri Apr 3 11:48:10 2026 +0800
[Bug](scan) Preserve IN_LIST runtime filter predicates when key range is a
scope range (#62027)
This pull request addresses a bug in the OLAP scan operator where
`IN_LIST` predicates could be incorrectly erased when both `MINMAX` and
`IN` runtime filters targeted the same key column, and the number of
`IN` values exceeded the maximum allowed for pushdown. The changes
ensure that `IN_LIST` predicates are preserved in such cases, preventing
incorrect query results. Additionally, a regression test is added to
verify the fix.
**Bug fix in predicate handling:**
* Modified the logic in `_build_key_ranges_and_filters()` within
`olap_scan_operator.cpp` to ensure that `IN_LIST` predicates are not
erased when the key range is a scope range (e.g., `>= X AND <= Y`) and
the `IN` filter's value count exceeds
`max_pushdown_conditions_per_column`. This preserves filtering semantics
that are not captured by the scope range.
[[1]](diffhunk://#diff-3ddc75656071d9c0e6b0be450e152a1c94559f7e70ea820e7f0c80a7078e3292R972)
[[2]](diffhunk://#diff-3ddc75656071d9c0e6b0be450e152a1c94559f7e70ea820e7f0c80a7078e3292R986)
[[3]](diffhunk://#diff-3ddc75656071d9c0e6b0be450e152a1c94559f7e70ea820e7f0c80a7078e3292L986-R1013)
* Enhanced the profiling output in `_process_conjuncts()` to accurately
reflect the set of predicates that will reach the storage layer after
key range and filter construction. This helps with debugging and
verification of predicate pushdown.
**Testing and regression coverage:**
* Added a new regression test
`test_rf_in_list_not_erased_by_scope_range.groovy` to verify that
`IN_LIST` predicates are not incorrectly erased when both `MINMAX` and
`IN` filters are present and the `IN` list is too large to be absorbed
into the key range.
* Added the corresponding expected output file for the new regression
test.
---
be/src/exec/operator/olap_scan_operator.cpp | 44 ++++++++++-
be/src/exec/operator/scan_operator.cpp | 41 +++++-----
be/src/storage/predicate/column_predicate.h | 3 +-
be/src/storage/predicate/comparison_predicate.h | 6 --
be/src/storage/predicate/in_list_predicate.h | 7 --
be/src/storage/predicate/null_predicate.h | 1 -
.../test_rf_in_list_not_erased_by_scope_range.out | 9 +++
...est_rf_in_list_not_erased_by_scope_range.groovy | 91 ++++++++++++++++++++++
8 files changed, 166 insertions(+), 36 deletions(-)
diff --git a/be/src/exec/operator/olap_scan_operator.cpp
b/be/src/exec/operator/olap_scan_operator.cpp
index 45c4728ba57..ccdd82bc5f1 100644
--- a/be/src/exec/operator/olap_scan_operator.cpp
+++ b/be/src/exec/operator/olap_scan_operator.cpp
@@ -951,6 +951,7 @@ Status OlapScanLocalState::_build_key_ranges_and_filters() {
const auto& value_range = iter->second;
std::optional<int> key_to_erase;
+ bool is_fixed_value_range = false;
RETURN_IF_ERROR(std::visit(
[&](auto&& range) {
@@ -964,6 +965,7 @@ Status OlapScanLocalState::_build_key_ranges_and_filters() {
&exact_range,
&eos, &should_break));
if (exact_range) {
key_to_erase = iter->first;
+ is_fixed_value_range =
range.is_fixed_value_range();
}
} else {
// if exceed max_pushdown_conditions_per_column,
use whole_value_rang instead
@@ -981,9 +983,49 @@ Status OlapScanLocalState::_build_key_ranges_and_filters()
{
if (key_to_erase.has_value()) {
_slot_id_to_value_range.erase(*key_to_erase);
+ // Determine which predicates are subsumed by the scan key
range and can
+ // be removed. The rule depends on the ColumnValueRange type:
+ //
+ // Fixed value range → scan keys are exact point lookups,
so both
+ // comparison (EQ/LT/LE/GT/GE) and
positive IN_LIST
+ // predicates are fully captured and
can be erased.
+ //
+ // Scope range → scan keys only capture [low, high]
boundaries,
+ // so only comparison predicates are
subsumed.
+ // IN_LIST predicates (whose values
may NOT have been
+ // absorbed into the ColumnValueRange,
e.g., because
+ // the value count exceeded
max_pushdown_conditions_per_column)
+ // must be preserved.
+ //
+ // In either case, predicates with negation semantics
(effective NE / NOT_IN_LIST)
+ // are never subsumed by scan key ranges and must always be
preserved.
+ auto can_erase_predicate = [is_fixed_value_range](const
ColumnPredicate& pred) {
+ PredicateType pt = pred.type();
+ bool opposite = pred.opposite();
+
+ // Effective NE: never subsumed by any scan key range.
+ if ((pt == PredicateType::NE && !opposite) ||
+ (pt == PredicateType::EQ && opposite)) {
+ return false;
+ }
+ // Comparison predicates (EQ/LT/LE/GT/GE) or
IS_NULL/IS_NOT_NULL: subsumed by both
+ // fixed value and scope ranges.
+ if (PredicateTypeTraits::is_comparison(pt) || pt ==
PredicateType::IS_NULL ||
+ pt == PredicateType::IS_NOT_NULL) {
+ return true;
+ }
+ // Effective IN_LIST: only subsumed by fixed value range.
+ if ((pt == PredicateType::IN_LIST && !opposite) ||
+ (pt == PredicateType::NOT_IN_LIST && opposite)) {
+ return is_fixed_value_range;
+ }
+ // Everything else (BF, BITMAP, NOT_IN_LIST, etc.): keep.
+ return false;
+ };
+
std::vector<std::shared_ptr<ColumnPredicate>> new_predicates;
for (const auto& it : _slot_id_to_predicates[*key_to_erase]) {
- if (!it->could_be_erased()) {
+ if (!can_erase_predicate(*it)) {
new_predicates.push_back(it);
}
}
diff --git a/be/src/exec/operator/scan_operator.cpp
b/be/src/exec/operator/scan_operator.cpp
index 251e4e437ca..ceb0e1254be 100644
--- a/be/src/exec/operator/scan_operator.cpp
+++ b/be/src/exec/operator/scan_operator.cpp
@@ -153,6 +153,23 @@ Status ScanLocalState<Derived>::init(RuntimeState* state,
LocalStateInfo& info)
return Status::OK();
}
+static std::string predicates_to_string(
+ const phmap::flat_hash_map<int,
std::vector<std::shared_ptr<ColumnPredicate>>>&
+ slot_id_to_predicates) {
+ fmt::memory_buffer debug_string_buffer;
+ for (const auto& [slot_id, predicates] : slot_id_to_predicates) {
+ if (predicates.empty()) {
+ continue;
+ }
+ fmt::format_to(debug_string_buffer, "Slot ID: {}: [", slot_id);
+ for (const auto& predicate : predicates) {
+ fmt::format_to(debug_string_buffer, "{{{}}}, ",
predicate->debug_string());
+ }
+ fmt::format_to(debug_string_buffer, "] ");
+ }
+ return fmt::to_string(debug_string_buffer);
+}
+
template <typename Derived>
Status ScanLocalState<Derived>::open(RuntimeState* state) {
SCOPED_TIMER(exec_time_counter());
@@ -193,6 +210,11 @@ Status ScanLocalState<Derived>::open(RuntimeState* state) {
RETURN_IF_ERROR(_process_conjuncts(state));
+ if (state->enable_profile()) {
+ custom_profile()->add_info_string("PushDownPredicates",
+
predicates_to_string(_slot_id_to_predicates));
+ }
+
auto status = _eos ? Status::OK() : _prepare_scanners();
RETURN_IF_ERROR(status);
if (auto ctx = _scanner_ctx.load()) {
@@ -203,23 +225,6 @@ Status ScanLocalState<Derived>::open(RuntimeState* state) {
return status;
}
-static std::string predicates_to_string(
- const phmap::flat_hash_map<int,
std::vector<std::shared_ptr<ColumnPredicate>>>&
- slot_id_to_predicates) {
- fmt::memory_buffer debug_string_buffer;
- for (const auto& [slot_id, predicates] : slot_id_to_predicates) {
- if (predicates.empty()) {
- continue;
- }
- fmt::format_to(debug_string_buffer, "Slot ID: {}: [", slot_id);
- for (const auto& predicate : predicates) {
- fmt::format_to(debug_string_buffer, "{{{}}}, ",
predicate->debug_string());
- }
- fmt::format_to(debug_string_buffer, "] ");
- }
- return fmt::to_string(debug_string_buffer);
-}
-
static void init_slot_value_range(
phmap::flat_hash_map<int, ColumnValueRangeType>&
slot_id_to_value_range,
SlotDescriptor* slot, const DataTypePtr type_desc) {
@@ -325,8 +330,6 @@ Status
ScanLocalState<Derived>::_normalize_conjuncts(RuntimeState* state) {
}
if (state->enable_profile()) {
- custom_profile()->add_info_string("PushDownPredicates",
-
predicates_to_string(_slot_id_to_predicates));
std::string message;
for (auto& conjunct : _conjuncts) {
if (conjunct->root()) {
diff --git a/be/src/storage/predicate/column_predicate.h
b/be/src/storage/predicate/column_predicate.h
index 59b6813ad4e..6ac2c45e93f 100644
--- a/be/src/storage/predicate/column_predicate.h
+++ b/be/src/storage/predicate/column_predicate.h
@@ -226,8 +226,7 @@ public:
}
virtual double get_ignore_threshold() const { return 0; }
- // If this predicate acts on the key column, this predicate should be
erased.
- virtual bool could_be_erased() const { return false; }
+
// Return the size of value set for IN/NOT IN predicates and 0 for others.
virtual std::string debug_string() const {
fmt::memory_buffer debug_string_buffer;
diff --git a/be/src/storage/predicate/comparison_predicate.h
b/be/src/storage/predicate/comparison_predicate.h
index a76ff865530..1a7885a8fa5 100644
--- a/be/src/storage/predicate/comparison_predicate.h
+++ b/be/src/storage/predicate/comparison_predicate.h
@@ -51,12 +51,6 @@ public:
ColumnPredicate::debug_string());
return fmt::to_string(debug_string_buffer);
}
- bool could_be_erased() const override {
- if ((PT == PredicateType::NE && !_opposite) || (PT ==
PredicateType::EQ && _opposite)) {
- return false;
- }
- return true;
- }
PredicateType type() const override { return PT; }
diff --git a/be/src/storage/predicate/in_list_predicate.h
b/be/src/storage/predicate/in_list_predicate.h
index 8aec120bd94..3ed8a3d3f74 100644
--- a/be/src/storage/predicate/in_list_predicate.h
+++ b/be/src/storage/predicate/in_list_predicate.h
@@ -147,13 +147,6 @@ public:
PredicateType type() const override { return PT; }
- bool could_be_erased() const override {
- if ((PT == PredicateType::NOT_IN_LIST && !_opposite) ||
- (PT == PredicateType::IN_LIST && _opposite)) {
- return false;
- }
- return true;
- }
Status evaluate(const IndexFieldNameAndTypePair& name_with_type,
IndexIterator* iterator,
uint32_t num_rows, roaring::Roaring* result) const
override {
if (iterator == nullptr) {
diff --git a/be/src/storage/predicate/null_predicate.h
b/be/src/storage/predicate/null_predicate.h
index 5a9c547a525..8e346208877 100644
--- a/be/src/storage/predicate/null_predicate.h
+++ b/be/src/storage/predicate/null_predicate.h
@@ -58,7 +58,6 @@ public:
ColumnPredicate::debug_string(), _is_null);
return fmt::to_string(debug_string_buffer);
}
- bool could_be_erased() const override { return true; }
PredicateType type() const override;
diff --git
a/regression-test/data/correctness_p0/test_rf_in_list_not_erased_by_scope_range.out
b/regression-test/data/correctness_p0/test_rf_in_list_not_erased_by_scope_range.out
new file mode 100644
index 00000000000..48792fec92c
--- /dev/null
+++
b/regression-test/data/correctness_p0/test_rf_in_list_not_erased_by_scope_range.out
@@ -0,0 +1,9 @@
+-- This file is automatically generated. You should know what you did if you
want to edit this
+-- !join --
+10 10
+12 12
+2 2
+4 4
+6 6
+8 8
+
diff --git
a/regression-test/suites/correctness_p0/test_rf_in_list_not_erased_by_scope_range.groovy
b/regression-test/suites/correctness_p0/test_rf_in_list_not_erased_by_scope_range.groovy
new file mode 100644
index 00000000000..113524d68dd
--- /dev/null
+++
b/regression-test/suites/correctness_p0/test_rf_in_list_not_erased_by_scope_range.groovy
@@ -0,0 +1,91 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This test verifies that when both MINMAX and IN runtime filters target the
same
+// key column, and the IN filter's value count exceeds
max_pushdown_conditions_per_column,
+// the IN_LIST predicate is NOT incorrectly erased by the key range
construction logic.
+// Regression test for the bug where _build_key_ranges_and_filters() erased
IN_LIST
+// predicates when the ColumnValueRange was a scope range (from MINMAX filter).
+suite("test_rf_in_list_not_erased_by_scope_range") {
+ sql "drop table if exists rf_scope_probe;"
+ sql "drop table if exists rf_scope_build;"
+
+ sql """
+ CREATE TABLE rf_scope_probe (
+ k1 BIGINT,
+ v1 INT
+ )
+ DUPLICATE KEY(k1)
+ DISTRIBUTED BY HASH(k1) BUCKETS 1
+ PROPERTIES ("replication_num" = "1");
+ """
+
+ sql """
+ CREATE TABLE rf_scope_build (
+ k1 BIGINT,
+ v1 INT
+ )
+ DUPLICATE KEY(k1)
+ DISTRIBUTED BY HASH(k1) BUCKETS 1
+ PROPERTIES ("replication_num" = "1");
+ """
+
+ // Probe table: insert 20 rows with k1 from 1 to 20.
+ // The build side will only match a subset (k1 in {2,4,6,8,10,12}).
+ // Rows NOT in this subset (k1=1,3,5,7,9,11,13..20) should be filtered out
+ // by the IN_LIST runtime filter.
+ sql """
+ INSERT INTO rf_scope_probe VALUES
+ (1, 1), (2, 2), (3, 3), (4, 4), (5, 5),
+ (6, 6), (7, 7), (8, 8), (9, 9), (10, 10),
+ (11, 11), (12, 12), (13, 13), (14, 14), (15, 15),
+ (16, 16), (17, 17), (18, 18), (19, 19), (20, 20);
+ """
+
+ // Build table: 6 distinct k1 values. This exceeds
max_pushdown_conditions_per_column=5
+ // so the IN values are NOT added to ColumnValueRange, but the IN_LIST
predicate is created.
+ // MINMAX range: [2, 12]
+ sql """
+ INSERT INTO rf_scope_build VALUES
+ (2, 100), (4, 200), (6, 300), (8, 400), (10, 500), (12, 600);
+ """
+
+ sql "sync;"
+
+ // Set max_pushdown_conditions_per_column to 5, so the 6 IN values exceed
it.
+ // This causes IN values to NOT be added to the ColumnValueRange (it stays
as
+ // a scope range from the MINMAX filter), but the IN_LIST ColumnPredicate
is still created.
+ sql "set max_pushdown_conditions_per_column = 5;"
+ // Use both IN and MIN_MAX runtime filter types so both are generated on
the join key.
+ sql "set runtime_filter_type = 'IN_OR_BLOOM_FILTER,MIN_MAX';"
+ sql "set runtime_filter_wait_time_ms = 10000;"
+ sql "set runtime_filter_wait_infinitely = true;"
+ sql "set enable_runtime_filter_prune = false;"
+ sql "set enable_left_semi_direct_return_opt = true;"
+ sql "set parallel_pipeline_task_num = 1;"
+
+ // The join should only return 6 rows (matching k1 in {2,4,6,8,10,12}).
+ // If the IN_LIST predicate is incorrectly erased, the MINMAX scope [2,12]
+ // would let through rows with k1 in {3,5,7,9,11} as well, producing wrong
results.
+ // We verify correctness by checking the result.
+ order_qt_join """
+ SELECT p.k1, p.v1
+ FROM rf_scope_probe p
+ LEFT SEMI JOIN rf_scope_build b ON p.k1 = b.k1
+ ORDER BY p.k1;
+ """
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]