This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 163e02b5ca3 [Bug](join) fix left_semi_direct_return_opt get wrong
result with local shuffle (#60952)
163e02b5ca3 is described below
commit 163e02b5ca3c4f890098ba28eb2fe35b29e2ca9c
Author: Pxl <[email protected]>
AuthorDate: Tue Mar 3 17:34:20 2026 +0800
[Bug](join) fix left_semi_direct_return_opt get wrong result with local
shuffle (#60952)
### What problem does this PR solve?
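This pull request fixes wrong results produced by
`left_semi_direct_return_opt` when local shuffle is enabled. Previously,
`detect_in_filter` rejected a producer's IN filter only when the
producer had a remote target; it did not consider the case where the
filter has a global consumer (meaning the target scan has local shuffle)
and therefore still requires a local merge. The changes unify the
detection and merging logic for local runtime filters, clarify the
filter state-transition message, and add a new regression test for
validating CTE EXISTS queries with runtime filters.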
### Runtime filter producer logic improvements
* Refactored the local merge condition in
`RuntimeFilterProducer::publish` to use the new `_need_do_merge` method,
improving clarity and maintainability of the merge decision logic.
* Added the `_need_do_merge` method to encapsulate the logic for
determining when a local merge is required, based on remote targets and
global consumers.
* Replaced `detect_in_filter` with `detect_local_in_filter`, updating
its logic to use `_need_do_merge` so that the local IN filter
optimization is only enabled when no local merge is needed (that is,
the producer has no remote target and the filter has no global
consumer).
[[1]](diffhunk://#diff-21ae533fb353c1ee2c03ee083e392b59f71e122af89b20b4767380aef83a6502L134-R138)
[[2]](diffhunk://#diff-bddbf8ad6f2ca00f5bd80c7ff32db5ec5d57a8ceab20a80b72e9296ec5281d7eL189-R189)
* Improved the runtime filter state transition message in
`RuntimeFilterWrapper::merge` to clarify when the filter is disabled due
to exceeding the maximum number of IN values during a merge.
### Regression testing
* Added a new regression test suite `test_cte_exists` in
`test_cte_exists.groovy` and corresponding output file, covering CTE
EXISTS queries and validating runtime filter behavior with various
configurations and data.
[[1]](diffhunk://#diff-2170eb63e82f077f69d5cc1c1821c71e0a5f957cdafa03df565e352ff97afcf4R1-R76)
[[2]](diffhunk://#diff-1ebe1b318d8106de4b351bea4577631616ece0b39a027c6bcafcafc8a19a4fb1R1-R44)
### Check List (For Author)
- Test <!-- At least one of them must be included. -->
- [ ] Regression test
- [ ] Unit Test
- [ ] Manual test (add detailed scripts or steps below)
- [ ] No need to test or manual test. Explain why:
- [ ] This is a refactor/code format and no logic has been changed.
- [ ] Previous test can cover this change.
- [ ] No code files have been changed.
- [ ] Other reason <!-- Add your reason? -->
- Behavior changed:
- [ ] No.
- [ ] Yes. <!-- Explain the behavior change -->
- Does this need documentation?
- [ ] No.
- [ ] Yes. <!-- Add document PR link here. eg:
https://github.com/apache/doris-website/pull/1214 -->
### Check List (For Reviewer who merge this PR)
- [ ] Confirm the release note
- [ ] Confirm test cases
- [ ] Confirm document
- [ ] Add branch pick label <!-- Add branch pick label that this PR
should merge into -->
---
be/src/runtime_filter/runtime_filter_producer.cpp | 13 +---
be/src/runtime_filter/runtime_filter_producer.h | 18 ++++-
.../runtime_filter_producer_helper.cpp | 2 +-
be/src/runtime_filter/runtime_filter_wrapper.cpp | 3 +-
.../join/test_cte_exists/test_cte_exists.out | 44 ++++++++++++
.../join/test_cte_exists/test_cte_exists.groovy | 78 ++++++++++++++++++++++
6 files changed, 142 insertions(+), 16 deletions(-)
diff --git a/be/src/runtime_filter/runtime_filter_producer.cpp
b/be/src/runtime_filter/runtime_filter_producer.cpp
index 027ad55b515..555f91bd964 100644
--- a/be/src/runtime_filter/runtime_filter_producer.cpp
+++ b/be/src/runtime_filter/runtime_filter_producer.cpp
@@ -50,12 +50,7 @@ Status RuntimeFilterProducer::publish(RuntimeState* state,
bool build_hash_table
_check_state({State::READY_TO_PUBLISH});
auto do_merge = [&]() {
- // two case we need do local merge:
- // 1. has remote target
- // 2. has local target and has global consumer (means target scan has
local shuffle)
- if (!_has_remote_target && state->global_runtime_filter_mgr()
-
->get_consume_filters(_wrapper->filter_id())
- .empty()) {
+ if (!_need_do_merge(state)) {
// when global consumer not exist, send_to_local_targets will do
nothing, so merge rf is useless
return Status::OK();
}
@@ -171,11 +166,7 @@ Status RuntimeFilterProducer::send_size(RuntimeState*
state, uint64_t local_filt
}
set_state(State::WAITING_FOR_SYNCED_SIZE);
- // two case we need do local merge:
- // 1. has remote target
- // 2. has local target and has global consumer (means target scan has
local shuffle)
- if (_has_remote_target ||
-
!state->global_runtime_filter_mgr()->get_consume_filters(_wrapper->filter_id()).empty())
{
+ if (_need_do_merge(state)) {
LocalMergeContext* merger_context = nullptr;
RETURN_IF_ERROR(state->global_runtime_filter_mgr()->get_local_merge_producer_filters(
_wrapper->filter_id(), &merger_context));
diff --git a/be/src/runtime_filter/runtime_filter_producer.h
b/be/src/runtime_filter/runtime_filter_producer.h
index e686a97ee3a..a21697c7005 100644
--- a/be/src/runtime_filter/runtime_filter_producer.h
+++ b/be/src/runtime_filter/runtime_filter_producer.h
@@ -131,11 +131,14 @@ public:
_wrapper = wrapper;
}
- std::shared_ptr<RuntimeFilterWrapper> detect_in_filter() {
- if (_has_remote_target) {
+ std::shared_ptr<RuntimeFilterWrapper> detect_local_in_filter(RuntimeState*
state) {
+ std::unique_lock<std::recursive_mutex> l(_rmtx);
+ // need merge mean this filter not pure local
+ // the data not directly colocated with scan node
+ // so that can not enable local in filter optimization
+ if (_need_do_merge(state)) {
return nullptr;
}
- std::unique_lock<std::recursive_mutex> l(_rmtx);
if (_wrapper->is_ready_in_filter()) {
return _wrapper;
}
@@ -157,6 +160,15 @@ private:
}
}
+ bool _need_do_merge(RuntimeState* state) {
+ // two cases where we need to do a local merge:
+ // 1. has remote target
+ // 2. has local target and has global consumer (means target scan has
local shuffle)
+ return (_has_remote_target || !state->global_runtime_filter_mgr()
+
->get_consume_filters(_wrapper->filter_id())
+ .empty());
+ }
+
Status _init_with_desc(const TRuntimeFilterDesc* desc, const
TQueryOptions* options) override {
RETURN_IF_ERROR(RuntimeFilter::_init_with_desc(desc, options));
_need_sync_filter_size = _wrapper->build_bf_by_runtime_size() &&
!_is_broadcast_join;
diff --git a/be/src/runtime_filter/runtime_filter_producer_helper.cpp
b/be/src/runtime_filter/runtime_filter_producer_helper.cpp
index e2d46f2989f..f49d274a47e 100644
--- a/be/src/runtime_filter/runtime_filter_producer_helper.cpp
+++ b/be/src/runtime_filter/runtime_filter_producer_helper.cpp
@@ -186,7 +186,7 @@ std::shared_ptr<RuntimeFilterWrapper>
RuntimeFilterProducerHelper::detect_local_
// If any runtime filter is local in filter, return true.
// Local in filter is used to LEFT_SEMI_DIRECT_RETURN_OPT
for (const auto& filter : _producers) {
- if (auto wrapper = filter->detect_in_filter(); wrapper != nullptr) {
+ if (auto wrapper = filter->detect_local_in_filter(state); wrapper !=
nullptr) {
return wrapper;
}
}
diff --git a/be/src/runtime_filter/runtime_filter_wrapper.cpp
b/be/src/runtime_filter/runtime_filter_wrapper.cpp
index 5e7ec72a30e..36561042abf 100644
--- a/be/src/runtime_filter/runtime_filter_wrapper.cpp
+++ b/be/src/runtime_filter/runtime_filter_wrapper.cpp
@@ -180,7 +180,8 @@ Status RuntimeFilterWrapper::merge(const
RuntimeFilterWrapper* other) {
_hybrid_set->insert(other->_hybrid_set.get());
if (_max_in_num >= 0 && _hybrid_set->size() > _max_in_num) {
_hybrid_set->clear();
- set_state(State::DISABLED, fmt::format("reach max in num: {}",
_max_in_num));
+ set_state(State::DISABLED,
+ fmt::format("merge reached max IN num threshold: {}",
_max_in_num));
}
break;
}
diff --git
a/regression-test/data/query_p0/join/test_cte_exists/test_cte_exists.out
b/regression-test/data/query_p0/join/test_cte_exists/test_cte_exists.out
new file mode 100644
index 00000000000..6162a7faf43
--- /dev/null
+++ b/regression-test/data/query_p0/join/test_cte_exists/test_cte_exists.out
@@ -0,0 +1,44 @@
+-- This file is automatically generated. You should know what you did if you
want to edit this
+-- !select_with_cte_exists --
+0 -26516 42 28.17350 66.01650 \N on \N
my \N 2002-03-23 2015-05-15 2003-04-22T00:00
2007-04-10T00:00 13 8 \N 25 78.1141000000
51.1176000000
+1 1 -100 41.17160 \N ⭐ now \N \N
r 2013-08-21 2009-05-26 2005-10-11T00:00
2003-01-17T14:47:30 2 9 6 13 2.1381000000
32.0589000000
+2 15 \N 30.00870 42.09590 “ 2016-12-25
w will \N 2018-04-23 2011-02-17 2007-02-23T10:25:54
2008-06-14T00:00 \N 13 53 \N 14.0125000000
98.0868000000
+3 108 -18369 53.16640 68.19440 back 2005-07-03
15:52:40 \N 2007-05-06 o 2014-04-11 2002-06-22
2004-12-27T10:53:04 2016-02-12T00:00 2 8 \N 72
49.1006000000 84.1622000000
+5 \N \N 94.10040 89.14280 \N was \N
÷ f 2011-12-10 2008-01-13 2019-01-22T10:32:16
2014-11-11T22:37:16 5 6 84 34 61.0046000000
58.1874000000
+7 10 \N 69.09860 75.12140 \N w were
2000-01-25 06:33:02 a 2003-11-01 2008-07-11
2006-04-24T21:56:50 2013-11-25T00:00 1 13 46 14
3.1337000000 37.1948000000
+8 \N 2109123390 20.09540 60.03210 then b
tell 2004-04-05 13:37:57 m 2004-12-13 2007-03-25
2018-10-14T00:00 2018-07-02T00:00 6 14 32 40
\N 31.1016000000
+9 77 -321144118 48.11470 32.10830 k \N
\N j q 2011-05-22 2008-03-10 2017-03-27T00:00
2003-11-13T00:00 14 4 39 66 14.1892000000
27.0975000000
+10 1339657358 6 67.07760 63.01650 I'll or
b just \N 2010-06-11 2006-03-09 2018-09-01T00:00
2017-01-27T00:00 2 3 \N 73 18.1075000000
18.1771000000
+12 840810997 \N 20.15110 31.11050 2005-06-04
09:31:03 \N 2015-04-03 21:51:11 2015-06-25 \N 2018-05-24
2011-10-24 2007-04-23T00:00 2016-05-26T03:48:15 \N 13
\N 19 55.1909000000 87.1879000000
+13 \N 213501779 70.10390 28.06790 DEL about
mean 2013-08-24 u 2002-09-06 2001-07-20
2008-06-04T04:41:48 2013-09-09T00:00 8 14 3 95
68.1937000000 15.1266000000
+14 -14681 46 35.17140 83.07530 2008-03-25 17:28:53
you're k 2016-02-28 f 2012-04-21 2004-01-24
2010-05-20T00:00 2018-05-10T18:31:16 4 1 9 25
18.1370000000 \N
+15 87 -39 65.06230 66.10880 2014-10-27 07:04:00
¥ n \N e 2001-10-14 2003-05-15
2013-06-26T00:00 2016-11-12T10:50:57 1 9 10 91
\N 60.1894000000
+16 7876 34 19.02430 98.11340 ♪ up
2010-09-17 15:10:51 2011-05-14 h 2006-02-12 2009-01-13
2000-04-26T00:04:07 2010-10-12T00:00 6 9 83 76
\N 6.0435000000
+17 -94 -426161667 48.01440 18.02980 k
2015-05-16 19:40:14 2013-11-12 \N e 2009-12-08
2010-04-03 2005-01-17T00:00 2014-05-07T07:32:31 \N 13
\N \N 20.1666000000 \N
+18 \N -28299 62.18820 37.07860 some 2006-12-18
2015-09-12 \N \N 2013-05-25 2006-01-07
2006-11-12T01:13:11 2001-12-27T00:00 12 7 90 41
82.0662000000 2.0787000000
+19 96 -5149 10.17780 2.04820 oh é a f
z 2004-06-26 2014-10-23 2004-06-15T00:00
2015-12-06T00:00 1 12 91 86 \N 41.1384000000
+20 \N -367544364 15.16010 69.02020 2019-03-16
x and 2012-11-25 \N 2003-12-16 2001-12-12
2019-05-08T00:00 2013-09-23T00:00 14 5 44 \N
\N \N
+21 83504484 102 67.07690 17.09480 2016-03-08
09:20:32 look u 2005-01-24 13:11:51 t 2017-11-10
2015-03-04 2013-02-17T02:49:09 2009-01-15T00:00 6 3
34 88 93.1511000000 90.1330000000
+22 -30359 18454 62.06520 59.06150 o 2006-06-18
19:24:41 2001-04-19 i c 2011-09-21 2019-05-20
2007-05-01T00:00 2000-01-18T00:00 7 11 17 39
60.1966000000 27.0348000000
+23 47 55 92.14180 11.06560 \N m
2019-06-21 09:51:53 2003-12-20 22:29:10 \N 2013-10-03
2000-07-14 2005-08-08T00:50:51 2001-09-22T00:00 11 5
23 54 67.0501000000 72.1124000000
+24 174 \N 15.05920 53.17740 a 2014-01-02
09:56:05 look when d 2010-12-22 2004-01-27
2017-01-03T00:00 2014-07-26T17:04:22 1 12 34 16
\N 31.1348000000
+27 -4501 -37 72.16810 7.08690 2017-01-09 \N as
2001-05-15 11:36:04 z 2002-07-20 2003-02-09
2002-04-02T15:01:52 2012-08-22T00:00 3 1 15 85
36.0699000000 71.0232000000
+28 \N 80 84.01840 \N 2008-09-08 09:41:36
2018-10-08 2017-01-08 y n 2007-12-15 2010-08-27
2005-04-25T00:00 2007-07-22T00:00 12 8 36 53
63.1981000000 25.1409000000
+30 1678810266 1507463100 62.03340 23.08120 t
2003-07-06 you're it r 2018-02-15 2004-09-28
2003-04-21T15:21:25 2019-03-25T00:00 5 12 9 85
32.0118000000 57.1966000000
+31 \N \N 35.17790 \N 2004-04-21 2013-09-04
2018-06-02 2004-09-24 01:32:23 p 2015-08-01 2017-11-08
2013-10-21T00:00 2009-02-13T22:10:15 5 12 \N 78
72.1830000000 34.1290000000
+32 \N -8223 27.12450 68.00550 b oh how
\N g 2019-01-02 2014-12-05 2016-11-17T00:00
2016-03-13T00:00 7 8 47 20 4.0035000000
33.1087000000
+33 -27712 109 20.17710 5.15390 2019-09-06 yeah
2009-12-14 07:08:30 here k 2004-08-01 2007-06-07
2000-11-16T00:00 2010-03-12T00:00 13 \N 0 \N
\N 81.1432000000
+35 -83 -11149 47.14330 \N who i \N back
x 2006-02-19 2015-10-28 2007-08-20T00:00
2012-04-26T00:00 13 10 \N 62 78.0910000000
36.0115000000
+36 16516 -112 13.10140 \N \N + 2013-11-27
05:39:59 “ d 2004-02-19 2004-01-18 2001-09-27T00:00
2013-03-27T10:45:14 3 11 \N \N 21.1940000000
10.1906000000
+37 2231 -140663446 82.14340 79.19350 \N
2004-10-01 05:53:25 \N \N \N 2002-02-23 2002-04-03
2019-05-24T00:00 2012-03-02T00:00 \N 3 51 15
75.0719000000 81.1907000000
+38 19 \N 87.07490 61.07680 something s
2011-10-24 2010-03-19 10:30:14 s 2016-04-08 2010-04-04
2009-10-05T13:25:07 2013-01-02T00:00 2 5 8 98
\N 36.1245000000
+39 -44 18 41.01640 69.03730 2014-11-04 19:39:52
2019-05-17 02:09:47 2001-10-25 a u 2015-04-07
2003-02-28 2018-09-08T10:56:03 2007-05-21T08:08:42 2 3
66 63 30.1838000000 53.1648000000
+41 36 60 82.17390 100.03910 \N 2017-12-08
06:14:04 2000-05-22 from p 2011-12-17 2000-11-14
2019-04-04T00:00 2019-03-21T00:00 11 11 \N \N
56.0672000000 54.1659000000
+42 \N 75 15.11650 30.06680 him about
2008-04-13 2008-01-12 23:59:57 x 2011-04-07 2016-06-26
2000-01-23T00:00 2019-08-19T00:00 13 14 80 48
60.1511000000 50.1581000000
+43 1486635733 \N 23.02720 9.17860 get could \N
2008-11-27 14:13:54 d 2005-09-27 2016-12-05
2017-11-23T15:48:28 2015-07-17T07:44:43 7 14 8 41
19.1376000000 27.0721000000
+44 21183 91 49.13480 16.10830 2008-07-05 09:22:06
2006-02-24 it's 2000-01-14 09:38:35 y 2007-10-11
2010-09-19 2014-10-23T00:00 2001-06-23T07:32:12 14 15
\N 11 \N 20.1074000000
+45 -19092 -10 75.10910 37.07120 2017-12-16 09:51:22
his é 2012-11-28 05:50:31 f 2003-01-27 2018-02-12
2018-03-12T00:00 2005-02-08T00:00 5 4 98 14
96.1744000000 91.0384000000
+47 -1585723780 \N 68.05130 59.16600 could \N
2011-01-12 m b 2012-11-15 2003-10-03
2016-05-08T00:00 2002-05-23T00:00 \N 5 61 99
30.0624000000 \N
+48 26182 \N 85.09230 13.19010 🍕 all \N
2007-11-02 14:22:31 y 2014-02-24 2006-05-10
2003-04-02T00:00 2005-06-12T00:00 13 6 49 25
36.0866000000 97.1200000000
+49 -814629551 -320575127 92.19940 \N and
2010-09-03 21:32:52 x well z 2009-08-21 2017-12-05
2005-08-24T00:00 2012-04-06T19:44:44 5 14 93 94
20.0309000000 94.0261000000
+
diff --git
a/regression-test/suites/query_p0/join/test_cte_exists/test_cte_exists.groovy
b/regression-test/suites/query_p0/join/test_cte_exists/test_cte_exists.groovy
new file mode 100644
index 00000000000..ff3f6538888
--- /dev/null
+++
b/regression-test/suites/query_p0/join/test_cte_exists/test_cte_exists.groovy
@@ -0,0 +1,78 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_cte_exists") {
+ def tableName =
"table_20_50_undef_partitions2_keys3_properties4_distributed_by5"
+
+ sql """ DROP TABLE IF EXISTS ${tableName} """
+
+ sql "set enable_left_semi_direct_return_opt = true; "
+
+ sql "set parallel_pipeline_task_num=16;"
+
+ sql "set runtime_filter_max_in_num=10;"
+
+ sql "set runtime_filter_type = 'IN';"
+
+ sql "set enable_local_shuffle = true;"
+
+ sql """
+ create table ${tableName} (
+ pk int,
+ col_int_undef_signed int ,
+ col_int_undef_signed__index_inverted int ,
+ col_decimal_20_5__undef_signed decimal(20,5) ,
+ col_decimal_20_5__undef_signed__index_inverted decimal(20,5) ,
+ col_varchar_100__undef_signed varchar(100) ,
+ col_varchar_100__undef_signed__index_inverted varchar(100) ,
+ col_char_50__undef_signed char(50) ,
+ col_char_50__undef_signed__index_inverted char(50) ,
+ col_string_undef_signed string ,
+ col_date_undef_signed date ,
+ col_date_undef_signed__index_inverted date ,
+ col_datetime_undef_signed datetime ,
+ col_datetime_undef_signed__index_inverted datetime ,
+ col_tinyint_undef_signed tinyint ,
+ col_tinyint_undef_signed__index_inverted tinyint ,
+ col_decimal_10_0__undef_signed decimal(10,0) ,
+ col_decimal_10_0__undef_signed__index_inverted decimal(10,0) ,
+ col_decimal_38_10__undef_signed decimal(38,10) ,
+ col_decimal_38_10__undef_signed__index_inverted decimal(38,10) ,
+ INDEX col_int_undef_signed__index_inverted_idx
(`col_int_undef_signed__index_inverted`) USING INVERTED,
+ INDEX col_decimal_20_5__undef_signed__index_inverted_idx
(`col_decimal_20_5__undef_signed__index_inverted`) USING INVERTED,
+ INDEX col_varchar_100__undef_signed__index_inverted_idx
(`col_varchar_100__undef_signed__index_inverted`) USING INVERTED,
+ INDEX col_char_50__undef_signed__index_inverted_idx
(`col_char_50__undef_signed__index_inverted`) USING INVERTED,
+ INDEX col_date_undef_signed__index_inverted_idx
(`col_date_undef_signed__index_inverted`) USING INVERTED,
+ INDEX col_datetime_undef_signed__index_inverted_idx
(`col_datetime_undef_signed__index_inverted`) USING INVERTED,
+ INDEX col_tinyint_undef_signed__index_inverted_idx
(`col_tinyint_undef_signed__index_inverted`) USING INVERTED,
+ INDEX col_decimal_10_0__undef_signed__index_inverted_idx
(`col_decimal_10_0__undef_signed__index_inverted`) USING INVERTED,
+ INDEX col_decimal_38_10__undef_signed__index_inverted_idx
(`col_decimal_38_10__undef_signed__index_inverted`) USING INVERTED
+ ) engine=olap
+ DUPLICATE KEY(pk, col_int_undef_signed,
col_int_undef_signed__index_inverted)
+ distributed by hash(pk) buckets 10
+ properties("store_row_column" = "false", "replication_num" = "1");
+ """
+
+ sql """
+ insert into
${tableName}(pk,col_int_undef_signed,col_int_undef_signed__index_inverted,col_decimal_20_5__undef_signed,col_decimal_20_5__undef_signed__index_inverted,col_varchar_100__undef_signed,col_varchar_100__undef_signed__index_inverted,col_char_50__undef_signed,col_char_50__undef_signed__index_inverted,col_string_undef_signed,col_date_undef_signed,col_date_undef_signed__index_inverted,col_datetime_undef_signed,col_datetime_undef_signed__index_inverted,col_tinyint_undef_signed
[...]
+ """
+
+ qt_select_with_cte_exists """
+ with cte1 as (select pk from ${tableName} where
col_decimal_20_5__undef_signed != 8)
+ select * from ${tableName} o where exists(select 1 from cte1 au where
au.pk = o.pk) order by pk;
+ """
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]