This is an automated email from the ASF dual-hosted git repository.
morrysnow pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new d110859fad8 [fix](nereids) refine row count estimation for mark join
(#38270)
d110859fad8 is described below
commit d110859fad8602c77001d16a867137ac77aae37c
Author: xzj7019 <[email protected]>
AuthorDate: Wed Jul 24 16:30:06 2024 +0800
[fix](nereids) refine row count estimation for mark join (#38270)
The current semi/anti join stats estimation doesn't consider the mark join case,
whose output row count should equal the row count of the corresponding input side (left or right), unchanged.
---
.../main/java/org/apache/doris/nereids/stats/JoinEstimation.java | 4 ++--
regression-test/data/nereids_hint_tpcds_p0/shape/query45.out | 2 +-
.../nereids_tpcds_shape_sf1000_p0/bs_downgrade_shape/query45.out | 2 +-
.../data/nereids_tpcds_shape_sf1000_p0/shape/query45.out | 2 +-
.../data/nereids_tpcds_shape_sf1000_p0/shape/query51.out | 8 ++++----
.../data/nereids_tpcds_shape_sf100_p0/rf_prune/query45.out | 2 +-
.../data/nereids_tpcds_shape_sf100_p0/shape/query45.out | 2 +-
.../data/new_shapes_p0/tpcds_sf100/rf_prune/query45.out | 2 +-
regression-test/data/new_shapes_p0/tpcds_sf100/shape/query45.out | 2 +-
.../new_shapes_p0/tpcds_sf1000/bs_downgrade_shape/query45.out | 2 +-
regression-test/data/new_shapes_p0/tpcds_sf1000/shape/query45.out | 2 +-
regression-test/data/new_shapes_p0/tpcds_sf1000/shape/query51.out | 8 ++++----
12 files changed, 19 insertions(+), 19 deletions(-)
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/JoinEstimation.java
b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/JoinEstimation.java
index 29e30b30f33..f8298871f0d 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/JoinEstimation.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/JoinEstimation.java
@@ -267,8 +267,8 @@ public class JoinEstimation {
}
private static Statistics estimateSemiOrAnti(Statistics leftStats,
Statistics rightStats, Join join) {
- if (hashJoinConditionContainsUnknownColumnStats(leftStats, rightStats,
join)) {
- double sel =
computeSelectivityForBuildSideWhenColStatsUnknown(rightStats, join);
+ if (hashJoinConditionContainsUnknownColumnStats(leftStats, rightStats,
join) || join.isMarkJoin()) {
+ double sel = join.isMarkJoin() ? 1.0 :
computeSelectivityForBuildSideWhenColStatsUnknown(rightStats, join);
if (join.getJoinType().isLeftSemiOrAntiJoin()) {
return new
StatisticsBuilder().setRowCount(leftStats.getRowCount() * sel)
.putColumnStatistics(leftStats.columnStatistics())
diff --git a/regression-test/data/nereids_hint_tpcds_p0/shape/query45.out
b/regression-test/data/nereids_hint_tpcds_p0/shape/query45.out
index b65fa9047c0..e032d162e9e 100644
--- a/regression-test/data/nereids_hint_tpcds_p0/shape/query45.out
+++ b/regression-test/data/nereids_hint_tpcds_p0/shape/query45.out
@@ -9,7 +9,7 @@ PhysicalResultSink
------------hashAgg[LOCAL]
--------------PhysicalProject
----------------filter((substring(ca_zip, 1, 5) IN ('80348', '81792', '83405',
'85392', '85460', '85669', '86197', '86475', '88274') OR $c$1))
-------------------hashJoin[INNER_JOIN broadcast]
hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build
RFs:RF3 i_item_sk->[ws_item_sk]
+------------------hashJoin[INNER_JOIN shuffle]
hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build
RFs:RF3 i_item_sk->[ws_item_sk]
--------------------PhysicalProject
----------------------hashJoin[INNER_JOIN shuffle]
hashCondition=((web_sales.ws_bill_customer_sk = customer.c_customer_sk))
otherCondition=() build RFs:RF2 c_customer_sk->[ws_bill_customer_sk]
------------------------PhysicalProject
diff --git
a/regression-test/data/nereids_tpcds_shape_sf1000_p0/bs_downgrade_shape/query45.out
b/regression-test/data/nereids_tpcds_shape_sf1000_p0/bs_downgrade_shape/query45.out
index b65fa9047c0..e032d162e9e 100644
---
a/regression-test/data/nereids_tpcds_shape_sf1000_p0/bs_downgrade_shape/query45.out
+++
b/regression-test/data/nereids_tpcds_shape_sf1000_p0/bs_downgrade_shape/query45.out
@@ -9,7 +9,7 @@ PhysicalResultSink
------------hashAgg[LOCAL]
--------------PhysicalProject
----------------filter((substring(ca_zip, 1, 5) IN ('80348', '81792', '83405',
'85392', '85460', '85669', '86197', '86475', '88274') OR $c$1))
-------------------hashJoin[INNER_JOIN broadcast]
hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build
RFs:RF3 i_item_sk->[ws_item_sk]
+------------------hashJoin[INNER_JOIN shuffle]
hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build
RFs:RF3 i_item_sk->[ws_item_sk]
--------------------PhysicalProject
----------------------hashJoin[INNER_JOIN shuffle]
hashCondition=((web_sales.ws_bill_customer_sk = customer.c_customer_sk))
otherCondition=() build RFs:RF2 c_customer_sk->[ws_bill_customer_sk]
------------------------PhysicalProject
diff --git
a/regression-test/data/nereids_tpcds_shape_sf1000_p0/shape/query45.out
b/regression-test/data/nereids_tpcds_shape_sf1000_p0/shape/query45.out
index b65fa9047c0..e032d162e9e 100644
--- a/regression-test/data/nereids_tpcds_shape_sf1000_p0/shape/query45.out
+++ b/regression-test/data/nereids_tpcds_shape_sf1000_p0/shape/query45.out
@@ -9,7 +9,7 @@ PhysicalResultSink
------------hashAgg[LOCAL]
--------------PhysicalProject
----------------filter((substring(ca_zip, 1, 5) IN ('80348', '81792', '83405',
'85392', '85460', '85669', '86197', '86475', '88274') OR $c$1))
-------------------hashJoin[INNER_JOIN broadcast]
hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build
RFs:RF3 i_item_sk->[ws_item_sk]
+------------------hashJoin[INNER_JOIN shuffle]
hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build
RFs:RF3 i_item_sk->[ws_item_sk]
--------------------PhysicalProject
----------------------hashJoin[INNER_JOIN shuffle]
hashCondition=((web_sales.ws_bill_customer_sk = customer.c_customer_sk))
otherCondition=() build RFs:RF2 c_customer_sk->[ws_bill_customer_sk]
------------------------PhysicalProject
diff --git
a/regression-test/data/nereids_tpcds_shape_sf1000_p0/shape/query51.out
b/regression-test/data/nereids_tpcds_shape_sf1000_p0/shape/query51.out
index 6c22d2df308..38bec2403ec 100644
--- a/regression-test/data/nereids_tpcds_shape_sf1000_p0/shape/query51.out
+++ b/regression-test/data/nereids_tpcds_shape_sf1000_p0/shape/query51.out
@@ -19,9 +19,9 @@ PhysicalResultSink
--------------------------------PhysicalDistribute[DistributionSpecHash]
----------------------------------hashAgg[LOCAL]
------------------------------------PhysicalProject
---------------------------------------hashJoin[INNER_JOIN broadcast]
hashCondition=((web_sales.ws_sold_date_sk = date_dim.d_date_sk))
otherCondition=() build RFs:RF1 d_date_sk->[ws_sold_date_sk]
+--------------------------------------hashJoin[INNER_JOIN broadcast]
hashCondition=((store_sales.ss_sold_date_sk = date_dim.d_date_sk))
otherCondition=() build RFs:RF1 d_date_sk->[ss_sold_date_sk]
----------------------------------------PhysicalProject
-------------------------------------------PhysicalOlapScan[web_sales] apply
RFs: RF1
+------------------------------------------PhysicalOlapScan[store_sales] apply
RFs: RF1
----------------------------------------PhysicalProject
------------------------------------------filter((date_dim.d_month_seq <=
1223) and (date_dim.d_month_seq >= 1212))
--------------------------------------------PhysicalOlapScan[date_dim]
@@ -34,9 +34,9 @@ PhysicalResultSink
--------------------------------PhysicalDistribute[DistributionSpecHash]
----------------------------------hashAgg[LOCAL]
------------------------------------PhysicalProject
---------------------------------------hashJoin[INNER_JOIN broadcast]
hashCondition=((store_sales.ss_sold_date_sk = date_dim.d_date_sk))
otherCondition=() build RFs:RF0 d_date_sk->[ss_sold_date_sk]
+--------------------------------------hashJoin[INNER_JOIN broadcast]
hashCondition=((web_sales.ws_sold_date_sk = date_dim.d_date_sk))
otherCondition=() build RFs:RF0 d_date_sk->[ws_sold_date_sk]
----------------------------------------PhysicalProject
-------------------------------------------PhysicalOlapScan[store_sales] apply
RFs: RF0
+------------------------------------------PhysicalOlapScan[web_sales] apply
RFs: RF0
----------------------------------------PhysicalProject
------------------------------------------filter((date_dim.d_month_seq <=
1223) and (date_dim.d_month_seq >= 1212))
--------------------------------------------PhysicalOlapScan[date_dim]
diff --git
a/regression-test/data/nereids_tpcds_shape_sf100_p0/rf_prune/query45.out
b/regression-test/data/nereids_tpcds_shape_sf100_p0/rf_prune/query45.out
index 48886e631ea..377431110d4 100644
--- a/regression-test/data/nereids_tpcds_shape_sf100_p0/rf_prune/query45.out
+++ b/regression-test/data/nereids_tpcds_shape_sf100_p0/rf_prune/query45.out
@@ -9,7 +9,7 @@ PhysicalResultSink
------------hashAgg[LOCAL]
--------------PhysicalProject
----------------filter((substring(ca_zip, 1, 5) IN ('80348', '81792', '83405',
'85392', '85460', '85669', '86197', '86475', '88274') OR $c$1))
-------------------hashJoin[INNER_JOIN broadcast]
hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build
RFs:RF3 i_item_sk->[ws_item_sk]
+------------------hashJoin[INNER_JOIN shuffle]
hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build
RFs:RF3 i_item_sk->[ws_item_sk]
--------------------PhysicalProject
----------------------hashJoin[INNER_JOIN shuffle]
hashCondition=((web_sales.ws_bill_customer_sk = customer.c_customer_sk))
otherCondition=()
------------------------PhysicalProject
diff --git
a/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query45.out
b/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query45.out
index a3a6b75ca6d..5c74bb70d39 100644
--- a/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query45.out
+++ b/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query45.out
@@ -9,7 +9,7 @@ PhysicalResultSink
------------hashAgg[LOCAL]
--------------PhysicalProject
----------------filter((substring(ca_zip, 1, 5) IN ('80348', '81792', '83405',
'85392', '85460', '85669', '86197', '86475', '88274') OR $c$1))
-------------------hashJoin[INNER_JOIN broadcast]
hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build
RFs:RF3 i_item_sk->[ws_item_sk]
+------------------hashJoin[INNER_JOIN shuffle]
hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build
RFs:RF3 i_item_sk->[ws_item_sk]
--------------------PhysicalProject
----------------------hashJoin[INNER_JOIN shuffle]
hashCondition=((web_sales.ws_bill_customer_sk = customer.c_customer_sk))
otherCondition=() build RFs:RF2 c_customer_sk->[ws_bill_customer_sk]
------------------------PhysicalProject
diff --git
a/regression-test/data/new_shapes_p0/tpcds_sf100/rf_prune/query45.out
b/regression-test/data/new_shapes_p0/tpcds_sf100/rf_prune/query45.out
index 83f6b9ca5df..b8f9dc6e8a0 100644
--- a/regression-test/data/new_shapes_p0/tpcds_sf100/rf_prune/query45.out
+++ b/regression-test/data/new_shapes_p0/tpcds_sf100/rf_prune/query45.out
@@ -9,7 +9,7 @@ PhysicalResultSink
------------hashAgg[LOCAL]
--------------PhysicalProject
----------------filter((substring(ca_zip, 1, 5) IN ('80348', '81792', '83405',
'85392', '85460', '85669', '86197', '86475', '88274') OR $c$1))
-------------------hashJoin[INNER_JOIN broadcast]
hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build
RFs:RF3 i_item_sk->[ws_item_sk]
+------------------hashJoin[INNER_JOIN shuffleBucket]
hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build
RFs:RF3 i_item_sk->[ws_item_sk]
--------------------PhysicalProject
----------------------hashJoin[INNER_JOIN shuffle]
hashCondition=((web_sales.ws_bill_customer_sk = customer.c_customer_sk))
otherCondition=()
------------------------PhysicalProject
diff --git a/regression-test/data/new_shapes_p0/tpcds_sf100/shape/query45.out
b/regression-test/data/new_shapes_p0/tpcds_sf100/shape/query45.out
index e05c3f0537a..95b5d1168d7 100644
--- a/regression-test/data/new_shapes_p0/tpcds_sf100/shape/query45.out
+++ b/regression-test/data/new_shapes_p0/tpcds_sf100/shape/query45.out
@@ -9,7 +9,7 @@ PhysicalResultSink
------------hashAgg[LOCAL]
--------------PhysicalProject
----------------filter((substring(ca_zip, 1, 5) IN ('80348', '81792', '83405',
'85392', '85460', '85669', '86197', '86475', '88274') OR $c$1))
-------------------hashJoin[INNER_JOIN broadcast]
hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build
RFs:RF3 i_item_sk->[ws_item_sk]
+------------------hashJoin[INNER_JOIN shuffleBucket]
hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build
RFs:RF3 i_item_sk->[ws_item_sk]
--------------------PhysicalProject
----------------------hashJoin[INNER_JOIN shuffle]
hashCondition=((web_sales.ws_bill_customer_sk = customer.c_customer_sk))
otherCondition=() build RFs:RF2 c_customer_sk->[ws_bill_customer_sk]
------------------------PhysicalProject
diff --git
a/regression-test/data/new_shapes_p0/tpcds_sf1000/bs_downgrade_shape/query45.out
b/regression-test/data/new_shapes_p0/tpcds_sf1000/bs_downgrade_shape/query45.out
index 6ac3b85090a..3995aa66e33 100644
---
a/regression-test/data/new_shapes_p0/tpcds_sf1000/bs_downgrade_shape/query45.out
+++
b/regression-test/data/new_shapes_p0/tpcds_sf1000/bs_downgrade_shape/query45.out
@@ -9,7 +9,7 @@ PhysicalResultSink
------------hashAgg[LOCAL]
--------------PhysicalProject
----------------filter((substring(ca_zip, 1, 5) IN ('80348', '81792', '83405',
'85392', '85460', '85669', '86197', '86475', '88274') OR $c$1))
-------------------hashJoin[INNER_JOIN broadcast]
hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build
RFs:RF3 i_item_sk->[ws_item_sk]
+------------------hashJoin[INNER_JOIN shuffleBucket]
hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build
RFs:RF3 i_item_sk->[ws_item_sk]
--------------------PhysicalProject
----------------------hashJoin[INNER_JOIN shuffle]
hashCondition=((web_sales.ws_bill_customer_sk = customer.c_customer_sk))
otherCondition=() build RFs:RF2 c_customer_sk->[ws_bill_customer_sk]
------------------------PhysicalProject
diff --git a/regression-test/data/new_shapes_p0/tpcds_sf1000/shape/query45.out
b/regression-test/data/new_shapes_p0/tpcds_sf1000/shape/query45.out
index 6ac3b85090a..3995aa66e33 100644
--- a/regression-test/data/new_shapes_p0/tpcds_sf1000/shape/query45.out
+++ b/regression-test/data/new_shapes_p0/tpcds_sf1000/shape/query45.out
@@ -9,7 +9,7 @@ PhysicalResultSink
------------hashAgg[LOCAL]
--------------PhysicalProject
----------------filter((substring(ca_zip, 1, 5) IN ('80348', '81792', '83405',
'85392', '85460', '85669', '86197', '86475', '88274') OR $c$1))
-------------------hashJoin[INNER_JOIN broadcast]
hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build
RFs:RF3 i_item_sk->[ws_item_sk]
+------------------hashJoin[INNER_JOIN shuffleBucket]
hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build
RFs:RF3 i_item_sk->[ws_item_sk]
--------------------PhysicalProject
----------------------hashJoin[INNER_JOIN shuffle]
hashCondition=((web_sales.ws_bill_customer_sk = customer.c_customer_sk))
otherCondition=() build RFs:RF2 c_customer_sk->[ws_bill_customer_sk]
------------------------PhysicalProject
diff --git a/regression-test/data/new_shapes_p0/tpcds_sf1000/shape/query51.out
b/regression-test/data/new_shapes_p0/tpcds_sf1000/shape/query51.out
index 6c22d2df308..38bec2403ec 100644
--- a/regression-test/data/new_shapes_p0/tpcds_sf1000/shape/query51.out
+++ b/regression-test/data/new_shapes_p0/tpcds_sf1000/shape/query51.out
@@ -19,9 +19,9 @@ PhysicalResultSink
--------------------------------PhysicalDistribute[DistributionSpecHash]
----------------------------------hashAgg[LOCAL]
------------------------------------PhysicalProject
---------------------------------------hashJoin[INNER_JOIN broadcast]
hashCondition=((web_sales.ws_sold_date_sk = date_dim.d_date_sk))
otherCondition=() build RFs:RF1 d_date_sk->[ws_sold_date_sk]
+--------------------------------------hashJoin[INNER_JOIN broadcast]
hashCondition=((store_sales.ss_sold_date_sk = date_dim.d_date_sk))
otherCondition=() build RFs:RF1 d_date_sk->[ss_sold_date_sk]
----------------------------------------PhysicalProject
-------------------------------------------PhysicalOlapScan[web_sales] apply
RFs: RF1
+------------------------------------------PhysicalOlapScan[store_sales] apply
RFs: RF1
----------------------------------------PhysicalProject
------------------------------------------filter((date_dim.d_month_seq <=
1223) and (date_dim.d_month_seq >= 1212))
--------------------------------------------PhysicalOlapScan[date_dim]
@@ -34,9 +34,9 @@ PhysicalResultSink
--------------------------------PhysicalDistribute[DistributionSpecHash]
----------------------------------hashAgg[LOCAL]
------------------------------------PhysicalProject
---------------------------------------hashJoin[INNER_JOIN broadcast]
hashCondition=((store_sales.ss_sold_date_sk = date_dim.d_date_sk))
otherCondition=() build RFs:RF0 d_date_sk->[ss_sold_date_sk]
+--------------------------------------hashJoin[INNER_JOIN broadcast]
hashCondition=((web_sales.ws_sold_date_sk = date_dim.d_date_sk))
otherCondition=() build RFs:RF0 d_date_sk->[ws_sold_date_sk]
----------------------------------------PhysicalProject
-------------------------------------------PhysicalOlapScan[store_sales] apply
RFs: RF0
+------------------------------------------PhysicalOlapScan[web_sales] apply
RFs: RF0
----------------------------------------PhysicalProject
------------------------------------------filter((date_dim.d_month_seq <=
1223) and (date_dim.d_month_seq >= 1212))
--------------------------------------------PhysicalOlapScan[date_dim]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]