This is an automated email from the ASF dual-hosted git repository.
morrysnow pushed a commit to branch branch-3.1
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-3.1 by this push:
new a7548006ef1 branch-3.1: [opt](nereids) add cost penalty for pattern
Aggregation-nestedLoopJoin #51127 (#53891)
a7548006ef1 is described below
commit a7548006ef1bac6850ca3ce0bc2e9e445a0c2890
Author: minghong <[email protected]>
AuthorDate: Mon Jul 28 17:14:02 2025 +0800
branch-3.1: [opt](nereids) add cost penalty for pattern
Aggregation-nestedLoopJoin #51127 (#53891)
pick #51127
---
.../org/apache/doris/nereids/cost/CostModelV1.java | 17 ++++-
regression-test/data/shape_check/others/nlj.out | Bin 0 -> 481 bytes
.../suites/shape_check/others/nlj.groovy | 74 +++++++++++++++++++++
3 files changed, 90 insertions(+), 1 deletion(-)
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/nereids/cost/CostModelV1.java
b/fe/fe-core/src/main/java/org/apache/doris/nereids/cost/CostModelV1.java
index 4b876015522..db965485357 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/nereids/cost/CostModelV1.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/cost/CostModelV1.java
@@ -493,9 +493,24 @@ class CostModelV1 extends PlanVisitor<Cost, PlanContext> {
Preconditions.checkState(context.arity() == 2);
Statistics leftStatistics = context.getChildStatistics(0);
Statistics rightStatistics = context.getChildStatistics(1);
+ /*
+ * nljPenalty:
+ * The row count estimation for nested loop join (NLJ) results often
has significant errors.
+ * When the estimated row count is higher than the actual value, the
cost benefits of subsequent
+ * operators (e.g., aggregation) may be overestimated. This can lead
the optimizer to choose a
+ * plan where a small table joins a large table, severely impacting
the overall SQL execution efficiency.
+ *
+ * For example, if the subsequent operator is an aggregation (AGG) and
the GROUP BY key aligns with
+ * the distribution key of the small table, the optimizer might
prioritize avoiding shuffling the NLJ
+ * results by choosing to join the small table to the large table,
even if this is suboptimal.
+ */
+ double nljPenalty = 1.0;
+ if (leftStatistics.getRowCount() < 10 * rightStatistics.getRowCount())
{
+ nljPenalty = Math.min(leftStatistics.getRowCount(),
rightStatistics.getRowCount());
+ }
return CostV1.of(context.getSessionVariable(),
leftStatistics.getRowCount() * rightStatistics.getRowCount(),
- rightStatistics.getRowCount(),
+ rightStatistics.getRowCount() * nljPenalty,
0);
}
diff --git a/regression-test/data/shape_check/others/nlj.out
b/regression-test/data/shape_check/others/nlj.out
new file mode 100644
index 00000000000..7a4caa03911
Binary files /dev/null and b/regression-test/data/shape_check/others/nlj.out
differ
diff --git a/regression-test/suites/shape_check/others/nlj.groovy
b/regression-test/suites/shape_check/others/nlj.groovy
new file mode 100644
index 00000000000..63920380a73
--- /dev/null
+++ b/regression-test/suites/shape_check/others/nlj.groovy
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+suite("nlj") {
+ sql 'set enable_parallel_result_sink=false;'
+ sql """
+ drop table if exists a;
+ create table a (
+ `id` int,
+ a_date dateV2,
+ x int,
+ y int
+ )
+ DISTRIBUTED BY HASH(id) BUCKETS 8
+ PROPERTIES (
+ "replication_allocation" = "tag.location.default: 1"
+ );
+ alter table a modify column a_date set stats ('row_count'='1428418116',
'ndv'='498', 'min_value'='2023-12-31', 'max_value'='2025-05-18',
'avg_size'='4');
+
+
+ drop table if exists b;
+ create table b (
+ `id` int,
+ b_date dateV2,
+ )
+ DISTRIBUTED BY HASH(b_date) BUCKETS 8
+ PROPERTIES (
+ "replication_allocation" = "tag.location.default: 1"
+ );
+
+
+ alter table b modify column b_date set stats ('row_count'='505',
'ndv'='505', 'min_value'='2025-05-14', 'max_value'='2025-05-15',
'avg_size'='4');
+
+ drop table if exists c;
+ create table c (
+ `id` int,
+ c_date dateV2,
+ )
+ DISTRIBUTED BY HASH(id) BUCKETS 8
+ PROPERTIES (
+ "replication_allocation" = "tag.location.default: 1"
+ );
+
+ alter table c modify column c_date set stats ('row_count'='505',
'ndv'='505', 'min_value'='2025-05-14', 'max_value'='2025-05-15',
'avg_size'='4');
+
+ set disable_nereids_rules='PRUNE_EMPTY_PARTITION';
+ set runtime_filter_mode=off;
+ """
+
+ // expected join order : a b
+ // order b-a is good for aggregation, but usually this order is extremely
bad for join.
+ qt_shape """
+ explain shape plan
+ select b_date
+ from a join b on a_date>b_date
+ group by b_date, a_date, a.x
+ """
+}
\ No newline at end of file
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]