This is an automated email from the ASF dual-hosted git repository. stigahuang pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git
commit d8e18fac619a129270634513359b3386212ff919 Author: Riza Suminto <[email protected]> AuthorDate: Thu Apr 24 15:50:42 2025 -0700 IMPALA-13991: Skip CROSS_JOIN rewrite if subquery is in disjunctive Inside StmtRewriter.mergeExpr() there is an optimization that set JoinOperator.CROSS_JOIN under certain scenario. This patch adds criteria to SKIP that rewrite if subquery is coming from inside disjunctive expression, regardless of joinConjunct value. If joinConjunct is NOT NULL, the inlineView maybe correlated through that joinConjunct. If joinConjunct is NULL, then expr is a (NOT) EXISTS predicate. EXISTS within a disjunct is not supported yet (see IMPALA-9931). Testing: - Add planner and query test for the corner case. Before this patch, the query return wrong result. - Fixed wrong testcase in subquery-rewrite.test. Change-Id: Iac0deb0b2fb1536684cce2e004156a20b769b9ab Reviewed-on: http://gerrit.cloudera.org:8080/22815 Reviewed-by: Impala Public Jenkins <[email protected]> Tested-by: Impala Public Jenkins <[email protected]> --- .../org/apache/impala/analysis/StmtRewriter.java | 9 ++++-- .../queries/PlannerTest/nested-loop-join.test | 36 ++++++++++++++++++++++ .../queries/PlannerTest/subquery-rewrite.test | 17 +++++----- .../queries/QueryTest/single-node-nlj.test | 17 ++++++++++ 4 files changed, 70 insertions(+), 9 deletions(-) diff --git a/fe/src/main/java/org/apache/impala/analysis/StmtRewriter.java b/fe/src/main/java/org/apache/impala/analysis/StmtRewriter.java index 5b0868d71..2ec24786e 100644 --- a/fe/src/main/java/org/apache/impala/analysis/StmtRewriter.java +++ b/fe/src/main/java/org/apache/impala/analysis/StmtRewriter.java @@ -22,7 +22,6 @@ import java.util.Collections; import java.util.List; import java.util.Set; -import org.apache.impala.analysis.AnalysisContext.AnalysisResult; import org.apache.impala.analysis.SetOperationStmt.SetOperand; import org.apache.impala.analysis.SetOperationStmt.SetOperator; import org.apache.impala.common.AnalysisException; @@ -773,7 +772,13 @@ public class StmtRewriter { break; } - if (!hasEqJoinPred && !inlineView.isCorrelated()) { + if (!hasEqJoinPred && !inlineView.isCorrelated() && !isDisjunctive) { + // IMPALA-13991: It is not safe to rewrite into CROSS_JOIN if + // isDisjunctive is True, regardless of joinConjunct value. + // If joinConjunct is NOT NULL, the inlineView maybe correlated through + // that joinConjunct. + // If joinConjunct is NULL, then expr is a (NOT) EXISTS predicate. + // EXISTS within a disjunct is not supported yet (see IMPALA-9931). // TODO: IMPALA-9948: we could support non-equi joins here // TODO: Remove this when independent subquery evaluation is implemented. // TODO: IMPALA-5100 to cover all cases, we do let through runtime scalars with diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/nested-loop-join.test b/testdata/workloads/functional-planner/queries/PlannerTest/nested-loop-join.test index 9ceac1560..18bb5bbbc 100644 --- a/testdata/workloads/functional-planner/queries/PlannerTest/nested-loop-join.test +++ b/testdata/workloads/functional-planner/queries/PlannerTest/nested-loop-join.test @@ -328,3 +328,39 @@ PLAN-ROOT SINK HDFS partitions=24/24 files=24 size=478.45KB row-size=4B cardinality=7.30K ==== +# IMPALA-13991: Disable CROSS_JOIN rewrite if there is a scalar subquery inside +# disjunctive expression. right_tc is '2009-04-01 01:01:00'. +select id, timestamp_col from functional.alltypes +where (id in (1000, 1001)) + or ((id in (1, 2)) + and timestamp_col < (select hours_add(timestamp_col, 1) right_tc + from functional.alltypestiny + order by timestamp_col DESC limit 1)); +---- DISTRIBUTEDPLAN +PLAN-ROOT SINK +| +06:EXCHANGE [UNPARTITIONED] +| +03:NESTED LOOP JOIN [LEFT OUTER JOIN, BROADCAST] +| join predicates: timestamp_col < hours_add(timestamp_col, 1) +| predicates: (id IN (1000, 1001)) OR ((id IN (1, 2)) AND hours_add(timestamp_col, 1) IS NOT NULL) +| row-size=36B cardinality=7.30K +| +|--05:EXCHANGE [BROADCAST] +| | +| 04:MERGING-EXCHANGE [UNPARTITIONED] +| | order by: timestamp_col DESC +| | limit: 1 +| | +| 02:TOP-N [LIMIT=1] +| | order by: timestamp_col DESC +| | row-size=16B cardinality=1 +| | +| 01:SCAN HDFS [functional.alltypestiny] +| HDFS partitions=4/4 files=4 size=460B +| row-size=16B cardinality=8 +| +00:SCAN HDFS [functional.alltypes] + HDFS partitions=24/24 files=24 size=478.45KB + row-size=20B cardinality=7.30K +==== diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/subquery-rewrite.test b/testdata/workloads/functional-planner/queries/PlannerTest/subquery-rewrite.test index 8e83a774e..6946a5fee 100644 --- a/testdata/workloads/functional-planner/queries/PlannerTest/subquery-rewrite.test +++ b/testdata/workloads/functional-planner/queries/PlannerTest/subquery-rewrite.test @@ -4106,17 +4106,19 @@ PLAN-ROOT SINK HDFS partitions=24/24 files=24 size=478.45KB row-size=8B cardinality=7.30K ==== -# Uncorrelated subquery in BETWEEN in OR rewritten to INNER JOIN. +# Uncorrelated subquery in BETWEEN in OR rewritten to LEFT OUTER JOIN. +# Subquery will return 1. select int_col from functional.alltypes t -where (t.int_col is null or +where (t.int_col = 0 or (t.int_col between (select min(int_col) from functional.alltypesagg t2) and 2)) ---- PLAN PLAN-ROOT SINK | -03:NESTED LOOP JOIN [INNER JOIN] -| predicates: t.int_col >= min(int_col), (t.int_col IS NULL OR min(int_col) IS NOT NULL AND t.int_col <= 2) +03:NESTED LOOP JOIN [LEFT OUTER JOIN] +| join predicates: t.int_col >= min(int_col) +| predicates: (t.int_col = 0 OR min(int_col) IS NOT NULL AND t.int_col <= 2) | row-size=8B cardinality=7.30K | |--02:AGGREGATE [FINALIZE] @@ -4135,8 +4137,9 @@ PLAN-ROOT SINK | 07:EXCHANGE [UNPARTITIONED] | -03:NESTED LOOP JOIN [INNER JOIN, BROADCAST] -| predicates: t.int_col >= min(int_col), (t.int_col IS NULL OR min(int_col) IS NOT NULL AND t.int_col <= 2) +03:NESTED LOOP JOIN [LEFT OUTER JOIN, BROADCAST] +| join predicates: t.int_col >= min(int_col) +| predicates: (t.int_col = 0 OR min(int_col) IS NOT NULL AND t.int_col <= 2) | row-size=8B cardinality=7.30K | |--06:EXCHANGE [BROADCAST] @@ -4149,7 +4152,7 @@ PLAN-ROOT SINK | | | 02:AGGREGATE | | output: min(int_col) -| | row-size=4B cardinality=1 +| | row-size=4B cardinality=3 | | | 01:SCAN HDFS [functional.alltypesagg t2] | HDFS partitions=11/11 files=11 size=814.73KB diff --git a/testdata/workloads/functional-query/queries/QueryTest/single-node-nlj.test b/testdata/workloads/functional-query/queries/QueryTest/single-node-nlj.test index 82370af53..143776538 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/single-node-nlj.test +++ b/testdata/workloads/functional-query/queries/QueryTest/single-node-nlj.test @@ -222,4 +222,21 @@ limit 5 7295 ---- TYPES INT +==== +---- QUERY +# IMPALA-13991: Disable CROSS_JOIN rewrite if there is a scalar subquery inside +# disjunctive expression. right_tc is '2009-04-01 01:01:00'. +select id, timestamp_col from functional.alltypes +where (id in (1000, 1001)) + or ((id in (1, 2)) + and timestamp_col < (select hours_add(timestamp_col, 1) right_tc + from functional.alltypestiny + order by timestamp_col DESC limit 1)); +---- RESULTS: VERIFY_IS_EQUAL_SORTED +1,2009-01-01 00:01:00 +2,2009-01-01 00:02:00.100000000 +1000,2009-04-11 01:40:04.500000000 +1001,2009-04-11 01:41:04.500000000 +---- TYPES +INT,TIMESTAMP ==== \ No newline at end of file
