IMPALA-3232: Allow not-exists uncorrelated subqueries Before this patch, correlated exists and not exists subqueries were rewritten as as left semi and anti joins respectively. Uncorrelated exists subqueries were rewritten as cross joins, and uncorrelated not-exists subqueries were not supported at all. This patch takes advantage of the nested loop join that was recently introduced, which allows us to rewrite both correlated and uncorrelated exists subqueries as left semi joins and both correlated and uncorrelated not-exists subqueries as anti joins.
Change-Id: I52ae12f116d026190f3a2a7575cda855317d11e8 Reviewed-on: http://gerrit.cloudera.org:8080/2792 Reviewed-by: Taras Bobrovytsky <[email protected]> Tested-by: Internal Jenkins Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/46c3e43e Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/46c3e43e Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/46c3e43e Branch: refs/heads/master Commit: 46c3e43edb753b432b104a1953fbde779e70d2b4 Parents: 7767d30 Author: Taras Bobrovytsky <[email protected]> Authored: Thu Apr 14 17:04:27 2016 -0700 Committer: Tim Armstrong <[email protected]> Committed: Thu May 12 23:06:36 2016 -0700 ---------------------------------------------------------------------- .../cloudera/impala/analysis/StmtRewriter.java | 32 +++------- .../impala/analysis/AnalyzeStmtsTest.java | 10 ++- .../impala/analysis/AnalyzeSubqueriesTest.java | 10 ++- .../queries/PlannerTest/analytic-fns.test | 2 +- .../queries/PlannerTest/subquery-rewrite.test | 66 +++++++++++++++++++- .../queries/QueryTest/subquery.test | 27 ++++++++ 6 files changed, 113 insertions(+), 34 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/46c3e43e/fe/src/main/java/com/cloudera/impala/analysis/StmtRewriter.java ---------------------------------------------------------------------- diff --git a/fe/src/main/java/com/cloudera/impala/analysis/StmtRewriter.java b/fe/src/main/java/com/cloudera/impala/analysis/StmtRewriter.java index d682ea9..c3fbc20 100644 --- a/fe/src/main/java/com/cloudera/impala/analysis/StmtRewriter.java +++ b/fe/src/main/java/com/cloudera/impala/analysis/StmtRewriter.java @@ -420,34 +420,22 @@ public class StmtRewriter { if (onClausePredicate == null) { Preconditions.checkState(expr instanceof ExistsPredicate); ExistsPredicate existsPred = (ExistsPredicate) expr; + // TODO This is very expensive if uncorrelated. Remove it when we implement + // independent subquery evaluation. + if (existsPred.isNotExists()) { + inlineView.setJoinOp(JoinOperator.LEFT_ANTI_JOIN); + } else { + inlineView.setJoinOp(JoinOperator.LEFT_SEMI_JOIN); + } // Note that the concept of a 'correlated inline view' is similar but not the same // as a 'correlated subquery', i.e., a subquery with a correlated predicate. - if (inlineView.isCorrelated()) { - if (existsPred.isNotExists()) { - inlineView.setJoinOp(JoinOperator.LEFT_ANTI_JOIN); - } else { - inlineView.setJoinOp(JoinOperator.LEFT_SEMI_JOIN); - } - // No visible tuples added. - return false; - } else { - // TODO: Remove this when we support independent subquery evaluation. - if (existsPred.isNotExists()) { - throw new AnalysisException("Unsupported uncorrelated NOT EXISTS subquery: " + - subqueryStmt.toSql()); - } + if (!inlineView.isCorrelated()) { // For uncorrelated subqueries, we limit the number of rows returned by the // subquery. subqueryStmt.setLimit(1); - // We don't have an ON clause predicate to create an equi-join. Rewrite the - // subquery using a CROSS JOIN. - // TODO This is very expensive. Remove it when we implement independent - // subquery evaluation. - inlineView.setJoinOp(JoinOperator.CROSS_JOIN); - LOG.warn("uncorrelated subquery rewritten using a cross join"); - // Indicate that new visible tuples may be added in stmt's select list. - return true; + inlineView.setOnClause(new BoolLiteral(true)); } + return false; } // Create an smap from the original select-list exprs of the select list to http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/46c3e43e/fe/src/test/java/com/cloudera/impala/analysis/AnalyzeStmtsTest.java ---------------------------------------------------------------------- diff --git a/fe/src/test/java/com/cloudera/impala/analysis/AnalyzeStmtsTest.java b/fe/src/test/java/com/cloudera/impala/analysis/AnalyzeStmtsTest.java index 72c58d5..80120b9 100644 --- a/fe/src/test/java/com/cloudera/impala/analysis/AnalyzeStmtsTest.java +++ b/fe/src/test/java/com/cloudera/impala/analysis/AnalyzeStmtsTest.java @@ -1129,7 +1129,7 @@ public class AnalyzeStmtsTest extends AnalyzerTest { AnalyzesOk("select key, av from functional.allcomplextypes t, " + "(select a1.key, av from t.array_map_col a1, " + "(select avg(item) av from a1.value a2) v1) v2"); - // TOOD: Enable once we support complex-typed exprs in the select list. + // TODO: Enable once we support complex-typed exprs in the select list. //AnalyzesOk("select key, av from functional.allcomplextypes t, " + // "(select a1.key, a1.value from t.array_map_col a1) v1, " + // "(select avg(item) av from v1.value) v2"); @@ -1152,7 +1152,7 @@ public class AnalyzeStmtsTest extends AnalyzerTest { "(select a1.key, av from t.array_map_col a1, " + "(select avg(item) av from a1.value a2) v1) v2) " + "select * from w"); - // TOOD: Enable once we support complex-typed exprs in the select list. + // TODO: Enable once we support complex-typed exprs in the select list. //AnalyzesOk("with w as (select key, av from functional.allcomplextypes t, " + // "(select a1.key, a1.value from t.array_map_col a1) v1, " + // "(select avg(item) av from v1.value) v2) " + @@ -1196,7 +1196,7 @@ public class AnalyzeStmtsTest extends AnalyzerTest { "uncorrelated one 'functional.alltypes':\n" + "SELECT * FROM functional.alltypes, (SELECT count(1) cnt " + "FROM t.int_array_col) v1"); - // TOOD: Enable once we support complex-typed exprs in the select list. + // TODO: Enable once we support complex-typed exprs in the select list. // Correlated table ref has correlated inline view as parent. //AnalysisError("select cnt from functional.allcomplextypes t, " + // "(select value arr from t.array_map_col) v1, " + @@ -2741,6 +2741,10 @@ public class AnalyzeStmtsTest extends AnalyzerTest { "where timestamp_col between cast('2001-01-01' as timestamp) and " + "(cast('2001-01-01' as timestamp) + interval 10 days)) " + "select * from with_1"); + AnalyzesOk("with with_1 as (select 1 as col_name), " + + "with_2 as (select 1 as col_name) " + + "select a.tinyint_col from functional.alltypes a " + + "where not exists (select 1 from with_1) "); } @Test http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/46c3e43e/fe/src/test/java/com/cloudera/impala/analysis/AnalyzeSubqueriesTest.java ---------------------------------------------------------------------- diff --git a/fe/src/test/java/com/cloudera/impala/analysis/AnalyzeSubqueriesTest.java b/fe/src/test/java/com/cloudera/impala/analysis/AnalyzeSubqueriesTest.java index 206c245..a5b4fe2 100644 --- a/fe/src/test/java/com/cloudera/impala/analysis/AnalyzeSubqueriesTest.java +++ b/fe/src/test/java/com/cloudera/impala/analysis/AnalyzeSubqueriesTest.java @@ -635,15 +635,13 @@ public class AnalyzeSubqueriesTest extends AnalyzerTest { AnalyzesOk("select id from functional.alltypes where exists " + "(select id from functional.alltypestiny where int_col < 10 and exists (" + "select id from functional.alltypessmall where bool_col = true))"); - // Uncorrelated NOT EXISTS subquery is illegal with only relative table refs + // Uncorrelated NOT EXISTS with relative table ref AnalyzesOk(String.format( "select id from functional.allcomplextypes t where not exists " + "(select item from t.int_array_col a where item < 10)")); - // Uncorrelated NOT EXISTS subquery is illegal with absolute table refs - AnalysisError("select * from functional.alltypestiny where not exists " + - "(select 1 from functional.alltypessmall where bool_col = false)", - "Unsupported uncorrelated NOT EXISTS subquery: SELECT 1 FROM " + - "functional.alltypessmall WHERE bool_col = FALSE"); + // Uncorrelated NOT EXISTS subquery + AnalyzesOk("select * from functional.alltypestiny where not exists " + + "(select 1 from functional.alltypessmall where bool_col = false)"); // Subquery references an explicit alias from the outer block in the FROM // clause http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/46c3e43e/testdata/workloads/functional-planner/queries/PlannerTest/analytic-fns.test ---------------------------------------------------------------------- diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/analytic-fns.test b/testdata/workloads/functional-planner/queries/PlannerTest/analytic-fns.test index bbabef0..0f9b2d0 100644 --- a/testdata/workloads/functional-planner/queries/PlannerTest/analytic-fns.test +++ b/testdata/workloads/functional-planner/queries/PlannerTest/analytic-fns.test @@ -1653,7 +1653,7 @@ WHERE EXISTS (SELECT t1.year AS int_col_1 FROM functional.alltypesagg t1) 03:SORT | order by: day ASC | -02:NESTED LOOP JOIN [CROSS JOIN] +02:NESTED LOOP JOIN [LEFT SEMI JOIN] | |--01:SCAN HDFS [functional.alltypesagg t1] | partitions=11/11 files=11 size=814.73KB http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/46c3e43e/testdata/workloads/functional-planner/queries/PlannerTest/subquery-rewrite.test ---------------------------------------------------------------------- diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/subquery-rewrite.test b/testdata/workloads/functional-planner/queries/PlannerTest/subquery-rewrite.test index bbff166..8115502 100644 --- a/testdata/workloads/functional-planner/queries/PlannerTest/subquery-rewrite.test +++ b/testdata/workloads/functional-planner/queries/PlannerTest/subquery-rewrite.test @@ -640,7 +640,7 @@ select * from functional.alltypestiny t where exists (select * from functional.alltypessmall s where s.id < 5) ---- PLAN -02:NESTED LOOP JOIN [CROSS JOIN] +02:NESTED LOOP JOIN [LEFT SEMI JOIN] | |--01:SCAN HDFS [functional.alltypessmall s] | partitions=4/4 files=4 size=6.32KB @@ -658,7 +658,7 @@ where exists from functional.alltypesagg where tinyint_col = 10 group by id, int_col, bigint_col) ---- PLAN -03:NESTED LOOP JOIN [CROSS JOIN] +03:NESTED LOOP JOIN [RIGHT SEMI JOIN] | |--00:SCAN HDFS [functional.alltypestiny t] | partitions=4/4 files=4 size=460B @@ -678,6 +678,68 @@ where exists (select * from functional.alltypessmall limit 0) ---- PLAN 00:EMPTYSET ==== +# Uncorrelated NOT EXISTS +select * +from functional.alltypestiny t +where not exists (select * from functional.alltypessmall s where s.id < 5) +---- PLAN +02:NESTED LOOP JOIN [LEFT ANTI JOIN] +| +|--01:SCAN HDFS [functional.alltypessmall s] +| partitions=4/4 files=4 size=6.32KB +| predicates: s.id < 5 +| limit: 1 +| +00:SCAN HDFS [functional.alltypestiny t] + partitions=4/4 files=4 size=460B +==== +# Uncorrelated NOT exists referencing a WITH clause +with + w1 as (select * from functional.alltypestiny t), + w2 as (select * from functional.alltypessmall s where s.id < 0) +select * +from w1 t +where not exists (select 1 from w2) +---- PLAN +02:NESTED LOOP JOIN [LEFT ANTI JOIN] +| +|--01:SCAN HDFS [functional.alltypessmall s] +| partitions=4/4 files=4 size=6.32KB +| predicates: s.id < 0 +| limit: 1 +| +00:SCAN HDFS [functional.alltypestiny t] + partitions=4/4 files=4 size=460B +==== +# Uncorrelated NOT EXISTS with an analytic function and a group by clause +select 1 +from functional.alltypestiny t +where not exists + (select id, max(int_col) over (partition by bigint_col) + from functional.alltypesagg where tinyint_col = 10 + group by id, int_col, bigint_col) +---- PLAN +03:NESTED LOOP JOIN [RIGHT ANTI JOIN] +| +|--00:SCAN HDFS [functional.alltypestiny t] +| partitions=4/4 files=4 size=460B +| +02:AGGREGATE [FINALIZE] +| group by: id, int_col, bigint_col +| limit: 1 +| +01:SCAN HDFS [functional.alltypesagg] + partitions=11/11 files=11 size=814.73KB + predicates: tinyint_col = 10 +==== +# Uncorrelated NOT EXISTS with a LIMIT 0 clause +select 1 +from functional.alltypestiny t +where not exists (select * from functional.alltypessmall limit 0) +---- PLAN +00:SCAN HDFS [functional.alltypestiny t] + partitions=4/4 files=4 size=460B +==== # Multiple nesting levels select count(*) from functional.alltypes a http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/46c3e43e/testdata/workloads/functional-query/queries/QueryTest/subquery.test ---------------------------------------------------------------------- diff --git a/testdata/workloads/functional-query/queries/QueryTest/subquery.test b/testdata/workloads/functional-query/queries/QueryTest/subquery.test index af587f0..32b74d3 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/subquery.test +++ b/testdata/workloads/functional-query/queries/QueryTest/subquery.test @@ -330,6 +330,33 @@ and t.id > 0 TINYINT ==== ---- QUERY +# Uncorrelated NOT EXISTS +select id +from functional.alltypestiny t +where not exists (select 1 from functional.alltypessmall where bool_col = false) +and bool_col = true +---- RESULTS +---- TYPES +INT +==== +---- QUERY +# Uncorrelated NOT EXISTS that returns an empty set +select 1 +from functional.alltypestiny t +where not exists (select null from functional.alltypessmall where id < 0) +and t.id > 0 +---- RESULTS +1 +1 +1 +1 +1 +1 +1 +---- TYPES +TINYINT +==== +---- QUERY # Uncorrelated aggregate subquery select count(*) from functional.alltypessmall t
