IMPALA-1882: Remove ORDER BY restriction from first_value()/last_value() In order to conform to the ISO SQL specifications, this patch allows the 'order by' clause to be optional for first_value() and last_value() analytical functions.
Testing: 1. Added Analyzer tests 2. Added Planner tests which checks that a sort node is not used if both 'order by' and 'partition by' clauses are not used. Also, checks that if only 'partition by' clause is used then the sorting is done only on the partition column. 3. Added a query test that checks that the input is not reordered by these analytic functions if 'order by' clause is not used Change-Id: I5a3a56833ac062839629353ea240b361bc727d96 Reviewed-on: http://gerrit.cloudera.org:8080/7502 Reviewed-by: Matthew Jacobs <[email protected]> Tested-by: Impala Public Jenkins Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/32960649 Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/32960649 Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/32960649 Branch: refs/heads/master Commit: 32960649746bca7517d0981f3563efea14c51950 Parents: 8623145 Author: Bikramjeet Vig <[email protected]> Authored: Mon Jul 24 12:50:19 2017 -0700 Committer: Impala Public Jenkins <[email protected]> Committed: Sat Jul 29 20:21:40 2017 +0000 ---------------------------------------------------------------------- .../apache/impala/analysis/AnalyticExpr.java | 19 +-- .../impala/analysis/AnalyzeExprsTest.java | 11 ++ .../queries/PlannerTest/analytic-fns.test | 70 ++++++++++ .../queries/QueryTest/analytic-fns.test | 134 +++++++++++++++++++ 4 files changed, 226 insertions(+), 8 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/32960649/fe/src/main/java/org/apache/impala/analysis/AnalyticExpr.java ---------------------------------------------------------------------- diff --git a/fe/src/main/java/org/apache/impala/analysis/AnalyticExpr.java b/fe/src/main/java/org/apache/impala/analysis/AnalyticExpr.java index bf0fb1b..534d09a 100644 --- a/fe/src/main/java/org/apache/impala/analysis/AnalyticExpr.java +++ b/fe/src/main/java/org/apache/impala/analysis/AnalyticExpr.java @@ -227,6 +227,12 @@ public class AnalyticExpr extends Expr { isAnalyticFn(fn, ROWNUMBER); } + static private boolean isFirstOrLastValueFn(Function fn) { + return isAnalyticFn(fn, LAST_VALUE) || isAnalyticFn(fn, FIRST_VALUE) || + isAnalyticFn(fn, LAST_VALUE_IGNORE_NULLS) || + isAnalyticFn(fn, FIRST_VALUE_IGNORE_NULLS); + } + /** * Rewrite the following analytic functions: * percent_rank(), cume_dist() and ntile() @@ -447,16 +453,13 @@ public class AnalyticExpr extends Expr { "DISTINCT not allowed in analytic function: " + getFnCall().toSql()); } - if (getFnCall().getParams().isIgnoreNulls()) { - String fnName = getFnCall().getFnName().getFunction(); - if (!fnName.equals(LAST_VALUE) && !fnName.equals(FIRST_VALUE)) { - throw new AnalysisException("Function " + fnName.toUpperCase() - + " does not accept the keyword IGNORE NULLS."); - } + Function fn = getFnCall().getFn(); + if (getFnCall().getParams().isIgnoreNulls() && !isFirstOrLastValueFn(fn)) { + throw new AnalysisException("Function " + fn.functionName().toUpperCase() + + " does not accept the keyword IGNORE NULLS."); } // check for correct composition of analytic expr - Function fn = getFnCall().getFn(); if (!(fn instanceof AggregateFunction)) { throw new AnalysisException( "OVER clause requires aggregate or analytic function: " @@ -471,7 +474,7 @@ public class AnalyticExpr extends Expr { } if (isAnalyticFn(fn) && !isAggregateFn(fn)) { - if (orderByElements_.isEmpty()) { + if (!isFirstOrLastValueFn(fn) && orderByElements_.isEmpty()) { throw new AnalysisException( "'" + getFnCall().toSql() + "' requires an ORDER BY clause"); } http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/32960649/fe/src/test/java/org/apache/impala/analysis/AnalyzeExprsTest.java ---------------------------------------------------------------------- diff --git a/fe/src/test/java/org/apache/impala/analysis/AnalyzeExprsTest.java b/fe/src/test/java/org/apache/impala/analysis/AnalyzeExprsTest.java index cc65a23..6f883a3 100644 --- a/fe/src/test/java/org/apache/impala/analysis/AnalyzeExprsTest.java +++ b/fe/src/test/java/org/apache/impala/analysis/AnalyzeExprsTest.java @@ -735,6 +735,17 @@ public class AnalyzeExprsTest extends AnalyzerTest { "from functional.alltypesagg a " + "where exists (select 1 from functional.alltypes b where a.id = b.id)"); + // last_value/first_value without using order by + AnalyzesOk("select first_value(tinyint_col) over () from functional.alltypesagg"); + AnalyzesOk("select last_value(tinyint_col) over () from functional.alltypesagg"); + AnalyzesOk("select first_value(tinyint_col ignore nulls) over () from " + + "functional.alltypesagg"); + AnalyzesOk("select last_value(tinyint_col ignore nulls) over () from " + + "functional.alltypesagg"); + AnalyzesOk("select first_value(tinyint_col ignore nulls) over ()," + + "last_value(tinyint_col ignore nulls) over () from functional.alltypesagg a " + + "where exists (select 1 from functional.alltypes b where a.id = b.id)"); + // legal combinations of analytic and agg functions AnalyzesOk("select sum(count(id)) over (partition by min(int_col) " + "order by max(bigint_col)) from functional.alltypes group by id, tinyint_col " http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/32960649/testdata/workloads/functional-planner/queries/PlannerTest/analytic-fns.test ---------------------------------------------------------------------- diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/analytic-fns.test b/testdata/workloads/functional-planner/queries/PlannerTest/analytic-fns.test index c6d9c1a..2977f5d 100644 --- a/testdata/workloads/functional-planner/queries/PlannerTest/analytic-fns.test +++ b/testdata/workloads/functional-planner/queries/PlannerTest/analytic-fns.test @@ -2364,3 +2364,73 @@ PLAN-ROOT SINK partitions=24/24 files=24 size=478.45KB runtime filters: RF000 -> t1.id ==== +# IMPALA-1882: Confirm that first_value function used without a partition by and order +# by clause does not need a sort node +select first_value(tinyint_col ignore nulls) over () from functional.alltypesagg +---- DISTRIBUTEDPLAN +PLAN-ROOT SINK +| +01:ANALYTIC +| functions: first_value_ignore_nulls(tinyint_col) +| +02:EXCHANGE [UNPARTITIONED] +| +00:SCAN HDFS [functional.alltypesagg] + partitions=11/11 files=11 size=814.73KB +==== +# IMPALA-1882: Confirm that last_value function used without a partition by and order +# by clause does not need a sort node +select last_value(tinyint_col ignore nulls) over () from functional.alltypesagg +---- DISTRIBUTEDPLAN +PLAN-ROOT SINK +| +01:ANALYTIC +| functions: last_value_ignore_nulls(tinyint_col) +| +02:EXCHANGE [UNPARTITIONED] +| +00:SCAN HDFS [functional.alltypesagg] + partitions=11/11 files=11 size=814.73KB +==== +# IMPALA-1882: Confirm that first_value function using only a partition by clause +# sorts over partition column +select *, first_value(id) over (partition by bool_col) first_val from +functional.alltypessmall; +---- DISTRIBUTEDPLAN +PLAN-ROOT SINK +| +04:EXCHANGE [UNPARTITIONED] +| +02:ANALYTIC +| functions: first_value(id) +| partition by: bool_col +| +01:SORT +| order by: bool_col ASC NULLS FIRST +| +03:EXCHANGE [HASH(bool_col)] +| +00:SCAN HDFS [functional.alltypessmall] + partitions=4/4 files=4 size=6.32KB +==== +# IMPALA-1882: Confirm that last_value function using only a partition by clause +# sorts over partition column +select *, last_value(id) over (partition by bool_col) first_val from +functional.alltypessmall; +---- DISTRIBUTEDPLAN +PLAN-ROOT SINK +| +04:EXCHANGE [UNPARTITIONED] +| +02:ANALYTIC +| functions: last_value(id) +| partition by: bool_col +| +01:SORT +| order by: bool_col ASC NULLS FIRST +| +03:EXCHANGE [HASH(bool_col)] +| +00:SCAN HDFS [functional.alltypessmall] + partitions=4/4 files=4 size=6.32KB +==== http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/32960649/testdata/workloads/functional-query/queries/QueryTest/analytic-fns.test ---------------------------------------------------------------------- diff --git a/testdata/workloads/functional-query/queries/QueryTest/analytic-fns.test b/testdata/workloads/functional-query/queries/QueryTest/analytic-fns.test index 0e2f6b0..e697914 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/analytic-fns.test +++ b/testdata/workloads/functional-query/queries/QueryTest/analytic-fns.test @@ -1955,3 +1955,137 @@ TINYINT 4 4 ==== +---- QUERY +# Test for IMPALA-1882: first_value works without order by clause. Therefore, output of +# function depends on order of input rows, which in this case is DESC +select id, bool_col, first_value(id) over (partition by bool_col) from +(select * from functional.alltypessmall order by id desc limit 5) as t1 +---- TYPES +INT, BOOLEAN, INT +---- RESULTS +99,false,99 +97,false,99 +95,false,99 +98,true,98 +96,true,98 +==== +---- QUERY +# Test for IMPALA-1882: first_value works without order by clause. Therefore, output of +# function depends on order of input rows, which in this case is ASC +select id, bool_col, first_value(id) over (partition by bool_col) from +(select * from functional.alltypessmall order by id asc limit 5) as t1 +---- TYPES +INT, BOOLEAN, INT +---- RESULTS +1,false,1 +3,false,1 +0,true,0 +2,true,0 +4,true,0 +---- QUERY +# Test for IMPALA-1882: last_value works without order by clause. Therefore, output of +# function depends on order of input rows, which in this case is DESC +select id, bool_col, last_value(id) over (partition by bool_col) from +(select * from functional.alltypessmall order by id desc limit 5) as t1 +---- TYPES +INT, BOOLEAN, INT +---- RESULTS +99,false,95 +97,false,95 +95,false,95 +98,true,96 +96,true,96 +==== +---- QUERY +# Test for IMPALA-1882: last_value works without order by clause. Therefore, output of +# function depends on order of input rows, which in this case is ASC +select id, bool_col, last_value(id) over (partition by bool_col) from +(select * from functional.alltypessmall order by id asc limit 5) as t1 +---- TYPES +INT, BOOLEAN, INT +---- RESULTS +1,false,3 +3,false,3 +0,true,4 +2,true,4 +4,true,4 +==== +---- QUERY +# Test for IMPALA-1882: first_value works with ignore nulls and without order by clause. +# Therefore, output of function depends on order of input rows, which in this case is ASC +select bool_col, smallint_col, first_value(smallint_col ignore nulls) over +(partition by bool_col) from (select * from functional.alltypesagg where +id > 99 order by id asc limit 10) as t1 +---- TYPES +BOOLEAN, SMALLINT, SMALLINT +---- RESULTS +false,1,1 +false,3,1 +false,5,1 +false,7,1 +true,NULL,2 +true,NULL,2 +true,2,2 +true,4,2 +true,6,2 +true,8,2 +==== +---- QUERY +# Test for IMPALA-1882: first_value works with ignore nulls and without order by clause. +# Therefore, output of function depends on order of input rows, which in this case is DESC +select bool_col, smallint_col, first_value(smallint_col ignore nulls) over +(partition by bool_col) from (select * from functional.alltypesagg where +id < 101 order by id desc limit 10) as t1; +---- TYPES +BOOLEAN, SMALLINT, SMALLINT +---- RESULTS +false,99,99 +false,97,99 +false,95,99 +false,93,99 +true,NULL,98 +true,NULL,98 +true,98,98 +true,96,98 +true,94,98 +true,92,98 +==== +---- QUERY +# Test for IMPALA-1882: last_value works with ignore nulls and without order by clause. +# Therefore, output of function depends on order of input rows, which in this case is ASC +select bool_col, smallint_col, last_value(smallint_col ignore nulls) over +(partition by bool_col) from (select * from functional.alltypesagg where +id > 99 order by id asc limit 10) as t1; +---- TYPES +BOOLEAN, SMALLINT, SMALLINT +---- RESULTS +false,1,7 +false,3,7 +false,5,7 +false,7,7 +true,NULL,8 +true,NULL,8 +true,2,8 +true,4,8 +true,6,8 +true,8,8 +==== +---- QUERY +# Test for IMPALA-1882: last_value works with ignore nulls and without order by clause. +# Therefore, output of function depends on order of input rows, which in this case is DESC +select bool_col, smallint_col, last_value(smallint_col ignore nulls) over +(partition by bool_col) from (select * from functional.alltypesagg where +id < 110 order by id desc limit 10) as t1; +---- TYPES +BOOLEAN, SMALLINT, SMALLINT +---- RESULTS +false,9,1 +false,7,1 +false,5,1 +false,3,1 +false,1,1 +true,8,2 +true,6,2 +true,4,2 +true,2,2 +true,NULL,2
