IMPALA-1882: Remove ORDER BY restriction from first_value()/last_value()

In order to conform to the ISO SQL specifications, this patch allows
the 'order by' clause to be optional for first_value() and last_value()
analytical functions.

Testing:
1. Added Analyzer tests
2. Added Planner tests which checks that a sort node is not used if both
'order by' and 'partition by' clauses are not used. Also, checks that
if only 'partition by' clause is used then the sorting is done only on
the partition column.
3. Added a query test that checks that the input is not reordered by
these analytic functions if 'order by' clause is not used

Change-Id: I5a3a56833ac062839629353ea240b361bc727d96
Reviewed-on: http://gerrit.cloudera.org:8080/7502
Reviewed-by: Matthew Jacobs <[email protected]>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/32960649
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/32960649
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/32960649

Branch: refs/heads/master
Commit: 32960649746bca7517d0981f3563efea14c51950
Parents: 8623145
Author: Bikramjeet Vig <[email protected]>
Authored: Mon Jul 24 12:50:19 2017 -0700
Committer: Impala Public Jenkins <[email protected]>
Committed: Sat Jul 29 20:21:40 2017 +0000

----------------------------------------------------------------------
 .../apache/impala/analysis/AnalyticExpr.java    |  19 +--
 .../impala/analysis/AnalyzeExprsTest.java       |  11 ++
 .../queries/PlannerTest/analytic-fns.test       |  70 ++++++++++
 .../queries/QueryTest/analytic-fns.test         | 134 +++++++++++++++++++
 4 files changed, 226 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/32960649/fe/src/main/java/org/apache/impala/analysis/AnalyticExpr.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/analysis/AnalyticExpr.java 
b/fe/src/main/java/org/apache/impala/analysis/AnalyticExpr.java
index bf0fb1b..534d09a 100644
--- a/fe/src/main/java/org/apache/impala/analysis/AnalyticExpr.java
+++ b/fe/src/main/java/org/apache/impala/analysis/AnalyticExpr.java
@@ -227,6 +227,12 @@ public class AnalyticExpr extends Expr {
         isAnalyticFn(fn, ROWNUMBER);
   }
 
+  static private boolean isFirstOrLastValueFn(Function fn) {
+    return isAnalyticFn(fn, LAST_VALUE) || isAnalyticFn(fn, FIRST_VALUE) ||
+        isAnalyticFn(fn, LAST_VALUE_IGNORE_NULLS) ||
+        isAnalyticFn(fn, FIRST_VALUE_IGNORE_NULLS);
+  }
+
   /**
    * Rewrite the following analytic functions:
    * percent_rank(), cume_dist() and ntile()
@@ -447,16 +453,13 @@ public class AnalyticExpr extends Expr {
           "DISTINCT not allowed in analytic function: " + getFnCall().toSql());
     }
 
-    if (getFnCall().getParams().isIgnoreNulls()) {
-      String fnName = getFnCall().getFnName().getFunction();
-      if (!fnName.equals(LAST_VALUE) && !fnName.equals(FIRST_VALUE)) {
-        throw new AnalysisException("Function " + fnName.toUpperCase()
-            + " does not accept the keyword IGNORE NULLS.");
-      }
+    Function fn = getFnCall().getFn();
+    if (getFnCall().getParams().isIgnoreNulls() && !isFirstOrLastValueFn(fn)) {
+      throw new AnalysisException("Function " + fn.functionName().toUpperCase()
+          + " does not accept the keyword IGNORE NULLS.");
     }
 
     // check for correct composition of analytic expr
-    Function fn = getFnCall().getFn();
     if (!(fn instanceof AggregateFunction)) {
         throw new AnalysisException(
             "OVER clause requires aggregate or analytic function: "
@@ -471,7 +474,7 @@ public class AnalyticExpr extends Expr {
     }
 
     if (isAnalyticFn(fn) && !isAggregateFn(fn)) {
-      if (orderByElements_.isEmpty()) {
+      if (!isFirstOrLastValueFn(fn) && orderByElements_.isEmpty()) {
         throw new AnalysisException(
             "'" + getFnCall().toSql() + "' requires an ORDER BY clause");
       }

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/32960649/fe/src/test/java/org/apache/impala/analysis/AnalyzeExprsTest.java
----------------------------------------------------------------------
diff --git a/fe/src/test/java/org/apache/impala/analysis/AnalyzeExprsTest.java 
b/fe/src/test/java/org/apache/impala/analysis/AnalyzeExprsTest.java
index cc65a23..6f883a3 100644
--- a/fe/src/test/java/org/apache/impala/analysis/AnalyzeExprsTest.java
+++ b/fe/src/test/java/org/apache/impala/analysis/AnalyzeExprsTest.java
@@ -735,6 +735,17 @@ public class AnalyzeExprsTest extends AnalyzerTest {
                "from functional.alltypesagg a " +
                "where exists (select 1 from functional.alltypes b where a.id = 
b.id)");
 
+    // last_value/first_value without using order by
+    AnalyzesOk("select first_value(tinyint_col) over () from 
functional.alltypesagg");
+    AnalyzesOk("select last_value(tinyint_col) over () from 
functional.alltypesagg");
+    AnalyzesOk("select first_value(tinyint_col ignore nulls) over () from "
+        + "functional.alltypesagg");
+    AnalyzesOk("select last_value(tinyint_col ignore nulls) over () from "
+        + "functional.alltypesagg");
+    AnalyzesOk("select first_value(tinyint_col ignore nulls) over ()," +
+        "last_value(tinyint_col ignore nulls) over () from 
functional.alltypesagg a " +
+        "where exists (select 1 from functional.alltypes b where a.id = 
b.id)");
+
     // legal combinations of analytic and agg functions
     AnalyzesOk("select sum(count(id)) over (partition by min(int_col) "
         + "order by max(bigint_col)) from functional.alltypes group by id, 
tinyint_col "

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/32960649/testdata/workloads/functional-planner/queries/PlannerTest/analytic-fns.test
----------------------------------------------------------------------
diff --git 
a/testdata/workloads/functional-planner/queries/PlannerTest/analytic-fns.test 
b/testdata/workloads/functional-planner/queries/PlannerTest/analytic-fns.test
index c6d9c1a..2977f5d 100644
--- 
a/testdata/workloads/functional-planner/queries/PlannerTest/analytic-fns.test
+++ 
b/testdata/workloads/functional-planner/queries/PlannerTest/analytic-fns.test
@@ -2364,3 +2364,73 @@ PLAN-ROOT SINK
    partitions=24/24 files=24 size=478.45KB
    runtime filters: RF000 -> t1.id
 ====
+# IMPALA-1882: Confirm that first_value function used without a partition by 
and order
+# by clause does not need a sort node
+select first_value(tinyint_col ignore nulls) over () from 
functional.alltypesagg
+---- DISTRIBUTEDPLAN
+PLAN-ROOT SINK
+|
+01:ANALYTIC
+|  functions: first_value_ignore_nulls(tinyint_col)
+|
+02:EXCHANGE [UNPARTITIONED]
+|
+00:SCAN HDFS [functional.alltypesagg]
+   partitions=11/11 files=11 size=814.73KB
+====
+# IMPALA-1882: Confirm that last_value function used without a partition by 
and order
+# by clause does not need a sort node
+select last_value(tinyint_col ignore nulls) over () from functional.alltypesagg
+---- DISTRIBUTEDPLAN
+PLAN-ROOT SINK
+|
+01:ANALYTIC
+|  functions: last_value_ignore_nulls(tinyint_col)
+|
+02:EXCHANGE [UNPARTITIONED]
+|
+00:SCAN HDFS [functional.alltypesagg]
+   partitions=11/11 files=11 size=814.73KB
+====
+# IMPALA-1882: Confirm that first_value function using only a partition by 
clause
+# sorts over partition column
+select *, first_value(id) over (partition by bool_col) first_val from
+functional.alltypessmall;
+---- DISTRIBUTEDPLAN
+PLAN-ROOT SINK
+|
+04:EXCHANGE [UNPARTITIONED]
+|
+02:ANALYTIC
+|  functions: first_value(id)
+|  partition by: bool_col
+|
+01:SORT
+|  order by: bool_col ASC NULLS FIRST
+|
+03:EXCHANGE [HASH(bool_col)]
+|
+00:SCAN HDFS [functional.alltypessmall]
+   partitions=4/4 files=4 size=6.32KB
+====
+# IMPALA-1882: Confirm that last_value function using only a partition by 
clause
+# sorts over partition column
+select *, last_value(id) over (partition by bool_col) first_val from
+functional.alltypessmall;
+---- DISTRIBUTEDPLAN
+PLAN-ROOT SINK
+|
+04:EXCHANGE [UNPARTITIONED]
+|
+02:ANALYTIC
+|  functions: last_value(id)
+|  partition by: bool_col
+|
+01:SORT
+|  order by: bool_col ASC NULLS FIRST
+|
+03:EXCHANGE [HASH(bool_col)]
+|
+00:SCAN HDFS [functional.alltypessmall]
+   partitions=4/4 files=4 size=6.32KB
+====

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/32960649/testdata/workloads/functional-query/queries/QueryTest/analytic-fns.test
----------------------------------------------------------------------
diff --git 
a/testdata/workloads/functional-query/queries/QueryTest/analytic-fns.test 
b/testdata/workloads/functional-query/queries/QueryTest/analytic-fns.test
index 0e2f6b0..e697914 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/analytic-fns.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/analytic-fns.test
@@ -1955,3 +1955,137 @@ TINYINT
 4
 4
 ====
+---- QUERY
+# Test for IMPALA-1882: first_value works without order by clause. Therefore, 
output of
+# function depends on order of input rows, which in this case is DESC
+select id, bool_col, first_value(id) over (partition by bool_col) from
+(select * from functional.alltypessmall order by id desc limit 5) as t1
+---- TYPES
+INT, BOOLEAN, INT
+---- RESULTS
+99,false,99
+97,false,99
+95,false,99
+98,true,98
+96,true,98
+====
+---- QUERY
+# Test for IMPALA-1882: first_value works without order by clause. Therefore, 
output of
+# function depends on order of input rows, which in this case is ASC
+select id, bool_col, first_value(id) over (partition by bool_col) from
+(select * from functional.alltypessmall order by id asc limit 5) as t1
+---- TYPES
+INT, BOOLEAN, INT
+---- RESULTS
+1,false,1
+3,false,1
+0,true,0
+2,true,0
+4,true,0
+---- QUERY
+# Test for IMPALA-1882: last_value works without order by clause. Therefore, 
output of
+# function depends on order of input rows, which in this case is DESC
+select id, bool_col, last_value(id) over (partition by bool_col) from
+(select * from functional.alltypessmall order by id desc limit 5) as t1
+---- TYPES
+INT, BOOLEAN, INT
+---- RESULTS
+99,false,95
+97,false,95
+95,false,95
+98,true,96
+96,true,96
+====
+---- QUERY
+# Test for IMPALA-1882: last_value works without order by clause. Therefore, 
output of
+# function depends on order of input rows, which in this case is ASC
+select id, bool_col, last_value(id) over (partition by bool_col) from
+(select * from functional.alltypessmall order by id asc limit 5) as t1
+---- TYPES
+INT, BOOLEAN, INT
+---- RESULTS
+1,false,3
+3,false,3
+0,true,4
+2,true,4
+4,true,4
+====
+---- QUERY
+# Test for IMPALA-1882: first_value works with ignore nulls and without order 
by clause.
+# Therefore, output of function depends on order of input rows, which in this 
case is ASC
+select bool_col, smallint_col, first_value(smallint_col ignore nulls) over
+(partition by bool_col) from (select * from functional.alltypesagg where
+id > 99 order by id asc limit 10) as t1
+---- TYPES
+BOOLEAN, SMALLINT, SMALLINT
+---- RESULTS
+false,1,1
+false,3,1
+false,5,1
+false,7,1
+true,NULL,2
+true,NULL,2
+true,2,2
+true,4,2
+true,6,2
+true,8,2
+====
+---- QUERY
+# Test for IMPALA-1882: first_value works with ignore nulls and without order 
by clause.
+# Therefore, output of function depends on order of input rows, which in this 
case is DESC
+select bool_col, smallint_col, first_value(smallint_col ignore nulls) over
+(partition by bool_col) from (select * from functional.alltypesagg where
+id < 101 order by id desc limit 10) as t1;
+---- TYPES
+BOOLEAN, SMALLINT, SMALLINT
+---- RESULTS
+false,99,99
+false,97,99
+false,95,99
+false,93,99
+true,NULL,98
+true,NULL,98
+true,98,98
+true,96,98
+true,94,98
+true,92,98
+====
+---- QUERY
+# Test for IMPALA-1882: last_value works with ignore nulls and without order 
by clause.
+# Therefore, output of function depends on order of input rows, which in this 
case is ASC
+select bool_col, smallint_col, last_value(smallint_col ignore nulls) over
+(partition by bool_col) from (select * from functional.alltypesagg where
+id > 99 order by id asc limit 10) as t1;
+---- TYPES
+BOOLEAN, SMALLINT, SMALLINT
+---- RESULTS
+false,1,7
+false,3,7
+false,5,7
+false,7,7
+true,NULL,8
+true,NULL,8
+true,2,8
+true,4,8
+true,6,8
+true,8,8
+====
+---- QUERY
+# Test for IMPALA-1882: last_value works with ignore nulls and without order 
by clause.
+# Therefore, output of function depends on order of input rows, which in this 
case is DESC
+select bool_col, smallint_col, last_value(smallint_col ignore nulls) over
+(partition by bool_col) from (select * from functional.alltypesagg where
+id < 110 order by id desc limit 10) as t1;
+---- TYPES
+BOOLEAN, SMALLINT, SMALLINT
+---- RESULTS
+false,9,1
+false,7,1
+false,5,1
+false,3,1
+false,1,1
+true,8,2
+true,6,2
+true,4,2
+true,2,2
+true,NULL,2

Reply via email to