[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-23 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r361051245
 
 

 ##
 File path: sql/core/src/test/resources/sql-tests/inputs/group-by-filter.sql
 ##
 @@ -0,0 +1,156 @@
+-- Test filter clause for aggregate expression.
+
+-- Test data.
+CREATE OR REPLACE TEMPORARY VIEW testData AS SELECT * FROM VALUES
+(1, 1), (1, 2), (2, 1), (2, 2), (3, 1), (3, 2), (null, 1), (3, null), (null, null)
+AS testData(a, b);
+
+CREATE OR REPLACE TEMPORARY VIEW EMP AS SELECT * FROM VALUES
+  (100, "emp 1", date "2005-01-01", 100.00D, 10),
+  (100, "emp 1", date "2005-01-01", 100.00D, 10),
+  (200, "emp 2", date "2003-01-01", 200.00D, 10),
+  (300, "emp 3", date "2002-01-01", 300.00D, 20),
+  (400, "emp 4", date "2005-01-01", 400.00D, 30),
+  (500, "emp 5", date "2001-01-01", 400.00D, NULL),
+  (600, "emp 6 - no dept", date "2001-01-01", 400.00D, 100),
+  (700, "emp 7", date "2010-01-01", 400.00D, 100),
+  (800, "emp 8", date "2016-01-01", 150.00D, 70)
+AS EMP(id, emp_name, hiredate, salary, dept_id);
+
+CREATE OR REPLACE TEMPORARY VIEW DEPT AS SELECT * FROM VALUES
+  (10, "dept 1", "CA"),
+  (20, "dept 2", "NY"),
+  (30, "dept 3", "TX"),
+  (40, "dept 4 - unassigned", "OR"),
+  (50, "dept 5 - unassigned", "NJ"),
+  (70, "dept 7", "FL")
+AS DEPT(dept_id, dept_name, state);
+
+-- Aggregate with filter and empty GroupBy expressions.
+SELECT a, COUNT(b) FILTER (WHERE a >= 2) FROM testData;
+SELECT COUNT(a) FILTER (WHERE a = 1), COUNT(b) FILTER (WHERE a > 1) FROM testData;
+SELECT COUNT(id) FILTER (WHERE hiredate = date "2001-01-01") FROM emp;
+SELECT COUNT(id) FILTER (WHERE hiredate = to_date('2001-01-01 00:00:00')) FROM emp;
+SELECT COUNT(id) FILTER (WHERE hiredate = to_timestamp("2001-01-01 00:00:00")) FROM emp;
+SELECT COUNT(id) FILTER (WHERE date_format(hiredate, "yyyy-MM-dd") = "2001-01-01") FROM emp;
+-- [SPARK-30276] Support Filter expression allows simultaneous use of DISTINCT
+-- SELECT COUNT(DISTINCT id) FILTER (WHERE date_format(hiredate, "yyyy-MM-dd HH:mm:ss") = "2001-01-01 00:00:00") FROM emp;
+
+-- Aggregate with filter and non-empty GroupBy expressions.
+SELECT a, COUNT(b) FILTER (WHERE a >= 2) FROM testData GROUP BY a;
+SELECT a, COUNT(b) FILTER (WHERE a != 2) FROM testData GROUP BY b;
+SELECT COUNT(a) FILTER (WHERE a >= 0), COUNT(b) FILTER (WHERE a >= 3) FROM testData GROUP BY a;
+SELECT dept_id, SUM(salary) FILTER (WHERE hiredate > date "2003-01-01") FROM emp GROUP BY dept_id;
+SELECT dept_id, SUM(salary) FILTER (WHERE hiredate > to_date("2003-01-01")) FROM emp GROUP BY dept_id;
+SELECT dept_id, SUM(salary) FILTER (WHERE hiredate > to_timestamp("2003-01-01 00:00:00")) FROM emp GROUP BY dept_id;
+SELECT dept_id, SUM(salary) FILTER (WHERE date_format(hiredate, "yyyy-MM-dd") > "2003-01-01") FROM emp GROUP BY dept_id;
+-- [SPARK-30276] Support Filter expression allows simultaneous use of DISTINCT
+-- SELECT dept_id, SUM(DISTINCT salary) FILTER (WHERE date_format(hiredate, "yyyy-MM-dd HH:mm:ss") > "2001-01-01 00:00:00") FROM emp GROUP BY dept_id;
+
+-- Aggregate with filter and grouped by literals.
+SELECT 'foo', COUNT(a) FILTER (WHERE b <= 2) FROM testData GROUP BY 1;
+SELECT 'foo', SUM(salary) FILTER (WHERE hiredate >= date "2003-01-01") FROM emp GROUP BY 1;
+SELECT 'foo', SUM(salary) FILTER (WHERE hiredate >= to_date("2003-01-01")) FROM emp GROUP BY 1;
+SELECT 'foo', SUM(salary) FILTER (WHERE hiredate >= to_timestamp("2003-01-01")) FROM emp GROUP BY 1;
+
+-- Aggregate with filter, more than one aggregate function goes with distinct.
+select dept_id, count(distinct emp_name), count(distinct hiredate), sum(salary), sum(salary) filter (where id > 200) from emp group by dept_id;
+select dept_id, count(distinct emp_name), count(distinct hiredate), sum(salary) filter (where salary < 400.00D), sum(salary) filter (where id > 200) from emp group by dept_id;
+-- [SPARK-30276] Support Filter expression allows simultaneous use of DISTINCT
+-- select dept_id, count(distinct emp_name) filter (where id > 200), count(distinct hiredate), sum(salary) from emp group by dept_id;
+-- select dept_id, count(distinct emp_name) filter (where id > 200), count(distinct hiredate) filter (where hiredate > date "2003-01-01"), sum(salary) from emp group by dept_id;
+-- select dept_id, count(distinct emp_name) filter (where id > 200), count(distinct hiredate) filter (where hiredate > date "2003-01-01"), sum(salary) filter (where salary < 400.00D) from emp group by dept_id;
+-- select dept_id, count(distinct emp_name) filter (where id > 200), count(distinct hiredate) filter (where hiredate > date "2003-01-01"), sum(salary) filter (where salary < 400.00D), sum(salary) filter (where id > 200) from emp group by dept_id;
+-- select dept_id, count(distinct emp_name) filter (where id > 200), count(distinct emp_name), sum(salary) from emp group by dept_id;
+-- select dept_id, 
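
For context on what these inputs exercise: agg(expr) FILTER (WHERE pred) aggregates only the rows for which the predicate holds, which for SUM and COUNT matches the older CASE WHEN rewrite. A minimal, self-contained sketch, not part of the PR; it assumes a Spark build that already includes this feature (3.0+), and the object and session names are illustrative:

import org.apache.spark.sql.SparkSession

object FilterClauseSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[1]").appName("filter-clause-sketch").getOrCreate()
    import spark.implicits._

    Seq((1, 1), (1, 2), (2, 1), (2, 2), (3, 1), (3, 2))
      .toDF("a", "b").createOrReplaceTempView("testData2")

    // The ANSI SQL filter clause under test in this PR.
    val withFilter = spark.sql(
      "SELECT SUM(a) FILTER (WHERE b > 1) FROM testData2").head().getLong(0)

    // Equivalent pre-FILTER formulation: mask out non-matching rows.
    val withCase = spark.sql(
      "SELECT SUM(CASE WHEN b > 1 THEN a END) FROM testData2").head().getLong(0)

    assert(withFilter == withCase) // both are 1 + 2 + 3 = 6
    spark.stop()
  }
}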

[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-23 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r361042960
 
 

 ##
 File path: sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
 ##
 @@ -2835,6 +2835,54 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession {
     checkAnswer(df, Row(1, 3, 4) :: Row(2, 3, 4) :: Row(3, 3, 4) :: Nil)
   }
 
+  test("Support filter clause for aggregate function with hash aggregate") {
+    Seq(("COUNT(a)", 3), ("COLLECT_LIST(a)", Seq(1, 2, 3))).foreach { funcToResult =>
+      val query = s"SELECT ${funcToResult._1} FILTER (WHERE b > 1) FROM testData2"
+      val df = sql(query)
+      val physical = df.queryExecution.sparkPlan
+      val aggregateExpressions = physical.collectFirst {
+        case agg: HashAggregateExec => agg.aggregateExpressions
+        case agg: ObjectHashAggregateExec => agg.aggregateExpressions
+      }
+      assert(aggregateExpressions.isDefined)
+      assert(aggregateExpressions.get.size == 1)
+      aggregateExpressions.get.foreach { expr =>
+        assert(expr.filter.isDefined)
+      }
+      checkAnswer(df, Row(funcToResult._2) :: Nil)
+    }
+  }
+
+  test("Support filter clause for aggregate function uses SortAggregateExec") {
+    withSQLConf(SQLConf.USE_OBJECT_HASH_AGG.key -> "false") {
+      val query = s"SELECT PERCENTILE(a, 1) FILTER (WHERE b > 1) FROM testData2"
+      val df = sql(query)
 
 Review comment:
   OK




[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-23 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r361042691
 
 

 ##
 File path: sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
 ##
 @@ -2835,6 +2835,54 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession {
     checkAnswer(df, Row(1, 3, 4) :: Row(2, 3, 4) :: Row(3, 3, 4) :: Nil)
   }
 
+  test("Support filter clause for aggregate function with hash aggregate") {
+    Seq(("COUNT(a)", 3), ("COLLECT_LIST(a)", Seq(1, 2, 3))).foreach { funcToResult =>
+      val query = s"SELECT ${funcToResult._1} FILTER (WHERE b > 1) FROM testData2"
+      val df = sql(query)
+      val physical = df.queryExecution.sparkPlan
+      val aggregateExpressions = physical.collectFirst {
+        case agg: HashAggregateExec => agg.aggregateExpressions
+        case agg: ObjectHashAggregateExec => agg.aggregateExpressions
+      }
+      assert(aggregateExpressions.isDefined)
+      assert(aggregateExpressions.get.size == 1)
+      aggregateExpressions.get.foreach { expr =>
+        assert(expr.filter.isDefined)
+      }
+      checkAnswer(df, Row(funcToResult._2) :: Nil)
+    }
+  }
+
+  test("Support filter clause for aggregate function uses SortAggregateExec") {
+    withSQLConf(SQLConf.USE_OBJECT_HASH_AGG.key -> "false") {
+      val query = s"SELECT PERCENTILE(a, 1) FILTER (WHERE b > 1) FROM testData2"
+      val df = sql(query)
+      val physical = df.queryExecution.sparkPlan
+      val aggregateExpressions = physical.collectFirst {
+        case agg: SortAggregateExec => agg.aggregateExpressions
+      }
+      assert(aggregateExpressions.isDefined)
+      assert(aggregateExpressions.get.size == 1)
+      aggregateExpressions.get.foreach { expr =>
+        assert(expr.filter.isDefined)
+      }
+      checkAnswer(df, Row(3) :: Nil)
+    }
+  }
+
+  test("Support filter clause for aggregate function with group") {
 Review comment:
   OK. I will remove this.




[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-23 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r361042553
 
 

 ##
 File path: sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
 ##
 @@ -2835,6 +2835,54 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession {
     checkAnswer(df, Row(1, 3, 4) :: Row(2, 3, 4) :: Row(3, 3, 4) :: Nil)
   }
 
+  test("Support filter clause for aggregate function with hash aggregate") {
+    Seq(("COUNT(a)", 3), ("COLLECT_LIST(a)", Seq(1, 2, 3))).foreach { funcToResult =>
+      val query = s"SELECT ${funcToResult._1} FILTER (WHERE b > 1) FROM testData2"
+      val df = sql(query)
+      val physical = df.queryExecution.sparkPlan
+      val aggregateExpressions = physical.collectFirst {
+        case agg: HashAggregateExec => agg.aggregateExpressions
+        case agg: ObjectHashAggregateExec => agg.aggregateExpressions
+      }
+      assert(aggregateExpressions.isDefined)
+      assert(aggregateExpressions.get.size == 1)
+      aggregateExpressions.get.foreach { expr =>
+        assert(expr.filter.isDefined)
+      }
+      checkAnswer(df, Row(funcToResult._2) :: Nil)
+    }
+  }
+
+  test("Support filter clause for aggregate function uses SortAggregateExec") {
+    withSQLConf(SQLConf.USE_OBJECT_HASH_AGG.key -> "false") {
+      val query = s"SELECT PERCENTILE(a, 1) FILTER (WHERE b > 1) FROM testData2"
+      val df = sql(query)
+      val physical = df.queryExecution.sparkPlan
+      val aggregateExpressions = physical.collectFirst {
+        case agg: SortAggregateExec => agg.aggregateExpressions
+      }
+      assert(aggregateExpressions.isDefined)
+      assert(aggregateExpressions.get.size == 1)
+      aggregateExpressions.get.foreach { expr =>
+        assert(expr.filter.isDefined)
+      }
+      checkAnswer(df, Row(3) :: Nil)
+    }
+  }
+
+  test("Support filter clause for aggregate function with group") {
+    val query = s"SELECT b, MAX(a) FILTER (WHERE b = (select 2)) FROM testData2 GROUP BY b"
+    val df = sql(query)
+    val physical = df.queryExecution.sparkPlan
+    val aggregateExpressions = physical.collectFirst {
+      case agg: HashAggregateExec => agg.aggregateExpressions
+      case agg: SortAggregateExec => agg.aggregateExpressions
+    }
+    assert(aggregateExpressions.isDefined)
+    assert(aggregateExpressions.get.size == 1)
 
 Review comment:
   OK. I will remove this.




[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-23 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r361042249
 
 

 ##
 File path: sql/core/src/test/resources/sql-tests/inputs/postgreSQL/groupingsets.sql
 ##
 @@ -337,7 +337,8 @@ order by 2,1;
 
 -- FILTER queries
 -- [SPARK-27986] Support Aggregate Expressions with filter
 
 Review comment:
   OK. Thanks for the reminder.




[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-22 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r360748210
 
 

 ##
 File path: sql/core/src/test/resources/sql-tests/inputs/group-by-filter.sql
 ##

[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-22 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r360752300
 
 

 ##
 File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RewriteDistinctAggregates.scala
 ##
 @@ -194,7 +238,10 @@ object RewriteDistinctAggregates extends Rule[LogicalPlan] {
       val regularAggOperatorMap = regularAggExprs.map { e =>
         // Perform the actual aggregation in the initial aggregate.
         val af = patchAggregateFunctionChildren(e.aggregateFunction)(regularAggChildAttrLookup.get)
-        val operator = Alias(e.copy(aggregateFunction = af), e.sql)()
+        val filterOpt = e.filter.map(_.transform {
 
 Review comment:
   OK. The comments are indeed needed there.
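
For reference, a toy sketch of what the transform above does; the Expr ADT and all names here are illustrative stand-ins, not Catalyst. When the rule splits aggregation into two phases, a column reference inside a FILTER predicate must be rewritten to the attribute the first phase actually outputs, just as patchAggregateFunctionChildren rewrites the aggregate function's own children:

sealed trait Expr {
  // Bottom-up rewrite, analogous to Catalyst's TreeNode.transform.
  def transform(rule: PartialFunction[Expr, Expr]): Expr = {
    val rewritten = this match {
      case GreaterThan(l, r) => GreaterThan(l.transform(rule), r.transform(rule))
      case leaf => leaf
    }
    rule.applyOrElse(rewritten, identity[Expr])
  }
}
case class Attr(name: String) extends Expr
case class Lit(value: Int) extends Expr
case class GreaterThan(left: Expr, right: Expr) extends Expr

object PatchFilterSketch extends App {
  // The first phase renames `id` to an internal attribute, much like the
  // rule's child-attribute lookup map.
  val childAttrLookup: Map[Expr, Expr] = Map(Attr("id") -> Attr("id#42"))
  val filter: Option[Expr] = Some(GreaterThan(Attr("id"), Lit(200)))

  // Mirrors e.filter.map(_.transform { ... }) in the quoted diff.
  val patched = filter.map(_.transform {
    case a: Attr if childAttrLookup.contains(a) => childAttrLookup(a)
  })
  assert(patched.contains(GreaterThan(Attr("id#42"), Lit(200))))
}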




[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-22 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r360751512
 
 

 ##
 File path: sql/core/src/test/resources/sql-tests/results/group-by-filter.sql.out
 ##
 @@ -0,0 +1,569 @@
+-- Automatically generated by SQLQueryTestSuite
+-- Number of queries: 47
+
+
+-- !query 0
+CREATE OR REPLACE TEMPORARY VIEW testData AS SELECT * FROM VALUES
+(1, 1), (1, 2), (2, 1), (2, 2), (3, 1), (3, 2), (null, 1), (3, null), (null, null)
+AS testData(a, b)
+-- !query 0 schema
+struct<>
+-- !query 0 output
+
+
+
+-- !query 1
+CREATE OR REPLACE TEMPORARY VIEW EMP AS SELECT * FROM VALUES
+  (100, "emp 1", date "2005-01-01", 100.00D, 10),
+  (100, "emp 1", date "2005-01-01", 100.00D, 10),
+  (200, "emp 2", date "2003-01-01", 200.00D, 10),
+  (300, "emp 3", date "2002-01-01", 300.00D, 20),
+  (400, "emp 4", date "2005-01-01", 400.00D, 30),
+  (500, "emp 5", date "2001-01-01", 400.00D, NULL),
+  (600, "emp 6 - no dept", date "2001-01-01", 400.00D, 100),
+  (700, "emp 7", date "2010-01-01", 400.00D, 100),
+  (800, "emp 8", date "2016-01-01", 150.00D, 70)
+AS EMP(id, emp_name, hiredate, salary, dept_id)
+-- !query 1 schema
+struct<>
+-- !query 1 output
+
+
+
+-- !query 2
+CREATE OR REPLACE TEMPORARY VIEW DEPT AS SELECT * FROM VALUES
+  (10, "dept 1", "CA"),
+  (20, "dept 2", "NY"),
+  (30, "dept 3", "TX"),
+  (40, "dept 4 - unassigned", "OR"),
+  (50, "dept 5 - unassigned", "NJ"),
+  (70, "dept 7", "FL")
+AS DEPT(dept_id, dept_name, state)
+-- !query 2 schema
+struct<>
+-- !query 2 output
+
+
+
+-- !query 3
+SELECT a, COUNT(b) FILTER (WHERE a >= 2) FROM testData
+-- !query 3 schema
+struct<>
+-- !query 3 output
+org.apache.spark.sql.AnalysisException
+grouping expressions sequence is empty, and 'testdata.`a`' is not an aggregate function. Wrap '(count(testdata.`b`) AS `count(b)`)' in windowing function(s) or wrap 'testdata.`a`' in first() (or first_value) if you don't care which value you get.;
+
+
+-- !query 4
+SELECT COUNT(a) FILTER (WHERE a = 1), COUNT(b) FILTER (WHERE a > 1) FROM testData
+-- !query 4 schema
+struct
+-- !query 4 output
+2	4
+
+
+-- !query 5
+SELECT COUNT(id) FILTER (WHERE hiredate = date "2001-01-01") FROM emp
+-- !query 5 schema
+struct
+-- !query 5 output
+2
+
+
+-- !query 6
+SELECT COUNT(id) FILTER (WHERE hiredate = to_date('2001-01-01 00:00:00')) FROM emp
+-- !query 6 schema
+struct
+-- !query 6 output
+2
+
+
+-- !query 7
+SELECT COUNT(id) FILTER (WHERE hiredate = to_timestamp("2001-01-01 00:00:00")) FROM emp
+-- !query 7 schema
+struct
+-- !query 7 output
+2
+
+
+-- !query 8
+SELECT COUNT(id) FILTER (WHERE date_format(hiredate, "yyyy-MM-dd") = "2001-01-01") FROM emp
+-- !query 8 schema
+struct
+-- !query 8 output
+2
+
+
+-- !query 9
+SELECT a, COUNT(b) FILTER (WHERE a >= 2) FROM testData GROUP BY a
+-- !query 9 schema
+struct
+-- !query 9 output
+1	0
+2	2
+3	2
+NULL	0
+
+
+-- !query 10
+SELECT a, COUNT(b) FILTER (WHERE a != 2) FROM testData GROUP BY b
+-- !query 10 schema
+struct<>
+-- !query 10 output
+org.apache.spark.sql.AnalysisException
+expression 'testdata.`a`' is neither present in the group by, nor is it an aggregate function. Add to group by or wrap in first() (or first_value) if you don't care which value you get.;
+
+
+-- !query 11
+SELECT COUNT(a) FILTER (WHERE a >= 0), COUNT(b) FILTER (WHERE a >= 3) FROM testData GROUP BY a
+-- !query 11 schema
+struct
+-- !query 11 output
+0	0
+2	0
+2	0
+3	2
+
+
+-- !query 12
+SELECT dept_id, SUM(salary) FILTER (WHERE hiredate > date "2003-01-01") FROM emp GROUP BY dept_id
+-- !query 12 schema
+struct
+-- !query 12 output
+10	200.0
+100	400.0
+20	NULL
+30	400.0
+70	150.0
+NULL	NULL
+
+
+-- !query 13
+SELECT dept_id, SUM(salary) FILTER (WHERE hiredate > to_date("2003-01-01")) FROM emp GROUP BY dept_id
+-- !query 13 schema
+struct
+-- !query 13 output
+10	200.0
+100	400.0
+20	NULL
+30	400.0
+70	150.0
+NULL	NULL
+
+
+-- !query 14
+SELECT dept_id, SUM(salary) FILTER (WHERE hiredate > to_timestamp("2003-01-01 00:00:00")) FROM emp GROUP BY dept_id
+-- !query 14 schema
+struct
+-- !query 14 output
+10	200.0
+100	400.0
+20	NULL
+30	400.0
+70	150.0
+NULL	NULL
+
+
+-- !query 15
+SELECT dept_id, SUM(salary) FILTER (WHERE date_format(hiredate, "yyyy-MM-dd") > "2003-01-01") FROM emp GROUP BY dept_id
+-- !query 15 schema
+struct
+-- !query 15 output
+10	200.0
+100	400.0
+20	NULL
+30	400.0
+70	150.0
+NULL	NULL
+
+
+-- !query 16
+SELECT 'foo', COUNT(a) FILTER (WHERE b <= 2) FROM testData GROUP BY 1
+-- !query 16 schema
+struct
+-- !query 16 output
+foo	6
+
+
+-- !query 17
+SELECT 'foo', SUM(salary) FILTER (WHERE hiredate >= date "2003-01-01") FROM emp GROUP BY 1
+-- !query 17 schema
+struct
+-- !query 17 output
+foo	1350.0
+
+
+-- !query 18
+SELECT 'foo', SUM(salary) FILTER (WHERE

[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-22 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r360750781
 
 

 ##
 File path: sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
 ##
 @@ -2835,6 +2835,84 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession {
     checkAnswer(df, Row(1, 3, 4) :: Row(2, 3, 4) :: Row(3, 3, 4) :: Nil)
   }
 
+  test("Support filter clause for aggregate function with hash aggregate") {
+    Seq(("APPROX_COUNT_DISTINCT(a)", 3), ("COUNT(a)", 3), ("FIRST(a)", 1), ("LAST(a)", 3),
+      ("MAX(a)", 3), ("AVG(a)", 2.0), ("MIN(a)", 1), ("SUM(a)", 6), ("PERCENTILE(a, 1)", 3),
+      ("PERCENTILE_APPROX(a, 0.5, 100)", 2.0), ("COLLECT_LIST(a)", Seq(1, 2, 3)),
+      ("COLLECT_SET(a)", Seq(1, 2, 3))).foreach { funcToResult =>
+      val query = s"SELECT ${funcToResult._1} FILTER (WHERE b > 1) FROM testData2"
+      val df = sql(query)
+      val physical = df.queryExecution.sparkPlan
+      val aggregateExpressions = physical.collectFirst {
+        case agg: HashAggregateExec => agg.aggregateExpressions
+        case agg: ObjectHashAggregateExec => agg.aggregateExpressions
+      }
+      assert(aggregateExpressions.isDefined)
+      assert(aggregateExpressions.get.size == 1)
+      aggregateExpressions.get.foreach { expr =>
+        assert(expr.filter.isDefined)
+      }
+      checkAnswer(df, Row(funcToResult._2) :: Nil)
+    }
+  }
+
+  test("Support filter clause for aggregate function uses SortAggregateExec") {
+    withSQLConf(SQLConf.USE_OBJECT_HASH_AGG.key -> "false") {
+      Seq(("PERCENTILE(a, 1)", 3),
+        ("PERCENTILE_APPROX(a, 0.5, 100)", 2.0),
+        ("COLLECT_LIST(a)", Seq(1, 2, 3)),
+        ("COLLECT_SET(a)", Seq(1, 2, 3))).foreach { funcToResult =>
+        val query = s"SELECT ${funcToResult._1} FILTER (WHERE b > 1) FROM testData2"
+        val df = sql(query)
+        val physical = df.queryExecution.sparkPlan
+        val aggregateExpressions = physical.collectFirst {
+          case agg: SortAggregateExec => agg.aggregateExpressions
+        }
+        assert(aggregateExpressions.isDefined)
+        assert(aggregateExpressions.get.size == 1)
+        aggregateExpressions.get.foreach { expr =>
+          assert(expr.filter.isDefined)
+        }
+        checkAnswer(df, Row(funcToResult._2) :: Nil)
+      }
+    }
+  }
+
+  test("Support filter clause for multiple aggregate function") {
+    val query =
+      """
+        | SELECT
+        | COUNT(a), COUNT(a) FILTER (WHERE b >= 2),
+        | SUM(a), SUM(a) FILTER (WHERE b < 2),
+        | MAX(a), MAX(a) FILTER (WHERE b > 0),
+        | MIN(a), MIN(a) FILTER (WHERE b = 1),
+        | AVG(a), AVG(a) FILTER (WHERE b IN (1, 2)) FROM testData2
+      """.stripMargin
+    val df = sql(query)
+    val physical = df.queryExecution.sparkPlan
+    val aggregateExpressions = physical.collectFirst {
+      case agg: HashAggregateExec => agg.aggregateExpressions
+    }
+    assert(aggregateExpressions.isDefined)
+    assert(aggregateExpressions.get.size == 10)
+    checkAnswer(df, Row(6, 3, 12, 6, 3, 3, 1, 1, 2, 2) :: Nil)
+  }
+
+  test("Support filter clause for aggregate function with group") {
 
 Review comment:
   OK
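
For reference, the expected Row(6, 3, 12, 6, 3, 3, 1, 1, 2, 2) follows directly from testData2, which in Spark's shared test data (SQLTestData) holds the six rows (1,1), (1,2), (2,1), (2,2), (3,1), (3,2). A quick standalone way to see it, as a sketch assuming a local SparkSession outside the suite's harness and a Spark build with this feature:

import org.apache.spark.sql.{Row, SparkSession}

object MultiFilterCheck extends App {
  val spark = SparkSession.builder()
    .master("local[1]").appName("multi-filter-check").getOrCreate()
  import spark.implicits._
  // Same rows as SQLTestData.testData2 used by the suite.
  Seq((1, 1), (1, 2), (2, 1), (2, 2), (3, 1), (3, 2))
    .toDF("a", "b").createOrReplaceTempView("testData2")

  val row = spark.sql(
    """SELECT
      | COUNT(a), COUNT(a) FILTER (WHERE b >= 2),
      | SUM(a), SUM(a) FILTER (WHERE b < 2),
      | MAX(a), MAX(a) FILTER (WHERE b > 0),
      | MIN(a), MIN(a) FILTER (WHERE b = 1),
      | AVG(a), AVG(a) FILTER (WHERE b IN (1, 2)) FROM testData2
    """.stripMargin).head()

  // COUNT over all six rows is 6; three rows have b >= 2; SUM(a) is 12 and
  // the b < 2 rows contribute 1 + 2 + 3 = 6; the filtered MAX/MIN/AVG equal
  // the unfiltered values on this data.
  assert(row == Row(6L, 3L, 12L, 6L, 3, 3, 1, 1, 2.0, 2.0))
  spark.stop()
}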




[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-22 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r360750209
 
 

 ##
 File path: sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
 ##
 @@ -2835,6 +2835,84 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession {
     checkAnswer(df, Row(1, 3, 4) :: Row(2, 3, 4) :: Row(3, 3, 4) :: Nil)
   }
 
+  test("Support filter clause for aggregate function with hash aggregate") {
+    Seq(("APPROX_COUNT_DISTINCT(a)", 3), ("COUNT(a)", 3), ("FIRST(a)", 1), ("LAST(a)", 3),
+      ("MAX(a)", 3), ("AVG(a)", 2.0), ("MIN(a)", 1), ("SUM(a)", 6), ("PERCENTILE(a, 1)", 3),
+      ("PERCENTILE_APPROX(a, 0.5, 100)", 2.0), ("COLLECT_LIST(a)", Seq(1, 2, 3)),
+      ("COLLECT_SET(a)", Seq(1, 2, 3))).foreach { funcToResult =>
+      val query = s"SELECT ${funcToResult._1} FILTER (WHERE b > 1) FROM testData2"
+      val df = sql(query)
+      val physical = df.queryExecution.sparkPlan
+      val aggregateExpressions = physical.collectFirst {
+        case agg: HashAggregateExec => agg.aggregateExpressions
+        case agg: ObjectHashAggregateExec => agg.aggregateExpressions
+      }
+      assert(aggregateExpressions.isDefined)
+      assert(aggregateExpressions.get.size == 1)
+      aggregateExpressions.get.foreach { expr =>
+        assert(expr.filter.isDefined)
+      }
+      checkAnswer(df, Row(funcToResult._2) :: Nil)
+    }
+  }
+
+  test("Support filter clause for aggregate function uses SortAggregateExec") {
+    withSQLConf(SQLConf.USE_OBJECT_HASH_AGG.key -> "false") {
+      Seq(("PERCENTILE(a, 1)", 3),
 
 Review comment:
   I will keep one function.




[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-22 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r360749857
 
 

 ##
 File path: sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
 ##
 @@ -2835,6 +2835,84 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession {
     checkAnswer(df, Row(1, 3, 4) :: Row(2, 3, 4) :: Row(3, 3, 4) :: Nil)
   }
 
+  test("Support filter clause for aggregate function with hash aggregate") {
+    Seq(("APPROX_COUNT_DISTINCT(a)", 3), ("COUNT(a)", 3), ("FIRST(a)", 1), ("LAST(a)", 3),
 
 Review comment:
   OK. I will only keep one function in each case.





[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-22 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r360748104
 
 

 ##
 File path: sql/core/src/test/resources/sql-tests/inputs/group-by-filter.sql
 ##

[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-22 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r360747964
 
 

 ##
 File path: sql/core/src/test/resources/sql-tests/inputs/group-by-filter.sql
 ##

[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-22 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r360747950
 
 

 ##
 File path: sql/core/src/test/resources/sql-tests/inputs/group-by-filter.sql
 ##

[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-22 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r360629642
 
 

 ##
 File path: sql/core/src/test/resources/sql-tests/inputs/group-by-filter.sql
 ##
 @@ -0,0 +1,156 @@
+-- Test filter clause for aggregate expression.
+
+-- Test data.
+CREATE OR REPLACE TEMPORARY VIEW testData AS SELECT * FROM VALUES
+(1, 1), (1, 2), (2, 1), (2, 2), (3, 1), (3, 2), (null, 1), (3, null), (null, null)
+AS testData(a, b);
+
+CREATE OR REPLACE TEMPORARY VIEW EMP AS SELECT * FROM VALUES
+  (100, "emp 1", date "2005-01-01", 100.00D, 10),
+  (100, "emp 1", date "2005-01-01", 100.00D, 10),
+  (200, "emp 2", date "2003-01-01", 200.00D, 10),
+  (300, "emp 3", date "2002-01-01", 300.00D, 20),
+  (400, "emp 4", date "2005-01-01", 400.00D, 30),
+  (500, "emp 5", date "2001-01-01", 400.00D, NULL),
+  (600, "emp 6 - no dept", date "2001-01-01", 400.00D, 100),
+  (700, "emp 7", date "2010-01-01", 400.00D, 100),
+  (800, "emp 8", date "2016-01-01", 150.00D, 70)
+AS EMP(id, emp_name, hiredate, salary, dept_id);
+
+CREATE OR REPLACE TEMPORARY VIEW DEPT AS SELECT * FROM VALUES
+  (10, "dept 1", "CA"),
+  (20, "dept 2", "NY"),
+  (30, "dept 3", "TX"),
+  (40, "dept 4 - unassigned", "OR"),
+  (50, "dept 5 - unassigned", "NJ"),
+  (70, "dept 7", "FL")
+AS DEPT(dept_id, dept_name, state);
+
+-- Aggregate with filter and empty GroupBy expressions.
+SELECT a, COUNT(b) FILTER (WHERE a >= 2) FROM testData;
+SELECT COUNT(a) FILTER (WHERE a = 1), COUNT(b) FILTER (WHERE a > 1) FROM testData;
+SELECT COUNT(id) FILTER (WHERE hiredate = date "2001-01-01") FROM emp;
+SELECT COUNT(id) FILTER (WHERE hiredate = to_date('2001-01-01 00:00:00')) FROM emp;
+SELECT COUNT(id) FILTER (WHERE hiredate = to_timestamp("2001-01-01 00:00:00")) FROM emp;
+SELECT COUNT(id) FILTER (WHERE date_format(hiredate, "yyyy-MM-dd") = "2001-01-01") FROM emp;
+-- [SPARK-30276] Support Filter expression allows simultaneous use of DISTINCT
+-- SELECT COUNT(DISTINCT id) FILTER (WHERE date_format(hiredate, "yyyy-MM-dd HH:mm:ss") = "2001-01-01 00:00:00") FROM emp;
+
+-- Aggregate with filter and non-empty GroupBy expressions.
+SELECT a, COUNT(b) FILTER (WHERE a >= 2) FROM testData GROUP BY a;
+SELECT a, COUNT(b) FILTER (WHERE a != 2) FROM testData GROUP BY b;
+SELECT COUNT(a) FILTER (WHERE a >= 0), COUNT(b) FILTER (WHERE a >= 3) FROM testData GROUP BY a;
+SELECT dept_id, SUM(salary) FILTER (WHERE hiredate > date "2003-01-01") FROM emp GROUP BY dept_id;
+SELECT dept_id, SUM(salary) FILTER (WHERE hiredate > to_date("2003-01-01")) FROM emp GROUP BY dept_id;
+SELECT dept_id, SUM(salary) FILTER (WHERE hiredate > to_timestamp("2003-01-01 00:00:00")) FROM emp GROUP BY dept_id;
+SELECT dept_id, SUM(salary) FILTER (WHERE date_format(hiredate, "yyyy-MM-dd") > "2003-01-01") FROM emp GROUP BY dept_id;
+-- [SPARK-30276] Support Filter expression allows simultaneous use of DISTINCT
+-- SELECT dept_id, SUM(DISTINCT salary) FILTER (WHERE date_format(hiredate, "yyyy-MM-dd HH:mm:ss") > "2001-01-01 00:00:00") FROM emp GROUP BY dept_id;
+
+-- Aggregate with filter and grouped by literals.
+SELECT 'foo', COUNT(a) FILTER (WHERE b <= 2) FROM testData GROUP BY 1;
+SELECT 'foo', SUM(salary) FILTER (WHERE hiredate >= date "2003-01-01") FROM emp GROUP BY 1;
+SELECT 'foo', SUM(salary) FILTER (WHERE hiredate >= to_date("2003-01-01")) FROM emp GROUP BY 1;
+SELECT 'foo', SUM(salary) FILTER (WHERE hiredate >= to_timestamp("2003-01-01")) FROM emp GROUP BY 1;
+
+-- Aggregate with filter, more than one aggregate function goes with distinct.
+select dept_id, count(distinct emp_name), count(distinct hiredate), sum(salary), sum(salary) filter (where id > 200) from emp group by dept_id;
+select dept_id, count(distinct emp_name), count(distinct hiredate), sum(salary) filter (where salary < 400.00D), sum(salary) filter (where id > 200) from emp group by dept_id;
+-- [SPARK-30276] Support Filter expression allows simultaneous use of DISTINCT
+-- select dept_id, count(distinct emp_name) filter (where id > 200), count(distinct hiredate), sum(salary) from emp group by dept_id;
+-- select dept_id, count(distinct emp_name) filter (where id > 200), count(distinct hiredate) filter (where hiredate > date "2003-01-01"), sum(salary) from emp group by dept_id;
+-- select dept_id, count(distinct emp_name) filter (where id > 200), count(distinct hiredate) filter (where hiredate > date "2003-01-01"), sum(salary) filter (where salary < 400.00D) from emp group by dept_id;
+-- select dept_id, count(distinct emp_name) filter (where id > 200), count(distinct hiredate) filter (where hiredate > date "2003-01-01"), sum(salary) filter (where salary < 400.00D), sum(salary) filter (where id > 200) from emp group by dept_id;
+-- select dept_id, count(distinct emp_name) filter (where id > 200), count(distinct emp_name), sum(salary) from emp group by dept_id;
+-- select dept_id, 
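
As context for the quoted tests: a FILTER clause only feeds the rows satisfying its predicate into the aggregate, which for a single aggregate is equivalent to a CASE rewrite. A minimal sketch against the testData view quoted above (an illustration, not part of the patch):

```
-- Rows with a >= 2 are (2,1), (2,2), (3,1), (3,2) and (3,null); COUNT skips
-- the null b, so both of these queries return 4:
SELECT COUNT(b) FILTER (WHERE a >= 2) FROM testData;
SELECT COUNT(CASE WHEN a >= 2 THEN b END) FROM testData;
```

Rows for which the predicate is false or NULL never reach the aggregate at all.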


[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-22 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r360747017
 
 

 ##
 File path: sql/core/src/test/resources/sql-tests/inputs/group-by-filter.sql
 ##
 @@ -0,0 +1,156 @@
+-- Aggregate with filter, more than one aggregate function goes with distinct.
+select dept_id, count(distinct emp_name), count(distinct hiredate), sum(salary), sum(salary) filter (where id > 200) from emp group by dept_id;
+select dept_id, count(distinct emp_name), count(distinct hiredate), sum(salary) filter (where salary < 400.00D), sum(salary) filter (where id > 200) from emp group by dept_id;
 
 Review comment:
   OK





[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-22 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r360746036
 
 

 ##
 File path: docs/sql-keywords.md
 ##
 @@ -117,6 +117,7 @@ Below is a list of all the keywords in Spark SQL.
   FALSE   | reserved     | non-reserved | reserved
   FETCH   | reserved     | non-reserved | reserved
   FIELDS  | non-reserved | non-reserved | non-reserved
+  FILTER  | reserved     | non-reserved | reserved
 
 Review comment:
   There is a function named FILTER, so I removed it from `TableIdentifierParserSuite`.





[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-21 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r360681453
 
 

 ##
 File path: sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
 ##
 @@ -2835,6 +2835,84 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession {
     checkAnswer(df, Row(1, 3, 4) :: Row(2, 3, 4) :: Row(3, 3, 4) :: Nil)
   }
 
+  test("Support filter clause for aggregate function with hash aggregate") {
+    Seq(("APPROX_COUNT_DISTINCT(a)", 3), ("COUNT(a)", 3), ("FIRST(a)", 1), ("LAST(a)", 3),
+      ("MAX(a)", 3), ("AVG(a)", 2.0), ("MIN(a)", 1), ("SUM(a)", 6), ("PERCENTILE(a, 1)", 3),
+      ("PERCENTILE_APPROX(a, 0.5, 100)", 2.0), ("COLLECT_LIST(a)", Seq(1, 2, 3)),
+      ("COLLECT_SET(a)", Seq(1, 2, 3))).foreach { funcToResult =>
+      val query = s"SELECT ${funcToResult._1} FILTER (WHERE b > 1) FROM testData2"
+      val df = sql(query)
+      val physical = df.queryExecution.sparkPlan
+      val aggregateExpressions = physical.collectFirst {
+        case agg: HashAggregateExec => agg.aggregateExpressions
+        case agg: ObjectHashAggregateExec => agg.aggregateExpressions
+      }
+      assert(aggregateExpressions.isDefined)
+      assert(aggregateExpressions.get.size == 1)
+      aggregateExpressions.get.foreach { expr =>
+        assert(expr.filter.isDefined)
+      }
+      checkAnswer(df, Row(funcToResult._2) :: Nil)
+    }
+  }
+
+  test("Support filter clause for aggregate function uses SortAggregateExec") {
+    withSQLConf(SQLConf.USE_OBJECT_HASH_AGG.key -> "false") {
+      Seq(("PERCENTILE(a, 1)", 3),
+        ("PERCENTILE_APPROX(a, 0.5, 100)", 2.0),
+        ("COLLECT_LIST(a)", Seq(1, 2, 3)),
+        ("COLLECT_SET(a)", Seq(1, 2, 3))).foreach { funcToResult =>
+        val query = s"SELECT ${funcToResult._1} FILTER (WHERE b > 1) FROM testData2"
+        val df = sql(query)
+        val physical = df.queryExecution.sparkPlan
+        val aggregateExpressions = physical.collectFirst {
+          case agg: SortAggregateExec => agg.aggregateExpressions
+        }
+        assert(aggregateExpressions.isDefined)
+        assert(aggregateExpressions.get.size == 1)
+        aggregateExpressions.get.foreach { expr =>
+          assert(expr.filter.isDefined)
+        }
+        checkAnswer(df, Row(funcToResult._2) :: Nil)
+      }
+    }
+  }
+
+  test("Support filter clause for multiple aggregate function") {
 
 Review comment:
   I will remove this
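
For reference, the expected values in the Seq above are consistent with Spark's usual `testData2` fixture, assuming it holds the standard six rows (a, b) = (1,1), (1,2), (2,1), (2,2), (3,1), (3,2); only (1,2), (2,2) and (3,2) pass the filter. A quick sanity check (a sketch, not part of the patch):

```
-- Rows passing b > 1 have a = 1, 2, 3, hence:
SELECT MAX(a) FILTER (WHERE b > 1) FROM testData2;  -- 3
SELECT AVG(a) FILTER (WHERE b > 1) FROM testData2;  -- 2.0
SELECT SUM(a) FILTER (WHERE b > 1) FROM testData2;  -- 6
```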






[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-20 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r360309111
 
 

 ##
 File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RewriteDistinctAggregates.scala
 ##
 @@ -183,9 +226,10 @@ object RewriteDistinctAggregates extends Rule[LogicalPlan] {
       // only expand unfoldable children
       val regularAggExprs = aggExpressions
         .filter(e => !e.isDistinct && e.children.exists(!_.foldable))
-      val regularAggChildren = regularAggExprs
+      val regularAggFunChildren = regularAggExprs
         .flatMap(_.aggregateFunction.children.filter(!_.foldable))
-        .distinct
+      val regularAggFilterChildren = regularAggExprs.flatMap(_.filterAttributes)
 
 Review comment:
   OK





[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-20 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r360303985
 
 

 ##
 File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RewriteDistinctAggregates.scala
 ##
 @@ -194,7 +238,11 @@ object RewriteDistinctAggregates extends Rule[LogicalPlan] {
       val regularAggOperatorMap = regularAggExprs.map { e =>
         // Perform the actual aggregation in the initial aggregate.
         val af = patchAggregateFunctionChildren(e.aggregateFunction)(regularAggChildAttrLookup.get)
-        val operator = Alias(e.copy(aggregateFunction = af), e.sql)()
+        val filterOpt = e.filter.map { fe =>
+          val newChildren = fe.children.map(c => regularAggChildAttrLookup.getOrElse(c, c))
+          fe.withNewChildren(newChildren)
 
 Review comment:
   OK. Thanks for the reminder; I learned something new.
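
A concrete case for why the filter expression has to be rebound here as well (an illustration, not from the patch): in

```
SELECT key, COUNT(DISTINCT cat1), SUM(value) FILTER (WHERE id > 1)
FROM data GROUP BY key;
```

the rewrite routes `value` through an Expand, so the predicate `id > 1` must likewise be rewritten against the attributes the expanded child actually produces; otherwise the first aggregate would reference a column that no longer exists below it.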





[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-17 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r359139107
 
 

 ##
 File path: docs/sql-keywords.md
 ##
 @@ -117,6 +117,7 @@ Below is a list of all the keywords in Spark SQL.
   FALSE   | reserved     | non-reserved | reserved
   FETCH   | reserved     | non-reserved | reserved
   FIELDS  | non-reserved | non-reserved | non-reserved
+  FILTER  | non-reserved | non-reserved | reserved
 
 Review comment:
   I put it in `functionName`, but the issue still occurs.





[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-17 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r359139107
 
 

 ##
 File path: docs/sql-keywords.md
 ##
 @@ -117,6 +117,7 @@ Below is a list of all the keywords in Spark SQL.
   FALSE   | reserved     | non-reserved | reserved
   FETCH   | reserved     | non-reserved | reserved
   FIELDS  | non-reserved | non-reserved | non-reserved
+  FILTER  | non-reserved | non-reserved | reserved
 
 Review comment:
   OK. I learned it.





[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-17 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r358719472
 
 

 ##
 File path: docs/sql-keywords.md
 ##
 @@ -117,6 +117,7 @@ Below is a list of all the keywords in Spark SQL.
   FALSE   | reserved     | non-reserved | reserved
   FETCH   | reserved     | non-reserved | reserved
   FIELDS  | non-reserved | non-reserved | non-reserved
+  FILTER  | non-reserved | non-reserved | reserved
 
 Review comment:
   @cloud-fan 
   There exists an issue, as shown below:
   
https://github.com/apache/spark/blob/bf7215c510e0eb2afa0a8f1337c535b4091f08f9/sql/core/src/test/resources/sql-tests/results/ansi/higher-order-functions.sql.out#L85
   ```
   no viable alternative at input 'filter'(line 1, pos 7)
   == SQL ==
   select filter(ys, y -> y > 30) as v from nested
   ---^^^
   ```
   filter is a function.
   It seems we can't put the keyword in `ansiNonreserved`
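
Both readings have to coexist in a single grammar, which is why reserving the word breaks parsing. A side-by-side sketch (the first query is quoted from the linked test output; the second is an illustration against the PR's emp view):

```
-- filter as a higher-order function over an array column:
select filter(ys, y -> y > 30) as v from nested;
-- FILTER as the ANSI aggregate modifier:
select count(id) filter (where id > 100) from emp;
```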








[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-17 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r358695284
 
 

 ##
 File path: docs/sql-keywords.md
 ##
 @@ -117,6 +117,7 @@ Below is a list of all the keywords in Spark SQL.
   FALSE   | reserved     | non-reserved | reserved
   FETCH   | reserved     | non-reserved | reserved
   FIELDS  | non-reserved | non-reserved | non-reserved
+  FILTER  | non-reserved | non-reserved | reserved
 
 Review comment:
   OK








[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-17 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r358695732
 
 

 ##
 File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RewriteDistinctAggregates.scala
 ##
 @@ -183,9 +183,12 @@ object RewriteDistinctAggregates extends Rule[LogicalPlan] {
       // only expand unfoldable children
       val regularAggExprs = aggExpressions
         .filter(e => !e.isDistinct && e.children.exists(!_.foldable))
-      val regularAggChildren = regularAggExprs
+      val regularAggFunChildren = regularAggExprs
         .flatMap(_.aggregateFunction.children.filter(!_.foldable))
-        .distinct
+      val regularAggFilterChildren = regularAggExprs
+        .flatMap(ae => ae.filter.map(_.children.filter(!_.foldable)))
+        .flatten
+      val regularAggChildren = (regularAggFunChildren ++ regularAggFilterChildren).distinct
 
 Review comment:
   OK. I forgot it when I reverted it.








[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-12 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r357019501
 
 

 ##
 File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RewriteDistinctAggregates.scala
 ##
 @@ -75,6 +75,99 @@ import org.apache.spark.sql.types.IntegerType
  *   LocalTableScan [...]
  * }}}
  *
+ * Second example: aggregate function without distinct and with filter clauses (in sql):
+ * {{{
+ *   SELECT
+ *     COUNT(DISTINCT cat1) as cat1_cnt,
+ *     COUNT(DISTINCT cat2) as cat2_cnt,
+ *     SUM(value) FILTER (
+ *       WHERE
+ *         id > 1
+ *     ) AS total
+ *   FROM
+ *     data
+ *   GROUP BY
+ *     key
+ * }}}
+ *
+ * This translates to the following (pseudo) logical plan:
+ * {{{
+ * Aggregate(
+ *    key = ['key]
+ *    functions = [COUNT(DISTINCT 'cat1),
+ *                 COUNT(DISTINCT 'cat2),
+ *                 sum('value) with FILTER('id > 1)]
+ *    output = ['key, 'cat1_cnt, 'cat2_cnt, 'total])
+ *   LocalTableScan [...]
+ * }}}
+ *
+ * This rule rewrites this logical plan to the following (pseudo) logical plan:
+ * {{{
+ * Aggregate(
+ *    key = ['key]
+ *    functions = [count(if (('gid = 1)) 'cat1 else null),
+ *                 count(if (('gid = 2)) 'cat2 else null),
+ *                 first(if (('gid = 0)) 'total else null) ignore nulls]
+ *    output = ['key, 'cat1_cnt, 'cat2_cnt, 'total])
+ *   Aggregate(
+ *      key = ['key, 'cat1, 'cat2, 'gid]
+ *      functions = [sum('value) with FILTER('id > 1)]
+ *      output = ['key, 'cat1, 'cat2, 'gid, 'total])
+ *     Expand(
+ *        projections = [('key, null, null, 0, cast('value as bigint), 'id),
+ *                       ('key, 'cat1, null, 1, null, null),
+ *                       ('key, null, 'cat2, 2, null, null)]
+ *        output = ['key, 'cat1, 'cat2, 'gid, 'value, 'id])
+ *       LocalTableScan [...]
+ * }}}
+ *
+ * Third example: aggregate function with distinct and filter clauses (in sql):
+ * {{{
+ *   SELECT
+ *     COUNT(DISTINCT cat1) FILTER (
+ *       WHERE
+ *         id > 1
+ *     ) as cat1_cnt,
+ *     COUNT(DISTINCT cat2) as cat2_cnt,
+ *     SUM(value) as total
+ *   FROM
+ *     data
+ *   GROUP BY
+ *     key
+ * }}}
+ *
+ * This translates to the following (pseudo) logical plan:
+ * {{{
+ * Aggregate(
+ *    key = ['key]
+ *    functions = [COUNT(DISTINCT 'cat1) with FILTER('id > 1),
 
 Review comment:
   Thanks, wenchen! I will check this situation.
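
To make the gid tagging above concrete, here is a sketch of what the Expand in the second example emits (derived from the quoted projections, an illustration rather than the patch itself): each input row (key, cat1, cat2, value, id) is replicated once per projection, and the filtered sum only ever sees the gid = 0 copies, where value and id survive:

```
-- (key, cat1, cat2, gid, value, id)
--  k,   null, null, 0,   v,     i     -- feeds sum('value) with FILTER('id > 1)
--  k,   c1,   null, 1,   null,  null  -- feeds count(if (gid = 1) cat1)
--  k,   null, c2,   2,   null,  null  -- feeds count(if (gid = 2) cat2)
```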





[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-12 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r357019501
 
 

 ##
 File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RewriteDistinctAggregates.scala
 ##
 @@ -75,6 +75,99 @@ import org.apache.spark.sql.types.IntegerType
  *   LocalTableScan [...]
  * }}}
  *
+ * Second example: aggregate function without distinct and with filter clauses 
(in sql):
+ * {{{
+ *   SELECT
+ * COUNT(DISTINCT cat1) as cat1_cnt,
+ * COUNT(DISTINCT cat2) as cat2_cnt,
+ * SUM(value) FILTER (
+ *   WHERE
+ * id > 1
+ * ) AS total
+ *  FROM
+ *data
+ *  GROUP BY
+ *key
+ * }}}
+ *
+ * This translates to the following (pseudo) logical plan:
+ * {{{
+ * Aggregate(
+ *key = ['key]
+ *functions = [COUNT(DISTINCT 'cat1),
+ * COUNT(DISTINCT 'cat2),
+ * sum('value) with FILTER('id > 1)]
+ *output = ['key, 'cat1_cnt, 'cat2_cnt, 'total])
+ *   LocalTableScan [...]
+ * }}}
+ *
+ * This rule rewrites this logical plan to the following (pseudo) logical plan:
+ * {{{
+ * Aggregate(
+ *key = ['key]
+ *functions = [count(if (('gid = 1)) 'cat1 else null),
+ * count(if (('gid = 2)) 'cat2 else null),
+ * first(if (('gid = 0)) 'total else null) ignore nulls]
+ *output = ['key, 'cat1_cnt, 'cat2_cnt, 'total])
+ *   Aggregate(
+ *  key = ['key, 'cat1, 'cat2, 'gid]
+ *  functions = [sum('value) with FILTER('id > 1)]
+ *  output = ['key, 'cat1, 'cat2, 'gid, 'total])
+ * Expand(
+ *projections = [('key, null, null, 0, cast('value as bigint), 'id),
+ *   ('key, 'cat1, null, 1, null, null),
+ *   ('key, null, 'cat2, 2, null, null)]
+ *output = ['key, 'cat1, 'cat2, 'gid, 'value, 'id])
+ *   LocalTableScan [...]
+ * }}}
+ *
+ * Third example: aggregate function with distinct and filter clauses (in sql):
+ * {{{
+ *   SELECT
+ * COUNT(DISTINCT cat1) FILTER (
+ *   WHERE
+ * id > 1
+ * ) as cat1_cnt,
+ * COUNT(DISTINCT cat2) as cat2_cnt,
+ * SUM(value) as total
+ *   FROM
+ * data
+ *   GROUP BY
+ * key
+ * }}}
+ *
+ * This translates to the following (pseudo) logical plan:
+ * {{{
+ * Aggregate(
+ *key = ['key]
+ *functions = [COUNT(DISTINCT 'cat1) with FILTER('id > 1),
 
 Review comment:
   I will check this situation





[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-12 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r357012871
 
 

 ##
 File path: sql/core/src/test/resources/sql-tests/inputs/group-by-filter.sql
 ##
 @@ -0,0 +1,150 @@
+-- Test filter clause for aggregate expression.
+
+-- Test data.
+CREATE OR REPLACE TEMPORARY VIEW testData AS SELECT * FROM VALUES
+(1, 1), (1, 2), (2, 1), (2, 2), (3, 1), (3, 2), (null, 1), (3, null), (null, 
null)
 
 Review comment:
   Do you mean I should insert some null values to ensure that each column has a null?





[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-11 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r356986082
 
 

 ##
 File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RewriteDistinctAggregates.scala
 ##
 @@ -151,7 +244,7 @@ object RewriteDistinctAggregates extends Rule[LogicalPlan] 
{
   }
 
   // Setup unique distinct aggregate children.
-  val distinctAggChildren = distinctAggGroups.keySet.flatten.toSeq.distinct
+  val distinctAggChildren = distinctAggGroups.keySet.flatten.toSeq
 
 Review comment:
   Sorry, I forgot to restore it. Thanks!





[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-11 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r356512007
 
 

 ##
 File path: 
sql/core/src/test/resources/sql-tests/inputs/postgreSQL/aggregates_part3.sql
 ##
 @@ -230,16 +230,16 @@ select max(min(unique1)) from tenk1;
 -- [SPARK-27986] Support Aggregate Expressions with filter
 -- FILTER tests
 
--- select min(unique1) filter (where unique1 > 100) from tenk1;
+select min(unique1) filter (where unique1 > 100) from tenk1;
 
--- select sum(1/ten) filter (where ten > 0) from tenk1;
+select sum(1/ten) filter (where ten > 0) from tenk1;
 
 -- select ten, sum(distinct four) filter (where four::text ~ '123') from onek a
 -- group by ten;
 
--- select ten, sum(distinct four) filter (where four > 10) from onek a
--- group by ten
--- having exists (select 1 from onek b where sum(distinct a.four) = b.four);
+select ten, sum(distinct four) filter (where four > 10) from onek a
+group by ten
+having exists (select 1 from onek b where sum(distinct a.four) = b.four);
 
 Review comment:
   I created two JIRA issues:
   https://issues.apache.org/jira/browse/SPARK-30219
   https://issues.apache.org/jira/browse/SPARK-30220





[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-11 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r356465432
 
 

 ##
 File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RewriteDistinctAggregates.scala
 ##
 @@ -75,6 +75,83 @@ import org.apache.spark.sql.types.IntegerType
  *   LocalTableScan [...]
  * }}}
  *
+ * Second example: aggregate function without distinct and with filter clauses 
(in sql):
+ * {{{
+ *   select count(distinct cat1) as cat1_cnt, count(distinct cat2) as cat2_cnt,
+ * sum(value) filter (where id > 1) as total
+ *   from data group by key
+ * }}}
+ *
+ * This translates to the following (pseudo) logical plan:
+ * {{{
+ * Aggregate(
+ *key = ['key]
+ *functions = [COUNT(DISTINCT 'cat1),
+ * COUNT(DISTINCT 'cat2),
+ * sum('value) with FILTER('id > 1)]
+ *output = ['key, 'cat1_cnt, 'cat2_cnt, 'total])
+ *   LocalTableScan [...]
+ * }}}
+ *
+ * This rule rewrites this logical plan to the following (pseudo) logical plan:
+ * {{{
+ * Aggregate(
+ *key = ['key]
+ *functions = [count(if (('gid = 1)) 'cat1 else null),
+ * count(if (('gid = 2)) 'cat2 else null),
+ * first(if (('gid = 0)) 'total else null) ignore nulls]
+ *output = ['key, 'cat1_cnt, 'cat2_cnt, 'total])
+ *   Aggregate(
+ *  key = ['key, 'cat1, 'cat2, 'gid]
+ *  functions = [sum('value) with FILTER('id > 1)]
+ *  output = ['key, 'cat1, 'cat2, 'gid, 'total])
+ * Expand(
+ *projections = [('key, null, null, 0, cast('value as bigint), 'id),
+ *   ('key, 'cat1, null, 1, null, null),
+ *   ('key, null, 'cat2, 2, null, null)]
+ *output = ['key, 'cat1, 'cat2, 'gid, 'value, 'id])
+ *   LocalTableScan [...]
+ * }}}
+ *
+ * Third example: aggregate function with distinct and filter clauses (in sql):
+ * {{{
+ *   select count(distinct cat1) filter (where id > 1) as cat1_cnt,
+ * count(distinct cat2) as cat2_cnt, sum(value) as total
+ *   from data group by key
+ * }}}
+ *
+ * This translates to the following (pseudo) logical plan:
+ * {{{
+ * Aggregate(
+ *key = ['key]
+ *functions = [COUNT(DISTINCT 'cat1) with FILTER('id > 1),
+ * COUNT(DISTINCT 'cat2),
+ * sum('value)]
+ *output = ['key, 'cat1_cnt, 'cat2_cnt, 'total])
+ *   LocalTableScan [...]
+ * }}}
+ *
+ * This rule rewrites this logical plan to the following (pseudo) logical plan:
+ *
+ * {{{
+ * Aggregate(
+ *key = ['key]
+ *functions = [count(if (('gid = 1)) 'cat1 else null) with FILTER('id > 1),
+ * count(if (('gid = 2)) 'cat2 else null),
+ * first(if (('gid = 0)) 'total else null) ignore nulls]
+ *output = ['key, 'cat1_cnt, 'cat2_cnt, 'total])
+ *   Aggregate(
+ *  key = ['key, 'cat1, 'id, 'cat2, 'gid]
+ *  functions = [sum('value)]
+ *  output = ['key, 'cat1, 'id, 'cat2, 'gid, 'total])
+ * Expand(
+ *projections = [('key, null, null, null, 0, cast('value as bigint)),
+ *   ('key, 'cat1, 'id, null, 1, null),
 
 Review comment:
   Good idea. I changed it.





[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-11 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r356474465
 
 

 ##
 File path: sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
 ##
 @@ -2835,6 +2835,84 @@ class SQLQuerySuite extends QueryTest with 
SharedSparkSession {
 checkAnswer(df, Row(1, 3, 4) :: Row(2, 3, 4) :: Row(3, 3, 4) :: Nil)
   }
 
+  test("Support filter clause for aggregate function with hash") {
+Seq(("APPROX_COUNT_DISTINCT(a)", 3), ("COUNT(a)", 3), ("FIRST(a)", 1), 
("LAST(a)", 3),
+  ("MAX(a)", 3), ("AVG(a)", 2.0), ("MIN(a)", 1), ("SUM(a)", 6), 
("PERCENTILE(a, 1)", 3),
+("PERCENTILE_APPROX(a, 0.5, 100)", 2.0), ("COLLECT_LIST(a)", Seq(1, 2, 
3)),
+("COLLECT_SET(a)", Seq(1, 2, 3))).foreach { funcToResult =>
+  val query = s"SELECT ${funcToResult._1} FILTER (WHERE b > 1) FROM 
testData2"
+  val df = sql(query)
+  val physical = df.queryExecution.sparkPlan
+  val aggregateExpressions = physical.collectFirst {
+case agg: HashAggregateExec => agg.aggregateExpressions
+case agg: ObjectHashAggregateExec => agg.aggregateExpressions
+  }
+  assert(aggregateExpressions.isDefined)
+  assert(aggregateExpressions.get.size == 1)
+  aggregateExpressions.get.foreach { expr =>
+assert(expr.filter.isDefined)
+  }
+  checkAnswer(df, Row(funcToResult._2) :: Nil)
+}
+  }
+
+  test("Support filter clause for aggregate function uses SortAggregateExec") {
+withSQLConf(SQLConf.USE_OBJECT_HASH_AGG.key -> "false") {
+  Seq(("PERCENTILE(a, 1)", 3),
+("PERCENTILE_APPROX(a, 0.5, 100)", 2.0),
+("COLLECT_LIST(a)", Seq(1, 2, 3)),
+("COLLECT_SET(a)", Seq(1, 2, 3))).foreach { funcToResult =>
+  val query = s"SELECT ${funcToResult._1} FILTER (WHERE b > 1) FROM 
testData2"
 
 Review comment:
   I removed the indentation.





[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-11 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r356468666
 
 

 ##
 File path: sql/core/src/test/resources/sql-tests/inputs/group-by-filter.sql
 ##
 @@ -0,0 +1,150 @@
+-- Test filter clause for aggregate expression.
+
+-- Test data.
+CREATE OR REPLACE TEMPORARY VIEW testData AS SELECT * FROM VALUES
+(1, 1), (1, 2), (2, 1), (2, 2), (3, 1), (3, 2), (null, 1), (3, null), (null, 
null)
 
 Review comment:
   I think it's OK. Do you have another idea?





[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-11 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r356467533
 
 

 ##
 File path: 
sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala
 ##
 @@ -169,11 +169,27 @@ class AnalysisErrorSuite extends AnalysisTest {
 CatalystSqlParser.parsePlan("SELECT hex(DISTINCT a) FROM TaBlE"),
 "DISTINCT specified, but hex is not an aggregate function" :: Nil)
 
+  errorTest(
+"non aggregate function with filter predicate",
+CatalystSqlParser.parsePlan("SELECT hex(a) filter (where c = 1) FROM 
TaBlE2"),
 
 Review comment:
   OK





[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-11 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r356466545
 
 

 ##
 File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RewriteDistinctAggregates.scala
 ##
 @@ -28,13 +28,13 @@ import org.apache.spark.sql.types.IntegerType
  * aggregation in which the regular aggregation expressions and every distinct 
clause is aggregated
  * in a separate group. The results are then combined in a second aggregate.
  *
- * For example (in scala):
+ * First example: queried without filter clauses (in scala):
 
 Review comment:
   query








[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-11 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r356465331
 
 

 ##
 File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RewriteDistinctAggregates.scala
 ##
 @@ -75,6 +75,83 @@ import org.apache.spark.sql.types.IntegerType
  *   LocalTableScan [...]
  * }}}
  *
+ * Second example: aggregate function without distinct and with filter clauses 
(in sql):
+ * {{{
+ *   select count(distinct cat1) as cat1_cnt, count(distinct cat2) as cat2_cnt,
+ * sum(value) filter (where id > 1) as total
+ *   from data group by key
 
 Review comment:
   OK





[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-10 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r356376953
 
 

 ##
 File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
 ##
 @@ -1636,26 +1636,33 @@ class Analyzer(
 s"its class is ${other.getClass.getCanonicalName}, which 
is not a generator.")
   }
 }
-  case u @ UnresolvedFunction(funcId, children, isDistinct) =>
+  case u @ UnresolvedFunction(funcId, arguments, isDistinct, filter) =>
 withPosition(u) {
-  v1SessionCatalog.lookupFunction(funcId, children) match {
+  v1SessionCatalog.lookupFunction(funcId, arguments) match {
 // AggregateWindowFunctions are AggregateFunctions that can 
only be evaluated within
 // the context of a Window clause. They do not need to be 
wrapped in an
 // AggregateExpression.
 case wf: AggregateWindowFunction =>
-  if (isDistinct) {
+  val notSupportedWords = (if (isDistinct) "DISTINCT" :: Nil 
else Nil) ++
+(if (filter.isDefined) "FILTER predicate" :: Nil else Nil)
+  if (notSupportedWords.nonEmpty) {
 failAnalysis(
-  s"DISTINCT specified, but ${wf.prettyName} is not an 
aggregate function")
+  s"${notSupportedWords.mkString(" and ")} specified, but 
${wf.prettyName} " +
+"is not an aggregate function")
   } else {
 wf
   }
 // We get an aggregate function, we need to wrap it in an 
AggregateExpression.
-case agg: AggregateFunction => AggregateExpression(agg, 
Complete, isDistinct)
+case agg: AggregateFunction =>
+  AggregateExpression(agg, Complete, isDistinct, filter)
 // This function is not an aggregate function, just return the 
resolved one.
 case other =>
-  if (isDistinct) {
+  val notSupportedWords = (if (isDistinct) "DISTINCT" :: Nil 
else Nil) ++
 
 Review comment:
   OK
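
   For context, a hedged example of a query that would exercise the combined
   error message built above (the table follows the AnalysisErrorSuite excerpt
   elsewhere in this thread; the exact wording is inferred from the diff, not
   taken from a test run):

   SELECT hex(DISTINCT a) FILTER (WHERE c = 1) FROM TaBlE2;
   -- expected analysis error:
   -- "DISTINCT and FILTER predicate specified, but hex is not an aggregate function"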





[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-10 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r356376920
 
 

 ##
 File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
 ##
 @@ -1636,26 +1636,33 @@ class Analyzer(
 s"its class is ${other.getClass.getCanonicalName}, which 
is not a generator.")
   }
 }
-  case u @ UnresolvedFunction(funcId, children, isDistinct) =>
+  case u @ UnresolvedFunction(funcId, arguments, isDistinct, filter) =>
 withPosition(u) {
-  v1SessionCatalog.lookupFunction(funcId, children) match {
+  v1SessionCatalog.lookupFunction(funcId, arguments) match {
 // AggregateWindowFunctions are AggregateFunctions that can 
only be evaluated within
 // the context of a Window clause. They do not need to be 
wrapped in an
 // AggregateExpression.
 case wf: AggregateWindowFunction =>
-  if (isDistinct) {
 
 Review comment:
   OK





[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-09 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r355863327
 
 

 ##
 File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/interfaces.scala
 ##
 @@ -119,18 +125,24 @@ case class AggregateExpression(
   normalizedAggFunc.canonicalized.asInstanceOf[AggregateFunction],
   mode,
   isDistinct,
+  filter,
   ExprId(0))
   }
 
-  override def children: Seq[Expression] = aggregateFunction :: Nil
+  override def children: Seq[Expression] = filter match {
 
 Review comment:
   OK





[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-09 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r355862006
 
 

 ##
 File path: 
sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggUtils.scala
 ##
 @@ -135,19 +135,25 @@ object AggUtils {
 }
 val distinctAttributes = namedDistinctExpressions.map(_.toAttribute)
 val groupingAttributes = groupingExpressions.map(_.toAttribute)
+val filterWithDistinctAttributes = 
functionsWithDistinct.flatMap(_.filterAttributes.toSeq)
 
 // 1. Create an Aggregate Operator for partial aggregations.
 val partialAggregate: SparkPlan = {
   val aggregateExpressions = functionsWithoutDistinct.map(_.copy(mode = 
Partial))
   val aggregateAttributes = aggregateExpressions.map(_.resultAttribute)
   // We will group by the original grouping expression, plus an additional 
expression for the
-  // DISTINCT column. For example, for AVG(DISTINCT value) GROUP BY key, 
the grouping
-  // expressions will be [key, value].
+  // DISTINCT column and the referred attributes in the FILTER clause 
associated with each
+  // aggregate function. For example:
+  // 1.for the AVG (DISTINCT value) GROUP BY key, the grouping expression 
will be [key, value];
+  // 2.for AVG (DISTINCT value) Filter (WHERE value2> 20) GROUP BY key, 
the grouping expression
+  // will be [key, value, value2].
 
 Review comment:
   @maropu I have modified `RewriteDistinctAggregates`.
   If the query contains more than one distinct aggregate function, the filter
   is applied in the second aggregate.
   If the query contains only one distinct aggregate function, the filter is
   applied in the third aggregate.
   I think if we allow the DISTINCT columns to become grouping keys, the
   filter columns should be allowed as well.
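
   To make the two cases concrete, a hedged SQL sketch reusing the
   data(key, cat1, cat2, value, id) columns from the Scaladoc examples above
   (which aggregate evaluates the FILTER follows the summary above, not an
   independent check of the implementation):

   -- more than one distinct aggregate function:
   -- the FILTER is evaluated in the second aggregate
   SELECT COUNT(DISTINCT cat1) FILTER (WHERE id > 1), COUNT(DISTINCT cat2)
   FROM data GROUP BY key;

   -- exactly one distinct aggregate function:
   -- the FILTER is evaluated in the third aggregate
   SELECT COUNT(DISTINCT cat1) FILTER (WHERE id > 1), SUM(value)
   FROM data GROUP BY key;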








[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-09 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r355846565
 
 

 ##
 File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/higherOrderFunctions.scala
 ##
 @@ -33,11 +33,14 @@ import org.apache.spark.sql.types.DataType
 case class ResolveHigherOrderFunctions(catalog: SessionCatalog) extends 
Rule[LogicalPlan] {
 
   override def apply(plan: LogicalPlan): LogicalPlan = plan.resolveExpressions 
{
-case u @ UnresolvedFunction(fn, children, false)
+case u @ UnresolvedFunction(fn, children, false, filter)
 if hasLambdaAndResolvedArguments(children) =>
   withPosition(u) {
 catalog.lookupFunction(fn, children) match {
-  case func: HigherOrderFunction => func
+  case func: HigherOrderFunction =>
+filter.foreach(_.failAnalysis("FILTER predicate specified, " +
+  s"but ${func.prettyName} is not an aggregate function"))
 
 Review comment:
   OK
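
   For illustration, a hedged example of the shape this check rejects
   (transform is a built-in higher-order function; that the parser accepts the
   FILTER clause before analysis rejects it is an assumption):

   SELECT transform(array(1, 2, 3), x -> x + 1) FILTER (WHERE true);
   -- expected analysis error:
   -- "FILTER predicate specified, but transform is not an aggregate function"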





[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-09 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r355829918
 
 

 ##
 File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
 ##
 @@ -1636,26 +1636,33 @@ class Analyzer(
 s"its class is ${other.getClass.getCanonicalName}, which 
is not a generator.")
   }
 }
-  case u @ UnresolvedFunction(funcId, children, isDistinct) =>
+  case u @ UnresolvedFunction(funcId, arguments, isDistinct, filter) =>
 withPosition(u) {
-  v1SessionCatalog.lookupFunction(funcId, children) match {
+  v1SessionCatalog.lookupFunction(funcId, arguments) match {
 // AggregateWindowFunctions are AggregateFunctions that can 
only be evaluated within
 // the context of a Window clause. They do not need to be 
wrapped in an
 // AggregateExpression.
 case wf: AggregateWindowFunction =>
   if (isDistinct) {
 failAnalysis(
   s"DISTINCT specified, but ${wf.prettyName} is not an 
aggregate function")
+  } else if (filter.isDefined) {
 
 Review comment:
   OK.





[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-09 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r355829949
 
 

 ##
 File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
 ##
 @@ -1636,26 +1636,33 @@ class Analyzer(
 s"its class is ${other.getClass.getCanonicalName}, which 
is not a generator.")
   }
 }
-  case u @ UnresolvedFunction(funcId, children, isDistinct) =>
+  case u @ UnresolvedFunction(funcId, arguments, isDistinct, filter) =>
 withPosition(u) {
-  v1SessionCatalog.lookupFunction(funcId, children) match {
+  v1SessionCatalog.lookupFunction(funcId, arguments) match {
 // AggregateWindowFunctions are AggregateFunctions that can 
only be evaluated within
 // the context of a Window clause. They do not need to be 
wrapped in an
 // AggregateExpression.
 case wf: AggregateWindowFunction =>
   if (isDistinct) {
 failAnalysis(
   s"DISTINCT specified, but ${wf.prettyName} is not an 
aggregate function")
+  } else if (filter.isDefined) {
+failAnalysis("FILTER predicate specified, " +
+  s"but ${wf.prettyName} is not an aggregate function")
   } else {
 wf
   }
 // We get an aggregate function, we need to wrap it in an 
AggregateExpression.
-case agg: AggregateFunction => AggregateExpression(agg, 
Complete, isDistinct)
+case agg: AggregateFunction =>
+  AggregateExpression(agg, Complete, isDistinct, filter)
 // This function is not an aggregate function, just return the 
resolved one.
 case other =>
   if (isDistinct) {
 failAnalysis(
   s"DISTINCT specified, but ${other.prettyName} is not an 
aggregate function")
+  } else if (filter.isDefined) {
 
 Review comment:
   OK





[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-09 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r355395097
 
 

 ##
 File path: 
sql/core/src/test/resources/sql-tests/inputs/postgreSQL/window_part3.sql
 ##
 @@ -400,10 +400,10 @@ SELECT ntile(0) OVER (ORDER BY ten), ten, four FROM 
tenk1;
 -- filter
 
 -- [SPARK-28500] Adds support for `filter` clause
 
 Review comment:
   OK.





[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-09 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r355393582
 
 

 ##
 File path: 
sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggUtils.scala
 ##
 @@ -135,19 +135,25 @@ object AggUtils {
 }
 val distinctAttributes = namedDistinctExpressions.map(_.toAttribute)
 val groupingAttributes = groupingExpressions.map(_.toAttribute)
+val filterWithDistinctAttributes = 
functionsWithDistinct.flatMap(_.filterAttributes.toSeq)
 
 // 1. Create an Aggregate Operator for partial aggregations.
 val partialAggregate: SparkPlan = {
   val aggregateExpressions = functionsWithoutDistinct.map(_.copy(mode = 
Partial))
   val aggregateAttributes = aggregateExpressions.map(_.resultAttribute)
   // We will group by the original grouping expression, plus an additional 
expression for the
-  // DISTINCT column. For example, for AVG(DISTINCT value) GROUP BY key, 
the grouping
-  // expressions will be [key, value].
+  // DISTINCT column and the referred attributes in the FILTER clause 
associated with each
+  // aggregate function. For example:
+  // 1.for the AVG (DISTINCT value) GROUP BY key, the grouping expression 
will be [key, value];
+  // 2.for AVG (DISTINCT value) Filter (WHERE value2> 20) GROUP BY key, 
the grouping expression
+  // will be [key, value, value2].
 
 Review comment:
   @cloud-fan I modified `RewriteDistinctAggregates`.
   If there is only one distinct aggregate function, we still use
   `planAggregateWithOneDistinct`.
   If there is more than one distinct aggregate function, we use
   `RewriteDistinctAggregates`.
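
   For reference, a hedged sketch of the two query shapes against the emp view
   defined in group-by-filter.sql (which planner path fires follows the
   comment above, not an independent check):

   -- exactly one distinct aggregate function:
   -- planned via planAggregateWithOneDistinct
   EXPLAIN SELECT dept_id, COUNT(DISTINCT emp_name) FILTER (WHERE id > 200)
   FROM emp GROUP BY dept_id;

   -- more than one distinct aggregate function:
   -- rewritten by RewriteDistinctAggregates
   EXPLAIN SELECT COUNT(DISTINCT emp_name), COUNT(DISTINCT hiredate)
   FROM emp GROUP BY dept_id;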





[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-09 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r355383060
 
 

 ##
 File path: 
sql/core/src/test/resources/sql-tests/results/group-by-filter.sql.out
 ##
 @@ -0,0 +1,515 @@
+-- Automatically generated by SQLQueryTestSuite
+-- Number of queries: 44
 
 Review comment:
   Yes, it really is.





[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-09 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r355343580
 
 

 ##
 File path: sql/core/src/test/resources/sql-tests/inputs/group-by-filter.sql
 ##
 @@ -0,0 +1,130 @@
+-- Test filter clause for aggregate expression.
+
+-- Test data.
+CREATE OR REPLACE TEMPORARY VIEW testData AS SELECT * FROM VALUES
+(1, 1), (1, 2), (2, 1), (2, 2), (3, 1), (3, 2), (null, 1), (3, null), (null, 
null)
+AS testData(a, b);
+
+CREATE TEMPORARY VIEW EMP AS SELECT * FROM VALUES
+  (100, "emp 1", date "2005-01-01", 100.00D, 10),
+  (100, "emp 1", date "2005-01-01", 100.00D, 10),
+  (200, "emp 2", date "2003-01-01", 200.00D, 10),
+  (300, "emp 3", date "2002-01-01", 300.00D, 20),
+  (400, "emp 4", date "2005-01-01", 400.00D, 30),
+  (500, "emp 5", date "2001-01-01", 400.00D, NULL),
+  (600, "emp 6 - no dept", date "2001-01-01", 400.00D, 100),
+  (700, "emp 7", date "2010-01-01", 400.00D, 100),
+  (800, "emp 8", date "2016-01-01", 150.00D, 70)
+AS EMP(id, emp_name, hiredate, salary, dept_id);
+
+CREATE TEMPORARY VIEW DEPT AS SELECT * FROM VALUES
+  (10, "dept 1", "CA"),
+  (20, "dept 2", "NY"),
+  (30, "dept 3", "TX"),
+  (40, "dept 4 - unassigned", "OR"),
+  (50, "dept 5 - unassigned", "NJ"),
+  (70, "dept 7", "FL")
+AS DEPT(dept_id, dept_name, state);
+
+-- Aggregate with filter and empty GroupBy expressions.
+SELECT a, COUNT(b) FILTER (WHERE a >= 2) FROM testData;
+SELECT COUNT(a) FILTER (WHERE a = 1), COUNT(b) FILTER (WHERE a > 1) FROM 
testData;
+SELECT COUNT(id) FILTER (WHERE hiredate = date "2001-01-01") FROM emp;
+SELECT COUNT(id) FILTER (WHERE hiredate = to_date('2001-01-01 00:00:00')) FROM 
emp;
 
 Review comment:
   OK.





[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-09 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r355343660
 
 

 ##
 File path: sql/core/src/test/resources/sql-tests/inputs/group-by-filter.sql
 ##
 @@ -0,0 +1,130 @@
+-- Test filter clause for aggregate expression.
+
+-- Test data.
+CREATE OR REPLACE TEMPORARY VIEW testData AS SELECT * FROM VALUES
+(1, 1), (1, 2), (2, 1), (2, 2), (3, 1), (3, 2), (null, 1), (3, null), (null, 
null)
+AS testData(a, b);
+
+CREATE TEMPORARY VIEW EMP AS SELECT * FROM VALUES
+  (100, "emp 1", date "2005-01-01", 100.00D, 10),
+  (100, "emp 1", date "2005-01-01", 100.00D, 10),
+  (200, "emp 2", date "2003-01-01", 200.00D, 10),
+  (300, "emp 3", date "2002-01-01", 300.00D, 20),
+  (400, "emp 4", date "2005-01-01", 400.00D, 30),
+  (500, "emp 5", date "2001-01-01", 400.00D, NULL),
+  (600, "emp 6 - no dept", date "2001-01-01", 400.00D, 100),
+  (700, "emp 7", date "2010-01-01", 400.00D, 100),
+  (800, "emp 8", date "2016-01-01", 150.00D, 70)
+AS EMP(id, emp_name, hiredate, salary, dept_id);
+
+CREATE TEMPORARY VIEW DEPT AS SELECT * FROM VALUES
+  (10, "dept 1", "CA"),
+  (20, "dept 2", "NY"),
+  (30, "dept 3", "TX"),
+  (40, "dept 4 - unassigned", "OR"),
+  (50, "dept 5 - unassigned", "NJ"),
+  (70, "dept 7", "FL")
+AS DEPT(dept_id, dept_name, state);
+
+-- Aggregate with filter and empty GroupBy expressions.
+SELECT a, COUNT(b) FILTER (WHERE a >= 2) FROM testData;
+SELECT COUNT(a) FILTER (WHERE a = 1), COUNT(b) FILTER (WHERE a > 1) FROM 
testData;
+SELECT COUNT(id) FILTER (WHERE hiredate = date "2001-01-01") FROM emp;
+SELECT COUNT(id) FILTER (WHERE hiredate = to_date('2001-01-01 00:00:00')) FROM 
emp;
+SELECT COUNT(id) FILTER (WHERE hiredate = to_timestamp("2001-01-01 00:00:00")) 
FROM emp;
+
+-- Aggregate with filter and non-empty GroupBy expressions.
+SELECT a, COUNT(b) FILTER (WHERE a >= 2) FROM testData GROUP BY a;
+SELECT a, COUNT(b) FILTER (WHERE a != 2) FROM testData GROUP BY b;
+SELECT COUNT(a) FILTER (WHERE a >= 0), COUNT(b) FILTER (WHERE a >= 3) FROM 
testData GROUP BY a;
+SELECT dept_id, SUM(salary) FILTER (WHERE hiredate > date "2003-01-01") FROM 
emp GROUP BY dept_id;
+SELECT dept_id, SUM(salary) FILTER (WHERE hiredate > to_date("2003-01-01")) 
FROM emp GROUP BY dept_id;
+SELECT dept_id, SUM(salary) FILTER (WHERE hiredate > to_timestamp("2003-01-01 
00:00:00")) FROM emp GROUP BY dept_id;
+
+-- Aggregate with filter and grouped by literals.
+SELECT 'foo', COUNT(a) FILTER (WHERE b <= 2) FROM testData GROUP BY 1;
+SELECT 'foo', SUM(salary) FILTER (WHERE hiredate >= date "2003-01-01") FROM 
emp GROUP BY 1;
+SELECT 'foo', SUM(salary) FILTER (WHERE hiredate >= to_date("2003-01-01")) 
FROM emp GROUP BY 1;
+SELECT 'foo', SUM(salary) FILTER (WHERE hiredate >= 
to_timestamp("2003-01-01")) FROM emp GROUP BY 1;
+
+-- Aggregate with filter and grouped by literals (hash aggregate).
+SELECT 'foo', APPROX_COUNT_DISTINCT(a) FILTER (WHERE b >= 0) FROM testData 
WHERE a = 0 GROUP BY 1;
+
+-- Aggregate with filter and grouped by literals (sort aggregate).
+SELECT 'foo', MAX(STRUCT(a)) FILTER (WHERE b >= 1) FROM testData WHERE a = 0 
GROUP BY 1;
+
+-- Aggregate with filter and complex GroupBy expressions.
+SELECT a + b, COUNT(b) FILTER (WHERE b >= 2) FROM testData GROUP BY a + b;
+SELECT a + 2, COUNT(b) FILTER (WHERE b IN (1, 2)) FROM testData GROUP BY a + 1;
+SELECT a + 1 + 1, COUNT(b) FILTER (WHERE b > 0) FROM testData GROUP BY a + 1;
+
+-- Aggregate with filter, foldable input and multiple distinct groups.
+SELECT COUNT(DISTINCT b) FILTER (WHERE b > 0), COUNT(DISTINCT b, c) FILTER 
(WHERE b > 0 AND c > 2)
+FROM (SELECT 1 AS a, 2 AS b, 3 AS c) GROUP BY a;
+
+-- Aliases in SELECT could be used in GROUP BY
+SELECT a AS k, COUNT(b) FILTER (WHERE b = 1 OR b = 2) FROM testData GROUP BY k;
+SELECT dept_id as k,
+   SUM(salary) FILTER (WHERE hiredate < date "2005-01-01" OR hiredate > 
date "2010-01-01")
+FROM emp GROUP BY k;
+SELECT dept_id as k,
+   SUM(salary) FILTER (WHERE hiredate < to_date("2005-01-01") OR hiredate 
> to_date("2010-01-01"))
+FROM emp GROUP BY k;
+SELECT dept_id as k,
+   SUM(salary) FILTER (WHERE hiredate < to_timestamp("2005-01-01") OR 
hiredate > to_timestamp("2010-01-01 00:00:00"))
+FROM emp GROUP BY k;
+SELECT a AS k, COUNT(b) FILTER (WHERE NOT b < 0) FROM testData GROUP BY k 
HAVING k > 1;
+SELECT dept_id AS k, AVG(salary) FILTER (WHERE NOT hiredate <= date 
"2005-01-01")
+FROM emp GROUP BY k HAVING k < 70;
+SELECT a AS k, COUNT(b) FILTER (WHERE NOT b < 0) FROM testData GROUP BY k 
HAVING k > 1;
+SELECT dept_id AS k, AVG(salary) FILTER (WHERE NOT hiredate <= 
to_date("2005-01-01"))
+FROM emp GROUP BY k HAVING k < 70;
+SELECT a AS k, COUNT(b) FILTER (WHERE NOT b < 0) FROM testData GROUP BY k 
HAVING k > 1;
+SELECT dept_id AS k, AVG(salary) FILTER (WHERE NOT hiredate <= 
to_timestamp("2005-01-01 00:00:00"))
+FROM emp GROUP BY k HAVING k < 70;
+
+-- A

[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-09 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r355332961
 
 

 ##
 File path: sql/core/src/test/resources/sql-tests/inputs/group-by-filter.sql
 ##
 @@ -0,0 +1,130 @@
+-- Test filter clause for aggregate expression.
+
+-- Test data.
+CREATE OR REPLACE TEMPORARY VIEW testData AS SELECT * FROM VALUES
+(1, 1), (1, 2), (2, 1), (2, 2), (3, 1), (3, 2), (null, 1), (3, null), (null, 
null)
+AS testData(a, b);
+
+CREATE TEMPORARY VIEW EMP AS SELECT * FROM VALUES
+  (100, "emp 1", date "2005-01-01", 100.00D, 10),
+  (100, "emp 1", date "2005-01-01", 100.00D, 10),
+  (200, "emp 2", date "2003-01-01", 200.00D, 10),
+  (300, "emp 3", date "2002-01-01", 300.00D, 20),
+  (400, "emp 4", date "2005-01-01", 400.00D, 30),
+  (500, "emp 5", date "2001-01-01", 400.00D, NULL),
+  (600, "emp 6 - no dept", date "2001-01-01", 400.00D, 100),
+  (700, "emp 7", date "2010-01-01", 400.00D, 100),
+  (800, "emp 8", date "2016-01-01", 150.00D, 70)
+AS EMP(id, emp_name, hiredate, salary, dept_id);
+
+CREATE TEMPORARY VIEW DEPT AS SELECT * FROM VALUES
+  (10, "dept 1", "CA"),
+  (20, "dept 2", "NY"),
+  (30, "dept 3", "TX"),
+  (40, "dept 4 - unassigned", "OR"),
+  (50, "dept 5 - unassigned", "NJ"),
+  (70, "dept 7", "FL")
+AS DEPT(dept_id, dept_name, state);
+
+-- Aggregate with filter and empty GroupBy expressions.
+SELECT a, COUNT(b) FILTER (WHERE a >= 2) FROM testData;
+SELECT COUNT(a) FILTER (WHERE a = 1), COUNT(b) FILTER (WHERE a > 1) FROM 
testData;
+SELECT COUNT(id) FILTER (WHERE hiredate = date "2001-01-01") FROM emp;
+SELECT COUNT(id) FILTER (WHERE hiredate = to_date('2001-01-01 00:00:00')) FROM 
emp;
+SELECT COUNT(id) FILTER (WHERE hiredate = to_timestamp("2001-01-01 00:00:00")) 
FROM emp;
+
+-- Aggregate with filter and non-empty GroupBy expressions.
+SELECT a, COUNT(b) FILTER (WHERE a >= 2) FROM testData GROUP BY a;
+SELECT a, COUNT(b) FILTER (WHERE a != 2) FROM testData GROUP BY b;
+SELECT COUNT(a) FILTER (WHERE a >= 0), COUNT(b) FILTER (WHERE a >= 3) FROM 
testData GROUP BY a;
+SELECT dept_id, SUM(salary) FILTER (WHERE hiredate > date "2003-01-01") FROM 
emp GROUP BY dept_id;
+SELECT dept_id, SUM(salary) FILTER (WHERE hiredate > to_date("2003-01-01")) 
FROM emp GROUP BY dept_id;
+SELECT dept_id, SUM(salary) FILTER (WHERE hiredate > to_timestamp("2003-01-01 
00:00:00")) FROM emp GROUP BY dept_id;
+
+-- Aggregate with filter and grouped by literals.
+SELECT 'foo', COUNT(a) FILTER (WHERE b <= 2) FROM testData GROUP BY 1;
+SELECT 'foo', SUM(salary) FILTER (WHERE hiredate >= date "2003-01-01") FROM 
emp GROUP BY 1;
+SELECT 'foo', SUM(salary) FILTER (WHERE hiredate >= to_date("2003-01-01")) 
FROM emp GROUP BY 1;
+SELECT 'foo', SUM(salary) FILTER (WHERE hiredate >= 
to_timestamp("2003-01-01")) FROM emp GROUP BY 1;
+
+-- Aggregate with filter and grouped by literals (hash aggregate).
+SELECT 'foo', APPROX_COUNT_DISTINCT(a) FILTER (WHERE b >= 0) FROM testData 
WHERE a = 0 GROUP BY 1;
+
+-- Aggregate with filter and grouped by literals (sort aggregate).
+SELECT 'foo', MAX(STRUCT(a)) FILTER (WHERE b >= 1) FROM testData WHERE a = 0 
GROUP BY 1;
+
+-- Aggregate with filter and complex GroupBy expressions.
+SELECT a + b, COUNT(b) FILTER (WHERE b >= 2) FROM testData GROUP BY a + b;
+SELECT a + 2, COUNT(b) FILTER (WHERE b IN (1, 2)) FROM testData GROUP BY a + 1;
+SELECT a + 1 + 1, COUNT(b) FILTER (WHERE b > 0) FROM testData GROUP BY a + 1;
+
+-- Aggregate with filter, foldable input and multiple distinct groups.
+SELECT COUNT(DISTINCT b) FILTER (WHERE b > 0), COUNT(DISTINCT b, c) FILTER 
(WHERE b > 0 AND c > 2)
+FROM (SELECT 1 AS a, 2 AS b, 3 AS c) GROUP BY a;
+
+-- Aliases in SELECT could be used in GROUP BY
+SELECT a AS k, COUNT(b) FILTER (WHERE b = 1 OR b = 2) FROM testData GROUP BY k;
+SELECT dept_id as k,
+   SUM(salary) FILTER (WHERE hiredate < date "2005-01-01" OR hiredate > 
date "2010-01-01")
+FROM emp GROUP BY k;
+SELECT dept_id as k,
+   SUM(salary) FILTER (WHERE hiredate < to_date("2005-01-01") OR hiredate 
> to_date("2010-01-01"))
+FROM emp GROUP BY k;
+SELECT dept_id as k,
+   SUM(salary) FILTER (WHERE hiredate < to_timestamp("2005-01-01") OR 
hiredate > to_timestamp("2010-01-01 00:00:00"))
+FROM emp GROUP BY k;
+SELECT a AS k, COUNT(b) FILTER (WHERE NOT b < 0) FROM testData GROUP BY k 
HAVING k > 1;
 
 Review comment:
   OK. I will remove it.



[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-09 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r355322046
 
 

 ##
 File path: 
sql/core/src/test/resources/sql-tests/results/group-by-filter.sql.out
 ##
 @@ -0,0 +1,515 @@
+-- Automatically generated by SQLQueryTestSuite
+-- Number of queries: 44
 
 Review comment:
   I can't visit this link.





[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-09 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r355321028
 
 

 ##
 File path: 
sql/core/src/test/resources/sql-tests/inputs/postgreSQL/window_part3.sql
 ##
 @@ -400,10 +400,10 @@ SELECT ntile(0) OVER (ORDER BY ten), ten, four FROM 
tenk1;
 -- filter
 
 -- [SPARK-28500] Adds support for `filter` clause
 
 Review comment:
   @maropu I created a new jira 
https://issues.apache.org/jira/browse/SPARK-30182.
   





[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-04 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r354097370
 
 

 ##
 File path: 
sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggUtils.scala
 ##
 @@ -135,19 +135,25 @@ object AggUtils {
 }
 val distinctAttributes = namedDistinctExpressions.map(_.toAttribute)
 val groupingAttributes = groupingExpressions.map(_.toAttribute)
+val filterWithDistinctAttributes = 
functionsWithDistinct.flatMap(_.filterAttributes.toSeq)
 
 // 1. Create an Aggregate Operator for partial aggregations.
 val partialAggregate: SparkPlan = {
   val aggregateExpressions = functionsWithoutDistinct.map(_.copy(mode = 
Partial))
   val aggregateAttributes = aggregateExpressions.map(_.resultAttribute)
   // We will group by the original grouping expression, plus an additional 
expression for the
-  // DISTINCT column. For example, for AVG(DISTINCT value) GROUP BY key, 
the grouping
-  // expressions will be [key, value].
+  // DISTINCT column and the referred attributes in the FILTER clause 
associated with each
+  // aggregate function. For example:
+  // 1.for the AVG (DISTINCT value) GROUP BY key, the grouping expression 
will be [key, value];
+  // 2.for AVG (DISTINCT value) Filter (WHERE value2> 20) GROUP BY key, 
the grouping expression
+  // will be [key, value, value2].
 
 Review comment:
   Applying the filter must go together with applying the aggregate function.
   The first aggregate only applies the aggregate functions that do not
   contain DISTINCT.
   The third aggregate applies the aggregate functions that contain DISTINCT.
   @cloud-fan I have sent you an email to discuss this in detail.
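
   As a concrete anchor for the staging described above, a hedged sketch
   (the stage roles paraphrase the comment; the grouping columns follow the
   AggUtils excerpt, with value = salary and value2 = id):

   SELECT dept_id, SUM(salary), AVG(DISTINCT salary) FILTER (WHERE id > 200)
   FROM emp GROUP BY dept_id;
   -- 1st/2nd aggregates: partial and merged SUM(salary),
   --   grouping by [dept_id, salary, id]
   -- 3rd/4th aggregates: evaluate AVG(DISTINCT salary) with its FILTER on id,
   --   then reduce the grouping back to [dept_id]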





[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-04 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r354097370
 
 

 ##
 File path: 
sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggUtils.scala
 ##
 @@ -135,19 +135,25 @@ object AggUtils {
 }
 val distinctAttributes = namedDistinctExpressions.map(_.toAttribute)
 val groupingAttributes = groupingExpressions.map(_.toAttribute)
+val filterWithDistinctAttributes = 
functionsWithDistinct.flatMap(_.filterAttributes.toSeq)
 
 // 1. Create an Aggregate Operator for partial aggregations.
 val partialAggregate: SparkPlan = {
   val aggregateExpressions = functionsWithoutDistinct.map(_.copy(mode = 
Partial))
   val aggregateAttributes = aggregateExpressions.map(_.resultAttribute)
   // We will group by the original grouping expression, plus an additional 
expression for the
-  // DISTINCT column. For example, for AVG(DISTINCT value) GROUP BY key, 
the grouping
-  // expressions will be [key, value].
+  // DISTINCT column and the referred attributes in the FILTER clause 
associated with each
+  // aggregate function. For example:
+  // 1.for the AVG (DISTINCT value) GROUP BY key, the grouping expression 
will be [key, value];
+  // 2.for AVG (DISTINCT value) Filter (WHERE value2> 20) GROUP BY key, 
the grouping expression
+  // will be [key, value, value2].
 
 Review comment:
   Applying the filter must go together with applying the aggregate
   function.
   The first aggregate only applies the aggregate functions that do not
   contain DISTINCT.
   The third aggregate applies the aggregate functions that do contain
   DISTINCT.





[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-04 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r353675643
 
 

 ##
 File path: 
sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggUtils.scala
 ##
 @@ -135,19 +135,25 @@ object AggUtils {
 }
 val distinctAttributes = namedDistinctExpressions.map(_.toAttribute)
 val groupingAttributes = groupingExpressions.map(_.toAttribute)
+val filterWithDistinctAttributes = 
functionsWithDistinct.flatMap(_.filterAttributes.toSeq)
 
 // 1. Create an Aggregate Operator for partial aggregations.
 val partialAggregate: SparkPlan = {
   val aggregateExpressions = functionsWithoutDistinct.map(_.copy(mode = 
Partial))
   val aggregateAttributes = aggregateExpressions.map(_.resultAttribute)
   // We will group by the original grouping expression, plus an additional 
expression for the
-  // DISTINCT column. For example, for AVG(DISTINCT value) GROUP BY key, 
the grouping
-  // expressions will be [key, value].
+  // DISTINCT column and the referred attributes in the FILTER clause 
associated with each
+  // aggregate function. For example:
+  // 1.for the AVG (DISTINCT value) GROUP BY key, the grouping expression 
will be [key, value];
+  // 2.for AVG (DISTINCT value) Filter (WHERE value2> 20) GROUP BY key, 
the grouping expression
+  // will be [key, value, value2].
 
 Review comment:
   It seems value2 cannot be passed to PartialMerge-AGG-3 if we remove
   value2 from the grouping expressions of Partial-AGG-1 or
   PartialMerge-AGG-2.
   Please refer to 
https://github.com/apache/spark/blob/1297e032f693f467167ca55cbbf98682a6315b02/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggregationIterator.scala#L270
   ```
 val resultProjection = UnsafeProjection.create(
   groupingAttributes ++ bufferAttributes,
   groupingAttributes ++ bufferAttributes)
 resultProjection.initialize(partIndex)
   ```
   Also refer to
   
https://github.com/apache/spark/blob/1297e032f693f467167ca55cbbf98682a6315b02/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala#L620
   ```
 val resultVars = bindReferences[Expression](
   resultExpressions,
   groupingAttributes).map(_.genCode(ctx))
   ```





[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-04 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r353675643
 
 

 ##
 File path: 
sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggUtils.scala
 ##
 @@ -135,19 +135,25 @@ object AggUtils {
 }
 val distinctAttributes = namedDistinctExpressions.map(_.toAttribute)
 val groupingAttributes = groupingExpressions.map(_.toAttribute)
+val filterWithDistinctAttributes = 
functionsWithDistinct.flatMap(_.filterAttributes.toSeq)
 
 // 1. Create an Aggregate Operator for partial aggregations.
 val partialAggregate: SparkPlan = {
   val aggregateExpressions = functionsWithoutDistinct.map(_.copy(mode = 
Partial))
   val aggregateAttributes = aggregateExpressions.map(_.resultAttribute)
   // We will group by the original grouping expression, plus an additional 
expression for the
-  // DISTINCT column. For example, for AVG(DISTINCT value) GROUP BY key, 
the grouping
-  // expressions will be [key, value].
+  // DISTINCT column and the referred attributes in the FILTER clause 
associated with each
+  // aggregate function. For example:
+  // 1.for the AVG (DISTINCT value) GROUP BY key, the grouping expression 
will be [key, value];
+  // 2.for AVG (DISTINCT value) Filter (WHERE value2> 20) GROUP BY key, 
the grouping expression
+  // will be [key, value, value2].
 
 Review comment:
   It seems value2 cannot be passed to PartialMerge-AGG-3 if we remove
   value2 from the grouping expressions of Partial-AGG-1 or
   PartialMerge-AGG-2.
   Please refer to 
https://github.com/apache/spark/blob/1297e032f693f467167ca55cbbf98682a6315b02/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggregationIterator.scala#L270
   ```
 val resultProjection = UnsafeProjection.create(
   groupingAttributes ++ bufferAttributes,
   groupingAttributes ++ bufferAttributes)
 resultProjection.initialize(partIndex)
   ```





[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-04 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r353675643
 
 

 ##
 File path: 
sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggUtils.scala
 ##
 @@ -135,19 +135,25 @@ object AggUtils {
 }
 val distinctAttributes = namedDistinctExpressions.map(_.toAttribute)
 val groupingAttributes = groupingExpressions.map(_.toAttribute)
+val filterWithDistinctAttributes = 
functionsWithDistinct.flatMap(_.filterAttributes.toSeq)
 
 // 1. Create an Aggregate Operator for partial aggregations.
 val partialAggregate: SparkPlan = {
   val aggregateExpressions = functionsWithoutDistinct.map(_.copy(mode = 
Partial))
   val aggregateAttributes = aggregateExpressions.map(_.resultAttribute)
   // We will group by the original grouping expression, plus an additional 
expression for the
-  // DISTINCT column. For example, for AVG(DISTINCT value) GROUP BY key, 
the grouping
-  // expressions will be [key, value].
+  // DISTINCT column and the referred attributes in the FILTER clause 
associated with each
+  // aggregate function. For example:
+  // 1.for the AVG (DISTINCT value) GROUP BY key, the grouping expression 
will be [key, value];
+  // 2.for AVG (DISTINCT value) Filter (WHERE value2> 20) GROUP BY key, 
the grouping expression
+  // will be [key, value, value2].
 
 Review comment:
   It seems value2 cannot be passed to PartialMerge-AGG-3 if we remove
   value2 from the grouping expressions of Partial-AGG-1 or
   PartialMerge-AGG-2.
   Please refer to 
https://github.com/apache/spark/blob/1297e032f693f467167ca55cbbf98682a6315b02/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggregationIterator.scala#L270





[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-04 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r353675643
 
 

 ##
 File path: 
sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggUtils.scala
 ##
 @@ -135,19 +135,25 @@ object AggUtils {
 }
 val distinctAttributes = namedDistinctExpressions.map(_.toAttribute)
 val groupingAttributes = groupingExpressions.map(_.toAttribute)
+val filterWithDistinctAttributes = 
functionsWithDistinct.flatMap(_.filterAttributes.toSeq)
 
 // 1. Create an Aggregate Operator for partial aggregations.
 val partialAggregate: SparkPlan = {
   val aggregateExpressions = functionsWithoutDistinct.map(_.copy(mode = 
Partial))
   val aggregateAttributes = aggregateExpressions.map(_.resultAttribute)
   // We will group by the original grouping expression, plus an additional 
expression for the
-  // DISTINCT column. For example, for AVG(DISTINCT value) GROUP BY key, 
the grouping
-  // expressions will be [key, value].
+  // DISTINCT column and the referred attributes in the FILTER clause 
associated with each
+  // aggregate function. For example:
+  // 1.for the AVG (DISTINCT value) GROUP BY key, the grouping expression 
will be [key, value];
+  // 2.for AVG (DISTINCT value) Filter (WHERE value2> 20) GROUP BY key, 
the grouping expression
+  // will be [key, value, value2].
 
 Review comment:
   It seems value2 cannot be passed to PartialMerge-AGG-3 if we remove
   value2 from the grouping expressions of Partial-AGG-1 or
   PartialMerge-AGG-2.
   Please refer to 
https://github.com/apache/spark/blob/1297e032f693f467167ca55cbbf98682a6315b02/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggregationIterator.scala#L294





[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-04 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r353675643
 
 

 ##
 File path: 
sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggUtils.scala
 ##
 @@ -135,19 +135,25 @@ object AggUtils {
 }
 val distinctAttributes = namedDistinctExpressions.map(_.toAttribute)
 val groupingAttributes = groupingExpressions.map(_.toAttribute)
+val filterWithDistinctAttributes = 
functionsWithDistinct.flatMap(_.filterAttributes.toSeq)
 
 // 1. Create an Aggregate Operator for partial aggregations.
 val partialAggregate: SparkPlan = {
   val aggregateExpressions = functionsWithoutDistinct.map(_.copy(mode = 
Partial))
   val aggregateAttributes = aggregateExpressions.map(_.resultAttribute)
   // We will group by the original grouping expression, plus an additional 
expression for the
-  // DISTINCT column. For example, for AVG(DISTINCT value) GROUP BY key, 
the grouping
-  // expressions will be [key, value].
+  // DISTINCT column and the referred attributes in the FILTER clause 
associated with each
+  // aggregate function. For example:
+  // 1.for the AVG (DISTINCT value) GROUP BY key, the grouping expression 
will be [key, value];
+  // 2.for AVG (DISTINCT value) Filter (WHERE value2> 20) GROUP BY key, 
the grouping expression
+  // will be [key, value, value2].
 
 Review comment:
   It seems value2 cannot be passed to PartialMerge-AGG-3 if we remove
   value2 from the grouping expressions of Partial-AGG-1 or
   PartialMerge-AGG-2.
   Please refer to 
https://github.com/apache/spark/pull/10228#discussion_r47281934





[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-04 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r353636185
 
 

 ##
 File path: 
sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggUtils.scala
 ##
 @@ -135,19 +135,25 @@ object AggUtils {
 }
 val distinctAttributes = namedDistinctExpressions.map(_.toAttribute)
 val groupingAttributes = groupingExpressions.map(_.toAttribute)
+val filterWithDistinctAttributes = 
functionsWithDistinct.flatMap(_.filterAttributes.toSeq)
 
 // 1. Create an Aggregate Operator for partial aggregations.
 val partialAggregate: SparkPlan = {
   val aggregateExpressions = functionsWithoutDistinct.map(_.copy(mode = 
Partial))
   val aggregateAttributes = aggregateExpressions.map(_.resultAttribute)
   // We will group by the original grouping expression, plus an additional 
expression for the
-  // DISTINCT column. For example, for AVG(DISTINCT value) GROUP BY key, 
the grouping
-  // expressions will be [key, value].
+  // DISTINCT column and the referred attributes in the FILTER clause 
associated with each
+  // aggregate function. For example:
+  // 1.for the AVG (DISTINCT value) GROUP BY key, the grouping expression 
will be [key, value];
+  // 2.for AVG (DISTINCT value) Filter (WHERE value2> 20) GROUP BY key, 
the grouping expression
+  // will be [key, value, value2].
 
 Review comment:
   I removed value2 from the grouping expressions and found that
   PartialMerge-AGG-3 cannot obtain the input value2.
   Executing `select sum(distinct id) filter (where sex = 'man') from
   student;` leads to an error:
   ```
   Caused by: org.apache.spark.sql.catalyst.errors.package$TreeNodeException: execute, tree:
   HashAggregate(keys=[], functions=[partial_sum(distinct cast(id#6 as bigint)#13L)], output=[sum#16L])
   +- *(2) HashAggregate(keys=[cast(id#6 as bigint)#13L], functions=[], output=[cast(id#6 as bigint)#13L, sex#8])
      +- Exchange hashpartitioning(cast(id#6 as bigint)#13L, 401)
         +- *(1) HashAggregate(keys=[cast(id#6 as bigint) AS cast(id#6 as bigint)#13L], functions=[], output=[cast(id#6 as bigint)#13L, sex#8])
            +- Scan hive default.xsql.student [id#6, sex#8], HiveTableRelation `default`.`xsql`.`student`, org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, [id#6, name#7, sex#8, class_id#9]

   Caused by: java.lang.RuntimeException: Couldn't find sex#8 in [cast(id#6 as bigint)#13L]
   at scala.sys.package$.error(package.scala:27)
   at org.apache.spark.sql.catalyst.expressions.BindReferences$$anonfun$bindReference$1$$anonfun$applyOrElse$1.apply(BoundAttribute.scala:85)
   at org.apache.spark.sql.catalyst.expressions.BindReferences$$anonfun$bindReference$1$$anonfun$applyOrElse$1.apply(BoundAttribute.scala:79)
   at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:52)
   ```
   I don't know why PartialMerge-AGG-3 cannot bind value2.





[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-04 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r353675643
 
 

 ##
 File path: 
sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggUtils.scala
 ##
 @@ -135,19 +135,25 @@ object AggUtils {
 }
 val distinctAttributes = namedDistinctExpressions.map(_.toAttribute)
 val groupingAttributes = groupingExpressions.map(_.toAttribute)
+val filterWithDistinctAttributes = 
functionsWithDistinct.flatMap(_.filterAttributes.toSeq)
 
 // 1. Create an Aggregate Operator for partial aggregations.
 val partialAggregate: SparkPlan = {
   val aggregateExpressions = functionsWithoutDistinct.map(_.copy(mode = 
Partial))
   val aggregateAttributes = aggregateExpressions.map(_.resultAttribute)
   // We will group by the original grouping expression, plus an additional 
expression for the
-  // DISTINCT column. For example, for AVG(DISTINCT value) GROUP BY key, 
the grouping
-  // expressions will be [key, value].
+  // DISTINCT column and the referred attributes in the FILTER clause 
associated with each
+  // aggregate function. For example:
+  // 1.for the AVG (DISTINCT value) GROUP BY key, the grouping expression 
will be [key, value];
+  // 2.for AVG (DISTINCT value) Filter (WHERE value2> 20) GROUP BY key, 
the grouping expression
+  // will be [key, value, value2].
 
 Review comment:
   It seems value2 cannot be passed to PartialMerge-AGG-3 if we remove
   value2 from the grouping expressions of Partial-AGG-1 or
   PartialMerge-AGG-2.





[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-04 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r353636185
 
 

 ##
 File path: 
sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggUtils.scala
 ##
 @@ -135,19 +135,25 @@ object AggUtils {
 }
 val distinctAttributes = namedDistinctExpressions.map(_.toAttribute)
 val groupingAttributes = groupingExpressions.map(_.toAttribute)
+val filterWithDistinctAttributes = 
functionsWithDistinct.flatMap(_.filterAttributes.toSeq)
 
 // 1. Create an Aggregate Operator for partial aggregations.
 val partialAggregate: SparkPlan = {
   val aggregateExpressions = functionsWithoutDistinct.map(_.copy(mode = 
Partial))
   val aggregateAttributes = aggregateExpressions.map(_.resultAttribute)
   // We will group by the original grouping expression, plus an additional 
expression for the
-  // DISTINCT column. For example, for AVG(DISTINCT value) GROUP BY key, 
the grouping
-  // expressions will be [key, value].
+  // DISTINCT column and the referred attributes in the FILTER clause 
associated with each
+  // aggregate function. For example:
+  // 1.for the AVG (DISTINCT value) GROUP BY key, the grouping expression 
will be [key, value];
+  // 2.for AVG (DISTINCT value) Filter (WHERE value2> 20) GROUP BY key, 
the grouping expression
+  // will be [key, value, value2].
 
 Review comment:
   I removed value2 from the grouping expressions and found that
   PartialMerge-AGG-3 cannot obtain the input value2.
   Executing `select sum(distinct id) filter (where sex = 'man') from
   student;` leads to an error:
   ```
   Caused by: java.lang.RuntimeException: Couldn't find sex#8 in [cast(id#6 as bigint)#13L]
   at scala.sys.package$.error(package.scala:27)
   at org.apache.spark.sql.catalyst.expressions.BindReferences$$anonfun$bindReference$1$$anonfun$applyOrElse$1.apply(BoundAttribute.scala:85)
   at org.apache.spark.sql.catalyst.expressions.BindReferences$$anonfun$bindReference$1$$anonfun$applyOrElse$1.apply(BoundAttribute.scala:79)
   at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:52)
   ```
   I don't know why PartialMerge-AGG-3 cannot bind value2.





[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-04 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r353636185
 
 

 ##
 File path: 
sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggUtils.scala
 ##
 @@ -135,19 +135,25 @@ object AggUtils {
 }
 val distinctAttributes = namedDistinctExpressions.map(_.toAttribute)
 val groupingAttributes = groupingExpressions.map(_.toAttribute)
+val filterWithDistinctAttributes = 
functionsWithDistinct.flatMap(_.filterAttributes.toSeq)
 
 // 1. Create an Aggregate Operator for partial aggregations.
 val partialAggregate: SparkPlan = {
   val aggregateExpressions = functionsWithoutDistinct.map(_.copy(mode = 
Partial))
   val aggregateAttributes = aggregateExpressions.map(_.resultAttribute)
   // We will group by the original grouping expression, plus an additional 
expression for the
-  // DISTINCT column. For example, for AVG(DISTINCT value) GROUP BY key, 
the grouping
-  // expressions will be [key, value].
+  // DISTINCT column and the referred attributes in the FILTER clause 
associated with each
+  // aggregate function. For example:
+  // 1.for the AVG (DISTINCT value) GROUP BY key, the grouping expression 
will be [key, value];
+  // 2.for AVG (DISTINCT value) Filter (WHERE value2> 20) GROUP BY key, 
the grouping expression
+  // will be [key, value, value2].
 
 Review comment:
   I removed value2 from the grouping expressions and found that
   PartialMerge-AGG-3 cannot obtain the input value2.
   Executing `select sum(distinct id) filter (where sex = 'man') from
   student;` leads to an error:
   ```
   Caused by: java.lang.RuntimeException: Couldn't find sex#8 in [cast(id#6 as bigint)#13L]
   at scala.sys.package$.error(package.scala:27)
   at org.apache.spark.sql.catalyst.expressions.BindReferences$$anonfun$bindReference$1$$anonfun$applyOrElse$1.apply(BoundAttribute.scala:85)
   at org.apache.spark.sql.catalyst.expressions.BindReferences$$anonfun$bindReference$1$$anonfun$applyOrElse$1.apply(BoundAttribute.scala:79)
   at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:52)
   ```
   I don't know why PartialMerge-AGG-3 cannot bind value2.





[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-04 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r353636185
 
 

 ##
 File path: 
sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggUtils.scala
 ##
 @@ -135,19 +135,25 @@ object AggUtils {
 }
 val distinctAttributes = namedDistinctExpressions.map(_.toAttribute)
 val groupingAttributes = groupingExpressions.map(_.toAttribute)
+val filterWithDistinctAttributes = 
functionsWithDistinct.flatMap(_.filterAttributes.toSeq)
 
 // 1. Create an Aggregate Operator for partial aggregations.
 val partialAggregate: SparkPlan = {
   val aggregateExpressions = functionsWithoutDistinct.map(_.copy(mode = 
Partial))
   val aggregateAttributes = aggregateExpressions.map(_.resultAttribute)
   // We will group by the original grouping expression, plus an additional 
expression for the
-  // DISTINCT column. For example, for AVG(DISTINCT value) GROUP BY key, 
the grouping
-  // expressions will be [key, value].
+  // DISTINCT column and the referred attributes in the FILTER clause 
associated with each
+  // aggregate function. For example:
+  // 1.for the AVG (DISTINCT value) GROUP BY key, the grouping expression 
will be [key, value];
+  // 2.for AVG (DISTINCT value) Filter (WHERE value2> 20) GROUP BY key, 
the grouping expression
+  // will be [key, value, value2].
 
 Review comment:
   I removed value2 from the grouping expressions and found that
   PartialMerge-AGG-3 cannot obtain the input value2. This leads to an
   error:
   ```
   Caused by: java.lang.RuntimeException: Couldn't find sex#8 in [cast(id#6 as bigint)#13L]
   at scala.sys.package$.error(package.scala:27)
   at org.apache.spark.sql.catalyst.expressions.BindReferences$$anonfun$bindReference$1$$anonfun$applyOrElse$1.apply(BoundAttribute.scala:85)
   at org.apache.spark.sql.catalyst.expressions.BindReferences$$anonfun$bindReference$1$$anonfun$applyOrElse$1.apply(BoundAttribute.scala:79)
   at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:52)
   ```





[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-03 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r353583379
 
 

 ##
 File path: 
sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggUtils.scala
 ##
 @@ -135,19 +135,25 @@ object AggUtils {
 }
 val distinctAttributes = namedDistinctExpressions.map(_.toAttribute)
 val groupingAttributes = groupingExpressions.map(_.toAttribute)
+val filterWithDistinctAttributes = 
functionsWithDistinct.flatMap(_.filterAttributes.toSeq)
 
 // 1. Create an Aggregate Operator for partial aggregations.
 val partialAggregate: SparkPlan = {
   val aggregateExpressions = functionsWithoutDistinct.map(_.copy(mode = 
Partial))
   val aggregateAttributes = aggregateExpressions.map(_.resultAttribute)
   // We will group by the original grouping expression, plus an additional 
expression for the
-  // DISTINCT column. For example, for AVG(DISTINCT value) GROUP BY key, 
the grouping
-  // expressions will be [key, value].
+  // DISTINCT column and the referred attributes in the FILTER clause 
associated with each
+  // aggregate function. For example:
+  // 1.for the AVG (DISTINCT value) GROUP BY key, the grouping expression 
will be [key, value];
+  // 2.for AVG (DISTINCT value) Filter (WHERE value2> 20) GROUP BY key, 
the grouping expression
+  // will be [key, value, value2].
 
 Review comment:
   Oh, I will try this. Thanks, Wenchen.





[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-03 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r353571892
 
 

 ##
 File path: 
sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggUtils.scala
 ##
 @@ -135,19 +135,25 @@ object AggUtils {
 }
 val distinctAttributes = namedDistinctExpressions.map(_.toAttribute)
 val groupingAttributes = groupingExpressions.map(_.toAttribute)
+val filterWithDistinctAttributes = 
functionsWithDistinct.flatMap(_.filterAttributes.toSeq)
 
 // 1. Create an Aggregate Operator for partial aggregations.
 val partialAggregate: SparkPlan = {
   val aggregateExpressions = functionsWithoutDistinct.map(_.copy(mode = 
Partial))
   val aggregateAttributes = aggregateExpressions.map(_.resultAttribute)
   // We will group by the original grouping expression, plus an additional 
expression for the
-  // DISTINCT column. For example, for AVG(DISTINCT value) GROUP BY key, 
the grouping
-  // expressions will be [key, value].
+  // DISTINCT column and the referred attributes in the FILTER clause 
associated with each
+  // aggregate function. For example:
+  // 1.for the AVG (DISTINCT value) GROUP BY key, the grouping expression 
will be [key, value];
+  // 2.for AVG (DISTINCT value) Filter (WHERE value2> 20) GROUP BY key, 
the grouping expression
+  // will be [key, value, value2].
 
 Review comment:
   A query like `SELECT COUNT(DISTINCT a) FILTER (WHERE c > 0), SUM(b)
   FILTER (WHERE d = 0) FROM table` will be planned as
   ```
   Final-AGG-4 (count distinct)
     Shuffle to a single reducer
       PartialMerge-AGG-3 (count distinct, no grouping, apply function COUNT on a with c > 0)
         PartialMerge-AGG-2 (grouping on a and c)
           Shuffle by a and c
             Partial-AGG-1 (grouping on a and c, apply function SUM on b with d = 0)
   ```
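
   A minimal spark-shell sketch for inspecting this plan locally (the view
   name `t` and the toy rows are placeholders for the example's `table`,
   and it assumes a build that already includes this PR's FILTER support,
   DISTINCT case included):
   ```
   // Register a toy view with the example's columns, then print the
   // physical plan of the filtered-distinct query discussed above.
   import spark.implicits._  // `spark` is the spark-shell SparkSession

   Seq((1, 10, 1, 0), (2, 20, -1, 0)).toDF("a", "b", "c", "d")
     .createOrReplaceTempView("t")

   spark.sql(
     """SELECT COUNT(DISTINCT a) FILTER (WHERE c > 0),
       |       SUM(b) FILTER (WHERE d = 0)
       |FROM t""".stripMargin
   ).explain()
   ```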





[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-03 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r353571892
 
 

 ##
 File path: 
sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggUtils.scala
 ##
 @@ -135,19 +135,25 @@ object AggUtils {
 }
 val distinctAttributes = namedDistinctExpressions.map(_.toAttribute)
 val groupingAttributes = groupingExpressions.map(_.toAttribute)
+val filterWithDistinctAttributes = 
functionsWithDistinct.flatMap(_.filterAttributes.toSeq)
 
 // 1. Create an Aggregate Operator for partial aggregations.
 val partialAggregate: SparkPlan = {
   val aggregateExpressions = functionsWithoutDistinct.map(_.copy(mode = 
Partial))
   val aggregateAttributes = aggregateExpressions.map(_.resultAttribute)
   // We will group by the original grouping expression, plus an additional 
expression for the
-  // DISTINCT column. For example, for AVG(DISTINCT value) GROUP BY key, 
the grouping
-  // expressions will be [key, value].
+  // DISTINCT column and the referred attributes in the FILTER clause 
associated with each
+  // aggregate function. For example:
+  // 1.for the AVG (DISTINCT value) GROUP BY key, the grouping expression 
will be [key, value];
+  // 2.for AVG (DISTINCT value) Filter (WHERE value2> 20) GROUP BY key, 
the grouping expression
+  // will be [key, value, value2].
 
 Review comment:
   A query like `SELECT COUNT(DISTINCT a) FILTER (WHERE c > 0), SUM(b)
   FILTER (WHERE d = 0) FROM table` will be planned as
   ```
   Final-AGG-4 (count distinct)
     Shuffle to a single reducer
       PartialMerge-AGG-3 (count distinct, no grouping, apply function COUNT on a with c > 0)
         PartialMerge-AGG-2 (grouping on a and c)
           Shuffle by a
             Partial-AGG-1 (grouping on a and c, apply function SUM on b with d = 0)
   ```





[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-03 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r353571892
 
 

 ##
 File path: 
sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggUtils.scala
 ##
 @@ -135,19 +135,25 @@ object AggUtils {
 }
 val distinctAttributes = namedDistinctExpressions.map(_.toAttribute)
 val groupingAttributes = groupingExpressions.map(_.toAttribute)
+val filterWithDistinctAttributes = 
functionsWithDistinct.flatMap(_.filterAttributes.toSeq)
 
 // 1. Create an Aggregate Operator for partial aggregations.
 val partialAggregate: SparkPlan = {
   val aggregateExpressions = functionsWithoutDistinct.map(_.copy(mode = 
Partial))
   val aggregateAttributes = aggregateExpressions.map(_.resultAttribute)
   // We will group by the original grouping expression, plus an additional 
expression for the
-  // DISTINCT column. For example, for AVG(DISTINCT value) GROUP BY key, 
the grouping
-  // expressions will be [key, value].
+  // DISTINCT column and the referred attributes in the FILTER clause 
associated with each
+  // aggregate function. For example:
+  // 1.for the AVG (DISTINCT value) GROUP BY key, the grouping expression 
will be [key, value];
+  // 2.for AVG (DISTINCT value) Filter (WHERE value2> 20) GROUP BY key, 
the grouping expression
+  // will be [key, value, value2].
 
 Review comment:
   A query like `SELECT COUNT(DISTINCT a) FILTER (WHERE c > 0), SUM(b)
   FILTER (WHERE d = 0) FROM table` will be planned as
   ```
   AGG-4 (count distinct)
     Shuffle to a single reducer
       Partial-AGG-3 (count distinct, no grouping, apply function COUNT on a with c > 0)
         Partial-AGG-2 (grouping on a and c)
           Shuffle by a
             Partial-AGG-1 (grouping on a and c, apply function SUM on b with d = 0)
   ```





[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-03 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r353571892
 
 

 ##
 File path: 
sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggUtils.scala
 ##
 @@ -135,19 +135,25 @@ object AggUtils {
 }
 val distinctAttributes = namedDistinctExpressions.map(_.toAttribute)
 val groupingAttributes = groupingExpressions.map(_.toAttribute)
+val filterWithDistinctAttributes = 
functionsWithDistinct.flatMap(_.filterAttributes.toSeq)
 
 // 1. Create an Aggregate Operator for partial aggregations.
 val partialAggregate: SparkPlan = {
   val aggregateExpressions = functionsWithoutDistinct.map(_.copy(mode = 
Partial))
   val aggregateAttributes = aggregateExpressions.map(_.resultAttribute)
   // We will group by the original grouping expression, plus an additional 
expression for the
-  // DISTINCT column. For example, for AVG(DISTINCT value) GROUP BY key, 
the grouping
-  // expressions will be [key, value].
+  // DISTINCT column and the referred attributes in the FILTER clause 
associated with each
+  // aggregate function. For example:
+  // 1.for the AVG (DISTINCT value) GROUP BY key, the grouping expression 
will be [key, value];
+  // 2.for AVG (DISTINCT value) Filter (WHERE value2> 20) GROUP BY key, 
the grouping expression
+  // will be [key, value, value2].
 
 Review comment:
   A query like `SELECT COUNT(DISTINCT a) FILTER (WHERE c > 0), SUM(b)
   FILTER (WHERE d = 0) FROM table` will be planned as
   ```
   AGG-4 (count distinct)
     Shuffle to a single reducer
       Partial-AGG-3 (count distinct, no grouping, apply function COUNT on a with c > 0)
         Partial-AGG-2 (grouping on a and c)
           Shuffle by a
             Partial-AGG-1 (grouping on a and c, apply function SUM on b with d = 0)
   ```





[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-03 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r353567561
 
 

 ##
 File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/interfaces.scala
 ##
 @@ -119,18 +125,24 @@ case class AggregateExpression(
   normalizedAggFunc.canonicalized.asInstanceOf[AggregateFunction],
   mode,
   isDistinct,
+  filter,
 
 Review comment:
   OK. I will change it.





[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-03 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r353567195
 
 

 ##
 File path: 
sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggUtils.scala
 ##
 @@ -135,19 +135,25 @@ object AggUtils {
 }
 val distinctAttributes = namedDistinctExpressions.map(_.toAttribute)
 val groupingAttributes = groupingExpressions.map(_.toAttribute)
+val filterWithDistinctAttributes = 
functionsWithDistinct.flatMap(_.filterAttributes.toSeq)
 
 // 1. Create an Aggregate Operator for partial aggregations.
 val partialAggregate: SparkPlan = {
   val aggregateExpressions = functionsWithoutDistinct.map(_.copy(mode = 
Partial))
   val aggregateAttributes = aggregateExpressions.map(_.resultAttribute)
   // We will group by the original grouping expression, plus an additional 
expression for the
-  // DISTINCT column. For example, for AVG(DISTINCT value) GROUP BY key, 
the grouping
-  // expressions will be [key, value].
+  // DISTINCT column and the referred attributes in the FILTER clause 
associated with each
+  // aggregate function. For example:
+  // 1.for the AVG (DISTINCT value) GROUP BY key, the grouping expression 
will be [key, value];
+  // 2.for AVG (DISTINCT value) Filter (WHERE value2> 20) GROUP BY key, 
the grouping expression
+  // will be [key, value, value2].
 
 Review comment:
   Because the filter is tied to its aggregate function: the first and
   second aggregates do not apply the aggregate function, so they need to
   output value2. The third aggregate applies the aggregate function with
   input value2.
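
   A minimal, self-contained sketch of the grouping-key construction this
   implies (plain strings stand in for Catalyst attributes; the names
   follow the quoted diff):
   ```
   // The partial aggregates group by the original keys, the DISTINCT
   // attributes, and the attributes referenced by the filters of the
   // distinct aggregate functions, so value2 survives until the third
   // aggregate applies the filtered DISTINCT function.
   val groupingAttributes           = Seq("key")
   val distinctAttributes           = Seq("value")
   val filterWithDistinctAttributes = Seq("value2")

   val partialGroupingKeys =
     groupingAttributes ++ distinctAttributes ++ filterWithDistinctAttributes
   // partialGroupingKeys == Seq("key", "value", "value2")
   ```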





[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-03 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r353549134
 
 

 ##
 File path: 
sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggUtils.scala
 ##
 @@ -135,19 +135,25 @@ object AggUtils {
 }
 val distinctAttributes = namedDistinctExpressions.map(_.toAttribute)
 val groupingAttributes = groupingExpressions.map(_.toAttribute)
+val filterWithDistinctAttributes = 
functionsWithDistinct.flatMap(_.filterAttributes.toSeq)
 
 // 1. Create an Aggregate Operator for partial aggregations.
 val partialAggregate: SparkPlan = {
   val aggregateExpressions = functionsWithoutDistinct.map(_.copy(mode = 
Partial))
   val aggregateAttributes = aggregateExpressions.map(_.resultAttribute)
   // We will group by the original grouping expression, plus an additional 
expression for the
-  // DISTINCT column. For example, for AVG(DISTINCT value) GROUP BY key, 
the grouping
-  // expressions will be [key, value].
+  // DISTINCT column and the referred attributes in the FILTER clause 
associated with each
+  // aggregate function. For example:
+  // 1.for the AVG (DISTINCT value) GROUP BY key, the grouping expression 
will be [key, value];
+  // 2.for AVG (DISTINCT value) Filter (WHERE value2> 20) GROUP BY key, 
the grouping expression
+  // will be [key, value, value2].
 
 Review comment:
   If we do not add value2 to the grouping expressions, the aggregate
   output will not contain the value2 attribute.





[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-03 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r353534696
 
 

 ##
 File path: 
sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggregationIterator.scala
 ##
 @@ -157,19 +157,52 @@ abstract class AggregationIterator(
   inputAttributes: Seq[Attribute]): (InternalRow, InternalRow) => Unit = {
 val joinedRow = new JoinedRow
 if (expressions.nonEmpty) {
-  val mergeExpressions = functions.zip(expressions).flatMap {
-case (ae: DeclarativeAggregate, expression) =>
-  expression.mode match {
-case Partial | Complete => ae.updateExpressions
-case PartialMerge | Final => ae.mergeExpressions
+  val mergeExpressions = functions.zip(expressions.map(ae => (ae.mode, 
ae.filter))).flatMap {
+case (ae: DeclarativeAggregate, (mode, filter)) =>
+  mode match {
+case Partial | Complete =>
+  if (filter.isDefined) {
+ae.updateExpressions.zip(ae.aggBufferAttributes).map {
+  case (newVal, attr) => If(filter.get, newVal, attr)
 
 Review comment:
   Good idea. I will use `updateExpr`.
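
   A rough sketch of what that refactor could look like (a hedged sketch,
   not the final patch; the names mirror the quoted diff):
   ```
   import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, If}

   // Hoist the update expressions into one local value, then wrap each
   // update with the filter predicate so an input row only updates the
   // buffer when the filter passes; otherwise the old buffer value is kept.
   def filteredUpdates(
       updateExpr: Seq[Expression],
       bufferAttrs: Seq[Attribute],
       filter: Option[Expression]): Seq[Expression] = filter match {
     case Some(pred) =>
       updateExpr.zip(bufferAttrs).map { case (newVal, attr) => If(pred, newVal, attr) }
     case None => updateExpr
   }
   ```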





[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-03 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r353526405
 
 

 ##
 File path: 
sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggUtils.scala
 ##
 @@ -135,19 +135,27 @@ object AggUtils {
 }
 val distinctAttributes = namedDistinctExpressions.map(_.toAttribute)
 val groupingAttributes = groupingExpressions.map(_.toAttribute)
+val filterWithDistinctAttributes = 
functionsWithDistinct.flatMap(_.filterAttributes.toSeq)
 
 // 1. Create an Aggregate Operator for partial aggregations.
 val partialAggregate: SparkPlan = {
   val aggregateExpressions = functionsWithoutDistinct.map(_.copy(mode = 
Partial))
   val aggregateAttributes = aggregateExpressions.map(_.resultAttribute)
   // We will group by the original grouping expression, plus an additional 
expression for the
-  // DISTINCT column. For example, for AVG(DISTINCT value) GROUP BY key, 
the grouping
-  // expressions will be [key, value].
+  // DISTINCT column and the expression in the FILTER clause associated 
with each aggregate
+  // function. For example:
+  // 1.for the AVG (DISTINCT value) GROUP BY key, the grouping expression 
will be [key, value];
+  // 2.for the AVG (value) Filter (WHERE value2> 20) GROUP BY key, the 
grouping expression
+  //   will be [key, value2];
+  // 3.for AVG (DISTINCT value) Filter (WHERE value2> 20) GROUP BY key, 
the grouping expression
 
 Review comment:
   Yes, I only added the referred attributes of the filter expression.





[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-03 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r353533967
 
 

 ##
 File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/interfaces.scala
 ##
 @@ -119,18 +125,24 @@ case class AggregateExpression(
   normalizedAggFunc.canonicalized.asInstanceOf[AggregateFunction],
   mode,
   isDistinct,
+  filter,
 
 Review comment:
   I don't quite understand the meaning of canonicalized filter.
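
   For context: in Catalyst, `Expression.canonicalized` returns a
   normalized form of an expression so that semantically equal expressions
   compare equal (cosmetic differences such as expression IDs are erased).
   Presumably the request is to pass the canonicalized filter into the
   canonicalized copy as well, roughly (a sketch, not the merged code):
   ```
   AggregateExpression(
     normalizedAggFunc.canonicalized.asInstanceOf[AggregateFunction],
     mode,
     isDistinct,
     filter.map(_.canonicalized),  // normalize the filter as well
     ExprId(0))
   ```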





[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-03 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r353533676
 
 

 ##
 File path: 
sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggUtils.scala
 ##
 @@ -135,19 +135,27 @@ object AggUtils {
 }
 val distinctAttributes = namedDistinctExpressions.map(_.toAttribute)
 val groupingAttributes = groupingExpressions.map(_.toAttribute)
+val filterWithDistinctAttributes = 
functionsWithDistinct.flatMap(_.filterAttributes.toSeq)
 
 // 1. Create an Aggregate Operator for partial aggregations.
 val partialAggregate: SparkPlan = {
   val aggregateExpressions = functionsWithoutDistinct.map(_.copy(mode = 
Partial))
   val aggregateAttributes = aggregateExpressions.map(_.resultAttribute)
   // We will group by the original grouping expression, plus an additional 
expression for the
-  // DISTINCT column. For example, for AVG(DISTINCT value) GROUP BY key, 
the grouping
-  // expressions will be [key, value].
+  // DISTINCT column and the expression in the FILTER clause associated 
with each aggregate
 
 Review comment:
   OK.





[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-03 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r353532725
 
 

 ##
 File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/interfaces.scala
 ##
 @@ -119,18 +125,24 @@ case class AggregateExpression(
   normalizedAggFunc.canonicalized.asInstanceOf[AggregateFunction],
   mode,
   isDistinct,
+  filter,
   ExprId(0))
   }
 
-  override def children: Seq[Expression] = aggregateFunction :: Nil
+  override def children: Seq[Expression] = filter match {
 
 Review comment:
   `override def children: Seq[Expression] = aggregateFunction ++ filter.toSeq`
   This does not compile.
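
   Since `aggregateFunction` is a single `Expression` rather than a
   collection, `++` is not available on it. A form that should compile
   (a sketch, not necessarily the final patch) is:
   ```
   override def children: Seq[Expression] = aggregateFunction +: filter.toSeq
   ```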





[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-03 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r353532725
 
 

 ##
 File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/interfaces.scala
 ##
 @@ -119,18 +125,24 @@ case class AggregateExpression(
   normalizedAggFunc.canonicalized.asInstanceOf[AggregateFunction],
   mode,
   isDistinct,
+  filter,
   ExprId(0))
   }
 
-  override def children: Seq[Expression] = aggregateFunction :: Nil
+  override def children: Seq[Expression] = filter match {
 
 Review comment:
   `override def children: Seq[Expression] = aggregateFunction :: filter.toSeq :: Nil`
   This does not compile either: `::` would build a list whose second
   element is itself a `Seq`, not a `Seq[Expression]`.





[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-03 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r353526405
 
 

 ##
 File path: 
sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggUtils.scala
 ##
 @@ -135,19 +135,27 @@ object AggUtils {
 }
 val distinctAttributes = namedDistinctExpressions.map(_.toAttribute)
 val groupingAttributes = groupingExpressions.map(_.toAttribute)
+val filterWithDistinctAttributes = functionsWithDistinct.flatMap(_.filterAttributes.toSeq)
 
 // 1. Create an Aggregate Operator for partial aggregations.
 val partialAggregate: SparkPlan = {
   val aggregateExpressions = functionsWithoutDistinct.map(_.copy(mode = Partial))
   val aggregateAttributes = aggregateExpressions.map(_.resultAttribute)
   // We will group by the original grouping expression, plus an additional expression for the
-  // DISTINCT column. For example, for AVG(DISTINCT value) GROUP BY key, the grouping
-  // expressions will be [key, value].
+  // DISTINCT column and the expression in the FILTER clause associated with each aggregate
+  // function. For example:
+  // 1. for AVG(DISTINCT value) GROUP BY key, the grouping expressions will be [key, value];
+  // 2. for AVG(value) FILTER (WHERE value2 > 20) GROUP BY key, the grouping expressions
+  //    will be [key, value2];
+  // 3. for AVG(DISTINCT value) FILTER (WHERE value2 > 20) GROUP BY key, the grouping expressions
 
 Review comment:
   Yes, I added the attributes referenced by the filter expression.
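
   A toy model of the grouping-key computation being discussed (hypothetical, simplified types; the real code works on Catalyst expressions and attributes):

```scala
// Each aggregate call records whether it is DISTINCT and which attributes
// its FILTER clause references.
case class AggCall(distinct: Boolean, filterAttrs: Set[String])

// Partial-aggregate keys = original keys ++ DISTINCT columns ++ attributes
// referenced by the filters of the distinct aggregates.
def partialGroupingKeys(
    groupingKeys: Seq[String],
    distinctColumns: Seq[String],
    functionsWithDistinct: Seq[AggCall]): Seq[String] = {
  val filterWithDistinctAttributes = functionsWithDistinct.flatMap(_.filterAttrs)
  (groupingKeys ++ distinctColumns ++ filterWithDistinctAttributes).distinct
}

// AVG(DISTINCT value) FILTER (WHERE value2 > 20) GROUP BY key:
// partialGroupingKeys(Seq("key"), Seq("value"), Seq(AggCall(true, Set("value2"))))
// => Seq("key", "value", "value2")
```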





[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-03 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r353525269
 
 

 ##
 File path: 
sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggregationIterator.scala
 ##
 @@ -157,19 +157,52 @@ abstract class AggregationIterator(
   inputAttributes: Seq[Attribute]): (InternalRow, InternalRow) => Unit = {
 val joinedRow = new JoinedRow
 if (expressions.nonEmpty) {
-  val mergeExpressions = functions.zip(expressions).flatMap {
-    case (ae: DeclarativeAggregate, expression) =>
-      expression.mode match {
-        case Partial | Complete => ae.updateExpressions
-        case PartialMerge | Final => ae.mergeExpressions
+  val mergeExpressions = functions.zip(expressions.map(ae => (ae.mode, ae.filter))).flatMap {
+    case (ae: DeclarativeAggregate, (mode, filter)) =>
+      mode match {
+        case Partial | Complete =>
+          if (filter.isDefined) {
+            ae.updateExpressions.zip(ae.aggBufferAttributes).map {
+              case (newVal, attr) => If(filter.get, newVal, attr)
+            }
+          } else {
+            ae.updateExpressions
+          }
+        case PartialMerge =>
 
 Review comment:
   @cloud-fan @viirya `PartialMerge` mode needs to evaluate the filter when the aggregate function contains DISTINCT. Yes, the input here is the agg buffers, so I added the referenced attributes at
   https://github.com/apache/spark/blob/4d1413f4c8e05e9bf47e60c955a7031b9aac7e83/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggUtils.scala#L154
   and
   https://github.com/apache/spark/blob/4d1413f4c8e05e9bf47e60c955a7031b9aac7e83/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggUtils.scala#L176
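
   For readers skimming the diff: in the Partial/Complete branch, every buffer update is wrapped in a conditional so that a row contributes only when its FILTER predicate holds. A minimal self-contained sketch of that pattern (a hypothetical helper, not the PR's exact code):

```scala
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, If}

// Each aggregation-buffer slot takes its updated value only when the FILTER
// predicate evaluates to true; otherwise it keeps its previous value.
def guardedUpdates(
    updates: Seq[Expression],    // e.g. ae.updateExpressions
    buffer: Seq[Attribute],      // e.g. ae.aggBufferAttributes
    predicate: Expression): Seq[Expression] =
  updates.zip(buffer).map { case (newVal, attr) => If(predicate, newVal, attr) }
```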





[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-12-03 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r353522095
 
 

 ##
 File path: 
sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggUtils.scala
 ##
 @@ -135,19 +135,27 @@ object AggUtils {
 }
 val distinctAttributes = namedDistinctExpressions.map(_.toAttribute)
 val groupingAttributes = groupingExpressions.map(_.toAttribute)
+val filterWithDistinctAttributes = functionsWithDistinct.flatMap(_.filterAttributes.toSeq)
 
 // 1. Create an Aggregate Operator for partial aggregations.
 val partialAggregate: SparkPlan = {
   val aggregateExpressions = functionsWithoutDistinct.map(_.copy(mode = Partial))
   val aggregateAttributes = aggregateExpressions.map(_.resultAttribute)
   // We will group by the original grouping expression, plus an additional expression for the
-  // DISTINCT column. For example, for AVG(DISTINCT value) GROUP BY key, the grouping
-  // expressions will be [key, value].
+  // DISTINCT column and the expression in the FILTER clause associated with each aggregate
+  // function. For example:
+  // 1. for AVG(DISTINCT value) GROUP BY key, the grouping expressions will be [key, value];
+  // 2. for AVG(value) FILTER (WHERE value2 > 20) GROUP BY key, the grouping expressions
+  //    will be [key, value2];
 
 Review comment:
   I'm sorry! I wrote an incorrect comment. I will change it.





[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-11-29 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r352263300
 
 

 ##
 File path: sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
 ##
 @@ -2835,6 +2835,82 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession {
 checkAnswer(df, Row(1, 3, 4) :: Row(2, 3, 4) :: Row(3, 3, 4) :: Nil)
   }
 
+  test("Support filter clause for aggregate function with hash") {
+Seq(("APPROX_COUNT_DISTINCT(a)", 3), ("COUNT(a)", 3), ("FIRST(a)", 1), 
("LAST(a)", 3),
+  ("MAX(a)", 3), ("AVG(a)", 2.0), ("MIN(a)", 1), ("SUM(a)", 6), 
("PERCENTILE(a, 1)", 3),
+("PERCENTILE_APPROX(a, 0.5, 100)", 2.0), ("COLLECT_LIST(a)", Seq(1, 2, 
3)),
+("COLLECT_SET(a)", Seq(1, 2, 3))).foreach{ funcToResult =>
+  val query = s"SELECT ${funcToResult._1} FILTER (WHERE b > 1) FROM 
testData2"
+  val df = sql(query)
+  val physical = df.queryExecution.sparkPlan
+  val aggregateExpressions = physical.collectFirst {
+case agg : HashAggregateExec => agg.aggregateExpressions
+case agg : ObjectHashAggregateExec => agg.aggregateExpressions
+  }
+  assert (aggregateExpressions.isDefined)
+  assert (aggregateExpressions.get.size == 1)
+  aggregateExpressions.get.foreach{ expr =>
 
 Review comment:
   `group-by-filter.sql` tests the functions `COUNT`, `SUM`, `APPROX_COUNT_DISTINCT`, `MAX`, and `AVG`, but this test covers more functions than those.
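
   For example, one of the extra functions can be exercised in the same way (an editorial sketch; assumes an active SparkSession `spark` and the suite's `testData2` view):

```scala
// Hypothetical spot-check: PERCENTILE with a FILTER clause.
val df = spark.sql("SELECT PERCENTILE(a, 1) FILTER (WHERE b > 1) FROM testData2")
df.show()
```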





[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-11-29 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r352114051
 
 

 ##
 File path: 
sql/core/src/test/resources/sql-tests/results/postgreSQL/aggregates_part3.sql.out
 ##
 @@ -12,11 +12,41 @@ It is not allowed to use an aggregate function in the argument of another aggreg
 
 
 -- !query 1
+select min(unique1) filter (where unique1 > 100) from tenk1
+-- !query 1 schema
+struct
+-- !query 1 output
+101
+
+
+-- !query 2
+select sum(1/ten) filter (where ten > 0) from tenk1
+-- !query 2 schema
+struct
+-- !query 2 output
+1000
+
+
+-- !query 3
+select ten, sum(distinct four) filter (where four > 10) from onek a
+group by ten
+having exists (select 1 from onek b where sum(distinct a.four) = b.four)
+-- !query 3 schema
+struct
+-- !query 3 output
+0  2
+2  2
+4  2
+6  2
+8  2
+
 
 Review comment:
   I have resolved this issue.





[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-11-29 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r352074504
 
 

 ##
 File path: 
sql/core/src/test/resources/sql-tests/results/postgreSQL/aggregates_part3.sql.out
 ##
 @@ -12,11 +12,41 @@ It is not allowed to use an aggregate function in the argument of another aggreg
 
 
 -- !query 1
+select min(unique1) filter (where unique1 > 100) from tenk1
+-- !query 1 schema
+struct
+-- !query 1 output
+101
+
+
+-- !query 2
+select sum(1/ten) filter (where ten > 0) from tenk1
+-- !query 2 schema
+struct
+-- !query 2 output
+1000
+
+
+-- !query 3
+select ten, sum(distinct four) filter (where four > 10) from onek a
+group by ten
+having exists (select 1 from onek b where sum(distinct a.four) = b.four)
+-- !query 3 schema
+struct
+-- !query 3 output
+0  2
+2  2
+4  2
+6  2
+8  2
+
 
 Review comment:
   I will check this issue.





[GitHub] [spark] beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

2019-11-28 Thread GitBox
beliefer commented on a change in pull request #26656: [SPARK-27986][SQL] 
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r351657697
 
 

 ##
 File path: 
sql/core/src/test/resources/sql-tests/results/group-by-filter.sql.out
 ##
 @@ -0,0 +1,332 @@
+-- Automatically generated by SQLQueryTestSuite
+-- Number of queries: 27
+
+
+-- !query 0
+CREATE OR REPLACE TEMPORARY VIEW testData AS SELECT * FROM VALUES
+(1, 1), (1, 2), (2, 1), (2, 2), (3, 1), (3, 2), (null, 1), (3, null), (null, null)
+AS testData(a, b)
+-- !query 0 schema
+struct<>
+-- !query 0 output
+
+
+
+-- !query 1
+SELECT a, COUNT(b) FILTER (WHERE a >= 2) FROM testData
+-- !query 1 schema
+struct<>
+-- !query 1 output
+org.apache.spark.sql.AnalysisException
+grouping expressions sequence is empty, and 'testdata.`a`' is not an aggregate function. Wrap '(count(testdata.`b`) AS `count(b)`)' in windowing function(s) or wrap 'testdata.`a`' in first() (or first_value) if you don't care which value you get.;
+
+
+-- !query 2
+SELECT COUNT(a) FILTER (WHERE a = 1), COUNT(b) FILTER (WHERE a > 1) FROM testData
+-- !query 2 schema
+struct
+-- !query 2 output
+2  4
+
+
+-- !query 3
+SELECT a, COUNT(b) FILTER (WHERE a >= 2) FROM testData GROUP BY a
+-- !query 3 schema
+struct
+-- !query 3 output
+1  0
+2  2
+3  2
+NULL   0
+
+
+-- !query 4
+SELECT a, COUNT(b) FILTER (WHERE a != 2) FROM testData GROUP BY b
+-- !query 4 schema
+struct<>
+-- !query 4 output
+org.apache.spark.sql.AnalysisException
+expression 'testdata.`a`' is neither present in the group by, nor is it an aggregate function. Add to group by or wrap in first() (or first_value) if you don't care which value you get.;
+
+
+-- !query 5
+SELECT COUNT(a) FILTER (WHERE a >= 0), COUNT(b) FILTER (WHERE a >= 3) FROM testData GROUP BY a
+-- !query 5 schema
+struct
+-- !query 5 output
+0  0
+2  0
+2  0
+3  2
+
+
+-- !query 6
+SELECT 'foo', COUNT(a) FILTER (WHERE b <= 2) FROM testData GROUP BY 1
+-- !query 6 schema
+struct
+-- !query 6 output
+foo  6
+
+
+-- !query 7
+SELECT 'foo', APPROX_COUNT_DISTINCT(a) FILTER (WHERE b >= 0) FROM testData WHERE a = 0 GROUP BY 1
+-- !query 7 schema
+struct
+-- !query 7 output
+
+
+
+-- !query 8
+SELECT 'foo', MAX(STRUCT(a)) FILTER (WHERE b >= 1) FROM testData WHERE a = 0 GROUP BY 1
+-- !query 8 schema
+struct>
+-- !query 8 output
+
+
+
+-- !query 9
+SELECT a + b, COUNT(b) FILTER (WHERE b >= 2) FROM testData GROUP BY a + b
+-- !query 9 schema
+struct<(a + b):int,count(b):bigint>
+-- !query 9 output
+2  0
+3  1
+4  1
+5  1
+NULL   0
+
+
+-- !query 10
+SELECT a + 2, COUNT(b) FILTER (WHERE b IN (1, 2)) FROM testData GROUP BY a + 1
+-- !query 10 schema
+struct<>
+-- !query 10 output
+org.apache.spark.sql.AnalysisException
+expression 'testdata.`a`' is neither present in the group by, nor is it an aggregate function. Add to group by or wrap in first() (or first_value) if you don't care which value you get.;
+
+
+-- !query 11
+SELECT a + 1 + 1, COUNT(b) FILTER (WHERE b > 0) FROM testData GROUP BY a + 1
+-- !query 11 schema
+struct<((a + 1) + 1):int,count(b):bigint>
+-- !query 11 output
+3  2
+4  2
+5  2
+NULL   1
+
+
+-- !query 12
+SELECT COUNT(DISTINCT b) FILTER (WHERE b > 0), COUNT(DISTINCT b, c) FILTER (WHERE b > 0 AND c > 2)
+FROM (SELECT 1 AS a, 2 AS b, 3 AS c) GROUP BY a
+-- !query 12 schema
+struct
+-- !query 12 output
+1  1
+
+
+-- !query 13
+SELECT a AS k, COUNT(b) FILTER (WHERE b = 1 OR b = 2) FROM testData GROUP BY k
+-- !query 13 schema
+struct
+-- !query 13 output
+1  2
+2  2
+3  2
+NULL   1
+
+
+-- !query 14
+SELECT a AS k, COUNT(b) FILTER (WHERE NOT b < 0) FROM testData GROUP BY k HAVING k > 1
+-- !query 14 schema
+struct
+-- !query 14 output
+2  2
+3  2
+
+
+-- !query 15
+SELECT COUNT(b) FILTER (WHERE a > 0) AS k FROM testData GROUP BY k
+-- !query 15 schema
+struct<>
+-- !query 15 output
+org.apache.spark.sql.AnalysisException
+aggregate functions are not allowed in GROUP BY, but found count(testdata.`b`);
+
+
+-- !query 16
+SELECT a AS k, COUNT(b) FILTER (WHERE b > 0) FROM testData GROUP BY k
+-- !query 16 schema
+struct
+-- !query 16 output
+1  2
+2  2
+3  2
+NULL   1
+
+
+-- !query 17
+SELECT a, COUNT(1) FILTER (WHERE b > 1) FROM testData WHERE false GROUP BY a
+-- !query 17 schema
+struct
+-- !query 17 output
+
+
+
+-- !query 18
+SELECT COUNT(1) FILTER (WHERE b = 2) FROM testData WHERE false
+-- !query 18 schema
+struct
+-- !query 18 output
+0
+
+
+-- !query 19
+SELECT 1 FROM (SELECT COUNT(1) FILTER (WHERE a >= 3 OR b <= 1) FROM testData WHERE false) t
+-- !query 19 schema
+struct<1:int>
+-- !query 19 output
+1
+
+
+-- !query 20
+CREATE TEMPORARY VIEW EMP AS SELECT * FROM VALUES
+  (100, "emp 1", date "2005-01-01", 100.00D, 10),
+  (100, "emp 1", date "2005-01-01", 100.00D, 10),
+  (200, "emp 2", date "2003-01-01", 200.00D, 10),
+  (300, "emp 3", date "2002-01-01", 300.
