beliefer commented on a change in pull request #26656: [SPARK-27986][SQL]
Support ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r360751512
##########
File path:
sql/core/src/test/resources/sql-tests/results/group-by-filter.sql.out
##########
@@ -0,0 +1,569 @@
+-- Automatically generated by SQLQueryTestSuite
+-- Number of queries: 47
+
+
+-- !query 0
+CREATE OR REPLACE TEMPORARY VIEW testData AS SELECT * FROM VALUES
+(1, 1), (1, 2), (2, 1), (2, 2), (3, 1), (3, 2), (null, 1), (3, null), (null,
null)
+AS testData(a, b)
+-- !query 0 schema
+struct<>
+-- !query 0 output
+
+
+
+-- !query 1
+CREATE OR REPLACE TEMPORARY VIEW EMP AS SELECT * FROM VALUES
+ (100, "emp 1", date "2005-01-01", 100.00D, 10),
+ (100, "emp 1", date "2005-01-01", 100.00D, 10),
+ (200, "emp 2", date "2003-01-01", 200.00D, 10),
+ (300, "emp 3", date "2002-01-01", 300.00D, 20),
+ (400, "emp 4", date "2005-01-01", 400.00D, 30),
+ (500, "emp 5", date "2001-01-01", 400.00D, NULL),
+ (600, "emp 6 - no dept", date "2001-01-01", 400.00D, 100),
+ (700, "emp 7", date "2010-01-01", 400.00D, 100),
+ (800, "emp 8", date "2016-01-01", 150.00D, 70)
+AS EMP(id, emp_name, hiredate, salary, dept_id)
+-- !query 1 schema
+struct<>
+-- !query 1 output
+
+
+
+-- !query 2
+CREATE OR REPLACE TEMPORARY VIEW DEPT AS SELECT * FROM VALUES
+ (10, "dept 1", "CA"),
+ (20, "dept 2", "NY"),
+ (30, "dept 3", "TX"),
+ (40, "dept 4 - unassigned", "OR"),
+ (50, "dept 5 - unassigned", "NJ"),
+ (70, "dept 7", "FL")
+AS DEPT(dept_id, dept_name, state)
+-- !query 2 schema
+struct<>
+-- !query 2 output
+
+
+
+-- !query 3
+SELECT a, COUNT(b) FILTER (WHERE a >= 2) FROM testData
+-- !query 3 schema
+struct<>
+-- !query 3 output
+org.apache.spark.sql.AnalysisException
+grouping expressions sequence is empty, and 'testdata.`a`' is not an aggregate
function. Wrap '(count(testdata.`b`) AS `count(b)`)' in windowing function(s)
or wrap 'testdata.`a`' in first() (or first_value) if you don't care which
value you get.;
+
+
+-- !query 4
+SELECT COUNT(a) FILTER (WHERE a = 1), COUNT(b) FILTER (WHERE a > 1) FROM
testData
+-- !query 4 schema
+struct<count(a):bigint,count(b):bigint>
+-- !query 4 output
+2 4
+
+
+-- !query 5
+SELECT COUNT(id) FILTER (WHERE hiredate = date "2001-01-01") FROM emp
+-- !query 5 schema
+struct<count(id):bigint>
+-- !query 5 output
+2
+
+
+-- !query 6
+SELECT COUNT(id) FILTER (WHERE hiredate = to_date('2001-01-01 00:00:00')) FROM
emp
+-- !query 6 schema
+struct<count(id):bigint>
+-- !query 6 output
+2
+
+
+-- !query 7
+SELECT COUNT(id) FILTER (WHERE hiredate = to_timestamp("2001-01-01 00:00:00"))
FROM emp
+-- !query 7 schema
+struct<count(id):bigint>
+-- !query 7 output
+2
+
+
+-- !query 8
+SELECT COUNT(id) FILTER (WHERE date_format(hiredate, "yyyy-MM-dd") =
"2001-01-01") FROM emp
+-- !query 8 schema
+struct<count(id):bigint>
+-- !query 8 output
+2
+
+
+-- !query 9
+SELECT a, COUNT(b) FILTER (WHERE a >= 2) FROM testData GROUP BY a
+-- !query 9 schema
+struct<a:int,count(b):bigint>
+-- !query 9 output
+1 0
+2 2
+3 2
+NULL 0
+
+
+-- !query 10
+SELECT a, COUNT(b) FILTER (WHERE a != 2) FROM testData GROUP BY b
+-- !query 10 schema
+struct<>
+-- !query 10 output
+org.apache.spark.sql.AnalysisException
+expression 'testdata.`a`' is neither present in the group by, nor is it an
aggregate function. Add to group by or wrap in first() (or first_value) if you
don't care which value you get.;
+
+
+-- !query 11
+SELECT COUNT(a) FILTER (WHERE a >= 0), COUNT(b) FILTER (WHERE a >= 3) FROM
testData GROUP BY a
+-- !query 11 schema
+struct<count(a):bigint,count(b):bigint>
+-- !query 11 output
+0 0
+2 0
+2 0
+3 2
+
+
+-- !query 12
+SELECT dept_id, SUM(salary) FILTER (WHERE hiredate > date "2003-01-01") FROM
emp GROUP BY dept_id
+-- !query 12 schema
+struct<dept_id:int,sum(salary):double>
+-- !query 12 output
+10 200.0
+100 400.0
+20 NULL
+30 400.0
+70 150.0
+NULL NULL
+
+
+-- !query 13
+SELECT dept_id, SUM(salary) FILTER (WHERE hiredate > to_date("2003-01-01"))
FROM emp GROUP BY dept_id
+-- !query 13 schema
+struct<dept_id:int,sum(salary):double>
+-- !query 13 output
+10 200.0
+100 400.0
+20 NULL
+30 400.0
+70 150.0
+NULL NULL
+
+
+-- !query 14
+SELECT dept_id, SUM(salary) FILTER (WHERE hiredate > to_timestamp("2003-01-01
00:00:00")) FROM emp GROUP BY dept_id
+-- !query 14 schema
+struct<dept_id:int,sum(salary):double>
+-- !query 14 output
+10 200.0
+100 400.0
+20 NULL
+30 400.0
+70 150.0
+NULL NULL
+
+
+-- !query 15
+SELECT dept_id, SUM(salary) FILTER (WHERE date_format(hiredate, "yyyy-MM-dd")
> "2003-01-01") FROM emp GROUP BY dept_id
+-- !query 15 schema
+struct<dept_id:int,sum(salary):double>
+-- !query 15 output
+10 200.0
+100 400.0
+20 NULL
+30 400.0
+70 150.0
+NULL NULL
+
+
+-- !query 16
+SELECT 'foo', COUNT(a) FILTER (WHERE b <= 2) FROM testData GROUP BY 1
+-- !query 16 schema
+struct<foo:string,count(a):bigint>
+-- !query 16 output
+foo 6
+
+
+-- !query 17
+SELECT 'foo', SUM(salary) FILTER (WHERE hiredate >= date "2003-01-01") FROM
emp GROUP BY 1
+-- !query 17 schema
+struct<foo:string,sum(salary):double>
+-- !query 17 output
+foo 1350.0
+
+
+-- !query 18
+SELECT 'foo', SUM(salary) FILTER (WHERE hiredate >= to_date("2003-01-01"))
FROM emp GROUP BY 1
+-- !query 18 schema
+struct<foo:string,sum(salary):double>
+-- !query 18 output
+foo 1350.0
+
+
+-- !query 19
+SELECT 'foo', SUM(salary) FILTER (WHERE hiredate >=
to_timestamp("2003-01-01")) FROM emp GROUP BY 1
+-- !query 19 schema
+struct<foo:string,sum(salary):double>
+-- !query 19 output
+foo 1350.0
+
+
+-- !query 20
+select dept_id, count(distinct emp_name), count(distinct hiredate),
sum(salary), sum(salary) filter (where id > 200) from emp group by dept_id
+-- !query 20 schema
+struct<dept_id:int,count(DISTINCT emp_name):bigint,count(DISTINCT
hiredate):bigint,sum(salary):double,sum(salary):double>
+-- !query 20 output
+10 2 2 400.0 NULL
+100 2 2 800.0 800.0
+20 1 1 300.0 300.0
+30 1 1 400.0 400.0
+70 1 1 150.0 150.0
+NULL 1 1 400.0 400.0
+
+
+-- !query 21
+select dept_id, count(distinct emp_name), count(distinct hiredate),
sum(salary) filter (where salary < 400.00D), sum(salary) filter (where id >
200) from emp group by dept_id
+-- !query 21 schema
+struct<dept_id:int,count(DISTINCT emp_name):bigint,count(DISTINCT
hiredate):bigint,sum(salary):double,sum(salary):double>
+-- !query 21 output
+10 2 2 400.0 NULL
+100 2 2 NULL 800.0
+20 1 1 300.0 300.0
+30 1 1 NULL 400.0
+70 1 1 150.0 150.0
+NULL 1 1 NULL 400.0
+
+
+-- !query 22
+SELECT 'foo', APPROX_COUNT_DISTINCT(a) FILTER (WHERE b >= 0) FROM testData
WHERE a = 0 GROUP BY 1
+-- !query 22 schema
+struct<foo:string,approx_count_distinct(a):bigint>
+-- !query 22 output
+
+
+
+-- !query 23
+SELECT 'foo', MAX(STRUCT(a)) FILTER (WHERE b >= 1) FROM testData WHERE a = 0
GROUP BY 1
+-- !query 23 schema
+struct<foo:string,max(named_struct(a, a)):struct<a:int>>
+-- !query 23 output
+
+
+
+-- !query 24
+SELECT a + b, COUNT(b) FILTER (WHERE b >= 2) FROM testData GROUP BY a + b
+-- !query 24 schema
+struct<(a + b):int,count(b):bigint>
+-- !query 24 output
+2 0
+3 1
+4 1
+5 1
+NULL 0
+
+
+-- !query 25
+SELECT a + 2, COUNT(b) FILTER (WHERE b IN (1, 2)) FROM testData GROUP BY a + 1
+-- !query 25 schema
+struct<>
+-- !query 25 output
+org.apache.spark.sql.AnalysisException
+expression 'testdata.`a`' is neither present in the group by, nor is it an
aggregate function. Add to group by or wrap in first() (or first_value) if you
don't care which value you get.;
+
+
+-- !query 26
+SELECT a + 1 + 1, COUNT(b) FILTER (WHERE b > 0) FROM testData GROUP BY a + 1
+-- !query 26 schema
+struct<((a + 1) + 1):int,count(b):bigint>
+-- !query 26 output
+3 2
+4 2
+5 2
+NULL 1
+
+
+-- !query 27
+SELECT a AS k, COUNT(b) FILTER (WHERE b = 1 OR b = 2) FROM testData GROUP BY k
+-- !query 27 schema
+struct<k:int,count(b):bigint>
+-- !query 27 output
+1 2
+2 2
+3 2
+NULL 1
+
+
+-- !query 28
+SELECT dept_id as k,
+ SUM(salary) FILTER (WHERE hiredate < date "2005-01-01" OR hiredate >
date "2010-01-01")
+FROM emp GROUP BY k
+-- !query 28 schema
+struct<k:int,sum(salary):double>
+-- !query 28 output
+10 200.0
+100 400.0
+20 300.0
+30 NULL
+70 150.0
+NULL 400.0
+
+
+-- !query 29
+SELECT dept_id as k,
+ SUM(salary) FILTER (WHERE hiredate < to_date("2005-01-01") OR hiredate
> to_date("2010-01-01"))
+FROM emp GROUP BY k
+-- !query 29 schema
+struct<k:int,sum(salary):double>
+-- !query 29 output
+10 200.0
+100 400.0
+20 300.0
+30 NULL
+70 150.0
+NULL 400.0
+
+
+-- !query 30
+SELECT dept_id as k,
+ SUM(salary) FILTER (WHERE hiredate < to_timestamp("2005-01-01") OR
hiredate > to_timestamp("2010-01-01 00:00:00"))
+FROM emp GROUP BY k
+-- !query 30 schema
+struct<k:int,sum(salary):double>
+-- !query 30 output
+10 200.0
+100 400.0
+20 300.0
+30 NULL
+70 150.0
+NULL 400.0
+
+
+-- !query 31
+SELECT a AS k, COUNT(b) FILTER (WHERE NOT b < 0) FROM testData GROUP BY k
HAVING k > 1
+-- !query 31 schema
+struct<k:int,count(b):bigint>
+-- !query 31 output
+2 2
+3 2
+
+
+-- !query 32
+SELECT dept_id AS k, AVG(salary) FILTER (WHERE NOT hiredate <= date
"2005-01-01")
+FROM emp GROUP BY k HAVING k < 70
+-- !query 32 schema
+struct<k:int,avg(salary):double>
+-- !query 32 output
+10 NULL
+20 NULL
+30 NULL
+
+
+-- !query 33
+SELECT dept_id AS k, AVG(salary) FILTER (WHERE NOT hiredate <=
to_date("2005-01-01"))
+FROM emp GROUP BY k HAVING k < 70
+-- !query 33 schema
+struct<k:int,avg(salary):double>
+-- !query 33 output
+10 NULL
+20 NULL
+30 NULL
+
+
+-- !query 34
+SELECT dept_id AS k, AVG(salary) FILTER (WHERE NOT hiredate <=
to_timestamp("2005-01-01 00:00:00"))
+FROM emp GROUP BY k HAVING k < 70
+-- !query 34 schema
+struct<k:int,avg(salary):double>
+-- !query 34 output
+10 NULL
+20 NULL
+30 NULL
+
+
+-- !query 35
+SELECT COUNT(b) FILTER (WHERE a > 0) AS k FROM testData GROUP BY k
+-- !query 35 schema
+struct<>
+-- !query 35 output
+org.apache.spark.sql.AnalysisException
+aggregate functions are not allowed in GROUP BY, but found count(testdata.`b`);
+
+
+-- !query 36
+SELECT a AS k, COUNT(b) FILTER (WHERE b > 0) FROM testData GROUP BY k
+-- !query 36 schema
+struct<k:int,count(b):bigint>
+-- !query 36 output
+1 2
+2 2
+3 2
+NULL 1
+
+
+-- !query 37
+SELECT a, COUNT(1) FILTER (WHERE b > 1) FROM testData WHERE false GROUP BY a
+-- !query 37 schema
+struct<a:int,count(1):bigint>
+-- !query 37 output
+
+
+
+-- !query 38
+SELECT COUNT(1) FILTER (WHERE b = 2) FROM testData WHERE false
+-- !query 38 schema
+struct<count(1):bigint>
+-- !query 38 output
+0
+
+
+-- !query 39
+SELECT 1 FROM (SELECT COUNT(1) FILTER (WHERE a >= 3 OR b <= 1) FROM testData
WHERE false) t
+-- !query 39 schema
+struct<1:int>
+-- !query 39 output
+1
+
+
+-- !query 40
+SELECT emp.dept_id,
+ avg(salary),
+ avg(salary) FILTER (WHERE id > (SELECT 200))
+FROM emp
+GROUP BY dept_id
+-- !query 40 schema
+struct<dept_id:int,avg(salary):double,avg(salary):double>
+-- !query 40 output
+10 133.33333333333334 NULL
+100 400.0 400.0
+20 300.0 300.0
+30 400.0 400.0
+70 150.0 150.0
+NULL 400.0 400.0
+
+
+-- !query 41
+SELECT emp.dept_id,
+ avg(salary),
+ avg(salary) FILTER (WHERE emp.dept_id = (SELECT dept_id FROM dept LIMIT
1))
+FROM emp
+GROUP BY dept_id
+-- !query 41 schema
+struct<dept_id:int,avg(salary):double,avg(salary):double>
+-- !query 41 output
+10 133.33333333333334 133.33333333333334
+100 400.0 NULL
+20 300.0 NULL
+30 400.0 NULL
+70 150.0 NULL
+NULL 400.0 NULL
+
+
+-- !query 42
+SELECT emp.dept_id,
+ avg(salary),
+ avg(salary) FILTER (WHERE EXISTS (SELECT state
+ FROM dept
+ WHERE dept.dept_id = emp.dept_id))
+FROM emp
+GROUP BY dept_id
+-- !query 42 schema
+struct<>
+-- !query 42 output
+org.apache.spark.sql.AnalysisException
Review comment:
OK
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
With regards,
Apache Git Services
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]