[GitHub] [spark] maropu commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

GitBox Thu, 28 Nov 2019 15:59:05 -0800

maropu commented on a change in pull request #26656: [SPARK-27986][SQL] Support 
ANSI SQL filter clause for aggregate expression
URL: https://github.com/apache/spark/pull/26656#discussion_r351945670


 ##########
 File path: 
sql/core/src/test/resources/sql-tests/results/group-by-filter.sql.out
 ##########
 @@ -0,0 +1,332 @@
+-- Automatically generated by SQLQueryTestSuite
+-- Number of queries: 27
+
+
+-- !query 0
+CREATE OR REPLACE TEMPORARY VIEW testData AS SELECT * FROM VALUES
+(1, 1), (1, 2), (2, 1), (2, 2), (3, 1), (3, 2), (null, 1), (3, null), (null, 
null)
+AS testData(a, b)
+-- !query 0 schema
+struct<>
+-- !query 0 output
+
+
+
+-- !query 1
+SELECT a, COUNT(b) FILTER (WHERE a >= 2) FROM testData
+-- !query 1 schema
+struct<>
+-- !query 1 output
+org.apache.spark.sql.AnalysisException
+grouping expressions sequence is empty, and 'testdata.`a`' is not an aggregate 
function. Wrap '(count(testdata.`b`) AS `count(b)`)' in windowing function(s) 
or wrap 'testdata.`a`' in first() (or first_value) if you don't care which 
value you get.;
+
+
+-- !query 2
+SELECT COUNT(a) FILTER (WHERE a = 1), COUNT(b) FILTER (WHERE a > 1) FROM 
testData
+-- !query 2 schema
+struct<count(a):bigint,count(b):bigint>
+-- !query 2 output
+2      4
+
+
+-- !query 3
+SELECT a, COUNT(b) FILTER (WHERE a >= 2) FROM testData GROUP BY a
+-- !query 3 schema
+struct<a:int,count(b):bigint>
+-- !query 3 output
+1      0
+2      2
+3      2
+NULL   0
+
+
+-- !query 4
+SELECT a, COUNT(b) FILTER (WHERE a != 2) FROM testData GROUP BY b
+-- !query 4 schema
+struct<>
+-- !query 4 output
+org.apache.spark.sql.AnalysisException
+expression 'testdata.`a`' is neither present in the group by, nor is it an 
aggregate function. Add to group by or wrap in first() (or first_value) if you 
don't care which value you get.;
+
+
+-- !query 5
+SELECT COUNT(a) FILTER (WHERE a >= 0), COUNT(b) FILTER (WHERE a >= 3) FROM 
testData GROUP BY a
+-- !query 5 schema
+struct<count(a):bigint,count(b):bigint>
+-- !query 5 output
+0      0
+2      0
+2      0
+3      2
+
+
+-- !query 6
+SELECT 'foo', COUNT(a) FILTER (WHERE b <= 2) FROM testData GROUP BY 1
+-- !query 6 schema
+struct<foo:string,count(a):bigint>
+-- !query 6 output
+foo    6
+
+
+-- !query 7
+SELECT 'foo', APPROX_COUNT_DISTINCT(a) FILTER (WHERE b >= 0) FROM testData 
WHERE a = 0 GROUP BY 1
+-- !query 7 schema
+struct<foo:string,approx_count_distinct(a):bigint>
+-- !query 7 output
+
+
+
+-- !query 8
+SELECT 'foo', MAX(STRUCT(a)) FILTER (WHERE b >= 1) FROM testData WHERE a = 0 
GROUP BY 1
+-- !query 8 schema
+struct<foo:string,max(named_struct(a, a)):struct<a:int>>
+-- !query 8 output
+
+
+
+-- !query 9
+SELECT a + b, COUNT(b) FILTER (WHERE b >= 2) FROM testData GROUP BY a + b
+-- !query 9 schema
+struct<(a + b):int,count(b):bigint>
+-- !query 9 output
+2      0
+3      1
+4      1
+5      1
+NULL   0
+
+
+-- !query 10
+SELECT a + 2, COUNT(b) FILTER (WHERE b IN (1, 2)) FROM testData GROUP BY a + 1
+-- !query 10 schema
+struct<>
+-- !query 10 output
+org.apache.spark.sql.AnalysisException
+expression 'testdata.`a`' is neither present in the group by, nor is it an 
aggregate function. Add to group by or wrap in first() (or first_value) if you 
don't care which value you get.;
+
+
+-- !query 11
+SELECT a + 1 + 1, COUNT(b) FILTER (WHERE b > 0) FROM testData GROUP BY a + 1
+-- !query 11 schema
+struct<((a + 1) + 1):int,count(b):bigint>
+-- !query 11 output
+3      2
+4      2
+5      2
+NULL   1
+
+
+-- !query 12
+SELECT COUNT(DISTINCT b) FILTER (WHERE b > 0), COUNT(DISTINCT b, c) FILTER 
(WHERE b > 0 AND c > 2)
+FROM (SELECT 1 AS a, 2 AS b, 3 AS c) GROUP BY a
+-- !query 12 schema
+struct<count(DISTINCT b):bigint,count(DISTINCT b, c):bigint>
+-- !query 12 output
+1      1
+
+
+-- !query 13
+SELECT a AS k, COUNT(b) FILTER (WHERE b = 1 OR b = 2) FROM testData GROUP BY k
+-- !query 13 schema
+struct<k:int,count(b):bigint>
+-- !query 13 output
+1      2
+2      2
+3      2
+NULL   1
+
+
+-- !query 14
+SELECT a AS k, COUNT(b) FILTER (WHERE NOT b < 0) FROM testData GROUP BY k 
HAVING k > 1
+-- !query 14 schema
+struct<k:int,count(b):bigint>
+-- !query 14 output
+2      2
+3      2
+
+
+-- !query 15
+SELECT COUNT(b) FILTER (WHERE a > 0) AS k FROM testData GROUP BY k
+-- !query 15 schema
+struct<>
+-- !query 15 output
+org.apache.spark.sql.AnalysisException
+aggregate functions are not allowed in GROUP BY, but found count(testdata.`b`);
+
+
+-- !query 16
+SELECT a AS k, COUNT(b) FILTER (WHERE b > 0) FROM testData GROUP BY k
+-- !query 16 schema
+struct<k:int,count(b):bigint>
+-- !query 16 output
+1      2
+2      2
+3      2
+NULL   1
+
+
+-- !query 17
+SELECT a, COUNT(1) FILTER (WHERE b > 1) FROM testData WHERE false GROUP BY a
+-- !query 17 schema
+struct<a:int,count(1):bigint>
+-- !query 17 output
+
+
+
+-- !query 18
+SELECT COUNT(1) FILTER (WHERE b = 2) FROM testData WHERE false
+-- !query 18 schema
+struct<count(1):bigint>
+-- !query 18 output
+0
+
+
+-- !query 19
+SELECT 1 FROM (SELECT COUNT(1) FILTER (WHERE a >= 3 OR b <= 1) FROM testData 
WHERE false) t
+-- !query 19 schema
+struct<1:int>
+-- !query 19 output
+1
+
+
+-- !query 20
+CREATE TEMPORARY VIEW EMP AS SELECT * FROM VALUES
+  (100, "emp 1", date "2005-01-01", 100.00D, 10),
+  (100, "emp 1", date "2005-01-01", 100.00D, 10),
+  (200, "emp 2", date "2003-01-01", 200.00D, 10),
+  (300, "emp 3", date "2002-01-01", 300.00D, 20),
+  (400, "emp 4", date "2005-01-01", 400.00D, 30),
+  (500, "emp 5", date "2001-01-01", 400.00D, NULL),
+  (600, "emp 6 - no dept", date "2001-01-01", 400.00D, 100),
+  (700, "emp 7", date "2010-01-01", 400.00D, 100),
+  (800, "emp 8", date "2016-01-01", 150.00D, 70)
+AS EMP(id, emp_name, hiredate, salary, dept_id)
+-- !query 20 schema
+struct<>
+-- !query 20 output
+
+
+
+-- !query 21
+CREATE TEMPORARY VIEW DEPT AS SELECT * FROM VALUES
+  (10, "dept 1", "CA"),
+  (20, "dept 2", "NY"),
+  (30, "dept 3", "TX"),
+  (40, "dept 4 - unassigned", "OR"),
+  (50, "dept 5 - unassigned", "NJ"),
+  (70, "dept 7", "FL")
+AS DEPT(dept_id, dept_name, state)
+-- !query 21 schema
+struct<>
+-- !query 21 output
+
+
+
+-- !query 22
+SELECT emp.dept_id,
+       avg(salary),
+       avg(salary) FILTER (WHERE EXISTS (SELECT state
+               FROM dept
+               WHERE dept.dept_id = emp.dept_id))
+FROM emp
+GROUP BY dept_id
+-- !query 22 schema
+struct<>
+-- !query 22 output
+org.apache.spark.sql.AnalysisException
+IN/EXISTS predicate sub-queries can only be used in Filter/Join and a few 
commands: Aggregate [dept_id#x], [dept_id#x, avg(salary#x) AS avg(salary)#x, 
avg(salary#x) AS avg(salary)#x]
 
 Review comment:
   Ah, I see. Thanks. Then, yes, I personally think its ok not to support 
subqueries in this pr. Can you throw an exception with an explicit error 
message in `CheckAnalysis`?  cc: @gatorsmile @cloud-fan 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


With regards,
Apache Git Services

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] [spark] maropu commented on a change in pull request #26656: [SPARK-27986][SQL] Support ANSI SQL filter clause for aggregate expression

Reply via email to