[GitHub] [spark] HyukjinKwon commented on a change in pull request #25098: [SPARK-28280][SQL][PYTHON][TESTS] Convert and port 'group-by.sql' into UDF test base

GitBox Thu, 18 Jul 2019 03:55:28 -0700

HyukjinKwon commented on a change in pull request #25098: 
[SPARK-28280][SQL][PYTHON][TESTS] Convert and port 'group-by.sql' into UDF test 
base
URL: https://github.com/apache/spark/pull/25098#discussion_r304855228


 ##########
 File path: 
sql/core/src/test/resources/sql-tests/results/udf/udf-group-by.sql.out
 ##########
 @@ -0,0 +1,528 @@
+-- Automatically generated by SQLQueryTestSuite
+-- Number of queries: 52
+
+
+-- !query 0
+CREATE OR REPLACE TEMPORARY VIEW testData AS SELECT * FROM VALUES
+(1, 1), (1, 2), (2, 1), (2, 2), (3, 1), (3, 2), (null, 1), (3, null), (null, 
null)
+AS testData(a, b)
+-- !query 0 schema
+struct<>
+-- !query 0 output
+
+
+
+-- !query 1
+SELECT udf(a), udf(COUNT(b)) FROM testData
+-- !query 1 schema
+struct<>
+-- !query 1 output
+org.apache.spark.sql.AnalysisException
+grouping expressions sequence is empty, and 'testdata.`a`' is not an aggregate 
function. Wrap '(CAST(udf(cast(count(b) as string)) AS BIGINT) AS 
`CAST(udf(cast(count(b) as string)) AS BIGINT)`)' in windowing function(s) or 
wrap 'testdata.`a`' in first() (or first_value) if you don't care which value 
you get.;
+
+
+-- !query 2
+SELECT COUNT(udf(a)), udf(COUNT(b)) FROM testData
+-- !query 2 schema
+struct<count(CAST(udf(cast(a as string)) AS 
INT)):bigint,CAST(udf(cast(count(b) as string)) AS BIGINT):bigint>
+-- !query 2 output
+7      7
+
+
+-- !query 3
+SELECT udf(a), COUNT(udf(b)) FROM testData GROUP BY a
+-- !query 3 schema
+struct<CAST(udf(cast(a as string)) AS INT):int,count(CAST(udf(cast(b as 
string)) AS INT)):bigint>
+-- !query 3 output
+1      2
+2      2
+3      2
+NULL   1
+
+
+-- !query 4
+SELECT udf(a), udf(COUNT(udf(b))) FROM testData GROUP BY b
+-- !query 4 schema
+struct<>
+-- !query 4 output
+org.apache.spark.sql.AnalysisException
+expression 'testdata.`a`' is neither present in the group by, nor is it an 
aggregate function. Add to group by or wrap in first() (or first_value) if you 
don't care which value you get.;
+
+
+-- !query 5
+SELECT COUNT(udf(a)), COUNT(udf(b)) FROM testData GROUP BY udf(a)
+-- !query 5 schema
+struct<count(CAST(udf(cast(a as string)) AS INT)):bigint,count(CAST(udf(cast(b 
as string)) AS INT)):bigint>
+-- !query 5 output
+0      1
+2      2
+2      2
+3      2
+
+
+-- !query 6
+SELECT 'foo', COUNT(udf(a)) FROM testData GROUP BY 1
+-- !query 6 schema
+struct<foo:string,count(CAST(udf(cast(a as string)) AS INT)):bigint>
+-- !query 6 output
+foo    7
+
+
+-- !query 7
+SELECT 'foo' FROM testData WHERE a = 0 GROUP BY udf(1)
+-- !query 7 schema
+struct<foo:string>
+-- !query 7 output
+
+
+
+-- !query 8
+SELECT 'foo', udf(APPROX_COUNT_DISTINCT(udf(a))) FROM testData WHERE a = 0 
GROUP BY 1
+-- !query 8 schema
+struct<foo:string,CAST(udf(cast(approx_count_distinct(cast(udf(cast(a as 
string)) as int), 0.05, 0, 0) as string)) AS BIGINT):bigint>
+-- !query 8 output
+
+
+
+-- !query 9
+SELECT 'foo', MAX(STRUCT(udf(a))) FROM testData WHERE a = 0 GROUP BY 1
+-- !query 9 schema
+struct<foo:string,max(named_struct(col1, CAST(udf(cast(a as string)) AS 
INT))):struct<col1:int>>
+-- !query 9 output
+
+
+
+-- !query 10
+SELECT udf(a + b), udf(COUNT(b)) FROM testData GROUP BY a + b
+-- !query 10 schema
+struct<CAST(udf(cast((a + b) as string)) AS INT):int,CAST(udf(cast(count(b) as 
string)) AS BIGINT):bigint>
+-- !query 10 output
+2      1
+3      2
+4      2
+5      1
+NULL   1
+
+
+-- !query 11
+SELECT udf(a + 2), udf(COUNT(b)) FROM testData GROUP BY a + 1
+-- !query 11 schema
+struct<>
+-- !query 11 output
+org.apache.spark.sql.AnalysisException
+expression 'testdata.`a`' is neither present in the group by, nor is it an 
aggregate function. Add to group by or wrap in first() (or first_value) if you 
don't care which value you get.;
+
+
+-- !query 12
+SELECT udf(a + 1 + 1), udf(COUNT(b)) FROM testData GROUP BY udf(a + 1)
+-- !query 12 schema
+struct<>
+-- !query 12 output
+org.apache.spark.sql.AnalysisException
+expression 'testdata.`a`' is neither present in the group by, nor is it an 
aggregate function. Add to group by or wrap in first() (or first_value) if you 
don't care which value you get.;
+
+
+-- !query 13
+SELECT SKEWNESS(udf(a)), udf(KURTOSIS(a)), udf(MIN(a)), MAX(udf(a)), 
udf(AVG(udf(a))), udf(VARIANCE(a)), STDDEV(udf(a)), udf(SUM(a)), udf(COUNT(a))
+FROM testData
+-- !query 13 schema
+struct<skewness(CAST(CAST(udf(cast(a as string)) AS INT) AS 
DOUBLE)):double,CAST(udf(cast(kurtosis(cast(a as double)) as string)) AS 
DOUBLE):double,CAST(udf(cast(min(a) as string)) AS INT):int,max(CAST(udf(cast(a 
as string)) AS INT)):int,CAST(udf(cast(avg(cast(cast(udf(cast(a as string)) as 
int) as bigint)) as string)) AS DOUBLE):double,CAST(udf(cast(var_samp(cast(a as 
double)) as string)) AS DOUBLE):double,stddev_samp(CAST(CAST(udf(cast(a as 
string)) AS INT) AS DOUBLE)):double,CAST(udf(cast(sum(cast(a as bigint)) as 
string)) AS BIGINT):bigint,CAST(udf(cast(count(a) as string)) AS BIGINT):bigint>
+-- !query 13 output
+-0.2723801058145729    -1.5069204152249134     1       3       
2.142857142857143       0.8095238095238094      0.8997354108424372      15      
7
+
+
+-- !query 14
+SELECT COUNT(DISTINCT udf(b)), udf(COUNT(DISTINCT b, c)) FROM (SELECT 1 AS a, 
2 AS b, 3 AS c) GROUP BY a
+-- !query 14 schema
+struct<count(DISTINCT CAST(udf(cast(b as string)) AS 
INT)):bigint,CAST(udf(cast(count(distinct b, c) as string)) AS BIGINT):bigint>
+-- !query 14 output
+1      1
+
+
+-- !query 15
+SELECT a AS k, COUNT(udf(b)) FROM testData GROUP BY k
+-- !query 15 schema
+struct<k:int,count(CAST(udf(cast(b as string)) AS INT)):bigint>
+-- !query 15 output
+1      2
+2      2
+3      2
+NULL   1
+
+
+-- !query 16
+SELECT a AS k, udf(COUNT(b)) FROM testData GROUP BY k HAVING k > 1
+-- !query 16 schema
+struct<k:int,CAST(udf(cast(count(b) as string)) AS BIGINT):bigint>
+-- !query 16 output
+2      2
+3      2
+
+
+-- !query 17
+SELECT udf(COUNT(b)) AS k FROM testData GROUP BY k
+-- !query 17 schema
+struct<>
+-- !query 17 output
+org.apache.spark.sql.AnalysisException
+aggregate functions are not allowed in GROUP BY, but found 
CAST(udf(cast(count(b) as string)) AS BIGINT);
+
+
+-- !query 18
+CREATE OR REPLACE TEMPORARY VIEW testDataHasSameNameWithAlias AS SELECT * FROM 
VALUES
+(1, 1, 3), (1, 2, 1) AS testDataHasSameNameWithAlias(k, a, v)
+-- !query 18 schema
+struct<>
+-- !query 18 output
+
+
+
+-- !query 19
+SELECT k AS a, udf(COUNT(udf(v))) FROM testDataHasSameNameWithAlias GROUP BY a
+-- !query 19 schema
+struct<>
+-- !query 19 output
+org.apache.spark.sql.AnalysisException
+expression 'testdatahassamenamewithalias.`k`' is neither present in the group 
by, nor is it an aggregate function. Add to group by or wrap in first() (or 
first_value) if you don't care which value you get.;
+
+
+-- !query 20
+set spark.sql.groupByAliases=false
+-- !query 20 schema
+struct<key:string,value:string>
+-- !query 20 output
+spark.sql.groupByAliases       false
+
+
+-- !query 21
+SELECT a AS k, udf(COUNT(udf(b))) FROM testData GROUP BY k
+-- !query 21 schema
+struct<>
+-- !query 21 output
+org.apache.spark.sql.AnalysisException
+cannot resolve '`k`' given input columns: [testdata.a, testdata.b]; line 1 pos 
57
+
+
+-- !query 22
+SELECT a, COUNT(udf(1)) FROM testData WHERE false GROUP BY a
+-- !query 22 schema
+struct<a:int,count(CAST(udf(cast(1 as string)) AS INT)):bigint>
+-- !query 22 output
+
+
+
+-- !query 23
+SELECT udf(COUNT(1)) FROM testData WHERE false
+-- !query 23 schema
+struct<CAST(udf(cast(count(1) as string)) AS BIGINT):bigint>
+-- !query 23 output
+0
+
+
+-- !query 24
+SELECT 1 FROM (SELECT udf(COUNT(1)) FROM testData WHERE false) t
+-- !query 24 schema
+struct<1:int>
+-- !query 24 output
+1
+
+
+-- !query 25
+SELECT 1 from (
+  SELECT 1 AS z,
+  udf(MIN(a.x))
+  FROM (select 1 as x) a
+  WHERE false
+) b
+where b.z != b.z
+-- !query 25 schema
+struct<1:int>
+-- !query 25 output
+
+
+
+-- !query 26
+SELECT corr(DISTINCT x, y), udf(corr(DISTINCT y, x)), count(*)
+  FROM (VALUES (1, 1), (2, 2), (2, 2)) t(x, y)
+-- !query 26 schema
+struct<corr(DISTINCT CAST(x AS DOUBLE), CAST(y AS 
DOUBLE)):double,CAST(udf(cast(corr(distinct cast(y as double), cast(x as 
double)) as string)) AS DOUBLE):double,count(1):bigint>
+-- !query 26 output
+1.0    1.0     3
+
+
+-- !query 27
+SELECT udf(1) FROM range(10) HAVING true
+-- !query 27 schema
+struct<CAST(udf(cast(1 as string)) AS INT):int>
+-- !query 27 output
+1
+
+
+-- !query 28
+SELECT udf(udf(1)) FROM range(10) HAVING MAX(id) > 0
+-- !query 28 schema
+struct<CAST(udf(cast(cast(udf(cast(1 as string)) as int) as string)) AS 
INT):int>
+-- !query 28 output
+1
+
+
+-- !query 29
+SELECT udf(id) FROM range(10) HAVING id > 0
+-- !query 29 schema
+struct<>
+-- !query 29 output
+org.apache.spark.sql.AnalysisException
+grouping expressions sequence is empty, and '`id`' is not an aggregate 
function. Wrap '()' in windowing function(s) or wrap '`id`' in first() (or 
first_value) if you don't care which value you get.;
+
+
+-- !query 30
+CREATE OR REPLACE TEMPORARY VIEW test_agg AS SELECT * FROM VALUES
+  (1, true), (1, false),
+  (2, true),
+  (3, false), (3, null),
+  (4, null), (4, null),
+  (5, null), (5, true), (5, false) AS test_agg(k, v)
+-- !query 30 schema
+struct<>
+-- !query 30 output
+
+
+
+-- !query 31
+SELECT udf(every(v)), udf(some(v)), any(v) FROM test_agg WHERE 1 = 0
+-- !query 31 schema
+struct<CAST(udf(cast(every(v) as string)) AS 
BOOLEAN):boolean,CAST(udf(cast(some(v) as string)) AS 
BOOLEAN):boolean,any(v):boolean>
+-- !query 31 output
+NULL   NULL    NULL
+
+
+-- !query 32
+SELECT udf(every(udf(v))), some(v), any(v) FROM test_agg WHERE k = 4
+-- !query 32 schema
+struct<CAST(udf(cast(every(cast(udf(cast(v as string)) as boolean)) as 
string)) AS BOOLEAN):boolean,some(v):boolean,any(v):boolean>
+-- !query 32 output
+NULL   NULL    NULL
+
+
+-- !query 33
+SELECT every(v), udf(some(v), any(v) FROM test_agg WHERE k = 5
+-- !query 33 schema
+struct<>
+-- !query 33 output
+org.apache.spark.sql.catalyst.parser.ParseException
+
+extraneous input 'FROM' expecting {')', ','}(line 1, pos 37)
+
+== SQL ==
+SELECT every(v), udf(some(v), any(v) FROM test_agg WHERE k = 5
 
 Review comment:
   There seems a typo here `udf(some(v),`

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


With regards,
Apache Git Services

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] [spark] HyukjinKwon commented on a change in pull request #25098: [SPARK-28280][SQL][PYTHON][TESTS] Convert and port 'group-by.sql' into UDF test base

Reply via email to