skonto opened a new pull request #25196: [SPARK-28279][SQL][PYTHON][TESTS] 
Convert and port 'group-analytics.sql' into UDF test base
URL: https://github.com/apache/spark/pull/25196
 
 
   ## What changes were proposed in this pull request?
   This PR adds some tests converted from group-analytics.sql to test UDFs. 
Please see contribution guide of this umbrella ticket - SPARK-27921.
   <details><summary>Diff comparing to 'group-analytics.sql'</summary>
   <p>
   
   ```diff
   diff --git 
a/sql/core/src/test/resources/sql-tests/results/udf/udf-group-analytics.sql.out 
b/sql/core/src/test/resources/sql-tests/results/udf/udf-group-analytics.sql.out
   index 3439a05727..ef14152136 100644
   --- 
a/sql/core/src/test/resources/sql-tests/results/udf/udf-group-analytics.sql.out
   +++ 
b/sql/core/src/test/resources/sql-tests/results/udf/udf-group-analytics.sql.out
   @@ -13,9 +13,9 @@ struct<>
    
    
    -- !query 1
   -SELECT a + b, b, SUM(a - b) FROM testData GROUP BY a + b, b WITH CUBE
   +SELECT a + b, b, udf(SUM(a - b)) FROM testData GROUP BY a + b, b WITH CUBE
    -- !query 1 schema
   -struct<(a + b):int,b:int,sum((a - b)):bigint>
   +struct<(a + b):int,b:int,CAST(udf(cast(sum(cast((a - b) as bigint)) as 
string)) AS BIGINT):bigint>
    -- !query 1 output
    2   1       0
    2   NULL    0
   @@ -33,9 +33,9 @@ NULL       NULL    3
    
    
    -- !query 2
   -SELECT a, b, SUM(b) FROM testData GROUP BY a, b WITH CUBE
   +SELECT a, udf(b), SUM(b) FROM testData GROUP BY a, b WITH CUBE
    -- !query 2 schema
   -struct<a:int,b:int,sum(b):bigint>
   +struct<a:int,CAST(udf(cast(b as string)) AS INT):int,sum(b):bigint>
    -- !query 2 output
    1   1       1
    1   2       2
   @@ -52,9 +52,9 @@ NULL       NULL    9
    
    
    -- !query 3
   -SELECT a + b, b, SUM(a - b) FROM testData GROUP BY a + b, b WITH ROLLUP
   +SELECT udf(a + b), b, SUM(a - b) FROM testData GROUP BY a + b, b WITH ROLLUP
    -- !query 3 schema
   -struct<(a + b):int,b:int,sum((a - b)):bigint>
   +struct<CAST(udf(cast((a + b) as string)) AS INT):int,b:int,sum((a - 
b)):bigint>
    -- !query 3 output
    2   1       0
    2   NULL    0
   @@ -70,9 +70,9 @@ NULL       NULL    3
    
    
    -- !query 4
   -SELECT a, b, SUM(b) FROM testData GROUP BY a, b WITH ROLLUP
   +SELECT a, b, udf(SUM(b)) FROM testData GROUP BY a, b WITH ROLLUP
    -- !query 4 schema
   -struct<a:int,b:int,sum(b):bigint>
   +struct<a:int,b:int,CAST(udf(cast(sum(cast(b as bigint)) as string)) AS 
BIGINT):bigint>
    -- !query 4 output
    1   1       1
    1   2       2
   @@ -97,7 +97,7 @@ struct<>
    
    
    -- !query 6
   -SELECT course, year, SUM(earnings) FROM courseSales GROUP BY ROLLUP(course, 
year) ORDER BY course, year
   +SELECT course, year, SUM(earnings) FROM courseSales GROUP BY ROLLUP(course, 
year) ORDER BY udf(course), year
    -- !query 6 schema
    struct<course:string,year:int,sum(earnings):bigint>
    -- !query 6 output
   @@ -111,7 +111,7 @@ dotNET   2013    48000
    
    
    -- !query 7
   -SELECT course, year, SUM(earnings) FROM courseSales GROUP BY CUBE(course, 
year) ORDER BY course, year
   +SELECT course, year, SUM(earnings) FROM courseSales GROUP BY CUBE(course, 
year) ORDER BY course, udf(year)
    -- !query 7 schema
    struct<course:string,year:int,sum(earnings):bigint>
    -- !query 7 output
   @@ -127,9 +127,9 @@ dotNET   2013    48000
    
    
    -- !query 8
   -SELECT course, year, SUM(earnings) FROM courseSales GROUP BY course, year 
GROUPING SETS(course, year)
   +SELECT course, udf(year), SUM(earnings) FROM courseSales GROUP BY course, 
year GROUPING SETS(course, year)
    -- !query 8 schema
   -struct<course:string,year:int,sum(earnings):bigint>
   +struct<course:string,CAST(udf(cast(year as string)) AS 
INT):int,sum(earnings):bigint>
    -- !query 8 output
    Java        NULL    50000
    NULL        2012    35000
   @@ -138,26 +138,26 @@ dotNET NULL    63000
    
    
    -- !query 9
   -SELECT course, year, SUM(earnings) FROM courseSales GROUP BY course, year 
GROUPING SETS(course)
   +SELECT course, year, udf(SUM(earnings)) FROM courseSales GROUP BY course, 
year GROUPING SETS(course)
    -- !query 9 schema
   -struct<course:string,year:int,sum(earnings):bigint>
   +struct<course:string,year:int,CAST(udf(cast(sum(cast(earnings as bigint)) 
as string)) AS BIGINT):bigint>
    -- !query 9 output
    Java        NULL    50000
    dotNET      NULL    63000
    
    
    -- !query 10
   -SELECT course, year, SUM(earnings) FROM courseSales GROUP BY course, year 
GROUPING SETS(year)
   +SELECT udf(course), year, SUM(earnings) FROM courseSales GROUP BY course, 
year GROUPING SETS(year)
    -- !query 10 schema
   -struct<course:string,year:int,sum(earnings):bigint>
   +struct<CAST(udf(cast(course as string)) AS 
STRING):string,year:int,sum(earnings):bigint>
    -- !query 10 output
    NULL        2012    35000
    NULL        2013    78000
    
    
    -- !query 11
   -SELECT course, SUM(earnings) AS sum FROM courseSales
   -GROUP BY course, earnings GROUPING SETS((), (course), (course, earnings)) 
ORDER BY course, sum
   +SELECT course, udf(SUM(earnings)) AS sum FROM courseSales
   +GROUP BY course, earnings GROUPING SETS((), (course), (course, earnings)) 
ORDER BY course, udf(sum)
    -- !query 11 schema
    struct<course:string,sum:bigint>
    -- !query 11 output
   @@ -173,7 +173,7 @@ dotNET   63000
    
    -- !query 12
    SELECT course, SUM(earnings) AS sum, GROUPING_ID(course, earnings) FROM 
courseSales
   -GROUP BY course, earnings GROUPING SETS((), (course), (course, earnings)) 
ORDER BY course, sum
   +GROUP BY course, earnings GROUPING SETS((), (course), (course, earnings)) 
ORDER BY udf(course), sum
    -- !query 12 schema
    struct<course:string,sum:bigint,grouping_id(course, earnings):int>
    -- !query 12 output
   @@ -188,10 +188,10 @@ dotNET 63000   1
    
    
    -- !query 13
   -SELECT course, year, GROUPING(course), GROUPING(year), GROUPING_ID(course, 
year) FROM courseSales
   +SELECT udf(course), udf(year), GROUPING(course), GROUPING(year), 
GROUPING_ID(course, year) FROM courseSales
    GROUP BY CUBE(course, year)
    -- !query 13 schema
   
-struct<course:string,year:int,grouping(course):tinyint,grouping(year):tinyint,grouping_id(course,
 year):int>
   +struct<CAST(udf(cast(course as string)) AS 
STRING):string,CAST(udf(cast(year as string)) AS 
INT):int,grouping(course):tinyint,grouping(year):tinyint,grouping_id(course, 
year):int>
    -- !query 13 output
    Java        2012    0       0       0
    Java        2013    0       0       0
   @@ -205,7 +205,7 @@ dotNET   NULL    0       1       1
    
    
    -- !query 14
   -SELECT course, year, GROUPING(course) FROM courseSales GROUP BY course, year
   +SELECT course, udf(year), GROUPING(course) FROM courseSales GROUP BY 
course, year
    -- !query 14 schema
    struct<>
    -- !query 14 output
   @@ -214,7 +214,7 @@ grouping() can only be used with 
GroupingSets/Cube/Rollup;
    
    
    -- !query 15
   -SELECT course, year, GROUPING_ID(course, year) FROM courseSales GROUP BY 
course, year
   +SELECT course, udf(year), GROUPING_ID(course, year) FROM courseSales GROUP 
BY course, year
    -- !query 15 schema
    struct<>
    -- !query 15 output
   @@ -240,7 +240,7 @@ NULL     NULL    3
    
    -- !query 17
    SELECT course, year FROM courseSales GROUP BY CUBE(course, year)
   -HAVING GROUPING(year) = 1 AND GROUPING_ID(course, year) > 0 ORDER BY 
course, year
   +HAVING GROUPING(year) = 1 AND GROUPING_ID(course, year) > 0 ORDER BY 
course, udf(year)
    -- !query 17 schema
    struct<course:string,year:int>
    -- !query 17 output
   @@ -250,7 +250,7 @@ dotNET   NULL
    
    
    -- !query 18
   -SELECT course, year FROM courseSales GROUP BY course, year HAVING 
GROUPING(course) > 0
   +SELECT course, udf(year) FROM courseSales GROUP BY course, year HAVING 
GROUPING(course) > 0
    -- !query 18 schema
    struct<>
    -- !query 18 output
   @@ -259,7 +259,7 @@ grouping()/grouping_id() can only be used with 
GroupingSets/Cube/Rollup;
    
    
    -- !query 19
   -SELECT course, year FROM courseSales GROUP BY course, year HAVING 
GROUPING_ID(course) > 0
   +SELECT course, udf(udf(year)) FROM courseSales GROUP BY course, year HAVING 
GROUPING_ID(course) > 0
    -- !query 19 schema
    struct<>
    -- !query 19 output
   @@ -268,9 +268,9 @@ grouping()/grouping_id() can only be used with 
GroupingSets/Cube/Rollup;
    
    
    -- !query 20
   -SELECT course, year FROM courseSales GROUP BY CUBE(course, year) HAVING 
grouping__id > 0
   +SELECT udf(course), year FROM courseSales GROUP BY CUBE(course, year) 
HAVING grouping__id > 0
    -- !query 20 schema
   -struct<course:string,year:int>
   +struct<CAST(udf(cast(course as string)) AS STRING):string,year:int>
    -- !query 20 output
    Java        NULL
    NULL        2012
   @@ -281,7 +281,7 @@ dotNET   NULL
    
    -- !query 21
    SELECT course, year, GROUPING(course), GROUPING(year) FROM courseSales 
GROUP BY CUBE(course, year)
   -ORDER BY GROUPING(course), GROUPING(year), course, year
   +ORDER BY GROUPING(course), GROUPING(year), course, udf(year)
    -- !query 21 schema
    
struct<course:string,year:int,grouping(course):tinyint,grouping(year):tinyint>
    -- !query 21 output
   @@ -298,7 +298,7 @@ NULL     NULL    1       1
    
    -- !query 22
    SELECT course, year, GROUPING_ID(course, year) FROM courseSales GROUP BY 
CUBE(course, year)
   -ORDER BY GROUPING(course), GROUPING(year), course, year
   +ORDER BY GROUPING(course), GROUPING(year), course, udf(year)
    -- !query 22 schema
    struct<course:string,year:int,grouping_id(course, year):int>
    -- !query 22 output
   @@ -314,7 +314,7 @@ NULL     NULL    3
    
    
    -- !query 23
   -SELECT course, year FROM courseSales GROUP BY course, year ORDER BY 
GROUPING(course)
   +SELECT course, udf(year) FROM courseSales GROUP BY course, udf(year) ORDER 
BY GROUPING(course)
    -- !query 23 schema
    struct<>
    -- !query 23 output
   @@ -323,7 +323,7 @@ grouping()/grouping_id() can only be used with 
GroupingSets/Cube/Rollup;
    
    
    -- !query 24
   -SELECT course, year FROM courseSales GROUP BY course, year ORDER BY 
GROUPING_ID(course)
   +SELECT course, udf(year) FROM courseSales GROUP BY course, udf(year) ORDER 
BY GROUPING_ID(course)
    -- !query 24 schema
    struct<>
    -- !query 24 output
   @@ -332,7 +332,7 @@ grouping()/grouping_id() can only be used with 
GroupingSets/Cube/Rollup;
    
    
    -- !query 25
   -SELECT course, year FROM courseSales GROUP BY CUBE(course, year) ORDER BY 
grouping__id, course, year
   +SELECT course, year FROM courseSales GROUP BY CUBE(course, year) ORDER BY 
grouping__id, udf(course), year
    -- !query 25 schema
    struct<course:string,year:int>
    -- !query 25 output
   @@ -348,7 +348,7 @@ NULL     NULL
    
    
    -- !query 26
   -SELECT a + b AS k1, b AS k2, SUM(a - b) FROM testData GROUP BY CUBE(k1, k2)
   +SELECT udf(a + b) AS k1, udf(b) AS k2, SUM(a - b) FROM testData GROUP BY 
CUBE(k1, k2)
    -- !query 26 schema
    struct<k1:int,k2:int,sum((a - b)):bigint>
    -- !query 26 output
   @@ -368,7 +368,7 @@ NULL     NULL    3
    
    
    -- !query 27
   -SELECT a + b AS k, b, SUM(a - b) FROM testData GROUP BY ROLLUP(k, b)
   +SELECT udf(udf(a + b)) AS k, b, SUM(a - b) FROM testData GROUP BY ROLLUP(k, 
b)
    -- !query 27 schema
    struct<k:int,b:int,sum((a - b)):bigint>
    -- !query 27 output
   @@ -386,9 +386,9 @@ NULL     NULL    3
    
    
    -- !query 28
   -SELECT a + b, b AS k, SUM(a - b) FROM testData GROUP BY a + b, k GROUPING 
SETS(k)
   +SELECT udf(a + b), udf(udf(b)) AS k, SUM(a - b) FROM testData GROUP BY a + 
b, k GROUPING SETS(k)
    -- !query 28 schema
   -struct<(a + b):int,k:int,sum((a - b)):bigint>
   +struct<CAST(udf(cast((a + b) as string)) AS INT):int,k:int,sum((a - 
b)):bigint>
    -- !query 28 output
    NULL        1       3
    NULL        2       0
   
   ```
   ## How was this patch tested?
   
   Tested as guided in SPARK-27921.
   Verified pandas & pyarrow versions:
   
   Python 3.6.8 (default, Jan 14 2019, 11:02:34) 
   [GCC 8.0.1 20180414 (experimental) [trunk revision 259383]] on linux
   Type "help", "copyright", "credits" or "license" for more information.
   >>> import pandas
   >>> import pyarrow
   >>> pyarrow.__version__
   '0.14.0'
   >>> pandas.__version__
   '0.24.2'
   From the sql output it seems that sql statements are evaluated correctly 
given that udf returns a string and may change results as Null will be returned 
as None and will be counted in returned values.
   

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


With regards,
Apache Git Services

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to