skonto commented on issue #25098: [SPARK-28280][SQL][PYTHON][TESTS] Convert and 
port 'group-by.sql' into UDF test base
URL: https://github.com/apache/spark/pull/25098#issuecomment-510222840
 
 
   OK, I found out what is wrong. The current golden file is the output of the 
Pandas UDF test cases, which run last. Each type of UDF (Scala, Python, Pandas) 
overwrites the file, so only the output of the last type to run is kept:
   Compared to the Pandas UDF output, here is the diff for the Scala UDF:
   
   ```diff
   diff --git 
a/sql/core/src/test/resources/sql-tests/results/udf/udf-group-by.sql.out 
b/sql/core/src/test/resources/sql-tests/results/udf/udf-group-by.sql.out
   index 97c831aec4..58ed37fd56 100644
   --- a/sql/core/src/test/resources/sql-tests/results/udf/udf-group-by.sql.out
   +++ b/sql/core/src/test/resources/sql-tests/results/udf/udf-group-by.sql.out
   @@ -37,7 +37,7 @@ struct<udf(a):string,count(udf(b)):bigint>
    1   2
    2   2
    3   3
   -nan 2
   +null        2
    
    
    -- !query 4
   @@ -101,7 +101,7 @@ struct<udf((a + b)):string,udf(count(b)):string>
    3   2
    4   2
    5   1
   -nan 1
   +null        1
    
    
    -- !query 11
   @@ -121,7 +121,7 @@ struct<udf(((a + 1) + 1)):string,udf(count(b)):string>
    3   2
    4   2
    5   2
   -nan 1
   +null        1
    
    
    -- !query 13
   @@ -130,7 +130,7 @@ FROM testData
    -- !query 13 schema
    struct<skewness(CAST(udf(a) AS DOUBLE)):double,udf(kurtosis(cast(a as 
double))):string,udf(min(a)):string,max(udf(a)):string,udf(avg(cast(udf(a) as 
double))):string,udf(var_samp(cast(a as 
double))):string,stddev_samp(CAST(udf(a) AS DOUBLE)):double,udf(sum(cast(a as 
bigint))):string,udf(count(a)):string>
    -- !query 13 output
   --0.2723801058145729 -1.5069204152249134     1       nan     
2.142857142857143       0.8095238095238094      0.8997354108424372      15      
7
   +-0.2723801058145729 -1.5069204152249134     1       null    
2.142857142857143       0.8095238095238094      0.8997354108424372      15      
7
    
    
    -- !query 14
   @@ -295,7 +295,7 @@ SELECT udf(every(v)), udf(some(v)), any(v) FROM test_agg 
WHERE 1 = 0
    -- !query 31 schema
    struct<udf(every(v)):string,udf(some(v)):string,any(v):boolean>
    -- !query 31 output
   -None        None    NULL
   +null        null    NULL
    
    
    -- !query 32
   @@ -303,7 +303,7 @@ SELECT udf(every(v)), some(v), any(v) FROM test_agg 
WHERE k = 4
    -- !query 32 schema
    struct<udf(every(v)):string,some(v):boolean,any(v):boolean>
    -- !query 32 output
   -None        NULL    NULL
   +null        NULL    NULL
    
    
    -- !query 33
   @@ -311,7 +311,7 @@ SELECT every(v), udf(some(v)), any(v) FROM test_agg 
WHERE k = 5
    -- !query 33 schema
    struct<every(v):boolean,udf(some(v)):string,any(v):boolean>
    -- !query 33 output
   -false       True    true
   +false       true    true
    
    
    -- !query 34
   @@ -319,11 +319,11 @@ SELECT k, every(v), udf(some(v)), any(v) FROM test_agg 
GROUP BY k
    -- !query 34 schema
    struct<k:int,every(v):boolean,udf(some(v)):string,any(v):boolean>
    -- !query 34 output
   -1   false   True    true
   -2   true    True    true
   -3   false   False   false
   -4   NULL    None    NULL
   -5   false   True    true
   +1   false   true    true
   +2   true    true    true
   +3   false   false   false
   +4   NULL    null    NULL
   +5   false   true    true
    
    
    -- !query 35
   @@ -356,7 +356,7 @@ GROUP  BY k
    -- !query 37 schema
    struct<k:int,every:string>
    -- !query 37 output
   -2   True
   +2   true
    
    
    -- !query 38
   @@ -432,16 +432,16 @@ SELECT k, udf(udf(v)), some(v) OVER (PARTITION BY k 
ORDER BY v) FROM test_agg
    -- !query 44 schema
    struct<k:int,udf(udf(v)):string,some(v) OVER (PARTITION BY k ORDER BY v ASC 
NULLS FIRST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW):boolean>
    -- !query 44 output
   -1   False   false
   -1   True    true
   -2   True    true
   -3   False   false
   -3   None    NULL
   -4   None    NULL
   -4   None    NULL
   -5   False   false
   -5   None    NULL
   -5   True    true
   +1   false   false
   +1   true    true
   +2   true    true
   +3   false   false
   +3   null    NULL
   +4   null    NULL
   +4   null    NULL
   +5   false   false
   +5   null    NULL
   +5   true    true
    
    
    -- !query 45
   @@ -474,9 +474,9 @@ SELECT k, udf(max(udf(v))) FROM test_agg GROUP BY k 
HAVING max(v) = true
    -- !query 47 schema
    struct<k:int,udf(max(udf(v))):string>
    -- !query 47 output
   -1   True
   -2   True
   -5   True
   +1   true
   +2   true
   +5   true
    
    
    -- !query 48
   ```
   And here is the diff for the Python UDF, again compared to the Pandas UDF output:
   ```diff
   diff --git 
a/sql/core/src/test/resources/sql-tests/results/udf/udf-group-by.sql.out 
b/sql/core/src/test/resources/sql-tests/results/udf/udf-group-by.sql.out
   index 97c831aec4..487fbc86f7 100644
   --- a/sql/core/src/test/resources/sql-tests/results/udf/udf-group-by.sql.out
   +++ b/sql/core/src/test/resources/sql-tests/results/udf/udf-group-by.sql.out
   @@ -37,7 +37,7 @@ struct<udf(a):string,count(udf(b)):bigint>
    1   2
    2   2
    3   3
   -nan 2
   +None        2
    
    
    -- !query 4
   @@ -101,7 +101,7 @@ struct<udf((a + b)):string,udf(count(b)):string>
    3   2
    4   2
    5   1
   -nan 1
   +None        1
    
    
    -- !query 11
   @@ -121,7 +121,7 @@ struct<udf(((a + 1) + 1)):string,udf(count(b)):string>
    3   2
    4   2
    5   2
   -nan 1
   +None        1
    
    
    -- !query 13
   @@ -130,7 +130,7 @@ FROM testData
    -- !query 13 schema
    struct<skewness(CAST(udf(a) AS DOUBLE)):double,udf(kurtosis(cast(a as 
double))):string,udf(min(a)):string,max(udf(a)):string,udf(avg(cast(udf(a) as 
double))):string,udf(var_samp(cast(a as 
double))):string,stddev_samp(CAST(udf(a) AS DOUBLE)):double,udf(sum(cast(a as 
bigint))):string,udf(count(a)):string>
    -- !query 13 output
   --0.2723801058145729 -1.5069204152249134     1       nan     
2.142857142857143       0.8095238095238094      0.8997354108424372      15      
7
   +-0.2723801058145729 -1.5069204152249134     1       None    
2.142857142857143       0.8095238095238094      0.8997354108424372      15      
7
    
    
    -- !query 14
   
   ```
   

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


With regards,
Apache Git Services

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to