(incubator-gluten) branch main updated: [VL] Enable linear-regression.sql test case in GlutenSQLQueryTestSuite for Spark34 (#5306)

yangzy Tue, 09 Apr 2024 20:50:23 -0700

This is an automated email from the ASF dual-hosted git repository.

yangzy pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git



The following commit(s) were added to refs/heads/main by this push:
     new 383b99912 [VL] Enable linear-regression.sql test case in GlutenSQLQueryTestSuite for Spark34 (#5306)
383b99912 is described below

commit 383b99912187e52d7d897709b069165c5ce5556d
Author: Joey <[email protected]>
AuthorDate: Wed Apr 10 11:50:14 2024 +0800

    [VL] Enable linear-regression.sql test case in GlutenSQLQueryTestSuite for Spark34 (#5306)
---
 .../sql-tests/inputs/linear-regression.sql         |  52 ++++
 .../sql-tests/results/linear-regression.sql.out    | 276 +++++++++++++++++++++
 .../utils/velox/VeloxSQLQueryTestSettings.scala    |   2 +
 3 files changed, 330 insertions(+)

diff --git a/gluten-ut/spark34/src/test/resources/sql-tests/inputs/linear-regression.sql b/gluten-ut/spark34/src/test/resources/sql-tests/inputs/linear-regression.sql
new file mode 100644
index 000000000..c7cb5bf11
--- /dev/null
+++ b/gluten-ut/spark34/src/test/resources/sql-tests/inputs/linear-regression.sql
@@ -0,0 +1,52 @@
+-- Test data.
+CREATE OR REPLACE TEMPORARY VIEW testRegression AS SELECT * FROM VALUES
+(1, 10, null), (2, 10, 11), (2, 20, 22), (2, 25, null), (2, 30, 35)
+AS testRegression(k, y, x);
+
+-- SPARK-37613: Support ANSI Aggregate Function: regr_count
+SELECT regr_count(y, x) FROM testRegression;
+SELECT regr_count(y, x) FROM testRegression WHERE x IS NOT NULL;
+SELECT k, count(*), regr_count(y, x) FROM testRegression GROUP BY k;
+SELECT k, count(*) FILTER (WHERE x IS NOT NULL), regr_count(y, x) FROM testRegression GROUP BY k;
+
+-- SPARK-37613: Support ANSI Aggregate Function: regr_r2
+SELECT regr_r2(y, x) FROM testRegression;
+SELECT regr_r2(y, x) FROM testRegression WHERE x IS NOT NULL;
+SELECT k, corr(y, x), regr_r2(y, x) FROM testRegression GROUP BY k;
+SELECT k, corr(y, x) FILTER (WHERE x IS NOT NULL), regr_r2(y, x) FROM testRegression GROUP BY k;
+
+-- SPARK-37614: Support ANSI Aggregate Function: regr_avgx & regr_avgy
+SELECT regr_avgx(y, x), regr_avgy(y, x) FROM testRegression;
+SELECT regr_avgx(y, x), regr_avgy(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL;
+SELECT k, avg(x), avg(y), regr_avgx(y, x), regr_avgy(y, x) FROM testRegression GROUP BY k;
+SELECT k, avg(x) FILTER (WHERE x IS NOT NULL AND y IS NOT NULL), avg(y) FILTER (WHERE x IS NOT NULL AND y IS NOT NULL), regr_avgx(y, x), regr_avgy(y, x) FROM testRegression GROUP BY k;
+
+-- SPARK-37672: Support ANSI Aggregate Function: regr_sxx
+SELECT regr_sxx(y, x) FROM testRegression;
+SELECT regr_sxx(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL;
+SELECT k, regr_sxx(y, x) FROM testRegression GROUP BY k;
+SELECT k, regr_sxx(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL GROUP BY k;
+
+-- SPARK-37681: Support ANSI Aggregate Function: regr_sxy
+SELECT regr_sxy(y, x) FROM testRegression;
+SELECT regr_sxy(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL;
+SELECT k, regr_sxy(y, x) FROM testRegression GROUP BY k;
+SELECT k, regr_sxy(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL GROUP BY k;
+
+-- SPARK-37702: Support ANSI Aggregate Function: regr_syy
+SELECT regr_syy(y, x) FROM testRegression;
+SELECT regr_syy(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL;
+SELECT k, regr_syy(y, x) FROM testRegression GROUP BY k;
+SELECT k, regr_syy(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL GROUP BY k;
+
+-- SPARK-39230: Support ANSI Aggregate Function: regr_slope
+SELECT regr_slope(y, x) FROM testRegression;
+SELECT regr_slope(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL;
+SELECT k, regr_slope(y, x) FROM testRegression GROUP BY k;
+SELECT k, regr_slope(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL GROUP BY k;
+
+-- SPARK-37623: Support ANSI Aggregate Function: regr_intercept
+SELECT regr_intercept(y, x) FROM testRegression;
+SELECT regr_intercept(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL;
+SELECT k, regr_intercept(y, x) FROM testRegression GROUP BY k;
+SELECT k, regr_intercept(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL GROUP BY k;
diff --git a/gluten-ut/spark34/src/test/resources/sql-tests/results/linear-regression.sql.out b/gluten-ut/spark34/src/test/resources/sql-tests/results/linear-regression.sql.out
new file mode 100644
index 000000000..a73347a2f
--- /dev/null
+++ b/gluten-ut/spark34/src/test/resources/sql-tests/results/linear-regression.sql.out
@@ -0,0 +1,276 @@
+-- Automatically generated by SQLQueryTestSuite
+-- !query
+CREATE OR REPLACE TEMPORARY VIEW testRegression AS SELECT * FROM VALUES
+(1, 10, null), (2, 10, 11), (2, 20, 22), (2, 25, null), (2, 30, 35)
+AS testRegression(k, y, x)
+-- !query schema
+struct<>
+-- !query output
+
+
+
+-- !query
+SELECT regr_count(y, x) FROM testRegression
+-- !query schema
+struct<regr_count(y, x):bigint>
+-- !query output
+3
+
+
+-- !query
+SELECT regr_count(y, x) FROM testRegression WHERE x IS NOT NULL
+-- !query schema
+struct<regr_count(y, x):bigint>
+-- !query output
+3
+
+
+-- !query
+SELECT k, count(*), regr_count(y, x) FROM testRegression GROUP BY k
+-- !query schema
+struct<k:int,count(1):bigint,regr_count(y, x):bigint>
+-- !query output
+1      1       0
+2      4       3
+
+
+-- !query
+SELECT k, count(*) FILTER (WHERE x IS NOT NULL), regr_count(y, x) FROM testRegression GROUP BY k
+-- !query schema
+struct<k:int,count(1) FILTER (WHERE (x IS NOT NULL)):bigint,regr_count(y, x):bigint>
+-- !query output
+1      0       0
+2      3       3
+
+
+-- !query
+SELECT regr_r2(y, x) FROM testRegression
+-- !query schema
+struct<regr_r2(y, x):double>
+-- !query output
+0.9976905311778291
+
+
+-- !query
+SELECT regr_r2(y, x) FROM testRegression WHERE x IS NOT NULL
+-- !query schema
+struct<regr_r2(y, x):double>
+-- !query output
+0.9976905311778291
+
+
+-- !query
+SELECT k, corr(y, x), regr_r2(y, x) FROM testRegression GROUP BY k
+-- !query schema
+struct<k:int,corr(y, x):double,regr_r2(y, x):double>
+-- !query output
+1      NULL    NULL
+2      0.9988445981121532      0.9976905311778291
+
+
+-- !query
+SELECT k, corr(y, x) FILTER (WHERE x IS NOT NULL), regr_r2(y, x) FROM testRegression GROUP BY k
+-- !query schema
+struct<k:int,corr(y, x) FILTER (WHERE (x IS NOT NULL)):double,regr_r2(y, x):double>
+-- !query output
+1      NULL    NULL
+2      0.9988445981121532      0.9976905311778291
+
+
+-- !query
+SELECT regr_avgx(y, x), regr_avgy(y, x) FROM testRegression
+-- !query schema
+struct<regr_avgx(y, x):double,regr_avgy(y, x):double>
+-- !query output
+22.666666666666668     20.0
+
+
+-- !query
+SELECT regr_avgx(y, x), regr_avgy(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL
+-- !query schema
+struct<regr_avgx(y, x):double,regr_avgy(y, x):double>
+-- !query output
+22.666666666666668     20.0
+
+
+-- !query
+SELECT k, avg(x), avg(y), regr_avgx(y, x), regr_avgy(y, x) FROM testRegression GROUP BY k
+-- !query schema
+struct<k:int,avg(x):double,avg(y):double,regr_avgx(y, x):double,regr_avgy(y, x):double>
+-- !query output
+1      NULL    10.0    NULL    NULL
+2      22.666666666666668      21.25   22.666666666666668      20.0
+
+
+-- !query
+SELECT k, avg(x) FILTER (WHERE x IS NOT NULL AND y IS NOT NULL), avg(y) FILTER (WHERE x IS NOT NULL AND y IS NOT NULL), regr_avgx(y, x), regr_avgy(y, x) FROM testRegression GROUP BY k
+-- !query schema
+struct<k:int,avg(x) FILTER (WHERE ((x IS NOT NULL) AND (y IS NOT NULL))):double,avg(y) FILTER (WHERE ((x IS NOT NULL) AND (y IS NOT NULL))):double,regr_avgx(y, x):double,regr_avgy(y, x):double>
+-- !query output
+1      NULL    NULL    NULL    NULL
+2      22.666666666666668      20.0    22.666666666666668      20.0
+
+
+-- !query
+SELECT regr_sxx(y, x) FROM testRegression
+-- !query schema
+struct<regr_sxx(y, x):double>
+-- !query output
+288.66666666666663
+
+
+-- !query
+SELECT regr_sxx(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL
+-- !query schema
+struct<regr_sxx(y, x):double>
+-- !query output
+288.66666666666663
+
+
+-- !query
+SELECT k, regr_sxx(y, x) FROM testRegression GROUP BY k
+-- !query schema
+struct<k:int,regr_sxx(y, x):double>
+-- !query output
+1      NULL
+2      288.66666666666663
+
+
+-- !query
+SELECT k, regr_sxx(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL GROUP BY k
+-- !query schema
+struct<k:int,regr_sxx(y, x):double>
+-- !query output
+2      288.66666666666663
+
+
+-- !query
+SELECT regr_sxy(y, x) FROM testRegression
+-- !query schema
+struct<regr_sxy(y, x):double>
+-- !query output
+240.0
+
+
+-- !query
+SELECT regr_sxy(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL
+-- !query schema
+struct<regr_sxy(y, x):double>
+-- !query output
+240.0
+
+
+-- !query
+SELECT k, regr_sxy(y, x) FROM testRegression GROUP BY k
+-- !query schema
+struct<k:int,regr_sxy(y, x):double>
+-- !query output
+1      NULL
+2      240.0
+
+
+-- !query
+SELECT k, regr_sxy(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL GROUP BY k
+-- !query schema
+struct<k:int,regr_sxy(y, x):double>
+-- !query output
+2      240.0
+
+
+-- !query
+SELECT regr_syy(y, x) FROM testRegression
+-- !query schema
+struct<regr_syy(y, x):double>
+-- !query output
+200.0
+
+
+-- !query
+SELECT regr_syy(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL
+-- !query schema
+struct<regr_syy(y, x):double>
+-- !query output
+200.0
+
+
+-- !query
+SELECT k, regr_syy(y, x) FROM testRegression GROUP BY k
+-- !query schema
+struct<k:int,regr_syy(y, x):double>
+-- !query output
+1      NULL
+2      200.0
+
+
+-- !query
+SELECT k, regr_syy(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL GROUP BY k
+-- !query schema
+struct<k:int,regr_syy(y, x):double>
+-- !query output
+2      200.0
+
+
+-- !query
+SELECT regr_slope(y, x) FROM testRegression
+-- !query schema
+struct<regr_slope(y, x):double>
+-- !query output
+0.8314087759815244
+
+
+-- !query
+SELECT regr_slope(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL
+-- !query schema
+struct<regr_slope(y, x):double>
+-- !query output
+0.8314087759815244
+
+
+-- !query
+SELECT k, regr_slope(y, x) FROM testRegression GROUP BY k
+-- !query schema
+struct<k:int,regr_slope(y, x):double>
+-- !query output
+1      NULL
+2      0.8314087759815244
+
+
+-- !query
+SELECT k, regr_slope(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL GROUP BY k
+-- !query schema
+struct<k:int,regr_slope(y, x):double>
+-- !query output
+2      0.8314087759815244
+
+
+-- !query
+SELECT regr_intercept(y, x) FROM testRegression
+-- !query schema
+struct<regr_intercept(y, x):double>
+-- !query output
+1.1547344110854487
+
+
+-- !query
+SELECT regr_intercept(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL
+-- !query schema
+struct<regr_intercept(y, x):double>
+-- !query output
+1.1547344110854487
+
+
+-- !query
+SELECT k, regr_intercept(y, x) FROM testRegression GROUP BY k
+-- !query schema
+struct<k:int,regr_intercept(y, x):double>
+-- !query output
+1      NULL
+2      1.1547344110854487
+
+
+-- !query
+SELECT k, regr_intercept(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL GROUP BY k
+-- !query schema
+struct<k:int,regr_intercept(y, x):double>
+-- !query output
+2      1.1547344110854487
diff --git a/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/velox/VeloxSQLQueryTestSettings.scala b/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/velox/VeloxSQLQueryTestSettings.scala
index 364da52e7..5067de74e 100644
--- a/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/velox/VeloxSQLQueryTestSettings.scala
+++ b/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/velox/VeloxSQLQueryTestSettings.scala
@@ -240,6 +240,8 @@ object VeloxSQLQueryTestSettings extends SQLQueryTestSettings {
     // -- Aggregate with nulls.
     "group-by.sql",
     "udf/udf-group-by.sql",
+    // Overwrite some results of regr_intercept, regr_r2, corr.
+    "linear-regression.sql",
     // Exception string doesn't match for
     // SELECT (SELECT a FROM (SELECT 1 AS a UNION ALL SELECT 2 AS a) t) AS b
     "subquery/scalar-subquery/scalar-subquery-select.sql"


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(incubator-gluten) branch main updated: [VL] Enable linear-regression.sql test case in GlutenSQLQueryTestSuite for Spark34 (#5306)

Reply via email to