This is an automated email from the ASF dual-hosted git repository.
yangzy pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git
The following commit(s) were added to refs/heads/main by this push:
new 383b99912 [VL] Enable linear-regression.sql test case in GlutenSQLQueryTestSuite for Spark34 (#5306)
383b99912 is described below
commit 383b99912187e52d7d897709b069165c5ce5556d
Author: Joey <[email protected]>
AuthorDate: Wed Apr 10 11:50:14 2024 +0800
[VL] Enable linear-regression.sql test case in GlutenSQLQueryTestSuite for Spark34 (#5306)
---
.../sql-tests/inputs/linear-regression.sql | 52 ++++
.../sql-tests/results/linear-regression.sql.out | 276 +++++++++++++++++++++
.../utils/velox/VeloxSQLQueryTestSettings.scala | 2 +
3 files changed, 330 insertions(+)
diff --git a/gluten-ut/spark34/src/test/resources/sql-tests/inputs/linear-regression.sql b/gluten-ut/spark34/src/test/resources/sql-tests/inputs/linear-regression.sql
new file mode 100644
index 000000000..c7cb5bf11
--- /dev/null
+++ b/gluten-ut/spark34/src/test/resources/sql-tests/inputs/linear-regression.sql
@@ -0,0 +1,52 @@
+-- Test data.
+CREATE OR REPLACE TEMPORARY VIEW testRegression AS SELECT * FROM VALUES
+(1, 10, null), (2, 10, 11), (2, 20, 22), (2, 25, null), (2, 30, 35)
+AS testRegression(k, y, x);
+
+-- SPARK-37613: Support ANSI Aggregate Function: regr_count
+SELECT regr_count(y, x) FROM testRegression;
+SELECT regr_count(y, x) FROM testRegression WHERE x IS NOT NULL;
+SELECT k, count(*), regr_count(y, x) FROM testRegression GROUP BY k;
+SELECT k, count(*) FILTER (WHERE x IS NOT NULL), regr_count(y, x) FROM testRegression GROUP BY k;
+
+-- SPARK-37613: Support ANSI Aggregate Function: regr_r2
+SELECT regr_r2(y, x) FROM testRegression;
+SELECT regr_r2(y, x) FROM testRegression WHERE x IS NOT NULL;
+SELECT k, corr(y, x), regr_r2(y, x) FROM testRegression GROUP BY k;
+SELECT k, corr(y, x) FILTER (WHERE x IS NOT NULL), regr_r2(y, x) FROM testRegression GROUP BY k;
+
+-- SPARK-37614: Support ANSI Aggregate Function: regr_avgx & regr_avgy
+SELECT regr_avgx(y, x), regr_avgy(y, x) FROM testRegression;
+SELECT regr_avgx(y, x), regr_avgy(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL;
+SELECT k, avg(x), avg(y), regr_avgx(y, x), regr_avgy(y, x) FROM testRegression GROUP BY k;
+SELECT k, avg(x) FILTER (WHERE x IS NOT NULL AND y IS NOT NULL), avg(y) FILTER (WHERE x IS NOT NULL AND y IS NOT NULL), regr_avgx(y, x), regr_avgy(y, x) FROM testRegression GROUP BY k;
+
+-- SPARK-37672: Support ANSI Aggregate Function: regr_sxx
+SELECT regr_sxx(y, x) FROM testRegression;
+SELECT regr_sxx(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL;
+SELECT k, regr_sxx(y, x) FROM testRegression GROUP BY k;
+SELECT k, regr_sxx(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL GROUP BY k;
+
+-- SPARK-37681: Support ANSI Aggregate Function: regr_sxy
+SELECT regr_sxy(y, x) FROM testRegression;
+SELECT regr_sxy(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL;
+SELECT k, regr_sxy(y, x) FROM testRegression GROUP BY k;
+SELECT k, regr_sxy(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL GROUP BY k;
+
+-- SPARK-37702: Support ANSI Aggregate Function: regr_syy
+SELECT regr_syy(y, x) FROM testRegression;
+SELECT regr_syy(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL;
+SELECT k, regr_syy(y, x) FROM testRegression GROUP BY k;
+SELECT k, regr_syy(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL GROUP BY k;
+
+-- SPARK-39230: Support ANSI Aggregate Function: regr_slope
+SELECT regr_slope(y, x) FROM testRegression;
+SELECT regr_slope(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL;
+SELECT k, regr_slope(y, x) FROM testRegression GROUP BY k;
+SELECT k, regr_slope(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL GROUP BY k;
+
+-- SPARK-37623: Support ANSI Aggregate Function: regr_intercept
+SELECT regr_intercept(y, x) FROM testRegression;
+SELECT regr_intercept(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL;
+SELECT k, regr_intercept(y, x) FROM testRegression GROUP BY k;
+SELECT k, regr_intercept(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL GROUP BY k;
diff --git a/gluten-ut/spark34/src/test/resources/sql-tests/results/linear-regression.sql.out b/gluten-ut/spark34/src/test/resources/sql-tests/results/linear-regression.sql.out
new file mode 100644
index 000000000..a73347a2f
--- /dev/null
+++ b/gluten-ut/spark34/src/test/resources/sql-tests/results/linear-regression.sql.out
@@ -0,0 +1,276 @@
+-- Automatically generated by SQLQueryTestSuite
+-- !query
+CREATE OR REPLACE TEMPORARY VIEW testRegression AS SELECT * FROM VALUES
+(1, 10, null), (2, 10, 11), (2, 20, 22), (2, 25, null), (2, 30, 35)
+AS testRegression(k, y, x)
+-- !query schema
+struct<>
+-- !query output
+
+
+
+-- !query
+SELECT regr_count(y, x) FROM testRegression
+-- !query schema
+struct<regr_count(y, x):bigint>
+-- !query output
+3
+
+
+-- !query
+SELECT regr_count(y, x) FROM testRegression WHERE x IS NOT NULL
+-- !query schema
+struct<regr_count(y, x):bigint>
+-- !query output
+3
+
+
+-- !query
+SELECT k, count(*), regr_count(y, x) FROM testRegression GROUP BY k
+-- !query schema
+struct<k:int,count(1):bigint,regr_count(y, x):bigint>
+-- !query output
+1 1 0
+2 4 3
+
+
+-- !query
+SELECT k, count(*) FILTER (WHERE x IS NOT NULL), regr_count(y, x) FROM testRegression GROUP BY k
+-- !query schema
+struct<k:int,count(1) FILTER (WHERE (x IS NOT NULL)):bigint,regr_count(y, x):bigint>
+-- !query output
+1 0 0
+2 3 3
+
+
+-- !query
+SELECT regr_r2(y, x) FROM testRegression
+-- !query schema
+struct<regr_r2(y, x):double>
+-- !query output
+0.9976905311778291
+
+
+-- !query
+SELECT regr_r2(y, x) FROM testRegression WHERE x IS NOT NULL
+-- !query schema
+struct<regr_r2(y, x):double>
+-- !query output
+0.9976905311778291
+
+
+-- !query
+SELECT k, corr(y, x), regr_r2(y, x) FROM testRegression GROUP BY k
+-- !query schema
+struct<k:int,corr(y, x):double,regr_r2(y, x):double>
+-- !query output
+1 NULL NULL
+2 0.9988445981121532 0.9976905311778291
+
+
+-- !query
+SELECT k, corr(y, x) FILTER (WHERE x IS NOT NULL), regr_r2(y, x) FROM testRegression GROUP BY k
+-- !query schema
+struct<k:int,corr(y, x) FILTER (WHERE (x IS NOT NULL)):double,regr_r2(y, x):double>
+-- !query output
+1 NULL NULL
+2 0.9988445981121532 0.9976905311778291
+
+
+-- !query
+SELECT regr_avgx(y, x), regr_avgy(y, x) FROM testRegression
+-- !query schema
+struct<regr_avgx(y, x):double,regr_avgy(y, x):double>
+-- !query output
+22.666666666666668 20.0
+
+
+-- !query
+SELECT regr_avgx(y, x), regr_avgy(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL
+-- !query schema
+struct<regr_avgx(y, x):double,regr_avgy(y, x):double>
+-- !query output
+22.666666666666668 20.0
+
+
+-- !query
+SELECT k, avg(x), avg(y), regr_avgx(y, x), regr_avgy(y, x) FROM testRegression GROUP BY k
+-- !query schema
+struct<k:int,avg(x):double,avg(y):double,regr_avgx(y, x):double,regr_avgy(y, x):double>
+-- !query output
+1 NULL 10.0 NULL NULL
+2 22.666666666666668 21.25 22.666666666666668 20.0
+
+
+-- !query
+SELECT k, avg(x) FILTER (WHERE x IS NOT NULL AND y IS NOT NULL), avg(y) FILTER (WHERE x IS NOT NULL AND y IS NOT NULL), regr_avgx(y, x), regr_avgy(y, x) FROM testRegression GROUP BY k
+-- !query schema
+struct<k:int,avg(x) FILTER (WHERE ((x IS NOT NULL) AND (y IS NOT NULL))):double,avg(y) FILTER (WHERE ((x IS NOT NULL) AND (y IS NOT NULL))):double,regr_avgx(y, x):double,regr_avgy(y, x):double>
+-- !query output
+1 NULL NULL NULL NULL
+2 22.666666666666668 20.0 22.666666666666668 20.0
+
+
+-- !query
+SELECT regr_sxx(y, x) FROM testRegression
+-- !query schema
+struct<regr_sxx(y, x):double>
+-- !query output
+288.66666666666663
+
+
+-- !query
+SELECT regr_sxx(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL
+-- !query schema
+struct<regr_sxx(y, x):double>
+-- !query output
+288.66666666666663
+
+
+-- !query
+SELECT k, regr_sxx(y, x) FROM testRegression GROUP BY k
+-- !query schema
+struct<k:int,regr_sxx(y, x):double>
+-- !query output
+1 NULL
+2 288.66666666666663
+
+
+-- !query
+SELECT k, regr_sxx(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL GROUP BY k
+-- !query schema
+struct<k:int,regr_sxx(y, x):double>
+-- !query output
+2 288.66666666666663
+
+
+-- !query
+SELECT regr_sxy(y, x) FROM testRegression
+-- !query schema
+struct<regr_sxy(y, x):double>
+-- !query output
+240.0
+
+
+-- !query
+SELECT regr_sxy(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL
+-- !query schema
+struct<regr_sxy(y, x):double>
+-- !query output
+240.0
+
+
+-- !query
+SELECT k, regr_sxy(y, x) FROM testRegression GROUP BY k
+-- !query schema
+struct<k:int,regr_sxy(y, x):double>
+-- !query output
+1 NULL
+2 240.0
+
+
+-- !query
+SELECT k, regr_sxy(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL GROUP BY k
+-- !query schema
+struct<k:int,regr_sxy(y, x):double>
+-- !query output
+2 240.0
+
+
+-- !query
+SELECT regr_syy(y, x) FROM testRegression
+-- !query schema
+struct<regr_syy(y, x):double>
+-- !query output
+200.0
+
+
+-- !query
+SELECT regr_syy(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL
+-- !query schema
+struct<regr_syy(y, x):double>
+-- !query output
+200.0
+
+
+-- !query
+SELECT k, regr_syy(y, x) FROM testRegression GROUP BY k
+-- !query schema
+struct<k:int,regr_syy(y, x):double>
+-- !query output
+1 NULL
+2 200.0
+
+
+-- !query
+SELECT k, regr_syy(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL GROUP BY k
+-- !query schema
+struct<k:int,regr_syy(y, x):double>
+-- !query output
+2 200.0
+
+
+-- !query
+SELECT regr_slope(y, x) FROM testRegression
+-- !query schema
+struct<regr_slope(y, x):double>
+-- !query output
+0.8314087759815244
+
+
+-- !query
+SELECT regr_slope(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL
+-- !query schema
+struct<regr_slope(y, x):double>
+-- !query output
+0.8314087759815244
+
+
+-- !query
+SELECT k, regr_slope(y, x) FROM testRegression GROUP BY k
+-- !query schema
+struct<k:int,regr_slope(y, x):double>
+-- !query output
+1 NULL
+2 0.8314087759815244
+
+
+-- !query
+SELECT k, regr_slope(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL GROUP BY k
+-- !query schema
+struct<k:int,regr_slope(y, x):double>
+-- !query output
+2 0.8314087759815244
+
+
+-- !query
+SELECT regr_intercept(y, x) FROM testRegression
+-- !query schema
+struct<regr_intercept(y, x):double>
+-- !query output
+1.1547344110854487
+
+
+-- !query
+SELECT regr_intercept(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL
+-- !query schema
+struct<regr_intercept(y, x):double>
+-- !query output
+1.1547344110854487
+
+
+-- !query
+SELECT k, regr_intercept(y, x) FROM testRegression GROUP BY k
+-- !query schema
+struct<k:int,regr_intercept(y, x):double>
+-- !query output
+1 NULL
+2 1.1547344110854487
+
+
+-- !query
+SELECT k, regr_intercept(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL GROUP BY k
+-- !query schema
+struct<k:int,regr_intercept(y, x):double>
+-- !query output
+2 1.1547344110854487
diff --git a/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/velox/VeloxSQLQueryTestSettings.scala b/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/velox/VeloxSQLQueryTestSettings.scala
index 364da52e7..5067de74e 100644
--- a/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/velox/VeloxSQLQueryTestSettings.scala
+++ b/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/velox/VeloxSQLQueryTestSettings.scala
@@ -240,6 +240,8 @@ object VeloxSQLQueryTestSettings extends SQLQueryTestSettings {
// -- Aggregate with nulls.
"group-by.sql",
"udf/udf-group-by.sql",
+ // Overwrite some results of regr_intercept, regr_r2, corr.
+ "linear-regression.sql",
// Exception string doesn't match for
// SELECT (SELECT a FROM (SELECT 1 AS a UNION ALL SELECT 2 AS a) t) AS b
"subquery/scalar-subquery/scalar-subquery-select.sql"
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]