This is an automated email from the ASF dual-hosted git repository.
ulyssesyou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git
The following commit(s) were added to refs/heads/main by this push:
new 657966e07 [VL] Fix spark34 group-by.sql(.out) in GlutenSQLQueryTestSuite (#5162)
657966e07 is described below
commit 657966e0720ede25a5dcd5b3f8cb6cd719db6eab
Author: Joey <[email protected]>
AuthorDate: Thu Mar 28 14:05:30 2024 +0800
[VL] Fix spark34 group-by.sql(.out) in GlutenSQLQueryTestSuite (#5162)
---
.../utils/velox/VeloxSQLQueryTestSettings.scala | 2 +-
.../test/resources/sql-tests/inputs/group-by.sql | 65 ++-----
.../resources/sql-tests/results/group-by.sql.out | 192 +++++----------------
.../utils/velox/VeloxSQLQueryTestSettings.scala | 2 +-
4 files changed, 61 insertions(+), 200 deletions(-)
diff --git a/gluten-ut/spark33/src/test/scala/io/glutenproject/utils/velox/VeloxSQLQueryTestSettings.scala b/gluten-ut/spark33/src/test/scala/io/glutenproject/utils/velox/VeloxSQLQueryTestSettings.scala
index 11f77b9e8..26428b3a4 100644
--- a/gluten-ut/spark33/src/test/scala/io/glutenproject/utils/velox/VeloxSQLQueryTestSettings.scala
+++ b/gluten-ut/spark33/src/test/scala/io/glutenproject/utils/velox/VeloxSQLQueryTestSettings.scala
@@ -234,7 +234,7 @@ object VeloxSQLQueryTestSettings extends SQLQueryTestSettings {
private val OVERWRITE_SQL_QUERY_LIST: Set[String] = Set(
// Velox corr has better computation logic but it fails Spark's precision check.
// Remove -- SPARK-24369 multiple distinct aggregations having the same argument set,
- // -- SPARK-37613: Support ANSI Aggregate Function: regr_r2
+ // -- SPARK-37613: Support ANSI Aggregate Function: regr_r2
"group-by.sql",
// Remove -- SPARK-24369 multiple distinct aggregations having the same argument set
"udf/udf-group-by.sql"
diff --git a/gluten-ut/spark34/src/test/resources/sql-tests/inputs/group-by.sql b/gluten-ut/spark34/src/test/resources/sql-tests/inputs/group-by.sql
index 331cd9440..b618ad1d5 100644
--- a/gluten-ut/spark34/src/test/resources/sql-tests/inputs/group-by.sql
+++ b/gluten-ut/spark34/src/test/resources/sql-tests/inputs/group-by.sql
@@ -7,12 +7,6 @@
CREATE OR REPLACE TEMPORARY VIEW testData AS SELECT * FROM VALUES
(1, 1), (1, 2), (2, 1), (2, 2), (3, 1), (3, 2), (null, 1), (3, null), (null,
null)
AS testData(a, b);
-CREATE OR REPLACE TEMPORARY VIEW testRegression AS SELECT * FROM VALUES
-(1, 10, null), (2, 10, 11), (2, 20, 22), (2, 25, null), (2, 30, 35)
-AS testRegression(k, y, x);
-CREATE OR REPLACE TEMPORARY VIEW aggr AS SELECT * FROM VALUES
-(0, 0), (0, 10), (0, 20), (0, 30), (0, 40), (1, 10), (1, 20), (2, 10), (2,
20), (2, 25), (2, 30), (3, 60), (4, null)
-AS aggr(k, v);
-- Aggregate with empty GroupBy expressions.
SELECT a, COUNT(b) FROM testData;
@@ -40,6 +34,9 @@ SELECT a + b, COUNT(b) FROM testData GROUP BY a + b;
SELECT a + 2, COUNT(b) FROM testData GROUP BY a + 1;
SELECT a + 1 + 1, COUNT(b) FROM testData GROUP BY a + 1;
+-- struct() in group by
+SELECT count(1) FROM testData GROUP BY struct(a + 0.1 AS aa);
+
-- Aggregate with nulls.
SELECT SKEWNESS(a), KURTOSIS(a), MIN(a), MAX(a), AVG(a), VARIANCE(a),
STDDEV(a), SUM(a), COUNT(a)
FROM testData;
@@ -233,17 +230,6 @@ FROM VALUES (CAST(NULL AS DOUBLE)), (CAST(NULL AS
DOUBLE)), (CAST(NULL AS DOUBLE
SELECT histogram_numeric(col, 3)
FROM VALUES (CAST(NULL AS INT)), (CAST(NULL AS INT)), (CAST(NULL AS INT)) AS
tab(col);
-
--- SPARK-37613: Support ANSI Aggregate Function: regr_count
-SELECT regr_count(y, x) FROM testRegression;
-SELECT regr_count(y, x) FROM testRegression WHERE x IS NOT NULL;
-SELECT k, count(*), regr_count(y, x) FROM testRegression GROUP BY k;
-SELECT k, count(*) FILTER (WHERE x IS NOT NULL), regr_count(y, x) FROM
testRegression GROUP BY k;
-
--- SPARK-37613: Support ANSI Aggregate Function: regr_r2
-SELECT regr_r2(y, x) FROM testRegression;
-SELECT regr_r2(y, x) FROM testRegression WHERE x IS NOT NULL;
-
-- SPARK-27974: Support ANSI Aggregate Function: array_agg
SELECT
collect_list(col),
@@ -258,34 +244,19 @@ FROM VALUES
(1,4),(2,3),(1,4),(2,4) AS v(a,b)
GROUP BY a;
--- SPARK-37614: Support ANSI Aggregate Function: regr_avgx & regr_avgy
-SELECT regr_avgx(y, x), regr_avgy(y, x) FROM testRegression;
-SELECT regr_avgx(y, x), regr_avgy(y, x) FROM testRegression WHERE x IS NOT
NULL AND y IS NOT NULL;
-SELECT k, avg(x), avg(y), regr_avgx(y, x), regr_avgy(y, x) FROM testRegression
GROUP BY k;
-SELECT k, avg(x) FILTER (WHERE x IS NOT NULL AND y IS NOT NULL), avg(y) FILTER
(WHERE x IS NOT NULL AND y IS NOT NULL), regr_avgx(y, x), regr_avgy(y, x) FROM
testRegression GROUP BY k;
--- SPARK-37676: Support ANSI Aggregation Function: percentile_cont
-SELECT
- percentile_cont(0.25) WITHIN GROUP (ORDER BY v),
- percentile_cont(0.25) WITHIN GROUP (ORDER BY v DESC)
-FROM aggr;
-SELECT
- k,
- percentile_cont(0.25) WITHIN GROUP (ORDER BY v),
- percentile_cont(0.25) WITHIN GROUP (ORDER BY v DESC)
-FROM aggr
-GROUP BY k
-ORDER BY k;
-
--- SPARK-37691: Support ANSI Aggregation Function: percentile_disc
-SELECT
- percentile_disc(0.25) WITHIN GROUP (ORDER BY v),
- percentile_disc(0.25) WITHIN GROUP (ORDER BY v DESC)
-FROM aggr;
-SELECT
- k,
- percentile_disc(0.25) WITHIN GROUP (ORDER BY v),
- percentile_disc(0.25) WITHIN GROUP (ORDER BY v DESC)
-FROM aggr
-GROUP BY k
-ORDER BY k;
+SELECT mode(a), mode(b) FROM testData;
+SELECT a, mode(b) FROM testData GROUP BY a ORDER BY a;
+
+
+-- SPARK-44846: PushFoldableIntoBranches in complex grouping expressions cause bindReference error
+SELECT c * 2 AS d
+FROM (
+ SELECT if(b > 1, 1, b) AS c
+ FROM (
+ SELECT if(a < 0, 0, a) AS b
+ FROM VALUES (-1), (1), (2) AS t1(a)
+ ) t2
+ GROUP BY b
+ ) t3
+GROUP BY c;
diff --git a/gluten-ut/spark34/src/test/resources/sql-tests/results/group-by.sql.out b/gluten-ut/spark34/src/test/resources/sql-tests/results/group-by.sql.out
index eb0496194..581446305 100644
--- a/gluten-ut/spark34/src/test/resources/sql-tests/results/group-by.sql.out
+++ b/gluten-ut/spark34/src/test/resources/sql-tests/results/group-by.sql.out
@@ -1,7 +1,4 @@
-- Automatically generated by SQLQueryTestSuite
--- Number of queries: 101
-
-
-- !query
CREATE OR REPLACE TEMPORARY VIEW testData AS SELECT * FROM VALUES
(1, 1), (1, 2), (2, 1), (2, 2), (3, 1), (3, 2), (null, 1), (3, null), (null,
null)
@@ -12,26 +9,6 @@ struct<>
--- !query
-CREATE OR REPLACE TEMPORARY VIEW testRegression AS SELECT * FROM VALUES
-(1, 10, null), (2, 10, 11), (2, 20, 22), (2, 25, null), (2, 30, 35)
-AS testRegression(k, y, x)
--- !query schema
-struct<>
--- !query output
-
-
-
--- !query
-CREATE OR REPLACE TEMPORARY VIEW aggr AS SELECT * FROM VALUES
-(0, 0), (0, 10), (0, 20), (0, 30), (0, 40), (1, 10), (1, 20), (2, 10), (2,
20), (2, 25), (2, 30), (3, 60), (4, null)
-AS aggr(k, v)
--- !query schema
-struct<>
--- !query output
-
-
-
-- !query
SELECT a, COUNT(b) FROM testData
-- !query schema
@@ -168,6 +145,17 @@ struct<((a + 1) + 1):int,count(b):bigint>
NULL 1
+-- !query
+SELECT count(1) FROM testData GROUP BY struct(a + 0.1 AS aa)
+-- !query schema
+struct<count(1):bigint>
+-- !query output
+2
+2
+2
+3
+
+
-- !query
SELECT SKEWNESS(a), KURTOSIS(a), MIN(a), MAX(a), AVG(a), VARIANCE(a),
STDDEV(a), SUM(a), COUNT(a)
FROM testData
@@ -274,6 +262,7 @@ org.apache.spark.sql.AnalysisException
}
}
+
-- !query
set spark.sql.groupByAliases=false
-- !query schema
@@ -304,6 +293,7 @@ org.apache.spark.sql.AnalysisException
} ]
}
+
-- !query
SELECT a, COUNT(1) FROM testData WHERE false GROUP BY a
-- !query schema
@@ -341,7 +331,6 @@ struct<1:int>
-- !query output
-
-- !query
SELECT 1 FROM range(10) HAVING true
-- !query schema
@@ -376,6 +365,7 @@ org.apache.spark.sql.AnalysisException
} ]
}
+
-- !query
SET spark.sql.legacy.parser.havingWithoutGroupByAsWhere=true
-- !query schema
@@ -423,6 +413,7 @@ org.apache.spark.sql.AnalysisException
} ]
}
+
-- !query
SELECT id FROM range(10) HAVING id > 0
-- !query schema
@@ -569,6 +560,7 @@ org.apache.spark.sql.AnalysisException
} ]
}
+
-- !query
SELECT some(1S)
-- !query schema
@@ -594,6 +586,7 @@ org.apache.spark.sql.AnalysisException
} ]
}
+
-- !query
SELECT any(1L)
-- !query schema
@@ -619,6 +612,7 @@ org.apache.spark.sql.AnalysisException
} ]
}
+
-- !query
SELECT every("true")
-- !query schema
@@ -644,6 +638,7 @@ org.apache.spark.sql.AnalysisException
} ]
}
+
-- !query
SELECT bool_and(1.0)
-- !query schema
@@ -1054,56 +1049,6 @@ struct<histogram_numeric(col,
3):array<struct<x:int,y:double>>>
NULL
--- !query
-SELECT regr_count(y, x) FROM testRegression
--- !query schema
-struct<regr_count(y, x):bigint>
--- !query output
-3
-
-
--- !query
-SELECT regr_count(y, x) FROM testRegression WHERE x IS NOT NULL
--- !query schema
-struct<regr_count(y, x):bigint>
--- !query output
-3
-
-
--- !query
-SELECT k, count(*), regr_count(y, x) FROM testRegression GROUP BY k
--- !query schema
-struct<k:int,count(1):bigint,regr_count(y, x):bigint>
--- !query output
-1 1 0
-2 4 3
-
-
--- !query
-SELECT k, count(*) FILTER (WHERE x IS NOT NULL), regr_count(y, x) FROM
testRegression GROUP BY k
--- !query schema
-struct<k:int,count(1) FILTER (WHERE (x IS NOT NULL)):bigint,regr_count(y,
x):bigint>
--- !query output
-1 0 0
-2 3 3
-
-
--- !query
-SELECT regr_r2(y, x) FROM testRegression
--- !query schema
-struct<regr_r2(y, x):double>
--- !query output
-0.997690531177829
-
-
--- !query
-SELECT regr_r2(y, x) FROM testRegression WHERE x IS NOT NULL
--- !query schema
-struct<regr_r2(y, x):double>
--- !query output
-0.997690531177829
-
-
-- !query
SELECT
collect_list(col),
@@ -1132,92 +1077,37 @@
struct<a:int,collect_list(b):array<int>,collect_list(b):array<int>>
-- !query
-SELECT regr_avgx(y, x), regr_avgy(y, x) FROM testRegression
--- !query schema
-struct<regr_avgx(y, x):double,regr_avgy(y, x):double>
--- !query output
-22.666666666666668 20.0
-
-
--- !query
-SELECT regr_avgx(y, x), regr_avgy(y, x) FROM testRegression WHERE x IS NOT
NULL AND y IS NOT NULL
+SELECT mode(a), mode(b) FROM testData
-- !query schema
-struct<regr_avgx(y, x):double,regr_avgy(y, x):double>
+struct<mode(a):int,mode(b):int>
-- !query output
-22.666666666666668 20.0
+3 1
-- !query
-SELECT k, avg(x), avg(y), regr_avgx(y, x), regr_avgy(y, x) FROM testRegression
GROUP BY k
+SELECT a, mode(b) FROM testData GROUP BY a ORDER BY a
-- !query schema
-struct<k:int,avg(x):double,avg(y):double,regr_avgx(y, x):double,regr_avgy(y,
x):double>
+struct<a:int,mode(b):int>
-- !query output
-1 NULL 10.0 NULL NULL
-2 22.666666666666668 21.25 22.666666666666668 20.0
-
-
--- !query
-SELECT k, avg(x) FILTER (WHERE x IS NOT NULL AND y IS NOT NULL), avg(y) FILTER
(WHERE x IS NOT NULL AND y IS NOT NULL), regr_avgx(y, x), regr_avgy(y, x) FROM
testRegression GROUP BY k
--- !query schema
-struct<k:int,avg(x) FILTER (WHERE ((x IS NOT NULL) AND (y IS NOT
NULL))):double,avg(y) FILTER (WHERE ((x IS NOT NULL) AND (y IS NOT
NULL))):double,regr_avgx(y, x):double,regr_avgy(y, x):double>
--- !query output
-1 NULL NULL NULL NULL
-2 22.666666666666668 20.0 22.666666666666668 20.0
-
-
--- !query
-SELECT
- percentile_cont(0.25) WITHIN GROUP (ORDER BY v),
- percentile_cont(0.25) WITHIN GROUP (ORDER BY v DESC)
-FROM aggr
--- !query schema
-struct<percentile_cont(0.25) WITHIN GROUP (ORDER BY
v):double,percentile_cont(0.25) WITHIN GROUP (ORDER BY v DESC):double>
--- !query output
-10.0 30.0
-
-
--- !query
-SELECT
- k,
- percentile_cont(0.25) WITHIN GROUP (ORDER BY v),
- percentile_cont(0.25) WITHIN GROUP (ORDER BY v DESC)
-FROM aggr
-GROUP BY k
-ORDER BY k
--- !query schema
-struct<k:int,percentile_cont(0.25) WITHIN GROUP (ORDER BY
v):double,percentile_cont(0.25) WITHIN GROUP (ORDER BY v DESC):double>
--- !query output
-0 10.0 30.0
-1 12.5 17.5
-2 17.5 26.25
-3 60.0 60.0
-4 NULL NULL
+NULL 1
+1 1
+2 1
+3 1
-- !query
-SELECT
- percentile_disc(0.25) WITHIN GROUP (ORDER BY v),
- percentile_disc(0.25) WITHIN GROUP (ORDER BY v DESC)
-FROM aggr
+SELECT c * 2 AS d
+FROM (
+ SELECT if(b > 1, 1, b) AS c
+ FROM (
+ SELECT if(a < 0, 0, a) AS b
+ FROM VALUES (-1), (1), (2) AS t1(a)
+ ) t2
+ GROUP BY b
+ ) t3
+GROUP BY c
-- !query schema
-struct<percentile_disc(0.25) WITHIN GROUP (ORDER BY
v):double,percentile_disc(0.25) WITHIN GROUP (ORDER BY v DESC):double>
+struct<d:int>
-- !query output
-10.0 30.0
-
-
--- !query
-SELECT
- k,
- percentile_disc(0.25) WITHIN GROUP (ORDER BY v),
- percentile_disc(0.25) WITHIN GROUP (ORDER BY v DESC)
-FROM aggr
-GROUP BY k
-ORDER BY k
--- !query schema
-struct<k:int,percentile_disc(0.25) WITHIN GROUP (ORDER BY
v):double,percentile_disc(0.25) WITHIN GROUP (ORDER BY v DESC):double>
--- !query output
-0 10.0 30.0
-1 10.0 20.0
-2 10.0 30.0
-3 60.0 60.0
-4 NULL NULL
+0
+2
diff --git a/gluten-ut/spark34/src/test/scala/io/glutenproject/utils/velox/VeloxSQLQueryTestSettings.scala b/gluten-ut/spark34/src/test/scala/io/glutenproject/utils/velox/VeloxSQLQueryTestSettings.scala
index 81c664526..8f22a2d0c 100644
--- a/gluten-ut/spark34/src/test/scala/io/glutenproject/utils/velox/VeloxSQLQueryTestSettings.scala
+++ b/gluten-ut/spark34/src/test/scala/io/glutenproject/utils/velox/VeloxSQLQueryTestSettings.scala
@@ -63,7 +63,6 @@ object VeloxSQLQueryTestSettings extends SQLQueryTestSettings {
"extract.sql",
"group-by-filter.sql",
"group-by-ordinal.sql",
- "group-by.sql",
"grouping_set.sql",
"having.sql",
"ignored.sql",
@@ -236,6 +235,7 @@ object VeloxSQLQueryTestSettings extends SQLQueryTestSettings {
val OVERWRITE_SQL_QUERY_LIST: Set[String] = Set(
// Velox corr has better computation logic but it fails Spark's precision check.
// Remove -- SPARK-24369 multiple distinct aggregations having the same argument set
+ "group-by.sql",
"udf/udf-group-by.sql",
// Exception string doesn't match for
// SELECT (SELECT a FROM (SELECT 1 AS a UNION ALL SELECT 2 AS a) t) AS b
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]