This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch branch-3.3 in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.3 by this push: new 784cb0f68af [SPARK-40218][SQL] GROUPING SETS should preserve the grouping columns 784cb0f68af is described below commit 784cb0f68af84ac85e66b67439c8c7289f1c8b6e Author: Wenchen Fan <wenc...@databricks.com> AuthorDate: Fri Aug 26 15:24:01 2022 +0800 [SPARK-40218][SQL] GROUPING SETS should preserve the grouping columns ### What changes were proposed in this pull request? This PR fixes a bug caused by https://github.com/apache/spark/pull/32022 . Although we deprecate `GROUP BY ... GROUPING SETS ...`, it should still work if it worked before. https://github.com/apache/spark/pull/32022 made a mistake that it didn't preserve the order of user-specified group by columns. Usually it's not a problem, as `GROUP BY a, b` is no different from `GROUP BY b, a`. However, the `grouping_id(...)` function requires the input to be exactly the same with the group by columns. This PR fixes the problem by preserve the order of user-specified group by columns. ### Why are the changes needed? bug fix ### Does this PR introduce _any_ user-facing change? Yes, now a query that worked before 3.2 can work again. ### How was this patch tested? new test Closes #37655 from cloud-fan/grouping. Authored-by: Wenchen Fan <wenc...@databricks.com> Signed-off-by: Wenchen Fan <wenc...@databricks.com> (cherry picked from commit 1ed592ef28abdb14aa1d8c8a129d6ac3084ffb0c) Signed-off-by: Wenchen Fan <wenc...@databricks.com> --- .../org/apache/spark/sql/catalyst/expressions/grouping.scala | 9 +++++++-- sql/core/src/test/resources/sql-tests/inputs/grouping_set.sql | 3 +++ .../src/test/resources/sql-tests/results/grouping_set.sql.out | 11 +++++++++++ 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/grouping.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/grouping.scala index 8ce0e57b691..22e25b31f2e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/grouping.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/grouping.scala @@ -151,10 +151,15 @@ case class GroupingSets( override def selectedGroupByExprs: Seq[Seq[Expression]] = groupingSets // Includes the `userGivenGroupByExprs` in the children, which will be included in the final // GROUP BY expressions, so that `SELECT c ... GROUP BY (a, b, c) GROUPING SETS (a, b)` works. - override def children: Seq[Expression] = flatGroupingSets ++ userGivenGroupByExprs + // Note that, we must put `userGivenGroupByExprs` at the beginning, to preserve the order of + // grouping columns specified by users. For example, GROUP BY (a, b) GROUPING SETS (b, a), the + // final grouping columns should be (a, b). + override def children: Seq[Expression] = userGivenGroupByExprs ++ flatGroupingSets override protected def withNewChildrenInternal( newChildren: IndexedSeq[Expression]): GroupingSets = - super.legacyWithNewChildren(newChildren).asInstanceOf[GroupingSets] + copy( + userGivenGroupByExprs = newChildren.take(userGivenGroupByExprs.length), + flatGroupingSets = newChildren.drop(userGivenGroupByExprs.length)) } object GroupingSets { diff --git a/sql/core/src/test/resources/sql-tests/inputs/grouping_set.sql b/sql/core/src/test/resources/sql-tests/inputs/grouping_set.sql index d30914fdd92..4d516bdda7b 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/grouping_set.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/grouping_set.sql @@ -57,3 +57,6 @@ SELECT k1, k2, avg(v) FROM (VALUES (1,1,1),(2,2,2)) AS t(k1,k2,v) GROUP BY GROUP SELECT grouping__id, k1, k2, avg(v) FROM (VALUES (1,1,1),(2,2,2)) AS t(k1,k2,v) GROUP BY GROUPING SETS ((k1),(k1,k2),(k2,k1)); SELECT grouping(k1), k1, k2, avg(v) FROM (VALUES (1,1,1),(2,2,2)) AS t(k1,k2,v) GROUP BY GROUPING SETS ((k1),(k1,k2),(k2,k1)); + +-- grouping_id function +SELECT grouping_id(k1, k2), avg(v) from (VALUES (1,1,1),(2,2,2)) AS t(k1,k2,v) GROUP BY k1, k2 GROUPING SETS ((k2, k1), k1); diff --git a/sql/core/src/test/resources/sql-tests/results/grouping_set.sql.out b/sql/core/src/test/resources/sql-tests/results/grouping_set.sql.out index 21c13af560d..d89050ab6d8 100644 --- a/sql/core/src/test/resources/sql-tests/results/grouping_set.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/grouping_set.sql.out @@ -205,3 +205,14 @@ struct<grouping(k1):tinyint,k1:int,k2:int,avg(v):double> 0 2 2 2.0 0 2 2 2.0 0 2 NULL 2.0 + + +-- !query +SELECT grouping_id(k1, k2), avg(v) from (VALUES (1,1,1),(2,2,2)) AS t(k1,k2,v) GROUP BY k1, k2 GROUPING SETS ((k2, k1), k1) +-- !query schema +struct<grouping_id(k1, k2):bigint,avg(v):double> +-- !query output +0 1.0 +0 2.0 +1 1.0 +1 2.0 --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org