maropu commented on a change in pull request #30212:
URL: https://github.com/apache/spark/pull/30212#discussion_r532044070
##########
File path:
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
##########
@@ -1529,7 +1475,25 @@ class Analyzer(override val catalogManager:
CatalogManager)
}
val resolvedGroupingExprs = a.groupingExpressions
- .map(resolveExpressionTopDown(_, planForResolve, trimAlias = true))
+ .map {
+ case c @ Cube(groupingSets) =>
+ c.copy(groupingSets =
+ groupingSets.map(_.map(resolveExpressionTopDown(_, a,
trimAlias = true))
Review comment:
Hm, I see. Since these extra entries for `GroupingSet` make the analyzer
more complicated, could you find another approach for removing them? IIUC, these
entries are needed because `GroupingSet` holds `groupingSets` as non-children
exprs. For example, the `groupingSets` exprs and their children
(`groupByExprs` exprs) look duplicated, so is it possible to replace the
`groupingSets` exprs with integer indices into `groupByExprs`?
##########
File path:
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/grouping.scala
##########
@@ -39,45 +41,64 @@ trait GroupingSet extends Expression with CodegenFallback {
override def eval(input: InternalRow): Any = throw new
UnsupportedOperationException
}
-// scalastyle:off line.size.limit line.contains.tab
-@ExpressionDescription(
- usage = """
- _FUNC_([col1[, col2 ..]]) - create a multi-dimensional cube using the
specified columns
- so that we can run aggregation on them.
- """,
- examples = """
- Examples:
- > SELECT name, age, count(*) FROM VALUES (2, 'Alice'), (5, 'Bob')
people(age, name) GROUP BY _FUNC_(name, age);
- Bob 5 1
- Alice 2 1
- Alice NULL 1
- NULL 2 1
- NULL NULL 2
- Bob NULL 1
- NULL 5 1
- """,
- since = "2.0.0")
-// scalastyle:on line.size.limit line.contains.tab
-case class Cube(groupByExprs: Seq[Expression]) extends GroupingSet {}
+object GroupingSet {
+ /*
+ * GROUP BY a, b, c WITH ROLLUP
Review comment:
(I know this is not your mistake though) could you remove the single
leading space?
##########
File path:
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/grouping.scala
##########
@@ -39,45 +41,64 @@ trait GroupingSet extends Expression with CodegenFallback {
override def eval(input: InternalRow): Any = throw new
UnsupportedOperationException
}
-// scalastyle:off line.size.limit line.contains.tab
-@ExpressionDescription(
- usage = """
- _FUNC_([col1[, col2 ..]]) - create a multi-dimensional cube using the
specified columns
- so that we can run aggregation on them.
- """,
- examples = """
- Examples:
- > SELECT name, age, count(*) FROM VALUES (2, 'Alice'), (5, 'Bob')
people(age, name) GROUP BY _FUNC_(name, age);
- Bob 5 1
- Alice 2 1
- Alice NULL 1
- NULL 2 1
- NULL NULL 2
- Bob NULL 1
- NULL 5 1
- """,
- since = "2.0.0")
-// scalastyle:on line.size.limit line.contains.tab
-case class Cube(groupByExprs: Seq[Expression]) extends GroupingSet {}
+object GroupingSet {
+ /*
+ * GROUP BY a, b, c WITH ROLLUP
+ * is equivalent to
+ * GROUP BY a, b, c GROUPING SETS ( (a, b, c), (a, b), (a), ( ) ).
+ * Group Count: N + 1 (N is the number of group expressions)
+ *
+ * We need to get all of its subsets for the rule described above, the
subset is
+ * represented as sequence of expressions.
+ */
+ def rollupExprs(exprs: Seq[Seq[Expression]]): Seq[Seq[Expression]] =
+ exprs.inits.map(_.flatten).toIndexedSeq
-// scalastyle:off line.size.limit line.contains.tab
-@ExpressionDescription(
- usage = """
- _FUNC_([col1[, col2 ..]]) - create a multi-dimensional rollup using the
specified columns
- so that we can run aggregation on them.
- """,
- examples = """
- Examples:
- > SELECT name, age, count(*) FROM VALUES (2, 'Alice'), (5, 'Bob')
people(age, name) GROUP BY _FUNC_(name, age);
- Bob 5 1
- Alice 2 1
- Alice NULL 1
- NULL NULL 2
- Bob NULL 1
- """,
- since = "2.0.0")
-// scalastyle:on line.size.limit line.contains.tab
-case class Rollup(groupByExprs: Seq[Expression]) extends GroupingSet {}
+ /*
+ * GROUP BY a, b, c WITH CUBE
Review comment:
ditto
##########
File path:
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/grouping.scala
##########
@@ -39,45 +41,64 @@ trait GroupingSet extends Expression with CodegenFallback {
override def eval(input: InternalRow): Any = throw new
UnsupportedOperationException
}
-// scalastyle:off line.size.limit line.contains.tab
-@ExpressionDescription(
- usage = """
- _FUNC_([col1[, col2 ..]]) - create a multi-dimensional cube using the
specified columns
- so that we can run aggregation on them.
- """,
- examples = """
- Examples:
- > SELECT name, age, count(*) FROM VALUES (2, 'Alice'), (5, 'Bob')
people(age, name) GROUP BY _FUNC_(name, age);
- Bob 5 1
- Alice 2 1
- Alice NULL 1
- NULL 2 1
- NULL NULL 2
- Bob NULL 1
- NULL 5 1
- """,
- since = "2.0.0")
-// scalastyle:on line.size.limit line.contains.tab
-case class Cube(groupByExprs: Seq[Expression]) extends GroupingSet {}
+object GroupingSet {
+ /*
+ * GROUP BY a, b, c WITH ROLLUP
+ * is equivalent to
+ * GROUP BY a, b, c GROUPING SETS ( (a, b, c), (a, b), (a), ( ) ).
+ * Group Count: N + 1 (N is the number of group expressions)
+ *
+ * We need to get all of its subsets for the rule described above, the
subset is
+ * represented as sequence of expressions.
+ */
+ def rollupExprs(exprs: Seq[Seq[Expression]]): Seq[Seq[Expression]] =
+ exprs.inits.map(_.flatten).toIndexedSeq
-// scalastyle:off line.size.limit line.contains.tab
-@ExpressionDescription(
- usage = """
- _FUNC_([col1[, col2 ..]]) - create a multi-dimensional rollup using the
specified columns
- so that we can run aggregation on them.
- """,
- examples = """
- Examples:
- > SELECT name, age, count(*) FROM VALUES (2, 'Alice'), (5, 'Bob')
people(age, name) GROUP BY _FUNC_(name, age);
- Bob 5 1
- Alice 2 1
- Alice NULL 1
- NULL NULL 2
- Bob NULL 1
- """,
- since = "2.0.0")
-// scalastyle:on line.size.limit line.contains.tab
-case class Rollup(groupByExprs: Seq[Expression]) extends GroupingSet {}
+ /*
+ * GROUP BY a, b, c WITH CUBE
+ * is equivalent to
+ * GROUP BY a, b, c GROUPING SETS ( (a, b, c), (a, b), (b, c), (a, c), (a),
(b), (c), ( ) ).
+ * Group Count: 2 ^ N (N is the number of group expressions)
+ *
+ * We need to get all of its subsets for a given GROUPBY expression, the
subsets are
+ * represented as sequence of expressions.
+ */
+ def cubeExprs(exprs: Seq[Seq[Expression]]): Seq[Seq[Expression]] = {
+ // `cubeExprs0` is recursive and returns a lazy Stream. Here we call
`toIndexedSeq` to
+ // materialize it and avoid serialization problems later on.
+ cubeExprs0(exprs).toIndexedSeq
+ }
+
+ def cubeExprs0(exprs: Seq[Seq[Expression]]): Seq[Seq[Expression]] =
exprs.toList match {
+ case x :: xs =>
+ val initial = cubeExprs0(xs)
+ initial.map(x ++ _) ++ initial
+ case Nil =>
+ Seq(Seq.empty)
+ }
+}
+
+case class Cube(groupingSets: Seq[Seq[Expression]]) extends GroupingSet {
+ import GroupingSet._
+ override def groupByExprs: Seq[Expression] =
groupingSets.flatMap(_.distinct).distinct
+
+ override def selectedGroupByExprs: Seq[Seq[Expression]] =
cubeExprs(groupingSets)
+}
+
+case class Rollup(groupingSets: Seq[Seq[Expression]]) extends GroupingSet {
+ import GroupingSet._
+ override def groupByExprs: Seq[Expression] =
groupingSets.flatMap(_.distinct).distinct
+
+ override def selectedGroupByExprs: Seq[Seq[Expression]] =
rollupExprs(groupingSets)
Review comment:
How about changing `rollupExprs` to `GroupingSet.rollupExprs` and removing
`import GroupingSet._`, since it is used only once?
##########
File path: sql/core/src/test/resources/sql-tests/inputs/group-analytics.sql
##########
@@ -18,12 +18,19 @@ AS courseSales(course, year, earnings);
-- ROLLUP
SELECT course, year, SUM(earnings) FROM courseSales GROUP BY ROLLUP(course,
year) ORDER BY course, year;
+SELECT course, year, SUM(earnings) FROM courseSales GROUP BY ROLLUP (course,
year) ORDER BY course, year;
+SELECT course, year, SUM(earnings) FROM courseSales GROUP BY ROLLUP(course,
year, (course, year)) ORDER BY course, year;
+SELECT course, year, SUM(earnings) FROM courseSales GROUP BY ROLLUP(course,
year, (course, year), ()) ORDER BY course, year;
-- CUBE
SELECT course, year, SUM(earnings) FROM courseSales GROUP BY CUBE(course,
year) ORDER BY course, year;
+SELECT course, year, SUM(earnings) FROM courseSales GROUP BY CUBE (course,
year) ORDER BY course, year;
+SELECT course, year, SUM(earnings) FROM courseSales GROUP BY CUBE(course,
year, (course, year)) ORDER BY course, year;
+SELECT course, year, SUM(earnings) FROM courseSales GROUP BY CUBE(course,
year, (course, year), ()) ORDER BY course, year;
Review comment:
Could you check the behaviour of other systems (other than postgresql)
when having an empty set in cube/rollup?
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]