[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r603801288 ## File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala ## @@ -907,29 +907,74 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg } /** - * Add an [[Aggregate]] or [[GroupingSets]] to a logical plan. + * Add an [[Aggregate]] to a logical plan. */ private def withAggregationClause( ctx: AggregationClauseContext, selectExpressions: Seq[NamedExpression], query: LogicalPlan): LogicalPlan = withOrigin(ctx) { -val groupByExpressions = expressionList(ctx.groupingExpressions) - -if (ctx.GROUPING != null) { - // GROUP BY GROUPING SETS (...) - val selectedGroupByExprs = -ctx.groupingSet.asScala.map(_.expression.asScala.map(e => expression(e)).toSeq) - GroupingSets(selectedGroupByExprs.toSeq, groupByExpressions, query, selectExpressions) -} else { - // GROUP BY (WITH CUBE | WITH ROLLUP)? - val mappedGroupByExpressions = if (ctx.CUBE != null) { -Seq(Cube(groupByExpressions)) - } else if (ctx.ROLLUP != null) { -Seq(Rollup(groupByExpressions)) +if (ctx.groupingExpressionsWithGroupingAnalytics.isEmpty) { + val groupByExpressions = expressionList(ctx.groupingExpressions) + if (ctx.GROUPING != null) { +// GROUP BY GROUPING SETS (...) +// `GROUP BY warehouse, product GROUPING SETS((warehouse, producets), (warehouse))` is +// semantically equivalent to `GROUP BY GROUPING SETS((warehouse, produce), (warehouse))`. +// Under this grammar, the fields appearing in `GROUPING SETS`'s groupingSets must be a +// subset of the columns appearing in group by expression. +val groupingSets = + ctx.groupingSet.asScala.map(_.expression.asScala.map(e => expression(e)).toSeq) +Aggregate(Seq(GroupingSets(groupingSets.toSeq, groupByExpressions)), + selectExpressions, query) } else { -groupByExpressions +// GROUP BY (WITH CUBE | WITH ROLLUP)? 
+val mappedGroupByExpressions = if (ctx.CUBE != null) { + Seq(Cube(groupByExpressions.map(Seq(_ +} else if (ctx.ROLLUP != null) { + Seq(Rollup(groupByExpressions.map(Seq(_ +} else { + groupByExpressions +} +Aggregate(mappedGroupByExpressions, selectExpressions, query) + } +} else { + val groupByExpressions = +ctx.groupingExpressionsWithGroupingAnalytics.asScala + .map(groupByExpr => { +val groupingAnalytics = groupByExpr.groupingAnalytics +if (groupingAnalytics != null) { + val groupingSets = groupingAnalytics.groupingSet.asScala +.map(_.expression.asScala.map(e => expression(e)).toSeq) + if (groupingAnalytics.CUBE != null) { +// CUBE(A, B, (A, B), ()) is not supported. +if (groupingSets.exists(_.isEmpty)) { + throw new ParseException("Empty set in CUBE grouping sets is not supported.", +groupingAnalytics) +} +Cube(groupingSets.toSeq) + } else if (groupingAnalytics.ROLLUP != null) { +// ROLLUP(A, B, (A, B), ()) is not supported. +if (groupingSets.exists(_.isEmpty)) { + throw new ParseException("Empty set in ROLLUP grouping sets is not supported.", +groupingAnalytics) +} +Rollup(groupingSets.toSeq) + } else { +assert(groupingAnalytics.GROUPING != null && groupingAnalytics.SETS != null) +GroupingSets(groupingSets.toSeq, + groupingSets.flatten.distinct.toSeq) + } +} else { + expression(groupByExpr.expression) +} + }) + val (groupingSet, expressions) = groupByExpressions.partition(_.isInstanceOf[GroupingSet]) + if ((expressions.nonEmpty && groupingSet.nonEmpty) || groupingSet.size > 1) { +throw new ParseException("Partial CUBE/ROLLUP/GROUPING SETS like " + Review comment: > could you handle the two error cases separately? Done -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r603747251 ## File path: sql/core/src/test/resources/sql-tests/results/group-analytics.sql.out ## @@ -275,6 +245,24 @@ NULL 201235000 NULL 201378000 +-- !query +SELECT course, year, SUM(earnings) FROM courseSales GROUP BY CUBE(course, year), ROLLUP(course, year) ORDER BY course, year +-- !query schema +struct<> +-- !query output +java.lang.UnsupportedOperationException Review comment: > parser Done. After this is merged, I will restart https://issues.apache.org/jira/browse/SPARK-33229, since Teradata, PostgreSQL, SQL Server, and Oracle all support this syntax. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r603365492 ## File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala ## @@ -653,14 +616,9 @@ class Analyzer(override val catalogManager: CatalogManager) val aggForResolving = h.child match { // For CUBE/ROLLUP expressions, to avoid resolving repeatedly, here we delete them from // groupingExpressions for condition resolving. -case a @ Aggregate(Seq(c @ Cube(groupByExprs)), _, _) => - a.copy(groupingExpressions = groupByExprs) -case a @ Aggregate(Seq(r @ Rollup(groupByExprs)), _, _) => - a.copy(groupingExpressions = groupByExprs) -case g: GroupingSets => - Aggregate( -getFinalGroupByExpressions(g.selectedGroupByExprs, g.groupByExprs), -g.aggregations, g.child) +case a @ Aggregate(Seq(gs: GroupingSet), _, _) => Review comment: > where do we fail if there are more than one `GroupingSet`? do we have test for it? Refer to https://issues.apache.org/jira/browse/SPARK-33229 Add UT in end-to-end test ## File path: sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ResolveGroupingAnalyticsSuite.scala ## @@ -41,61 +41,64 @@ class ResolveGroupingAnalyticsSuite extends AnalysisTest { lazy val r1 = LocalRelation(a, b, c) test("rollupExprs") { -val testRollup = (exprs: Seq[Expression], rollup: Seq[Seq[Expression]]) => { - val result = SimpleAnalyzer.ResolveGroupingAnalytics.rollupExprs(exprs) +val testRollup = (exprs: Seq[Seq[Expression]], rollup: Seq[Seq[Expression]]) => { + val result = GroupingSet.rollupExprs(exprs) Review comment: > the input can still be `Seq[Expression]`, and here we do `GroupingSet.rollupExprs(exprs.map(Seq(_)))` Done ## File path: sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ResolveGroupingAnalyticsSuite.scala ## @@ -41,61 +41,64 @@ class ResolveGroupingAnalyticsSuite extends AnalysisTest { lazy val r1 = LocalRelation(a, b, c) test("rollupExprs") { -val testRollup = 
(exprs: Seq[Expression], rollup: Seq[Seq[Expression]]) => { - val result = SimpleAnalyzer.ResolveGroupingAnalytics.rollupExprs(exprs) +val testRollup = (exprs: Seq[Seq[Expression]], rollup: Seq[Seq[Expression]]) => { + val result = GroupingSet.rollupExprs(exprs) assert(result.sortBy(_.hashCode) == rollup.sortBy(_.hashCode)) } -testRollup(Seq(a, b, c), Seq(Seq(), Seq(a), Seq(a, b), Seq(a, b, c))) -testRollup(Seq(c, b, a), Seq(Seq(), Seq(c), Seq(c, b), Seq(c, b, a))) -testRollup(Seq(a), Seq(Seq(), Seq(a))) +testRollup(Seq(a, b, c).map(Seq(_)), Seq(Seq(), Seq(a), Seq(a, b), Seq(a, b, c))) +testRollup(Seq(c, b, a).map(Seq(_)), Seq(Seq(), Seq(c), Seq(c, b), Seq(c, b, a))) +testRollup(Seq(a).map(Seq(_)), Seq(Seq(), Seq(a))) testRollup(Seq(), Seq(Seq())) } test("cubeExprs") { -val testCube = (exprs: Seq[Expression], cube: Seq[Seq[Expression]]) => { - val result = SimpleAnalyzer.ResolveGroupingAnalytics.cubeExprs(exprs) +val testCube = (exprs: Seq[Seq[Expression]], cube: Seq[Seq[Expression]]) => { + val result = GroupingSet.cubeExprs(exprs) Review comment: > ditto Done ## File path: sql/core/src/test/resources/sql-tests/inputs/group-analytics.sql ## @@ -18,12 +18,19 @@ AS courseSales(course, year, earnings); -- ROLLUP SELECT course, year, SUM(earnings) FROM courseSales GROUP BY ROLLUP(course, year) ORDER BY course, year; +SELECT course, year, SUM(earnings) FROM courseSales GROUP BY ROLLUP (course, year) ORDER BY course, year; +SELECT course, year, SUM(earnings) FROM courseSales GROUP BY ROLLUP(course, year, (course, year)) ORDER BY course, year; +SELECT course, year, SUM(earnings) FROM courseSales GROUP BY ROLLUP(course, year, (course, year), ()) ORDER BY course, year; -- CUBE SELECT course, year, SUM(earnings) FROM courseSales GROUP BY CUBE(course, year) ORDER BY course, year; +SELECT course, year, SUM(earnings) FROM courseSales GROUP BY CUBE (course, year) ORDER BY course, year; Review comment: > ditto Done -- This is an automated message from the Apache Git Service. 
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r603366298 ## File path: sql/core/src/test/resources/sql-tests/inputs/group-analytics.sql ## @@ -18,9 +18,11 @@ AS courseSales(course, year, earnings); -- ROLLUP SELECT course, year, SUM(earnings) FROM courseSales GROUP BY ROLLUP(course, year) ORDER BY course, year; +SELECT course, year, SUM(earnings) FROM courseSales GROUP BY ROLLUP (course, year) ORDER BY course, year; Review comment: > +1, I don't think we should test extra space in the end-to-end test. Done -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r603313867 ## File path: docs/sql-ref-syntax-qry-select-groupby.md ## @@ -42,40 +42,53 @@ aggregate_name ( [ DISTINCT ] expression [ , ... ] ) [ FILTER ( WHERE boolean_ex ### Parameters -* **GROUPING SETS** +* **grouping_expression** -Groups the rows for each subset of the expressions specified in the grouping sets. For example, -`GROUP BY GROUPING SETS (warehouse, product)` is semantically equivalent -to union of results of `GROUP BY warehouse` and `GROUP BY product`. This clause -is a shorthand for a `UNION ALL` where each leg of the `UNION ALL` -operator performs aggregation of subset of the columns specified in the `GROUPING SETS` clause. +Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on +result values of the grouping expressions. A grouping expression may be a column name like `GROUP BY a`, a column position like +`GROUP BY 0`, or an expression like `GROUP BY a + b`. * **grouping_set** -A grouping set is specified by zero or more comma-separated expressions in parentheses. +A grouping set is specified by zero or more comma-separated expressions. Review comment: > ``` > A grouping set is specified by zero or more comma-separated expressions in parentheses. When the > grouping set has only one element, parentheses can be omitted. For example, `GROUPING SETS ((a), (b))` > is the same as `GROUPING SETS (a, b)`. > ``` Done -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r603311932 ## File path: docs/sql-ref-syntax-qry-select-groupby.md ## @@ -42,40 +42,53 @@ aggregate_name ( [ DISTINCT ] expression [ , ... ] ) [ FILTER ( WHERE boolean_ex ### Parameters -* **GROUPING SETS** +* **grouping_expression** -Groups the rows for each subset of the expressions specified in the grouping sets. For example, -`GROUP BY GROUPING SETS (warehouse, product)` is semantically equivalent -to union of results of `GROUP BY warehouse` and `GROUP BY product`. This clause -is a shorthand for a `UNION ALL` where each leg of the `UNION ALL` -operator performs aggregation of subset of the columns specified in the `GROUPING SETS` clause. +Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on +result values of the grouping expressions. A grouping expression may be a column name like `GROUP BY a`, a column position like +`GROUP BY 0`, or an expression like `GROUP BY a + b`. * **grouping_set** -A grouping set is specified by zero or more comma-separated expressions in parentheses. +A grouping set is specified by zero or more comma-separated expressions. -**Syntax:** `( [ expression [ , ... ] ] )` +**Syntax:** `{ ( [ expression [ , ... ] ] ) | expression }` -* **grouping_expression** +* **GROUPING SETS** -Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on -result values of the grouping expressions. A grouping expression may be a column alias, a column position -or an expression. +Groups the rows for each grouping set specified after GROUPING SETS. For example, +`GROUP BY GROUPING SETS ((warehouse), (product))` is semantically equivalent +to union of results of `GROUP BY warehouse` and `GROUP BY product`. 
This clause +is a shorthand for a `UNION ALL` where each leg of the `UNION ALL` +operator performs aggregation of each grouping set specified in the `GROUPING SETS` clause. +Similarly, `GROUP BY GROUPING SETS ((warehouse, product), (product), ())` is semantically +equivalent to the union of results of `GROUP BY warehouse, product`, `GROUP BY product` +and global aggregate. + +`GROUP BY warehouse, product GROUPING SETS((warehouse, producets), (warehouse))` is semantically equivalent to Review comment: > Are there any other databases supporting this syntax? It seems useless grammar to me. > > If we only keep it for some historical reasons, let's not document it and only explain it in the internal code comments. Yeah, it seems only Spark supports it. I am confused about why we support such a grammar. Removed it and moved this explanation to AstBuilder. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r603182762 ## File path: docs/sql-ref-syntax-qry-select-groupby.md ## @@ -42,42 +42,60 @@ aggregate_name ( [ DISTINCT ] expression [ , ... ] ) [ FILTER ( WHERE boolean_ex ### Parameters +* **grouping_expression** + +Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on +result values of the grouping expressions. A grouping expression may be a column name like `GROUP BY a`, a column position like +`GROUP BY 0`, or an expression like `GROUP BY a + b`. + * **GROUPING SETS** -Groups the rows for each subset of the expressions specified in the grouping sets. For example, -`GROUP BY GROUPING SETS (warehouse, product)` is semantically equivalent +Groups the rows for each grouping set specified after GROUPING SETS. For example, +`GROUP BY GROUPING SETS ((warehouse), (product))` is semantically equivalent to union of results of `GROUP BY warehouse` and `GROUP BY product`. This clause is a shorthand for a `UNION ALL` where each leg of the `UNION ALL` -operator performs aggregation of subset of the columns specified in the `GROUPING SETS` clause. - -* **grouping_set** - -A grouping set is specified by zero or more comma-separated expressions in parentheses. - -**Syntax:** `( [ expression [ , ... ] ] )` - -* **grouping_expression** - -Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on -result values of the grouping expressions. A grouping expression may be a column alias, a column position -or an expression. +operator performs aggregation of each grouping set specified in the `GROUPING SETS` clause. +Similarly, `GROUP BY GROUPING SETS ((warehouse, product), (product), ())` is semantically +equivalent to the union of results of `GROUP BY warehouse, product`, `GROUP BY product` +and global aggregate. 
+ +GROUPING SETS can be followed with normal expressions as well, such as +`GROUP BY GROUPING SETS (warehouse, product)`. It generates one grouping set +per non-empty subset of the given expressions, so `GROUP BY GROUPING SETS (warehouse, product)` Review comment: > I think we don't need this paragraph anymore. We should put `grouping_set` before this one so that people know `a` is equal to `(a)`. Done -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r603182409 ## File path: docs/sql-ref-syntax-qry-select-groupby.md ## @@ -42,42 +42,60 @@ aggregate_name ( [ DISTINCT ] expression [ , ... ] ) [ FILTER ( WHERE boolean_ex ### Parameters +* **grouping_expression** + +Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on +result values of the grouping expressions. A grouping expression may be a column name like `GROUP BY a`, a column position like +`GROUP BY 0`, or an expression like `GROUP BY a + b`. + * **GROUPING SETS** -Groups the rows for each subset of the expressions specified in the grouping sets. For example, -`GROUP BY GROUPING SETS (warehouse, product)` is semantically equivalent +Groups the rows for each grouping set specified after GROUPING SETS. For example, +`GROUP BY GROUPING SETS ((warehouse), (product))` is semantically equivalent to union of results of `GROUP BY warehouse` and `GROUP BY product`. This clause is a shorthand for a `UNION ALL` where each leg of the `UNION ALL` -operator performs aggregation of subset of the columns specified in the `GROUPING SETS` clause. - -* **grouping_set** - -A grouping set is specified by zero or more comma-separated expressions in parentheses. - -**Syntax:** `( [ expression [ , ... ] ] )` - -* **grouping_expression** - -Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on -result values of the grouping expressions. A grouping expression may be a column alias, a column position -or an expression. +operator performs aggregation of each grouping set specified in the `GROUPING SETS` clause. +Similarly, `GROUP BY GROUPING SETS ((warehouse, product), (product), ())` is semantically +equivalent to the union of results of `GROUP BY warehouse, product`, `GROUP BY product` +and global aggregate. 
+ +GROUPING SETS can be followed with normal expressions as well, such as +`GROUP BY GROUPING SETS (warehouse, product)`. It generates one grouping set +per non-empty subset of the given expressions, so `GROUP BY GROUPING SETS (warehouse, product)` +is equivalent to `GROUP BY GROUPING SETS ((warehouse), (product))`. + +`GROUP BY warehouse, product GROUPING SETS((warehouse, producets), (warehouse))` is semantically equivalent to +`GROUP BY GROUPING SETS((warehouse, produce), (warehouse))`. Under this grammar, +the fields appearing in `GROUPING SETS`'s grouping_set must be a subset of the columns +appearing in group by expression. * **ROLLUP** Specifies multiple levels of aggregations in a single statement. This clause is used to compute aggregations based on multiple grouping sets. `ROLLUP` is a shorthand for `GROUPING SETS`. For example, -`GROUP BY warehouse, product WITH ROLLUP` is equivalent to `GROUP BY GROUPING SETS -((warehouse, product), (warehouse), ())`. +`GROUP BY warehouse, product WITH ROLLUP` or `GROUP BY ROLLUP(warehouse, product)` is equivalent to +`GROUP BY GROUPING SETS((warehouse, product), (warehouse), ())`. +`GROUP BY ROLLUP(warehouse, product, (warehouse, location))` is equivalent to +`GROUP BY GROUPING SETS((warehouse, product, location), (warehouse, product), (warehouse), ())`. The N elements of a `ROLLUP` specification results in N+1 `GROUPING SETS`. * **CUBE** `CUBE` clause is used to perform aggregations based on combination of grouping columns specified in the `GROUP BY` clause. `CUBE` is a shorthand for `GROUPING SETS`. For example, -`GROUP BY warehouse, product WITH CUBE` is equivalent to `GROUP BY GROUPING SETS -((warehouse, product), (warehouse), (product), ())`. +`GROUP BY warehouse, product WITH CUBE` or `GROUP BY CUBE(warehouse, product)` is equivalent to +`GROUP BY GROUPING SETS((warehouse, product), (warehouse), (product), ())`. 
+`GROUP BY CUBE(warehouse, product, (warehouse, location))` is equivalent to +`GROUP BY GROUPING SETS((warehouse, product, location), (warehouse, product), (warehouse, location), + (product, warehouse, location), (warehouse), (product), (warehouse, product), ())`. The N elements of a `CUBE` specification results in 2^N `GROUPING SETS`. +* **grouping_set** + +A grouping set is specified by zero or more comma-separated expressions in parentheses. Review comment: > let's update this. `parentheses` is not a must-have. `a` is equal to `(a)`. Done -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r602159045 ## File path: docs/sql-ref-syntax-qry-select-groupby.md ## @@ -42,42 +42,55 @@ aggregate_name ( [ DISTINCT ] expression [ , ... ] ) [ FILTER ( WHERE boolean_ex ### Parameters +* **grouping_expression** + +Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on +result values of the grouping expressions. A grouping expression may be a column name like `GROUP BY a`, a column position like +`GROUP BY 0`, or an expression like `GROUP BY a + b`. + * **GROUPING SETS** -Groups the rows for each subset of the expressions specified in the grouping sets. For example, -`GROUP BY GROUPING SETS (warehouse, product)` is semantically equivalent +Groups the rows for each grouping set specified after GROUPING SETS. For example, +`GROUP BY GROUPING SETS ((warehouse), (product))` is semantically equivalent to union of results of `GROUP BY warehouse` and `GROUP BY product`. This clause is a shorthand for a `UNION ALL` where each leg of the `UNION ALL` operator performs aggregation of subset of the columns specified in the `GROUPING SETS` clause. - -* **grouping_set** - -A grouping set is specified by zero or more comma-separated expressions in parentheses. - -**Syntax:** `( [ expression [ , ... ] ] )` - -* **grouping_expression** - -Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on -result values of the grouping expressions. A grouping expression may be a column alias, a column position -or an expression. +Similarly, `GROUP BY GROUPING SETS ((warehouse, product), (product), ())` is semantically +equivalent to the union of results of `GROUP BY warehouse, product`, `GROUP BY warehouse` +and global aggregate. + +GROUPING SETS can be followed with normal expressions as well, such as +`GROUP BY GROUPING SETS (warehouse, product)`. 
It generates one grouping set +per non-empty subset of the given expressions, so `GROUP BY GROUPING SETS (warehouse, product)` +is equivalent to `GROUP BY GROUPING SETS ((warehouse), (product))`. * **ROLLUP** Specifies multiple levels of aggregations in a single statement. This clause is used to compute aggregations based on multiple grouping sets. `ROLLUP` is a shorthand for `GROUPING SETS`. For example, -`GROUP BY warehouse, product WITH ROLLUP` is equivalent to `GROUP BY GROUPING SETS -((warehouse, product), (warehouse), ())`. +`GROUP BY warehouse, product WITH ROLLUP` or `GROUP BY ROLLUP(warehouse, product)` is equivalent to +`GROUP BY GROUPING SETS((warehouse, product), (warehouse), ())`. +`GROUP BY ROLLUP(warehouse, product, (warehouse, location))` is equivalent to +`GROUP BY GROUPING SETS((warehouse, product, location), (warehouse, product), (warehouse), ())`. The N elements of a `ROLLUP` specification results in N+1 `GROUPING SETS`. * **CUBE** `CUBE` clause is used to perform aggregations based on combination of grouping columns specified in the `GROUP BY` clause. `CUBE` is a shorthand for `GROUPING SETS`. For example, -`GROUP BY warehouse, product WITH CUBE` is equivalent to `GROUP BY GROUPING SETS -((warehouse, product), (warehouse), (product), ())`. +`GROUP BY warehouse, product WITH CUBE` or `GROUP BY CUBE(warehouse, product)` is equivalent to +`GROUP BY GROUPING SETS((warehouse, product), (warehouse), (product), ())`. +`GROUP BY CUBE(warehouse, product, (warehouse, location))` is equivalent to +`GROUP BY GROUPING SETS((warehouse, product, location), (warehouse, product), (warehouse, location), + (product, warehouse, location), (warehouse), (product), (warehouse, product), ())`. Review comment: > `(warehouse, product)` is duplicated, is it OK? 
Each grouping_set will be a subset of the result, and PostgreSQL produces the same result ![image](https://user-images.githubusercontent.com/46485123/112616090-4758bd00-8e5e-11eb-933f-cab866d7bdcc.png) -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r602159045 ## File path: docs/sql-ref-syntax-qry-select-groupby.md ## @@ -42,42 +42,55 @@ aggregate_name ( [ DISTINCT ] expression [ , ... ] ) [ FILTER ( WHERE boolean_ex ### Parameters +* **grouping_expression** + +Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on +result values of the grouping expressions. A grouping expression may be a column name like `GROUP BY a`, a column position like +`GROUP BY 0`, or an expression like `GROUP BY a + b`. + * **GROUPING SETS** -Groups the rows for each subset of the expressions specified in the grouping sets. For example, -`GROUP BY GROUPING SETS (warehouse, product)` is semantically equivalent +Groups the rows for each grouping set specified after GROUPING SETS. For example, +`GROUP BY GROUPING SETS ((warehouse), (product))` is semantically equivalent to union of results of `GROUP BY warehouse` and `GROUP BY product`. This clause is a shorthand for a `UNION ALL` where each leg of the `UNION ALL` operator performs aggregation of subset of the columns specified in the `GROUPING SETS` clause. - -* **grouping_set** - -A grouping set is specified by zero or more comma-separated expressions in parentheses. - -**Syntax:** `( [ expression [ , ... ] ] )` - -* **grouping_expression** - -Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on -result values of the grouping expressions. A grouping expression may be a column alias, a column position -or an expression. +Similarly, `GROUP BY GROUPING SETS ((warehouse, product), (product), ())` is semantically +equivalent to the union of results of `GROUP BY warehouse, product`, `GROUP BY warehouse` +and global aggregate. + +GROUPING SETS can be followed with normal expressions as well, such as +`GROUP BY GROUPING SETS (warehouse, product)`. 
It generates one grouping set +per non-empty subset of the given expressions, so `GROUP BY GROUPING SETS (warehouse, product)` +is equivalent to `GROUP BY GROUPING SETS ((warehouse), (product))`. * **ROLLUP** Specifies multiple levels of aggregations in a single statement. This clause is used to compute aggregations based on multiple grouping sets. `ROLLUP` is a shorthand for `GROUPING SETS`. For example, -`GROUP BY warehouse, product WITH ROLLUP` is equivalent to `GROUP BY GROUPING SETS -((warehouse, product), (warehouse), ())`. +`GROUP BY warehouse, product WITH ROLLUP` or `GROUP BY ROLLUP(warehouse, product)` is equivalent to +`GROUP BY GROUPING SETS((warehouse, product), (warehouse), ())`. +`GROUP BY ROLLUP(warehouse, product, (warehouse, location))` is equivalent to +`GROUP BY GROUPING SETS((warehouse, product, location), (warehouse, product), (warehouse), ())`. The N elements of a `ROLLUP` specification results in N+1 `GROUPING SETS`. * **CUBE** `CUBE` clause is used to perform aggregations based on combination of grouping columns specified in the `GROUP BY` clause. `CUBE` is a shorthand for `GROUPING SETS`. For example, -`GROUP BY warehouse, product WITH CUBE` is equivalent to `GROUP BY GROUPING SETS -((warehouse, product), (warehouse), (product), ())`. +`GROUP BY warehouse, product WITH CUBE` or `GROUP BY CUBE(warehouse, product)` is equivalent to +`GROUP BY GROUPING SETS((warehouse, product), (warehouse), (product), ())`. +`GROUP BY CUBE(warehouse, product, (warehouse, location))` is equivalent to +`GROUP BY GROUPING SETS((warehouse, product, location), (warehouse, product), (warehouse, location), + (product, warehouse, location), (warehouse), (product), (warehouse, product), ())`. Review comment: > `(warehouse, product)` is duplicated, is it OK? 
Each grouping_set will be a subset of the result ![image](https://user-images.githubusercontent.com/46485123/112616090-4758bd00-8e5e-11eb-933f-cab866d7bdcc.png) -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r602155530 ## File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala ## @@ -907,29 +907,64 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg } /** - * Add an [[Aggregate]] or [[GroupingSets]] to a logical plan. + * Add an [[Aggregate]] to a logical plan. */ private def withAggregationClause( ctx: AggregationClauseContext, selectExpressions: Seq[NamedExpression], query: LogicalPlan): LogicalPlan = withOrigin(ctx) { -val groupByExpressions = expressionList(ctx.groupingExpressions) - -if (ctx.GROUPING != null) { - // GROUP BY GROUPING SETS (...) - val selectedGroupByExprs = -ctx.groupingSet.asScala.map(_.expression.asScala.map(e => expression(e)).toSeq) - GroupingSets(selectedGroupByExprs.toSeq, groupByExpressions, query, selectExpressions) -} else { - // GROUP BY (WITH CUBE | WITH ROLLUP)? - val mappedGroupByExpressions = if (ctx.CUBE != null) { -Seq(Cube(groupByExpressions)) - } else if (ctx.ROLLUP != null) { -Seq(Rollup(groupByExpressions)) +if (ctx.groupingExpressionsWithGroupingAnalytics.isEmpty) { + val groupByExpressions = expressionList(ctx.groupingExpressions) + if (ctx.GROUPING != null) { +// GROUP BY GROUPING SETS (...) +val selectedGroupByExprs = + ctx.groupingSet.asScala.map(_.expression.asScala.map(e => expression(e)).toSeq) +Aggregate(Seq(GroupingSets(selectedGroupByExprs.toSeq, groupByExpressions)), + selectExpressions, query) } else { -groupByExpressions +// GROUP BY (WITH CUBE | WITH ROLLUP)? 
+val mappedGroupByExpressions = if (ctx.CUBE != null) { + Seq(Cube(groupByExpressions.map(Seq(_)))) +} else if (ctx.ROLLUP != null) { + Seq(Rollup(groupByExpressions.map(Seq(_)))) +} else { + groupByExpressions +} +Aggregate(mappedGroupByExpressions, selectExpressions, query) } - Aggregate(mappedGroupByExpressions, selectExpressions, query) +} else { + val groupByExpressions = +ctx.groupingExpressionsWithGroupingAnalytics.asScala + .map(groupByExpr => { +val groupingAnalytics = groupByExpr.groupingAnalytics +if (groupingAnalytics != null) { + val selectedGroupByExprs = groupingAnalytics.groupingSet.asScala Review comment: > ditto, `groupingSets` is a better name. Done -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r602155694 ## File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/grouping.scala ## @@ -39,47 +40,98 @@ trait GroupingSet extends Expression with CodegenFallback { override def eval(input: InternalRow): Any = throw new UnsupportedOperationException } -// scalastyle:off line.size.limit line.contains.tab -@ExpressionDescription( - usage = """ -_FUNC_([col1[, col2 ..]]) - create a multi-dimensional cube using the specified columns - so that we can run aggregation on them. - """, - examples = """ -Examples: - > SELECT name, age, count(*) FROM VALUES (2, 'Alice'), (5, 'Bob') people(age, name) GROUP BY _FUNC_(name, age); -Bob5 1 -Alice 2 1 -Alice NULL1 -NULL 2 1 -NULL NULL2 -BobNULL1 -NULL 5 1 - """, - since = "2.0.0", - group = "agg_funcs") -// scalastyle:on line.size.limit line.contains.tab -case class Cube(groupByExprs: Seq[Expression]) extends GroupingSet {} +object GroupingSet { + /** + * GROUP BY a, b, c WITH ROLLUP + * is equivalent to + * GROUP BY a, b, c GROUPING SETS ( (a, b, c), (a, b), (a), ( ) ). Review comment: > This is not documented. What does `GROUP BY a, b, c` mean if there is a `GROUPING SETS` after it? Updated, also with cube's. How about current? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r602155326 ## File path: docs/sql-ref-syntax-qry-select-groupby.md ## @@ -42,42 +42,55 @@ aggregate_name ( [ DISTINCT ] expression [ , ... ] ) [ FILTER ( WHERE boolean_ex ### Parameters +* **grouping_expression** + +Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on +result values of the grouping expressions. A grouping expression may be a column name like `GROUP BY a`, a column position like +`GROUP BY 0`, or an expression like `GROUP BY a + b`. + * **GROUPING SETS** -Groups the rows for each subset of the expressions specified in the grouping sets. For example, -`GROUP BY GROUPING SETS (warehouse, product)` is semantically equivalent +Groups the rows for each grouping set specified after GROUPING SETS. For example, +`GROUP BY GROUPING SETS ((warehouse), (product))` is semantically equivalent to union of results of `GROUP BY warehouse` and `GROUP BY product`. This clause is a shorthand for a `UNION ALL` where each leg of the `UNION ALL` operator performs aggregation of subset of the columns specified in the `GROUPING SETS` clause. Review comment: > ``` > performs aggregation of each grouping set specified in the `GROUPING SETS` clause > ``` Done ## File path: docs/sql-ref-syntax-qry-select-groupby.md ## @@ -42,42 +42,55 @@ aggregate_name ( [ DISTINCT ] expression [ , ... ] ) [ FILTER ( WHERE boolean_ex ### Parameters +* **grouping_expression** + +Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on +result values of the grouping expressions. A grouping expression may be a column name like `GROUP BY a`, a column position like +`GROUP BY 0`, or an expression like `GROUP BY a + b`. + * **GROUPING SETS** -Groups the rows for each subset of the expressions specified in the grouping sets. 
For example, -`GROUP BY GROUPING SETS (warehouse, product)` is semantically equivalent +Groups the rows for each grouping set specified after GROUPING SETS. For example, +`GROUP BY GROUPING SETS ((warehouse), (product))` is semantically equivalent to union of results of `GROUP BY warehouse` and `GROUP BY product`. This clause is a shorthand for a `UNION ALL` where each leg of the `UNION ALL` operator performs aggregation of subset of the columns specified in the `GROUPING SETS` clause. - -* **grouping_set** - -A grouping set is specified by zero or more comma-separated expressions in parentheses. - -**Syntax:** `( [ expression [ , ... ] ] )` - -* **grouping_expression** - -Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on -result values of the grouping expressions. A grouping expression may be a column alias, a column position -or an expression. +Similarly, `GROUP BY GROUPING SETS ((warehouse, product), (product), ())` is semantically +equivalent to the union of results of `GROUP BY warehouse, product`, `GROUP BY warehouse` Review comment: > `GROUP BY warehouse` -> `GROUP BY product` Done ## File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala ## @@ -907,29 +907,64 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg } /** - * Add an [[Aggregate]] or [[GroupingSets]] to a logical plan. + * Add an [[Aggregate]] to a logical plan. */ private def withAggregationClause( ctx: AggregationClauseContext, selectExpressions: Seq[NamedExpression], query: LogicalPlan): LogicalPlan = withOrigin(ctx) { -val groupByExpressions = expressionList(ctx.groupingExpressions) - -if (ctx.GROUPING != null) { - // GROUP BY GROUPING SETS (...) 
- val selectedGroupByExprs = -ctx.groupingSet.asScala.map(_.expression.asScala.map(e => expression(e)).toSeq) - GroupingSets(selectedGroupByExprs.toSeq, groupByExpressions, query, selectExpressions) -} else { - // GROUP BY (WITH CUBE | WITH ROLLUP)? - val mappedGroupByExpressions = if (ctx.CUBE != null) { -Seq(Cube(groupByExpressions)) - } else if (ctx.ROLLUP != null) { -Seq(Rollup(groupByExpressions)) +if (ctx.groupingExpressionsWithGroupingAnalytics.isEmpty) { + val groupByExpressions = expressionList(ctx.groupingExpressions) + if (ctx.GROUPING != null) { +// GROUP BY GROUPING SETS (...) +val selectedGroupByExprs = Review comment: > The name `selectedGroupByExprs` is a bit weird, should it be `groupingSets`? Done -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r602154380 ## File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala ## @@ -907,29 +907,64 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg } /** - * Add an [[Aggregate]] or [[GroupingSets]] to a logical plan. + * Add an [[Aggregate]] to a logical plan. */ private def withAggregationClause( ctx: AggregationClauseContext, selectExpressions: Seq[NamedExpression], query: LogicalPlan): LogicalPlan = withOrigin(ctx) { -val groupByExpressions = expressionList(ctx.groupingExpressions) - -if (ctx.GROUPING != null) { - // GROUP BY GROUPING SETS (...) - val selectedGroupByExprs = -ctx.groupingSet.asScala.map(_.expression.asScala.map(e => expression(e)).toSeq) - GroupingSets(selectedGroupByExprs.toSeq, groupByExpressions, query, selectExpressions) -} else { - // GROUP BY (WITH CUBE | WITH ROLLUP)? - val mappedGroupByExpressions = if (ctx.CUBE != null) { -Seq(Cube(groupByExpressions)) - } else if (ctx.ROLLUP != null) { -Seq(Rollup(groupByExpressions)) +if (ctx.groupingExpressionsWithGroupingAnalytics.isEmpty) { + val groupByExpressions = expressionList(ctx.groupingExpressions) + if (ctx.GROUPING != null) { +// GROUP BY GROUPING SETS (...) +val selectedGroupByExprs = + ctx.groupingSet.asScala.map(_.expression.asScala.map(e => expression(e)).toSeq) Review comment: For SQL ``` select id, city, car_model, sum(quantity) from dealer group by grouping sets((id), (city), (car_model)) UNION all select id, city, car_model, sum(quantity) from dealer group by grouping sets(id, city, car_model) order by id, city, car_model ``` Result in PGSQL ![image](https://user-images.githubusercontent.com/46485123/112615257-3b203000-8e5d-11eb-958e-fb5196c9592c.png) -- This is an automated message from the Apache Git Service. 
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r602151807 ## File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala ## @@ -907,29 +907,64 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg } /** - * Add an [[Aggregate]] or [[GroupingSets]] to a logical plan. + * Add an [[Aggregate]] to a logical plan. */ private def withAggregationClause( ctx: AggregationClauseContext, selectExpressions: Seq[NamedExpression], query: LogicalPlan): LogicalPlan = withOrigin(ctx) { -val groupByExpressions = expressionList(ctx.groupingExpressions) - -if (ctx.GROUPING != null) { - // GROUP BY GROUPING SETS (...) - val selectedGroupByExprs = -ctx.groupingSet.asScala.map(_.expression.asScala.map(e => expression(e)).toSeq) - GroupingSets(selectedGroupByExprs.toSeq, groupByExpressions, query, selectExpressions) -} else { - // GROUP BY (WITH CUBE | WITH ROLLUP)? - val mappedGroupByExpressions = if (ctx.CUBE != null) { -Seq(Cube(groupByExpressions)) - } else if (ctx.ROLLUP != null) { -Seq(Rollup(groupByExpressions)) +if (ctx.groupingExpressionsWithGroupingAnalytics.isEmpty) { + val groupByExpressions = expressionList(ctx.groupingExpressions) + if (ctx.GROUPING != null) { +// GROUP BY GROUPING SETS (...) +val selectedGroupByExprs = + ctx.groupingSet.asScala.map(_.expression.asScala.map(e => expression(e)).toSeq) Review comment: here the grouping set depends on the parser logic, right? https://github.com/apache/spark/blob/8945e7c52f8d09282f6847c19bd926ac4c20e11a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4#L604-L611 According to the definition, `a` can be a grouping_set, `(a)` can be a grouping_set, `(a, b)` can be a grouping_set. You may be mistaken about how `cube/rollup` is expanded compared with the original `grouping set` -- This is an automated message from the Apache Git Service. 
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r602151807 ## File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala ## @@ -907,29 +907,64 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg } /** - * Add an [[Aggregate]] or [[GroupingSets]] to a logical plan. + * Add an [[Aggregate]] to a logical plan. */ private def withAggregationClause( ctx: AggregationClauseContext, selectExpressions: Seq[NamedExpression], query: LogicalPlan): LogicalPlan = withOrigin(ctx) { -val groupByExpressions = expressionList(ctx.groupingExpressions) - -if (ctx.GROUPING != null) { - // GROUP BY GROUPING SETS (...) - val selectedGroupByExprs = -ctx.groupingSet.asScala.map(_.expression.asScala.map(e => expression(e)).toSeq) - GroupingSets(selectedGroupByExprs.toSeq, groupByExpressions, query, selectExpressions) -} else { - // GROUP BY (WITH CUBE | WITH ROLLUP)? - val mappedGroupByExpressions = if (ctx.CUBE != null) { -Seq(Cube(groupByExpressions)) - } else if (ctx.ROLLUP != null) { -Seq(Rollup(groupByExpressions)) +if (ctx.groupingExpressionsWithGroupingAnalytics.isEmpty) { + val groupByExpressions = expressionList(ctx.groupingExpressions) + if (ctx.GROUPING != null) { +// GROUP BY GROUPING SETS (...) +val selectedGroupByExprs = + ctx.groupingSet.asScala.map(_.expression.asScala.map(e => expression(e)).toSeq) Review comment: here the group set depending on the parser logic right? https://github.com/apache/spark/blob/8945e7c52f8d09282f6847c19bd926ac4c20e11a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4#L604-L611 According to the definition, `a` can be a grouping_set, `(a)` can be a grouping_set, `(a, b)` can be a grouping_set. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r602141666 ## File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala ## @@ -907,29 +907,64 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg } /** - * Add an [[Aggregate]] or [[GroupingSets]] to a logical plan. + * Add an [[Aggregate]] to a logical plan. */ private def withAggregationClause( ctx: AggregationClauseContext, selectExpressions: Seq[NamedExpression], query: LogicalPlan): LogicalPlan = withOrigin(ctx) { -val groupByExpressions = expressionList(ctx.groupingExpressions) - -if (ctx.GROUPING != null) { - // GROUP BY GROUPING SETS (...) - val selectedGroupByExprs = -ctx.groupingSet.asScala.map(_.expression.asScala.map(e => expression(e)).toSeq) - GroupingSets(selectedGroupByExprs.toSeq, groupByExpressions, query, selectExpressions) -} else { - // GROUP BY (WITH CUBE | WITH ROLLUP)? - val mappedGroupByExpressions = if (ctx.CUBE != null) { -Seq(Cube(groupByExpressions)) - } else if (ctx.ROLLUP != null) { -Seq(Rollup(groupByExpressions)) +if (ctx.groupingExpressionsWithGroupingAnalytics.isEmpty) { + val groupByExpressions = expressionList(ctx.groupingExpressions) + if (ctx.GROUPING != null) { +// GROUP BY GROUPING SETS (...) +val selectedGroupByExprs = + ctx.groupingSet.asScala.map(_.expression.asScala.map(e => expression(e)).toSeq) Review comment: > Seems we don't distinguish `(a)` and `a`, then how do we handle `GROUPING SETS(a, b, c)` and `GROUPING SETS((a), (b), (c))`? > > According to the document, `GROUPING SETS(a, b, c)` is `GROUPING SETS((a, b), (a, c), (b, c), (a), (b), (c))`. `GROUPING SETS(a, b, c)` is same as `GROUPING SETS((a), (b), (c))` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r602141666 ## File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala ## @@ -907,29 +907,64 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg } /** - * Add an [[Aggregate]] or [[GroupingSets]] to a logical plan. + * Add an [[Aggregate]] to a logical plan. */ private def withAggregationClause( ctx: AggregationClauseContext, selectExpressions: Seq[NamedExpression], query: LogicalPlan): LogicalPlan = withOrigin(ctx) { -val groupByExpressions = expressionList(ctx.groupingExpressions) - -if (ctx.GROUPING != null) { - // GROUP BY GROUPING SETS (...) - val selectedGroupByExprs = -ctx.groupingSet.asScala.map(_.expression.asScala.map(e => expression(e)).toSeq) - GroupingSets(selectedGroupByExprs.toSeq, groupByExpressions, query, selectExpressions) -} else { - // GROUP BY (WITH CUBE | WITH ROLLUP)? - val mappedGroupByExpressions = if (ctx.CUBE != null) { -Seq(Cube(groupByExpressions)) - } else if (ctx.ROLLUP != null) { -Seq(Rollup(groupByExpressions)) +if (ctx.groupingExpressionsWithGroupingAnalytics.isEmpty) { + val groupByExpressions = expressionList(ctx.groupingExpressions) + if (ctx.GROUPING != null) { +// GROUP BY GROUPING SETS (...) +val selectedGroupByExprs = + ctx.groupingSet.asScala.map(_.expression.asScala.map(e => expression(e)).toSeq) Review comment: > Seems we don't distinguish `(a)` and `a`, then how do we handle `GROUPING SETS(a, b, c)` and `GROUPING SETS((a), (b), (c))`? > > According to the document, `GROUPING SETS(a, b, c)` is `GROUPING SETS((a, b), (a, c), (b, c), (a), (b), (c))`. `GROUPING SETS(a, b, c)` is same as `GROUPING SETS((a), (b), (c))`? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r602139085 ## File path: docs/sql-ref-syntax-qry-select-groupby.md ## @@ -42,42 +42,55 @@ aggregate_name ( [ DISTINCT ] expression [ , ... ] ) [ FILTER ( WHERE boolean_ex ### Parameters +* **grouping_expression** + +Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on +result values of the grouping expressions. A grouping expression may be a column name like `GROUP BY a`, a column position like +`GROUP BY 0`, or an expression like `GROUP BY a + b`. + * **GROUPING SETS** -Groups the rows for each subset of the expressions specified in the grouping sets. For example, -`GROUP BY GROUPING SETS (warehouse, product)` is semantically equivalent +Groups the rows for each grouping set specified after GROUPING SETS. For example, +`GROUP BY GROUPING SETS ((warehouse), (product))` is semantically equivalent to union of results of `GROUP BY warehouse` and `GROUP BY product`. This clause is a shorthand for a `UNION ALL` where each leg of the `UNION ALL` operator performs aggregation of subset of the columns specified in the `GROUPING SETS` clause. - -* **grouping_set** - -A grouping set is specified by zero or more comma-separated expressions in parentheses. - -**Syntax:** `( [ expression [ , ... ] ] )` - -* **grouping_expression** - -Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on -result values of the grouping expressions. A grouping expression may be a column alias, a column position -or an expression. +Similarly, `GROUP BY GROUPING SETS ((warehouse, product), (product), ())` is semantically +equivalent to the union of results of `GROUP BY warehouse, product`, `GROUP BY warehouse` +and global aggregate. + +GROUPING SETS can be followed with normal expressions as well, such as +`GROUP BY GROUPING SETS (warehouse, product)`. 
It generates one grouping set +per non-empty subset of the given expressions, so `GROUP BY GROUPING SETS (warehouse, product)` +is equivalent to `GROUP BY GROUPING SETS ((warehouse), (product))`. * **ROLLUP** Specifies multiple levels of aggregations in a single statement. This clause is used to compute aggregations based on multiple grouping sets. `ROLLUP` is a shorthand for `GROUPING SETS`. For example, -`GROUP BY warehouse, product WITH ROLLUP` is equivalent to `GROUP BY GROUPING SETS -((warehouse, product), (warehouse), ())`. +`GROUP BY warehouse, product WITH ROLLUP` or `GROUP BY ROLLUP(warehouse, product)` is equivalent to +`GROUP BY GROUPING SETS((warehouse, product), (warehouse), ())`. +`GROUP BY ROLLUP(warehouse, product, (warehouse, location))` is equivalent to +`GROUP BY GROUPING SETS((warehouse, product, location), (warehouse, product), (warehouse), ())`. The N elements of a `ROLLUP` specification results in N+1 `GROUPING SETS`. * **CUBE** `CUBE` clause is used to perform aggregations based on combination of grouping columns specified in the `GROUP BY` clause. `CUBE` is a shorthand for `GROUPING SETS`. For example, -`GROUP BY warehouse, product WITH CUBE` is equivalent to `GROUP BY GROUPING SETS -((warehouse, product), (warehouse), (product), ())`. +`GROUP BY warehouse, product WITH CUBE` or `GROUP BY CUBE(warehouse, product)` is equivalent to +`GROUP BY GROUPING SETS((warehouse, product), (warehouse), (product), ())`. +`GROUP BY CUBE(warehouse, product, (warehouse, location))` is equivalent to +`GROUP BY GROUPING SETS((warehouse, product, location), (warehouse, product), (warehouse, location), + (product, warehouse, location), (warehouse), (product), (warehouse, product), ())`. Review comment: > `(warehouse, product)` is duplicated, is it OK? It's right -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r602042757 ## File path: docs/sql-ref-syntax-qry-select-groupby.md ## @@ -42,42 +42,44 @@ aggregate_name ( [ DISTINCT ] expression [ , ... ] ) [ FILTER ( WHERE boolean_ex ### Parameters +* **grouping_expression** + +Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on +result values of the grouping expressions. A grouping expression may be a column name like `GROUP BY a`, a column position like +`GROUP BY 0`, or an expression like `GROUP BY a + b`. + * **GROUPING SETS** Groups the rows for each subset of the expressions specified in the grouping sets. For example, -`GROUP BY GROUPING SETS (warehouse, product)` is semantically equivalent +`GROUP BY GROUPING SETS ((warehouse), (product))` is semantically equivalent to union of results of `GROUP BY warehouse` and `GROUP BY product`. This clause is a shorthand for a `UNION ALL` where each leg of the `UNION ALL` operator performs aggregation of subset of the columns specified in the `GROUPING SETS` clause. - -* **grouping_set** - -A grouping set is specified by zero or more comma-separated expressions in parentheses. - -**Syntax:** `( [ expression [ , ... ] ] )` - -* **grouping_expression** - -Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on -result values of the grouping expressions. A grouping expression may be a column alias, a column position -or an expression. +`GROUP BY GROUPING SETS ((warehouse, product), (product), ())` is semantically equivalent to union +of results of `GROUP BY warehouse, product`, `GROUP BY warehouse` and `GROUP BY ()`. * **ROLLUP** Specifies multiple levels of aggregations in a single statement. This clause is used to compute aggregations based on multiple grouping sets. `ROLLUP` is a shorthand for `GROUPING SETS`. 
For example, -`GROUP BY warehouse, product WITH ROLLUP` is equivalent to `GROUP BY GROUPING SETS -((warehouse, product), (warehouse), ())`. +`GROUP BY warehouse, product WITH ROLLUP` or `GROUP BY ROLLUP(warehouse, product)` is equivalent to +`GROUP BY GROUPING SETS((warehouse, product), (warehouse), ())`. The N elements of a `ROLLUP` specification results in N+1 `GROUPING SETS`. * **CUBE** `CUBE` clause is used to perform aggregations based on combination of grouping columns specified in the `GROUP BY` clause. `CUBE` is a shorthand for `GROUPING SETS`. For example, -`GROUP BY warehouse, product WITH CUBE` is equivalent to `GROUP BY GROUPING SETS -((warehouse, product), (warehouse), (product), ())`. +`GROUP BY warehouse, product WITH CUBE` or `GROUP BY CUBE(warehouse, product)` is equivalent to +`GROUP BY GROUPING SETS((warehouse, product), (warehouse), (product), ())`. The N elements of a `CUBE` specification results in 2^N `GROUPING SETS`. +* **grouping_set** + +A grouping set is specified by zero or more comma-separated expressions in parentheses. + +**Syntax:** `( [ expression [ , ... ] ] )` Review comment: > what's its corresponding representation in `GROUPING SETS`? As the rule it should be `grouping sets((id, city, (id, car_model)), (id, city), (id), ())` But it can't run since have two level of `()` We should to write it as `grouping sets((id, city, id, car_model), (id, city), (id), ())` remove the duplicated column it can be `grouping sets((id, city, car_model), (id, city), (id), ())` ## File path: docs/sql-ref-syntax-qry-select-groupby.md ## @@ -42,42 +42,44 @@ aggregate_name ( [ DISTINCT ] expression [ , ... ] ) [ FILTER ( WHERE boolean_ex ### Parameters +* **grouping_expression** + +Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on +result values of the grouping expressions. 
A grouping expression may be a column name like `GROUP BY a`, a column position like +`GROUP BY 0`, or an expression like `GROUP BY a + b`. + * **GROUPING SETS** Groups the rows for each subset of the expressions specified in the grouping sets. For example, -`GROUP BY GROUPING SETS (warehouse, product)` is semantically equivalent +`GROUP BY GROUPING SETS ((warehouse), (product))` is semantically equivalent to union of results of `GROUP BY warehouse` and `GROUP BY product`. This clause is a shorthand for a `UNION ALL` where each leg of the `UNION ALL` operator performs aggregation of subset of the columns specified in the `GROUPING SETS` clause. - -* **grouping_set** - -A grouping set is specified by zero or more comma-separated expressions in parentheses. - -**Syntax:** `( [ expression [ , ... ] ] )` - -* **grouping_expression** - -
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r602044878 ## File path: docs/sql-ref-syntax-qry-select-groupby.md ## @@ -42,42 +42,44 @@ aggregate_name ( [ DISTINCT ] expression [ , ... ] ) [ FILTER ( WHERE boolean_ex ### Parameters +* **grouping_expression** + +Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on +result values of the grouping expressions. A grouping expression may be a column name like `GROUP BY a`, a column position like +`GROUP BY 0`, or an expression like `GROUP BY a + b`. + * **GROUPING SETS** Groups the rows for each subset of the expressions specified in the grouping sets. For example, -`GROUP BY GROUPING SETS (warehouse, product)` is semantically equivalent +`GROUP BY GROUPING SETS ((warehouse), (product))` is semantically equivalent to union of results of `GROUP BY warehouse` and `GROUP BY product`. This clause is a shorthand for a `UNION ALL` where each leg of the `UNION ALL` operator performs aggregation of subset of the columns specified in the `GROUPING SETS` clause. - -* **grouping_set** - -A grouping set is specified by zero or more comma-separated expressions in parentheses. - -**Syntax:** `( [ expression [ , ... ] ] )` - -* **grouping_expression** - -Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on -result values of the grouping expressions. A grouping expression may be a column alias, a column position -or an expression. +`GROUP BY GROUPING SETS ((warehouse, product), (product), ())` is semantically equivalent to union +of results of `GROUP BY warehouse, product`, `GROUP BY warehouse` and `GROUP BY ()`. * **ROLLUP** Specifies multiple levels of aggregations in a single statement. This clause is used to compute aggregations based on multiple grouping sets. `ROLLUP` is a shorthand for `GROUPING SETS`. 
For example, -`GROUP BY warehouse, product WITH ROLLUP` is equivalent to `GROUP BY GROUPING SETS -((warehouse, product), (warehouse), ())`. +`GROUP BY warehouse, product WITH ROLLUP` or `GROUP BY ROLLUP(warehouse, product)` is equivalent to +`GROUP BY GROUPING SETS((warehouse, product), (warehouse), ())`. The N elements of a `ROLLUP` specification results in N+1 `GROUPING SETS`. * **CUBE** `CUBE` clause is used to perform aggregations based on combination of grouping columns specified in the `GROUP BY` clause. `CUBE` is a shorthand for `GROUPING SETS`. For example, -`GROUP BY warehouse, product WITH CUBE` is equivalent to `GROUP BY GROUPING SETS -((warehouse, product), (warehouse), (product), ())`. +`GROUP BY warehouse, product WITH CUBE` or `GROUP BY CUBE(warehouse, product)` is equivalent to +`GROUP BY GROUPING SETS((warehouse, product), (warehouse), (product), ())`. The N elements of a `CUBE` specification results in 2^N `GROUPING SETS`. +* **grouping_set** + +A grouping set is specified by zero or more comma-separated expressions in parentheses. + +**Syntax:** `( [ expression [ , ... ] ] )` Review comment: > It's hard to tell the behavior from the examples, can we just document the behavior of `rollup(id, city, (id, car_model))`? Done, same rule, it depending on how you want to combine the group. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r602042757 ## File path: docs/sql-ref-syntax-qry-select-groupby.md ## @@ -42,42 +42,44 @@ aggregate_name ( [ DISTINCT ] expression [ , ... ] ) [ FILTER ( WHERE boolean_ex ### Parameters +* **grouping_expression** + +Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on +result values of the grouping expressions. A grouping expression may be a column name like `GROUP BY a`, a column position like +`GROUP BY 0`, or an expression like `GROUP BY a + b`. + * **GROUPING SETS** Groups the rows for each subset of the expressions specified in the grouping sets. For example, -`GROUP BY GROUPING SETS (warehouse, product)` is semantically equivalent +`GROUP BY GROUPING SETS ((warehouse), (product))` is semantically equivalent to union of results of `GROUP BY warehouse` and `GROUP BY product`. This clause is a shorthand for a `UNION ALL` where each leg of the `UNION ALL` operator performs aggregation of subset of the columns specified in the `GROUPING SETS` clause. - -* **grouping_set** - -A grouping set is specified by zero or more comma-separated expressions in parentheses. - -**Syntax:** `( [ expression [ , ... ] ] )` - -* **grouping_expression** - -Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on -result values of the grouping expressions. A grouping expression may be a column alias, a column position -or an expression. +`GROUP BY GROUPING SETS ((warehouse, product), (product), ())` is semantically equivalent to union +of results of `GROUP BY warehouse, product`, `GROUP BY warehouse` and `GROUP BY ()`. * **ROLLUP** Specifies multiple levels of aggregations in a single statement. This clause is used to compute aggregations based on multiple grouping sets. `ROLLUP` is a shorthand for `GROUPING SETS`. 
For example, -`GROUP BY warehouse, product WITH ROLLUP` is equivalent to `GROUP BY GROUPING SETS -((warehouse, product), (warehouse), ())`. +`GROUP BY warehouse, product WITH ROLLUP` or `GROUP BY ROLLUP(warehouse, product)` is equivalent to +`GROUP BY GROUPING SETS((warehouse, product), (warehouse), ())`. The N elements of a `ROLLUP` specification results in N+1 `GROUPING SETS`. * **CUBE** `CUBE` clause is used to perform aggregations based on combination of grouping columns specified in the `GROUP BY` clause. `CUBE` is a shorthand for `GROUPING SETS`. For example, -`GROUP BY warehouse, product WITH CUBE` is equivalent to `GROUP BY GROUPING SETS -((warehouse, product), (warehouse), (product), ())`. +`GROUP BY warehouse, product WITH CUBE` or `GROUP BY CUBE(warehouse, product)` is equivalent to +`GROUP BY GROUPING SETS((warehouse, product), (warehouse), (product), ())`. The N elements of a `CUBE` specification results in 2^N `GROUPING SETS`. +* **grouping_set** + +A grouping set is specified by zero or more comma-separated expressions in parentheses. + +**Syntax:** `( [ expression [ , ... ] ] )` Review comment: > what's its corresponding representation in `GROUPING SETS`? `grouping sets((id, city, car_model), (id, city), (id), ())` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r602042757 ## File path: docs/sql-ref-syntax-qry-select-groupby.md ## @@ -42,42 +42,44 @@ aggregate_name ( [ DISTINCT ] expression [ , ... ] ) [ FILTER ( WHERE boolean_ex ### Parameters +* **grouping_expression** + +Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on +result values of the grouping expressions. A grouping expression may be a column name like `GROUP BY a`, a column position like +`GROUP BY 0`, or an expression like `GROUP BY a + b`. + * **GROUPING SETS** Groups the rows for each subset of the expressions specified in the grouping sets. For example, -`GROUP BY GROUPING SETS (warehouse, product)` is semantically equivalent +`GROUP BY GROUPING SETS ((warehouse), (product))` is semantically equivalent to union of results of `GROUP BY warehouse` and `GROUP BY product`. This clause is a shorthand for a `UNION ALL` where each leg of the `UNION ALL` operator performs aggregation of subset of the columns specified in the `GROUPING SETS` clause. - -* **grouping_set** - -A grouping set is specified by zero or more comma-separated expressions in parentheses. - -**Syntax:** `( [ expression [ , ... ] ] )` - -* **grouping_expression** - -Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on -result values of the grouping expressions. A grouping expression may be a column alias, a column position -or an expression. +`GROUP BY GROUPING SETS ((warehouse, product), (product), ())` is semantically equivalent to union +of results of `GROUP BY warehouse, product`, `GROUP BY warehouse` and `GROUP BY ()`. * **ROLLUP** Specifies multiple levels of aggregations in a single statement. This clause is used to compute aggregations based on multiple grouping sets. `ROLLUP` is a shorthand for `GROUPING SETS`. 
For example, -`GROUP BY warehouse, product WITH ROLLUP` is equivalent to `GROUP BY GROUPING SETS -((warehouse, product), (warehouse), ())`. +`GROUP BY warehouse, product WITH ROLLUP` or `GROUP BY ROLLUP(warehouse, product)` is equivalent to +`GROUP BY GROUPING SETS((warehouse, product), (warehouse), ())`. The N elements of a `ROLLUP` specification results in N+1 `GROUPING SETS`. * **CUBE** `CUBE` clause is used to perform aggregations based on combination of grouping columns specified in the `GROUP BY` clause. `CUBE` is a shorthand for `GROUPING SETS`. For example, -`GROUP BY warehouse, product WITH CUBE` is equivalent to `GROUP BY GROUPING SETS -((warehouse, product), (warehouse), (product), ())`. +`GROUP BY warehouse, product WITH CUBE` or `GROUP BY CUBE(warehouse, product)` is equivalent to +`GROUP BY GROUPING SETS((warehouse, product), (warehouse), (product), ())`. The N elements of a `CUBE` specification results in 2^N `GROUPING SETS`. +* **grouping_set** + +A grouping set is specified by zero or more comma-separated expressions in parentheses. + +**Syntax:** `( [ expression [ , ... ] ] )` Review comment: > what's its corresponding representation in `GROUPING SETS`? grouping sets((id, city, car_model), (id, city), (id), ()); -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r601480158 ## File path: docs/sql-ref-syntax-qry-select-groupby.md ## @@ -42,42 +42,44 @@ aggregate_name ( [ DISTINCT ] expression [ , ... ] ) [ FILTER ( WHERE boolean_ex ### Parameters +* **grouping_expression** + +Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on +result values of the grouping expressions. A grouping expression may be a column name like `GROUP BY a`, a column position like +`GROUP BY 0`, or an expression like `GROUP BY a + b`. + * **GROUPING SETS** Groups the rows for each subset of the expressions specified in the grouping sets. For example, Review comment: Done -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r601479820 ## File path: docs/sql-ref-syntax-qry-select-groupby.md ## @@ -42,42 +42,44 @@ aggregate_name ( [ DISTINCT ] expression [ , ... ] ) [ FILTER ( WHERE boolean_ex ### Parameters +* **grouping_expression** + +Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on +result values of the grouping expressions. A grouping expression may be a column name like `GROUP BY a`, a column position like +`GROUP BY 0`, or an expression like `GROUP BY a + b`. + * **GROUPING SETS** Groups the rows for each subset of the expressions specified in the grouping sets. For example, -`GROUP BY GROUPING SETS (warehouse, product)` is semantically equivalent +`GROUP BY GROUPING SETS ((warehouse), (product))` is semantically equivalent to union of results of `GROUP BY warehouse` and `GROUP BY product`. This clause is a shorthand for a `UNION ALL` where each leg of the `UNION ALL` operator performs aggregation of subset of the columns specified in the `GROUPING SETS` clause. - -* **grouping_set** - -A grouping set is specified by zero or more comma-separated expressions in parentheses. - -**Syntax:** `( [ expression [ , ... ] ] )` - -* **grouping_expression** - -Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on -result values of the grouping expressions. A grouping expression may be a column alias, a column position -or an expression. +`GROUP BY GROUPING SETS ((warehouse, product), (product), ())` is semantically equivalent to union Review comment: yea, done -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r601478487 ## File path: docs/sql-ref-syntax-qry-select-groupby.md ## @@ -42,42 +42,44 @@ aggregate_name ( [ DISTINCT ] expression [ , ... ] ) [ FILTER ( WHERE boolean_ex ### Parameters +* **grouping_expression** + +Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on +result values of the grouping expressions. A grouping expression may be a column name like `GROUP BY a`, a column position like +`GROUP BY 0`, or an expression like `GROUP BY a + b`. + * **GROUPING SETS** Groups the rows for each subset of the expressions specified in the grouping sets. For example, -`GROUP BY GROUPING SETS (warehouse, product)` is semantically equivalent +`GROUP BY GROUPING SETS ((warehouse), (product))` is semantically equivalent to union of results of `GROUP BY warehouse` and `GROUP BY product`. This clause is a shorthand for a `UNION ALL` where each leg of the `UNION ALL` operator performs aggregation of subset of the columns specified in the `GROUPING SETS` clause. - -* **grouping_set** - -A grouping set is specified by zero or more comma-separated expressions in parentheses. - -**Syntax:** `( [ expression [ , ... ] ] )` - -* **grouping_expression** - -Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on -result values of the grouping expressions. A grouping expression may be a column alias, a column position -or an expression. +`GROUP BY GROUPING SETS ((warehouse, product), (product), ())` is semantically equivalent to union +of results of `GROUP BY warehouse, product`, `GROUP BY warehouse` and `GROUP BY ()`. * **ROLLUP** Specifies multiple levels of aggregations in a single statement. This clause is used to compute aggregations based on multiple grouping sets. `ROLLUP` is a shorthand for `GROUPING SETS`. 
For example, -`GROUP BY warehouse, product WITH ROLLUP` is equivalent to `GROUP BY GROUPING SETS -((warehouse, product), (warehouse), ())`. +`GROUP BY warehouse, product WITH ROLLUP` or `GROUP BY ROLLUP(warehouse, product)` is equivalent to +`GROUP BY GROUPING SETS((warehouse, product), (warehouse), ())`. The N elements of a `ROLLUP` specification results in N+1 `GROUPING SETS`. * **CUBE** `CUBE` clause is used to perform aggregations based on combination of grouping columns specified in the `GROUP BY` clause. `CUBE` is a shorthand for `GROUPING SETS`. For example, -`GROUP BY warehouse, product WITH CUBE` is equivalent to `GROUP BY GROUPING SETS -((warehouse, product), (warehouse), (product), ())`. +`GROUP BY warehouse, product WITH CUBE` or `GROUP BY CUBE(warehouse, product)` is equivalent to +`GROUP BY GROUPING SETS((warehouse, product), (warehouse), (product), ())`. The N elements of a `CUBE` specification results in 2^N `GROUPING SETS`. +* **grouping_set** + +A grouping set is specified by zero or more comma-separated expressions in parentheses. + +**Syntax:** `( [ expression [ , ... ] ] )` Review comment: > it's actually `{ ( [ expression [ , ... ] ] ) | expression }` Yea, more correct. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r601478215 ## File path: docs/sql-ref-syntax-qry-select-groupby.md ## @@ -42,42 +42,44 @@ aggregate_name ( [ DISTINCT ] expression [ , ... ] ) [ FILTER ( WHERE boolean_ex ### Parameters +* **grouping_expression** + +Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on +result values of the grouping expressions. A grouping expression may be a column name like `GROUP BY a`, a column position like +`GROUP BY 0`, or an expression like `GROUP BY a + b`. + * **GROUPING SETS** Groups the rows for each subset of the expressions specified in the grouping sets. For example, -`GROUP BY GROUPING SETS (warehouse, product)` is semantically equivalent +`GROUP BY GROUPING SETS ((warehouse), (product))` is semantically equivalent to union of results of `GROUP BY warehouse` and `GROUP BY product`. This clause is a shorthand for a `UNION ALL` where each leg of the `UNION ALL` operator performs aggregation of subset of the columns specified in the `GROUPING SETS` clause. - -* **grouping_set** - -A grouping set is specified by zero or more comma-separated expressions in parentheses. - -**Syntax:** `( [ expression [ , ... ] ] )` - -* **grouping_expression** - -Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on -result values of the grouping expressions. A grouping expression may be a column alias, a column position -or an expression. +`GROUP BY GROUPING SETS ((warehouse, product), (product), ())` is semantically equivalent to union +of results of `GROUP BY warehouse, product`, `GROUP BY warehouse` and `GROUP BY ()`. * **ROLLUP** Specifies multiple levels of aggregations in a single statement. This clause is used to compute aggregations based on multiple grouping sets. `ROLLUP` is a shorthand for `GROUPING SETS`. 
For example, -`GROUP BY warehouse, product WITH ROLLUP` is equivalent to `GROUP BY GROUPING SETS -((warehouse, product), (warehouse), ())`. +`GROUP BY warehouse, product WITH ROLLUP` or `GROUP BY ROLLUP(warehouse, product)` is equivalent to +`GROUP BY GROUPING SETS((warehouse, product), (warehouse), ())`. The N elements of a `ROLLUP` specification results in N+1 `GROUPING SETS`. * **CUBE** `CUBE` clause is used to perform aggregations based on combination of grouping columns specified in the `GROUP BY` clause. `CUBE` is a shorthand for `GROUPING SETS`. For example, -`GROUP BY warehouse, product WITH CUBE` is equivalent to `GROUP BY GROUPING SETS -((warehouse, product), (warehouse), (product), ())`. +`GROUP BY warehouse, product WITH CUBE` or `GROUP BY CUBE(warehouse, product)` is equivalent to +`GROUP BY GROUPING SETS((warehouse, product), (warehouse), (product), ())`. The N elements of a `CUBE` specification results in 2^N `GROUPING SETS`. +* **grouping_set** + +A grouping set is specified by zero or more comma-separated expressions in parentheses. + +**Syntax:** `( [ expression [ , ... ] ] )` Review comment: > do we support `ROLLUP((a, b), (a))`? ``` > select id, city, car_model, sum(quantity) from dealer group by rollup(id, city, (id, car_model)); NULL NULLNULL78 200 Dublin Honda CRV 3 300 San JoseNULL13 200 Dublin Honda Accord10 100 NULLNULL32 200 Dublin Honda Civic 20 100 Fremont Honda Civic 10 200 Dublin NULL33 200 NULLNULL33 300 San JoseHonda Accord8 300 San JoseHonda Civic 5 100 Fremont NULL32 100 Fremont Honda Accord15 100 Fremont Honda CRV 7 300 NULLNULL13 Time taken: 1.287 seconds, Fetched 15 row(s) spark-sql> ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r601476562 ## File path: docs/sql-ref-syntax-qry-select-groupby.md ## @@ -32,7 +32,7 @@ When a FILTER clause is attached to an aggregate function, only the matching row GROUP BY group_expression [ , group_expression [ , ... ] ] [ { WITH ROLLUP | WITH CUBE | GROUPING SETS (grouping_set [ , ...]) } ] Review comment: You can see the comment https://github.com/apache/spark/pull/30212/files#r529501391 Not all cases can be covered, such as `GROUP BY A, B grouping sets(a, (a, b ))` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r601268462 ## File path: docs/sql-ref-syntax-qry-select-groupby.md ## @@ -42,42 +42,44 @@ aggregate_name ( [ DISTINCT ] expression [ , ... ] ) [ FILTER ( WHERE boolean_ex ### Parameters +* **grouping_expression** + +Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on +result values of the grouping expressions. A grouping expression may be a column alias like `GROUP BY a`, a column position like +`GROUP BY 0`, or an expression like `GROUP BY a + b`. + * **GROUPING SETS** Groups the rows for each subset of the expressions specified in the grouping sets. For example, `GROUP BY GROUPING SETS (warehouse, product)` is semantically equivalent -to union of results of `GROUP BY warehouse` and `GROUP BY product`. This clause +to union of results of `GROUP BY warehouse` and `GROUP BY product`. +`GROUP BY GROUPING SETS ((warehouse, product), (product), ())` is semantically equivalent to union Review comment: Done -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r601264788 ## File path: docs/sql-ref-syntax-qry-select-groupby.md ## @@ -42,42 +42,44 @@ aggregate_name ( [ DISTINCT ] expression [ , ... ] ) [ FILTER ( WHERE boolean_ex ### Parameters +* **grouping_expression** + +Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on +result values of the grouping expressions. A grouping expression may be a column alias like `GROUP BY a`, a column position like +`GROUP BY 0`, or an expression like `GROUP BY a + b`. + * **GROUPING SETS** Groups the rows for each subset of the expressions specified in the grouping sets. For example, Review comment: Yea, more clear, Done ## File path: docs/sql-ref-syntax-qry-select-groupby.md ## @@ -42,42 +42,44 @@ aggregate_name ( [ DISTINCT ] expression [ , ... ] ) [ FILTER ( WHERE boolean_ex ### Parameters +* **grouping_expression** + +Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on +result values of the grouping expressions. A grouping expression may be a column alias like `GROUP BY a`, a column position like Review comment: Done -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r601250092 ## File path: docs/sql-ref-syntax-qry-select-groupby.md ## @@ -32,7 +32,7 @@ When a FILTER clause is attached to an aggregate function, only the matching row GROUP BY group_expression [ , group_expression [ , ... ] ] [ { WITH ROLLUP | WITH CUBE | GROUPING SETS (grouping_set [ , ...]) } ] Review comment: > BTW do we really support `GROUP BY a, b GROUPING SETS (c, d)`? Not yet. Same reason as https://github.com/apache/spark/pull/30212#discussion_r601171306 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r601244661 ## File path: docs/sql-ref-syntax-qry-select-groupby.md ## @@ -32,7 +32,7 @@ When a FILTER clause is attached to an aggregate function, only the matching row GROUP BY group_expression [ , group_expression [ , ... ] ] [ { WITH ROLLUP | WITH CUBE | GROUPING SETS (grouping_set [ , ...]) } ] -GROUP BY GROUPING SETS (grouping_set [ , ...]) +GROUP BY { group_expression | { ROLLUP | CUBE | GROUPING SETS } (grouping_set [ , ...]) } [ , ... ] Review comment: > Shall we remove the `[ , ... ]` at the end since we don't support it right now? Syntactically speaking, we can support this after this PR, but execution is not currently supported. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r601173492 ## File path: docs/sql-ref-syntax-qry-select-groupby.md ## @@ -50,34 +65,28 @@ aggregate_name ( [ DISTINCT ] expression [ , ... ] ) [ FILTER ( WHERE boolean_ex is a shorthand for a `UNION ALL` where each leg of the `UNION ALL` operator performs aggregation of subset of the columns specified in the `GROUPING SETS` clause. Review comment: Should we explain more about `group by ()`? It means to aggregate all results. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r601172474 ## File path: docs/sql-ref-syntax-qry-select-groupby.md ## @@ -42,6 +42,21 @@ aggregate_name ( [ DISTINCT ] expression [ , ... ] ) [ FILTER ( WHERE boolean_ex ### Parameters +* **grouping_expression** + +Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on +result values of the grouping expressions. A grouping expression may be a column alias, a column position +or an expression. Review comment: Done ## File path: docs/sql-ref-syntax-qry-select-groupby.md ## @@ -32,7 +32,7 @@ When a FILTER clause is attached to an aggregate function, only the matching row GROUP BY group_expression [ , group_expression [ , ... ] ] [ { WITH ROLLUP | WITH CUBE | GROUPING SETS (grouping_set [ , ...]) } ] -GROUP BY GROUPING SETS (grouping_set [ , ...]) +GROUP BY group_expression_with_grouping_analytics [ , group_expression_with_grouping_analytics [ , ... ] ] Review comment: Done ## File path: docs/sql-ref-syntax-qry-select-groupby.md ## @@ -42,6 +42,21 @@ aggregate_name ( [ DISTINCT ] expression [ , ... ] ) [ FILTER ( WHERE boolean_ex ### Parameters +* **grouping_expression** + +Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on +result values of the grouping expressions. A grouping expression may be a column alias, a column position +or an expression. + +* **group_expression_with_grouping_analytics** Review comment: Done ## File path: docs/sql-ref-syntax-qry-select-groupby.md ## @@ -50,34 +65,28 @@ aggregate_name ( [ DISTINCT ] expression [ , ... ] ) [ FILTER ( WHERE boolean_ex is a shorthand for a `UNION ALL` where each leg of the `UNION ALL` operator performs aggregation of subset of the columns specified in the `GROUPING SETS` clause. 
-* **grouping_set** - -A grouping set is specified by zero or more comma-separated expressions in parentheses. - -**Syntax:** `( [ expression [ , ... ] ] )` - -* **grouping_expression** - -Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on -result values of the grouping expressions. A grouping expression may be a column alias, a column position -or an expression. - * **ROLLUP** Specifies multiple levels of aggregations in a single statement. This clause is used to compute aggregations based on multiple grouping sets. `ROLLUP` is a shorthand for `GROUPING SETS`. For example, -`GROUP BY warehouse, product WITH ROLLUP` is equivalent to `GROUP BY GROUPING SETS -((warehouse, product), (warehouse), ())`. +`GROUP BY warehouse, product WITH ROLLUP` is equivalent to `GROUP BY ROLLUP(warehouse, product)` or Review comment: Done ## File path: docs/sql-ref-syntax-qry-select-groupby.md ## @@ -50,34 +65,28 @@ aggregate_name ( [ DISTINCT ] expression [ , ... ] ) [ FILTER ( WHERE boolean_ex is a shorthand for a `UNION ALL` where each leg of the `UNION ALL` operator performs aggregation of subset of the columns specified in the `GROUPING SETS` clause. Review comment: Done -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r601171306 ## File path: docs/sql-ref-syntax-qry-select-groupby.md ## @@ -50,34 +65,28 @@ aggregate_name ( [ DISTINCT ] expression [ , ... ] ) [ FILTER ( WHERE boolean_ex is a shorthand for a `UNION ALL` where each leg of the `UNION ALL` operator performs aggregation of subset of the columns specified in the `GROUPING SETS` clause. -* **grouping_set** - -A grouping set is specified by zero or more comma-separated expressions in parentheses. - -**Syntax:** `( [ expression [ , ... ] ] )` - -* **grouping_expression** - -Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on -result values of the grouping expressions. A grouping expression may be a column alias, a column position -or an expression. - * **ROLLUP** Specifies multiple levels of aggregations in a single statement. This clause is used to compute aggregations based on multiple grouping sets. `ROLLUP` is a shorthand for `GROUPING SETS`. For example, -`GROUP BY warehouse, product WITH ROLLUP` is equivalent to `GROUP BY GROUPING SETS -((warehouse, product), (warehouse), ())`. +`GROUP BY warehouse, product WITH ROLLUP` is equivalent to `GROUP BY ROLLUP(warehouse, product)` or +`GROUP BY GROUPING SETS((warehouse, product), (warehouse), ())`. The N elements of a `ROLLUP` specification results in N+1 `GROUPING SETS`. * **CUBE** `CUBE` clause is used to perform aggregations based on combination of grouping columns specified in the `GROUP BY` clause. `CUBE` is a shorthand for `GROUPING SETS`. For example, -`GROUP BY warehouse, product WITH CUBE` is equivalent to `GROUP BY GROUPING SETS -((warehouse, product), (warehouse), (product), ())`. +`GROUP BY warehouse, product WITH CUBE` is equivalent to `GROUP BY CUBE(warehouse, product)` or +`GROUP BY GROUPING SETS((warehouse, product), (warehouse), (product), ())`. 
The N elements of a `CUBE` specification result in 2^N `GROUPING SETS`. Review comment: > seems we miss the document of multiple grouping sets, which is newly added in this PR. What does `GROUP BY ROLLUP(a, b), ROLLUP(c, d)` mean? We can't support this now. We should support this in https://github.com/apache/spark/pull/30144 While working on https://github.com/apache/spark/pull/30144, we found that the current grouping analytics code needs to be refactored before it can support more flexible cases like those in PostgreSQL and Oracle. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r601161941 ## File path: docs/sql-ref-syntax-qry-select-groupby.md ## @@ -50,34 +65,28 @@ aggregate_name ( [ DISTINCT ] expression [ , ... ] ) [ FILTER ( WHERE boolean_ex is a shorthand for a `UNION ALL` where each leg of the `UNION ALL` operator performs aggregation of subset of the columns specified in the `GROUPING SETS` clause. -* **grouping_set** - -A grouping set is specified by zero or more comma-separated expressions in parentheses. - -**Syntax:** `( [ expression [ , ... ] ] )` - -* **grouping_expression** - -Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on -result values of the grouping expressions. A grouping expression may be a column alias, a column position -or an expression. - * **ROLLUP** Specifies multiple levels of aggregations in a single statement. This clause is used to compute aggregations based on multiple grouping sets. `ROLLUP` is a shorthand for `GROUPING SETS`. For example, -`GROUP BY warehouse, product WITH ROLLUP` is equivalent to `GROUP BY GROUPING SETS -((warehouse, product), (warehouse), ())`. +`GROUP BY warehouse, product WITH ROLLUP` is equivalent to `GROUP BY ROLLUP(warehouse, product)` or +`GROUP BY GROUPING SETS((warehouse, product), (warehouse), ())`. Review comment: > is this corrected? I though the result is a union of `GROUP BY warehouse, product`, `GROUP BY warehouse`, `GROUP BY ()` (global aggregate). 
``` spark-sql> CREATE TABLE dealer (id INT, city STRING, car_model STRING, quantity INT); Time taken: 3.084 seconds spark-sql> INSERT INTO dealer VALUES > (100, 'Fremont', 'Honda Civic', 10), > (100, 'Fremont', 'Honda Accord', 15), > (100, 'Fremont', 'Honda CRV', 7), > (200, 'Dublin', 'Honda Civic', 20), > (200, 'Dublin', 'Honda Accord', 10), > (200, 'Dublin', 'Honda CRV', 3), > (300, 'San Jose', 'Honda Civic', 5), > (300, 'San Jose', 'Honda Accord', 8); 21/03/25 15:48:47 WARN ObjectStore: Failed to get database global_temp, returning NoSuchObjectException Time taken: 2.332 seconds spark-sql> select id, city, sum(quantity) from dealer group by rollup(id, city); 100 NULL32 300 NULL13 200 NULL33 NULL NULL78 200 Dublin 33 300 San Jose13 100 Fremont 32 Time taken: 1.388 seconds, Fetched 7 row(s) spark-sql> select id, city, sum(quantity) from dealer group by grouping sets((id, city), (id), ()); 100 NULL32 300 NULL13 200 NULL33 NULL NULL78 200 Dublin 33 300 San Jose13 100 Fremont 32 Time taken: 0.284 seconds, Fetched 7 row(s) spark-sql> explain select id, city, sum(quantity) from dealer group by rollup(id, city); == Physical Plan == AdaptiveSparkPlan isFinalPlan=false +- HashAggregate(keys=[id#68, city#69, spark_grouping_id#67L], functions=[sum(quantity#62)]) +- Exchange hashpartitioning(id#68, city#69, spark_grouping_id#67L, 200), ENSURE_REQUIREMENTS, [id=#113] +- HashAggregate(keys=[id#68, city#69, spark_grouping_id#67L], functions=[partial_sum(quantity#62)]) +- Expand [List(quantity#62, id#59, city#60, 0), List(quantity#62, id#59, null, 1), List(quantity#62, null, null, 3)], [quantity#62, id#68, city#69, spark_grouping_id#67L] +- Scan hive default.dealer [quantity#62, id#59, city#60], HiveTableRelation [`default`.`dealer`, org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, Data Cols: [id#59, city#60, car_model#61, quantity#62], Partition Cols: []] Time taken: 0.064 seconds, Fetched 1 row(s) spark-sql> explain select id, city, sum(quantity) from dealer group by 
grouping sets((id, city), (id), ()); == Physical Plan == AdaptiveSparkPlan isFinalPlan=false +- HashAggregate(keys=[id#86, city#87, spark_grouping_id#85L], functions=[sum(quantity#80)]) +- Exchange hashpartitioning(id#86, city#87, spark_grouping_id#85L, 200), ENSURE_REQUIREMENTS, [id=#136] +- HashAggregate(keys=[id#86, city#87, spark_grouping_id#85L], functions=[partial_sum(quantity#80)]) +- Expand [List(quantity#80, id#77, city#78, 0), List(quantity#80, id#77, null, 1), List(quantity#80, null, null, 3)], [quantity#80, id#86, city#87, spark_grouping_id#85L] +- Scan hive default.dealer [quantity#80, id#77, city#78], HiveTableRelation [`default`.`dealer`, org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, Data Cols: [id#77, city#78, car_model#79, quantity#80], Partition Cols: []] Time taken: 0.063 seconds, Fetched 1 row(s) spark-sql> ``` -- This is an automated message from the Apache Git Service. To respond to the
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r547222654 ## File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala ## @@ -850,29 +850,62 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg } /** - * Add an [[Aggregate]] or [[GroupingSets]] to a logical plan. + * Add an [[Aggregate]] to a logical plan. */ private def withAggregationClause( ctx: AggregationClauseContext, selectExpressions: Seq[NamedExpression], query: LogicalPlan): LogicalPlan = withOrigin(ctx) { -val groupByExpressions = expressionList(ctx.groupingExpressions) - -if (ctx.GROUPING != null) { - // GROUP BY GROUPING SETS (...) - val selectedGroupByExprs = -ctx.groupingSet.asScala.map(_.expression.asScala.map(e => expression(e)).toSeq) - GroupingSets(selectedGroupByExprs.toSeq, groupByExpressions, query, selectExpressions) -} else { - // GROUP BY (WITH CUBE | WITH ROLLUP)? - val mappedGroupByExpressions = if (ctx.CUBE != null) { -Seq(Cube(groupByExpressions)) - } else if (ctx.ROLLUP != null) { -Seq(Rollup(groupByExpressions)) +if (ctx.groupingExpressionsWithGroupingAnalytics.isEmpty) { + val groupByExpressions = expressionList(ctx.groupingExpressions) + if (ctx.GROUPING != null) { +// GROUP BY GROUPING SETS (...) +val selectedGroupByExprs = + ctx.groupingSet.asScala.map(_.expression.asScala.map(e => expression(e)).toSeq) +Aggregate(Seq(GroupingSets(selectedGroupByExprs, groupByExpressions)), + selectExpressions, query) } else { -groupByExpressions +// GROUP BY (WITH CUBE | WITH ROLLUP)? 
+val mappedGroupByExpressions = if (ctx.CUBE != null) { + Seq(Cube(groupByExpressions.map(Seq(_ +} else if (ctx.ROLLUP != null) { + Seq(Rollup(groupByExpressions.map(Seq(_ +} else { + groupByExpressions +} +Aggregate(mappedGroupByExpressions, selectExpressions, query) } - Aggregate(mappedGroupByExpressions, selectExpressions, query) +} else { + val groupByExpressions = +ctx.groupingExpressionsWithGroupingAnalytics.asScala + .map(groupByExpr => { +val groupingAnalytics = groupByExpr.groupingAnalytics +if (groupingAnalytics != null) { + val selectedGroupByExprs = groupingAnalytics.groupingSet.asScala +.map(_.expression.asScala.map(e => expression(e)).toSeq) + if (groupingAnalytics.CUBE != null) { +// CUBE(A, B, (A, B), ()) is not supported. +if (selectedGroupByExprs.exists(_.isEmpty)) { + throw new ParseException("Empty set in CUBE grouping sets is not supported.", +groupingAnalytics) +} +Cube(selectedGroupByExprs) + } else if (groupingAnalytics.ROLLUP != null) { +// ROLLUP(A, B, (A, B), ()) is not supported. +if (selectedGroupByExprs.exists(_.isEmpty)) { + throw new ParseException("Empty set in ROLLUP grouping sets is not supported.", +groupingAnalytics) +} +Rollup(selectedGroupByExprs) + } else { +GroupingSets(selectedGroupByExprs, selectedGroupByExprs.flatten.distinct) Review comment: > Could you check `assert(groupingAnalytics.GROUPING != null)`? Done This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r546248497 ## File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala ## @@ -850,29 +850,70 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg } /** - * Add an [[Aggregate]] or [[GroupingSets]] to a logical plan. + * Add an [[Aggregate]] to a logical plan. */ private def withAggregationClause( ctx: AggregationClauseContext, selectExpressions: Seq[NamedExpression], query: LogicalPlan): LogicalPlan = withOrigin(ctx) { -val groupByExpressions = expressionList(ctx.groupingExpressions) - -if (ctx.GROUPING != null) { - // GROUP BY GROUPING SETS (...) - val selectedGroupByExprs = -ctx.groupingSet.asScala.map(_.expression.asScala.map(e => expression(e)).toSeq) - GroupingSets(selectedGroupByExprs.toSeq, groupByExpressions, query, selectExpressions) -} else { - // GROUP BY (WITH CUBE | WITH ROLLUP)? - val mappedGroupByExpressions = if (ctx.CUBE != null) { -Seq(Cube(groupByExpressions)) - } else if (ctx.ROLLUP != null) { -Seq(Rollup(groupByExpressions)) +if (ctx.groupingExpressionsWithGroupingAnalytics.isEmpty) { + val groupByExpressions = expressionList(ctx.groupingExpressions) + if (ctx.GROUPING != null) { +// GROUP BY GROUPING SETS (...) +val selectedGroupByExprs = + ctx.groupingSet.asScala.map(_.expression.asScala.map(e => expression(e)).toSeq) +Aggregate(Seq(GroupingSets(selectedGroupByExprs, groupByExpressions)), + selectExpressions, query) } else { -groupByExpressions +// GROUP BY (WITH CUBE | WITH ROLLUP)? 
+val mappedGroupByExpressions = if (ctx.CUBE != null) { + Seq(Cube(groupByExpressions.map(Seq(_ +} else if (ctx.ROLLUP != null) { + Seq(Rollup(groupByExpressions.map(Seq(_ +} else { + groupByExpressions +} +Aggregate(mappedGroupByExpressions, selectExpressions, query) } - Aggregate(mappedGroupByExpressions, selectExpressions, query) +} else { + val groupByExpressions = +ctx.groupingExpressionsWithGroupingAnalytics.asScala + .map(groupByExpr => { +val expr = groupByExpr.expression() +val groupingAnalytics = groupByExpr.groupingAnalytics() +(expr, groupingAnalytics) match { + case (_: ExpressionContext, null) => +expression(expr) + case (null, _: GroupingAnalyticsContext) => +val selectedGroupByExprs = groupingAnalytics.groupingSet().asScala +.map(_.expression.asScala.map(e => expression(e)).toSeq) +if (groupingAnalytics.GROUPING() != null) { + GroupingSets(selectedGroupByExprs, selectedGroupByExprs.flatten.distinct) +} else if (groupingAnalytics.CUBE != null) { + // CUBE(A, B, (A, B), ()) is not supported. + if (selectedGroupByExprs.exists(_.isEmpty)) { +throw new ParseException("Empty set in CUBE grouping sets is not supported.", + groupingAnalytics) + } + Cube(selectedGroupByExprs) +} else if (groupingAnalytics.ROLLUP != null) { + // ROLLUP(A, B, (A, B), ()) is not supported. + if (selectedGroupByExprs.exists(_.isEmpty)) { +throw new ParseException("Empty set in ROLLUP grouping sets is not supported.", + groupingAnalytics) + } + Rollup(selectedGroupByExprs) +} else { + throw new IllegalArgumentException( +"Grouping Analytics only support CUBE/ROLLUP/GROUPING SETS.") +} + case (_, _) => +throw new IllegalArgumentException("Each GROUP BY expression should be" + Review comment: Change to use if else block. 
## File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala ## @@ -850,29 +850,70 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg } /** - * Add an [[Aggregate]] or [[GroupingSets]] to a logical plan. + * Add an [[Aggregate]] to a logical plan. */ private def withAggregationClause( ctx: AggregationClauseContext, selectExpressions: Seq[NamedExpression], query: LogicalPlan): LogicalPlan = withOrigin(ctx) { -val groupByExpressions = expressionList(ctx.groupingExpressions) - -if (ctx.GROUPING != null) { - // GROUP BY GROUPING SETS (...) - val selectedGroupByExprs = -ctx.groupingSet.asScala.map(_.expression.asScala.map(e =>
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r546239214 ## File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala ## @@ -850,29 +850,70 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg } /** - * Add an [[Aggregate]] or [[GroupingSets]] to a logical plan. + * Add an [[Aggregate]] to a logical plan. */ private def withAggregationClause( ctx: AggregationClauseContext, selectExpressions: Seq[NamedExpression], query: LogicalPlan): LogicalPlan = withOrigin(ctx) { -val groupByExpressions = expressionList(ctx.groupingExpressions) - -if (ctx.GROUPING != null) { - // GROUP BY GROUPING SETS (...) - val selectedGroupByExprs = -ctx.groupingSet.asScala.map(_.expression.asScala.map(e => expression(e)).toSeq) - GroupingSets(selectedGroupByExprs.toSeq, groupByExpressions, query, selectExpressions) -} else { - // GROUP BY (WITH CUBE | WITH ROLLUP)? - val mappedGroupByExpressions = if (ctx.CUBE != null) { -Seq(Cube(groupByExpressions)) - } else if (ctx.ROLLUP != null) { -Seq(Rollup(groupByExpressions)) +if (ctx.groupingExpressionsWithGroupingAnalytics.isEmpty) { + val groupByExpressions = expressionList(ctx.groupingExpressions) + if (ctx.GROUPING != null) { +// GROUP BY GROUPING SETS (...) +val selectedGroupByExprs = + ctx.groupingSet.asScala.map(_.expression.asScala.map(e => expression(e)).toSeq) +Aggregate(Seq(GroupingSets(selectedGroupByExprs, groupByExpressions)), + selectExpressions, query) } else { -groupByExpressions +// GROUP BY (WITH CUBE | WITH ROLLUP)? 
+val mappedGroupByExpressions = if (ctx.CUBE != null) { + Seq(Cube(groupByExpressions.map(Seq(_ +} else if (ctx.ROLLUP != null) { + Seq(Rollup(groupByExpressions.map(Seq(_ +} else { + groupByExpressions +} +Aggregate(mappedGroupByExpressions, selectExpressions, query) } - Aggregate(mappedGroupByExpressions, selectExpressions, query) +} else { + val groupByExpressions = +ctx.groupingExpressionsWithGroupingAnalytics.asScala + .map(groupByExpr => { +val expr = groupByExpr.expression() +val groupingAnalytics = groupByExpr.groupingAnalytics() +(expr, groupingAnalytics) match { + case (_: ExpressionContext, null) => +expression(expr) + case (null, _: GroupingAnalyticsContext) => +val selectedGroupByExprs = groupingAnalytics.groupingSet().asScala +.map(_.expression.asScala.map(e => expression(e)).toSeq) +if (groupingAnalytics.GROUPING() != null) { + GroupingSets(selectedGroupByExprs, selectedGroupByExprs.flatten.distinct) +} else if (groupingAnalytics.CUBE != null) { + // CUBE(A, B, (A, B), ()) is not supported. + if (selectedGroupByExprs.exists(_.isEmpty)) { +throw new ParseException("Empty set in CUBE grouping sets is not supported.", + groupingAnalytics) + } + Cube(selectedGroupByExprs) +} else if (groupingAnalytics.ROLLUP != null) { + // ROLLUP(A, B, (A, B), ()) is not supported. + if (selectedGroupByExprs.exists(_.isEmpty)) { +throw new ParseException("Empty set in ROLLUP grouping sets is not supported.", + groupingAnalytics) + } + Rollup(selectedGroupByExprs) +} else { + throw new IllegalArgumentException( +"Grouping Analytics only support CUBE/ROLLUP/GROUPING SETS.") +} + case (_, _) => +throw new IllegalArgumentException("Each GROUP BY expression should be" + Review comment: > ditto: We can reach this code path? Won't, I remember `match` should have `case` block for all case? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r546239097 ## File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala ## @@ -850,29 +850,70 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg } /** - * Add an [[Aggregate]] or [[GroupingSets]] to a logical plan. + * Add an [[Aggregate]] to a logical plan. */ private def withAggregationClause( ctx: AggregationClauseContext, selectExpressions: Seq[NamedExpression], query: LogicalPlan): LogicalPlan = withOrigin(ctx) { -val groupByExpressions = expressionList(ctx.groupingExpressions) - -if (ctx.GROUPING != null) { - // GROUP BY GROUPING SETS (...) - val selectedGroupByExprs = -ctx.groupingSet.asScala.map(_.expression.asScala.map(e => expression(e)).toSeq) - GroupingSets(selectedGroupByExprs.toSeq, groupByExpressions, query, selectExpressions) -} else { - // GROUP BY (WITH CUBE | WITH ROLLUP)? - val mappedGroupByExpressions = if (ctx.CUBE != null) { -Seq(Cube(groupByExpressions)) - } else if (ctx.ROLLUP != null) { -Seq(Rollup(groupByExpressions)) +if (ctx.groupingExpressionsWithGroupingAnalytics.isEmpty) { + val groupByExpressions = expressionList(ctx.groupingExpressions) + if (ctx.GROUPING != null) { +// GROUP BY GROUPING SETS (...) +val selectedGroupByExprs = + ctx.groupingSet.asScala.map(_.expression.asScala.map(e => expression(e)).toSeq) +Aggregate(Seq(GroupingSets(selectedGroupByExprs, groupByExpressions)), + selectExpressions, query) } else { -groupByExpressions +// GROUP BY (WITH CUBE | WITH ROLLUP)? 
+val mappedGroupByExpressions = if (ctx.CUBE != null) { + Seq(Cube(groupByExpressions.map(Seq(_ +} else if (ctx.ROLLUP != null) { + Seq(Rollup(groupByExpressions.map(Seq(_ +} else { + groupByExpressions +} +Aggregate(mappedGroupByExpressions, selectExpressions, query) } - Aggregate(mappedGroupByExpressions, selectExpressions, query) +} else { + val groupByExpressions = +ctx.groupingExpressionsWithGroupingAnalytics.asScala + .map(groupByExpr => { +val expr = groupByExpr.expression() +val groupingAnalytics = groupByExpr.groupingAnalytics() +(expr, groupingAnalytics) match { + case (_: ExpressionContext, null) => +expression(expr) + case (null, _: GroupingAnalyticsContext) => +val selectedGroupByExprs = groupingAnalytics.groupingSet().asScala +.map(_.expression.asScala.map(e => expression(e)).toSeq) +if (groupingAnalytics.GROUPING() != null) { + GroupingSets(selectedGroupByExprs, selectedGroupByExprs.flatten.distinct) +} else if (groupingAnalytics.CUBE != null) { + // CUBE(A, B, (A, B), ()) is not supported. + if (selectedGroupByExprs.exists(_.isEmpty)) { +throw new ParseException("Empty set in CUBE grouping sets is not supported.", + groupingAnalytics) + } + Cube(selectedGroupByExprs) +} else if (groupingAnalytics.ROLLUP != null) { + // ROLLUP(A, B, (A, B), ()) is not supported. + if (selectedGroupByExprs.exists(_.isEmpty)) { +throw new ParseException("Empty set in ROLLUP grouping sets is not supported.", + groupingAnalytics) + } + Rollup(selectedGroupByExprs) +} else { + throw new IllegalArgumentException( +"Grouping Analytics only support CUBE/ROLLUP/GROUPING SETS.") Review comment: > We can reach this code path? Won't, just remove this `else` block? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r542277956 ## File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala ## @@ -2027,65 +1952,76 @@ class Analyzer(override val catalogManager: CatalogManager) */ object ResolveFunctions extends Rule[LogicalPlan] { val trimWarningEnabled = new AtomicBoolean(true) + +def resolveFunction(): PartialFunction[Expression, Expression] = { + case u if !u.childrenResolved => u // Skip until children are resolved. + case u: UnresolvedAttribute if resolver(u.name, VirtualColumn.hiveGroupingIdName) => +withPosition(u) { + Alias(GroupingID(Nil), VirtualColumn.hiveGroupingIdName)() +} + case u @ UnresolvedGenerator(name, children) => +withPosition(u) { + v1SessionCatalog.lookupFunction(name, children) match { +case generator: Generator => generator +case other => + failAnalysis(s"$name is expected to be a generator. However, " + +s"its class is ${other.getClass.getCanonicalName}, which is not a generator.") + } +} + case u @ UnresolvedFunction(funcId, arguments, isDistinct, filter) => +withPosition(u) { + v1SessionCatalog.lookupFunction(funcId, arguments) match { +// AggregateWindowFunctions are AggregateFunctions that can only be evaluated within +// the context of a Window clause. They do not need to be wrapped in an +// AggregateExpression. +case wf: AggregateWindowFunction => + if (isDistinct || filter.isDefined) { +failAnalysis("DISTINCT or FILTER specified, " + + s"but ${wf.prettyName} is not an aggregate function") + } else { +wf + } +// We get an aggregate function, we need to wrap it in an AggregateExpression. 
+case agg: AggregateFunction => + if (filter.isDefined && !filter.get.deterministic) { +failAnalysis("FILTER expression is non-deterministic, " + + "it cannot be used in aggregate functions") + } + AggregateExpression(agg, Complete, isDistinct, filter) +// This function is not an aggregate function, just return the resolved one. +case other if (isDistinct || filter.isDefined) => + failAnalysis("DISTINCT or FILTER specified, " + +s"but ${other.prettyName} is not an aggregate function") +case e: String2TrimExpression if arguments.size == 2 => + if (trimWarningEnabled.get) { +logWarning("Two-parameter TRIM/LTRIM/RTRIM function signatures are deprecated." + + " Use SQL syntax `TRIM((BOTH | LEADING | TRAILING)? trimStr FROM str)`" + + " instead.") +trimWarningEnabled.set(false) + } + e +case other => + other + } +} +} + def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperatorsUp { // Resolve functions with concrete relations from v2 catalog. case UnresolvedFunc(multipartIdent) => val funcIdent = parseSessionCatalogFunctionIdentifier(multipartIdent) ResolvedFunc(Identifier.of(funcIdent.database.toArray, funcIdent.funcName)) - case q: LogicalPlan => -q transformExpressions { - case u if !u.childrenResolved => u // Skip until children are resolved. - case u: UnresolvedAttribute if resolver(u.name, VirtualColumn.hiveGroupingIdName) => -withPosition(u) { - Alias(GroupingID(Nil), VirtualColumn.hiveGroupingIdName)() -} - case u @ UnresolvedGenerator(name, children) => -withPosition(u) { - v1SessionCatalog.lookupFunction(name, children) match { -case generator: Generator => generator -case other => - failAnalysis(s"$name is expected to be a generator. 
However, " + -s"its class is ${other.getClass.getCanonicalName}, which is not a generator.") - } -} - case u @ UnresolvedFunction(funcId, arguments, isDistinct, filter) => -withPosition(u) { - v1SessionCatalog.lookupFunction(funcId, arguments) match { -// AggregateWindowFunctions are AggregateFunctions that can only be evaluated within -// the context of a Window clause. They do not need to be wrapped in an -// AggregateExpression. -case wf: AggregateWindowFunction => - if (isDistinct || filter.isDefined) { -failAnalysis("DISTINCT or FILTER specified, " + - s"but ${wf.prettyName} is not an
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r542257009 ## File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala ## @@ -2027,65 +1952,76 @@ class Analyzer(override val catalogManager: CatalogManager) */ object ResolveFunctions extends Rule[LogicalPlan] { val trimWarningEnabled = new AtomicBoolean(true) + +def resolveFunction(): PartialFunction[Expression, Expression] = { + case u if !u.childrenResolved => u // Skip until children are resolved. + case u: UnresolvedAttribute if resolver(u.name, VirtualColumn.hiveGroupingIdName) => +withPosition(u) { + Alias(GroupingID(Nil), VirtualColumn.hiveGroupingIdName)() +} + case u @ UnresolvedGenerator(name, children) => +withPosition(u) { + v1SessionCatalog.lookupFunction(name, children) match { +case generator: Generator => generator +case other => + failAnalysis(s"$name is expected to be a generator. However, " + +s"its class is ${other.getClass.getCanonicalName}, which is not a generator.") + } +} + case u @ UnresolvedFunction(funcId, arguments, isDistinct, filter) => +withPosition(u) { + v1SessionCatalog.lookupFunction(funcId, arguments) match { +// AggregateWindowFunctions are AggregateFunctions that can only be evaluated within +// the context of a Window clause. They do not need to be wrapped in an +// AggregateExpression. +case wf: AggregateWindowFunction => + if (isDistinct || filter.isDefined) { +failAnalysis("DISTINCT or FILTER specified, " + + s"but ${wf.prettyName} is not an aggregate function") + } else { +wf + } +// We get an aggregate function, we need to wrap it in an AggregateExpression. 
+case agg: AggregateFunction => + if (filter.isDefined && !filter.get.deterministic) { +failAnalysis("FILTER expression is non-deterministic, " + + "it cannot be used in aggregate functions") + } + AggregateExpression(agg, Complete, isDistinct, filter) +// This function is not an aggregate function, just return the resolved one. +case other if (isDistinct || filter.isDefined) => + failAnalysis("DISTINCT or FILTER specified, " + +s"but ${other.prettyName} is not an aggregate function") +case e: String2TrimExpression if arguments.size == 2 => + if (trimWarningEnabled.get) { +logWarning("Two-parameter TRIM/LTRIM/RTRIM function signatures are deprecated." + + " Use SQL syntax `TRIM((BOTH | LEADING | TRAILING)? trimStr FROM str)`" + + " instead.") +trimWarningEnabled.set(false) + } + e +case other => + other + } +} +} + def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperatorsUp { // Resolve functions with concrete relations from v2 catalog. case UnresolvedFunc(multipartIdent) => val funcIdent = parseSessionCatalogFunctionIdentifier(multipartIdent) ResolvedFunc(Identifier.of(funcIdent.database.toArray, funcIdent.funcName)) - case q: LogicalPlan => -q transformExpressions { - case u if !u.childrenResolved => u // Skip until children are resolved. - case u: UnresolvedAttribute if resolver(u.name, VirtualColumn.hiveGroupingIdName) => -withPosition(u) { - Alias(GroupingID(Nil), VirtualColumn.hiveGroupingIdName)() -} - case u @ UnresolvedGenerator(name, children) => -withPosition(u) { - v1SessionCatalog.lookupFunction(name, children) match { -case generator: Generator => generator -case other => - failAnalysis(s"$name is expected to be a generator. 
However, " + -s"its class is ${other.getClass.getCanonicalName}, which is not a generator.") - } -} - case u @ UnresolvedFunction(funcId, arguments, isDistinct, filter) => -withPosition(u) { - v1SessionCatalog.lookupFunction(funcId, arguments) match { -// AggregateWindowFunctions are AggregateFunctions that can only be evaluated within -// the context of a Window clause. They do not need to be wrapped in an -// AggregateExpression. -case wf: AggregateWindowFunction => - if (isDistinct || filter.isDefined) { -failAnalysis("DISTINCT or FILTER specified, " + - s"but ${wf.prettyName} is not an
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r533438082 ## File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala ## @@ -2027,65 +1952,76 @@ class Analyzer(override val catalogManager: CatalogManager) */ object ResolveFunctions extends Rule[LogicalPlan] { val trimWarningEnabled = new AtomicBoolean(true) + +def resolveFunction(): PartialFunction[Expression, Expression] = { + case u if !u.childrenResolved => u // Skip until children are resolved. + case u: UnresolvedAttribute if resolver(u.name, VirtualColumn.hiveGroupingIdName) => +withPosition(u) { + Alias(GroupingID(Nil), VirtualColumn.hiveGroupingIdName)() +} + case u @ UnresolvedGenerator(name, children) => +withPosition(u) { + v1SessionCatalog.lookupFunction(name, children) match { +case generator: Generator => generator +case other => + failAnalysis(s"$name is expected to be a generator. However, " + +s"its class is ${other.getClass.getCanonicalName}, which is not a generator.") + } +} + case u @ UnresolvedFunction(funcId, arguments, isDistinct, filter) => +withPosition(u) { + v1SessionCatalog.lookupFunction(funcId, arguments) match { +// AggregateWindowFunctions are AggregateFunctions that can only be evaluated within +// the context of a Window clause. They do not need to be wrapped in an +// AggregateExpression. +case wf: AggregateWindowFunction => + if (isDistinct || filter.isDefined) { +failAnalysis("DISTINCT or FILTER specified, " + + s"but ${wf.prettyName} is not an aggregate function") + } else { +wf + } +// We get an aggregate function, we need to wrap it in an AggregateExpression. 
+case agg: AggregateFunction => + if (filter.isDefined && !filter.get.deterministic) { +failAnalysis("FILTER expression is non-deterministic, " + + "it cannot be used in aggregate functions") + } + AggregateExpression(agg, Complete, isDistinct, filter) +// This function is not an aggregate function, just return the resolved one. +case other if (isDistinct || filter.isDefined) => + failAnalysis("DISTINCT or FILTER specified, " + +s"but ${other.prettyName} is not an aggregate function") +case e: String2TrimExpression if arguments.size == 2 => + if (trimWarningEnabled.get) { +logWarning("Two-parameter TRIM/LTRIM/RTRIM function signatures are deprecated." + + " Use SQL syntax `TRIM((BOTH | LEADING | TRAILING)? trimStr FROM str)`" + + " instead.") +trimWarningEnabled.set(false) + } + e +case other => + other + } +} +} + def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperatorsUp { // Resolve functions with concrete relations from v2 catalog. case UnresolvedFunc(multipartIdent) => val funcIdent = parseSessionCatalogFunctionIdentifier(multipartIdent) ResolvedFunc(Identifier.of(funcIdent.database.toArray, funcIdent.funcName)) - case q: LogicalPlan => -q transformExpressions { - case u if !u.childrenResolved => u // Skip until children are resolved. - case u: UnresolvedAttribute if resolver(u.name, VirtualColumn.hiveGroupingIdName) => -withPosition(u) { - Alias(GroupingID(Nil), VirtualColumn.hiveGroupingIdName)() -} - case u @ UnresolvedGenerator(name, children) => -withPosition(u) { - v1SessionCatalog.lookupFunction(name, children) match { -case generator: Generator => generator -case other => - failAnalysis(s"$name is expected to be a generator. 
However, " + -s"its class is ${other.getClass.getCanonicalName}, which is not a generator.") - } -} - case u @ UnresolvedFunction(funcId, arguments, isDistinct, filter) => -withPosition(u) { - v1SessionCatalog.lookupFunction(funcId, arguments) match { -// AggregateWindowFunctions are AggregateFunctions that can only be evaluated within -// the context of a Window clause. They do not need to be wrapped in an -// AggregateExpression. -case wf: AggregateWindowFunction => - if (isDistinct || filter.isDefined) { -failAnalysis("DISTINCT or FILTER specified, " + - s"but ${wf.prettyName} is not an
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r533152103 ## File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala ## @@ -2027,65 +1952,76 @@ class Analyzer(override val catalogManager: CatalogManager) */ object ResolveFunctions extends Rule[LogicalPlan] { val trimWarningEnabled = new AtomicBoolean(true) + +def resolveFunction(): PartialFunction[Expression, Expression] = { + case u if !u.childrenResolved => u // Skip until children are resolved. + case u: UnresolvedAttribute if resolver(u.name, VirtualColumn.hiveGroupingIdName) => +withPosition(u) { + Alias(GroupingID(Nil), VirtualColumn.hiveGroupingIdName)() +} + case u @ UnresolvedGenerator(name, children) => +withPosition(u) { + v1SessionCatalog.lookupFunction(name, children) match { +case generator: Generator => generator +case other => + failAnalysis(s"$name is expected to be a generator. However, " + +s"its class is ${other.getClass.getCanonicalName}, which is not a generator.") + } +} + case u @ UnresolvedFunction(funcId, arguments, isDistinct, filter) => +withPosition(u) { + v1SessionCatalog.lookupFunction(funcId, arguments) match { +// AggregateWindowFunctions are AggregateFunctions that can only be evaluated within +// the context of a Window clause. They do not need to be wrapped in an +// AggregateExpression. +case wf: AggregateWindowFunction => + if (isDistinct || filter.isDefined) { +failAnalysis("DISTINCT or FILTER specified, " + + s"but ${wf.prettyName} is not an aggregate function") + } else { +wf + } +// We get an aggregate function, we need to wrap it in an AggregateExpression. 
+case agg: AggregateFunction => + if (filter.isDefined && !filter.get.deterministic) { +failAnalysis("FILTER expression is non-deterministic, " + + "it cannot be used in aggregate functions") + } + AggregateExpression(agg, Complete, isDistinct, filter) +// This function is not an aggregate function, just return the resolved one. +case other if (isDistinct || filter.isDefined) => + failAnalysis("DISTINCT or FILTER specified, " + +s"but ${other.prettyName} is not an aggregate function") +case e: String2TrimExpression if arguments.size == 2 => + if (trimWarningEnabled.get) { +logWarning("Two-parameter TRIM/LTRIM/RTRIM function signatures are deprecated." + + " Use SQL syntax `TRIM((BOTH | LEADING | TRAILING)? trimStr FROM str)`" + + " instead.") +trimWarningEnabled.set(false) + } + e +case other => + other + } +} +} + def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperatorsUp { // Resolve functions with concrete relations from v2 catalog. case UnresolvedFunc(multipartIdent) => val funcIdent = parseSessionCatalogFunctionIdentifier(multipartIdent) ResolvedFunc(Identifier.of(funcIdent.database.toArray, funcIdent.funcName)) - case q: LogicalPlan => -q transformExpressions { - case u if !u.childrenResolved => u // Skip until children are resolved. - case u: UnresolvedAttribute if resolver(u.name, VirtualColumn.hiveGroupingIdName) => -withPosition(u) { - Alias(GroupingID(Nil), VirtualColumn.hiveGroupingIdName)() -} - case u @ UnresolvedGenerator(name, children) => -withPosition(u) { - v1SessionCatalog.lookupFunction(name, children) match { -case generator: Generator => generator -case other => - failAnalysis(s"$name is expected to be a generator. 
However, " + -s"its class is ${other.getClass.getCanonicalName}, which is not a generator.") - } -} - case u @ UnresolvedFunction(funcId, arguments, isDistinct, filter) => -withPosition(u) { - v1SessionCatalog.lookupFunction(funcId, arguments) match { -// AggregateWindowFunctions are AggregateFunctions that can only be evaluated within -// the context of a Window clause. They do not need to be wrapped in an -// AggregateExpression. -case wf: AggregateWindowFunction => - if (isDistinct || filter.isDefined) { -failAnalysis("DISTINCT or FILTER specified, " + - s"but ${wf.prettyName} is not an
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r533087740 ## File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala ## @@ -2027,65 +1952,76 @@ class Analyzer(override val catalogManager: CatalogManager) */ object ResolveFunctions extends Rule[LogicalPlan] { val trimWarningEnabled = new AtomicBoolean(true) + +def resolveFunction(): PartialFunction[Expression, Expression] = { + case u if !u.childrenResolved => u // Skip until children are resolved. + case u: UnresolvedAttribute if resolver(u.name, VirtualColumn.hiveGroupingIdName) => +withPosition(u) { + Alias(GroupingID(Nil), VirtualColumn.hiveGroupingIdName)() +} + case u @ UnresolvedGenerator(name, children) => +withPosition(u) { + v1SessionCatalog.lookupFunction(name, children) match { +case generator: Generator => generator +case other => + failAnalysis(s"$name is expected to be a generator. However, " + +s"its class is ${other.getClass.getCanonicalName}, which is not a generator.") + } +} + case u @ UnresolvedFunction(funcId, arguments, isDistinct, filter) => +withPosition(u) { + v1SessionCatalog.lookupFunction(funcId, arguments) match { +// AggregateWindowFunctions are AggregateFunctions that can only be evaluated within +// the context of a Window clause. They do not need to be wrapped in an +// AggregateExpression. +case wf: AggregateWindowFunction => + if (isDistinct || filter.isDefined) { +failAnalysis("DISTINCT or FILTER specified, " + + s"but ${wf.prettyName} is not an aggregate function") + } else { +wf + } +// We get an aggregate function, we need to wrap it in an AggregateExpression. 
+case agg: AggregateFunction => + if (filter.isDefined && !filter.get.deterministic) { +failAnalysis("FILTER expression is non-deterministic, " + + "it cannot be used in aggregate functions") + } + AggregateExpression(agg, Complete, isDistinct, filter) +// This function is not an aggregate function, just return the resolved one. +case other if (isDistinct || filter.isDefined) => + failAnalysis("DISTINCT or FILTER specified, " + +s"but ${other.prettyName} is not an aggregate function") +case e: String2TrimExpression if arguments.size == 2 => + if (trimWarningEnabled.get) { +logWarning("Two-parameter TRIM/LTRIM/RTRIM function signatures are deprecated." + + " Use SQL syntax `TRIM((BOTH | LEADING | TRAILING)? trimStr FROM str)`" + + " instead.") +trimWarningEnabled.set(false) + } + e +case other => + other + } +} +} + def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperatorsUp { // Resolve functions with concrete relations from v2 catalog. case UnresolvedFunc(multipartIdent) => val funcIdent = parseSessionCatalogFunctionIdentifier(multipartIdent) ResolvedFunc(Identifier.of(funcIdent.database.toArray, funcIdent.funcName)) - case q: LogicalPlan => -q transformExpressions { - case u if !u.childrenResolved => u // Skip until children are resolved. - case u: UnresolvedAttribute if resolver(u.name, VirtualColumn.hiveGroupingIdName) => -withPosition(u) { - Alias(GroupingID(Nil), VirtualColumn.hiveGroupingIdName)() -} - case u @ UnresolvedGenerator(name, children) => -withPosition(u) { - v1SessionCatalog.lookupFunction(name, children) match { -case generator: Generator => generator -case other => - failAnalysis(s"$name is expected to be a generator. 
However, " + -s"its class is ${other.getClass.getCanonicalName}, which is not a generator.") - } -} - case u @ UnresolvedFunction(funcId, arguments, isDistinct, filter) => -withPosition(u) { - v1SessionCatalog.lookupFunction(funcId, arguments) match { -// AggregateWindowFunctions are AggregateFunctions that can only be evaluated within -// the context of a Window clause. They do not need to be wrapped in an -// AggregateExpression. -case wf: AggregateWindowFunction => - if (isDistinct || filter.isDefined) { -failAnalysis("DISTINCT or FILTER specified, " + - s"but ${wf.prettyName} is not an