[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r603801288 ## File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala ## @@ -907,29 +907,74 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg } /** - * Add an [[Aggregate]] or [[GroupingSets]] to a logical plan. + * Add an [[Aggregate]] to a logical plan. */ private def withAggregationClause( ctx: AggregationClauseContext, selectExpressions: Seq[NamedExpression], query: LogicalPlan): LogicalPlan = withOrigin(ctx) { -val groupByExpressions = expressionList(ctx.groupingExpressions) - -if (ctx.GROUPING != null) { - // GROUP BY GROUPING SETS (...) - val selectedGroupByExprs = -ctx.groupingSet.asScala.map(_.expression.asScala.map(e => expression(e)).toSeq) - GroupingSets(selectedGroupByExprs.toSeq, groupByExpressions, query, selectExpressions) -} else { - // GROUP BY (WITH CUBE | WITH ROLLUP)? - val mappedGroupByExpressions = if (ctx.CUBE != null) { -Seq(Cube(groupByExpressions)) - } else if (ctx.ROLLUP != null) { -Seq(Rollup(groupByExpressions)) +if (ctx.groupingExpressionsWithGroupingAnalytics.isEmpty) { + val groupByExpressions = expressionList(ctx.groupingExpressions) + if (ctx.GROUPING != null) { +// GROUP BY GROUPING SETS (...) +// `GROUP BY warehouse, product GROUPING SETS((warehouse, producets), (warehouse))` is +// semantically equivalent to `GROUP BY GROUPING SETS((warehouse, produce), (warehouse))`. +// Under this grammar, the fields appearing in `GROUPING SETS`'s groupingSets must be a +// subset of the columns appearing in group by expression. +val groupingSets = + ctx.groupingSet.asScala.map(_.expression.asScala.map(e => expression(e)).toSeq) +Aggregate(Seq(GroupingSets(groupingSets.toSeq, groupByExpressions)), + selectExpressions, query) } else { -groupByExpressions +// GROUP BY (WITH CUBE | WITH ROLLUP)? 
+val mappedGroupByExpressions = if (ctx.CUBE != null) { + Seq(Cube(groupByExpressions.map(Seq(_ +} else if (ctx.ROLLUP != null) { + Seq(Rollup(groupByExpressions.map(Seq(_ +} else { + groupByExpressions +} +Aggregate(mappedGroupByExpressions, selectExpressions, query) + } +} else { + val groupByExpressions = +ctx.groupingExpressionsWithGroupingAnalytics.asScala + .map(groupByExpr => { +val groupingAnalytics = groupByExpr.groupingAnalytics +if (groupingAnalytics != null) { + val groupingSets = groupingAnalytics.groupingSet.asScala +.map(_.expression.asScala.map(e => expression(e)).toSeq) + if (groupingAnalytics.CUBE != null) { +// CUBE(A, B, (A, B), ()) is not supported. +if (groupingSets.exists(_.isEmpty)) { + throw new ParseException("Empty set in CUBE grouping sets is not supported.", +groupingAnalytics) +} +Cube(groupingSets.toSeq) + } else if (groupingAnalytics.ROLLUP != null) { +// ROLLUP(A, B, (A, B), ()) is not supported. +if (groupingSets.exists(_.isEmpty)) { + throw new ParseException("Empty set in ROLLUP grouping sets is not supported.", +groupingAnalytics) +} +Rollup(groupingSets.toSeq) + } else { +assert(groupingAnalytics.GROUPING != null && groupingAnalytics.SETS != null) +GroupingSets(groupingSets.toSeq, + groupingSets.flatten.distinct.toSeq) + } +} else { + expression(groupByExpr.expression) +} + }) + val (groupingSet, expressions) = groupByExpressions.partition(_.isInstanceOf[GroupingSet]) + if ((expressions.nonEmpty && groupingSet.nonEmpty) || groupingSet.size > 1) { +throw new ParseException("Partial CUBE/ROLLUP/GROUPING SETS like " + Review comment: > could you handle the two error cases separately? Done -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r603747251 ## File path: sql/core/src/test/resources/sql-tests/results/group-analytics.sql.out ## @@ -275,6 +245,24 @@ NULL 201235000 NULL 201378000 +-- !query +SELECT course, year, SUM(earnings) FROM courseSales GROUP BY CUBE(course, year), ROLLUP(course, year) ORDER BY course, year +-- !query schema +struct<> +-- !query output +java.lang.UnsupportedOperationException Review comment: > parser Done. After this is merged, I will restart https://issues.apache.org/jira/browse/SPARK-33229, since Teradata, PostgreSQL, SQL Server, and Oracle all support this syntax. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r603365492 ## File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala ## @@ -653,14 +616,9 @@ class Analyzer(override val catalogManager: CatalogManager) val aggForResolving = h.child match { // For CUBE/ROLLUP expressions, to avoid resolving repeatedly, here we delete them from // groupingExpressions for condition resolving. -case a @ Aggregate(Seq(c @ Cube(groupByExprs)), _, _) => - a.copy(groupingExpressions = groupByExprs) -case a @ Aggregate(Seq(r @ Rollup(groupByExprs)), _, _) => - a.copy(groupingExpressions = groupByExprs) -case g: GroupingSets => - Aggregate( -getFinalGroupByExpressions(g.selectedGroupByExprs, g.groupByExprs), -g.aggregations, g.child) +case a @ Aggregate(Seq(gs: GroupingSet), _, _) => Review comment: > where do we fail if there are more than one `GroupingSet`? do we have test for it? Refer to https://issues.apache.org/jira/browse/SPARK-33229 Add UT in end-to-end test ## File path: sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ResolveGroupingAnalyticsSuite.scala ## @@ -41,61 +41,64 @@ class ResolveGroupingAnalyticsSuite extends AnalysisTest { lazy val r1 = LocalRelation(a, b, c) test("rollupExprs") { -val testRollup = (exprs: Seq[Expression], rollup: Seq[Seq[Expression]]) => { - val result = SimpleAnalyzer.ResolveGroupingAnalytics.rollupExprs(exprs) +val testRollup = (exprs: Seq[Seq[Expression]], rollup: Seq[Seq[Expression]]) => { + val result = GroupingSet.rollupExprs(exprs) Review comment: > the input can still be `Seq[Expression]`, and here we do `GroupingSet.rollupExprs(exprs.map(Seq(_)))` Done ## File path: sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ResolveGroupingAnalyticsSuite.scala ## @@ -41,61 +41,64 @@ class ResolveGroupingAnalyticsSuite extends AnalysisTest { lazy val r1 = LocalRelation(a, b, c) test("rollupExprs") { -val testRollup = 
(exprs: Seq[Expression], rollup: Seq[Seq[Expression]]) => { - val result = SimpleAnalyzer.ResolveGroupingAnalytics.rollupExprs(exprs) +val testRollup = (exprs: Seq[Seq[Expression]], rollup: Seq[Seq[Expression]]) => { + val result = GroupingSet.rollupExprs(exprs) assert(result.sortBy(_.hashCode) == rollup.sortBy(_.hashCode)) } -testRollup(Seq(a, b, c), Seq(Seq(), Seq(a), Seq(a, b), Seq(a, b, c))) -testRollup(Seq(c, b, a), Seq(Seq(), Seq(c), Seq(c, b), Seq(c, b, a))) -testRollup(Seq(a), Seq(Seq(), Seq(a))) +testRollup(Seq(a, b, c).map(Seq(_)), Seq(Seq(), Seq(a), Seq(a, b), Seq(a, b, c))) +testRollup(Seq(c, b, a).map(Seq(_)), Seq(Seq(), Seq(c), Seq(c, b), Seq(c, b, a))) +testRollup(Seq(a).map(Seq(_)), Seq(Seq(), Seq(a))) testRollup(Seq(), Seq(Seq())) } test("cubeExprs") { -val testCube = (exprs: Seq[Expression], cube: Seq[Seq[Expression]]) => { - val result = SimpleAnalyzer.ResolveGroupingAnalytics.cubeExprs(exprs) +val testCube = (exprs: Seq[Seq[Expression]], cube: Seq[Seq[Expression]]) => { + val result = GroupingSet.cubeExprs(exprs) Review comment: > ditto Done ## File path: sql/core/src/test/resources/sql-tests/inputs/group-analytics.sql ## @@ -18,12 +18,19 @@ AS courseSales(course, year, earnings); -- ROLLUP SELECT course, year, SUM(earnings) FROM courseSales GROUP BY ROLLUP(course, year) ORDER BY course, year; +SELECT course, year, SUM(earnings) FROM courseSales GROUP BY ROLLUP (course, year) ORDER BY course, year; +SELECT course, year, SUM(earnings) FROM courseSales GROUP BY ROLLUP(course, year, (course, year)) ORDER BY course, year; +SELECT course, year, SUM(earnings) FROM courseSales GROUP BY ROLLUP(course, year, (course, year), ()) ORDER BY course, year; -- CUBE SELECT course, year, SUM(earnings) FROM courseSales GROUP BY CUBE(course, year) ORDER BY course, year; +SELECT course, year, SUM(earnings) FROM courseSales GROUP BY CUBE (course, year) ORDER BY course, year; Review comment: > ditto Done -- This is an automated message from the Apache Git Service. 
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r603366298 ## File path: sql/core/src/test/resources/sql-tests/inputs/group-analytics.sql ## @@ -18,9 +18,11 @@ AS courseSales(course, year, earnings); -- ROLLUP SELECT course, year, SUM(earnings) FROM courseSales GROUP BY ROLLUP(course, year) ORDER BY course, year; +SELECT course, year, SUM(earnings) FROM courseSales GROUP BY ROLLUP (course, year) ORDER BY course, year; Review comment: > +1, I don't think we should test extra space in the end-to-end test. Done -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r603313867 ## File path: docs/sql-ref-syntax-qry-select-groupby.md ## @@ -42,40 +42,53 @@ aggregate_name ( [ DISTINCT ] expression [ , ... ] ) [ FILTER ( WHERE boolean_ex ### Parameters -* **GROUPING SETS** +* **grouping_expression** -Groups the rows for each subset of the expressions specified in the grouping sets. For example, -`GROUP BY GROUPING SETS (warehouse, product)` is semantically equivalent -to union of results of `GROUP BY warehouse` and `GROUP BY product`. This clause -is a shorthand for a `UNION ALL` where each leg of the `UNION ALL` -operator performs aggregation of subset of the columns specified in the `GROUPING SETS` clause. +Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on +result values of the grouping expressions. A grouping expression may be a column name like `GROUP BY a`, a column position like +`GROUP BY 0`, or an expression like `GROUP BY a + b`. * **grouping_set** -A grouping set is specified by zero or more comma-separated expressions in parentheses. +A grouping set is specified by zero or more comma-separated expressions. Review comment: > ``` > A grouping set is specified by zero or more comma-separated expressions in parentheses. When the > grouping set has only one element, parentheses can be omitted. For example, `GROUPING SETS ((a), (b))` > is the same as `GROUPING SETS (a, b)`. > ``` Done -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r603311932 ## File path: docs/sql-ref-syntax-qry-select-groupby.md ## @@ -42,40 +42,53 @@ aggregate_name ( [ DISTINCT ] expression [ , ... ] ) [ FILTER ( WHERE boolean_ex ### Parameters -* **GROUPING SETS** +* **grouping_expression** -Groups the rows for each subset of the expressions specified in the grouping sets. For example, -`GROUP BY GROUPING SETS (warehouse, product)` is semantically equivalent -to union of results of `GROUP BY warehouse` and `GROUP BY product`. This clause -is a shorthand for a `UNION ALL` where each leg of the `UNION ALL` -operator performs aggregation of subset of the columns specified in the `GROUPING SETS` clause. +Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on +result values of the grouping expressions. A grouping expression may be a column name like `GROUP BY a`, a column position like +`GROUP BY 0`, or an expression like `GROUP BY a + b`. * **grouping_set** -A grouping set is specified by zero or more comma-separated expressions in parentheses. +A grouping set is specified by zero or more comma-separated expressions. -**Syntax:** `( [ expression [ , ... ] ] )` +**Syntax:** `{ ( [ expression [ , ... ] ] ) | expression }` -* **grouping_expression** +* **GROUPING SETS** -Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on -result values of the grouping expressions. A grouping expression may be a column alias, a column position -or an expression. +Groups the rows for each grouping set specified after GROUPING SETS. For example, +`GROUP BY GROUPING SETS ((warehouse), (product))` is semantically equivalent +to union of results of `GROUP BY warehouse` and `GROUP BY product`. 
This clause +is a shorthand for a `UNION ALL` where each leg of the `UNION ALL` +operator performs aggregation of each grouping set specified in the `GROUPING SETS` clause. +Similarly, `GROUP BY GROUPING SETS ((warehouse, product), (product), ())` is semantically +equivalent to the union of results of `GROUP BY warehouse, product`, `GROUP BY product` +and global aggregate. + +`GROUP BY warehouse, product GROUPING SETS((warehouse, producets), (warehouse))` is semantically equivalent to Review comment: > Are there any other databases supporting this syntax? It seems useless grammar to me. > > If we only keep it for some historical reasons, let's not document it and only explain it in the internal code comments. Yeah, it seems only Spark supports it. I am confused about why we support such a grammar. Removed it and moved this explanation to AstBuilder. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r603182762 ## File path: docs/sql-ref-syntax-qry-select-groupby.md ## @@ -42,42 +42,60 @@ aggregate_name ( [ DISTINCT ] expression [ , ... ] ) [ FILTER ( WHERE boolean_ex ### Parameters +* **grouping_expression** + +Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on +result values of the grouping expressions. A grouping expression may be a column name like `GROUP BY a`, a column position like +`GROUP BY 0`, or an expression like `GROUP BY a + b`. + * **GROUPING SETS** -Groups the rows for each subset of the expressions specified in the grouping sets. For example, -`GROUP BY GROUPING SETS (warehouse, product)` is semantically equivalent +Groups the rows for each grouping set specified after GROUPING SETS. For example, +`GROUP BY GROUPING SETS ((warehouse), (product))` is semantically equivalent to union of results of `GROUP BY warehouse` and `GROUP BY product`. This clause is a shorthand for a `UNION ALL` where each leg of the `UNION ALL` -operator performs aggregation of subset of the columns specified in the `GROUPING SETS` clause. - -* **grouping_set** - -A grouping set is specified by zero or more comma-separated expressions in parentheses. - -**Syntax:** `( [ expression [ , ... ] ] )` - -* **grouping_expression** - -Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on -result values of the grouping expressions. A grouping expression may be a column alias, a column position -or an expression. +operator performs aggregation of each grouping set specified in the `GROUPING SETS` clause. +Similarly, `GROUP BY GROUPING SETS ((warehouse, product), (product), ())` is semantically +equivalent to the union of results of `GROUP BY warehouse, product`, `GROUP BY product` +and global aggregate. 
+ +GROUPING SETS can be followed with normal expressions as well, such as +`GROUP BY GROUPING SETS (warehouse, product)`. It generates one grouping set +per non-empty subset of the given expressions, so `GROUP BY GROUPING SETS (warehouse, product)` Review comment: > I think we don't need this paragraph anymore. We should put `grouping_set` before this one so that people know `a` is equal to `(a)`. Done -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r603182409 ## File path: docs/sql-ref-syntax-qry-select-groupby.md ## @@ -42,42 +42,60 @@ aggregate_name ( [ DISTINCT ] expression [ , ... ] ) [ FILTER ( WHERE boolean_ex ### Parameters +* **grouping_expression** + +Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on +result values of the grouping expressions. A grouping expression may be a column name like `GROUP BY a`, a column position like +`GROUP BY 0`, or an expression like `GROUP BY a + b`. + * **GROUPING SETS** -Groups the rows for each subset of the expressions specified in the grouping sets. For example, -`GROUP BY GROUPING SETS (warehouse, product)` is semantically equivalent +Groups the rows for each grouping set specified after GROUPING SETS. For example, +`GROUP BY GROUPING SETS ((warehouse), (product))` is semantically equivalent to union of results of `GROUP BY warehouse` and `GROUP BY product`. This clause is a shorthand for a `UNION ALL` where each leg of the `UNION ALL` -operator performs aggregation of subset of the columns specified in the `GROUPING SETS` clause. - -* **grouping_set** - -A grouping set is specified by zero or more comma-separated expressions in parentheses. - -**Syntax:** `( [ expression [ , ... ] ] )` - -* **grouping_expression** - -Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on -result values of the grouping expressions. A grouping expression may be a column alias, a column position -or an expression. +operator performs aggregation of each grouping set specified in the `GROUPING SETS` clause. +Similarly, `GROUP BY GROUPING SETS ((warehouse, product), (product), ())` is semantically +equivalent to the union of results of `GROUP BY warehouse, product`, `GROUP BY product` +and global aggregate. 
+ +GROUPING SETS can be followed with normal expressions as well, such as +`GROUP BY GROUPING SETS (warehouse, product)`. It generates one grouping set +per non-empty subset of the given expressions, so `GROUP BY GROUPING SETS (warehouse, product)` +is equivalent to `GROUP BY GROUPING SETS ((warehouse), (product))`. + +`GROUP BY warehouse, product GROUPING SETS((warehouse, producets), (warehouse))` is semantically equivalent to +`GROUP BY GROUPING SETS((warehouse, produce), (warehouse))`. Under this grammar, +the fields appearing in `GROUPING SETS`'s grouping_set must be a subset of the columns +appearing in group by expression. * **ROLLUP** Specifies multiple levels of aggregations in a single statement. This clause is used to compute aggregations based on multiple grouping sets. `ROLLUP` is a shorthand for `GROUPING SETS`. For example, -`GROUP BY warehouse, product WITH ROLLUP` is equivalent to `GROUP BY GROUPING SETS -((warehouse, product), (warehouse), ())`. +`GROUP BY warehouse, product WITH ROLLUP` or `GROUP BY ROLLUP(warehouse, product)` is equivalent to +`GROUP BY GROUPING SETS((warehouse, product), (warehouse), ())`. +`GROUP BY ROLLUP(warehouse, product, (warehouse, location))` is equivalent to +`GROUP BY GROUPING SETS((warehouse, product, location), (warehouse, product), (warehouse), ())`. The N elements of a `ROLLUP` specification results in N+1 `GROUPING SETS`. * **CUBE** `CUBE` clause is used to perform aggregations based on combination of grouping columns specified in the `GROUP BY` clause. `CUBE` is a shorthand for `GROUPING SETS`. For example, -`GROUP BY warehouse, product WITH CUBE` is equivalent to `GROUP BY GROUPING SETS -((warehouse, product), (warehouse), (product), ())`. +`GROUP BY warehouse, product WITH CUBE` or `GROUP BY CUBE(warehouse, product)` is equivalent to +`GROUP BY GROUPING SETS((warehouse, product), (warehouse), (product), ())`. 
+`GROUP BY CUBE(warehouse, product, (warehouse, location))` is equivalent to +`GROUP BY GROUPING SETS((warehouse, product, location), (warehouse, product), (warehouse, location), + (product, warehouse, location), (warehouse), (product), (warehouse, product), ())`. The N elements of a `CUBE` specification results in 2^N `GROUPING SETS`. +* **grouping_set** + +A grouping set is specified by zero or more comma-separated expressions in parentheses. Review comment: > let's update this. `parentheses` is not a must-have. `a` is equal to `(a)`. Done -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r602159045 ## File path: docs/sql-ref-syntax-qry-select-groupby.md ## @@ -42,42 +42,55 @@ aggregate_name ( [ DISTINCT ] expression [ , ... ] ) [ FILTER ( WHERE boolean_ex ### Parameters +* **grouping_expression** + +Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on +result values of the grouping expressions. A grouping expression may be a column name like `GROUP BY a`, a column position like +`GROUP BY 0`, or an expression like `GROUP BY a + b`. + * **GROUPING SETS** -Groups the rows for each subset of the expressions specified in the grouping sets. For example, -`GROUP BY GROUPING SETS (warehouse, product)` is semantically equivalent +Groups the rows for each grouping set specified after GROUPING SETS. For example, +`GROUP BY GROUPING SETS ((warehouse), (product))` is semantically equivalent to union of results of `GROUP BY warehouse` and `GROUP BY product`. This clause is a shorthand for a `UNION ALL` where each leg of the `UNION ALL` operator performs aggregation of subset of the columns specified in the `GROUPING SETS` clause. - -* **grouping_set** - -A grouping set is specified by zero or more comma-separated expressions in parentheses. - -**Syntax:** `( [ expression [ , ... ] ] )` - -* **grouping_expression** - -Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on -result values of the grouping expressions. A grouping expression may be a column alias, a column position -or an expression. +Similarly, `GROUP BY GROUPING SETS ((warehouse, product), (product), ())` is semantically +equivalent to the union of results of `GROUP BY warehouse, product`, `GROUP BY warehouse` +and global aggregate. + +GROUPING SETS can be followed with normal expressions as well, such as +`GROUP BY GROUPING SETS (warehouse, product)`. 
It generates one grouping set +per non-empty subset of the given expressions, so `GROUP BY GROUPING SETS (warehouse, product)` +is equivalent to `GROUP BY GROUPING SETS ((warehouse), (product))`. * **ROLLUP** Specifies multiple levels of aggregations in a single statement. This clause is used to compute aggregations based on multiple grouping sets. `ROLLUP` is a shorthand for `GROUPING SETS`. For example, -`GROUP BY warehouse, product WITH ROLLUP` is equivalent to `GROUP BY GROUPING SETS -((warehouse, product), (warehouse), ())`. +`GROUP BY warehouse, product WITH ROLLUP` or `GROUP BY ROLLUP(warehouse, product)` is equivalent to +`GROUP BY GROUPING SETS((warehouse, product), (warehouse), ())`. +`GROUP BY ROLLUP(warehouse, product, (warehouse, location))` is equivalent to +`GROUP BY GROUPING SETS((warehouse, product, location), (warehouse, product), (warehouse), ())`. The N elements of a `ROLLUP` specification results in N+1 `GROUPING SETS`. * **CUBE** `CUBE` clause is used to perform aggregations based on combination of grouping columns specified in the `GROUP BY` clause. `CUBE` is a shorthand for `GROUPING SETS`. For example, -`GROUP BY warehouse, product WITH CUBE` is equivalent to `GROUP BY GROUPING SETS -((warehouse, product), (warehouse), (product), ())`. +`GROUP BY warehouse, product WITH CUBE` or `GROUP BY CUBE(warehouse, product)` is equivalent to +`GROUP BY GROUPING SETS((warehouse, product), (warehouse), (product), ())`. +`GROUP BY CUBE(warehouse, product, (warehouse, location))` is equivalent to +`GROUP BY GROUPING SETS((warehouse, product, location), (warehouse, product), (warehouse, location), + (product, warehouse, location), (warehouse), (product), (warehouse, product), ())`. Review comment: > `(warehouse, product)` is duplicated, is it OK? 
Each grouping_set will be a subset of the result, and PostgreSQL produces the same result ![image](https://user-images.githubusercontent.com/46485123/112616090-4758bd00-8e5e-11eb-933f-cab866d7bdcc.png) -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r602159045 ## File path: docs/sql-ref-syntax-qry-select-groupby.md ## @@ -42,42 +42,55 @@ aggregate_name ( [ DISTINCT ] expression [ , ... ] ) [ FILTER ( WHERE boolean_ex ### Parameters +* **grouping_expression** + +Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on +result values of the grouping expressions. A grouping expression may be a column name like `GROUP BY a`, a column position like +`GROUP BY 0`, or an expression like `GROUP BY a + b`. + * **GROUPING SETS** -Groups the rows for each subset of the expressions specified in the grouping sets. For example, -`GROUP BY GROUPING SETS (warehouse, product)` is semantically equivalent +Groups the rows for each grouping set specified after GROUPING SETS. For example, +`GROUP BY GROUPING SETS ((warehouse), (product))` is semantically equivalent to union of results of `GROUP BY warehouse` and `GROUP BY product`. This clause is a shorthand for a `UNION ALL` where each leg of the `UNION ALL` operator performs aggregation of subset of the columns specified in the `GROUPING SETS` clause. - -* **grouping_set** - -A grouping set is specified by zero or more comma-separated expressions in parentheses. - -**Syntax:** `( [ expression [ , ... ] ] )` - -* **grouping_expression** - -Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on -result values of the grouping expressions. A grouping expression may be a column alias, a column position -or an expression. +Similarly, `GROUP BY GROUPING SETS ((warehouse, product), (product), ())` is semantically +equivalent to the union of results of `GROUP BY warehouse, product`, `GROUP BY warehouse` +and global aggregate. + +GROUPING SETS can be followed with normal expressions as well, such as +`GROUP BY GROUPING SETS (warehouse, product)`. 
It generates one grouping set +per non-empty subset of the given expressions, so `GROUP BY GROUPING SETS (warehouse, product)` +is equivalent to `GROUP BY GROUPING SETS ((warehouse), (product))`. * **ROLLUP** Specifies multiple levels of aggregations in a single statement. This clause is used to compute aggregations based on multiple grouping sets. `ROLLUP` is a shorthand for `GROUPING SETS`. For example, -`GROUP BY warehouse, product WITH ROLLUP` is equivalent to `GROUP BY GROUPING SETS -((warehouse, product), (warehouse), ())`. +`GROUP BY warehouse, product WITH ROLLUP` or `GROUP BY ROLLUP(warehouse, product)` is equivalent to +`GROUP BY GROUPING SETS((warehouse, product), (warehouse), ())`. +`GROUP BY ROLLUP(warehouse, product, (warehouse, location))` is equivalent to +`GROUP BY GROUPING SETS((warehouse, product, location), (warehouse, product), (warehouse), ())`. The N elements of a `ROLLUP` specification results in N+1 `GROUPING SETS`. * **CUBE** `CUBE` clause is used to perform aggregations based on combination of grouping columns specified in the `GROUP BY` clause. `CUBE` is a shorthand for `GROUPING SETS`. For example, -`GROUP BY warehouse, product WITH CUBE` is equivalent to `GROUP BY GROUPING SETS -((warehouse, product), (warehouse), (product), ())`. +`GROUP BY warehouse, product WITH CUBE` or `GROUP BY CUBE(warehouse, product)` is equivalent to +`GROUP BY GROUPING SETS((warehouse, product), (warehouse), (product), ())`. +`GROUP BY CUBE(warehouse, product, (warehouse, location))` is equivalent to +`GROUP BY GROUPING SETS((warehouse, product, location), (warehouse, product), (warehouse, location), + (product, warehouse, location), (warehouse), (product), (warehouse, product), ())`. Review comment: > `(warehouse, product)` is duplicated, is it OK? 
Each grouping_set will be a subset of the result ![image](https://user-images.githubusercontent.com/46485123/112616090-4758bd00-8e5e-11eb-933f-cab866d7bdcc.png) -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r602155530 ## File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala ## @@ -907,29 +907,64 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg } /** - * Add an [[Aggregate]] or [[GroupingSets]] to a logical plan. + * Add an [[Aggregate]] to a logical plan. */ private def withAggregationClause( ctx: AggregationClauseContext, selectExpressions: Seq[NamedExpression], query: LogicalPlan): LogicalPlan = withOrigin(ctx) { -val groupByExpressions = expressionList(ctx.groupingExpressions) - -if (ctx.GROUPING != null) { - // GROUP BY GROUPING SETS (...) - val selectedGroupByExprs = -ctx.groupingSet.asScala.map(_.expression.asScala.map(e => expression(e)).toSeq) - GroupingSets(selectedGroupByExprs.toSeq, groupByExpressions, query, selectExpressions) -} else { - // GROUP BY (WITH CUBE | WITH ROLLUP)? - val mappedGroupByExpressions = if (ctx.CUBE != null) { -Seq(Cube(groupByExpressions)) - } else if (ctx.ROLLUP != null) { -Seq(Rollup(groupByExpressions)) +if (ctx.groupingExpressionsWithGroupingAnalytics.isEmpty) { + val groupByExpressions = expressionList(ctx.groupingExpressions) + if (ctx.GROUPING != null) { +// GROUP BY GROUPING SETS (...) +val selectedGroupByExprs = + ctx.groupingSet.asScala.map(_.expression.asScala.map(e => expression(e)).toSeq) +Aggregate(Seq(GroupingSets(selectedGroupByExprs.toSeq, groupByExpressions)), + selectExpressions, query) } else { -groupByExpressions +// GROUP BY (WITH CUBE | WITH ROLLUP)? 
+val mappedGroupByExpressions = if (ctx.CUBE != null) { + Seq(Cube(groupByExpressions.map(Seq(_)))) +} else if (ctx.ROLLUP != null) { + Seq(Rollup(groupByExpressions.map(Seq(_)))) +} else { + groupByExpressions +} +Aggregate(mappedGroupByExpressions, selectExpressions, query) } - Aggregate(mappedGroupByExpressions, selectExpressions, query) +} else { + val groupByExpressions = +ctx.groupingExpressionsWithGroupingAnalytics.asScala + .map(groupByExpr => { +val groupingAnalytics = groupByExpr.groupingAnalytics +if (groupingAnalytics != null) { + val selectedGroupByExprs = groupingAnalytics.groupingSet.asScala Review comment: > ditto, `groupingSets` is a better name. Done -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r602155694 ## File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/grouping.scala ## @@ -39,47 +40,98 @@ trait GroupingSet extends Expression with CodegenFallback { override def eval(input: InternalRow): Any = throw new UnsupportedOperationException } -// scalastyle:off line.size.limit line.contains.tab -@ExpressionDescription( - usage = """ -_FUNC_([col1[, col2 ..]]) - create a multi-dimensional cube using the specified columns - so that we can run aggregation on them. - """, - examples = """ -Examples: - > SELECT name, age, count(*) FROM VALUES (2, 'Alice'), (5, 'Bob') people(age, name) GROUP BY _FUNC_(name, age); -Bob5 1 -Alice 2 1 -Alice NULL1 -NULL 2 1 -NULL NULL2 -BobNULL1 -NULL 5 1 - """, - since = "2.0.0", - group = "agg_funcs") -// scalastyle:on line.size.limit line.contains.tab -case class Cube(groupByExprs: Seq[Expression]) extends GroupingSet {} +object GroupingSet { + /** + * GROUP BY a, b, c WITH ROLLUP + * is equivalent to + * GROUP BY a, b, c GROUPING SETS ( (a, b, c), (a, b), (a), ( ) ). Review comment: > This is not documented. What does `GROUP BY a, b, c` mean if there is a `GROUPING SETS` after it? Updated, also with cube's. How about current? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r602155326 ## File path: docs/sql-ref-syntax-qry-select-groupby.md ## @@ -42,42 +42,55 @@ aggregate_name ( [ DISTINCT ] expression [ , ... ] ) [ FILTER ( WHERE boolean_ex ### Parameters +* **grouping_expression** + +Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on +result values of the grouping expressions. A grouping expression may be a column name like `GROUP BY a`, a column position like +`GROUP BY 0`, or an expression like `GROUP BY a + b`. + * **GROUPING SETS** -Groups the rows for each subset of the expressions specified in the grouping sets. For example, -`GROUP BY GROUPING SETS (warehouse, product)` is semantically equivalent +Groups the rows for each grouping set specified after GROUPING SETS. For example, +`GROUP BY GROUPING SETS ((warehouse), (product))` is semantically equivalent to union of results of `GROUP BY warehouse` and `GROUP BY product`. This clause is a shorthand for a `UNION ALL` where each leg of the `UNION ALL` operator performs aggregation of subset of the columns specified in the `GROUPING SETS` clause. Review comment: > ``` > performs aggregation of each grouping set specified in the `GROUPING SETS` clause > ``` Done ## File path: docs/sql-ref-syntax-qry-select-groupby.md ## @@ -42,42 +42,55 @@ aggregate_name ( [ DISTINCT ] expression [ , ... ] ) [ FILTER ( WHERE boolean_ex ### Parameters +* **grouping_expression** + +Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on +result values of the grouping expressions. A grouping expression may be a column name like `GROUP BY a`, a column position like +`GROUP BY 0`, or an expression like `GROUP BY a + b`. + * **GROUPING SETS** -Groups the rows for each subset of the expressions specified in the grouping sets. 
For example, -`GROUP BY GROUPING SETS (warehouse, product)` is semantically equivalent +Groups the rows for each grouping set specified after GROUPING SETS. For example, +`GROUP BY GROUPING SETS ((warehouse), (product))` is semantically equivalent to union of results of `GROUP BY warehouse` and `GROUP BY product`. This clause is a shorthand for a `UNION ALL` where each leg of the `UNION ALL` operator performs aggregation of subset of the columns specified in the `GROUPING SETS` clause. - -* **grouping_set** - -A grouping set is specified by zero or more comma-separated expressions in parentheses. - -**Syntax:** `( [ expression [ , ... ] ] )` - -* **grouping_expression** - -Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on -result values of the grouping expressions. A grouping expression may be a column alias, a column position -or an expression. +Similarly, `GROUP BY GROUPING SETS ((warehouse, product), (product), ())` is semantically +equivalent to the union of results of `GROUP BY warehouse, product`, `GROUP BY warehouse` Review comment: > `GROUP BY warehouse` -> `GROUP BY product` Done ## File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala ## @@ -907,29 +907,64 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg } /** - * Add an [[Aggregate]] or [[GroupingSets]] to a logical plan. + * Add an [[Aggregate]] to a logical plan. */ private def withAggregationClause( ctx: AggregationClauseContext, selectExpressions: Seq[NamedExpression], query: LogicalPlan): LogicalPlan = withOrigin(ctx) { -val groupByExpressions = expressionList(ctx.groupingExpressions) - -if (ctx.GROUPING != null) { - // GROUP BY GROUPING SETS (...) 
- val selectedGroupByExprs = -ctx.groupingSet.asScala.map(_.expression.asScala.map(e => expression(e)).toSeq) - GroupingSets(selectedGroupByExprs.toSeq, groupByExpressions, query, selectExpressions) -} else { - // GROUP BY (WITH CUBE | WITH ROLLUP)? - val mappedGroupByExpressions = if (ctx.CUBE != null) { -Seq(Cube(groupByExpressions)) - } else if (ctx.ROLLUP != null) { -Seq(Rollup(groupByExpressions)) +if (ctx.groupingExpressionsWithGroupingAnalytics.isEmpty) { + val groupByExpressions = expressionList(ctx.groupingExpressions) + if (ctx.GROUPING != null) { +// GROUP BY GROUPING SETS (...) +val selectedGroupByExprs = Review comment: > The name `selectedGroupByExprs` is a bit weird, should it be `groupingSets`? Done -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r602154380 ## File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala ## @@ -907,29 +907,64 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg } /** - * Add an [[Aggregate]] or [[GroupingSets]] to a logical plan. + * Add an [[Aggregate]] to a logical plan. */ private def withAggregationClause( ctx: AggregationClauseContext, selectExpressions: Seq[NamedExpression], query: LogicalPlan): LogicalPlan = withOrigin(ctx) { -val groupByExpressions = expressionList(ctx.groupingExpressions) - -if (ctx.GROUPING != null) { - // GROUP BY GROUPING SETS (...) - val selectedGroupByExprs = -ctx.groupingSet.asScala.map(_.expression.asScala.map(e => expression(e)).toSeq) - GroupingSets(selectedGroupByExprs.toSeq, groupByExpressions, query, selectExpressions) -} else { - // GROUP BY (WITH CUBE | WITH ROLLUP)? - val mappedGroupByExpressions = if (ctx.CUBE != null) { -Seq(Cube(groupByExpressions)) - } else if (ctx.ROLLUP != null) { -Seq(Rollup(groupByExpressions)) +if (ctx.groupingExpressionsWithGroupingAnalytics.isEmpty) { + val groupByExpressions = expressionList(ctx.groupingExpressions) + if (ctx.GROUPING != null) { +// GROUP BY GROUPING SETS (...) +val selectedGroupByExprs = + ctx.groupingSet.asScala.map(_.expression.asScala.map(e => expression(e)).toSeq) Review comment: For SQL ``` select id, city, car_model, sum(quantity) from dealer group by grouping sets((id), (city), (car_model)) UNION all select id, city, car_model, sum(quantity) from dealer group by grouping sets(id, city, car_model) order by id, city, car_model ``` Result in PGSQL ![image](https://user-images.githubusercontent.com/46485123/112615257-3b203000-8e5d-11eb-958e-fb5196c9592c.png) -- This is an automated message from the Apache Git Service. 
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r602151807 ## File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala ## @@ -907,29 +907,64 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg } /** - * Add an [[Aggregate]] or [[GroupingSets]] to a logical plan. + * Add an [[Aggregate]] to a logical plan. */ private def withAggregationClause( ctx: AggregationClauseContext, selectExpressions: Seq[NamedExpression], query: LogicalPlan): LogicalPlan = withOrigin(ctx) { -val groupByExpressions = expressionList(ctx.groupingExpressions) - -if (ctx.GROUPING != null) { - // GROUP BY GROUPING SETS (...) - val selectedGroupByExprs = -ctx.groupingSet.asScala.map(_.expression.asScala.map(e => expression(e)).toSeq) - GroupingSets(selectedGroupByExprs.toSeq, groupByExpressions, query, selectExpressions) -} else { - // GROUP BY (WITH CUBE | WITH ROLLUP)? - val mappedGroupByExpressions = if (ctx.CUBE != null) { -Seq(Cube(groupByExpressions)) - } else if (ctx.ROLLUP != null) { -Seq(Rollup(groupByExpressions)) +if (ctx.groupingExpressionsWithGroupingAnalytics.isEmpty) { + val groupByExpressions = expressionList(ctx.groupingExpressions) + if (ctx.GROUPING != null) { +// GROUP BY GROUPING SETS (...) +val selectedGroupByExprs = + ctx.groupingSet.asScala.map(_.expression.asScala.map(e => expression(e)).toSeq) Review comment: here the grouping set depends on the parser logic, right? https://github.com/apache/spark/blob/8945e7c52f8d09282f6847c19bd926ac4c20e11a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4#L604-L611 According to the definition, `a` can be a grouping_set, `(a)` can be a grouping_set, `(a, b)` can be a grouping_set. You may be mistaken about how `cube/rollup` is expanded compared with the original `grouping set` -- This is an automated message from the Apache Git Service. 
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r602151807 ## File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala ## @@ -907,29 +907,64 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg } /** - * Add an [[Aggregate]] or [[GroupingSets]] to a logical plan. + * Add an [[Aggregate]] to a logical plan. */ private def withAggregationClause( ctx: AggregationClauseContext, selectExpressions: Seq[NamedExpression], query: LogicalPlan): LogicalPlan = withOrigin(ctx) { -val groupByExpressions = expressionList(ctx.groupingExpressions) - -if (ctx.GROUPING != null) { - // GROUP BY GROUPING SETS (...) - val selectedGroupByExprs = -ctx.groupingSet.asScala.map(_.expression.asScala.map(e => expression(e)).toSeq) - GroupingSets(selectedGroupByExprs.toSeq, groupByExpressions, query, selectExpressions) -} else { - // GROUP BY (WITH CUBE | WITH ROLLUP)? - val mappedGroupByExpressions = if (ctx.CUBE != null) { -Seq(Cube(groupByExpressions)) - } else if (ctx.ROLLUP != null) { -Seq(Rollup(groupByExpressions)) +if (ctx.groupingExpressionsWithGroupingAnalytics.isEmpty) { + val groupByExpressions = expressionList(ctx.groupingExpressions) + if (ctx.GROUPING != null) { +// GROUP BY GROUPING SETS (...) +val selectedGroupByExprs = + ctx.groupingSet.asScala.map(_.expression.asScala.map(e => expression(e)).toSeq) Review comment: here the group set depending on the parser logic right? https://github.com/apache/spark/blob/8945e7c52f8d09282f6847c19bd926ac4c20e11a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4#L604-L611 According to the definition, `a` can be a grouping_set, `(a)` can be a grouping_set, `(a, b)` can be a grouping_set. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r602141666 ## File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala ## @@ -907,29 +907,64 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg } /** - * Add an [[Aggregate]] or [[GroupingSets]] to a logical plan. + * Add an [[Aggregate]] to a logical plan. */ private def withAggregationClause( ctx: AggregationClauseContext, selectExpressions: Seq[NamedExpression], query: LogicalPlan): LogicalPlan = withOrigin(ctx) { -val groupByExpressions = expressionList(ctx.groupingExpressions) - -if (ctx.GROUPING != null) { - // GROUP BY GROUPING SETS (...) - val selectedGroupByExprs = -ctx.groupingSet.asScala.map(_.expression.asScala.map(e => expression(e)).toSeq) - GroupingSets(selectedGroupByExprs.toSeq, groupByExpressions, query, selectExpressions) -} else { - // GROUP BY (WITH CUBE | WITH ROLLUP)? - val mappedGroupByExpressions = if (ctx.CUBE != null) { -Seq(Cube(groupByExpressions)) - } else if (ctx.ROLLUP != null) { -Seq(Rollup(groupByExpressions)) +if (ctx.groupingExpressionsWithGroupingAnalytics.isEmpty) { + val groupByExpressions = expressionList(ctx.groupingExpressions) + if (ctx.GROUPING != null) { +// GROUP BY GROUPING SETS (...) +val selectedGroupByExprs = + ctx.groupingSet.asScala.map(_.expression.asScala.map(e => expression(e)).toSeq) Review comment: > Seems we don't distinguish `(a)` and `a`, then how do we handle `GROUPING SETS(a, b, c)` and `GROUPING SETS((a), (b), (c))`? > > According to the document, `GROUPING SETS(a, b, c)` is `GROUPING SETS((a, b), (a, c), (b, c), (a), (b), (c))`. `GROUPING SETS(a, b, c)` is same as `GROUPING SETS((a), (b), (c))` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r602141666 ## File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala ## @@ -907,29 +907,64 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg } /** - * Add an [[Aggregate]] or [[GroupingSets]] to a logical plan. + * Add an [[Aggregate]] to a logical plan. */ private def withAggregationClause( ctx: AggregationClauseContext, selectExpressions: Seq[NamedExpression], query: LogicalPlan): LogicalPlan = withOrigin(ctx) { -val groupByExpressions = expressionList(ctx.groupingExpressions) - -if (ctx.GROUPING != null) { - // GROUP BY GROUPING SETS (...) - val selectedGroupByExprs = -ctx.groupingSet.asScala.map(_.expression.asScala.map(e => expression(e)).toSeq) - GroupingSets(selectedGroupByExprs.toSeq, groupByExpressions, query, selectExpressions) -} else { - // GROUP BY (WITH CUBE | WITH ROLLUP)? - val mappedGroupByExpressions = if (ctx.CUBE != null) { -Seq(Cube(groupByExpressions)) - } else if (ctx.ROLLUP != null) { -Seq(Rollup(groupByExpressions)) +if (ctx.groupingExpressionsWithGroupingAnalytics.isEmpty) { + val groupByExpressions = expressionList(ctx.groupingExpressions) + if (ctx.GROUPING != null) { +// GROUP BY GROUPING SETS (...) +val selectedGroupByExprs = + ctx.groupingSet.asScala.map(_.expression.asScala.map(e => expression(e)).toSeq) Review comment: > Seems we don't distinguish `(a)` and `a`, then how do we handle `GROUPING SETS(a, b, c)` and `GROUPING SETS((a), (b), (c))`? > > According to the document, `GROUPING SETS(a, b, c)` is `GROUPING SETS((a, b), (a, c), (b, c), (a), (b), (c))`. `GROUPING SETS(a, b, c)` is same as `GROUPING SETS((a), (b), (c))`? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r602139085 ## File path: docs/sql-ref-syntax-qry-select-groupby.md ## @@ -42,42 +42,55 @@ aggregate_name ( [ DISTINCT ] expression [ , ... ] ) [ FILTER ( WHERE boolean_ex ### Parameters +* **grouping_expression** + +Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on +result values of the grouping expressions. A grouping expression may be a column name like `GROUP BY a`, a column position like +`GROUP BY 0`, or an expression like `GROUP BY a + b`. + * **GROUPING SETS** -Groups the rows for each subset of the expressions specified in the grouping sets. For example, -`GROUP BY GROUPING SETS (warehouse, product)` is semantically equivalent +Groups the rows for each grouping set specified after GROUPING SETS. For example, +`GROUP BY GROUPING SETS ((warehouse), (product))` is semantically equivalent to union of results of `GROUP BY warehouse` and `GROUP BY product`. This clause is a shorthand for a `UNION ALL` where each leg of the `UNION ALL` operator performs aggregation of subset of the columns specified in the `GROUPING SETS` clause. - -* **grouping_set** - -A grouping set is specified by zero or more comma-separated expressions in parentheses. - -**Syntax:** `( [ expression [ , ... ] ] )` - -* **grouping_expression** - -Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on -result values of the grouping expressions. A grouping expression may be a column alias, a column position -or an expression. +Similarly, `GROUP BY GROUPING SETS ((warehouse, product), (product), ())` is semantically +equivalent to the union of results of `GROUP BY warehouse, product`, `GROUP BY warehouse` +and global aggregate. + +GROUPING SETS can be followed with normal expressions as well, such as +`GROUP BY GROUPING SETS (warehouse, product)`. 
It generates one grouping set +per non-empty subset of the given expressions, so `GROUP BY GROUPING SETS (warehouse, product)` +is equivalent to `GROUP BY GROUPING SETS ((warehouse), (product))`. * **ROLLUP** Specifies multiple levels of aggregations in a single statement. This clause is used to compute aggregations based on multiple grouping sets. `ROLLUP` is a shorthand for `GROUPING SETS`. For example, -`GROUP BY warehouse, product WITH ROLLUP` is equivalent to `GROUP BY GROUPING SETS -((warehouse, product), (warehouse), ())`. +`GROUP BY warehouse, product WITH ROLLUP` or `GROUP BY ROLLUP(warehouse, product)` is equivalent to +`GROUP BY GROUPING SETS((warehouse, product), (warehouse), ())`. +`GROUP BY ROLLUP(warehouse, product, (warehouse, location))` is equivalent to +`GROUP BY GROUPING SETS((warehouse, product, location), (warehouse, product), (warehouse), ())`. The N elements of a `ROLLUP` specification results in N+1 `GROUPING SETS`. * **CUBE** `CUBE` clause is used to perform aggregations based on combination of grouping columns specified in the `GROUP BY` clause. `CUBE` is a shorthand for `GROUPING SETS`. For example, -`GROUP BY warehouse, product WITH CUBE` is equivalent to `GROUP BY GROUPING SETS -((warehouse, product), (warehouse), (product), ())`. +`GROUP BY warehouse, product WITH CUBE` or `GROUP BY CUBE(warehouse, product)` is equivalent to +`GROUP BY GROUPING SETS((warehouse, product), (warehouse), (product), ())`. +`GROUP BY CUBE(warehouse, product, (warehouse, location))` is equivalent to +`GROUP BY GROUPING SETS((warehouse, product, location), (warehouse, product), (warehouse, location), + (product, warehouse, location), (warehouse), (product), (warehouse, product), ())`. Review comment: > `(warehouse, product)` is duplicated, is it OK? It's right -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r602042757 ## File path: docs/sql-ref-syntax-qry-select-groupby.md ## @@ -42,42 +42,44 @@ aggregate_name ( [ DISTINCT ] expression [ , ... ] ) [ FILTER ( WHERE boolean_ex ### Parameters +* **grouping_expression** + +Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on +result values of the grouping expressions. A grouping expression may be a column name like `GROUP BY a`, a column position like +`GROUP BY 0`, or an expression like `GROUP BY a + b`. + * **GROUPING SETS** Groups the rows for each subset of the expressions specified in the grouping sets. For example, -`GROUP BY GROUPING SETS (warehouse, product)` is semantically equivalent +`GROUP BY GROUPING SETS ((warehouse), (product))` is semantically equivalent to union of results of `GROUP BY warehouse` and `GROUP BY product`. This clause is a shorthand for a `UNION ALL` where each leg of the `UNION ALL` operator performs aggregation of subset of the columns specified in the `GROUPING SETS` clause. - -* **grouping_set** - -A grouping set is specified by zero or more comma-separated expressions in parentheses. - -**Syntax:** `( [ expression [ , ... ] ] )` - -* **grouping_expression** - -Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on -result values of the grouping expressions. A grouping expression may be a column alias, a column position -or an expression. +`GROUP BY GROUPING SETS ((warehouse, product), (product), ())` is semantically equivalent to union +of results of `GROUP BY warehouse, product`, `GROUP BY warehouse` and `GROUP BY ()`. * **ROLLUP** Specifies multiple levels of aggregations in a single statement. This clause is used to compute aggregations based on multiple grouping sets. `ROLLUP` is a shorthand for `GROUPING SETS`. 
For example, -`GROUP BY warehouse, product WITH ROLLUP` is equivalent to `GROUP BY GROUPING SETS -((warehouse, product), (warehouse), ())`. +`GROUP BY warehouse, product WITH ROLLUP` or `GROUP BY ROLLUP(warehouse, product)` is equivalent to +`GROUP BY GROUPING SETS((warehouse, product), (warehouse), ())`. The N elements of a `ROLLUP` specification results in N+1 `GROUPING SETS`. * **CUBE** `CUBE` clause is used to perform aggregations based on combination of grouping columns specified in the `GROUP BY` clause. `CUBE` is a shorthand for `GROUPING SETS`. For example, -`GROUP BY warehouse, product WITH CUBE` is equivalent to `GROUP BY GROUPING SETS -((warehouse, product), (warehouse), (product), ())`. +`GROUP BY warehouse, product WITH CUBE` or `GROUP BY CUBE(warehouse, product)` is equivalent to +`GROUP BY GROUPING SETS((warehouse, product), (warehouse), (product), ())`. The N elements of a `CUBE` specification results in 2^N `GROUPING SETS`. +* **grouping_set** + +A grouping set is specified by zero or more comma-separated expressions in parentheses. + +**Syntax:** `( [ expression [ , ... ] ] )` Review comment: > what's its corresponding representation in `GROUPING SETS`? As the rule it should be `grouping sets((id, city, (id, car_model)), (id, city), (id), ())` But it can't run since have two level of `()` We should to write it as `grouping sets((id, city, id, car_model), (id, city), (id), ())` remove the duplicated column it can be `grouping sets((id, city, car_model), (id, city), (id), ())` ## File path: docs/sql-ref-syntax-qry-select-groupby.md ## @@ -42,42 +42,44 @@ aggregate_name ( [ DISTINCT ] expression [ , ... ] ) [ FILTER ( WHERE boolean_ex ### Parameters +* **grouping_expression** + +Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on +result values of the grouping expressions. 
A grouping expression may be a column name like `GROUP BY a`, a column position like +`GROUP BY 0`, or an expression like `GROUP BY a + b`. + * **GROUPING SETS** Groups the rows for each subset of the expressions specified in the grouping sets. For example, -`GROUP BY GROUPING SETS (warehouse, product)` is semantically equivalent +`GROUP BY GROUPING SETS ((warehouse), (product))` is semantically equivalent to union of results of `GROUP BY warehouse` and `GROUP BY product`. This clause is a shorthand for a `UNION ALL` where each leg of the `UNION ALL` operator performs aggregation of subset of the columns specified in the `GROUPING SETS` clause. - -* **grouping_set** - -A grouping set is specified by zero or more comma-separated expressions in parentheses. - -**Syntax:** `( [ expression [ , ... ] ] )` - -* **grouping_expression** - -
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r602044878 ## File path: docs/sql-ref-syntax-qry-select-groupby.md ## @@ -42,42 +42,44 @@ aggregate_name ( [ DISTINCT ] expression [ , ... ] ) [ FILTER ( WHERE boolean_ex ### Parameters +* **grouping_expression** + +Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on +result values of the grouping expressions. A grouping expression may be a column name like `GROUP BY a`, a column position like +`GROUP BY 0`, or an expression like `GROUP BY a + b`. + * **GROUPING SETS** Groups the rows for each subset of the expressions specified in the grouping sets. For example, -`GROUP BY GROUPING SETS (warehouse, product)` is semantically equivalent +`GROUP BY GROUPING SETS ((warehouse), (product))` is semantically equivalent to union of results of `GROUP BY warehouse` and `GROUP BY product`. This clause is a shorthand for a `UNION ALL` where each leg of the `UNION ALL` operator performs aggregation of subset of the columns specified in the `GROUPING SETS` clause. - -* **grouping_set** - -A grouping set is specified by zero or more comma-separated expressions in parentheses. - -**Syntax:** `( [ expression [ , ... ] ] )` - -* **grouping_expression** - -Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on -result values of the grouping expressions. A grouping expression may be a column alias, a column position -or an expression. +`GROUP BY GROUPING SETS ((warehouse, product), (product), ())` is semantically equivalent to union +of results of `GROUP BY warehouse, product`, `GROUP BY warehouse` and `GROUP BY ()`. * **ROLLUP** Specifies multiple levels of aggregations in a single statement. This clause is used to compute aggregations based on multiple grouping sets. `ROLLUP` is a shorthand for `GROUPING SETS`. 
For example, -`GROUP BY warehouse, product WITH ROLLUP` is equivalent to `GROUP BY GROUPING SETS -((warehouse, product), (warehouse), ())`. +`GROUP BY warehouse, product WITH ROLLUP` or `GROUP BY ROLLUP(warehouse, product)` is equivalent to +`GROUP BY GROUPING SETS((warehouse, product), (warehouse), ())`. The N elements of a `ROLLUP` specification results in N+1 `GROUPING SETS`. * **CUBE** `CUBE` clause is used to perform aggregations based on combination of grouping columns specified in the `GROUP BY` clause. `CUBE` is a shorthand for `GROUPING SETS`. For example, -`GROUP BY warehouse, product WITH CUBE` is equivalent to `GROUP BY GROUPING SETS -((warehouse, product), (warehouse), (product), ())`. +`GROUP BY warehouse, product WITH CUBE` or `GROUP BY CUBE(warehouse, product)` is equivalent to +`GROUP BY GROUPING SETS((warehouse, product), (warehouse), (product), ())`. The N elements of a `CUBE` specification results in 2^N `GROUPING SETS`. +* **grouping_set** + +A grouping set is specified by zero or more comma-separated expressions in parentheses. + +**Syntax:** `( [ expression [ , ... ] ] )` Review comment: > It's hard to tell the behavior from the examples, can we just document the behavior of `rollup(id, city, (id, car_model))`? Done, same rule, it depending on how you want to combine the group. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r602042757 ## File path: docs/sql-ref-syntax-qry-select-groupby.md ## @@ -42,42 +42,44 @@ aggregate_name ( [ DISTINCT ] expression [ , ... ] ) [ FILTER ( WHERE boolean_ex ### Parameters +* **grouping_expression** + +Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on +result values of the grouping expressions. A grouping expression may be a column name like `GROUP BY a`, a column position like +`GROUP BY 0`, or an expression like `GROUP BY a + b`. + * **GROUPING SETS** Groups the rows for each subset of the expressions specified in the grouping sets. For example, -`GROUP BY GROUPING SETS (warehouse, product)` is semantically equivalent +`GROUP BY GROUPING SETS ((warehouse), (product))` is semantically equivalent to union of results of `GROUP BY warehouse` and `GROUP BY product`. This clause is a shorthand for a `UNION ALL` where each leg of the `UNION ALL` operator performs aggregation of subset of the columns specified in the `GROUPING SETS` clause. - -* **grouping_set** - -A grouping set is specified by zero or more comma-separated expressions in parentheses. - -**Syntax:** `( [ expression [ , ... ] ] )` - -* **grouping_expression** - -Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on -result values of the grouping expressions. A grouping expression may be a column alias, a column position -or an expression. +`GROUP BY GROUPING SETS ((warehouse, product), (product), ())` is semantically equivalent to union +of results of `GROUP BY warehouse, product`, `GROUP BY warehouse` and `GROUP BY ()`. * **ROLLUP** Specifies multiple levels of aggregations in a single statement. This clause is used to compute aggregations based on multiple grouping sets. `ROLLUP` is a shorthand for `GROUPING SETS`. 
For example, -`GROUP BY warehouse, product WITH ROLLUP` is equivalent to `GROUP BY GROUPING SETS -((warehouse, product), (warehouse), ())`. +`GROUP BY warehouse, product WITH ROLLUP` or `GROUP BY ROLLUP(warehouse, product)` is equivalent to +`GROUP BY GROUPING SETS((warehouse, product), (warehouse), ())`. The N elements of a `ROLLUP` specification results in N+1 `GROUPING SETS`. * **CUBE** `CUBE` clause is used to perform aggregations based on combination of grouping columns specified in the `GROUP BY` clause. `CUBE` is a shorthand for `GROUPING SETS`. For example, -`GROUP BY warehouse, product WITH CUBE` is equivalent to `GROUP BY GROUPING SETS -((warehouse, product), (warehouse), (product), ())`. +`GROUP BY warehouse, product WITH CUBE` or `GROUP BY CUBE(warehouse, product)` is equivalent to +`GROUP BY GROUPING SETS((warehouse, product), (warehouse), (product), ())`. The N elements of a `CUBE` specification results in 2^N `GROUPING SETS`. +* **grouping_set** + +A grouping set is specified by zero or more comma-separated expressions in parentheses. + +**Syntax:** `( [ expression [ , ... ] ] )` Review comment: > what's its corresponding representation in `GROUPING SETS`? `grouping sets((id, city, car_model), (id, city), (id), ())` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r602042757 ## File path: docs/sql-ref-syntax-qry-select-groupby.md ## @@ -42,42 +42,44 @@ aggregate_name ( [ DISTINCT ] expression [ , ... ] ) [ FILTER ( WHERE boolean_ex ### Parameters +* **grouping_expression** + +Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on +result values of the grouping expressions. A grouping expression may be a column name like `GROUP BY a`, a column position like +`GROUP BY 0`, or an expression like `GROUP BY a + b`. + * **GROUPING SETS** Groups the rows for each subset of the expressions specified in the grouping sets. For example, -`GROUP BY GROUPING SETS (warehouse, product)` is semantically equivalent +`GROUP BY GROUPING SETS ((warehouse), (product))` is semantically equivalent to union of results of `GROUP BY warehouse` and `GROUP BY product`. This clause is a shorthand for a `UNION ALL` where each leg of the `UNION ALL` operator performs aggregation of subset of the columns specified in the `GROUPING SETS` clause. - -* **grouping_set** - -A grouping set is specified by zero or more comma-separated expressions in parentheses. - -**Syntax:** `( [ expression [ , ... ] ] )` - -* **grouping_expression** - -Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on -result values of the grouping expressions. A grouping expression may be a column alias, a column position -or an expression. +`GROUP BY GROUPING SETS ((warehouse, product), (product), ())` is semantically equivalent to union +of results of `GROUP BY warehouse, product`, `GROUP BY warehouse` and `GROUP BY ()`. * **ROLLUP** Specifies multiple levels of aggregations in a single statement. This clause is used to compute aggregations based on multiple grouping sets. `ROLLUP` is a shorthand for `GROUPING SETS`. 
For example, -`GROUP BY warehouse, product WITH ROLLUP` is equivalent to `GROUP BY GROUPING SETS -((warehouse, product), (warehouse), ())`. +`GROUP BY warehouse, product WITH ROLLUP` or `GROUP BY ROLLUP(warehouse, product)` is equivalent to +`GROUP BY GROUPING SETS((warehouse, product), (warehouse), ())`. The N elements of a `ROLLUP` specification results in N+1 `GROUPING SETS`. * **CUBE** `CUBE` clause is used to perform aggregations based on combination of grouping columns specified in the `GROUP BY` clause. `CUBE` is a shorthand for `GROUPING SETS`. For example, -`GROUP BY warehouse, product WITH CUBE` is equivalent to `GROUP BY GROUPING SETS -((warehouse, product), (warehouse), (product), ())`. +`GROUP BY warehouse, product WITH CUBE` or `GROUP BY CUBE(warehouse, product)` is equivalent to +`GROUP BY GROUPING SETS((warehouse, product), (warehouse), (product), ())`. The N elements of a `CUBE` specification results in 2^N `GROUPING SETS`. +* **grouping_set** + +A grouping set is specified by zero or more comma-separated expressions in parentheses. + +**Syntax:** `( [ expression [ , ... ] ] )` Review comment: > what's its corresponding representation in `GROUPING SETS`? grouping sets((id, city, car_model), (id, city), (id), ()); -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r601480158 ## File path: docs/sql-ref-syntax-qry-select-groupby.md ## @@ -42,42 +42,44 @@ aggregate_name ( [ DISTINCT ] expression [ , ... ] ) [ FILTER ( WHERE boolean_ex ### Parameters +* **grouping_expression** + +Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on +result values of the grouping expressions. A grouping expression may be a column name like `GROUP BY a`, a column position like +`GROUP BY 0`, or an expression like `GROUP BY a + b`. + * **GROUPING SETS** Groups the rows for each subset of the expressions specified in the grouping sets. For example, Review comment: Done -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r601479820 ## File path: docs/sql-ref-syntax-qry-select-groupby.md ## @@ -42,42 +42,44 @@ aggregate_name ( [ DISTINCT ] expression [ , ... ] ) [ FILTER ( WHERE boolean_ex ### Parameters +* **grouping_expression** + +Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on +result values of the grouping expressions. A grouping expression may be a column name like `GROUP BY a`, a column position like +`GROUP BY 0`, or an expression like `GROUP BY a + b`. + * **GROUPING SETS** Groups the rows for each subset of the expressions specified in the grouping sets. For example, -`GROUP BY GROUPING SETS (warehouse, product)` is semantically equivalent +`GROUP BY GROUPING SETS ((warehouse), (product))` is semantically equivalent to union of results of `GROUP BY warehouse` and `GROUP BY product`. This clause is a shorthand for a `UNION ALL` where each leg of the `UNION ALL` operator performs aggregation of subset of the columns specified in the `GROUPING SETS` clause. - -* **grouping_set** - -A grouping set is specified by zero or more comma-separated expressions in parentheses. - -**Syntax:** `( [ expression [ , ... ] ] )` - -* **grouping_expression** - -Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on -result values of the grouping expressions. A grouping expression may be a column alias, a column position -or an expression. +`GROUP BY GROUPING SETS ((warehouse, product), (product), ())` is semantically equivalent to union Review comment: yea, done -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r601478487 ## File path: docs/sql-ref-syntax-qry-select-groupby.md ## @@ -42,42 +42,44 @@ aggregate_name ( [ DISTINCT ] expression [ , ... ] ) [ FILTER ( WHERE boolean_ex ### Parameters +* **grouping_expression** + +Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on +result values of the grouping expressions. A grouping expression may be a column name like `GROUP BY a`, a column position like +`GROUP BY 0`, or an expression like `GROUP BY a + b`. + * **GROUPING SETS** Groups the rows for each subset of the expressions specified in the grouping sets. For example, -`GROUP BY GROUPING SETS (warehouse, product)` is semantically equivalent +`GROUP BY GROUPING SETS ((warehouse), (product))` is semantically equivalent to union of results of `GROUP BY warehouse` and `GROUP BY product`. This clause is a shorthand for a `UNION ALL` where each leg of the `UNION ALL` operator performs aggregation of subset of the columns specified in the `GROUPING SETS` clause. - -* **grouping_set** - -A grouping set is specified by zero or more comma-separated expressions in parentheses. - -**Syntax:** `( [ expression [ , ... ] ] )` - -* **grouping_expression** - -Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on -result values of the grouping expressions. A grouping expression may be a column alias, a column position -or an expression. +`GROUP BY GROUPING SETS ((warehouse, product), (product), ())` is semantically equivalent to union +of results of `GROUP BY warehouse, product`, `GROUP BY warehouse` and `GROUP BY ()`. * **ROLLUP** Specifies multiple levels of aggregations in a single statement. This clause is used to compute aggregations based on multiple grouping sets. `ROLLUP` is a shorthand for `GROUPING SETS`. 
For example, -`GROUP BY warehouse, product WITH ROLLUP` is equivalent to `GROUP BY GROUPING SETS -((warehouse, product), (warehouse), ())`. +`GROUP BY warehouse, product WITH ROLLUP` or `GROUP BY ROLLUP(warehouse, product)` is equivalent to +`GROUP BY GROUPING SETS((warehouse, product), (warehouse), ())`. The N elements of a `ROLLUP` specification results in N+1 `GROUPING SETS`. * **CUBE** `CUBE` clause is used to perform aggregations based on combination of grouping columns specified in the `GROUP BY` clause. `CUBE` is a shorthand for `GROUPING SETS`. For example, -`GROUP BY warehouse, product WITH CUBE` is equivalent to `GROUP BY GROUPING SETS -((warehouse, product), (warehouse), (product), ())`. +`GROUP BY warehouse, product WITH CUBE` or `GROUP BY CUBE(warehouse, product)` is equivalent to +`GROUP BY GROUPING SETS((warehouse, product), (warehouse), (product), ())`. The N elements of a `CUBE` specification results in 2^N `GROUPING SETS`. +* **grouping_set** + +A grouping set is specified by zero or more comma-separated expressions in parentheses. + +**Syntax:** `( [ expression [ , ... ] ] )` Review comment: > it's actually `{ ( [ expression [ , ... ] ] ) | expression }` Yea, more correct. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r601478215 ## File path: docs/sql-ref-syntax-qry-select-groupby.md ## @@ -42,42 +42,44 @@ aggregate_name ( [ DISTINCT ] expression [ , ... ] ) [ FILTER ( WHERE boolean_ex ### Parameters +* **grouping_expression** + +Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on +result values of the grouping expressions. A grouping expression may be a column name like `GROUP BY a`, a column position like +`GROUP BY 0`, or an expression like `GROUP BY a + b`. + * **GROUPING SETS** Groups the rows for each subset of the expressions specified in the grouping sets. For example, -`GROUP BY GROUPING SETS (warehouse, product)` is semantically equivalent +`GROUP BY GROUPING SETS ((warehouse), (product))` is semantically equivalent to union of results of `GROUP BY warehouse` and `GROUP BY product`. This clause is a shorthand for a `UNION ALL` where each leg of the `UNION ALL` operator performs aggregation of subset of the columns specified in the `GROUPING SETS` clause. - -* **grouping_set** - -A grouping set is specified by zero or more comma-separated expressions in parentheses. - -**Syntax:** `( [ expression [ , ... ] ] )` - -* **grouping_expression** - -Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on -result values of the grouping expressions. A grouping expression may be a column alias, a column position -or an expression. +`GROUP BY GROUPING SETS ((warehouse, product), (product), ())` is semantically equivalent to union +of results of `GROUP BY warehouse, product`, `GROUP BY warehouse` and `GROUP BY ()`. * **ROLLUP** Specifies multiple levels of aggregations in a single statement. This clause is used to compute aggregations based on multiple grouping sets. `ROLLUP` is a shorthand for `GROUPING SETS`. 
For example, -`GROUP BY warehouse, product WITH ROLLUP` is equivalent to `GROUP BY GROUPING SETS -((warehouse, product), (warehouse), ())`. +`GROUP BY warehouse, product WITH ROLLUP` or `GROUP BY ROLLUP(warehouse, product)` is equivalent to +`GROUP BY GROUPING SETS((warehouse, product), (warehouse), ())`. The N elements of a `ROLLUP` specification results in N+1 `GROUPING SETS`. * **CUBE** `CUBE` clause is used to perform aggregations based on combination of grouping columns specified in the `GROUP BY` clause. `CUBE` is a shorthand for `GROUPING SETS`. For example, -`GROUP BY warehouse, product WITH CUBE` is equivalent to `GROUP BY GROUPING SETS -((warehouse, product), (warehouse), (product), ())`. +`GROUP BY warehouse, product WITH CUBE` or `GROUP BY CUBE(warehouse, product)` is equivalent to +`GROUP BY GROUPING SETS((warehouse, product), (warehouse), (product), ())`. The N elements of a `CUBE` specification results in 2^N `GROUPING SETS`. +* **grouping_set** + +A grouping set is specified by zero or more comma-separated expressions in parentheses. + +**Syntax:** `( [ expression [ , ... ] ] )` Review comment: > do we support `ROLLUP((a, b), (a))`? ``` > select id, city, car_model, sum(quantity) from dealer group by rollup(id, city, (id, car_model)); NULL NULLNULL78 200 Dublin Honda CRV 3 300 San JoseNULL13 200 Dublin Honda Accord10 100 NULLNULL32 200 Dublin Honda Civic 20 100 Fremont Honda Civic 10 200 Dublin NULL33 200 NULLNULL33 300 San JoseHonda Accord8 300 San JoseHonda Civic 5 100 Fremont NULL32 100 Fremont Honda Accord15 100 Fremont Honda CRV 7 300 NULLNULL13 Time taken: 1.287 seconds, Fetched 15 row(s) spark-sql> ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r601476562 ## File path: docs/sql-ref-syntax-qry-select-groupby.md ## @@ -32,7 +32,7 @@ When a FILTER clause is attached to an aggregate function, only the matching row GROUP BY group_expression [ , group_expression [ , ... ] ] [ { WITH ROLLUP | WITH CUBE | GROUPING SETS (grouping_set [ , ...]) } ] Review comment: You can see the comment https://github.com/apache/spark/pull/30212/files#r529501391 Not all cases can be covered, such as `GROUP BY A, B grouping sets(a, (a, b ))` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r601268462 ## File path: docs/sql-ref-syntax-qry-select-groupby.md ## @@ -42,42 +42,44 @@ aggregate_name ( [ DISTINCT ] expression [ , ... ] ) [ FILTER ( WHERE boolean_ex ### Parameters +* **grouping_expression** + +Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on +result values of the grouping expressions. A grouping expression may be a column alias like `GROUP BY a`, a column position like +`GROUP BY 0`, or an expression like `GROUP BY a + b`. + * **GROUPING SETS** Groups the rows for each subset of the expressions specified in the grouping sets. For example, `GROUP BY GROUPING SETS (warehouse, product)` is semantically equivalent -to union of results of `GROUP BY warehouse` and `GROUP BY product`. This clause +to union of results of `GROUP BY warehouse` and `GROUP BY product`. +`GROUP BY GROUPING SETS ((warehouse, product), (product), ())` is semantically equivalent to union Review comment: Done -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r601264788 ## File path: docs/sql-ref-syntax-qry-select-groupby.md ## @@ -42,42 +42,44 @@ aggregate_name ( [ DISTINCT ] expression [ , ... ] ) [ FILTER ( WHERE boolean_ex ### Parameters +* **grouping_expression** + +Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on +result values of the grouping expressions. A grouping expression may be a column alias like `GROUP BY a`, a column position like +`GROUP BY 0`, or an expression like `GROUP BY a + b`. + * **GROUPING SETS** Groups the rows for each subset of the expressions specified in the grouping sets. For example, Review comment: Yea, more clear, Done ## File path: docs/sql-ref-syntax-qry-select-groupby.md ## @@ -42,42 +42,44 @@ aggregate_name ( [ DISTINCT ] expression [ , ... ] ) [ FILTER ( WHERE boolean_ex ### Parameters +* **grouping_expression** + +Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on +result values of the grouping expressions. A grouping expression may be a column alias like `GROUP BY a`, a column position like Review comment: Done -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r601250092 ## File path: docs/sql-ref-syntax-qry-select-groupby.md ## @@ -32,7 +32,7 @@ When a FILTER clause is attached to an aggregate function, only the matching row GROUP BY group_expression [ , group_expression [ , ... ] ] [ { WITH ROLLUP | WITH CUBE | GROUPING SETS (grouping_set [ , ...]) } ] Review comment: > BTW do we really support `GROUP BY a, b GROUPING SETS (c, d)`? Not yet. Same reason as https://github.com/apache/spark/pull/30212#discussion_r601171306 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r601244661 ## File path: docs/sql-ref-syntax-qry-select-groupby.md ## @@ -32,7 +32,7 @@ When a FILTER clause is attached to an aggregate function, only the matching row GROUP BY group_expression [ , group_expression [ , ... ] ] [ { WITH ROLLUP | WITH CUBE | GROUPING SETS (grouping_set [ , ...]) } ] -GROUP BY GROUPING SETS (grouping_set [ , ...]) +GROUP BY { group_expression | { ROLLUP | CUBE | GROUPING SETS } (grouping_set [ , ...]) } [ , ... ] Review comment: > Shall we remove the `[ , ... ]` at the end since we don't support it right now? Syntactically speaking, we can support this after this PR, but execution is not currently supported. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r601173492 ## File path: docs/sql-ref-syntax-qry-select-groupby.md ## @@ -50,34 +65,28 @@ aggregate_name ( [ DISTINCT ] expression [ , ... ] ) [ FILTER ( WHERE boolean_ex is a shorthand for a `UNION ALL` where each leg of the `UNION ALL` operator performs aggregation of subset of the columns specified in the `GROUPING SETS` clause. Review comment: Should we explain more about `group by ()`? It means to aggregate all results. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r601172474 ## File path: docs/sql-ref-syntax-qry-select-groupby.md ## @@ -42,6 +42,21 @@ aggregate_name ( [ DISTINCT ] expression [ , ... ] ) [ FILTER ( WHERE boolean_ex ### Parameters +* **grouping_expression** + +Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on +result values of the grouping expressions. A grouping expression may be a column alias, a column position +or an expression. Review comment: Done ## File path: docs/sql-ref-syntax-qry-select-groupby.md ## @@ -32,7 +32,7 @@ When a FILTER clause is attached to an aggregate function, only the matching row GROUP BY group_expression [ , group_expression [ , ... ] ] [ { WITH ROLLUP | WITH CUBE | GROUPING SETS (grouping_set [ , ...]) } ] -GROUP BY GROUPING SETS (grouping_set [ , ...]) +GROUP BY group_expression_with_grouping_analytics [ , group_expression_with_grouping_analytics [ , ... ] ] Review comment: Done ## File path: docs/sql-ref-syntax-qry-select-groupby.md ## @@ -42,6 +42,21 @@ aggregate_name ( [ DISTINCT ] expression [ , ... ] ) [ FILTER ( WHERE boolean_ex ### Parameters +* **grouping_expression** + +Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on +result values of the grouping expressions. A grouping expression may be a column alias, a column position +or an expression. + +* **group_expression_with_grouping_analytics** Review comment: Done ## File path: docs/sql-ref-syntax-qry-select-groupby.md ## @@ -50,34 +65,28 @@ aggregate_name ( [ DISTINCT ] expression [ , ... ] ) [ FILTER ( WHERE boolean_ex is a shorthand for a `UNION ALL` where each leg of the `UNION ALL` operator performs aggregation of subset of the columns specified in the `GROUPING SETS` clause. 
-* **grouping_set** - -A grouping set is specified by zero or more comma-separated expressions in parentheses. - -**Syntax:** `( [ expression [ , ... ] ] )` - -* **grouping_expression** - -Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on -result values of the grouping expressions. A grouping expression may be a column alias, a column position -or an expression. - * **ROLLUP** Specifies multiple levels of aggregations in a single statement. This clause is used to compute aggregations based on multiple grouping sets. `ROLLUP` is a shorthand for `GROUPING SETS`. For example, -`GROUP BY warehouse, product WITH ROLLUP` is equivalent to `GROUP BY GROUPING SETS -((warehouse, product), (warehouse), ())`. +`GROUP BY warehouse, product WITH ROLLUP` is equivalent to `GROUP BY ROLLUP(warehouse, product)` or Review comment: Done ## File path: docs/sql-ref-syntax-qry-select-groupby.md ## @@ -50,34 +65,28 @@ aggregate_name ( [ DISTINCT ] expression [ , ... ] ) [ FILTER ( WHERE boolean_ex is a shorthand for a `UNION ALL` where each leg of the `UNION ALL` operator performs aggregation of subset of the columns specified in the `GROUPING SETS` clause. Review comment: Done -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r601171306 ## File path: docs/sql-ref-syntax-qry-select-groupby.md ## @@ -50,34 +65,28 @@ aggregate_name ( [ DISTINCT ] expression [ , ... ] ) [ FILTER ( WHERE boolean_ex is a shorthand for a `UNION ALL` where each leg of the `UNION ALL` operator performs aggregation of subset of the columns specified in the `GROUPING SETS` clause. -* **grouping_set** - -A grouping set is specified by zero or more comma-separated expressions in parentheses. - -**Syntax:** `( [ expression [ , ... ] ] )` - -* **grouping_expression** - -Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on -result values of the grouping expressions. A grouping expression may be a column alias, a column position -or an expression. - * **ROLLUP** Specifies multiple levels of aggregations in a single statement. This clause is used to compute aggregations based on multiple grouping sets. `ROLLUP` is a shorthand for `GROUPING SETS`. For example, -`GROUP BY warehouse, product WITH ROLLUP` is equivalent to `GROUP BY GROUPING SETS -((warehouse, product), (warehouse), ())`. +`GROUP BY warehouse, product WITH ROLLUP` is equivalent to `GROUP BY ROLLUP(warehouse, product)` or +`GROUP BY GROUPING SETS((warehouse, product), (warehouse), ())`. The N elements of a `ROLLUP` specification results in N+1 `GROUPING SETS`. * **CUBE** `CUBE` clause is used to perform aggregations based on combination of grouping columns specified in the `GROUP BY` clause. `CUBE` is a shorthand for `GROUPING SETS`. For example, -`GROUP BY warehouse, product WITH CUBE` is equivalent to `GROUP BY GROUPING SETS -((warehouse, product), (warehouse), (product), ())`. +`GROUP BY warehouse, product WITH CUBE` is equivalent to `GROUP BY CUBE(warehouse, product)` or +`GROUP BY GROUPING SETS((warehouse, product), (warehouse), (product), ())`. 
The N elements of a `CUBE` specification result in 2^N `GROUPING SETS`. Review comment: > seems we miss the document of multiple grouping sets, which is newly added in this PR. What does `GROUP BY ROLLUP(a, b), ROLLUP(c, d)` mean? We can't support this now. We should support this in https://github.com/apache/spark/pull/30144 While working on https://github.com/apache/spark/pull/30144, we found that the current grouping analytics code needs to be refactored before it can support more flexible cases like those in PostgreSQL and Oracle. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r601161941 ## File path: docs/sql-ref-syntax-qry-select-groupby.md ## @@ -50,34 +65,28 @@ aggregate_name ( [ DISTINCT ] expression [ , ... ] ) [ FILTER ( WHERE boolean_ex is a shorthand for a `UNION ALL` where each leg of the `UNION ALL` operator performs aggregation of subset of the columns specified in the `GROUPING SETS` clause. -* **grouping_set** - -A grouping set is specified by zero or more comma-separated expressions in parentheses. - -**Syntax:** `( [ expression [ , ... ] ] )` - -* **grouping_expression** - -Specifies the criteria based on which the rows are grouped together. The grouping of rows is performed based on -result values of the grouping expressions. A grouping expression may be a column alias, a column position -or an expression. - * **ROLLUP** Specifies multiple levels of aggregations in a single statement. This clause is used to compute aggregations based on multiple grouping sets. `ROLLUP` is a shorthand for `GROUPING SETS`. For example, -`GROUP BY warehouse, product WITH ROLLUP` is equivalent to `GROUP BY GROUPING SETS -((warehouse, product), (warehouse), ())`. +`GROUP BY warehouse, product WITH ROLLUP` is equivalent to `GROUP BY ROLLUP(warehouse, product)` or +`GROUP BY GROUPING SETS((warehouse, product), (warehouse), ())`. Review comment: > is this corrected? I though the result is a union of `GROUP BY warehouse, product`, `GROUP BY warehouse`, `GROUP BY ()` (global aggregate). 
``` spark-sql> CREATE TABLE dealer (id INT, city STRING, car_model STRING, quantity INT); Time taken: 3.084 seconds spark-sql> INSERT INTO dealer VALUES > (100, 'Fremont', 'Honda Civic', 10), > (100, 'Fremont', 'Honda Accord', 15), > (100, 'Fremont', 'Honda CRV', 7), > (200, 'Dublin', 'Honda Civic', 20), > (200, 'Dublin', 'Honda Accord', 10), > (200, 'Dublin', 'Honda CRV', 3), > (300, 'San Jose', 'Honda Civic', 5), > (300, 'San Jose', 'Honda Accord', 8); 21/03/25 15:48:47 WARN ObjectStore: Failed to get database global_temp, returning NoSuchObjectException Time taken: 2.332 seconds spark-sql> select id, city, sum(quantity) from dealer group by rollup(id, city); 100 NULL32 300 NULL13 200 NULL33 NULL NULL78 200 Dublin 33 300 San Jose13 100 Fremont 32 Time taken: 1.388 seconds, Fetched 7 row(s) spark-sql> select id, city, sum(quantity) from dealer group by grouping sets((id, city), (id), ()); 100 NULL32 300 NULL13 200 NULL33 NULL NULL78 200 Dublin 33 300 San Jose13 100 Fremont 32 Time taken: 0.284 seconds, Fetched 7 row(s) spark-sql> explain select id, city, sum(quantity) from dealer group by rollup(id, city); == Physical Plan == AdaptiveSparkPlan isFinalPlan=false +- HashAggregate(keys=[id#68, city#69, spark_grouping_id#67L], functions=[sum(quantity#62)]) +- Exchange hashpartitioning(id#68, city#69, spark_grouping_id#67L, 200), ENSURE_REQUIREMENTS, [id=#113] +- HashAggregate(keys=[id#68, city#69, spark_grouping_id#67L], functions=[partial_sum(quantity#62)]) +- Expand [List(quantity#62, id#59, city#60, 0), List(quantity#62, id#59, null, 1), List(quantity#62, null, null, 3)], [quantity#62, id#68, city#69, spark_grouping_id#67L] +- Scan hive default.dealer [quantity#62, id#59, city#60], HiveTableRelation [`default`.`dealer`, org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, Data Cols: [id#59, city#60, car_model#61, quantity#62], Partition Cols: []] Time taken: 0.064 seconds, Fetched 1 row(s) spark-sql> explain select id, city, sum(quantity) from dealer group by 
grouping sets((id, city), (id), ()); == Physical Plan == AdaptiveSparkPlan isFinalPlan=false +- HashAggregate(keys=[id#86, city#87, spark_grouping_id#85L], functions=[sum(quantity#80)]) +- Exchange hashpartitioning(id#86, city#87, spark_grouping_id#85L, 200), ENSURE_REQUIREMENTS, [id=#136] +- HashAggregate(keys=[id#86, city#87, spark_grouping_id#85L], functions=[partial_sum(quantity#80)]) +- Expand [List(quantity#80, id#77, city#78, 0), List(quantity#80, id#77, null, 1), List(quantity#80, null, null, 3)], [quantity#80, id#86, city#87, spark_grouping_id#85L] +- Scan hive default.dealer [quantity#80, id#77, city#78], HiveTableRelation [`default`.`dealer`, org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, Data Cols: [id#77, city#78, car_model#79, quantity#80], Partition Cols: []] Time taken: 0.063 seconds, Fetched 1 row(s) spark-sql> ``` -- This is an automated message from the Apache Git Service. To respond to the
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r547222654 ## File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala ## @@ -850,29 +850,62 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg } /** - * Add an [[Aggregate]] or [[GroupingSets]] to a logical plan. + * Add an [[Aggregate]] to a logical plan. */ private def withAggregationClause( ctx: AggregationClauseContext, selectExpressions: Seq[NamedExpression], query: LogicalPlan): LogicalPlan = withOrigin(ctx) { -val groupByExpressions = expressionList(ctx.groupingExpressions) - -if (ctx.GROUPING != null) { - // GROUP BY GROUPING SETS (...) - val selectedGroupByExprs = -ctx.groupingSet.asScala.map(_.expression.asScala.map(e => expression(e)).toSeq) - GroupingSets(selectedGroupByExprs.toSeq, groupByExpressions, query, selectExpressions) -} else { - // GROUP BY (WITH CUBE | WITH ROLLUP)? - val mappedGroupByExpressions = if (ctx.CUBE != null) { -Seq(Cube(groupByExpressions)) - } else if (ctx.ROLLUP != null) { -Seq(Rollup(groupByExpressions)) +if (ctx.groupingExpressionsWithGroupingAnalytics.isEmpty) { + val groupByExpressions = expressionList(ctx.groupingExpressions) + if (ctx.GROUPING != null) { +// GROUP BY GROUPING SETS (...) +val selectedGroupByExprs = + ctx.groupingSet.asScala.map(_.expression.asScala.map(e => expression(e)).toSeq) +Aggregate(Seq(GroupingSets(selectedGroupByExprs, groupByExpressions)), + selectExpressions, query) } else { -groupByExpressions +// GROUP BY (WITH CUBE | WITH ROLLUP)? 
+val mappedGroupByExpressions = if (ctx.CUBE != null) { + Seq(Cube(groupByExpressions.map(Seq(_ +} else if (ctx.ROLLUP != null) { + Seq(Rollup(groupByExpressions.map(Seq(_ +} else { + groupByExpressions +} +Aggregate(mappedGroupByExpressions, selectExpressions, query) } - Aggregate(mappedGroupByExpressions, selectExpressions, query) +} else { + val groupByExpressions = +ctx.groupingExpressionsWithGroupingAnalytics.asScala + .map(groupByExpr => { +val groupingAnalytics = groupByExpr.groupingAnalytics +if (groupingAnalytics != null) { + val selectedGroupByExprs = groupingAnalytics.groupingSet.asScala +.map(_.expression.asScala.map(e => expression(e)).toSeq) + if (groupingAnalytics.CUBE != null) { +// CUBE(A, B, (A, B), ()) is not supported. +if (selectedGroupByExprs.exists(_.isEmpty)) { + throw new ParseException("Empty set in CUBE grouping sets is not supported.", +groupingAnalytics) +} +Cube(selectedGroupByExprs) + } else if (groupingAnalytics.ROLLUP != null) { +// ROLLUP(A, B, (A, B), ()) is not supported. +if (selectedGroupByExprs.exists(_.isEmpty)) { + throw new ParseException("Empty set in ROLLUP grouping sets is not supported.", +groupingAnalytics) +} +Rollup(selectedGroupByExprs) + } else { +GroupingSets(selectedGroupByExprs, selectedGroupByExprs.flatten.distinct) Review comment: > Could you check `assert(groupingAnalytics.GROUPING != null)`? Done This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r546248497 ## File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala ## @@ -850,29 +850,70 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg } /** - * Add an [[Aggregate]] or [[GroupingSets]] to a logical plan. + * Add an [[Aggregate]] to a logical plan. */ private def withAggregationClause( ctx: AggregationClauseContext, selectExpressions: Seq[NamedExpression], query: LogicalPlan): LogicalPlan = withOrigin(ctx) { -val groupByExpressions = expressionList(ctx.groupingExpressions) - -if (ctx.GROUPING != null) { - // GROUP BY GROUPING SETS (...) - val selectedGroupByExprs = -ctx.groupingSet.asScala.map(_.expression.asScala.map(e => expression(e)).toSeq) - GroupingSets(selectedGroupByExprs.toSeq, groupByExpressions, query, selectExpressions) -} else { - // GROUP BY (WITH CUBE | WITH ROLLUP)? - val mappedGroupByExpressions = if (ctx.CUBE != null) { -Seq(Cube(groupByExpressions)) - } else if (ctx.ROLLUP != null) { -Seq(Rollup(groupByExpressions)) +if (ctx.groupingExpressionsWithGroupingAnalytics.isEmpty) { + val groupByExpressions = expressionList(ctx.groupingExpressions) + if (ctx.GROUPING != null) { +// GROUP BY GROUPING SETS (...) +val selectedGroupByExprs = + ctx.groupingSet.asScala.map(_.expression.asScala.map(e => expression(e)).toSeq) +Aggregate(Seq(GroupingSets(selectedGroupByExprs, groupByExpressions)), + selectExpressions, query) } else { -groupByExpressions +// GROUP BY (WITH CUBE | WITH ROLLUP)? 
+val mappedGroupByExpressions = if (ctx.CUBE != null) { + Seq(Cube(groupByExpressions.map(Seq(_ +} else if (ctx.ROLLUP != null) { + Seq(Rollup(groupByExpressions.map(Seq(_ +} else { + groupByExpressions +} +Aggregate(mappedGroupByExpressions, selectExpressions, query) } - Aggregate(mappedGroupByExpressions, selectExpressions, query) +} else { + val groupByExpressions = +ctx.groupingExpressionsWithGroupingAnalytics.asScala + .map(groupByExpr => { +val expr = groupByExpr.expression() +val groupingAnalytics = groupByExpr.groupingAnalytics() +(expr, groupingAnalytics) match { + case (_: ExpressionContext, null) => +expression(expr) + case (null, _: GroupingAnalyticsContext) => +val selectedGroupByExprs = groupingAnalytics.groupingSet().asScala +.map(_.expression.asScala.map(e => expression(e)).toSeq) +if (groupingAnalytics.GROUPING() != null) { + GroupingSets(selectedGroupByExprs, selectedGroupByExprs.flatten.distinct) +} else if (groupingAnalytics.CUBE != null) { + // CUBE(A, B, (A, B), ()) is not supported. + if (selectedGroupByExprs.exists(_.isEmpty)) { +throw new ParseException("Empty set in CUBE grouping sets is not supported.", + groupingAnalytics) + } + Cube(selectedGroupByExprs) +} else if (groupingAnalytics.ROLLUP != null) { + // ROLLUP(A, B, (A, B), ()) is not supported. + if (selectedGroupByExprs.exists(_.isEmpty)) { +throw new ParseException("Empty set in ROLLUP grouping sets is not supported.", + groupingAnalytics) + } + Rollup(selectedGroupByExprs) +} else { + throw new IllegalArgumentException( +"Grouping Analytics only support CUBE/ROLLUP/GROUPING SETS.") +} + case (_, _) => +throw new IllegalArgumentException("Each GROUP BY expression should be" + Review comment: Change to use if else block. 
## File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala ## @@ -850,29 +850,70 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg } /** - * Add an [[Aggregate]] or [[GroupingSets]] to a logical plan. + * Add an [[Aggregate]] to a logical plan. */ private def withAggregationClause( ctx: AggregationClauseContext, selectExpressions: Seq[NamedExpression], query: LogicalPlan): LogicalPlan = withOrigin(ctx) { -val groupByExpressions = expressionList(ctx.groupingExpressions) - -if (ctx.GROUPING != null) { - // GROUP BY GROUPING SETS (...) - val selectedGroupByExprs = -ctx.groupingSet.asScala.map(_.expression.asScala.map(e =>
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r546239214 ## File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala ## @@ -850,29 +850,70 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg } /** - * Add an [[Aggregate]] or [[GroupingSets]] to a logical plan. + * Add an [[Aggregate]] to a logical plan. */ private def withAggregationClause( ctx: AggregationClauseContext, selectExpressions: Seq[NamedExpression], query: LogicalPlan): LogicalPlan = withOrigin(ctx) { -val groupByExpressions = expressionList(ctx.groupingExpressions) - -if (ctx.GROUPING != null) { - // GROUP BY GROUPING SETS (...) - val selectedGroupByExprs = -ctx.groupingSet.asScala.map(_.expression.asScala.map(e => expression(e)).toSeq) - GroupingSets(selectedGroupByExprs.toSeq, groupByExpressions, query, selectExpressions) -} else { - // GROUP BY (WITH CUBE | WITH ROLLUP)? - val mappedGroupByExpressions = if (ctx.CUBE != null) { -Seq(Cube(groupByExpressions)) - } else if (ctx.ROLLUP != null) { -Seq(Rollup(groupByExpressions)) +if (ctx.groupingExpressionsWithGroupingAnalytics.isEmpty) { + val groupByExpressions = expressionList(ctx.groupingExpressions) + if (ctx.GROUPING != null) { +// GROUP BY GROUPING SETS (...) +val selectedGroupByExprs = + ctx.groupingSet.asScala.map(_.expression.asScala.map(e => expression(e)).toSeq) +Aggregate(Seq(GroupingSets(selectedGroupByExprs, groupByExpressions)), + selectExpressions, query) } else { -groupByExpressions +// GROUP BY (WITH CUBE | WITH ROLLUP)? 
+val mappedGroupByExpressions = if (ctx.CUBE != null) { + Seq(Cube(groupByExpressions.map(Seq(_ +} else if (ctx.ROLLUP != null) { + Seq(Rollup(groupByExpressions.map(Seq(_ +} else { + groupByExpressions +} +Aggregate(mappedGroupByExpressions, selectExpressions, query) } - Aggregate(mappedGroupByExpressions, selectExpressions, query) +} else { + val groupByExpressions = +ctx.groupingExpressionsWithGroupingAnalytics.asScala + .map(groupByExpr => { +val expr = groupByExpr.expression() +val groupingAnalytics = groupByExpr.groupingAnalytics() +(expr, groupingAnalytics) match { + case (_: ExpressionContext, null) => +expression(expr) + case (null, _: GroupingAnalyticsContext) => +val selectedGroupByExprs = groupingAnalytics.groupingSet().asScala +.map(_.expression.asScala.map(e => expression(e)).toSeq) +if (groupingAnalytics.GROUPING() != null) { + GroupingSets(selectedGroupByExprs, selectedGroupByExprs.flatten.distinct) +} else if (groupingAnalytics.CUBE != null) { + // CUBE(A, B, (A, B), ()) is not supported. + if (selectedGroupByExprs.exists(_.isEmpty)) { +throw new ParseException("Empty set in CUBE grouping sets is not supported.", + groupingAnalytics) + } + Cube(selectedGroupByExprs) +} else if (groupingAnalytics.ROLLUP != null) { + // ROLLUP(A, B, (A, B), ()) is not supported. + if (selectedGroupByExprs.exists(_.isEmpty)) { +throw new ParseException("Empty set in ROLLUP grouping sets is not supported.", + groupingAnalytics) + } + Rollup(selectedGroupByExprs) +} else { + throw new IllegalArgumentException( +"Grouping Analytics only support CUBE/ROLLUP/GROUPING SETS.") +} + case (_, _) => +throw new IllegalArgumentException("Each GROUP BY expression should be" + Review comment: > ditto: We can reach this code path? Won't, I remember `match` should have `case` block for all case? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r546239097 ## File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala ## @@ -850,29 +850,70 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg } /** - * Add an [[Aggregate]] or [[GroupingSets]] to a logical plan. + * Add an [[Aggregate]] to a logical plan. */ private def withAggregationClause( ctx: AggregationClauseContext, selectExpressions: Seq[NamedExpression], query: LogicalPlan): LogicalPlan = withOrigin(ctx) { -val groupByExpressions = expressionList(ctx.groupingExpressions) - -if (ctx.GROUPING != null) { - // GROUP BY GROUPING SETS (...) - val selectedGroupByExprs = -ctx.groupingSet.asScala.map(_.expression.asScala.map(e => expression(e)).toSeq) - GroupingSets(selectedGroupByExprs.toSeq, groupByExpressions, query, selectExpressions) -} else { - // GROUP BY (WITH CUBE | WITH ROLLUP)? - val mappedGroupByExpressions = if (ctx.CUBE != null) { -Seq(Cube(groupByExpressions)) - } else if (ctx.ROLLUP != null) { -Seq(Rollup(groupByExpressions)) +if (ctx.groupingExpressionsWithGroupingAnalytics.isEmpty) { + val groupByExpressions = expressionList(ctx.groupingExpressions) + if (ctx.GROUPING != null) { +// GROUP BY GROUPING SETS (...) +val selectedGroupByExprs = + ctx.groupingSet.asScala.map(_.expression.asScala.map(e => expression(e)).toSeq) +Aggregate(Seq(GroupingSets(selectedGroupByExprs, groupByExpressions)), + selectExpressions, query) } else { -groupByExpressions +// GROUP BY (WITH CUBE | WITH ROLLUP)? 
+val mappedGroupByExpressions = if (ctx.CUBE != null) { + Seq(Cube(groupByExpressions.map(Seq(_ +} else if (ctx.ROLLUP != null) { + Seq(Rollup(groupByExpressions.map(Seq(_ +} else { + groupByExpressions +} +Aggregate(mappedGroupByExpressions, selectExpressions, query) } - Aggregate(mappedGroupByExpressions, selectExpressions, query) +} else { + val groupByExpressions = +ctx.groupingExpressionsWithGroupingAnalytics.asScala + .map(groupByExpr => { +val expr = groupByExpr.expression() +val groupingAnalytics = groupByExpr.groupingAnalytics() +(expr, groupingAnalytics) match { + case (_: ExpressionContext, null) => +expression(expr) + case (null, _: GroupingAnalyticsContext) => +val selectedGroupByExprs = groupingAnalytics.groupingSet().asScala +.map(_.expression.asScala.map(e => expression(e)).toSeq) +if (groupingAnalytics.GROUPING() != null) { + GroupingSets(selectedGroupByExprs, selectedGroupByExprs.flatten.distinct) +} else if (groupingAnalytics.CUBE != null) { + // CUBE(A, B, (A, B), ()) is not supported. + if (selectedGroupByExprs.exists(_.isEmpty)) { +throw new ParseException("Empty set in CUBE grouping sets is not supported.", + groupingAnalytics) + } + Cube(selectedGroupByExprs) +} else if (groupingAnalytics.ROLLUP != null) { + // ROLLUP(A, B, (A, B), ()) is not supported. + if (selectedGroupByExprs.exists(_.isEmpty)) { +throw new ParseException("Empty set in ROLLUP grouping sets is not supported.", + groupingAnalytics) + } + Rollup(selectedGroupByExprs) +} else { + throw new IllegalArgumentException( +"Grouping Analytics only support CUBE/ROLLUP/GROUPING SETS.") Review comment: > We can reach this code path? Won't, just remove this `else` block? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r542277956 ## File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala ## @@ -2027,65 +1952,76 @@ class Analyzer(override val catalogManager: CatalogManager) */ object ResolveFunctions extends Rule[LogicalPlan] { val trimWarningEnabled = new AtomicBoolean(true) + +def resolveFunction(): PartialFunction[Expression, Expression] = { + case u if !u.childrenResolved => u // Skip until children are resolved. + case u: UnresolvedAttribute if resolver(u.name, VirtualColumn.hiveGroupingIdName) => +withPosition(u) { + Alias(GroupingID(Nil), VirtualColumn.hiveGroupingIdName)() +} + case u @ UnresolvedGenerator(name, children) => +withPosition(u) { + v1SessionCatalog.lookupFunction(name, children) match { +case generator: Generator => generator +case other => + failAnalysis(s"$name is expected to be a generator. However, " + +s"its class is ${other.getClass.getCanonicalName}, which is not a generator.") + } +} + case u @ UnresolvedFunction(funcId, arguments, isDistinct, filter) => +withPosition(u) { + v1SessionCatalog.lookupFunction(funcId, arguments) match { +// AggregateWindowFunctions are AggregateFunctions that can only be evaluated within +// the context of a Window clause. They do not need to be wrapped in an +// AggregateExpression. +case wf: AggregateWindowFunction => + if (isDistinct || filter.isDefined) { +failAnalysis("DISTINCT or FILTER specified, " + + s"but ${wf.prettyName} is not an aggregate function") + } else { +wf + } +// We get an aggregate function, we need to wrap it in an AggregateExpression. 
+case agg: AggregateFunction => + if (filter.isDefined && !filter.get.deterministic) { +failAnalysis("FILTER expression is non-deterministic, " + + "it cannot be used in aggregate functions") + } + AggregateExpression(agg, Complete, isDistinct, filter) +// This function is not an aggregate function, just return the resolved one. +case other if (isDistinct || filter.isDefined) => + failAnalysis("DISTINCT or FILTER specified, " + +s"but ${other.prettyName} is not an aggregate function") +case e: String2TrimExpression if arguments.size == 2 => + if (trimWarningEnabled.get) { +logWarning("Two-parameter TRIM/LTRIM/RTRIM function signatures are deprecated." + + " Use SQL syntax `TRIM((BOTH | LEADING | TRAILING)? trimStr FROM str)`" + + " instead.") +trimWarningEnabled.set(false) + } + e +case other => + other + } +} +} + def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperatorsUp { // Resolve functions with concrete relations from v2 catalog. case UnresolvedFunc(multipartIdent) => val funcIdent = parseSessionCatalogFunctionIdentifier(multipartIdent) ResolvedFunc(Identifier.of(funcIdent.database.toArray, funcIdent.funcName)) - case q: LogicalPlan => -q transformExpressions { - case u if !u.childrenResolved => u // Skip until children are resolved. - case u: UnresolvedAttribute if resolver(u.name, VirtualColumn.hiveGroupingIdName) => -withPosition(u) { - Alias(GroupingID(Nil), VirtualColumn.hiveGroupingIdName)() -} - case u @ UnresolvedGenerator(name, children) => -withPosition(u) { - v1SessionCatalog.lookupFunction(name, children) match { -case generator: Generator => generator -case other => - failAnalysis(s"$name is expected to be a generator. 
However, " + -s"its class is ${other.getClass.getCanonicalName}, which is not a generator.") - } -} - case u @ UnresolvedFunction(funcId, arguments, isDistinct, filter) => -withPosition(u) { - v1SessionCatalog.lookupFunction(funcId, arguments) match { -// AggregateWindowFunctions are AggregateFunctions that can only be evaluated within -// the context of a Window clause. They do not need to be wrapped in an -// AggregateExpression. -case wf: AggregateWindowFunction => - if (isDistinct || filter.isDefined) { -failAnalysis("DISTINCT or FILTER specified, " + - s"but ${wf.prettyName} is not an
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r542257009 ## File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala ## @@ -2027,65 +1952,76 @@ class Analyzer(override val catalogManager: CatalogManager) */ object ResolveFunctions extends Rule[LogicalPlan] { val trimWarningEnabled = new AtomicBoolean(true) + +def resolveFunction(): PartialFunction[Expression, Expression] = { + case u if !u.childrenResolved => u // Skip until children are resolved. + case u: UnresolvedAttribute if resolver(u.name, VirtualColumn.hiveGroupingIdName) => +withPosition(u) { + Alias(GroupingID(Nil), VirtualColumn.hiveGroupingIdName)() +} + case u @ UnresolvedGenerator(name, children) => +withPosition(u) { + v1SessionCatalog.lookupFunction(name, children) match { +case generator: Generator => generator +case other => + failAnalysis(s"$name is expected to be a generator. However, " + +s"its class is ${other.getClass.getCanonicalName}, which is not a generator.") + } +} + case u @ UnresolvedFunction(funcId, arguments, isDistinct, filter) => +withPosition(u) { + v1SessionCatalog.lookupFunction(funcId, arguments) match { +// AggregateWindowFunctions are AggregateFunctions that can only be evaluated within +// the context of a Window clause. They do not need to be wrapped in an +// AggregateExpression. +case wf: AggregateWindowFunction => + if (isDistinct || filter.isDefined) { +failAnalysis("DISTINCT or FILTER specified, " + + s"but ${wf.prettyName} is not an aggregate function") + } else { +wf + } +// We get an aggregate function, we need to wrap it in an AggregateExpression. 
+case agg: AggregateFunction => + if (filter.isDefined && !filter.get.deterministic) { +failAnalysis("FILTER expression is non-deterministic, " + + "it cannot be used in aggregate functions") + } + AggregateExpression(agg, Complete, isDistinct, filter) +// This function is not an aggregate function, just return the resolved one. +case other if (isDistinct || filter.isDefined) => + failAnalysis("DISTINCT or FILTER specified, " + +s"but ${other.prettyName} is not an aggregate function") +case e: String2TrimExpression if arguments.size == 2 => + if (trimWarningEnabled.get) { +logWarning("Two-parameter TRIM/LTRIM/RTRIM function signatures are deprecated." + + " Use SQL syntax `TRIM((BOTH | LEADING | TRAILING)? trimStr FROM str)`" + + " instead.") +trimWarningEnabled.set(false) + } + e +case other => + other + } +} +} + def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperatorsUp { // Resolve functions with concrete relations from v2 catalog. case UnresolvedFunc(multipartIdent) => val funcIdent = parseSessionCatalogFunctionIdentifier(multipartIdent) ResolvedFunc(Identifier.of(funcIdent.database.toArray, funcIdent.funcName)) - case q: LogicalPlan => -q transformExpressions { - case u if !u.childrenResolved => u // Skip until children are resolved. - case u: UnresolvedAttribute if resolver(u.name, VirtualColumn.hiveGroupingIdName) => -withPosition(u) { - Alias(GroupingID(Nil), VirtualColumn.hiveGroupingIdName)() -} - case u @ UnresolvedGenerator(name, children) => -withPosition(u) { - v1SessionCatalog.lookupFunction(name, children) match { -case generator: Generator => generator -case other => - failAnalysis(s"$name is expected to be a generator. 
However, " + -s"its class is ${other.getClass.getCanonicalName}, which is not a generator.") - } -} - case u @ UnresolvedFunction(funcId, arguments, isDistinct, filter) => -withPosition(u) { - v1SessionCatalog.lookupFunction(funcId, arguments) match { -// AggregateWindowFunctions are AggregateFunctions that can only be evaluated within -// the context of a Window clause. They do not need to be wrapped in an -// AggregateExpression. -case wf: AggregateWindowFunction => - if (isDistinct || filter.isDefined) { -failAnalysis("DISTINCT or FILTER specified, " + - s"but ${wf.prettyName} is not an
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r533438082 ## File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala ## @@ -2027,65 +1952,76 @@ class Analyzer(override val catalogManager: CatalogManager) */ object ResolveFunctions extends Rule[LogicalPlan] { val trimWarningEnabled = new AtomicBoolean(true) + +def resolveFunction(): PartialFunction[Expression, Expression] = { + case u if !u.childrenResolved => u // Skip until children are resolved. + case u: UnresolvedAttribute if resolver(u.name, VirtualColumn.hiveGroupingIdName) => +withPosition(u) { + Alias(GroupingID(Nil), VirtualColumn.hiveGroupingIdName)() +} + case u @ UnresolvedGenerator(name, children) => +withPosition(u) { + v1SessionCatalog.lookupFunction(name, children) match { +case generator: Generator => generator +case other => + failAnalysis(s"$name is expected to be a generator. However, " + +s"its class is ${other.getClass.getCanonicalName}, which is not a generator.") + } +} + case u @ UnresolvedFunction(funcId, arguments, isDistinct, filter) => +withPosition(u) { + v1SessionCatalog.lookupFunction(funcId, arguments) match { +// AggregateWindowFunctions are AggregateFunctions that can only be evaluated within +// the context of a Window clause. They do not need to be wrapped in an +// AggregateExpression. +case wf: AggregateWindowFunction => + if (isDistinct || filter.isDefined) { +failAnalysis("DISTINCT or FILTER specified, " + + s"but ${wf.prettyName} is not an aggregate function") + } else { +wf + } +// We get an aggregate function, we need to wrap it in an AggregateExpression. 
+case agg: AggregateFunction => + if (filter.isDefined && !filter.get.deterministic) { +failAnalysis("FILTER expression is non-deterministic, " + + "it cannot be used in aggregate functions") + } + AggregateExpression(agg, Complete, isDistinct, filter) +// This function is not an aggregate function, just return the resolved one. +case other if (isDistinct || filter.isDefined) => + failAnalysis("DISTINCT or FILTER specified, " + +s"but ${other.prettyName} is not an aggregate function") +case e: String2TrimExpression if arguments.size == 2 => + if (trimWarningEnabled.get) { +logWarning("Two-parameter TRIM/LTRIM/RTRIM function signatures are deprecated." + + " Use SQL syntax `TRIM((BOTH | LEADING | TRAILING)? trimStr FROM str)`" + + " instead.") +trimWarningEnabled.set(false) + } + e +case other => + other + } +} +} + def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperatorsUp { // Resolve functions with concrete relations from v2 catalog. case UnresolvedFunc(multipartIdent) => val funcIdent = parseSessionCatalogFunctionIdentifier(multipartIdent) ResolvedFunc(Identifier.of(funcIdent.database.toArray, funcIdent.funcName)) - case q: LogicalPlan => -q transformExpressions { - case u if !u.childrenResolved => u // Skip until children are resolved. - case u: UnresolvedAttribute if resolver(u.name, VirtualColumn.hiveGroupingIdName) => -withPosition(u) { - Alias(GroupingID(Nil), VirtualColumn.hiveGroupingIdName)() -} - case u @ UnresolvedGenerator(name, children) => -withPosition(u) { - v1SessionCatalog.lookupFunction(name, children) match { -case generator: Generator => generator -case other => - failAnalysis(s"$name is expected to be a generator. 
However, " + -s"its class is ${other.getClass.getCanonicalName}, which is not a generator.") - } -} - case u @ UnresolvedFunction(funcId, arguments, isDistinct, filter) => -withPosition(u) { - v1SessionCatalog.lookupFunction(funcId, arguments) match { -// AggregateWindowFunctions are AggregateFunctions that can only be evaluated within -// the context of a Window clause. They do not need to be wrapped in an -// AggregateExpression. -case wf: AggregateWindowFunction => - if (isDistinct || filter.isDefined) { -failAnalysis("DISTINCT or FILTER specified, " + - s"but ${wf.prettyName} is not an
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r533152103 ## File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala ## @@ -2027,65 +1952,76 @@ class Analyzer(override val catalogManager: CatalogManager) */ object ResolveFunctions extends Rule[LogicalPlan] { val trimWarningEnabled = new AtomicBoolean(true) + +def resolveFunction(): PartialFunction[Expression, Expression] = { + case u if !u.childrenResolved => u // Skip until children are resolved. + case u: UnresolvedAttribute if resolver(u.name, VirtualColumn.hiveGroupingIdName) => +withPosition(u) { + Alias(GroupingID(Nil), VirtualColumn.hiveGroupingIdName)() +} + case u @ UnresolvedGenerator(name, children) => +withPosition(u) { + v1SessionCatalog.lookupFunction(name, children) match { +case generator: Generator => generator +case other => + failAnalysis(s"$name is expected to be a generator. However, " + +s"its class is ${other.getClass.getCanonicalName}, which is not a generator.") + } +} + case u @ UnresolvedFunction(funcId, arguments, isDistinct, filter) => +withPosition(u) { + v1SessionCatalog.lookupFunction(funcId, arguments) match { +// AggregateWindowFunctions are AggregateFunctions that can only be evaluated within +// the context of a Window clause. They do not need to be wrapped in an +// AggregateExpression. +case wf: AggregateWindowFunction => + if (isDistinct || filter.isDefined) { +failAnalysis("DISTINCT or FILTER specified, " + + s"but ${wf.prettyName} is not an aggregate function") + } else { +wf + } +// We get an aggregate function, we need to wrap it in an AggregateExpression. 
+case agg: AggregateFunction => + if (filter.isDefined && !filter.get.deterministic) { +failAnalysis("FILTER expression is non-deterministic, " + + "it cannot be used in aggregate functions") + } + AggregateExpression(agg, Complete, isDistinct, filter) +// This function is not an aggregate function, just return the resolved one. +case other if (isDistinct || filter.isDefined) => + failAnalysis("DISTINCT or FILTER specified, " + +s"but ${other.prettyName} is not an aggregate function") +case e: String2TrimExpression if arguments.size == 2 => + if (trimWarningEnabled.get) { +logWarning("Two-parameter TRIM/LTRIM/RTRIM function signatures are deprecated." + + " Use SQL syntax `TRIM((BOTH | LEADING | TRAILING)? trimStr FROM str)`" + + " instead.") +trimWarningEnabled.set(false) + } + e +case other => + other + } +} +} + def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperatorsUp { // Resolve functions with concrete relations from v2 catalog. case UnresolvedFunc(multipartIdent) => val funcIdent = parseSessionCatalogFunctionIdentifier(multipartIdent) ResolvedFunc(Identifier.of(funcIdent.database.toArray, funcIdent.funcName)) - case q: LogicalPlan => -q transformExpressions { - case u if !u.childrenResolved => u // Skip until children are resolved. - case u: UnresolvedAttribute if resolver(u.name, VirtualColumn.hiveGroupingIdName) => -withPosition(u) { - Alias(GroupingID(Nil), VirtualColumn.hiveGroupingIdName)() -} - case u @ UnresolvedGenerator(name, children) => -withPosition(u) { - v1SessionCatalog.lookupFunction(name, children) match { -case generator: Generator => generator -case other => - failAnalysis(s"$name is expected to be a generator. 
However, " + -s"its class is ${other.getClass.getCanonicalName}, which is not a generator.") - } -} - case u @ UnresolvedFunction(funcId, arguments, isDistinct, filter) => -withPosition(u) { - v1SessionCatalog.lookupFunction(funcId, arguments) match { -// AggregateWindowFunctions are AggregateFunctions that can only be evaluated within -// the context of a Window clause. They do not need to be wrapped in an -// AggregateExpression. -case wf: AggregateWindowFunction => - if (isDistinct || filter.isDefined) { -failAnalysis("DISTINCT or FILTER specified, " + - s"but ${wf.prettyName} is not an
[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics
AngersZh commented on a change in pull request #30212: URL: https://github.com/apache/spark/pull/30212#discussion_r533087740 ## File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala ## @@ -2027,65 +1952,76 @@ class Analyzer(override val catalogManager: CatalogManager) */ object ResolveFunctions extends Rule[LogicalPlan] { val trimWarningEnabled = new AtomicBoolean(true) + +def resolveFunction(): PartialFunction[Expression, Expression] = { + case u if !u.childrenResolved => u // Skip until children are resolved. + case u: UnresolvedAttribute if resolver(u.name, VirtualColumn.hiveGroupingIdName) => +withPosition(u) { + Alias(GroupingID(Nil), VirtualColumn.hiveGroupingIdName)() +} + case u @ UnresolvedGenerator(name, children) => +withPosition(u) { + v1SessionCatalog.lookupFunction(name, children) match { +case generator: Generator => generator +case other => + failAnalysis(s"$name is expected to be a generator. However, " + +s"its class is ${other.getClass.getCanonicalName}, which is not a generator.") + } +} + case u @ UnresolvedFunction(funcId, arguments, isDistinct, filter) => +withPosition(u) { + v1SessionCatalog.lookupFunction(funcId, arguments) match { +// AggregateWindowFunctions are AggregateFunctions that can only be evaluated within +// the context of a Window clause. They do not need to be wrapped in an +// AggregateExpression. +case wf: AggregateWindowFunction => + if (isDistinct || filter.isDefined) { +failAnalysis("DISTINCT or FILTER specified, " + + s"but ${wf.prettyName} is not an aggregate function") + } else { +wf + } +// We get an aggregate function, we need to wrap it in an AggregateExpression. 
+case agg: AggregateFunction => + if (filter.isDefined && !filter.get.deterministic) { +failAnalysis("FILTER expression is non-deterministic, " + + "it cannot be used in aggregate functions") + } + AggregateExpression(agg, Complete, isDistinct, filter) +// This function is not an aggregate function, just return the resolved one. +case other if (isDistinct || filter.isDefined) => + failAnalysis("DISTINCT or FILTER specified, " + +s"but ${other.prettyName} is not an aggregate function") +case e: String2TrimExpression if arguments.size == 2 => + if (trimWarningEnabled.get) { +logWarning("Two-parameter TRIM/LTRIM/RTRIM function signatures are deprecated." + + " Use SQL syntax `TRIM((BOTH | LEADING | TRAILING)? trimStr FROM str)`" + + " instead.") +trimWarningEnabled.set(false) + } + e +case other => + other + } +} +} + def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperatorsUp { // Resolve functions with concrete relations from v2 catalog. case UnresolvedFunc(multipartIdent) => val funcIdent = parseSessionCatalogFunctionIdentifier(multipartIdent) ResolvedFunc(Identifier.of(funcIdent.database.toArray, funcIdent.funcName)) - case q: LogicalPlan => -q transformExpressions { - case u if !u.childrenResolved => u // Skip until children are resolved. - case u: UnresolvedAttribute if resolver(u.name, VirtualColumn.hiveGroupingIdName) => -withPosition(u) { - Alias(GroupingID(Nil), VirtualColumn.hiveGroupingIdName)() -} - case u @ UnresolvedGenerator(name, children) => -withPosition(u) { - v1SessionCatalog.lookupFunction(name, children) match { -case generator: Generator => generator -case other => - failAnalysis(s"$name is expected to be a generator. 
However, " + -s"its class is ${other.getClass.getCanonicalName}, which is not a generator.") - } -} - case u @ UnresolvedFunction(funcId, arguments, isDistinct, filter) => -withPosition(u) { - v1SessionCatalog.lookupFunction(funcId, arguments) match { -// AggregateWindowFunctions are AggregateFunctions that can only be evaluated within -// the context of a Window clause. They do not need to be wrapped in an -// AggregateExpression. -case wf: AggregateWindowFunction => - if (isDistinct || filter.isDefined) { -failAnalysis("DISTINCT or FILTER specified, " + - s"but ${wf.prettyName} is not an