[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics

GitBox Sat, 28 Nov 2020 22:40:15 -0800


AngersZhuuuu commented on a change in pull request #30212:
URL: https://github.com/apache/spark/pull/30212#discussion_r532164002




##########
File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/grouping.scala
##########
@@ -39,45 +41,64 @@ trait GroupingSet extends Expression with CodegenFallback {
   override def eval(input: InternalRow): Any = throw new 
UnsupportedOperationException
 }
 
-// scalastyle:off line.size.limit line.contains.tab
-@ExpressionDescription(
-  usage = """
-    _FUNC_([col1[, col2 ..]]) - create a multi-dimensional cube using the 
specified columns
-      so that we can run aggregation on them.
-  """,
-  examples = """
-    Examples:
-      > SELECT name, age, count(*) FROM VALUES (2, 'Alice'), (5, 'Bob') 
people(age, name) GROUP BY _FUNC_(name, age);
-        Bob    5       1
-        Alice  2       1
-        Alice  NULL    1
-        NULL   2       1
-        NULL   NULL    2
-        Bob    NULL    1
-        NULL   5       1
-  """,
-  since = "2.0.0")
-// scalastyle:on line.size.limit line.contains.tab
-case class Cube(groupByExprs: Seq[Expression]) extends GroupingSet {}
+object GroupingSet {
+  /*
+   *  GROUP BY a, b, c WITH ROLLUP

Review comment:
       > (I know this is not your mistake though) could you remove the single 
leading space?
   
   Done

##########
File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/grouping.scala
##########
@@ -39,45 +41,64 @@ trait GroupingSet extends Expression with CodegenFallback {
   override def eval(input: InternalRow): Any = throw new 
UnsupportedOperationException
 }
 
-// scalastyle:off line.size.limit line.contains.tab
-@ExpressionDescription(
-  usage = """
-    _FUNC_([col1[, col2 ..]]) - create a multi-dimensional cube using the 
specified columns
-      so that we can run aggregation on them.
-  """,
-  examples = """
-    Examples:
-      > SELECT name, age, count(*) FROM VALUES (2, 'Alice'), (5, 'Bob') 
people(age, name) GROUP BY _FUNC_(name, age);
-        Bob    5       1
-        Alice  2       1
-        Alice  NULL    1
-        NULL   2       1
-        NULL   NULL    2
-        Bob    NULL    1
-        NULL   5       1
-  """,
-  since = "2.0.0")
-// scalastyle:on line.size.limit line.contains.tab
-case class Cube(groupByExprs: Seq[Expression]) extends GroupingSet {}
+object GroupingSet {
+  /*
+   *  GROUP BY a, b, c WITH ROLLUP
+   *  is equivalent to
+   *  GROUP BY a, b, c GROUPING SETS ( (a, b, c), (a, b), (a), ( ) ).
+   *  Group Count: N + 1 (N is the number of group expressions)
+   *
+   *  We need to get all of its subsets for the rule described above, the 
subset is
+   *  represented as sequence of expressions.
+   */
+  def rollupExprs(exprs: Seq[Seq[Expression]]): Seq[Seq[Expression]] =
+    exprs.inits.map(_.flatten).toIndexedSeq
 
-// scalastyle:off line.size.limit line.contains.tab
-@ExpressionDescription(
-  usage = """
-    _FUNC_([col1[, col2 ..]]) - create a multi-dimensional rollup using the 
specified columns
-      so that we can run aggregation on them.
-  """,
-  examples = """
-    Examples:
-      > SELECT name, age, count(*) FROM VALUES (2, 'Alice'), (5, 'Bob') 
people(age, name) GROUP BY _FUNC_(name, age);
-        Bob    5       1
-        Alice  2       1
-        Alice  NULL    1
-        NULL   NULL    2
-        Bob    NULL    1
-  """,
-  since = "2.0.0")
-// scalastyle:on line.size.limit line.contains.tab
-case class Rollup(groupByExprs: Seq[Expression]) extends GroupingSet {}
+  /*
+   *  GROUP BY a, b, c WITH CUBE

Review comment:
       Done

##########
File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/grouping.scala
##########
@@ -39,45 +41,64 @@ trait GroupingSet extends Expression with CodegenFallback {
   override def eval(input: InternalRow): Any = throw new 
UnsupportedOperationException
 }
 
-// scalastyle:off line.size.limit line.contains.tab
-@ExpressionDescription(
-  usage = """
-    _FUNC_([col1[, col2 ..]]) - create a multi-dimensional cube using the 
specified columns
-      so that we can run aggregation on them.
-  """,
-  examples = """
-    Examples:
-      > SELECT name, age, count(*) FROM VALUES (2, 'Alice'), (5, 'Bob') 
people(age, name) GROUP BY _FUNC_(name, age);
-        Bob    5       1
-        Alice  2       1
-        Alice  NULL    1
-        NULL   2       1
-        NULL   NULL    2
-        Bob    NULL    1
-        NULL   5       1
-  """,
-  since = "2.0.0")
-// scalastyle:on line.size.limit line.contains.tab
-case class Cube(groupByExprs: Seq[Expression]) extends GroupingSet {}
+object GroupingSet {
+  /*
+   *  GROUP BY a, b, c WITH ROLLUP
+   *  is equivalent to
+   *  GROUP BY a, b, c GROUPING SETS ( (a, b, c), (a, b), (a), ( ) ).
+   *  Group Count: N + 1 (N is the number of group expressions)
+   *
+   *  We need to get all of its subsets for the rule described above, the 
subset is
+   *  represented as sequence of expressions.
+   */
+  def rollupExprs(exprs: Seq[Seq[Expression]]): Seq[Seq[Expression]] =
+    exprs.inits.map(_.flatten).toIndexedSeq
 
-// scalastyle:off line.size.limit line.contains.tab
-@ExpressionDescription(
-  usage = """
-    _FUNC_([col1[, col2 ..]]) - create a multi-dimensional rollup using the 
specified columns
-      so that we can run aggregation on them.
-  """,
-  examples = """
-    Examples:
-      > SELECT name, age, count(*) FROM VALUES (2, 'Alice'), (5, 'Bob') 
people(age, name) GROUP BY _FUNC_(name, age);
-        Bob    5       1
-        Alice  2       1
-        Alice  NULL    1
-        NULL   NULL    2
-        Bob    NULL    1
-  """,
-  since = "2.0.0")
-// scalastyle:on line.size.limit line.contains.tab
-case class Rollup(groupByExprs: Seq[Expression]) extends GroupingSet {}
+  /*
+   *  GROUP BY a, b, c WITH CUBE
+   *  is equivalent to
+   *  GROUP BY a, b, c GROUPING SETS ( (a, b, c), (a, b), (b, c), (a, c), (a), 
(b), (c), ( ) ).
+   *  Group Count: 2 ^ N (N is the number of group expressions)
+   *
+   *  We need to get all of its subsets for a given GROUPBY expression, the 
subsets are
+   *  represented as sequence of expressions.
+   */
+  def cubeExprs(exprs: Seq[Seq[Expression]]): Seq[Seq[Expression]] = {
+    // `cubeExprs0` is recursive and returns a lazy Stream. Here we call 
`toIndexedSeq` to
+    // materialize it and avoid serialization problems later on.
+    cubeExprs0(exprs).toIndexedSeq
+  }
+
+  def cubeExprs0(exprs: Seq[Seq[Expression]]): Seq[Seq[Expression]] = 
exprs.toList match {
+    case x :: xs =>
+      val initial = cubeExprs0(xs)
+      initial.map(x ++ _) ++ initial
+    case Nil =>
+      Seq(Seq.empty)
+  }
+}
+
+case class Cube(groupingSets: Seq[Seq[Expression]]) extends GroupingSet {
+  import GroupingSet._
+  override def groupByExprs: Seq[Expression] = 
groupingSets.flatMap(_.distinct).distinct
+
+  override def selectedGroupByExprs: Seq[Seq[Expression]] = 
cubeExprs(groupingSets)
+}
+
+case class Rollup(groupingSets: Seq[Seq[Expression]]) extends GroupingSet {
+  import GroupingSet._
+  override def groupByExprs: Seq[Expression] = 
groupingSets.flatMap(_.distinct).distinct
+
+  override def selectedGroupByExprs: Seq[Seq[Expression]] = 
rollupExprs(groupingSets)

Review comment:
       Done




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]



---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] [spark] AngersZhuuuu commented on a change in pull request #30212: [SPARK-33308][SQL] Refactor current grouping analytics

Reply via email to