This is an automated email from the ASF dual-hosted git repository.
wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 08675b169270 [SPARK-50630][SQL] Fix GROUP BY ordinal support for pipe
SQL AGGREGATE operators
08675b169270 is described below
commit 08675b16927080e8c305483327db8da9564a5bc5
Author: Daniel Tenedorio <[email protected]>
AuthorDate: Mon Dec 23 12:53:59 2024 +0800
[SPARK-50630][SQL] Fix GROUP BY ordinal support for pipe SQL AGGREGATE
operators
### What changes were proposed in this pull request?
This PR fixes GROUP BY ordinal support for pipe SQL AGGREGATE operators.
It adds a new `UnresolvedPipeAggregateOrdinal` expression to represent
these ordinals. In this context, the ordinal refers to the one-based position
of the column in the input relation. Note that this behavior is different from
GROUP BY ordinals in regular SQL, wherein the ordinal refers to the one-based
position of the column in the SELECT clause instead.
For example:
```
select 3 as x, 4 as y, 5 as z
|> aggregate sum(y) group by 2, 3
> 4, 5, 4
select 3 as x, 4 as y, 5 as z
|> aggregate sum(y) group by 1, 2, 3
> 3, 4, 5, 4
```
This PR also makes a small fix for `|> UNION` (and other set operations) so that
subsequent pipe operators apply to the result of the entire union, rather than binding
to the right leg of the union only (to allay reported confusion
during testing). For example, `values (0, 1) s(x, y) |> union all values (2, 3)
t(x, y) |> drop x` will succeed rather than report an error that the number of
columns does not match.
### Why are the changes needed?
The current implementation has a bug wherein GROUP BY ordinals in pipe SQL
AGGREGATE clauses are sometimes mistakenly retained as literal integers instead of
being resolved to the corresponding columns of the input relation.
### Does this PR introduce _any_ user-facing change?
Yes, see above.
### How was this patch tested?
This PR adds new golden file based test coverage.
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #49248 from dtenedor/group-by-ordinals-pipe-aggregate.
Authored-by: Daniel Tenedorio <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
---
.../spark/sql/catalyst/parser/SqlBaseParser.g4 | 2 +-
.../spark/sql/catalyst/analysis/Analyzer.scala | 26 ++-
.../spark/sql/catalyst/analysis/unresolved.scala | 22 +++
.../spark/sql/catalyst/parser/AstBuilder.scala | 33 +++-
.../analyzer-results/pipe-operators.sql.out | 200 ++++++++++++++++++--
.../resources/sql-tests/inputs/pipe-operators.sql | 54 +++++-
.../sql-tests/results/pipe-operators.sql.out | 205 +++++++++++++++++++--
7 files changed, 496 insertions(+), 46 deletions(-)
diff --git
a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4
b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4
index 8ef7ab90c6ff..e743aa2a744f 100644
---
a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4
+++
b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4
@@ -1523,7 +1523,7 @@ operatorPipeRightSide
| unpivotClause pivotClause?
| sample
| joinRelation
- | operator=(UNION | EXCEPT | SETMINUS | INTERSECT) setQuantifier?
right=queryTerm
+ | operator=(UNION | EXCEPT | SETMINUS | INTERSECT) setQuantifier?
right=queryPrimary
| queryOrganization
| AGGREGATE namedExpressionSeq? aggregationClause?
;
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
index 35ae0125d141..e8839148f51b 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
@@ -1887,10 +1887,14 @@ class Analyzer(override val catalogManager:
CatalogManager) extends RuleExecutor
// Replace the index with the corresponding expression in
aggregateExpressions. The index is
// a 1-base position of aggregateExpressions, which is output columns
(select expression)
- case Aggregate(groups, aggs, child, hint) if aggs.forall(_.resolved) &&
+ case Aggregate(groups, aggs, child, hint)
+ if aggs
+ .filter(!containUnresolvedPipeAggregateOrdinal(_))
+ .forall(_.resolved) &&
groups.exists(containUnresolvedOrdinal) =>
- val newGroups = groups.map(resolveGroupByExpressionOrdinal(_, aggs))
- Aggregate(newGroups, aggs, child, hint)
+ val newAggs = aggs.map(resolvePipeAggregateExpressionOrdinal(_,
child.output))
+ val newGroups = groups.map(resolveGroupByExpressionOrdinal(_, newAggs))
+ Aggregate(newGroups, newAggs, child, hint)
}
private def containUnresolvedOrdinal(e: Expression): Boolean = e match {
@@ -1899,6 +1903,11 @@ class Analyzer(override val catalogManager:
CatalogManager) extends RuleExecutor
case _ => false
}
+ private def containUnresolvedPipeAggregateOrdinal(e: Expression): Boolean
= e match {
+ case UnresolvedAlias(_: UnresolvedPipeAggregateOrdinal, _) => true
+ case _ => false
+ }
+
private def resolveGroupByExpressionOrdinal(
expr: Expression,
aggs: Seq[Expression]): Expression = expr match {
@@ -1934,6 +1943,17 @@ class Analyzer(override val catalogManager:
CatalogManager) extends RuleExecutor
}
}
+ private def resolvePipeAggregateExpressionOrdinal(
+ expr: NamedExpression,
+ inputs: Seq[Attribute]): NamedExpression = expr match {
+ case UnresolvedAlias(UnresolvedPipeAggregateOrdinal(index), _) =>
+ // In this case, the user applied the SQL pipe aggregate operator ("|>
AGGREGATE") and used
+ // ordinals in its GROUP BY clause. This expression then refers to the
i-th attribute of the
+ // child operator (one-based). Here we resolve the ordinal to the
corresponding attribute.
+ inputs(index - 1)
+ case other =>
+ other
+ }
/**
* Checks whether a function identifier referenced by an
[[UnresolvedFunction]] is defined in the
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala
index b14f4be534a3..61b68b743a5c 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala
@@ -956,6 +956,28 @@ case class UnresolvedOrdinal(ordinal: Int)
final override val nodePatterns: Seq[TreePattern] = Seq(UNRESOLVED_ORDINAL)
}
+/**
+ * Represents an unresolved ordinal used in the GROUP BY clause of a SQL pipe
aggregate operator
+ * ("|> AGGREGATE").
+ *
+ * In this context, the ordinal refers to the one-based position of the column
in the input
+ * relation. Note that this behavior is different from GROUP BY ordinals in
regular SQL, wherein the
+ * ordinal refers to the one-based position of the column in the SELECT clause.
+ *
+ * For example:
+ * {{{
+ * values ('abc', 'def') tab(x, y)
+ * |> aggregate sum(x) group by 2
+ * }}}
+ * @param ordinal ordinal starts from 1, instead of 0
+ */
+case class UnresolvedPipeAggregateOrdinal(ordinal: Int)
+ extends LeafExpression with Unevaluable with NonSQLExpression {
+ override def dataType: DataType = throw new UnresolvedException("dataType")
+ override def nullable: Boolean = throw new UnresolvedException("nullable")
+ override lazy val resolved = false
+}
+
/**
* Represents unresolved having clause, the child for it can be Aggregate,
GroupingSets, Rollup
* and Cube. It is turned by the analyzer into a Filter.
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
index 64491264f3e9..f95c0d6556ba 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
@@ -6016,7 +6016,8 @@ class AstBuilder extends DataTypeAstBuilder
// analyzer behave as if we had added the corresponding SQL clause after a
table subquery
// containing the input plan.
def withSubqueryAlias(): LogicalPlan = left match {
- case _: SubqueryAlias | _: UnresolvedRelation | _: Join | _: Filter =>
+ case _: SubqueryAlias | _: UnresolvedRelation | _: Join | _: Filter |
+ _: GlobalLimit | _: LocalLimit | _: Offset | _: Sort =>
left
case _ =>
SubqueryAlias(SubqueryAlias.generateSubqueryName(), left)
@@ -6137,7 +6138,7 @@ class AstBuilder extends DataTypeAstBuilder
"The AGGREGATE clause requires a list of aggregate expressions " +
"or a list of grouping expressions, or both", ctx)
}
- // Visit each aggregate expression, and add a PipeAggregate expression on
top of it to generate
+ // Visit each aggregate expression, and add a [[PipeExpression]] on top of
it to generate
// clear error messages if the expression does not contain at least one
aggregate function.
val aggregateExpressions: Seq[NamedExpression] =
Option(ctx.namedExpressionSeq()).map { n: NamedExpressionSeqContext =>
@@ -6183,12 +6184,28 @@ class AstBuilder extends DataTypeAstBuilder
a.aggregateExpressions.foreach(visit)
// Prepend grouping keys to the list of aggregate functions, since
operator pipe AGGREGATE
// clause returns the GROUP BY expressions followed by the list of
aggregate functions.
- val namedGroupingExpressions: Seq[NamedExpression] =
- a.groupingExpressions.map {
- case n: NamedExpression => n
- case e: Expression => UnresolvedAlias(e, None)
- }
- a.copy(aggregateExpressions = namedGroupingExpressions ++
a.aggregateExpressions)
+ val newGroupingExpressions = ArrayBuffer.empty[Expression]
+ val newAggregateExpressions = ArrayBuffer.empty[NamedExpression]
+ a.groupingExpressions.foreach {
+ case n: NamedExpression =>
+ newGroupingExpressions += n
+ newAggregateExpressions += n
+ // If the grouping expression is an integer literal, create
[[UnresolvedOrdinal]] and
+ // [[UnresolvedPipeAggregateOrdinal]] expressions to represent it
in the final grouping
+ // and aggregate expressions, respectively. This will let the
+ // [[ResolveOrdinalInOrderByAndGroupBy]] rule detect the ordinal
in the aggregate list
+ // and replace it with the corresponding attribute from the child
operator.
+ case Literal(v: Int, IntegerType) if conf.groupByOrdinal =>
+ newGroupingExpressions +=
UnresolvedOrdinal(newAggregateExpressions.length + 1)
+ newAggregateExpressions +=
UnresolvedAlias(UnresolvedPipeAggregateOrdinal(v), None)
+ case e: Expression =>
+ newGroupingExpressions += e
+ newAggregateExpressions += UnresolvedAlias(e, None)
+ }
+ newAggregateExpressions.appendAll(a.aggregateExpressions)
+ a.copy(
+ groupingExpressions = newGroupingExpressions.toSeq,
+ aggregateExpressions = newAggregateExpressions.toSeq)
}
}.getOrElse {
// This is a table aggregation with no grouping expressions.
diff --git
a/sql/core/src/test/resources/sql-tests/analyzer-results/pipe-operators.sql.out
b/sql/core/src/test/resources/sql-tests/analyzer-results/pipe-operators.sql.out
index 1121d8baf5db..70de582fb7b2 100644
---
a/sql/core/src/test/resources/sql-tests/analyzer-results/pipe-operators.sql.out
+++
b/sql/core/src/test/resources/sql-tests/analyzer-results/pipe-operators.sql.out
@@ -1539,6 +1539,78 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
}
+-- !query
+table t
+|> select x, length(y) as z
+|> limit 1000
+|> where x + length(y) < 4
+-- !query analysis
+org.apache.spark.sql.catalyst.ExtendedAnalysisException
+{
+ "errorClass" : "UNRESOLVED_COLUMN.WITH_SUGGESTION",
+ "sqlState" : "42703",
+ "messageParameters" : {
+ "objectName" : "`y`",
+ "proposal" : "`x`, `z`"
+ },
+ "queryContext" : [ {
+ "objectType" : "",
+ "objectName" : "",
+ "startIndex" : 71,
+ "stopIndex" : 71,
+ "fragment" : "y"
+ } ]
+}
+
+
+-- !query
+table t
+|> select x, length(y) as z
+|> limit 1000 offset 1
+|> where x + length(y) < 4
+-- !query analysis
+org.apache.spark.sql.catalyst.ExtendedAnalysisException
+{
+ "errorClass" : "UNRESOLVED_COLUMN.WITH_SUGGESTION",
+ "sqlState" : "42703",
+ "messageParameters" : {
+ "objectName" : "`y`",
+ "proposal" : "`x`, `z`"
+ },
+ "queryContext" : [ {
+ "objectType" : "",
+ "objectName" : "",
+ "startIndex" : 80,
+ "stopIndex" : 80,
+ "fragment" : "y"
+ } ]
+}
+
+
+-- !query
+table t
+|> select x, length(y) as z
+|> order by x, y
+|> where x + length(y) < 4
+-- !query analysis
+org.apache.spark.sql.catalyst.ExtendedAnalysisException
+{
+ "errorClass" : "UNRESOLVED_COLUMN.WITH_SUGGESTION",
+ "sqlState" : "42703",
+ "messageParameters" : {
+ "objectName" : "`y`",
+ "proposal" : "`x`, `z`"
+ },
+ "queryContext" : [ {
+ "objectType" : "",
+ "objectName" : "",
+ "startIndex" : 52,
+ "stopIndex" : 52,
+ "fragment" : "y"
+ } ]
+}
+
+
-- !query
(select x, sum(length(y)) as sum_len from t group by x)
|> where sum(length(y)) = 3
@@ -2697,21 +2769,34 @@ Union false, false
-- !query
-values (0, 1) tab(x, y)
+values (2, 'xyz') tab(x, y)
|> union table t
|> where x = 0
-- !query analysis
-Distinct
-+- Union false, false
- :- Project [x#x, cast(y#x as bigint) AS y#xL]
- : +- SubqueryAlias tab
- : +- LocalRelation [x#x, y#x]
- +- Project [x#x, cast(y#x as bigint) AS y#xL]
- +- Filter (x#x = 0)
+Filter (x#x = 0)
++- SubqueryAlias __auto_generated_subquery_name
+ +- Distinct
+ +- Union false, false
+ :- SubqueryAlias tab
+ : +- LocalRelation [x#x, y#x]
+- SubqueryAlias spark_catalog.default.t
+- Relation spark_catalog.default.t[x#x,y#x] csv
+-- !query
+values (2, 'xyz') tab(x, y)
+|> union table t
+|> drop x
+-- !query analysis
+Project [y#x]
++- Distinct
+ +- Union false, false
+ :- SubqueryAlias tab
+ : +- LocalRelation [x#x, y#x]
+ +- SubqueryAlias spark_catalog.default.t
+ +- Relation spark_catalog.default.t[x#x,y#x] csv
+
+
-- !query
(select * from t)
|> union all (select * from t)
@@ -2878,10 +2963,9 @@ table t
-- !query analysis
GlobalLimit 1
+- LocalLimit 1
- +- SubqueryAlias __auto_generated_subquery_name
- +- Sort [x#x ASC NULLS FIRST], true
- +- SubqueryAlias spark_catalog.default.t
- +- Relation spark_catalog.default.t[x#x,y#x] csv
+ +- Sort [x#x ASC NULLS FIRST], true
+ +- SubqueryAlias spark_catalog.default.t
+ +- Relation spark_catalog.default.t[x#x,y#x] csv
-- !query
@@ -3109,11 +3193,101 @@ Aggregate [x#x, y#x], [x#x, y#x]
select 3 as x, 4 as y
|> aggregate group by 1, 2
-- !query analysis
-Aggregate [1, 2], [1 AS 1#x, 2 AS 2#x]
+Aggregate [x#x, y#x], [x#x, y#x]
+- Project [3 AS x#x, 4 AS y#x]
+- OneRowRelation
+-- !query
+values (3, 4) as tab(x, y)
+|> aggregate sum(y) group by 1
+-- !query analysis
+Aggregate [x#x], [x#x, pipeexpression(sum(y#x), true, AGGREGATE) AS
pipeexpression(sum(y))#xL]
++- SubqueryAlias tab
+ +- LocalRelation [x#x, y#x]
+
+
+-- !query
+values (3, 4), (5, 4) as tab(x, y)
+|> aggregate sum(y) group by 1
+-- !query analysis
+Aggregate [x#x], [x#x, pipeexpression(sum(y#x), true, AGGREGATE) AS
pipeexpression(sum(y))#xL]
++- SubqueryAlias tab
+ +- LocalRelation [x#x, y#x]
+
+
+-- !query
+select 3 as x, 4 as y
+|> aggregate sum(y) group by 1, 1
+-- !query analysis
+Aggregate [x#x, x#x], [x#x, x#x, pipeexpression(sum(y#x), true, AGGREGATE) AS
pipeexpression(sum(y))#xL]
++- Project [3 AS x#x, 4 AS y#x]
+ +- OneRowRelation
+
+
+-- !query
+select 1 as `1`, 2 as `2`
+|> aggregate sum(`2`) group by `1`
+-- !query analysis
+Aggregate [1#x], [1#x, pipeexpression(sum(2#x), true, AGGREGATE) AS
pipeexpression(sum(2))#xL]
++- Project [1 AS 1#x, 2 AS 2#x]
+ +- OneRowRelation
+
+
+-- !query
+select 3 as x, 4 as y
+|> aggregate sum(y) group by 2
+-- !query analysis
+Aggregate [y#x], [y#x, pipeexpression(sum(y#x), true, AGGREGATE) AS
pipeexpression(sum(y))#xL]
++- Project [3 AS x#x, 4 AS y#x]
+ +- OneRowRelation
+
+
+-- !query
+select 3 as x, 4 as y, 5 as z
+|> aggregate sum(y) group by 2
+-- !query analysis
+Aggregate [y#x], [y#x, pipeexpression(sum(y#x), true, AGGREGATE) AS
pipeexpression(sum(y))#xL]
++- Project [3 AS x#x, 4 AS y#x, 5 AS z#x]
+ +- OneRowRelation
+
+
+-- !query
+select 3 as x, 4 as y, 5 as z
+|> aggregate sum(y) group by 3
+-- !query analysis
+Aggregate [z#x], [z#x, pipeexpression(sum(y#x), true, AGGREGATE) AS
pipeexpression(sum(y))#xL]
++- Project [3 AS x#x, 4 AS y#x, 5 AS z#x]
+ +- OneRowRelation
+
+
+-- !query
+select 3 as x, 4 as y, 5 as z
+|> aggregate sum(y) group by 2, 3
+-- !query analysis
+Aggregate [y#x, z#x], [y#x, z#x, pipeexpression(sum(y#x), true, AGGREGATE) AS
pipeexpression(sum(y))#xL]
++- Project [3 AS x#x, 4 AS y#x, 5 AS z#x]
+ +- OneRowRelation
+
+
+-- !query
+select 3 as x, 4 as y, 5 as z
+|> aggregate sum(y) group by 1, 2, 3
+-- !query analysis
+Aggregate [x#x, y#x, z#x], [x#x, y#x, z#x, pipeexpression(sum(y#x), true,
AGGREGATE) AS pipeexpression(sum(y))#xL]
++- Project [3 AS x#x, 4 AS y#x, 5 AS z#x]
+ +- OneRowRelation
+
+
+-- !query
+select 3 as x, 4 as y, 5 as z
+|> aggregate sum(y) group by x, 2, 3
+-- !query analysis
+Aggregate [x#x, y#x, z#x], [x#x, y#x, z#x, pipeexpression(sum(y#x), true,
AGGREGATE) AS pipeexpression(sum(y))#xL]
++- Project [3 AS x#x, 4 AS y#x, 5 AS z#x]
+ +- OneRowRelation
+
+
-- !query
table t
|> aggregate sum(x)
diff --git a/sql/core/src/test/resources/sql-tests/inputs/pipe-operators.sql
b/sql/core/src/test/resources/sql-tests/inputs/pipe-operators.sql
index 1299da3020d5..ec4afc6b2372 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/pipe-operators.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/pipe-operators.sql
@@ -542,6 +542,21 @@ table t
|> select x, length(y) as z
|> where x + length(y) < 4;
+table t
+|> select x, length(y) as z
+|> limit 1000
+|> where x + length(y) < 4;
+
+table t
+|> select x, length(y) as z
+|> limit 1000 offset 1
+|> where x + length(y) < 4;
+
+table t
+|> select x, length(y) as z
+|> order by x, y
+|> where x + length(y) < 4;
+
-- If the WHERE clause wants to filter rows produced by an aggregation, it is
not valid to try to
-- refer to the aggregate functions directly; it is necessary to use aliases
instead.
(select x, sum(length(y)) as sum_len from t group by x)
@@ -843,10 +858,17 @@ values (0, 'abc') tab(x, y)
|> union all table t;
-- Union distinct with a VALUES list.
-values (0, 1) tab(x, y)
+-- The |> WHERE operator applies to the result of the |> UNION operator, not
to the "table t" input.
+values (2, 'xyz') tab(x, y)
|> union table t
|> where x = 0;
+-- Union distinct with a VALUES list.
+-- The |> DROP operator applies to the result of the |> UNION operator, not to
the "table t" input.
+values (2, 'xyz') tab(x, y)
+|> union table t
+|> drop x;
+
-- Union all with a table subquery on both the source and target sides.
(select * from t)
|> union all (select * from t);
@@ -998,6 +1020,36 @@ select 1 as x, 2 as y
select 3 as x, 4 as y
|> aggregate group by 1, 2;
+values (3, 4) as tab(x, y)
+|> aggregate sum(y) group by 1;
+
+values (3, 4), (5, 4) as tab(x, y)
+|> aggregate sum(y) group by 1;
+
+select 3 as x, 4 as y
+|> aggregate sum(y) group by 1, 1;
+
+select 1 as `1`, 2 as `2`
+|> aggregate sum(`2`) group by `1`;
+
+select 3 as x, 4 as y
+|> aggregate sum(y) group by 2;
+
+select 3 as x, 4 as y, 5 as z
+|> aggregate sum(y) group by 2;
+
+select 3 as x, 4 as y, 5 as z
+|> aggregate sum(y) group by 3;
+
+select 3 as x, 4 as y, 5 as z
+|> aggregate sum(y) group by 2, 3;
+
+select 3 as x, 4 as y, 5 as z
+|> aggregate sum(y) group by 1, 2, 3;
+
+select 3 as x, 4 as y, 5 as z
+|> aggregate sum(y) group by x, 2, 3;
+
-- Basic table aggregation.
table t
|> aggregate sum(x);
diff --git
a/sql/core/src/test/resources/sql-tests/results/pipe-operators.sql.out
b/sql/core/src/test/resources/sql-tests/results/pipe-operators.sql.out
index cc603903712a..3dd212d889f9 100644
--- a/sql/core/src/test/resources/sql-tests/results/pipe-operators.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/pipe-operators.sql.out
@@ -1493,6 +1493,84 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
}
+-- !query
+table t
+|> select x, length(y) as z
+|> limit 1000
+|> where x + length(y) < 4
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.catalyst.ExtendedAnalysisException
+{
+ "errorClass" : "UNRESOLVED_COLUMN.WITH_SUGGESTION",
+ "sqlState" : "42703",
+ "messageParameters" : {
+ "objectName" : "`y`",
+ "proposal" : "`x`, `z`"
+ },
+ "queryContext" : [ {
+ "objectType" : "",
+ "objectName" : "",
+ "startIndex" : 71,
+ "stopIndex" : 71,
+ "fragment" : "y"
+ } ]
+}
+
+
+-- !query
+table t
+|> select x, length(y) as z
+|> limit 1000 offset 1
+|> where x + length(y) < 4
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.catalyst.ExtendedAnalysisException
+{
+ "errorClass" : "UNRESOLVED_COLUMN.WITH_SUGGESTION",
+ "sqlState" : "42703",
+ "messageParameters" : {
+ "objectName" : "`y`",
+ "proposal" : "`x`, `z`"
+ },
+ "queryContext" : [ {
+ "objectType" : "",
+ "objectName" : "",
+ "startIndex" : 80,
+ "stopIndex" : 80,
+ "fragment" : "y"
+ } ]
+}
+
+
+-- !query
+table t
+|> select x, length(y) as z
+|> order by x, y
+|> where x + length(y) < 4
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.catalyst.ExtendedAnalysisException
+{
+ "errorClass" : "UNRESOLVED_COLUMN.WITH_SUGGESTION",
+ "sqlState" : "42703",
+ "messageParameters" : {
+ "objectName" : "`y`",
+ "proposal" : "`x`, `z`"
+ },
+ "queryContext" : [ {
+ "objectType" : "",
+ "objectName" : "",
+ "startIndex" : 52,
+ "stopIndex" : 52,
+ "fragment" : "y"
+ } ]
+}
+
+
-- !query
(select x, sum(length(y)) as sum_len from t group by x)
|> where sum(length(y)) = 3
@@ -2362,29 +2440,25 @@ struct<x:int,y:string>
-- !query
-values (0, 1) tab(x, y)
+values (2, 'xyz') tab(x, y)
|> union table t
|> where x = 0
-- !query schema
-struct<>
+struct<x:int,y:string>
-- !query output
-org.apache.spark.SparkNumberFormatException
-{
- "errorClass" : "CAST_INVALID_INPUT",
- "sqlState" : "22018",
- "messageParameters" : {
- "expression" : "'abc'",
- "sourceType" : "\"STRING\"",
- "targetType" : "\"BIGINT\""
- },
- "queryContext" : [ {
- "objectType" : "",
- "objectName" : "",
- "startIndex" : 1,
- "stopIndex" : 55,
- "fragment" : "values (0, 1) tab(x, y)\n|> union table t\n|> where x = 0"
- } ]
-}
+0 abc
+
+
+-- !query
+values (2, 'xyz') tab(x, y)
+|> union table t
+|> drop x
+-- !query schema
+struct<y:string>
+-- !query output
+abc
+def
+xyz
-- !query
@@ -2779,11 +2853,102 @@ struct<x:int,y:int>
select 3 as x, 4 as y
|> aggregate group by 1, 2
-- !query schema
-struct<1:int,2:int>
+struct<x:int,y:int>
+-- !query output
+3 4
+
+
+-- !query
+values (3, 4) as tab(x, y)
+|> aggregate sum(y) group by 1
+-- !query schema
+struct<x:int,pipeexpression(sum(y)):bigint>
+-- !query output
+3 4
+
+
+-- !query
+values (3, 4), (5, 4) as tab(x, y)
+|> aggregate sum(y) group by 1
+-- !query schema
+struct<x:int,pipeexpression(sum(y)):bigint>
+-- !query output
+3 4
+5 4
+
+
+-- !query
+select 3 as x, 4 as y
+|> aggregate sum(y) group by 1, 1
+-- !query schema
+struct<x:int,x:int,pipeexpression(sum(y)):bigint>
+-- !query output
+3 3 4
+
+
+-- !query
+select 1 as `1`, 2 as `2`
+|> aggregate sum(`2`) group by `1`
+-- !query schema
+struct<1:int,pipeexpression(sum(2)):bigint>
-- !query output
1 2
+-- !query
+select 3 as x, 4 as y
+|> aggregate sum(y) group by 2
+-- !query schema
+struct<y:int,pipeexpression(sum(y)):bigint>
+-- !query output
+4 4
+
+
+-- !query
+select 3 as x, 4 as y, 5 as z
+|> aggregate sum(y) group by 2
+-- !query schema
+struct<y:int,pipeexpression(sum(y)):bigint>
+-- !query output
+4 4
+
+
+-- !query
+select 3 as x, 4 as y, 5 as z
+|> aggregate sum(y) group by 3
+-- !query schema
+struct<z:int,pipeexpression(sum(y)):bigint>
+-- !query output
+5 4
+
+
+-- !query
+select 3 as x, 4 as y, 5 as z
+|> aggregate sum(y) group by 2, 3
+-- !query schema
+struct<y:int,z:int,pipeexpression(sum(y)):bigint>
+-- !query output
+4 5 4
+
+
+-- !query
+select 3 as x, 4 as y, 5 as z
+|> aggregate sum(y) group by 1, 2, 3
+-- !query schema
+struct<x:int,y:int,z:int,pipeexpression(sum(y)):bigint>
+-- !query output
+3 4 5 4
+
+
+-- !query
+select 3 as x, 4 as y, 5 as z
+|> aggregate sum(y) group by x, 2, 3
+-- !query schema
+struct<x:int,y:int,z:int,pipeexpression(sum(y)):bigint>
+-- !query output
+3 4 5 4
+
+
-- !query
table t
|> aggregate sum(x)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]