dtenedor commented on code in PR #48413:
URL: https://github.com/apache/spark/pull/48413#discussion_r1797277273


##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala:
##########
@@ -1010,21 +1018,40 @@ class AstBuilder extends DataTypeAstBuilder
       // [EMPTY]
       query
     } else {
-      throw 
QueryParsingErrors.combinationQueryResultClausesUnsupportedError(ctx)
+      throw QueryParsingErrors.combinationQueryResultClausesUnsupportedError(
+        ctx, "ORDER BY/SORT BY/DISTRIBUTE BY/CLUSTER BY")
     }
 
     // WINDOWS
-    val withWindow = withOrder.optionalMap(windowClause)(withWindowClause)
+    val withWindow = withOrder.optionalMap(windowClause) {
+      withWindowClause
+    }
+    if (forPipeOperators && windowClause != null) {
+      throw QueryParsingErrors.combinationQueryResultClausesUnsupportedError(
+        ctx, s"WINDOW clauses within SQL pipe operators")

Review Comment:
   Sounds good, done.



##########
sql/core/src/test/resources/sql-tests/inputs/pipe-operators.sql:
##########
@@ -571,6 +583,95 @@ table t
 table t
 |> union all table st;
 
+-- Sorting and repartitioning operators: positive tests.
+--------------------------------------------------------
+
+-- Order by.
+table t
+|> order by x;
+
+-- Order by with a table subquery.
+(select * from t)
+|> order by x;
+
+-- Order by with a VALUES list.
+values (0, 'abc') tab(x, y)
+|> order by x;
+
+-- Limit.
+table t
+|> order by x
+|> limit 1;
+
+-- Limit with offset.
+table t
+|> where x = 1
+|> select y
+|> limit 2 offset 1;
+
+-- LIMIT ALL and OFFSET 0 are equivalent to no LIMIT or OFFSET clause, 
respectively.
+table t
+|> limit all offset 0;
+
+-- Distribute by.
+table t
+|> distribute by x;
+
+-- Cluster by.
+table t
+|> cluster by x;
+
+-- Sort and distribute by.
+table t
+|> sort by x distribute by x;
+
+-- It is possible to apply a final ORDER BY clause on the result of a query 
containing pipe
+-- operators.
+table t
+|> order by x desc
+order by y;
+
+-- Sorting and repartitioning operators: negative tests.
+--------------------------------------------------------
+
+-- Multiple order by clauses are not supported in the same pipe operator
+table t
+|> order by x desc order by x + y
+order by y;

Review Comment:
   I checked and it turns out we do.
   
   We need the extra "ORDER BY y" clause at the end in this test to show that
the "ORDER BY x + y" clause was consumed at the end of the final query, not as
part of the pipe operator.
   
   I left this information as a comment here.
   



##########
sql/core/src/test/resources/sql-tests/results/pipe-operators.sql.out:
##########
@@ -1673,6 +1691,279 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
 }
 
 
+-- !query
+table t
+|> order by x
+-- !query schema
+struct<x:int,y:string>
+-- !query output
+0      abc
+1      def
+
+
+-- !query
+(select * from t)
+|> order by x
+-- !query schema
+struct<x:int,y:string>
+-- !query output
+0      abc
+1      def
+
+
+-- !query
+values (0, 'abc') tab(x, y)
+|> order by x
+-- !query schema
+struct<x:int,y:string>
+-- !query output
+0      abc
+
+
+-- !query
+table t
+|> order by x
+|> limit 1
+-- !query schema
+struct<x:int,y:string>
+-- !query output
+0      abc
+
+
+-- !query
+table t
+|> where x = 1
+|> select y
+|> limit 2 offset 1
+-- !query schema
+struct<y:string>
+-- !query output
+
+
+
+-- !query
+table t
+|> limit all offset 0
+-- !query schema
+struct<x:int,y:string>
+-- !query output
+0      abc
+1      def
+
+
+-- !query
+table t
+|> distribute by x
+-- !query schema
+struct<x:int,y:string>
+-- !query output
+0      abc
+1      def
+
+
+-- !query
+table t
+|> cluster by x
+-- !query schema
+struct<x:int,y:string>
+-- !query output
+0      abc
+1      def
+
+
+-- !query
+table t
+|> sort by x distribute by x
+-- !query schema
+struct<x:int,y:string>
+-- !query output
+0      abc
+1      def
+
+
+-- !query
+table t
+|> order by x desc
+order by y
+-- !query schema
+struct<x:int,y:string>
+-- !query output
+0      abc
+1      def
+
+
+-- !query
+table t
+|> order by x desc order by x + y
+order by y
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.catalyst.parser.ParseException
+{
+  "errorClass" : "PARSE_SYNTAX_ERROR",
+  "sqlState" : "42601",
+  "messageParameters" : {
+    "error" : "'order'",
+    "hint" : ""
+  }
+}
+
+
+-- !query
+table t
+|> select 1 + 2 as result
+|> order by x
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.catalyst.ExtendedAnalysisException
+{
+  "errorClass" : "UNRESOLVED_COLUMN.WITH_SUGGESTION",
+  "sqlState" : "42703",
+  "messageParameters" : {
+    "objectName" : "`x`",
+    "proposal" : "`result`"
+  },
+  "queryContext" : [ {
+    "objectType" : "",
+    "objectName" : "",
+    "startIndex" : 47,
+    "stopIndex" : 47,
+    "fragment" : "x"
+  } ]
+}
+
+
+-- !query
+table t
+|> select 1 + 2 as result
+|> distribute by x
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.catalyst.ExtendedAnalysisException
+{
+  "errorClass" : "UNRESOLVED_COLUMN.WITH_SUGGESTION",
+  "sqlState" : "42703",
+  "messageParameters" : {
+    "objectName" : "`x`",
+    "proposal" : "`result`"
+  },
+  "queryContext" : [ {
+    "objectType" : "",
+    "objectName" : "",
+    "startIndex" : 52,
+    "stopIndex" : 52,
+    "fragment" : "x"
+  } ]
+}
+
+
+-- !query
+table t
+|> where x = 1
+|> select y
+|> offset 1

Review Comment:
   Surprisingly, no! It turns out you can use `OFFSET` without `LIMIT` in 
regular SQL queries too. I moved this case to the "positive tests" section.



##########
sql/core/src/test/resources/sql-tests/results/pipe-operators.sql.out:
##########
@@ -1673,6 +1691,279 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
 }
 
 
+-- !query
+table t
+|> order by x
+-- !query schema
+struct<x:int,y:string>
+-- !query output
+0      abc
+1      def
+
+
+-- !query
+(select * from t)
+|> order by x
+-- !query schema
+struct<x:int,y:string>
+-- !query output
+0      abc
+1      def
+
+
+-- !query
+values (0, 'abc') tab(x, y)
+|> order by x
+-- !query schema
+struct<x:int,y:string>
+-- !query output
+0      abc
+
+
+-- !query
+table t
+|> order by x
+|> limit 1
+-- !query schema
+struct<x:int,y:string>
+-- !query output
+0      abc
+
+
+-- !query
+table t
+|> where x = 1
+|> select y
+|> limit 2 offset 1
+-- !query schema
+struct<y:string>
+-- !query output
+
+
+
+-- !query
+table t
+|> limit all offset 0
+-- !query schema
+struct<x:int,y:string>
+-- !query output
+0      abc
+1      def
+
+
+-- !query
+table t
+|> distribute by x
+-- !query schema
+struct<x:int,y:string>
+-- !query output
+0      abc
+1      def
+
+
+-- !query
+table t
+|> cluster by x
+-- !query schema
+struct<x:int,y:string>
+-- !query output
+0      abc
+1      def
+
+
+-- !query
+table t
+|> sort by x distribute by x
+-- !query schema
+struct<x:int,y:string>
+-- !query output
+0      abc
+1      def
+
+
+-- !query
+table t
+|> order by x desc
+order by y
+-- !query schema
+struct<x:int,y:string>
+-- !query output
+0      abc
+1      def
+
+
+-- !query
+table t
+|> order by x desc order by x + y
+order by y
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.catalyst.parser.ParseException
+{
+  "errorClass" : "PARSE_SYNTAX_ERROR",
+  "sqlState" : "42601",
+  "messageParameters" : {
+    "error" : "'order'",
+    "hint" : ""
+  }
+}
+
+
+-- !query
+table t
+|> select 1 + 2 as result
+|> order by x
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.catalyst.ExtendedAnalysisException
+{
+  "errorClass" : "UNRESOLVED_COLUMN.WITH_SUGGESTION",
+  "sqlState" : "42703",
+  "messageParameters" : {
+    "objectName" : "`x`",
+    "proposal" : "`result`"
+  },
+  "queryContext" : [ {
+    "objectType" : "",
+    "objectName" : "",
+    "startIndex" : 47,
+    "stopIndex" : 47,
+    "fragment" : "x"
+  } ]
+}
+
+
+-- !query
+table t
+|> select 1 + 2 as result
+|> distribute by x
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.catalyst.ExtendedAnalysisException
+{
+  "errorClass" : "UNRESOLVED_COLUMN.WITH_SUGGESTION",
+  "sqlState" : "42703",
+  "messageParameters" : {
+    "objectName" : "`x`",
+    "proposal" : "`result`"
+  },
+  "queryContext" : [ {
+    "objectType" : "",
+    "objectName" : "",
+    "startIndex" : 52,
+    "stopIndex" : 52,
+    "fragment" : "x"
+  } ]
+}
+
+
+-- !query
+table t
+|> where x = 1
+|> select y
+|> offset 1
+-- !query schema
+struct<y:string>
+-- !query output
+
+
+
+-- !query
+table t
+|> order by x limit 1
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.catalyst.parser.ParseException
+{
+  "errorClass" : "UNSUPPORTED_FEATURE.COMBINATION_QUERY_RESULT_CLAUSES",
+  "sqlState" : "0A000",
+  "messageParameters" : {
+    "clauses" : "the ORDER BY and LIMIT clauses"
+  },
+  "queryContext" : [ {
+    "objectType" : "",
+    "objectName" : "",
+    "startIndex" : 12,
+    "stopIndex" : 29,
+    "fragment" : "order by x limit 1"
+  } ]
+}
+
+
+-- !query
+table t
+|> order by x sort by x
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.catalyst.parser.ParseException
+{
+  "errorClass" : "UNSUPPORTED_FEATURE.COMBINATION_QUERY_RESULT_CLAUSES",
+  "sqlState" : "0A000",
+  "messageParameters" : {
+    "clauses" : "ORDER BY/SORT BY/DISTRIBUTE BY/CLUSTER BY"

Review Comment:
   Sounds good, done.



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala:
##########
@@ -1010,21 +1018,40 @@ class AstBuilder extends DataTypeAstBuilder
       // [EMPTY]
       query
     } else {
-      throw 
QueryParsingErrors.combinationQueryResultClausesUnsupportedError(ctx)
+      throw QueryParsingErrors.combinationQueryResultClausesUnsupportedError(
+        ctx, "ORDER BY/SORT BY/DISTRIBUTE BY/CLUSTER BY")
     }
 
     // WINDOWS
-    val withWindow = withOrder.optionalMap(windowClause)(withWindowClause)
+    val withWindow = withOrder.optionalMap(windowClause) {
+      withWindowClause
+    }
+    if (forPipeOperators && windowClause != null) {
+      throw QueryParsingErrors.combinationQueryResultClausesUnsupportedError(
+        ctx, s"WINDOW clauses within SQL pipe operators")
+    }
 
     // OFFSET
     // - OFFSET 0 is the same as omitting the OFFSET clause
+    val offsetClause = "OFFSET"
     val withOffset = withWindow.optional(offset) {
+      if (forPipeOperators && clause.nonEmpty) {
+        throw QueryParsingErrors.combinationQueryResultClausesUnsupportedError(
+          ctx, s"the $clause and $offsetClause clauses")

Review Comment:
   Sounds good, done.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to