sunchao commented on code in PR #36995:
URL: https://github.com/apache/spark/pull/36995#discussion_r932515356
##########
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DistributionAndOrderingUtils.scala:
##########
@@ -17,22 +17,33 @@
package org.apache.spark.sql.execution.datasources.v2
-import org.apache.spark.sql.catalyst.expressions.Expression
+import org.apache.spark.sql.catalyst.analysis.{AnsiTypeCoercion, TypeCoercion}
+import org.apache.spark.sql.catalyst.expressions.{Expression, Literal, SortOrder, TransformExpression, V2ExpressionUtils}
import org.apache.spark.sql.catalyst.expressions.V2ExpressionUtils._
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, RebalancePartitions, RepartitionByExpression, Sort}
+import org.apache.spark.sql.catalyst.rules.Rule
+import org.apache.spark.sql.connector.catalog.FunctionCatalog
+import org.apache.spark.sql.connector.catalog.functions.ScalarFunction
import org.apache.spark.sql.connector.distributions._
import org.apache.spark.sql.connector.write.{RequiresDistributionAndOrdering, Write}
import org.apache.spark.sql.errors.QueryCompilationErrors
object DistributionAndOrderingUtils {
- def prepareQuery(write: Write, query: LogicalPlan): LogicalPlan = write match {
+ def prepareQuery(
Review Comment:
Hmm, I wonder how the write works with transforms such as bucket. For
example, suppose the required distribution is `bucket(col, 100)`. Spark
currently computes the partition (bucket) ID as `murmur_hash(bucket(col,
100)) pmod 100`, so the value of `col` is essentially hashed twice. I'm not
sure whether this breaks any assumptions on the V2 data source side, or
whether it has any effect on the hash key distribution.
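To make the concern concrete, here is a minimal, self-contained sketch of
the double hashing (the `bucket` stand-in and the hash calls are
illustrative assumptions, not Spark's actual implementations):

```scala
import scala.util.hashing.MurmurHash3

object DoubleHashSketch {
  // Hypothetical stand-in for a V2 source's bucket transform:
  // a non-negative hash of the value modulo the bucket count.
  def bucket(value: String, numBuckets: Int): Int =
    Math.floorMod(MurmurHash3.stringHash(value), numBuckets)

  def main(args: Array[String]): Unit = {
    val numBuckets = 100
    val col = "some-key"

    // Bucket id the data source presumably expects rows to land in.
    val sourceBucket = bucket(col, numBuckets)

    // What Spark's hash partitioning would do on top of the clustering
    // expression: hash the bucket id again and take pmod, i.e.
    // murmur_hash(bucket(col, 100)) pmod 100.
    val sparkPartition =
      Math.floorMod(MurmurHash3.productHash(Tuple1(sourceBucket)), numBuckets)

    // Rows sharing a source bucket still share a Spark partition, but the
    // partition id no longer equals the bucket id: `col` is hashed twice.
    println(s"source bucket = $sourceBucket, Spark partition = $sparkPartition")
  }
}
```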
##########
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DistributionAndOrderingUtils.scala:
##########
@@ -17,22 +17,33 @@
package org.apache.spark.sql.execution.datasources.v2
-import org.apache.spark.sql.catalyst.expressions.Expression
+import org.apache.spark.sql.catalyst.analysis.{AnsiTypeCoercion, TypeCoercion}
+import org.apache.spark.sql.catalyst.expressions.{Expression, Literal, SortOrder, TransformExpression, V2ExpressionUtils}
import org.apache.spark.sql.catalyst.expressions.V2ExpressionUtils._
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, RebalancePartitions, RepartitionByExpression, Sort}
+import org.apache.spark.sql.catalyst.rules.Rule
+import org.apache.spark.sql.connector.catalog.FunctionCatalog
+import org.apache.spark.sql.connector.catalog.functions.ScalarFunction
import org.apache.spark.sql.connector.distributions._
import org.apache.spark.sql.connector.write.{RequiresDistributionAndOrdering, Write}
import org.apache.spark.sql.errors.QueryCompilationErrors
object DistributionAndOrderingUtils {
- def prepareQuery(write: Write, query: LogicalPlan): LogicalPlan = write match {
+ def prepareQuery(
+ write: Write,
+ query: LogicalPlan,
+ funCatalogOpt: Option[FunctionCatalog]): LogicalPlan = write match {
case write: RequiresDistributionAndOrdering =>
val numPartitions = write.requiredNumPartitions()
val distribution = write.requiredDistribution match {
- case d: OrderedDistribution => toCatalystOrdering(d.ordering(), query)
- case d: ClusteredDistribution => d.clustering.map(e => toCatalyst(e, query)).toSeq
+ case d: OrderedDistribution =>
+ toCatalystOrdering(d.ordering(), query, funCatalogOpt)
+ .map(ur => resolveTransformExpression(ur).asInstanceOf[SortOrder])
Review Comment:
nit: why is the variable named `ur`? Maybe change it to `e`?
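For concreteness, the renamed line would read (a sketch of the nit only):

```scala
.map(e => resolveTransformExpression(e).asInstanceOf[SortOrder])
```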