pan3793 commented on code in PR #36995:
URL: https://github.com/apache/spark/pull/36995#discussion_r935925857
##########
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DistributionAndOrderingUtils.scala:
##########
@@ -17,22 +17,33 @@
package org.apache.spark.sql.execution.datasources.v2
-import org.apache.spark.sql.catalyst.expressions.Expression
+import org.apache.spark.sql.catalyst.analysis.{AnsiTypeCoercion, TypeCoercion}
+import org.apache.spark.sql.catalyst.expressions.{Expression, Literal,
SortOrder, TransformExpression, V2ExpressionUtils}
import org.apache.spark.sql.catalyst.expressions.V2ExpressionUtils._
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan,
RebalancePartitions, RepartitionByExpression, Sort}
+import org.apache.spark.sql.catalyst.rules.Rule
+import org.apache.spark.sql.connector.catalog.FunctionCatalog
+import org.apache.spark.sql.connector.catalog.functions.ScalarFunction
import org.apache.spark.sql.connector.distributions._
import org.apache.spark.sql.connector.write.{RequiresDistributionAndOrdering,
Write}
import org.apache.spark.sql.errors.QueryCompilationErrors
object DistributionAndOrderingUtils {
- def prepareQuery(write: Write, query: LogicalPlan): LogicalPlan = write
match {
+ def prepareQuery(
Review Comment:
Thanks for explaining, educated. After reading the code, I think you are
right that "the value of `col` is essentially hashed twice", but I don't think
it will bring correctness issues, because it still guarantees that the same
bucket values will be clustered into the same partition.
One example is Hive bucket. In `V1WritesUtils#getWriterBucketSpec`, both
`HiveHash` and `HashPartitioning#partitionIdExpression` can be used to
construct `bucketIdExpression`.
##########
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DistributionAndOrderingUtils.scala:
##########
@@ -17,22 +17,33 @@
package org.apache.spark.sql.execution.datasources.v2
-import org.apache.spark.sql.catalyst.expressions.Expression
+import org.apache.spark.sql.catalyst.analysis.{AnsiTypeCoercion, TypeCoercion}
+import org.apache.spark.sql.catalyst.expressions.{Expression, Literal,
SortOrder, TransformExpression, V2ExpressionUtils}
import org.apache.spark.sql.catalyst.expressions.V2ExpressionUtils._
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan,
RebalancePartitions, RepartitionByExpression, Sort}
+import org.apache.spark.sql.catalyst.rules.Rule
+import org.apache.spark.sql.connector.catalog.FunctionCatalog
+import org.apache.spark.sql.connector.catalog.functions.ScalarFunction
import org.apache.spark.sql.connector.distributions._
import org.apache.spark.sql.connector.write.{RequiresDistributionAndOrdering,
Write}
import org.apache.spark.sql.errors.QueryCompilationErrors
object DistributionAndOrderingUtils {
- def prepareQuery(write: Write, query: LogicalPlan): LogicalPlan = write
match {
+ def prepareQuery(
Review Comment:
Thanks for explaining, educated. After reading the code, I think you are
right that "the value of `col` is essentially hashed twice", but I don't think
it will bring correctness issues, because it still guarantees that the same
values will be clustered into the same partition.
One example is Hive bucket. In `V1WritesUtils#getWriterBucketSpec`, both
`HiveHash` and `HashPartitioning#partitionIdExpression` can be used to
construct `bucketIdExpression`.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]