GitHub user cloud-fan commented on a diff in the pull request:
https://github.com/apache/spark/pull/16898#discussion_r208098556
--- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatWriter.scala ---
@@ -119,23 +130,45 @@ object FileFormatWriter extends Logging {
      uuid = UUID.randomUUID().toString,
      serializableHadoopConf = new SerializableConfiguration(job.getConfiguration),
      outputWriterFactory = outputWriterFactory,
-      allColumns = queryExecution.logical.output,
-      partitionColumns = partitionColumns,
+      allColumns = allColumns,
      dataColumns = dataColumns,
-      bucketSpec = bucketSpec,
+      partitionColumns = partitionColumns,
+      bucketIdExpression = bucketIdExpression,
      path = outputSpec.outputPath,
      customPartitionLocations = outputSpec.customPartitionLocations,
      maxRecordsPerFile = options.get("maxRecordsPerFile").map(_.toLong)
        .getOrElse(sparkSession.sessionState.conf.maxRecordsPerFile)
    )
+    // We should first sort by partition columns, then bucket id, and finally sorting columns.
+    val requiredOrdering = partitionColumns ++ bucketIdExpression ++ sortColumns
+    // the sort order doesn't matter
+    val actualOrdering = queryExecution.executedPlan.outputOrdering.map(_.child)
--- End diff ---
That would be great, but may need some refactoring.
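
For context on what such a check could look like: below is a minimal sketch of comparing the required ordering against the child plan's actual ordering, so that a sort is only added when the plan does not already provide it. The prefix comparison and the use of Catalyst's Expression.semanticEquals are illustrative assumptions, not necessarily the final shape of this PR.

// Sketch only: requiredOrdering and actualOrdering are the Seq[Expression]
// values built in the diff above; semanticEquals compares expressions
// semantically, ignoring cosmetic differences such as qualifiers.
val orderingMatched =
  if (requiredOrdering.length > actualOrdering.length) {
    // The child plan orders by fewer expressions than required,
    // so a sort must be inserted regardless.
    false
  } else {
    // A matching prefix is sufficient: extra trailing ordering on the
    // child does not break the required partition/bucket grouping.
    requiredOrdering.zip(actualOrdering).forall {
      case (required, actual) => required.semanticEquals(actual)
    }
  }

If orderingMatched is false, the plan could then be wrapped in a sort on requiredOrdering before rows are handed to the output writers; again, this is a sketch rather than the PR's exact code.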