pan3793 commented on code in PR #40920:
URL: https://github.com/apache/spark/pull/40920#discussion_r1175442719
##########
sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala:
##########
@@ -789,22 +789,22 @@ case class RepairTableCommand(
if (partitionSpecsAndLocs.length > threshold) {
val hadoopConf = spark.sessionState.newHadoopConf()
val serializableConfiguration = new SerializableConfiguration(hadoopConf)
- val serializedPaths = partitionSpecsAndLocs.map(_._2.toString).toArray
+ val locations = partitionSpecsAndLocs.map(_._2)
// Set the number of parallelism to prevent following file listing from generating many tasks
// in case of large #defaultParallelism.
- val numParallelism = Math.min(serializedPaths.length,
+ val numParallelism = Math.min(locations.length,
Math.min(spark.sparkContext.defaultParallelism, 10000))
// gather the fast stats for all the partitions otherwise Hive metastore will list all the
// files for all the new partitions in sequential way, which is super slow.
logInfo(s"Gather the fast stats in parallel using $numParallelism tasks.")
- spark.sparkContext.parallelize(serializedPaths, numParallelism)
- .mapPartitions { paths =>
+ spark.sparkContext.parallelize(locations, numParallelism)
+ .mapPartitions { locationsEachPartition =>
val pathFilter = getPathFilter(serializableConfiguration.value)
- paths.map(new Path(_)).map{ path =>
- val fs = path.getFileSystem(serializableConfiguration.value)
- val statuses = fs.listStatus(path, pathFilter)
- (path.toString, PartitionStatistics(statuses.length, statuses.map(_.getLen).sum))
+ locationsEachPartition.map { location =>
+ val fs = location.getFileSystem(serializableConfiguration.value)
+ val statuses = fs.listStatus(location, pathFilter)
+ (location.toString, PartitionStatistics(statuses.length, statuses.map(_.getLen).sum))
Review Comment:
Looks simpler, adopted.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]