Github user maropu commented on a diff in the pull request:
https://github.com/apache/spark/pull/21608#discussion_r201544879
--- Diff:
sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala
---
@@ -47,15 +48,29 @@ object CommandUtils extends Logging {
}
}
- def calculateTotalSize(sessionState: SessionState, catalogTable:
CatalogTable): BigInt = {
+ private case class SerializablePathFilter(stagingDir: String)
+ extends PathFilter with Serializable {
+ override def accept(path: Path): Boolean = {
+ val fileName = path.getName
+ (!fileName.startsWith(stagingDir) &&
+ // Ignore metadata files starting with "_"
+ !fileName.startsWith("_"))
+ }
+ }
+
+ def calculateTotalSize(spark: SparkSession, catalogTable: CatalogTable):
BigInt = {
+ val sessionState = spark.sessionState
if (catalogTable.partitionColumnNames.isEmpty) {
calculateLocationSize(sessionState, catalogTable.identifier,
catalogTable.storage.locationUri)
} else {
// Calculate table size as a sum of the visible partitions. See
SPARK-21079
val partitions =
sessionState.catalog.listPartitions(catalogTable.identifier)
- partitions.map { p =>
- calculateLocationSize(sessionState, catalogTable.identifier,
p.storage.locationUri)
- }.sum
+ val paths = partitions.map(x => new Path(x.storage.locationUri.get))
+ val stagingDir =
sessionState.conf.getConfString("hive.exec.stagingdir", ".hive-staging")
--- End diff --
Since `SerializablePathFilter` is only used here, how about defining it as
an anonymous class?
```
val stagingDir =
sessionState.conf.getConfString("hive.exec.stagingdir", ".hive-staging")
val pathFilter = new PathFilter with Serializable {
override def accept(path: Path): Boolean = ...
}
val fileStatusSeq = InMemoryFileIndex.bulkListLeafFiles(paths,
sessionState.newHadoopConf(), pathFilter,
spark).flatMap(_._2)
```
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]