Github user gatorsmile commented on a diff in the pull request:
https://github.com/apache/spark/pull/21608#discussion_r200704192
--- Diff:
sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala
---
@@ -47,15 +48,26 @@ object CommandUtils extends Logging {
}
}
- def calculateTotalSize(sessionState: SessionState, catalogTable:
CatalogTable): BigInt = {
+ def calculateTotalSize(spark: SparkSession, catalogTable:
CatalogTable): BigInt = {
+
+ val sessionState = spark.sessionState
+ val stagingDir =
sessionState.conf.getConfString("hive.exec.stagingdir", ".hive-staging")
+
if (catalogTable.partitionColumnNames.isEmpty) {
- calculateLocationSize(sessionState, catalogTable.identifier,
catalogTable.storage.locationUri)
+ calculateLocationSize(sessionState, catalogTable.identifier,
+ catalogTable.storage.locationUri)
} else {
// Calculate table size as a sum of the visible partitions. See
SPARK-21079
val partitions =
sessionState.catalog.listPartitions(catalogTable.identifier)
- partitions.map { p =>
- calculateLocationSize(sessionState, catalogTable.identifier,
p.storage.locationUri)
- }.sum
+ val paths = partitions.map(x => new
Path(x.storage.locationUri.get.getPath))
+ val pathFilter = new PathFilter {
+ override def accept(path: Path): Boolean = {
+ !path.getName.startsWith(stagingDir)
+ }
+ }
+ val fileStatusSeq = InMemoryFileIndex.bulkListLeafFiles(paths,
+ sessionState.newHadoopConf(), pathFilter, spark).flatMap(x => x._2)
--- End diff --
```Scala
/**
 * Path filter that accepts a path only when its final name component starts with neither
 * the given staging-directory prefix nor an underscore.
 *
 * NOTE(review): mixed in with `Serializable`, presumably so an instance can be handed to a
 * distributed file listing -- confirm against the caller.
 *
 * @param stagingDir name prefix identifying staging directories that must be skipped
 */
class PathFilterIgnoreNonData(stagingDir: String) extends PathFilter with Serializable {

  /**
   * @param path the candidate path; only its last name component is inspected
   * @return `true` when the name starts with neither `stagingDir` nor "_"
   */
  override def accept(path: Path): Boolean = {
    val fileName = path.getName
    // Skip staging directories, and ignore metadata files starting with "_" (for example,
    // files created by DirectoryAtomicCommitProtocol) when computing the location size.
    !fileName.startsWith(stagingDir) && !fileName.startsWith("_")
  }
}
```
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]