This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 7ba70f02b54 [SPARK-40765][SQL] Optimize redundant fs operation in
`CommandUtils#calculateSingleLocationSize#getPathSize` method
7ba70f02b54 is described below
commit 7ba70f02b5417364985af7bbfdcde6ebeca84357
Author: yangjie01 <[email protected]>
AuthorDate: Thu Oct 13 11:21:22 2022 +0900
[SPARK-40765][SQL] Optimize redundant fs operation in
`CommandUtils#calculateSingleLocationSize#getPathSize` method
### What changes were proposed in this pull request?
This PR changes the second parameter of `getPathSize` from `Path` to `FileStatus`
to avoid a redundant `fs.getFileStatus(path)` call in each recursive invocation.
### Why are the changes needed?
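This removes one DFS operation per recursive call: `fs.listStatus` already returns the `FileStatus` of each child, so it does not need to be fetched again.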
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
Passes GitHub Actions.
Closes #38214 from LuciferYang/opt-getPathSize.
Authored-by: yangjie01 <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
.../org/apache/spark/sql/execution/command/CommandUtils.scala | 11 +++++------
1 file changed, 5 insertions(+), 6 deletions(-)
diff --git
a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala
b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala
index 41f60bfa2ff..6883f93523b 100644
---
a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala
+++
b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala
@@ -22,7 +22,7 @@ import java.net.URI
import scala.collection.mutable
import scala.util.control.NonFatal
-import org.apache.hadoop.fs.{FileSystem, Path, PathFilter}
+import org.apache.hadoop.fs.{FileStatus, FileSystem, Path, PathFilter}
import org.apache.spark.internal.Logging
import org.apache.spark.sql.SparkSession
@@ -113,13 +113,12 @@ object CommandUtils extends Logging {
// countFileSize to count the table size.
val stagingDir = sessionState.conf.getConfString("hive.exec.stagingdir",
".hive-staging")
- def getPathSize(fs: FileSystem, path: Path): Long = {
- val fileStatus = fs.getFileStatus(path)
+ def getPathSize(fs: FileSystem, fileStatus: FileStatus): Long = {
val size = if (fileStatus.isDirectory) {
- fs.listStatus(path)
+ fs.listStatus(fileStatus.getPath)
.map { status =>
if (isDataPath(status.getPath, stagingDir)) {
- getPathSize(fs, status.getPath)
+ getPathSize(fs, status)
} else {
0L
}
@@ -136,7 +135,7 @@ object CommandUtils extends Logging {
val path = new Path(p)
try {
val fs = path.getFileSystem(sessionState.newHadoopConf())
- getPathSize(fs, path)
+ getPathSize(fs, fs.getFileStatus(path))
} catch {
case NonFatal(e) =>
logWarning(
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]