GitHub user barrenlake commented on a diff in the pull request:
https://github.com/apache/spark/pull/17176#discussion_r154575331
--- Diff: sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala ---
@@ -159,36 +159,11 @@ class HadoopTableReader(
   def verifyPartitionPath(
       partitionToDeserializer: Map[HivePartition, Class[_ <: Deserializer]]):
       Map[HivePartition, Class[_ <: Deserializer]] = {
-    if (!sparkSession.sessionState.conf.verifyPartitionPath) {
-      partitionToDeserializer
-    } else {
-      var existPathSet = collection.mutable.Set[String]()
-      var pathPatternSet = collection.mutable.Set[String]()
-      partitionToDeserializer.filter {
-        case (partition, partDeserializer) =>
-          def updateExistPathSetByPathPattern(pathPatternStr: String) {
-            val pathPattern = new Path(pathPatternStr)
-            val fs = pathPattern.getFileSystem(hadoopConf)
-            val matches = fs.globStatus(pathPattern)
-            matches.foreach(fileStatus => existPathSet += fileStatus.getPath.toString)
-          }
-          // convert /demo/data/year/month/day to /demo/data/*/*/*/
-          def getPathPatternByPath(parNum: Int, tempPath: Path): String = {
-            var path = tempPath
-            for (i <- (1 to parNum)) path = path.getParent
-            val tails = (1 to parNum).map(_ => "*").mkString("/", "/", "/")
-            path.toString + tails
-          }
-
-          val partPath = partition.getDataLocation
-          val partNum = Utilities.getPartitionDesc(partition).getPartSpec.size();
-          var pathPatternStr = getPathPatternByPath(partNum, partPath)
-          if (!pathPatternSet.contains(pathPatternStr)) {
-            pathPatternSet += pathPatternStr
-            updateExistPathSetByPathPattern(pathPatternStr)
-          }
-          existPathSet.contains(partPath.toString)
-      }
+    partitionToDeserializer.filter {
+      case (partition, partDeserializer) =>
+        val partPath = partition.getDataLocation
+        val fs = partPath.getFileSystem(hadoopConf)
+        fs.exists(partPath)
--- End diff ---
Sending an RPC request to the NameNode for each partition can result in poor performance on tables with many partitions; the removed code avoided this by issuing a single globStatus call per distinct path pattern rather than one lookup per partition.
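To make the concern concrete, here is a minimal sketch of the trade-off, reusing the hypothetical /demo/data/year/month/day layout from the removed comment (the object name and paths are illustrative, not the PR's code): the new check issues one exists() RPC per partition, while the removed code resolved all partitions sharing a pattern with a single globStatus() call.

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileStatus, Path}

object PartitionCheckSketch {
  def main(args: Array[String]): Unit = {
    val hadoopConf = new Configuration()
    // Hypothetical partition locations for a table partitioned by year/month/day.
    val partPaths = Seq(
      new Path("/demo/data/2017/01/01"),
      new Path("/demo/data/2017/01/02"),
      new Path("/demo/data/2017/01/03"))

    // Approach in this PR: one exists() call, i.e. one NameNode RPC, per partition.
    val perPartition = partPaths.filter(p => p.getFileSystem(hadoopConf).exists(p))

    // Removed approach: one globStatus() call per shared pattern resolves all
    // matching partition directories in a single client call.
    val pattern = new Path("/demo/data/*/*/*")
    val fs = pattern.getFileSystem(hadoopConf)
    // globStatus may return null when a non-glob path does not exist.
    val existing = Option(fs.globStatus(pattern))
      .getOrElse(Array.empty[FileStatus])
      .map(_.getPath.toString)
      .toSet
    val batched = partPaths.filter(p => existing.contains(p.toString))

    println(s"per-partition found ${perPartition.size}, batched found ${batched.size}")
  }
}

On HDFS both operations hit the NameNode, so for a table with tens of thousands of partitions the per-partition variant scales linearly in client round trips, while the glob variant stays at one call per distinct path pattern.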