ConeyLiu commented on a change in pull request #24237: [SPARK-27319][SQL] Filter out dir based on PathFilter before listing them
URL: https://github.com/apache/spark/pull/24237#discussion_r270667095
 
 

 ##########
 File path: sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileIndexSuite.scala
 ##########
 @@ -354,6 +354,49 @@ class FileIndexSuite extends SharedSQLContext {
     }
   }
 
+  test("InMemoryFileIndex should filter out path based on PathFilter") {
+    withSQLConf("mapreduce.input.pathFilter.class" -> classOf[UserDefinedPathFilter].getName) {
+      withTempDir { dir =>
+        val file1 = new File(dir, "text.txt")
+        stringToFile(file1, "text")
+        val file2 = new File(dir, "filteredText.txt")
+        stringToFile(file2, "filtered file")
+        val subDir = new File(dir, "filteredDir")
+        subDir.mkdirs()
+        val file3 = new File(subDir, "text.txt")
+        stringToFile(file3, "text.txt")
+
+        val path = new Path(dir.getCanonicalPath)
+        val inMemoryFileIndex = new InMemoryFileIndex(spark, Seq(path), Map.empty, None) {
+          def leafFilePaths: Seq[Path] = leafFiles.keys.toSeq
+        }
+        val leafFiles = inMemoryFileIndex.leafFilePaths
+        assert(leafFiles.size === 1)
+        assert(leafFiles.head.toUri.getPath === file1.getCanonicalPath)
+      }
+    }
+  }
+
+  test("filter out dir based on PathFilter before list them") {
+    withSQLConf("mapreduce.input.pathFilter.class" -> classOf[UserDefinedPathFilter].getName) {
+      for ((scale, expectedNumPar, dirName) <- Seq((50, 1, "nofiltered"), (50, 1, "nonFiltered"))) {
+        withTempDir { dir =>
+          val subDir = new File(dir, dirName)
+          subDir.mkdirs()
+          (0 until scale).foreach { i =>
+            val file = new File(subDir, s"$i=${i}")
+            file.mkdir()
+          }
+          HiveCatalogMetrics.reset()
+          assert(HiveCatalogMetrics.METRIC_PARALLEL_LISTING_JOB_COUNT.getCount() == 0)
 
 Review comment:
   done

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


With regards,
Apache Git Services

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to