beliefer commented on code in PR #43463:
URL: https://github.com/apache/spark/pull/43463#discussion_r1366851439


##########
sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/DataSourceSuite.scala:
##########
@@ -25,6 +27,7 @@ import org.apache.spark.sql.AnalysisException
 import org.apache.spark.sql.test.SharedSparkSession
 
 class DataSourceSuite extends SharedSparkSession with PrivateMethodTester {
+

Review Comment:
   Please restore this line.



##########
sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/DataSourceSuite.scala:
##########
@@ -156,6 +159,39 @@ class DataSourceSuite extends SharedSparkSession with 
PrivateMethodTester {
     val expectMessage = "No FileSystem for scheme nonexistentFs"
     assert(message.filterNot(Set(':', '"').contains) == expectMessage)
   }
+
+  test("SPARK-39910: test Hadoop archive non glob paths") {
+    val absoluteHarPaths = buildFullHarPaths(allRelativeHarPaths)
+
+    val resultPaths = DataSource.checkAndGlobPathIfNecessary(
+      absoluteHarPaths.map(_.toString),
+      hadoopConf,
+      checkEmptyGlobPath = true,
+      checkFilesExist = true,
+      enableGlobbing = true
+    )
+
+    assert(
+      resultPaths.toSet === absoluteHarPaths.toSet
+    )

Review Comment:
   ```suggestion
       assert(resultPaths.toSet === absoluteHarPaths.toSet)
   ```



##########
sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/DataSourceSuite.scala:
##########
@@ -156,6 +159,39 @@ class DataSourceSuite extends SharedSparkSession with 
PrivateMethodTester {
     val expectMessage = "No FileSystem for scheme nonexistentFs"
     assert(message.filterNot(Set(':', '"').contains) == expectMessage)
   }
+
+  test("SPARK-39910: test Hadoop archive non glob paths") {
+    val absoluteHarPaths = buildFullHarPaths(allRelativeHarPaths)
+
+    val resultPaths = DataSource.checkAndGlobPathIfNecessary(
+      absoluteHarPaths.map(_.toString),
+      hadoopConf,
+      checkEmptyGlobPath = true,
+      checkFilesExist = true,
+      enableGlobbing = true
+    )
+
+    assert(
+      resultPaths.toSet === absoluteHarPaths.toSet
+    )
+  }
+
+  test("SPARK-39910: test Hadoop archive glob paths") {
+    val harGlobePaths = buildFullHarPaths(Seq(globeRelativeHarPath))
+
+    val resultPaths = DataSource.checkAndGlobPathIfNecessary(
+      harGlobePaths.map(_.toString),
+      hadoopConf,
+      checkEmptyGlobPath = true,
+      checkFilesExist = true,
+      enableGlobbing = true
+    )
+
+    val expectedHarPaths = buildFullHarPaths(allRelativeHarPaths)
+    assert(
+      resultPaths.toSet === expectedHarPaths.toSet
+    )

Review Comment:
   ditto.



##########
sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/DataSourceSuite.scala:
##########
@@ -197,11 +233,35 @@ object TestPaths {
       )
   )
 
+  val txtRelativeHarPath = new Path("/test.txt")
+  val csvRelativeHarPath = new Path("/test.csv")
+  val jsonRelativeHarPath = new Path("/test.json")
+  val parquetRelativeHarPath = new Path("/test.parquet")
+  val orcRelativeHarPath = new Path("/test.orc")
+  val globeRelativeHarPath = new Path("/test.*")

Review Comment:
   Do we really need to test all of the file formats?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to