srowen closed pull request #23288: [SPARK-26339][SQL]Throws better exception
when reading files that start with underscore
URL: https://github.com/apache/spark/pull/23288
This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:
As this is a foreign pull request (from a fork), the diff is supplied
below (it would not otherwise be visible, since GitHub hides fork diffs after merge):
diff --git
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala
index 795a6d0b6b040..f14b5a3706126 100644
---
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala
+++
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala
@@ -542,7 +542,7 @@ case class DataSource(
checkFilesExist: Boolean): Seq[Path] = {
val allPaths = caseInsensitiveOptions.get("path") ++ paths
val hadoopConf = sparkSession.sessionState.newHadoopConf()
- allPaths.flatMap { path =>
+ val allGlobPath = allPaths.flatMap { path =>
val hdfsPath = new Path(path)
val fs = hdfsPath.getFileSystem(hadoopConf)
val qualified = hdfsPath.makeQualified(fs.getUri, fs.getWorkingDirectory)
@@ -559,6 +559,21 @@ case class DataSource(
}
globPath
}.toSeq
+
+ val (filteredOut, filteredIn) = allGlobPath.partition { path =>
+ InMemoryFileIndex.shouldFilterOut(path.getName)
+ }
+ if (filteredOut.nonEmpty) {
+ if (filteredIn.isEmpty) {
+ throw new AnalysisException(
+ s"All paths were ignored:\n${filteredOut.mkString("\n ")}")
+ } else {
+ logDebug(
+ s"Some paths were ignored:\n${filteredOut.mkString("\n ")}")
+ }
+ }
+
+ allGlobPath
}
}
diff --git a/sql/core/src/test/resources/test-data/_cars.csv
b/sql/core/src/test/resources/test-data/_cars.csv
new file mode 100644
index 0000000000000..40ded573ade5c
--- /dev/null
+++ b/sql/core/src/test/resources/test-data/_cars.csv
@@ -0,0 +1,7 @@
+
+year,make,model,comment,blank
+"2012","Tesla","S","No comment",
+
+1997,Ford,E350,"Go get one now they are going fast",
+2015,Chevy,Volt
+
diff --git
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
index 3b977d74053e6..f318f1996568c 100644
---
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
+++
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
@@ -53,6 +53,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with
SQLTestUtils with Te
private val carsEmptyValueFile = "test-data/cars-empty-value.csv"
private val carsBlankColName = "test-data/cars-blank-column-name.csv"
private val carsCrlf = "test-data/cars-crlf.csv"
+ private val carsFilteredOutFile = "test-data/_cars.csv"
private val emptyFile = "test-data/empty.csv"
private val commentsFile = "test-data/comments.csv"
private val disableCommentsFile = "test-data/disable_comments.csv"
@@ -345,6 +346,25 @@ class CSVSuite extends QueryTest with SharedSQLContext
with SQLTestUtils with Te
assert(result.schema.fieldNames.size === 1)
}
+ test("SPARK-26339 Not throw an exception if some of specified paths are
filtered in") {
+ val cars = spark
+ .read
+ .option("header", "false")
+ .csv(testFile(carsFile), testFile(carsFilteredOutFile))
+
+ verifyCars(cars, withHeader = false, checkTypes = false)
+ }
+
+ test("SPARK-26339 Throw an exception only if all of the specified paths are
filtered out") {
+ val e = intercept[AnalysisException] {
+ val cars = spark
+ .read
+ .option("header", "false")
+ .csv(testFile(carsFilteredOutFile))
+ }.getMessage
+ assert(e.contains("All paths were ignored:"))
+ }
+
test("DDL test with empty file") {
withView("carsTable") {
spark.sql(
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
With regards,
Apache Git Services
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]