Repository: spark
Updated Branches:
  refs/heads/master 6eb203fae -> 932bd09c8


[SPARK-22635][SQL][ORC] FileNotFoundException while reading ORC files containing special characters

## What changes were proposed in this pull request?

SPARK-22146 fixed the FileNotFoundException only for the `inferSchema` 
method, i.e. only for schema inference; it did not fix the problem when 
actually reading the data, so nearly the same exception was thrown when someone 
tried to use the data. This PR fixes the problem there as well.
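
The root cause is easiest to see in isolation: `PartitionedFile.filePath` carries a URI-encoded string, and Hadoop's `Path(String)` constructor keeps percent-escapes literal, while `Path(URI)` decodes them. Below is a minimal standalone sketch of that difference (not part of the patch; the file name is made up):

```scala
import java.net.URI
import org.apache.hadoop.fs.Path

object PathDecodingDemo {
  def main(args: Array[String]): Unit = {
    // Hypothetical value of PartitionedFile.filePath for a file named "part one.orc".
    val encoded = "file:/tmp/data/part%20one.orc"

    // Path(String) keeps the escape literal: the reader would look for a file
    // literally named "part%20one.orc" and fail with FileNotFoundException.
    println(new Path(encoded))           // file:/tmp/data/part%20one.orc

    // Path(URI) decodes the escape and resolves the file that actually exists.
    println(new Path(new URI(encoded)))  // file:/tmp/data/part one.orc
  }
}
```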

## How was this patch tested?

Enhanced an existing unit test so that it also verifies the data read back.

Author: Marco Gaido <mga...@hortonworks.com>

Closes #19844 from mgaido91/SPARK-22635.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/932bd09c
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/932bd09c
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/932bd09c

Branch: refs/heads/master
Commit: 932bd09c80dc2dc113e94f59f4dcb77e77de7c58
Parents: 6eb203f
Author: Marco Gaido <mga...@hortonworks.com>
Authored: Fri Dec 1 01:24:15 2017 +0900
Committer: hyukjinkwon <gurwls...@gmail.com>
Committed: Fri Dec 1 01:24:15 2017 +0900

----------------------------------------------------------------------
 .../org/apache/spark/sql/hive/orc/OrcFileFormat.scala    | 11 +++++------
 .../spark/sql/hive/MetastoreDataSourcesSuite.scala       |  3 ++-
 2 files changed, 7 insertions(+), 7 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/932bd09c/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala
----------------------------------------------------------------------
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala
index 3b33a9f..95741c7 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala
@@ -133,10 +133,12 @@ class OrcFileFormat extends FileFormat with DataSourceRegister with Serializable
     (file: PartitionedFile) => {
       val conf = broadcastedHadoopConf.value.value
 
+      val filePath = new Path(new URI(file.filePath))
+
       // SPARK-8501: Empty ORC files always have an empty schema stored in their footer. In this
       // case, `OrcFileOperator.readSchema` returns `None`, and we can't read the underlying file
       // using the given physical schema. Instead, we simply return an empty iterator.
-      val isEmptyFile = OrcFileOperator.readSchema(Seq(file.filePath), Some(conf)).isEmpty
+      val isEmptyFile = OrcFileOperator.readSchema(Seq(filePath.toString), Some(conf)).isEmpty
       if (isEmptyFile) {
         Iterator.empty
       } else {
@@ -146,15 +148,12 @@ class OrcFileFormat extends FileFormat with DataSourceRegister with Serializable
           val job = Job.getInstance(conf)
           FileInputFormat.setInputPaths(job, file.filePath)
 
-          val fileSplit = new FileSplit(
-            new Path(new URI(file.filePath)), file.start, file.length, Array.empty
-          )
+          val fileSplit = new FileSplit(filePath, file.start, file.length, Array.empty)
           // Custom OrcRecordReader is used to get
           // ObjectInspector during recordReader creation itself and can
           // avoid NameNode call in unwrapOrcStructs per file.
           // Specifically would be helpful for partitioned datasets.
-          val orcReader = OrcFile.createReader(
-            new Path(new URI(file.filePath)), OrcFile.readerOptions(conf))
+          val orcReader = OrcFile.createReader(filePath, OrcFile.readerOptions(conf))
           new SparkOrcNewRecordReader(orcReader, conf, fileSplit.getStart, fileSplit.getLength)
         }
 

http://git-wip-us.apache.org/repos/asf/spark/blob/932bd09c/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
----------------------------------------------------------------------
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
index a106047..c8caba8 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
@@ -1350,7 +1350,8 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv
       withTempDir { dir =>
         val tmpFile = s"$dir/$nameWithSpecialChars"
         spark.createDataset(Seq("a", "b")).write.format(format).save(tmpFile)
-        spark.read.format(format).load(tmpFile)
+        val fileContent = spark.read.format(format).load(tmpFile)
+        checkAnswer(fileContent, Seq(Row("a"), Row("b")))
       }
     }
   }
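
For reference, the scenario the enhanced test exercises can be reproduced standalone. This is a hedged sketch, assuming a local `SparkSession` with Hive support (the `orc` format in this suite is the Hive-based one); the directory and app names are made up:

```scala
import java.io.File
import org.apache.spark.sql.SparkSession

object SpecialCharPathDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[2]")
      .appName("SPARK-22635 demo")
      .enableHiveSupport()
      .getOrCreate()
    import spark.implicits._

    // A directory name containing a space and a percent sign: the kind of
    // path that previously failed on read with FileNotFoundException.
    val dir = new File(System.getProperty("java.io.tmpdir"), s"demo ${System.nanoTime()} %chars")
    val path = dir.getAbsolutePath

    Seq("a", "b").toDS().write.format("orc").save(path)

    // Before this patch, load() succeeded (schema inference was fixed by
    // SPARK-22146) but materializing the rows threw; now both steps work.
    val readBack = spark.read.format("orc").load(path)
    assert(readBack.as[String].collect().sorted.sameElements(Array("a", "b")))

    spark.stop()
  }
}
```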

