Repository: spark
Updated Branches:
  refs/heads/branch-2.2 ba00bd961 -> f3f8c8767


[SPARK-22635][SQL][ORC] FileNotFoundException while reading ORC files containing special characters

## What changes were proposed in this pull request?

SPARK-22146 fixed the FileNotFoundException only for the `inferSchema` 
method, i.e. only for schema inference, but it did not fix the problem when 
actually reading the data. Thus nearly the same exception is thrown when someone 
tries to use the data. This PR fixes the problem there as well.
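
The root cause is that the `file.filePath` handed to the reader is a URI-encoded string, so special characters arrive percent-escaped; building a Hadoop `Path` directly from that string keeps the escapes literal and points at a non-existent file. A minimal sketch of the difference (the directory name below is hypothetical):

```scala
import java.net.URI
import org.apache.hadoop.fs.Path

// A URI-encoded file path as handed to the reader; "%20" is an escaped space.
val encoded = "file:/tmp/sp%20ace/part-00000.orc"

// Constructing a Path from the raw string keeps the literal "%20", so it
// refers to a directory named "sp%20ace" that does not exist on disk.
new Path(encoded).toUri.getPath            // "/tmp/sp%20ace/part-00000.orc"

// Round-tripping through java.net.URI decodes the escapes first, yielding
// the real on-disk location; this is the pattern the patch applies once
// up front and reuses for both the file split and the ORC reader.
new Path(new URI(encoded)).toUri.getPath   // "/tmp/sp ace/part-00000.orc"
```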

## How was this patch tested?

Enhanced the existing unit test so that it also reads the data back and verifies the contents.
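
For reference, a self-contained reproduction in the spirit of the enhanced test; the path, format, and expected rows below are illustrative, not the exact test fixture:

```scala
import org.apache.spark.sql.{Row, SparkSession}

val spark = SparkSession.builder()
  .master("local[*]")
  .enableHiveSupport()  // the Hive-based ORC reader is the code path patched here
  .getOrCreate()
import spark.implicits._

// Hypothetical destination containing a special character (a space).
val path = "/tmp/orc test dir"
spark.createDataset(Seq("a", "b")).write.format("orc").save(path)

// Before this patch, schema inference succeeded (SPARK-22146) but
// materializing the rows threw FileNotFoundException; now the data
// comes back intact.
val df = spark.read.format("orc").load(path)
assert(df.collect().toSet == Set(Row("a"), Row("b")))
```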

Author: Marco Gaido <mga...@hortonworks.com>

Closes #19844 from mgaido91/SPARK-22635.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f3f8c876
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f3f8c876
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f3f8c876

Branch: refs/heads/branch-2.2
Commit: f3f8c8767efbe8c941b4181f71587c65a05e1b82
Parents: ba00bd9
Author: Marco Gaido <mga...@hortonworks.com>
Authored: Fri Dec 1 01:24:15 2017 +0900
Committer: hyukjinkwon <gurwls...@gmail.com>
Committed: Fri Dec 1 18:18:57 2017 +0900

----------------------------------------------------------------------
 .../org/apache/spark/sql/hive/orc/OrcFileFormat.scala    | 11 +++++------
 .../spark/sql/hive/MetastoreDataSourcesSuite.scala       |  3 ++-
 2 files changed, 7 insertions(+), 7 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/f3f8c876/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala
----------------------------------------------------------------------
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala
index 54e8f82..2defd31 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala
@@ -131,10 +131,12 @@ class OrcFileFormat extends FileFormat with DataSourceRegister with Serializable
     (file: PartitionedFile) => {
       val conf = broadcastedHadoopConf.value.value
 
+      val filePath = new Path(new URI(file.filePath))
+
       // SPARK-8501: Empty ORC files always have an empty schema stored in their footer. In this
       // case, `OrcFileOperator.readSchema` returns `None`, and we can't read the underlying file
       // using the given physical schema. Instead, we simply return an empty iterator.
-      val isEmptyFile = OrcFileOperator.readSchema(Seq(file.filePath), Some(conf)).isEmpty
+      val isEmptyFile = OrcFileOperator.readSchema(Seq(filePath.toString), Some(conf)).isEmpty
       if (isEmptyFile) {
         Iterator.empty
       } else {
@@ -144,15 +146,12 @@ class OrcFileFormat extends FileFormat with DataSourceRegister with Serializable
           val job = Job.getInstance(conf)
           FileInputFormat.setInputPaths(job, file.filePath)
 
-          val fileSplit = new FileSplit(
-            new Path(new URI(file.filePath)), file.start, file.length, Array.empty
-          )
+          val fileSplit = new FileSplit(filePath, file.start, file.length, Array.empty)
           // Custom OrcRecordReader is used to get
           // ObjectInspector during recordReader creation itself and can
           // avoid NameNode call in unwrapOrcStructs per file.
           // Specifically would be helpful for partitioned datasets.
-          val orcReader = OrcFile.createReader(
-            new Path(new URI(file.filePath)), OrcFile.readerOptions(conf))
+          val orcReader = OrcFile.createReader(filePath, OrcFile.readerOptions(conf))
           new SparkOrcNewRecordReader(orcReader, conf, fileSplit.getStart, fileSplit.getLength)
         }
 

http://git-wip-us.apache.org/repos/asf/spark/blob/f3f8c876/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
----------------------------------------------------------------------
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
index c0acffb..d62ed19 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
@@ -1355,7 +1355,8 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv
       withTempDir { dir =>
         val tmpFile = s"$dir/$nameWithSpecialChars"
         spark.createDataset(Seq("a", "b")).write.format(format).save(tmpFile)
-        spark.read.format(format).load(tmpFile)
+        val fileContent = spark.read.format(format).load(tmpFile)
+        checkAnswer(fileContent, Seq(Row("a"), Row("b")))
       }
     }
   }

