spark git commit: [SPARK-20367] Properly unescape column names of partitioning columns parsed from paths.

wenchen Thu, 20 Apr 2017 18:51:06 -0700

Repository: spark
Updated Branches:
  refs/heads/branch-2.2 7e9eba08a -> d17dea8f1



[SPARK-20367] Properly unescape column names of partitioning columns parsed 
from paths.

## What changes were proposed in this pull request?

When inferring the partitioning schema from paths, the column name in 
parsePartitionColumn should be unescaped with unescapePathName, just as is 
already done in, e.g., parsePathFragmentAsSeq.

## How was this patch tested?

Added a test to FileIndexSuite.

Author: Juliusz Sompolski <[email protected]>

Closes #17703 from juliuszsompolski/SPARK-20367.

(cherry picked from commit 0368eb9d86634c83b3140ce3190cb9e0d0b7fd86)
Signed-off-by: Wenchen Fan <[email protected]>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d17dea8f
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d17dea8f
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d17dea8f

Branch: refs/heads/branch-2.2
Commit: d17dea8f17989e5f8f7809a8564493d82290b5df
Parents: 7e9eba0
Author: Juliusz Sompolski <[email protected]>
Authored: Fri Apr 21 09:49:42 2017 +0800
Committer: Wenchen Fan <[email protected]>
Committed: Fri Apr 21 09:49:57 2017 +0800

----------------------------------------------------------------------
 .../sql/execution/datasources/PartitioningUtils.scala   |  2 +-
 .../sql/execution/datasources/FileIndexSuite.scala      | 12 ++++++++++++
 2 files changed, 13 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/d17dea8f/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala
----------------------------------------------------------------------
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala
 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala
index c358320..2d70172 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala
@@ -243,7 +243,7 @@ object PartitioningUtils {
     if (equalSignIndex == -1) {
       None
     } else {
-      val columnName = columnSpec.take(equalSignIndex)
+      val columnName = unescapePathName(columnSpec.take(equalSignIndex))
       assert(columnName.nonEmpty, s"Empty partition column name in 
'$columnSpec'")
 
       val rawColumnValue = columnSpec.drop(equalSignIndex + 1)

http://git-wip-us.apache.org/repos/asf/spark/blob/d17dea8f/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileIndexSuite.scala
----------------------------------------------------------------------
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileIndexSuite.scala
 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileIndexSuite.scala
index a9511cb..b461682 100644
--- 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileIndexSuite.scala
+++ 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileIndexSuite.scala
@@ -27,6 +27,7 @@ import org.apache.hadoop.fs.{FileStatus, Path, 
RawLocalFileSystem}
 
 import org.apache.spark.metrics.source.HiveCatalogMetrics
 import org.apache.spark.sql.catalyst.util._
+import org.apache.spark.sql.functions.col
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.test.SharedSQLContext
 import org.apache.spark.util.{KnownSizeEstimation, SizeEstimator}
@@ -236,6 +237,17 @@ class FileIndexSuite extends SharedSQLContext {
     val fileStatusCache = FileStatusCache.getOrCreate(spark)
     fileStatusCache.putLeafFiles(new Path("/tmp", "abc"), files.toArray)
   }
+
+  test("SPARK-20367 - properly unescape column names in inferPartitioning") {
+    withTempPath { path =>
+      val colToUnescape = "Column/#%'?"
+      spark
+        .range(1)
+        .select(col("id").as(colToUnescape), col("id"))
+        .write.partitionBy(colToUnescape).parquet(path.getAbsolutePath)
+      assert(spark.read.parquet(path.getAbsolutePath).schema.exists(_.name == 
colToUnescape))
+    }
+  }
 }
 
 class FakeParentPathFileSystem extends RawLocalFileSystem {


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

spark git commit: [SPARK-20367] Properly unescape column names of partitioning columns parsed from paths.

Reply via email to