Repository: spark Updated Branches: refs/heads/branch-2.0 5487fa0b8 -> eec03718d
[SPARK-16216][SQL][FOLLOWUP][BRANCH-2.0] Backport enabling timestamp type tests for JSON and verify all unsupported types in CSV ## What changes were proposed in this pull request? This backports https://github.com/apache/spark/pull/14829 ## How was this patch tested? Tests in `JsonHadoopFsRelation` and `CSVSuite`. Author: hyukjinkwon <[email protected]> Closes #14840 from HyukjinKwon/SPARK-16216-followup-backport. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/eec03718 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/eec03718 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/eec03718 Branch: refs/heads/branch-2.0 Commit: eec03718db7e412f466ea72f3d9c2682915080bc Parents: 5487fa0 Author: hyukjinkwon <[email protected]> Authored: Sun Aug 28 16:30:33 2016 +0200 Committer: Herman van Hovell <[email protected]> Committed: Sun Aug 28 16:30:33 2016 +0200 ---------------------------------------------------------------------- .../datasources/csv/CSVFileFormat.scala | 19 ++++++++++++------- .../datasources/csv/CSVInferSchema.scala | 1 + .../sql/execution/datasources/csv/CSVSuite.scala | 15 ++++++++++++++- .../sql/sources/JsonHadoopFsRelationSuite.scala | 4 ---- 4 files changed, 27 insertions(+), 12 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/eec03718/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala index 12e19f9..4a60f51 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala +++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala @@ -180,13 +180,18 @@ class CSVFileFormat extends TextBasedFileFormat with DataSourceRegister { } private def verifySchema(schema: StructType): Unit = { - schema.foreach { field => - field.dataType match { - case _: ArrayType | _: MapType | _: StructType => - throw new UnsupportedOperationException( - s"CSV data source does not support ${field.dataType.simpleString} data type.") - case _ => - } + def verifyType(dataType: DataType): Unit = dataType match { + case ByteType | ShortType | IntegerType | LongType | FloatType | + DoubleType | BooleanType | _: DecimalType | TimestampType | + DateType | StringType => + + case udt: UserDefinedType[_] => verifyType(udt.sqlType) + + case _ => + throw new UnsupportedOperationException( + s"CSV data source does not support ${dataType.simpleString} data type.") } + + schema.foreach(field => verifyType(field.dataType)) } } http://git-wip-us.apache.org/repos/asf/spark/blob/eec03718/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVInferSchema.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVInferSchema.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVInferSchema.scala index f1b4c11..1ca6eff 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVInferSchema.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVInferSchema.scala @@ -290,6 +290,7 @@ private[csv] object CSVTypeCast { DateTimeUtils.millisToDays(DateTimeUtils.stringToTime(datum).getTime) } case _: StringType => UTF8String.fromString(datum) + case udt: UserDefinedType[_] => castTo(datum, udt.sqlType, nullable, options) case _ => throw new RuntimeException(s"Unsupported type: ${castType.typeName}") } } 
http://git-wip-us.apache.org/repos/asf/spark/blob/eec03718/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index f68d220..1930862 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -27,7 +27,7 @@ import org.apache.hadoop.io.SequenceFile.CompressionType import org.apache.hadoop.io.compress.GzipCodec import org.apache.spark.SparkException -import org.apache.spark.sql.{DataFrame, QueryTest, Row} +import org.apache.spark.sql.{DataFrame, QueryTest, Row, UDT} import org.apache.spark.sql.test.{SharedSQLContext, SQLTestUtils} import org.apache.spark.sql.types._ @@ -680,6 +680,19 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils { Seq((1, Array("Tesla", "Chevy", "Ford"))).toDF("id", "brands").write.csv(csvDir) }.getMessage assert(msg.contains("CSV data source does not support array<string> data type")) + + msg = intercept[UnsupportedOperationException] { + Seq((1, new UDT.MyDenseVector(Array(0.25, 2.25, 4.25)))).toDF("id", "vectors") + .write.csv(csvDir) + }.getMessage + assert(msg.contains("CSV data source does not support array<double> data type")) + + msg = intercept[SparkException] { + val schema = StructType(StructField("a", new UDT.MyDenseVectorUDT(), true) :: Nil) + spark.range(1).write.csv(csvDir) + spark.read.schema(schema).csv(csvDir).collect() + }.getCause.getMessage + assert(msg.contains("Unsupported type: array")) } } http://git-wip-us.apache.org/repos/asf/spark/blob/eec03718/sql/hive/src/test/scala/org/apache/spark/sql/sources/JsonHadoopFsRelationSuite.scala 
---------------------------------------------------------------------- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/JsonHadoopFsRelationSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/JsonHadoopFsRelationSuite.scala index 52486b1..d79edee 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/JsonHadoopFsRelationSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/JsonHadoopFsRelationSuite.scala @@ -32,10 +32,6 @@ class JsonHadoopFsRelationSuite extends HadoopFsRelationTest { override protected def supportsDataType(dataType: DataType): Boolean = dataType match { case _: NullType => false case _: BinaryType => false - // `TimestampType` is disabled because `DatatypeConverter.parseDateTime()` - // in `DateTimeUtils` parses the formatted string wrongly when the date is - // too early. (e.g. "1600-07-13T08:36:32.847"). - case _: TimestampType => false case _: CalendarIntervalType => false case _ => true } --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
