sadikovi commented on a change in pull request #34596:
URL: https://github.com/apache/spark/pull/34596#discussion_r751930741
##########
File path:
sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
##########
@@ -1012,6 +1012,162 @@ abstract class CSVSuite
}
}
// SPARK-37326: verify that the `timestampNTZFormat` CSV option is honored both
// when writing TIMESTAMP_NTZ values and when inferring their type back on read.
test("SPARK-37326: Use different pattern to write and infer TIMESTAMP_NTZ values") {
  withTempDir { dir =>
    val path = s"${dir.getCanonicalPath}/csv"

    // Write a single TIMESTAMP_NTZ value using an explicit custom pattern.
    val exp = spark.sql("select timestamp_ntz'2020-12-12 12:12:12' as col0")
    exp.write.format("csv").option("timestampNTZFormat", "yyyy-MM-dd HH:mm:ss").save(path)

    // Force schema inference to prefer TIMESTAMP_NTZ, read the data back with
    // the same pattern, and check the value round-trips unchanged.
    withSQLConf(SQLConf.TIMESTAMP_TYPE.key -> SQLConf.TimestampTypes.TIMESTAMP_NTZ.toString) {
      val res = spark.read
        .format("csv")
        .option("inferSchema", "true")
        .option("timestampNTZFormat", "yyyy-MM-dd HH:mm:ss")
        .load(path)

      checkAnswer(res, exp)
    }
  }
}
+
// SPARK-37326: verify that the `timestampFormat` CSV option is honored both
// when writing TIMESTAMP_LTZ values and when inferring their type back on read.
test("SPARK-37326: Use different pattern to write and infer TIMESTAMP_LTZ values") {
  withTempDir { dir =>
    val path = s"${dir.getCanonicalPath}/csv"

    // Write a single TIMESTAMP_LTZ value using an explicit custom pattern.
    val exp = spark.sql("select timestamp_ltz'2020-12-12 12:12:12' as col0")
    exp.write.format("csv").option("timestampFormat", "yyyy-MM-dd HH:mm:ss").save(path)

    // Force schema inference to prefer TIMESTAMP_LTZ, read the data back with
    // the same pattern, and check the value round-trips unchanged.
    withSQLConf(SQLConf.TIMESTAMP_TYPE.key -> SQLConf.TimestampTypes.TIMESTAMP_LTZ.toString) {
      val res = spark.read
        .format("csv")
        .option("inferSchema", "true")
        .option("timestampFormat", "yyyy-MM-dd HH:mm:ss")
        .load(path)

      checkAnswer(res, exp)
    }
  }
}
+
// SPARK-37326: with an explicit user-provided schema (no inference), both
// TIMESTAMP_NTZ and TIMESTAMP_LTZ columns must round-trip through CSV intact.
test("SPARK-37326: Roundtrip in reading and writing TIMESTAMP_NTZ values with custom schema") {
  withTempDir { dir =>
    val path = s"${dir.getCanonicalPath}/csv"

    // One column of each timestamp flavor, written with a header so the
    // custom-schema read below can match columns by name.
    val exp = spark.sql("""
      select
        timestamp_ntz'2020-12-12 12:12:12' as col1,
        timestamp_ltz'2020-12-12 12:12:12' as col2
      """)

    exp.write.format("csv").option("header", "true").save(path)

    // Read back with the timestamp types pinned explicitly in the schema.
    val res = spark.read
      .format("csv")
      .schema("col1 TIMESTAMP_NTZ, col2 TIMESTAMP_LTZ")
      .option("header", "true")
      .load(path)

    checkAnswer(res, exp)
  }
}
+
// SPARK-37326: a column holding only wall-clock (zone-less) timestamp text is
// inferred as TIMESTAMP_NTZ or TIMESTAMP_LTZ depending on the value of
// spark.sql.timestampType; the same file must parse correctly under both.
test("SPARK-37326: Timestamp type inference for a column with TIMESTAMP_NTZ values") {
  withTempDir { dir =>
    val path = s"${dir.getCanonicalPath}/csv"

    val exp = spark.sql("""
      select timestamp_ntz'2020-12-12 12:12:12' as col0 union all
      select timestamp_ntz'2020-12-12 12:12:12' as col0
      """)

    exp.write.format("csv").option("header", "true").save(path)

    // Exercise both possible settings of the session timestamp type.
    val timestampTypes = Seq(
      SQLConf.TimestampTypes.TIMESTAMP_NTZ.toString,
      SQLConf.TimestampTypes.TIMESTAMP_LTZ.toString)

    for (timestampType <- timestampTypes) {
      withSQLConf(SQLConf.TIMESTAMP_TYPE.key -> timestampType) {
        val res = spark.read
          .format("csv")
          .option("inferSchema", "true")
          .option("header", "true")
          .load(path)

        if (timestampType == SQLConf.TimestampTypes.TIMESTAMP_NTZ.toString) {
          // NTZ preference: values come back exactly as written.
          checkAnswer(res, exp)
        } else {
          // LTZ preference: the same text is interpreted as TIMESTAMP_LTZ.
          checkAnswer(
            res,
            spark.sql("""
              select timestamp_ltz'2020-12-12 12:12:12' as col0 union all
              select timestamp_ltz'2020-12-12 12:12:12' as col0
              """)
          )
        }
      }
    }
  }
}
+
+ test("SPARK-37326: Timestamp type inference for a mix of TIMESTAMP_NTZ and
TIMESTAMP_LTZ") {
+ withTempDir { dir =>
+ val path = s"${dir.getCanonicalPath}/csv"
+
+ Seq(
+ "col0",
+ "2020-12-12T12:12:12.000",
+ "2020-12-12T17:12:12.000Z",
+ "2020-12-12T17:12:12.000+05:00",
+ "2020-12-12T12:12:12.000"
+ ).toDF("data")
+ .coalesce(1)
+ .write.text(path)
+
+ val res = spark.read
+ .format("csv")
+ .option("inferSchema", "true")
Review comment:
I added a test for this. Now, if a value cannot be parsed as
TIMESTAMP_NTZ, it is marked as malformed.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]