gengliangwang commented on a change in pull request #34596:
URL: https://github.com/apache/spark/pull/34596#discussion_r752188950



##########
File path: sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
##########
@@ -1012,6 +1012,162 @@ abstract class CSVSuite
     }
   }
 
+  test("SPARK-37326: Use different pattern to write and infer TIMESTAMP_NTZ 
values") {
+    withTempDir { dir =>
+      val path = s"${dir.getCanonicalPath}/csv"
+
+      val exp = spark.sql("select timestamp_ntz'2020-12-12 12:12:12' as col0")
+      exp.write.format("csv").option("timestampNTZFormat", "yyyy-MM-dd 
HH:mm:ss").save(path)
+
+      withSQLConf(SQLConf.TIMESTAMP_TYPE.key -> SQLConf.TimestampTypes.TIMESTAMP_NTZ.toString) {
+        val res = spark.read
+          .format("csv")
+          .option("inferSchema", "true")
+          .option("timestampNTZFormat", "yyyy-MM-dd HH:mm:ss")
+          .load(path)
+
+        checkAnswer(res, exp)
+      }
+    }
+  }
+
+  test("SPARK-37326: Use different pattern to write and infer TIMESTAMP_LTZ 
values") {
+    withTempDir { dir =>
+      val path = s"${dir.getCanonicalPath}/csv"
+
+      val exp = spark.sql("select timestamp_ltz'2020-12-12 12:12:12' as col0")
+      exp.write.format("csv").option("timestampFormat", "yyyy-MM-dd 
HH:mm:ss").save(path)
+
+      withSQLConf(SQLConf.TIMESTAMP_TYPE.key -> SQLConf.TimestampTypes.TIMESTAMP_LTZ.toString) {
+        val res = spark.read
+          .format("csv")
+          .option("inferSchema", "true")
+          .option("timestampFormat", "yyyy-MM-dd HH:mm:ss")
+          .load(path)
+
+        checkAnswer(res, exp)
+      }
+    }
+  }
+
+  test("SPARK-37326: Roundtrip in reading and writing TIMESTAMP_NTZ values 
with custom schema") {
+    withTempDir { dir =>
+      val path = s"${dir.getCanonicalPath}/csv"
+
+      val exp = spark.sql("""
+        select
+          timestamp_ntz'2020-12-12 12:12:12' as col1,
+          timestamp_ltz'2020-12-12 12:12:12' as col2
+        """)
+
+      exp.write.format("csv").option("header", "true").save(path)
+
+      val res = spark.read
+        .format("csv")
+        .schema("col1 TIMESTAMP_NTZ, col2 TIMESTAMP_LTZ")
+        .option("header", "true")
+        .load(path)
+
+      checkAnswer(res, exp)
+    }
+  }
+
+  test("SPARK-37326: Timestamp type inference for a column with TIMESTAMP_NTZ 
values") {
+    withTempDir { dir =>
+      val path = s"${dir.getCanonicalPath}/csv"
+
+      val exp = spark.sql("""
+        select timestamp_ntz'2020-12-12 12:12:12' as col0 union all
+        select timestamp_ntz'2020-12-12 12:12:12' as col0
+        """)
+
+      exp.write.format("csv").option("header", "true").save(path)
+
+      val timestampTypes = Seq(
+        SQLConf.TimestampTypes.TIMESTAMP_NTZ.toString,
+        SQLConf.TimestampTypes.TIMESTAMP_LTZ.toString)
+
+      for (timestampType <- timestampTypes) {
+        withSQLConf(SQLConf.TIMESTAMP_TYPE.key -> timestampType) {
+          val res = spark.read
+            .format("csv")
+            .option("inferSchema", "true")
+            .option("header", "true")
+            .load(path)
+
+          if (timestampType == SQLConf.TimestampTypes.TIMESTAMP_NTZ.toString) {
+            checkAnswer(res, exp)
+          } else {
+            checkAnswer(
+              res,
+              spark.sql("""
+                select timestamp_ltz'2020-12-12 12:12:12' as col0 union all
+                select timestamp_ltz'2020-12-12 12:12:12' as col0
+                """)
+            )
+          }
+        }
+      }
+    }
+  }
+
+  test("SPARK-37326: Timestamp type inference for a mix of TIMESTAMP_NTZ and 
TIMESTAMP_LTZ") {
+    withTempDir { dir =>
+      val path = s"${dir.getCanonicalPath}/csv"
+
+      Seq(
+        "col0",
+        "2020-12-12T12:12:12.000",
+        "2020-12-12T17:12:12.000Z",
+        "2020-12-12T17:12:12.000+05:00",
+        "2020-12-12T12:12:12.000"
+      ).toDF("data")
+        .coalesce(1)
+        .write.text(path)
+
+      val res = spark.read
+        .format("csv")
+        .option("inferSchema", "true")

Review comment:
       Oh, right. I forgot that it becomes `null` instead of a runtime error in non-ANSI mode, which I have been focusing on recently. Sorry about this.
       Now I think we should cast it as TimestampNTZ in both read and write. WDYT?
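       For context, here is a minimal sketch (not part of this patch) of the null-vs-runtime-error distinction above, assuming it runs inside the same suite so `spark`, `withSQLConf`, and `intercept` are in scope; the literal value and column name are made up for illustration:

```scala
// Hedged sketch: with ANSI mode off, a string that cannot be parsed as a
// timestamp casts to NULL; with ANSI mode on, the same cast fails at runtime.
withSQLConf(SQLConf.ANSI_ENABLED.key -> "false") {
  // Non-ANSI: the unparsable input silently becomes NULL.
  val row = spark.sql("select cast('not-a-timestamp' as timestamp_ntz) as col0").head()
  assert(row.isNullAt(0))
}

withSQLConf(SQLConf.ANSI_ENABLED.key -> "true") {
  // ANSI: the same cast raises a runtime error instead of producing NULL.
  intercept[Exception] {
    spark.sql("select cast('not-a-timestamp' as timestamp_ntz) as col0").collect()
  }
}
```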




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


