MaxGekk commented on a change in pull request #34596:
URL: https://github.com/apache/spark/pull/34596#discussion_r759038536



##########
File path: sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
##########
@@ -1012,6 +1012,196 @@ abstract class CSVSuite
     }
   }
 
+  test("SPARK-37326: Use different pattern to write and infer TIMESTAMP_NTZ 
values") {
+    withTempDir { dir =>
+      val path = s"${dir.getCanonicalPath}/csv"
+
+      val exp = spark.sql("select timestamp_ntz'2020-12-12 12:12:12' as col0")
+      exp.write.format("csv").option("timestampNTZFormat", "yyyy-MM-dd HH:mm:ss").save(path)

Review comment:
       Could you test max precision with the pattern `.SSSSSS`, please?
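   
   A max-precision roundtrip might look like this (just a sketch of an additional test body, assuming the same scaffolding as the test above, i.e. `path`, `withSQLConf` and `checkAnswer` in scope):
   
   ```scala
   // Sketch: roundtrip a microsecond-precision TIMESTAMP_NTZ value
   // through write + schema inference with a `.SSSSSS` pattern.
   val exp = spark.sql("select timestamp_ntz'2020-12-12 12:12:12.123456' as col0")
   exp.write.format("csv")
     .option("timestampNTZFormat", "yyyy-MM-dd HH:mm:ss.SSSSSS")
     .save(path)
   
   withSQLConf(SQLConf.TIMESTAMP_TYPE.key -> SQLConf.TimestampTypes.TIMESTAMP_NTZ.toString) {
     val res = spark.read
       .format("csv")
       .option("inferSchema", "true")
       .option("timestampNTZFormat", "yyyy-MM-dd HH:mm:ss.SSSSSS")
       .load(path)
     checkAnswer(res, exp)
   }
   ```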

##########
File path: sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
##########
@@ -1012,6 +1012,196 @@ abstract class CSVSuite
     }
   }
 
+  test("SPARK-37326: Use different pattern to write and infer TIMESTAMP_NTZ 
values") {
+    withTempDir { dir =>
+      val path = s"${dir.getCanonicalPath}/csv"
+
+      val exp = spark.sql("select timestamp_ntz'2020-12-12 12:12:12' as col0")
+      exp.write.format("csv").option("timestampNTZFormat", "yyyy-MM-dd HH:mm:ss").save(path)
+
+      withSQLConf(SQLConf.TIMESTAMP_TYPE.key -> SQLConf.TimestampTypes.TIMESTAMP_NTZ.toString) {
+        val res = spark.read
+          .format("csv")
+          .option("inferSchema", "true")
+          .option("timestampNTZFormat", "yyyy-MM-dd HH:mm:ss")
+          .load(path)
+
+        checkAnswer(res, exp)
+      }
+    }
+  }
+
+  test("SPARK-37326: Use different pattern to write and infer TIMESTAMP_LTZ 
values") {
+    withTempDir { dir =>
+      val path = s"${dir.getCanonicalPath}/csv"
+
+      val exp = spark.sql("select timestamp_ltz'2020-12-12 12:12:12' as col0")
+      exp.write.format("csv").option("timestampFormat", "yyyy-MM-dd HH:mm:ss").save(path)
+
+      withSQLConf(SQLConf.TIMESTAMP_TYPE.key -> SQLConf.TimestampTypes.TIMESTAMP_LTZ.toString) {
+        val res = spark.read
+          .format("csv")
+          .option("inferSchema", "true")
+          .option("timestampFormat", "yyyy-MM-dd HH:mm:ss")
+          .load(path)
+
+        checkAnswer(res, exp)
+      }
+    }
+  }
+
+  test("SPARK-37326: Roundtrip in reading and writing TIMESTAMP_NTZ values 
with custom schema") {
+    withTempDir { dir =>
+      val path = s"${dir.getCanonicalPath}/csv"
+
+      val exp = spark.sql("""
+        select
+          timestamp_ntz'2020-12-12 12:12:12' as col1,
+          timestamp_ltz'2020-12-12 12:12:12' as col2
+        """)
+
+      exp.write.format("csv").option("header", "true").save(path)
+
+      val res = spark.read
+        .format("csv")
+        .schema("col1 TIMESTAMP_NTZ, col2 TIMESTAMP_LTZ")
+        .option("header", "true")
+        .load(path)
+
+      checkAnswer(res, exp)
+    }
+  }
+
+  test("SPARK-37326: Timestamp type inference for a column with TIMESTAMP_NTZ 
values") {
+    withTempDir { dir =>
+      val path = s"${dir.getCanonicalPath}/csv"
+
+      val exp = spark.sql("""
+        select timestamp_ntz'2020-12-12 12:12:12' as col0 union all
+        select timestamp_ntz'2020-12-12 12:12:12' as col0
+        """)
+
+      exp.write.format("csv").option("header", "true").save(path)
+
+      val timestampTypes = Seq(
+        SQLConf.TimestampTypes.TIMESTAMP_NTZ.toString,
+        SQLConf.TimestampTypes.TIMESTAMP_LTZ.toString)
+
+      for (timestampType <- timestampTypes) {
+        withSQLConf(SQLConf.TIMESTAMP_TYPE.key -> timestampType) {
+          val res = spark.read
+            .format("csv")
+            .option("inferSchema", "true")
+            .option("header", "true")
+            .load(path)
+
+          if (timestampType == SQLConf.TimestampTypes.TIMESTAMP_NTZ.toString) {
+            checkAnswer(res, exp)
+          } else {
+            checkAnswer(
+              res,
+              spark.sql("""
+                select timestamp_ltz'2020-12-12 12:12:12' as col0 union all
+                select timestamp_ltz'2020-12-12 12:12:12' as col0
+                """)
+            )
+          }
+        }
+      }
+    }
+  }
+
+  test("SPARK-37326: Timestamp type inference for a mix of TIMESTAMP_NTZ and 
TIMESTAMP_LTZ") {
+    withTempDir { dir =>
+      val path = s"${dir.getCanonicalPath}/csv"
+
+      Seq(
+        "col0",
+        "2020-12-12T12:12:12.000",
+        "2020-12-12T17:12:12.000Z",
+        "2020-12-12T17:12:12.000+05:00",
+        "2020-12-12T12:12:12.000"
+      ).toDF("data")
+        .coalesce(1)
+        .write.text(path)
+
+      val res = spark.read.format("csv")
+        .option("inferSchema", "true")
+        .option("header", "true")
+        .load(path)
+
+      for (policy <- Seq("exception", "corrected", "legacy")) {

Review comment:
       Where is `policy` used?
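   
   Presumably it was meant to control the parser policy for each read, e.g. (a sketch, assuming the legacy time parser conf is what was intended here):
   
   ```scala
   // Sketch: actually run the read under each legacy time parser policy.
   for (policy <- Seq("exception", "corrected", "legacy")) {
     withSQLConf(SQLConf.LEGACY_TIME_PARSER_POLICY.key -> policy) {
       val res = spark.read.format("csv")
         .option("inferSchema", "true")
         .option("header", "true")
         .load(path)
       // per-policy assertions would go here
     }
   }
   ```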

##########
File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala
##########
@@ -164,6 +164,10 @@ class CSVOptions(
       s"${DateFormatter.defaultPattern}'T'HH:mm:ss[.SSS][XXX]"
     })
 
+  val timestampNTZFormatInRead: Option[String] = parameters.get("timestampNTZFormat")
+  val timestampNTZFormatInWrite: String = parameters.getOrElse("timestampNTZFormat",
+    s"${DateFormatter.defaultPattern}'T'HH:mm:ss[.SSS]")

Review comment:
       I wonder what the reason is for having the optional field `[.SSS]` in the write pattern. How should the CSV writer decide whether to write milliseconds or not?
   
   Another question: why is the precision in milliseconds rather than microseconds?
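   
   For comparison, a non-optional microsecond default would look something like this (just a sketch of the alternative being raised, not necessarily the right default):
   
   ```scala
   // Hypothetical alternative: always write the full microsecond precision,
   // with no optional section, so the writer's output is deterministic.
   val timestampNTZFormatInWrite: String = parameters.getOrElse("timestampNTZFormat",
     s"${DateFormatter.defaultPattern}'T'HH:mm:ss.SSSSSS")
   ```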

##########
File path: sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
##########
@@ -1012,6 +1012,196 @@ abstract class CSVSuite
     }
   }
 
+  test("SPARK-37326: Use different pattern to write and infer TIMESTAMP_NTZ 
values") {
+    withTempDir { dir =>
+      val path = s"${dir.getCanonicalPath}/csv"

Review comment:
       Just write directly to `dir`
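   
   i.e. something like this (a sketch; note that with `withTempDir` the directory already exists, so overwrite mode may be needed):
   
   ```scala
   // Sketch: save into the temp dir itself instead of a sub-path.
   exp.write.format("csv")
     .mode("overwrite")
     .option("timestampNTZFormat", "yyyy-MM-dd HH:mm:ss")
     .save(dir.getCanonicalPath)
   ```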

##########
File path: sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
##########
@@ -1012,6 +1012,196 @@ abstract class CSVSuite
     }
   }
 
+  test("SPARK-37326: Use different pattern to write and infer TIMESTAMP_NTZ 
values") {

Review comment:
       The test title mentions different patterns, but in fact the write pattern and the inference pattern are the same.

##########
File path: sql/core/src/test/scala/org/apache/spark/sql/CsvFunctionsSuite.scala
##########
@@ -368,4 +368,15 @@ class CsvFunctionsSuite extends QueryTest with SharedSparkSession {
       .selectExpr("value.a")
     checkAnswer(fromCsvDF, Row(localDT))
   }
+
+  test("SPARK-37326: Handle incorrectly formatted timestamp_ntz values in 
from_csv") {
+    val fromCsvDF = Seq("2021-08-12T15:16:23.000+11:00").toDF("csv")
+      .select(
+        from_csv(
+          $"csv",
+          StructType(StructField("a", TimestampNTZType) :: Nil),
+          Map.empty[String, String]) as "value")
+      .selectExpr("value.a")
+    checkAnswer(fromCsvDF, Row(null))

Review comment:
       How about a positive test for the functions `from_csv`/`to_csv` and `schema_of_csv`?
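   
   A positive roundtrip might look like this (a sketch; it relies on the `LocalDateTime` encoder and the `spark.implicits._` already in scope in this suite):
   
   ```scala
   // Sketch: roundtrip a TIMESTAMP_NTZ value through to_csv and from_csv.
   val df = Seq(java.time.LocalDateTime.parse("2021-08-12T15:16:23")).toDF("a")
   val roundTripped = df
     .select(to_csv(struct($"a")) as "csv")
     .select(
       from_csv(
         $"csv",
         StructType(StructField("a", TimestampNTZType) :: Nil),
         Map.empty[String, String]) as "value")
     .selectExpr("value.a")
   checkAnswer(roundTripped, df)
   // schema_of_csv could be checked similarly under the
   // TIMESTAMP_NTZ inference conf.
   ```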

##########
File path: sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
##########
@@ -1012,6 +1012,196 @@ abstract class CSVSuite
     }
   }
 
+  test("SPARK-37326: Use different pattern to write and infer TIMESTAMP_NTZ 
values") {
+    withTempDir { dir =>
+      val path = s"${dir.getCanonicalPath}/csv"
+
+      val exp = spark.sql("select timestamp_ntz'2020-12-12 12:12:12' as col0")
+      exp.write.format("csv").option("timestampNTZFormat", "yyyy-MM-dd HH:mm:ss").save(path)
+
+      withSQLConf(SQLConf.TIMESTAMP_TYPE.key -> SQLConf.TimestampTypes.TIMESTAMP_NTZ.toString) {
+        val res = spark.read
+          .format("csv")
+          .option("inferSchema", "true")
+          .option("timestampNTZFormat", "yyyy-MM-dd HH:mm:ss")
+          .load(path)
+
+        checkAnswer(res, exp)

Review comment:
       `checkAnswer` doesn't check the type. Could you check that the inferred type is correct?
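   
   For example (a sketch; assumes `org.apache.spark.sql.types.TimestampNTZType` is imported):
   
   ```scala
   // In addition to the values, assert the inferred column type directly.
   assert(res.schema.head.dataType === TimestampNTZType)
   ```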



