MaxGekk commented on a change in pull request #34596:
URL: https://github.com/apache/spark/pull/34596#discussion_r759038536
##########
File path:
sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
##########
@@ -1012,6 +1012,196 @@ abstract class CSVSuite
}
}
+ test("SPARK-37326: Use different pattern to write and infer TIMESTAMP_NTZ
values") {
+ withTempDir { dir =>
+ val path = s"${dir.getCanonicalPath}/csv"
+
+ val exp = spark.sql("select timestamp_ntz'2020-12-12 12:12:12' as col0")
+ exp.write.format("csv").option("timestampNTZFormat", "yyyy-MM-dd
HH:mm:ss").save(path)
Review comment:
Could you test the maximum precision with the pattern `.SSSSSS`, please?
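Something along these lines (just a sketch, assuming the same `path` setup as above; the microsecond literal and pattern are examples, not taken from the PR):
```scala
// Hypothetical max-precision variant: write and infer back a
// microsecond-precision TIMESTAMP_NTZ value.
val exp = spark.sql("select timestamp_ntz'2020-12-12 12:12:12.123456' as col0")
exp.write.format("csv")
  .option("timestampNTZFormat", "yyyy-MM-dd HH:mm:ss.SSSSSS")
  .save(path)

withSQLConf(SQLConf.TIMESTAMP_TYPE.key -> SQLConf.TimestampTypes.TIMESTAMP_NTZ.toString) {
  val res = spark.read
    .format("csv")
    .option("inferSchema", "true")
    .option("timestampNTZFormat", "yyyy-MM-dd HH:mm:ss.SSSSSS")
    .load(path)

  checkAnswer(res, exp)
}
```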
##########
File path:
sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
##########
@@ -1012,6 +1012,196 @@ abstract class CSVSuite
}
}
+ test("SPARK-37326: Use different pattern to write and infer TIMESTAMP_NTZ
values") {
+ withTempDir { dir =>
+ val path = s"${dir.getCanonicalPath}/csv"
+
+ val exp = spark.sql("select timestamp_ntz'2020-12-12 12:12:12' as col0")
+ exp.write.format("csv").option("timestampNTZFormat", "yyyy-MM-dd
HH:mm:ss").save(path)
+
+ withSQLConf(SQLConf.TIMESTAMP_TYPE.key ->
SQLConf.TimestampTypes.TIMESTAMP_NTZ.toString) {
+ val res = spark.read
+ .format("csv")
+ .option("inferSchema", "true")
+ .option("timestampNTZFormat", "yyyy-MM-dd HH:mm:ss")
+ .load(path)
+
+ checkAnswer(res, exp)
+ }
+ }
+ }
+
+ test("SPARK-37326: Use different pattern to write and infer TIMESTAMP_LTZ
values") {
+ withTempDir { dir =>
+ val path = s"${dir.getCanonicalPath}/csv"
+
+ val exp = spark.sql("select timestamp_ltz'2020-12-12 12:12:12' as col0")
+ exp.write.format("csv").option("timestampFormat", "yyyy-MM-dd
HH:mm:ss").save(path)
+
+ withSQLConf(SQLConf.TIMESTAMP_TYPE.key ->
SQLConf.TimestampTypes.TIMESTAMP_LTZ.toString) {
+ val res = spark.read
+ .format("csv")
+ .option("inferSchema", "true")
+ .option("timestampFormat", "yyyy-MM-dd HH:mm:ss")
+ .load(path)
+
+ checkAnswer(res, exp)
+ }
+ }
+ }
+
+ test("SPARK-37326: Roundtrip in reading and writing TIMESTAMP_NTZ values
with custom schema") {
+ withTempDir { dir =>
+ val path = s"${dir.getCanonicalPath}/csv"
+
+ val exp = spark.sql("""
+ select
+ timestamp_ntz'2020-12-12 12:12:12' as col1,
+ timestamp_ltz'2020-12-12 12:12:12' as col2
+ """)
+
+ exp.write.format("csv").option("header", "true").save(path)
+
+ val res = spark.read
+ .format("csv")
+ .schema("col1 TIMESTAMP_NTZ, col2 TIMESTAMP_LTZ")
+ .option("header", "true")
+ .load(path)
+
+ checkAnswer(res, exp)
+ }
+ }
+
+ test("SPARK-37326: Timestamp type inference for a column with TIMESTAMP_NTZ
values") {
+ withTempDir { dir =>
+ val path = s"${dir.getCanonicalPath}/csv"
+
+ val exp = spark.sql("""
+ select timestamp_ntz'2020-12-12 12:12:12' as col0 union all
+ select timestamp_ntz'2020-12-12 12:12:12' as col0
+ """)
+
+ exp.write.format("csv").option("header", "true").save(path)
+
+ val timestampTypes = Seq(
+ SQLConf.TimestampTypes.TIMESTAMP_NTZ.toString,
+ SQLConf.TimestampTypes.TIMESTAMP_LTZ.toString)
+
+ for (timestampType <- timestampTypes) {
+ withSQLConf(SQLConf.TIMESTAMP_TYPE.key -> timestampType) {
+ val res = spark.read
+ .format("csv")
+ .option("inferSchema", "true")
+ .option("header", "true")
+ .load(path)
+
+ if (timestampType == SQLConf.TimestampTypes.TIMESTAMP_NTZ.toString) {
+ checkAnswer(res, exp)
+ } else {
+ checkAnswer(
+ res,
+ spark.sql("""
+ select timestamp_ltz'2020-12-12 12:12:12' as col0 union all
+ select timestamp_ltz'2020-12-12 12:12:12' as col0
+ """)
+ )
+ }
+ }
+ }
+ }
+ }
+
+ test("SPARK-37326: Timestamp type inference for a mix of TIMESTAMP_NTZ and
TIMESTAMP_LTZ") {
+ withTempDir { dir =>
+ val path = s"${dir.getCanonicalPath}/csv"
+
+ Seq(
+ "col0",
+ "2020-12-12T12:12:12.000",
+ "2020-12-12T17:12:12.000Z",
+ "2020-12-12T17:12:12.000+05:00",
+ "2020-12-12T12:12:12.000"
+ ).toDF("data")
+ .coalesce(1)
+ .write.text(path)
+
+ val res = spark.read.format("csv")
+ .option("inferSchema", "true")
+ .option("header", "true")
+ .load(path)
+
+ for (policy <- Seq("exception", "corrected", "legacy")) {
Review comment:
Where is `policy` used?
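If the intent was to vary the parser policy, a sketch of wiring it in could look like this (assuming `SQLConf.LEGACY_TIME_PARSER_POLICY` is the conf meant here; the per-policy expectations are left open):
```scala
for (policy <- Seq("exception", "corrected", "legacy")) {
  withSQLConf(SQLConf.LEGACY_TIME_PARSER_POLICY.key -> policy) {
    // Re-read under the given policy so that `policy` actually takes effect.
    val res = spark.read.format("csv")
      .option("inferSchema", "true")
      .option("header", "true")
      .load(path)
    // per-policy assertions go here ...
  }
}
```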
##########
File path:
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala
##########
@@ -164,6 +164,10 @@ class CSVOptions(
s"${DateFormatter.defaultPattern}'T'HH:mm:ss[.SSS][XXX]"
})
+ val timestampNTZFormatInRead: Option[String] =
parameters.get("timestampNTZFormat")
+ val timestampNTZFormatInWrite: String =
parameters.getOrElse("timestampNTZFormat",
+ s"${DateFormatter.defaultPattern}'T'HH:mm:ss[.SSS]")
Review comment:
I wonder what the reason is for having the optional field `[.SSS]` in the write pattern. How should the CSV writer decide whether to write milliseconds or not? Another question: why is the precision in milliseconds rather than microseconds?
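For comparison, a sketch of a fixed microsecond-precision default in write (purely illustrative, not a verified proposal):
```scala
// Hypothetical alternative: always write microseconds, with no optional section.
val timestampNTZFormatInWrite: String = parameters.getOrElse("timestampNTZFormat",
  s"${DateFormatter.defaultPattern}'T'HH:mm:ss.SSSSSS")
```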
##########
File path:
sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
##########
@@ -1012,6 +1012,196 @@ abstract class CSVSuite
}
}
+ test("SPARK-37326: Use different pattern to write and infer TIMESTAMP_NTZ
values") {
+ withTempDir { dir =>
+ val path = s"${dir.getCanonicalPath}/csv"
Review comment:
Just write directly to `dir`.
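For instance (a minimal sketch):
```scala
// Save into the temp dir itself instead of a "csv" subdirectory; since
// withTempDir pre-creates the directory, overwrite mode would be needed
// (an assumption about the suggestion, not part of the PR).
exp.write.format("csv")
  .mode("overwrite")
  .option("timestampNTZFormat", "yyyy-MM-dd HH:mm:ss")
  .save(dir.getCanonicalPath)
```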
##########
File path:
sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
##########
@@ -1012,6 +1012,196 @@ abstract class CSVSuite
}
}
+ test("SPARK-37326: Use different pattern to write and infer TIMESTAMP_NTZ
values") {
Review comment:
The test title mentions different patterns, but the write and inference patterns are in fact the same.
##########
File path: sql/core/src/test/scala/org/apache/spark/sql/CsvFunctionsSuite.scala
##########
@@ -368,4 +368,15 @@ class CsvFunctionsSuite extends QueryTest with SharedSparkSession {
      .selectExpr("value.a")
    checkAnswer(fromCsvDF, Row(localDT))
  }
+
+  test("SPARK-37326: Handle incorrectly formatted timestamp_ntz values in from_csv") {
+    val fromCsvDF = Seq("2021-08-12T15:16:23.000+11:00").toDF("csv")
+      .select(
+        from_csv(
+          $"csv",
+          StructType(StructField("a", TimestampNTZType) :: Nil),
+          Map.empty[String, String]) as "value")
+      .selectExpr("value.a")
+    checkAnswer(fromCsvDF, Row(null))
Review comment:
How about a positive test for the functions `from_csv`/`to_csv` and `schema_of_csv`?
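A minimal sketch of such a positive round-trip (the literal, the default NTZ pattern handling, and the `LocalDateTime` encoder availability are assumptions):
```scala
// Parse a well-formed local timestamp and expect a non-null TIMESTAMP_NTZ value.
val schema = StructType(StructField("a", TimestampNTZType) :: Nil)
val readBack = Seq("2021-08-12T15:16:23").toDF("csv")
  .select(from_csv($"csv", schema, Map.empty[String, String]) as "value")
  .selectExpr("value.a")
checkAnswer(readBack, Row(java.time.LocalDateTime.of(2021, 8, 12, 15, 16, 23)))

// And the reverse direction via to_csv:
val written = Seq(java.time.LocalDateTime.of(2021, 8, 12, 15, 16, 23)).toDF("a")
  .select(to_csv(struct($"a")) as "csv")
```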
##########
File path:
sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
##########
@@ -1012,6 +1012,196 @@ abstract class CSVSuite
}
}
+ test("SPARK-37326: Use different pattern to write and infer TIMESTAMP_NTZ
values") {
+ withTempDir { dir =>
+ val path = s"${dir.getCanonicalPath}/csv"
+
+ val exp = spark.sql("select timestamp_ntz'2020-12-12 12:12:12' as col0")
+ exp.write.format("csv").option("timestampNTZFormat", "yyyy-MM-dd
HH:mm:ss").save(path)
+
+ withSQLConf(SQLConf.TIMESTAMP_TYPE.key ->
SQLConf.TimestampTypes.TIMESTAMP_NTZ.toString) {
+ val res = spark.read
+ .format("csv")
+ .option("inferSchema", "true")
+ .option("timestampNTZFormat", "yyyy-MM-dd HH:mm:ss")
+ .load(path)
+
+ checkAnswer(res, exp)
Review comment:
`checkAnswer` doesn't check the type. Could you also check that the inferred type is correct?
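For example, something along these lines:
```scala
// checkAnswer compares values only; also pin down the inferred schema.
assert(res.schema == StructType(StructField("col0", TimestampNTZType) :: Nil))
```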
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]