MaxGekk commented on a change in pull request #27524: [WIP][SQL] Support
`SimpleDateFormat` and `FastDateFormat` as legacy date/timestamp formatters
URL: https://github.com/apache/spark/pull/27524#discussion_r377165571
##########
File path:
sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala
##########
@@ -525,170 +529,194 @@ class DateFunctionsSuite extends QueryTest with
SharedSparkSession {
}
test("from_unixtime") {
- val sdf1 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.US)
- val fmt2 = "yyyy-MM-dd HH:mm:ss.SSS"
- val sdf2 = new SimpleDateFormat(fmt2, Locale.US)
- val fmt3 = "yy-MM-dd HH-mm-ss"
- val sdf3 = new SimpleDateFormat(fmt3, Locale.US)
- val df = Seq((1000, "yyyy-MM-dd HH:mm:ss.SSS"), (-1000, "yy-MM-dd
HH-mm-ss")).toDF("a", "b")
- checkAnswer(
- df.select(from_unixtime(col("a"))),
- Seq(Row(sdf1.format(new Timestamp(1000000))), Row(sdf1.format(new
Timestamp(-1000000)))))
- checkAnswer(
- df.select(from_unixtime(col("a"), fmt2)),
- Seq(Row(sdf2.format(new Timestamp(1000000))), Row(sdf2.format(new
Timestamp(-1000000)))))
- checkAnswer(
- df.select(from_unixtime(col("a"), fmt3)),
- Seq(Row(sdf3.format(new Timestamp(1000000))), Row(sdf3.format(new
Timestamp(-1000000)))))
- checkAnswer(
- df.selectExpr("from_unixtime(a)"),
- Seq(Row(sdf1.format(new Timestamp(1000000))), Row(sdf1.format(new
Timestamp(-1000000)))))
- checkAnswer(
- df.selectExpr(s"from_unixtime(a, '$fmt2')"),
- Seq(Row(sdf2.format(new Timestamp(1000000))), Row(sdf2.format(new
Timestamp(-1000000)))))
- checkAnswer(
- df.selectExpr(s"from_unixtime(a, '$fmt3')"),
- Seq(Row(sdf3.format(new Timestamp(1000000))), Row(sdf3.format(new
Timestamp(-1000000)))))
+ Seq(false, true).foreach { legacyParser =>
+ withSQLConf(SQLConf.LEGACY_TIME_PARSER_ENABLED.key ->
legacyParser.toString) {
+ val sdf1 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.US)
+ val fmt2 = "yyyy-MM-dd HH:mm:ss.SSS"
+ val sdf2 = new SimpleDateFormat(fmt2, Locale.US)
+ val fmt3 = "yy-MM-dd HH-mm-ss"
+ val sdf3 = new SimpleDateFormat(fmt3, Locale.US)
+ val df = Seq((1000, "yyyy-MM-dd HH:mm:ss.SSS"), (-1000, "yy-MM-dd
HH-mm-ss")).toDF("a", "b")
+ checkAnswer(
+ df.select(from_unixtime(col("a"))),
+ Seq(Row(sdf1.format(new Timestamp(1000000))), Row(sdf1.format(new
Timestamp(-1000000)))))
+ checkAnswer(
+ df.select(from_unixtime(col("a"), fmt2)),
+ Seq(Row(sdf2.format(new Timestamp(1000000))), Row(sdf2.format(new
Timestamp(-1000000)))))
+ checkAnswer(
+ df.select(from_unixtime(col("a"), fmt3)),
+ Seq(Row(sdf3.format(new Timestamp(1000000))), Row(sdf3.format(new
Timestamp(-1000000)))))
+ checkAnswer(
+ df.selectExpr("from_unixtime(a)"),
+ Seq(Row(sdf1.format(new Timestamp(1000000))), Row(sdf1.format(new
Timestamp(-1000000)))))
+ checkAnswer(
+ df.selectExpr(s"from_unixtime(a, '$fmt2')"),
+ Seq(Row(sdf2.format(new Timestamp(1000000))), Row(sdf2.format(new
Timestamp(-1000000)))))
+ checkAnswer(
+ df.selectExpr(s"from_unixtime(a, '$fmt3')"),
+ Seq(Row(sdf3.format(new Timestamp(1000000))), Row(sdf3.format(new
Timestamp(-1000000)))))
+ }
+ }
}
private def secs(millis: Long): Long =
TimeUnit.MILLISECONDS.toSeconds(millis)
test("unix_timestamp") {
- val date1 = Date.valueOf("2015-07-24")
- val date2 = Date.valueOf("2015-07-25")
- val ts1 = Timestamp.valueOf("2015-07-24 10:00:00.3")
- val ts2 = Timestamp.valueOf("2015-07-25 02:02:02.2")
- val s1 = "2015/07/24 10:00:00.5"
- val s2 = "2015/07/25 02:02:02.6"
- val ss1 = "2015-07-24 10:00:00"
- val ss2 = "2015-07-25 02:02:02"
- val fmt = "yyyy/MM/dd HH:mm:ss.S"
- val df = Seq((date1, ts1, s1, ss1), (date2, ts2, s2, ss2)).toDF("d", "ts",
"s", "ss")
- checkAnswer(df.select(unix_timestamp(col("ts"))), Seq(
- Row(secs(ts1.getTime)), Row(secs(ts2.getTime))))
- checkAnswer(df.select(unix_timestamp(col("ss"))), Seq(
- Row(secs(ts1.getTime)), Row(secs(ts2.getTime))))
- checkAnswer(df.select(unix_timestamp(col("d"), fmt)), Seq(
- Row(secs(date1.getTime)), Row(secs(date2.getTime))))
- checkAnswer(df.select(unix_timestamp(col("s"), fmt)), Seq(
- Row(secs(ts1.getTime)), Row(secs(ts2.getTime))))
- checkAnswer(df.selectExpr("unix_timestamp(ts)"), Seq(
- Row(secs(ts1.getTime)), Row(secs(ts2.getTime))))
- checkAnswer(df.selectExpr("unix_timestamp(ss)"), Seq(
- Row(secs(ts1.getTime)), Row(secs(ts2.getTime))))
- checkAnswer(df.selectExpr(s"unix_timestamp(d, '$fmt')"), Seq(
- Row(secs(date1.getTime)), Row(secs(date2.getTime))))
- checkAnswer(df.selectExpr(s"unix_timestamp(s, '$fmt')"), Seq(
- Row(secs(ts1.getTime)), Row(secs(ts2.getTime))))
-
- val x1 = "2015-07-24 10:00:00"
- val x2 = "2015-25-07 02:02:02"
- val x3 = "2015-07-24 25:02:02"
- val x4 = "2015-24-07 26:02:02"
- val ts3 = Timestamp.valueOf("2015-07-24 02:25:02")
- val ts4 = Timestamp.valueOf("2015-07-24 00:10:00")
-
- val df1 = Seq(x1, x2, x3, x4).toDF("x")
- checkAnswer(df1.select(unix_timestamp(col("x"))), Seq(
- Row(secs(ts1.getTime)), Row(null), Row(null), Row(null)))
- checkAnswer(df1.selectExpr("unix_timestamp(x)"), Seq(
- Row(secs(ts1.getTime)), Row(null), Row(null), Row(null)))
- checkAnswer(df1.select(unix_timestamp(col("x"), "yyyy-dd-MM HH:mm:ss")),
Seq(
- Row(null), Row(secs(ts2.getTime)), Row(null), Row(null)))
- checkAnswer(df1.selectExpr(s"unix_timestamp(x, 'yyyy-MM-dd mm:HH:ss')"),
Seq(
- Row(secs(ts4.getTime)), Row(null), Row(secs(ts3.getTime)), Row(null)))
-
- // invalid format
- checkAnswer(df1.selectExpr(s"unix_timestamp(x, 'yyyy-MM-dd aa:HH:ss')"),
Seq(
- Row(null), Row(null), Row(null), Row(null)))
-
- // february
- val y1 = "2016-02-29"
- val y2 = "2017-02-29"
- val ts5 = Timestamp.valueOf("2016-02-29 00:00:00")
- val df2 = Seq(y1, y2).toDF("y")
- checkAnswer(df2.select(unix_timestamp(col("y"), "yyyy-MM-dd")), Seq(
- Row(secs(ts5.getTime)), Row(null)))
-
- val now = sql("select unix_timestamp()").collect().head.getLong(0)
- checkAnswer(
- sql(s"select cast ($now as timestamp)"),
- Row(new java.util.Date(TimeUnit.SECONDS.toMillis(now))))
+ Seq(false, true).foreach { legacyParser =>
+ withSQLConf(SQLConf.LEGACY_TIME_PARSER_ENABLED.key ->
legacyParser.toString) {
+ val date1 = Date.valueOf("2015-07-24")
+ val date2 = Date.valueOf("2015-07-25")
+ val ts1 = Timestamp.valueOf("2015-07-24 10:00:00.3")
+ val ts2 = Timestamp.valueOf("2015-07-25 02:02:02.2")
+ val s1 = "2015/07/24 10:00:00.5"
+ val s2 = "2015/07/25 02:02:02.6"
+ val ss1 = "2015-07-24 10:00:00"
+ val ss2 = "2015-07-25 02:02:02"
+ val fmt = "yyyy/MM/dd HH:mm:ss.S"
+ val df = Seq((date1, ts1, s1, ss1), (date2, ts2, s2, ss2)).toDF("d",
"ts", "s", "ss")
+ checkAnswer(df.select(unix_timestamp(col("ts"))), Seq(
+ Row(secs(ts1.getTime)), Row(secs(ts2.getTime))))
+ checkAnswer(df.select(unix_timestamp(col("ss"))), Seq(
+ Row(secs(ts1.getTime)), Row(secs(ts2.getTime))))
+ checkAnswer(df.select(unix_timestamp(col("d"), fmt)), Seq(
+ Row(secs(date1.getTime)), Row(secs(date2.getTime))))
+ checkAnswer(df.select(unix_timestamp(col("s"), fmt)), Seq(
+ Row(secs(ts1.getTime)), Row(secs(ts2.getTime))))
+ checkAnswer(df.selectExpr("unix_timestamp(ts)"), Seq(
+ Row(secs(ts1.getTime)), Row(secs(ts2.getTime))))
+ checkAnswer(df.selectExpr("unix_timestamp(ss)"), Seq(
+ Row(secs(ts1.getTime)), Row(secs(ts2.getTime))))
+ checkAnswer(df.selectExpr(s"unix_timestamp(d, '$fmt')"), Seq(
+ Row(secs(date1.getTime)), Row(secs(date2.getTime))))
+ checkAnswer(df.selectExpr(s"unix_timestamp(s, '$fmt')"), Seq(
+ Row(secs(ts1.getTime)), Row(secs(ts2.getTime))))
+
+ val x1 = "2015-07-24 10:00:00"
+ val x2 = "2015-25-07 02:02:02"
+ val x3 = "2015-07-24 25:02:02"
+ val x4 = "2015-24-07 26:02:02"
+ val ts3 = Timestamp.valueOf("2015-07-24 02:25:02")
+ val ts4 = Timestamp.valueOf("2015-07-24 00:10:00")
+
+ val df1 = Seq(x1, x2, x3, x4).toDF("x")
+ checkAnswer(df1.select(unix_timestamp(col("x"))), Seq(
+ Row(secs(ts1.getTime)), Row(null), Row(null), Row(null)))
+ checkAnswer(df1.selectExpr("unix_timestamp(x)"), Seq(
+ Row(secs(ts1.getTime)), Row(null), Row(null), Row(null)))
+ checkAnswer(df1.select(unix_timestamp(col("x"), "yyyy-dd-MM
HH:mm:ss")), Seq(
+ Row(null), Row(secs(ts2.getTime)), Row(null), Row(null)))
+ checkAnswer(df1.selectExpr(s"unix_timestamp(x, 'yyyy-MM-dd
mm:HH:ss')"), Seq(
+ Row(secs(ts4.getTime)), Row(null), Row(secs(ts3.getTime)),
Row(null)))
+
+ // invalid format
+ checkAnswer(df1.selectExpr(s"unix_timestamp(x, 'yyyy-MM-dd
aa:HH:ss')"), Seq(
+ Row(null), Row(null), Row(null), Row(null)))
+
+ // february
+ val y1 = "2016-02-29"
+ val y2 = "2017-02-29"
+ val ts5 = Timestamp.valueOf("2016-02-29 00:00:00")
+ val df2 = Seq(y1, y2).toDF("y")
+ checkAnswer(df2.select(unix_timestamp(col("y"), "yyyy-MM-dd")), Seq(
+ Row(secs(ts5.getTime)), Row(null)))
+
+ val now = sql("select unix_timestamp()").collect().head.getLong(0)
+ checkAnswer(
+ sql(s"select cast ($now as timestamp)"),
+ Row(new java.util.Date(TimeUnit.SECONDS.toMillis(now))))
+ }
+ }
}
test("to_unix_timestamp") {
- val date1 = Date.valueOf("2015-07-24")
- val date2 = Date.valueOf("2015-07-25")
- val ts1 = Timestamp.valueOf("2015-07-24 10:00:00.3")
- val ts2 = Timestamp.valueOf("2015-07-25 02:02:02.2")
- val s1 = "2015/07/24 10:00:00.5"
- val s2 = "2015/07/25 02:02:02.6"
- val ss1 = "2015-07-24 10:00:00"
- val ss2 = "2015-07-25 02:02:02"
- val fmt = "yyyy/MM/dd HH:mm:ss.S"
- val df = Seq((date1, ts1, s1, ss1), (date2, ts2, s2, ss2)).toDF("d", "ts",
"s", "ss")
- checkAnswer(df.selectExpr("to_unix_timestamp(ts)"), Seq(
- Row(secs(ts1.getTime)), Row(secs(ts2.getTime))))
- checkAnswer(df.selectExpr("to_unix_timestamp(ss)"), Seq(
- Row(secs(ts1.getTime)), Row(secs(ts2.getTime))))
- checkAnswer(df.selectExpr(s"to_unix_timestamp(d, '$fmt')"), Seq(
- Row(secs(date1.getTime)), Row(secs(date2.getTime))))
- checkAnswer(df.selectExpr(s"to_unix_timestamp(s, '$fmt')"), Seq(
- Row(secs(ts1.getTime)), Row(secs(ts2.getTime))))
-
- val x1 = "2015-07-24 10:00:00"
- val x2 = "2015-25-07 02:02:02"
- val x3 = "2015-07-24 25:02:02"
- val x4 = "2015-24-07 26:02:02"
- val ts3 = Timestamp.valueOf("2015-07-24 02:25:02")
- val ts4 = Timestamp.valueOf("2015-07-24 00:10:00")
-
- val df1 = Seq(x1, x2, x3, x4).toDF("x")
- checkAnswer(df1.selectExpr("to_unix_timestamp(x)"), Seq(
- Row(secs(ts1.getTime)), Row(null), Row(null), Row(null)))
- checkAnswer(df1.selectExpr(s"to_unix_timestamp(x, 'yyyy-MM-dd
mm:HH:ss')"), Seq(
- Row(secs(ts4.getTime)), Row(null), Row(secs(ts3.getTime)), Row(null)))
-
- // february
- val y1 = "2016-02-29"
- val y2 = "2017-02-29"
- val ts5 = Timestamp.valueOf("2016-02-29 00:00:00")
- val df2 = Seq(y1, y2).toDF("y")
- checkAnswer(df2.select(unix_timestamp(col("y"), "yyyy-MM-dd")), Seq(
- Row(secs(ts5.getTime)), Row(null)))
-
- // invalid format
- checkAnswer(df1.selectExpr(s"to_unix_timestamp(x, 'yyyy-MM-dd
bb:HH:ss')"), Seq(
- Row(null), Row(null), Row(null), Row(null)))
+ Seq(false, true).foreach { legacyParser =>
+ withSQLConf(SQLConf.LEGACY_TIME_PARSER_ENABLED.key ->
legacyParser.toString) {
+ val date1 = Date.valueOf("2015-07-24")
+ val date2 = Date.valueOf("2015-07-25")
+ val ts1 = Timestamp.valueOf("2015-07-24 10:00:00.3")
+ val ts2 = Timestamp.valueOf("2015-07-25 02:02:02.2")
+ val s1 = "2015/07/24 10:00:00.5"
+ val s2 = "2015/07/25 02:02:02.6"
+ val ss1 = "2015-07-24 10:00:00"
+ val ss2 = "2015-07-25 02:02:02"
+ val fmt = "yyyy/MM/dd HH:mm:ss.S"
+ val df = Seq((date1, ts1, s1, ss1), (date2, ts2, s2, ss2)).toDF("d",
"ts", "s", "ss")
+ checkAnswer(df.selectExpr("to_unix_timestamp(ts)"), Seq(
+ Row(secs(ts1.getTime)), Row(secs(ts2.getTime))))
+ checkAnswer(df.selectExpr("to_unix_timestamp(ss)"), Seq(
+ Row(secs(ts1.getTime)), Row(secs(ts2.getTime))))
+ checkAnswer(df.selectExpr(s"to_unix_timestamp(d, '$fmt')"), Seq(
+ Row(secs(date1.getTime)), Row(secs(date2.getTime))))
+ checkAnswer(df.selectExpr(s"to_unix_timestamp(s, '$fmt')"), Seq(
+ Row(secs(ts1.getTime)), Row(secs(ts2.getTime))))
+
+ val x1 = "2015-07-24 10:00:00"
+ val x2 = "2015-25-07 02:02:02"
+ val x3 = "2015-07-24 25:02:02"
+ val x4 = "2015-24-07 26:02:02"
+ val ts3 = Timestamp.valueOf("2015-07-24 02:25:02")
+ val ts4 = Timestamp.valueOf("2015-07-24 00:10:00")
+
+ val df1 = Seq(x1, x2, x3, x4).toDF("x")
+ checkAnswer(df1.selectExpr("to_unix_timestamp(x)"), Seq(
+ Row(secs(ts1.getTime)), Row(null), Row(null), Row(null)))
+ checkAnswer(df1.selectExpr(s"to_unix_timestamp(x, 'yyyy-MM-dd
mm:HH:ss')"), Seq(
+ Row(secs(ts4.getTime)), Row(null), Row(secs(ts3.getTime)),
Row(null)))
+
+ // february
+ val y1 = "2016-02-29"
+ val y2 = "2017-02-29"
+ val ts5 = Timestamp.valueOf("2016-02-29 00:00:00")
+ val df2 = Seq(y1, y2).toDF("y")
+ checkAnswer(df2.select(unix_timestamp(col("y"), "yyyy-MM-dd")), Seq(
+ Row(secs(ts5.getTime)), Row(null)))
+
+ // invalid format
+ checkAnswer(df1.selectExpr(s"to_unix_timestamp(x, 'yyyy-MM-dd
bb:HH:ss')"), Seq(
+ Row(null), Row(null), Row(null), Row(null)))
+ }
+ }
}
test("to_timestamp") {
- val date1 = Date.valueOf("2015-07-24")
- val date2 = Date.valueOf("2015-07-25")
- val ts_date1 = Timestamp.valueOf("2015-07-24 00:00:00")
- val ts_date2 = Timestamp.valueOf("2015-07-25 00:00:00")
- val ts1 = Timestamp.valueOf("2015-07-24 10:00:00")
- val ts2 = Timestamp.valueOf("2015-07-25 02:02:02")
- val s1 = "2015/07/24 10:00:00.5"
- val s2 = "2015/07/25 02:02:02.6"
- val ts1m = Timestamp.valueOf("2015-07-24 10:00:00.5")
- val ts2m = Timestamp.valueOf("2015-07-25 02:02:02.6")
- val ss1 = "2015-07-24 10:00:00"
- val ss2 = "2015-07-25 02:02:02"
- val fmt = "yyyy/MM/dd HH:mm:ss.S"
- val df = Seq((date1, ts1, s1, ss1), (date2, ts2, s2, ss2)).toDF("d", "ts",
"s", "ss")
-
- checkAnswer(df.select(to_timestamp(col("ss"))),
- df.select(unix_timestamp(col("ss")).cast("timestamp")))
- checkAnswer(df.select(to_timestamp(col("ss"))), Seq(
- Row(ts1), Row(ts2)))
- checkAnswer(df.select(to_timestamp(col("s"), fmt)), Seq(
- Row(ts1m), Row(ts2m)))
- checkAnswer(df.select(to_timestamp(col("ts"), fmt)), Seq(
- Row(ts1), Row(ts2)))
- checkAnswer(df.select(to_timestamp(col("d"), "yyyy-MM-dd")), Seq(
- Row(ts_date1), Row(ts_date2)))
+ Seq(false, true).foreach { legacyParser =>
+ withSQLConf(SQLConf.LEGACY_TIME_PARSER_ENABLED.key ->
legacyParser.toString) {
+ val date1 = Date.valueOf("2015-07-24")
+ val date2 = Date.valueOf("2015-07-25")
+ val ts_date1 = Timestamp.valueOf("2015-07-24 00:00:00")
+ val ts_date2 = Timestamp.valueOf("2015-07-25 00:00:00")
+ val ts1 = Timestamp.valueOf("2015-07-24 10:00:00")
+ val ts2 = Timestamp.valueOf("2015-07-25 02:02:02")
+ val s1 = "2015/07/24 10:00:00.5"
+ val s2 = "2015/07/25 02:02:02.6"
+ val ts1m = Timestamp.valueOf("2015-07-24 10:00:00.5")
+ val ts2m = Timestamp.valueOf("2015-07-25 02:02:02.6")
+ val ss1 = "2015-07-24 10:00:00"
+ val ss2 = "2015-07-25 02:02:02"
+ val fmt = "yyyy/MM/dd HH:mm:ss.S"
+ val df = Seq((date1, ts1, s1, ss1), (date2, ts2, s2, ss2)).toDF("d",
"ts", "s", "ss")
+
+ checkAnswer(df.select(to_timestamp(col("ss"))),
+ df.select(unix_timestamp(col("ss")).cast("timestamp")))
+ checkAnswer(df.select(to_timestamp(col("ss"))), Seq(
+ Row(ts1), Row(ts2)))
+ if (legacyParser) {
Review comment:
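Unfortunately, `SimpleDateFormat` doesn't work correctly with the pattern
`.S`: it treats `S` as a plain number of milliseconds, so a fraction such as
`.5` is parsed as 5 ms rather than 500 ms. In Spark 2.4, this discrepancy
wasn't visible in the test because `to_timestamp` truncated results to seconds.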
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
With regards,
Apache Git Services
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]