This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new bd26429  [SPARK-31641][SQL] Fix days conversions by JSON legacy parser
bd26429 is described below

commit bd264299317bba91f2dc1dc27fd51e6bc0609d66
Author: Max Gekk <max.g...@gmail.com>
AuthorDate: Tue May 5 14:15:31 2020 +0000

    [SPARK-31641][SQL] Fix days conversions by JSON legacy parser
    
    ### What changes were proposed in this pull request?
    Perform days rebasing while converting days from a JSON string field. In 
Spark 2.4 and earlier versions, the days are interpreted as days since the 
epoch in the hybrid calendar (Julian + Gregorian since 1582-10-15). Since Spark 
3.0, the base calendar has been switched to the Proleptic Gregorian calendar, 
so the days should be rebased to represent the same local date.
    
    ### Why are the changes needed?
    The changes fix a bug and restore compatibility with Spark 2.4, in which the query below returned the correct result:
    ```scala
    scala> spark.read.schema("d date").json(Seq("{'d': '-141704'}").toDS).show
    +----------+
    |         d|
    +----------+
    |1582-01-01|
    +----------+
    ```
    
    ### Does this PR introduce _any_ user-facing change?
    Yes.
    
    Before:
    ```scala
    scala> spark.read.schema("d date").json(Seq("{'d': '-141704'}").toDS).show
    +----------+
    |         d|
    +----------+
    |1582-01-11|
    +----------+
    ```
    
    After:
    ```scala
    scala> spark.read.schema("d date").json(Seq("{'d': '-141704'}").toDS).show
    +----------+
    |         d|
    +----------+
    |1582-01-01|
    +----------+
    ```
    
    ### How was this patch tested?
    Add a test to `JsonSuite`.
    
    Closes #28453 from MaxGekk/json-rebase-legacy-days.
    
    Authored-by: Max Gekk <max.g...@gmail.com>
    Signed-off-by: Wenchen Fan <wenc...@databricks.com>
---
 .../org/apache/spark/sql/catalyst/json/JacksonParser.scala   |  2 +-
 .../spark/sql/execution/datasources/json/JsonSuite.scala     | 12 ++++++++----
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala
index 8965a81..a52c345 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala
@@ -259,7 +259,7 @@ class JacksonParser(
                 // In Spark 1.5.0, we store the data as number of days since 
epoch in string.
                 // So, we just convert it to Int.
                 try {
-                  parser.getText.toInt
+                  
RebaseDateTime.rebaseJulianToGregorianDays(parser.getText.toInt)
                 } catch {
                   case _: NumberFormatException => throw e
                 }
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala
 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala
index 8d4d89f..dcea483 100644
--- 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala
+++ 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala
@@ -2653,13 +2653,17 @@ abstract class JsonSuite extends QueryTest with 
SharedSparkSession with TestJson
     }
   }
 
-  test("SPARK-30960: parse date/timestamp string with legacy format") {
-    val ds = Seq("{'t': '2020-1-12 3:23:34.12', 'd': '2020-1-12 T', 'd2': 
'12345'}").toDS()
-    val json = spark.read.schema("t timestamp, d date, d2 date").json(ds)
+  test("SPARK-30960, SPARK-31641: parse date/timestamp string with legacy 
format") {
+    val julianDay = -141704 // 1582-01-01 in Julian calendar
+    val ds = Seq(
+      s"{'t': '2020-1-12 3:23:34.12', 'd': '2020-1-12 T', 'd2': '12345', 'd3': 
'$julianDay'}"
+    ).toDS()
+    val json = spark.read.schema("t timestamp, d date, d2 date, d3 
date").json(ds)
     checkAnswer(json, Row(
       Timestamp.valueOf("2020-1-12 3:23:34.12"),
       Date.valueOf("2020-1-12"),
-      Date.valueOf(LocalDate.ofEpochDay(12345))))
+      Date.valueOf(LocalDate.ofEpochDay(12345)),
+      Date.valueOf("1582-01-01")))
   }
 
   test("exception mode for parsing date/timestamp string") {


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to