This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
     new 3702327  [SPARK-31423][SQL] Fix rebasing of not-existed dates
3702327 is described below

commit 37023273fe0171ab758a81956dcc0ae9f8d2253b
Author: Max Gekk <[email protected]>
AuthorDate: Wed Apr 15 16:33:56 2020 +0000

    [SPARK-31423][SQL] Fix rebasing of not-existed dates
    
    ### What changes were proposed in this pull request?
    In this PR, I propose to change the rebasing of non-existent dates in the hybrid 
calendar (Julian + Gregorian since 1582-10-15), i.e. dates in the open range 
(1582-10-04, 1582-10-15). Non-existent dates from this range are shifted to the 
first valid date in the hybrid calendar, 1582-10-15. The changes affect only 
`rebaseGregorianToJulianDays()` because the reverse rebasing from hybrid dates 
to Proleptic Gregorian dates does not have this problem.
    
    ### Why are the changes needed?
    Currently, non-existent dates are shifted by the standard difference between 
the Julian and Gregorian calendars on 1582-10-04, for example 1582-10-14 -> 
1582-10-24. That contradicts how non-existent dates are shifted in other cases, 
for example:
    ```
    scala> sql("select date'1990-9-31'").show
    +-----------------+
    |DATE '1990-10-01'|
    +-----------------+
    |       1990-10-01|
    +-----------------+
    ```
    
    ### Does this PR introduce any user-facing change?
    Yes, this affects the conversion of Spark SQL `DATE` values to external 
dates based on a non-Proleptic Gregorian calendar. For example, when saving the 
1582-10-14 date to ORC files, it will be shifted to the next valid date, 
1582-10-15.
    
    ### How was this patch tested?
    - Added tests to `RebaseDateTimeSuite` and to `OrcSourceSuite`
    - By existing test suites `DateTimeUtilsSuite`, `DateFunctionsSuite`, 
`DateExpressionsSuite`, `CollectionExpressionsSuite`, `ParquetIOSuite`.
    
    Closes #28225 from MaxGekk/fix-not-exist-dates.
    
    Authored-by: Max Gekk <[email protected]>
    Signed-off-by: Wenchen Fan <[email protected]>
    (cherry picked from commit 2b10d70bad30fb7b7c293338c2acc908031af0b8)
    Signed-off-by: Wenchen Fan <[email protected]>
---
 .../spark/sql/catalyst/util/RebaseDateTime.scala   | 16 ++++++++++++----
 .../sql/catalyst/util/RebaseDateTimeSuite.scala    | 22 ++++++++++++++++++++++
 .../execution/datasources/orc/OrcSourceSuite.scala |  8 +++++---
 3 files changed, 39 insertions(+), 7 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/RebaseDateTime.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/RebaseDateTime.scala
index 50b552e..6338a59 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/RebaseDateTime.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/RebaseDateTime.scala
@@ -131,7 +131,8 @@ object RebaseDateTime {
   // The differences in days between Proleptic Gregorian and Julian dates.
   // The diff at the index `i` is applicable for all days in the date interval:
   // [gregJulianDiffSwitchDay(i), gregJulianDiffSwitchDay(i+1))
-  private val gregJulianDiffs = Array(-2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 
10, 0)
+  private val gregJulianDiffs = Array(
+    -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)
   // The sorted days in Proleptic Gregorian calendar when difference in days 
between
   // Proleptic Gregorian and Julian was changed.
   // The starting point is the `0001-01-01` (-719162 days since the epoch in
@@ -139,13 +140,17 @@ object RebaseDateTime {
   // Rebasing switch days and diffs `gregJulianDiffSwitchDay` and 
`gregJulianDiffs`
   // was generated by the `localRebaseGregorianToJulianDays` function.
   private val gregJulianDiffSwitchDay = Array(
-    -719162, -682944, -646420, -609896, -536847, -500323, -463799,
-    -390750, -354226, -317702, -244653, -208129, -171605, -141427)
+    -719162, -682944, -646420, -609896, -536847, -500323, -463799, -390750,
+    -354226, -317702, -244653, -208129, -171605, -141436, -141435, -141434,
+    -141433, -141432, -141431, -141430, -141429, -141428, -141427)
 
   // The first days of Common Era (CE) which is mapped to the '0001-01-01' date
   // in Proleptic Gregorian calendar.
   private final val gregorianCommonEraStartDay = gregJulianDiffSwitchDay(0)
 
+  private final val gregorianStartDay = LocalDate.of(1582, 10, 15)
+  private final val julianEndDay = LocalDate.of(1582, 10, 4)
+
   /**
    * Converts the given number of days since the epoch day 1970-01-01 to a 
local date in Proleptic
    * Gregorian calendar, interprets the result as a local date in Julian 
calendar, and takes the
@@ -165,7 +170,10 @@ object RebaseDateTime {
    * @return The rebased number of days in Julian calendar.
    */
   private[sql] def localRebaseGregorianToJulianDays(days: Int): Int = {
-    val localDate = LocalDate.ofEpochDay(days)
+    var localDate = LocalDate.ofEpochDay(days)
+    if (localDate.isAfter(julianEndDay) && 
localDate.isBefore(gregorianStartDay)) {
+      localDate = gregorianStartDay
+    }
     val utcCal = new Calendar.Builder()
       // `gregory` is a hybrid calendar that supports both
       // the Julian and Gregorian calendar systems
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/RebaseDateTimeSuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/RebaseDateTimeSuite.scala
index 1e70e60..73ca761 100644
--- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/RebaseDateTimeSuite.scala
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/RebaseDateTimeSuite.scala
@@ -364,4 +364,26 @@ class RebaseDateTimeSuite extends SparkFunSuite with 
Matchers with SQLHelper {
       }
     }
   }
+
+  test("rebase not-existed dates in the hybrid calendar") {
+    outstandingZoneIds.foreach { zid =>
+      withDefaultTimeZone(zid) {
+        Seq(
+          "1582-10-04" -> "1582-10-04",
+          "1582-10-05" -> "1582-10-15", "1582-10-06" -> "1582-10-15", 
"1582-10-07" -> "1582-10-15",
+          "1582-10-08" -> "1582-10-15", "1582-10-09" -> "1582-10-15", 
"1582-10-11" -> "1582-10-15",
+          "1582-10-12" -> "1582-10-15", "1582-10-13" -> "1582-10-15", 
"1582-10-14" -> "1582-10-15",
+          "1582-10-15" -> "1582-10-15").foreach { case (hybridDate, gregDate) 
=>
+          withClue(s"tz = ${zid.getId} hybrid date = $hybridDate greg date = 
$gregDate") {
+            val date = Date.valueOf(gregDate)
+            val hybridDays = fromJavaDateLegacy(date)
+            val gregorianDays = localDateToDays(LocalDate.parse(hybridDate))
+
+            assert(localRebaseGregorianToJulianDays(gregorianDays) === 
hybridDays)
+            assert(rebaseGregorianToJulianDays(gregorianDays) === hybridDays)
+          }
+        }
+      }
+    }
+  }
 }
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala
 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala
index 0b7500c..f0ce6d5 100644
--- 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala
+++ 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala
@@ -493,17 +493,19 @@ abstract class OrcSuite extends OrcTest with 
BeforeAndAfterAll {
     }
   }
 
-  test("SPARK-31238: rebasing dates in write") {
+  test("SPARK-31238, SPARK-31423: rebasing dates in write") {
     withTempPath { dir =>
       val path = dir.getAbsolutePath
-      Seq("1001-01-01").toDF("dateS")
+      Seq("1001-01-01", "1582-10-10").toDF("dateS")
         .select($"dateS".cast("date").as("date"))
         .write
         .orc(path)
 
       Seq(false, true).foreach { vectorized =>
         withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> 
vectorized.toString) {
-          checkAnswer(spark.read.orc(path), Row(Date.valueOf("1001-01-01")))
+          checkAnswer(
+            spark.read.orc(path),
+            Seq(Row(Date.valueOf("1001-01-01")), 
Row(Date.valueOf("1582-10-15"))))
         }
       }
     }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to