This is an automated email from the ASF dual-hosted git repository.
wenchen pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.0 by this push:
new d9ad111 [SPARK-31402][SQL] Fix rebasing of BCE dates/timestamps
d9ad111 is described below
commit d9ad11110428b2d89fcb2f0c0cbdf84a9970910a
Author: Max Gekk <[email protected]>
AuthorDate: Mon Apr 13 06:07:31 2020 +0000
[SPARK-31402][SQL] Fix rebasing of BCE dates/timestamps
In the PR, I propose to fallback to rebasing via local dates/timestamps for
days/micros of before common era (BCE).
It fixes the bug of rebasing dates/timestamps of BCE.
Yes
- By existing tests in `RebaseDateTimeSuite` and `DateTimeUtilsSuite`
- Added tests for negative years to `RebaseDateTimeSuite`
Closes #28172 from MaxGekk/fix-era-in-date-micros-rebasing.
Authored-by: Max Gekk <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
(cherry picked from commit cf63ad61f517a44a2b60c0e4f06cb387569039b3)
Signed-off-by: Wenchen Fan <[email protected]>
---
.../spark/sql/catalyst/util/RebaseDateTime.scala | 96 ++++++++++++++++++++--
.../sql/catalyst/util/RebaseDateTimeSuite.scala | 88 ++++++++++----------
2 files changed, 133 insertions(+), 51 deletions(-)
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/RebaseDateTime.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/RebaseDateTime.scala
index 7440787..f4a98cf 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/RebaseDateTime.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/RebaseDateTime.scala
@@ -17,7 +17,7 @@
package org.apache.spark.sql.catalyst.util
-import java.time.{LocalDateTime, ZoneId}
+import java.time.{LocalDate, LocalDateTime, ZoneId}
import java.time.temporal.ChronoField
import java.util.{Calendar, TimeZone}
@@ -66,15 +66,53 @@ object RebaseDateTime {
// The starting point is the `0001-01-01` (-719164 days since the epoch in
// Julian calendar). All dates before the staring point have the same
difference
// of 2 days in Julian and Proleptic Gregorian calendars.
+ // Rebasing switch days and diffs `julianGregDiffSwitchDay` and
`julianGregDiffs`
+ // was generated by the `localRebaseJulianToGregorianDays` function.
private val julianGregDiffSwitchDay = Array(
-719164, -682945, -646420, -609895, -536845, -500320, -463795,
-390745, -354220, -317695, -244645, -208120, -171595, -141427)
+ // The first days of Common Era (CE) which is mapped to the '0001-01-01'
date in Julian calendar.
+ private final val julianCommonEraStartDay = julianGregDiffSwitchDay(0)
+
/**
* Converts the given number of days since the epoch day 1970-01-01 to a
local date in Julian
* calendar, interprets the result as a local date in Proleptic Gregorian
calendar, and takes the
* number of days since the epoch from the Gregorian local date.
*
+ * @param days The number of days since the epoch in Julian calendar. It can
be negative.
+ * @return The rebased number of days in Gregorian calendar.
+ */
+ private[sql] def localRebaseJulianToGregorianDays(days: Int): Int = {
+ val utcCal = new Calendar.Builder()
+ // `gregory` is a hybrid calendar that supports both
+ // the Julian and Gregorian calendar systems
+ .setCalendarType("gregory")
+ .setTimeZone(TimeZoneUTC)
+ .setInstant(Math.multiplyExact(days, MILLIS_PER_DAY))
+ .build()
+ val localDate = LocalDate.of(
+ utcCal.get(Calendar.YEAR),
+ utcCal.get(Calendar.MONTH) + 1,
+ // The number of days will be added later to handle non-existing
+ // Julian dates in Proleptic Gregorian calendar.
+ // For example, 1000-02-29 exists in Julian calendar because 1000
+ // is a leap year but it is not a leap year in Gregorian calendar.
+ 1)
+ .`with`(ChronoField.ERA, utcCal.get(Calendar.ERA))
+ .plusDays(utcCal.get(Calendar.DAY_OF_MONTH) - 1)
+ Math.toIntExact(localDate.toEpochDay)
+ }
+
+ /**
+ * For dates of Before Common Era, the function converts the given number of
days since the epoch
+ * day 1970-01-01 to a local date in Julian calendar, interprets the result
as a local date in
+ * Proleptic Gregorian calendar, and takes the number of days since the
epoch from the Gregorian
+ * local date.
+ *
+ * For dates of Common Era, the function performs rebasing via the
pre-calculated array of diffs
+ * between Julian and Proleptic Gregorian calendars.
+ *
* This is used to guarantee backward compatibility, as Spark 2.4 and
earlier versions use
* Julian calendar for dates before 1582-10-15, while Spark 3.0 and later
use Proleptic Gregorian
* calendar. See SPARK-26651.
@@ -88,7 +126,11 @@ object RebaseDateTime {
* @return The rebased number of days in Gregorian calendar.
*/
def rebaseJulianToGregorianDays(days: Int): Int = {
- rebaseDays(julianGregDiffSwitchDay, julianGregDiffs, days)
+ if (days < julianCommonEraStartDay) {
+ localRebaseJulianToGregorianDays(days)
+ } else {
+ rebaseDays(julianGregDiffSwitchDay, julianGregDiffs, days)
+ }
}
// The differences in days between Proleptic Gregorian and Julian dates.
@@ -100,15 +142,46 @@ object RebaseDateTime {
// The starting point is the `0001-01-01` (-719162 days since the epoch in
// Proleptic Gregorian calendar). All dates before the staring point have
the same
// difference of -2 days in Proleptic Gregorian and Julian calendars.
+ // Rebasing switch days and diffs `gregJulianDiffSwitchDay` and
`gregJulianDiffs`
+ // was generated by the `localRebaseGregorianToJulianDays` function.
private val gregJulianDiffSwitchDay = Array(
-719162, -682944, -646420, -609896, -536847, -500323, -463799,
-390750, -354226, -317702, -244653, -208129, -171605, -141427)
+ // The first days of Common Era (CE) which is mapped to the '0001-01-01' date
+ // in Proleptic Gregorian calendar.
+ private final val gregorianCommonEraStartDay = gregJulianDiffSwitchDay(0)
+
/**
* Converts the given number of days since the epoch day 1970-01-01 to a
local date in Proleptic
* Gregorian calendar, interprets the result as a local date in Julian
calendar, and takes the
* number of days since the epoch from the Julian local date.
*
+ * @param days The number of days since the epoch in Proleptic Gregorian
calendar.
+ * It can be negative.
+ * @return The rebased number of days in Julian calendar.
+ */
+ private[sql] def localRebaseGregorianToJulianDays(days: Int): Int = {
+ val localDate = LocalDate.ofEpochDay(days)
+ val utcCal = new Calendar.Builder()
+ // `gregory` is a hybrid calendar that supports both
+ // the Julian and Gregorian calendar systems
+ .setCalendarType("gregory")
+ .setTimeZone(TimeZoneUTC)
+ .setDate(localDate.getYear, localDate.getMonthValue - 1,
localDate.getDayOfMonth)
+ .build()
+ Math.toIntExact(Math.floorDiv(utcCal.getTimeInMillis, MILLIS_PER_DAY))
+ }
+
+ /**
+ * For dates of Before Common Era, Converts the given number of days since
the epoch day
+ * 1970-01-01 to a local date in Proleptic Gregorian calendar, interprets
the result as a local
+ * date in Julian calendar, and takes the number of days since the epoch
from the Julian
+ * local date.
+ *
+ * For dates of Common Era, the function performs rebasing via the
pre-calculated array of diffs
+ * between Proleptic Gregorian and Julian calendars.
+ *
* This is used to guarantee backward compatibility, as Spark 2.4 and
earlier versions use
* Julian calendar for dates before 1582-10-15, while Spark 3.0 and later
use Proleptic Gregorian
* calendar. See SPARK-26651.
@@ -122,7 +195,11 @@ object RebaseDateTime {
* @return The rebased number of days since the epoch in Julian calendar.
*/
def rebaseGregorianToJulianDays(days: Int): Int = {
- rebaseDays(gregJulianDiffSwitchDay, gregJulianDiffs, days)
+ if (days < gregorianCommonEraStartDay) {
+ localRebaseGregorianToJulianDays(days)
+ } else {
+ rebaseDays(gregJulianDiffSwitchDay, gregJulianDiffs, days)
+ }
}
@@ -248,8 +325,8 @@ object RebaseDateTime {
/**
* An optimized version of [[rebaseGregorianToJulianMicros(ZoneId, Long)]].
This method leverages
* the pre-calculated rebasing maps to save calculation. If the rebasing map
doesn't contain
- * information about the current JVM system time zone, the function falls
back to the regular
- * unoptimized version.
+ * information about the current JVM system time zone or `micros` is related
to Before Common Era,
+ * the function falls back to the regular unoptimized version.
*
* Note: The function assumes that the input micros was derived from a local
timestamp
* at the default system JVM time zone in Proleptic Gregorian calendar.
@@ -262,7 +339,7 @@ object RebaseDateTime {
val timeZone = TimeZone.getDefault
val tzId = timeZone.getID
val rebaseRecord = gregJulianRebaseMap.getOrNull(tzId)
- if (rebaseRecord == null) {
+ if (rebaseRecord == null || micros < rebaseRecord.switches(0)) {
rebaseGregorianToJulianMicros(timeZone.toZoneId, micros)
} else {
rebaseMicros(rebaseRecord, micros)
@@ -308,6 +385,7 @@ object RebaseDateTime {
cal.get(Calendar.MINUTE),
cal.get(Calendar.SECOND),
(Math.floorMod(micros, MICROS_PER_SECOND) * NANOS_PER_MICROS).toInt)
+ .`with`(ChronoField.ERA, cal.get(Calendar.ERA))
.plusDays(cal.get(Calendar.DAY_OF_MONTH) - 1)
val zonedDateTime = localDateTime.atZone(zoneId)
// Assuming the daylight saving switchover time is 2:00, the local clock
will go back to
@@ -335,8 +413,8 @@ object RebaseDateTime {
/**
* An optimized version of [[rebaseJulianToGregorianMicros(ZoneId, Long)]].
This method leverages
* the pre-calculated rebasing maps to save calculation. If the rebasing map
doesn't contain
- * information about the current JVM system time zone, the function falls
back to the regular
- * unoptimized version.
+ * information about the current JVM system time zone or `micros` is related
to Before Common Era,
+ * the function falls back to the regular unoptimized version.
*
* Note: The function assumes that the input micros was derived from a local
timestamp
* at the default system JVM time zone in the Julian calendar.
@@ -349,7 +427,7 @@ object RebaseDateTime {
val timeZone = TimeZone.getDefault
val tzId = timeZone.getID
val rebaseRecord = julianGregRebaseMap.getOrNull(tzId)
- if (rebaseRecord == null) {
+ if (rebaseRecord == null || micros < rebaseRecord.switches(0)) {
rebaseJulianToGregorianMicros(timeZone.toZoneId, micros)
} else {
rebaseMicros(rebaseRecord, micros)
diff --git
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/RebaseDateTimeSuite.scala
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/RebaseDateTimeSuite.scala
index dcc80b1..1e70e60 100644
---
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/RebaseDateTimeSuite.scala
+++
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/RebaseDateTimeSuite.scala
@@ -19,7 +19,7 @@ package org.apache.spark.sql.catalyst.util
import java.sql.{Date, Timestamp}
import java.time.{Instant, LocalDate, LocalDateTime, ZoneId}
-import java.util.{Calendar, TimeZone}
+import java.util.TimeZone
import org.scalatest.Matchers
@@ -32,15 +32,18 @@ import org.apache.spark.sql.catalyst.util.RebaseDateTime._
class RebaseDateTimeSuite extends SparkFunSuite with Matchers with SQLHelper {
- private def parseToJulianMicros(s: String): Long = {
- val ts = Timestamp.valueOf(s)
+ private def toJulianMicros(ts: Timestamp): Long = {
val julianMicros = fromMillis(ts.getTime) +
((ts.getNanos / NANOS_PER_MICROS) % MICROS_PER_MILLIS)
julianMicros
}
+ private def parseToJulianMicros(s: String): Long =
toJulianMicros(Timestamp.valueOf(s))
+ private def toGregorianMicros(ldt: LocalDateTime, zoneId: ZoneId): Long = {
+ instantToMicros(ldt.atZone(zoneId).toInstant)
+ }
private def parseToGregMicros(s: String, zoneId: ZoneId): Long = {
- instantToMicros(LocalDateTime.parse(s).atZone(zoneId).toInstant)
+ toGregorianMicros(LocalDateTime.parse(s), zoneId)
}
test("rebase julian to/from gregorian micros") {
@@ -141,53 +144,17 @@ class RebaseDateTimeSuite extends SparkFunSuite with
Matchers with SQLHelper {
}
test("optimization of days rebasing - Gregorian to Julian") {
- // Rebasing switch days and diffs `gregJulianDiffSwitchDay` and
`gregJulianDiffs`
- // in `RebaseDateTime` was generated by using this function.
- def refRebaseGregorianToJulianDays(days: Int): Int = {
- val localDate = LocalDate.ofEpochDay(days)
- val utcCal = new Calendar.Builder()
- // `gregory` is a hybrid calendar that supports both
- // the Julian and Gregorian calendar systems
- .setCalendarType("gregory")
- .setTimeZone(TimeZoneUTC)
- .setDate(localDate.getYear, localDate.getMonthValue - 1,
localDate.getDayOfMonth)
- .build()
- Math.toIntExact(Math.floorDiv(utcCal.getTimeInMillis, MILLIS_PER_DAY))
- }
-
val start = localDateToDays(LocalDate.of(1, 1, 1))
val end = localDateToDays(LocalDate.of(2030, 1, 1))
var days = start
while (days < end) {
- assert(rebaseGregorianToJulianDays(days) ===
refRebaseGregorianToJulianDays(days))
+ assert(rebaseGregorianToJulianDays(days) ===
localRebaseGregorianToJulianDays(days))
days += 1
}
}
test("optimization of days rebasing - Julian to Gregorian") {
- // Rebasing switch days and diffs `julianGregDiffSwitchDay` and
`julianGregDiffs`
- // in `RebaseDateTime` was generated by using this function.
- def refRebaseJulianToGregorianDays(days: Int): Int = {
- val utcCal = new Calendar.Builder()
- // `gregory` is a hybrid calendar that supports both
- // the Julian and Gregorian calendar systems
- .setCalendarType("gregory")
- .setTimeZone(TimeZoneUTC)
- .setInstant(Math.multiplyExact(days, MILLIS_PER_DAY))
- .build()
- val localDate = LocalDate.of(
- utcCal.get(Calendar.YEAR),
- utcCal.get(Calendar.MONTH) + 1,
- // The number of days will be added later to handle non-existing
- // Julian dates in Proleptic Gregorian calendar.
- // For example, 1000-02-29 exists in Julian calendar because 1000
- // is a leap year but it is not a leap year in Gregorian calendar.
- 1)
- .plusDays(utcCal.get(Calendar.DAY_OF_MONTH) - 1)
- Math.toIntExact(localDate.toEpochDay)
- }
-
val start = rebaseGregorianToJulianDays(
localDateToDays(LocalDate.of(1, 1, 1)))
val end = rebaseGregorianToJulianDays(
@@ -195,7 +162,7 @@ class RebaseDateTimeSuite extends SparkFunSuite with
Matchers with SQLHelper {
var days = start
while (days < end) {
- assert(rebaseJulianToGregorianDays(days) ===
refRebaseJulianToGregorianDays(days))
+ assert(rebaseJulianToGregorianDays(days) ===
localRebaseJulianToGregorianDays(days))
days += 1
}
}
@@ -360,4 +327,41 @@ class RebaseDateTimeSuite extends SparkFunSuite with
Matchers with SQLHelper {
dir = "/Users/maximgekk/tmp",
fileName = "julian-gregorian-rebase-micros.json")
}
+
+ test("rebase gregorian to/from julian days - BCE era") {
+ outstandingZoneIds.foreach { zid =>
+ withDefaultTimeZone(zid) {
+ Seq(
+ (-1100, 1, 1),
+ (-1044, 3, 5),
+ (-44, 3, 5)).foreach { case (year, month, day) =>
+ val julianDays = fromJavaDateLegacy(new Date(year - 1900, month - 1,
day))
+ val gregorianDays = localDateToDays(LocalDate.of(year, month, day))
+
+ assert(rebaseGregorianToJulianDays(gregorianDays) === julianDays)
+ assert(rebaseJulianToGregorianDays(julianDays) === gregorianDays)
+ }
+ }
+ }
+ }
+
+ test("rebase gregorian to/from julian micros - BCE era") {
+ outstandingZoneIds.foreach { zid =>
+ withDefaultTimeZone(zid) {
+ Seq(
+ (-1100, 1, 1, 1, 2, 3, 0),
+ (-1044, 3, 5, 0, 0, 0, 123456000),
+ (-44, 3, 5, 23, 59, 59, 99999000)
+ ).foreach { case (year, month, day, hour, minute, second, nanos) =>
+ val julianMicros = toJulianMicros(new Timestamp(
+ year - 1900, month - 1, day, hour, minute, second, nanos))
+ val gregorianMicros = toGregorianMicros(LocalDateTime.of(
+ year, month, day, hour, minute, second, nanos), zid)
+
+ assert(rebaseGregorianToJulianMicros(gregorianMicros) ===
julianMicros)
+ assert(rebaseJulianToGregorianMicros(julianMicros) ===
gregorianMicros)
+ }
+ }
+ }
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]