This is an automated email from the ASF dual-hosted git repository.
wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new f620996 [SPARK-36418][SQL] Use CAST in parsing of dates/timestamps
with default pattern
f620996 is described below
commit f620996142ba312f7e52f75476b1b18be667ffdf
Author: Max Gekk <[email protected]>
AuthorDate: Mon Aug 16 23:29:33 2021 +0800
[SPARK-36418][SQL] Use CAST in parsing of dates/timestamps with default
pattern
### What changes were proposed in this pull request?
In the PR, I propose to use the `CAST` logic when the pattern is not
specified in `DateFormatter` or `TimestampFormatter`. In particular, invoke
`DateTimeUtils.stringToTimestampAnsi()` or `stringToDateAnsi()` in that case.
### Why are the changes needed?
1. This can improve user experience with Spark SQL by making the default
date/timestamp parsers more flexible and tolerant of their inputs.
2. We make the default case consistent with the behavior of the `CAST`
expression, which makes the implementation more uniform.
### Does this PR introduce _any_ user-facing change?
The changes shouldn't introduce behavior changes in regular cases, but they can
affect corner cases. The new implementation is able to parse more
dates/timestamps by default. For instance, the old (current) date parser can
recognize dates only in the format **yyyy-MM-dd**, but the new one can handle:
* `[+-]yyyy*`
* `[+-]yyyy*-[m]m`
* `[+-]yyyy*-[m]m-[d]d`
* `[+-]yyyy*-[m]m-[d]d `
* `[+-]yyyy*-[m]m-[d]d *`
* `[+-]yyyy*-[m]m-[d]dT*`
Similarly for timestamps. The old (current) timestamp formatter is able to
parse timestamps only in the format **yyyy-MM-dd HH:mm:ss** by default, but the
new implementation can handle:
* `[+-]yyyy*`
* `[+-]yyyy*-[m]m`
* `[+-]yyyy*-[m]m-[d]d`
* `[+-]yyyy*-[m]m-[d]d `
* `[+-]yyyy*-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
* `[+-]yyyy*-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
* `[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
* `T[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
### How was this patch tested?
By running the affected test suites:
```
$ build/sbt "test:testOnly *ImageFileFormatSuite"
$ build/sbt "test:testOnly *ParquetV2PartitionDiscoverySuite"
```
Closes #33709 from MaxGekk/datetime-cast-default-pattern.
Authored-by: Max Gekk <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
---
.../ml/source/image/ImageFileFormatSuite.scala | 17 ++++++-----
.../spark/sql/catalyst/util/DateFormatter.scala | 33 ++++++++++++++++++---
.../sql/catalyst/util/TimestampFormatter.scala | 34 +++++++++++++++++++---
.../sql/catalyst/util/DateFormatterSuite.scala | 22 ++++++++++++++
.../catalyst/util/TimestampFormatterSuite.scala | 25 ++++++++++++++++
.../parquet/ParquetPartitionDiscoverySuite.scala | 2 +-
6 files changed, 116 insertions(+), 17 deletions(-)
diff --git
a/mllib/src/test/scala/org/apache/spark/ml/source/image/ImageFileFormatSuite.scala
b/mllib/src/test/scala/org/apache/spark/ml/source/image/ImageFileFormatSuite.scala
index 0ec2747..7dca81e 100644
---
a/mllib/src/test/scala/org/apache/spark/ml/source/image/ImageFileFormatSuite.scala
+++
b/mllib/src/test/scala/org/apache/spark/ml/source/image/ImageFileFormatSuite.scala
@@ -19,6 +19,7 @@ package org.apache.spark.ml.source.image
import java.net.URI
import java.nio.file.Paths
+import java.sql.Date
import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.image.ImageSchema._
@@ -95,14 +96,14 @@ class ImageFileFormatSuite extends SparkFunSuite with
MLlibTestSparkContext {
.collect()
assert(Set(result: _*) === Set(
- Row("29.5.a_b_EGDP022204.jpg", "kittens", "2018-01"),
- Row("54893.jpg", "kittens", "2018-02"),
- Row("DP153539.jpg", "kittens", "2018-02"),
- Row("DP802813.jpg", "kittens", "2018-02"),
- Row("BGRA.png", "multichannel", "2018-01"),
- Row("BGRA_alpha_60.png", "multichannel", "2018-01"),
- Row("chr30.4.184.jpg", "multichannel", "2018-02"),
- Row("grayscale.jpg", "multichannel", "2018-02")
+ Row("29.5.a_b_EGDP022204.jpg", "kittens", Date.valueOf("2018-01-01")),
+ Row("54893.jpg", "kittens", Date.valueOf("2018-02-01")),
+ Row("DP153539.jpg", "kittens", Date.valueOf("2018-02-01")),
+ Row("DP802813.jpg", "kittens", Date.valueOf("2018-02-01")),
+ Row("BGRA.png", "multichannel", Date.valueOf("2018-01-01")),
+ Row("BGRA_alpha_60.png", "multichannel", Date.valueOf("2018-01-01")),
+ Row("chr30.4.184.jpg", "multichannel", Date.valueOf("2018-02-01")),
+ Row("grayscale.jpg", "multichannel", Date.valueOf("2018-02-01"))
))
}
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateFormatter.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateFormatter.scala
index 76bc196..d9ccf30 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateFormatter.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateFormatter.scala
@@ -26,6 +26,7 @@ import org.apache.commons.lang3.time.FastDateFormat
import org.apache.spark.sql.catalyst.util.DateTimeUtils._
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.internal.SQLConf.LegacyBehaviorPolicy._
+import org.apache.spark.unsafe.types.UTF8String
sealed trait DateFormatter extends Serializable {
def parse(s: String): Int // returns days since epoch
@@ -48,7 +49,8 @@ class Iso8601DateFormatter(
private lazy val formatter = getOrCreateFormatter(pattern, locale, isParsing)
@transient
- private lazy val legacyFormatter = DateFormatter.getLegacyFormatter(pattern,
locale, legacyFormat)
+ protected lazy val legacyFormatter =
+ DateFormatter.getLegacyFormatter(pattern, locale, legacyFormat)
override def parse(s: String): Int = {
try {
@@ -79,6 +81,28 @@ class Iso8601DateFormatter(
}
}
+/**
+ * The formatter for dates which doesn't require users to specify a pattern.
While formatting,
+ * it uses the default pattern [[DateFormatter.defaultPattern]]. In parsing,
it follows the CAST
+ * logic in conversion of strings to Catalyst's DateType.
+ *
+ * @param locale The locale overrides the system locale and is used in
formatting.
+ * @param legacyFormat Defines the formatter used for legacy dates.
+ * @param isParsing Whether the formatter is used for parsing (`true`) or for
formatting (`false`).
+ */
+class DefaultDateFormatter(
+ locale: Locale,
+ legacyFormat: LegacyDateFormats.LegacyDateFormat,
+ isParsing: Boolean)
+ extends Iso8601DateFormatter(DateFormatter.defaultPattern, locale,
legacyFormat, isParsing) {
+
+ override def parse(s: String): Int = {
+ try {
+ DateTimeUtils.stringToDateAnsi(UTF8String.fromString(s))
+ } catch checkParsedDiff(s, legacyFormatter.parse)
+ }
+}
+
trait LegacyDateFormatter extends DateFormatter {
def parseToDate(s: String): Date
@@ -151,11 +175,12 @@ object DateFormatter {
locale: Locale = defaultLocale,
legacyFormat: LegacyDateFormat = LENIENT_SIMPLE_DATE_FORMAT,
isParsing: Boolean): DateFormatter = {
- val pattern = format.getOrElse(defaultPattern)
if (SQLConf.get.legacyTimeParserPolicy == LEGACY) {
- getLegacyFormatter(pattern, locale, legacyFormat)
+ getLegacyFormatter(format.getOrElse(defaultPattern), locale,
legacyFormat)
} else {
- val df = new Iso8601DateFormatter(pattern, locale, legacyFormat,
isParsing)
+ val df = format
+ .map(new Iso8601DateFormatter(_, locale, legacyFormat, isParsing))
+ .getOrElse(new DefaultDateFormatter(locale, legacyFormat, isParsing))
df.validatePatternString()
df
}
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TimestampFormatter.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TimestampFormatter.scala
index 35c91a62..fb8502a 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TimestampFormatter.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TimestampFormatter.scala
@@ -34,6 +34,7 @@ import org.apache.spark.sql.catalyst.util.RebaseDateTime._
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.internal.SQLConf.LegacyBehaviorPolicy._
import org.apache.spark.sql.types.Decimal
+import org.apache.spark.unsafe.types.UTF8String
sealed trait TimestampFormatter extends Serializable {
/**
@@ -161,6 +162,31 @@ class Iso8601TimestampFormatter(
}
/**
+ * The formatter for timestamps which doesn't require users to specify a
pattern. While formatting,
+ * it uses the default pattern [[TimestampFormatter.defaultPattern()]]. In
parsing, it follows
+ * the CAST logic in conversion of strings to Catalyst's TimestampType.
+ *
+ * @param zoneId The time zone ID in which timestamps should be formatted or
parsed.
+ * @param locale The locale overrides the system locale and is used in
formatting.
+ * @param legacyFormat Defines the formatter used for legacy timestamps.
+ * @param isParsing Whether the formatter is used for parsing (`true`) or for
formatting (`false`).
+ */
+class DefaultTimestampFormatter(
+ zoneId: ZoneId,
+ locale: Locale,
+ legacyFormat: LegacyDateFormat = LENIENT_SIMPLE_DATE_FORMAT,
+ isParsing: Boolean)
+ extends Iso8601TimestampFormatter(
+ TimestampFormatter.defaultPattern(), zoneId, locale, legacyFormat,
isParsing) {
+
+ override def parse(s: String): Long = {
+ try {
+ DateTimeUtils.stringToTimestampAnsi(UTF8String.fromString(s), zoneId)
+ } catch checkParsedDiff(s, legacyFormatter.parse)
+ }
+}
+
+/**
* The formatter parses/formats timestamps according to the pattern
`yyyy-MM-dd HH:mm:ss.[..fff..]`
* where `[..fff..]` is a fraction of second up to microsecond resolution. The
formatter does not
* output trailing zeros in the fraction. For example, the timestamp
`2019-03-05 15:00:01.123400` is
@@ -341,12 +367,12 @@ object TimestampFormatter {
legacyFormat: LegacyDateFormat = LENIENT_SIMPLE_DATE_FORMAT,
isParsing: Boolean,
forTimestampNTZ: Boolean = false): TimestampFormatter = {
- val pattern = format.getOrElse(defaultPattern)
val formatter = if (SQLConf.get.legacyTimeParserPolicy == LEGACY &&
!forTimestampNTZ) {
- getLegacyFormatter(pattern, zoneId, locale, legacyFormat)
+ getLegacyFormatter(format.getOrElse(defaultPattern), zoneId, locale,
legacyFormat)
} else {
- new Iso8601TimestampFormatter(
- pattern, zoneId, locale, legacyFormat, isParsing)
+ format
+ .map(new Iso8601TimestampFormatter(_, zoneId, locale, legacyFormat,
isParsing))
+ .getOrElse(new DefaultTimestampFormatter(zoneId, locale, legacyFormat,
isParsing))
}
formatter.validatePatternString(checkLegacy = !forTimestampNTZ)
formatter
diff --git
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateFormatterSuite.scala
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateFormatterSuite.scala
index 4c22e67..44c90db 100644
---
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateFormatterSuite.scala
+++
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateFormatterSuite.scala
@@ -188,4 +188,26 @@ class DateFormatterSuite extends DatetimeFormatterSuite {
// SparkUpgradeException here.
intercept[SparkUpgradeException](formatter.parse("02-29"))
}
+
+ test("SPARK-36418: default parsing w/o pattern") {
+ val formatter = new DefaultDateFormatter(
+ locale = DateFormatter.defaultLocale,
+ legacyFormat = LegacyDateFormats.SIMPLE_DATE_FORMAT,
+ isParsing = true)
+ Seq(
+ "-0042-3-4" -> LocalDate.of(-42, 3, 4),
+ "1000" -> LocalDate.of(1000, 1, 1),
+ "1582-10-4" -> LocalDate.of(1582, 10, 4),
+ "1583-1-1 " -> LocalDate.of(1583, 1, 1),
+ "1970-01-1 00:00" -> LocalDate.of(1970, 1, 1),
+ "2021-8-12T18:31:50" -> LocalDate.of(2021, 8, 12)
+ ).foreach { case (inputStr, ld) =>
+ assert(formatter.parse(inputStr) === ld.toEpochDay)
+ }
+
+ val errMsg = intercept[DateTimeException] {
+ formatter.parse("x123")
+ }.getMessage
+ assert(errMsg.contains("Cannot cast x123 to DateType"))
+ }
}
diff --git
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/TimestampFormatterSuite.scala
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/TimestampFormatterSuite.scala
index 6e93866..661e624 100644
---
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/TimestampFormatterSuite.scala
+++
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/TimestampFormatterSuite.scala
@@ -431,4 +431,29 @@ class TimestampFormatterSuite extends
DatetimeFormatterSuite {
val e2 = intercept[ArithmeticException](formatter.parse("-290308"))
assert(e2.getMessage === "long overflow")
}
+
+ test("SPARK-36418: default parsing w/o pattern") {
+ outstandingZoneIds.foreach { zoneId =>
+ val formatter = new DefaultTimestampFormatter(
+ zoneId,
+ locale = DateFormatter.defaultLocale,
+ legacyFormat = LegacyDateFormats.SIMPLE_DATE_FORMAT,
+ isParsing = true)
+ Seq(
+ "-0042-3-4" -> LocalDateTime.of(-42, 3, 4, 0, 0, 0),
+ "1000" -> LocalDateTime.of(1000, 1, 1, 0, 0, 0),
+ "1582-10-4" -> LocalDateTime.of(1582, 10, 4, 0, 0, 0),
+ "1583-1-1 " -> LocalDateTime.of(1583, 1, 1, 0, 0, 0),
+ "1970-01-1 01:02:3" -> LocalDateTime.of(1970, 1, 1, 1, 2, 3),
+ "2021-8-12T18:31:50" -> LocalDateTime.of(2021, 8, 12, 18, 31, 50)
+ ).foreach { case (inputStr, ldt) =>
+ assert(formatter.parse(inputStr) ===
DateTimeTestUtils.localDateTimeToMicros(ldt, zoneId))
+ }
+
+ val errMsg = intercept[DateTimeException] {
+ formatter.parse("x123")
+ }.getMessage
+ assert(errMsg.contains("Cannot cast x123 to TimestampType"))
+ }
+ }
}
diff --git
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala
index a8507b7..a3aa74d 100644
---
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala
+++
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala
@@ -1067,7 +1067,7 @@ abstract class ParquetPartitionDiscoverySuite
test("SPARK-23436: invalid Dates should be inferred as String in partition
inference") {
withTempPath { path =>
- val data = Seq(("1", "2018-01", "2018-01-01-04", "test"))
+ val data = Seq(("1", "2018-41", "2018-01-01-04", "test"))
.toDF("id", "date_month", "date_hour", "data")
data.write.partitionBy("date_month",
"date_hour").parquet(path.getAbsolutePath)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]