bersprockets commented on code in PR #36871:
URL: https://github.com/apache/spark/pull/36871#discussion_r906320780
##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala:
##########
@@ -197,34 +199,50 @@ class UnivocityParser(
Decimal(decimalParser(datum), dt.precision, dt.scale)
}
- case _: TimestampType => (d: String) =>
+ case _: DateType => (d: String) =>
nullSafeDatum(d, name, nullable, options) { datum =>
try {
- timestampFormatter.parse(datum)
+ dateFormatter.parse(datum)
} catch {
case NonFatal(e) =>
// If fails to parse, then tries the way used in 2.0 and 1.x for
backwards
// compatibility.
val str =
DateTimeUtils.cleanLegacyTimestampStr(UTF8String.fromString(datum))
- DateTimeUtils.stringToTimestamp(str,
options.zoneId).getOrElse(throw e)
+ DateTimeUtils.stringToDate(str).getOrElse(throw e)
}
}
- case _: TimestampNTZType => (d: String) =>
- nullSafeDatum(d, name, nullable, options) { datum =>
- timestampNTZFormatter.parseWithoutTimeZone(datum, false)
- }
-
- case _: DateType => (d: String) =>
+ case _: TimestampType => (d: String) =>
nullSafeDatum(d, name, nullable, options) { datum =>
try {
- dateFormatter.parse(datum)
+ timestampFormatter.parse(datum)
} catch {
case NonFatal(e) =>
// If fails to parse, then tries the way used in 2.0 and 1.x for
backwards
// compatibility.
val str =
DateTimeUtils.cleanLegacyTimestampStr(UTF8String.fromString(datum))
- DateTimeUtils.stringToDate(str).getOrElse(throw e)
+ DateTimeUtils.stringToTimestamp(str, options.zoneId).getOrElse {
+ // There may be date type entries in timestamp column due to
schema inference
+ if (options.inferDate) {
+ daysToMicros(dateFormatter.parse(datum), options.zoneId)
Review Comment:
> Do you know what is the advantage of allowing Legacy Formatter?
One benefit of the legacy formatter is that it recognizes some pre-Gregorian
leap years (like `1500-02-29`) that exist only in the hybrid Julian calendar.
Note how schema inference chooses `string` until you set the legacy parser.
```
scala> val csvInput = Seq("1425-03-22T00:00:00", "2022-01-01T00:00:00",
"1500-02-29T00:00:00").toDS()
csvInput: org.apache.spark.sql.Dataset[String] = [value: string]
scala> spark.read.options(Map("inferSchema" -> "true", "timestampFormat" ->
"yyyy-MM-dd'T'HH:mm:ss")).csv(csvInput).printSchema
root
|-- _c0: string (nullable = true)
scala> sql("set spark.sql.legacy.timeParserPolicy=legacy")
res1: org.apache.spark.sql.DataFrame = [key: string, value: string]
scala> spark.read.options(Map("inferSchema" -> "true", "timestampFormat" ->
"yyyy-MM-dd'T'HH:mm:ss")).csv(csvInput).printSchema
root
|-- _c0: timestamp (nullable = true)
scala>
```
That, of course, matters only if the application's input comes from legacy
systems that still use the hybrid Julian calendar, and the input contains
pre-Gregorian dates (e.g., dates used as encodings, which is the only
real-world use case I have come across). I would imagine that audience is
small and probably getting smaller.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]