This is an automated email from the ASF dual-hosted git repository.
maxgekk pushed a commit to branch branch-3.4
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.4 by this push:
new f985d716e164 [SPARK-45433][SQL][3.4] Fix CSV/JSON schema inference
when timestamps do not match specified timestampFormat
f985d716e164 is described below
commit f985d716e164885575ec7f36a7782694411da024
Author: Jia Fan <[email protected]>
AuthorDate: Thu Oct 12 17:09:48 2023 +0500
[SPARK-45433][SQL][3.4] Fix CSV/JSON schema inference when timestamps do
not match specified timestampFormat
### What changes were proposed in this pull request?
This is a backport PR of #43243. It fixes the bug in schema inference when
timestamps do not match the specified timestampFormat. Please check #43243 for
details.
### Why are the changes needed?
Fix the schema inference bug on branch-3.4.
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
Added new tests.
### Was this patch authored or co-authored using generative AI tooling?
Closes #43343 from Hisoka-X/backport-SPARK-45433-inference-schema.
Authored-by: Jia Fan <[email protected]>
Signed-off-by: Max Gekk <[email protected]>
---
.../org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala | 8 ++++++--
.../org/apache/spark/sql/catalyst/json/JsonInferSchema.scala | 7 +++++--
.../apache/spark/sql/catalyst/csv/CSVInferSchemaSuite.scala | 10 ++++++++++
.../apache/spark/sql/catalyst/json/JsonInferSchemaSuite.scala | 8 ++++++++
4 files changed, 29 insertions(+), 4 deletions(-)
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala
index 51586a0065e9..dd8ac3985f19 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala
@@ -28,6 +28,7 @@ import org.apache.spark.sql.catalyst.util.{DateFormatter,
TimestampFormatter}
import org.apache.spark.sql.catalyst.util.LegacyDateFormats.FAST_DATE_FORMAT
import org.apache.spark.sql.errors.QueryExecutionErrors
import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.internal.SQLConf.LegacyBehaviorPolicy
import org.apache.spark.sql.types._
class CSVInferSchema(val options: CSVOptions) extends Serializable {
@@ -202,8 +203,11 @@ class CSVInferSchema(val options: CSVOptions) extends
Serializable {
// We can only parse the value as TimestampNTZType if it does not have
zone-offset or
// time-zone component and can be parsed with the timestamp formatter.
// Otherwise, it is likely to be a timestamp with timezone.
- if (timestampNTZFormatter.parseWithoutTimeZoneOptional(field,
false).isDefined) {
- SQLConf.get.timestampType
+ val timestampType = SQLConf.get.timestampType
+ if ((SQLConf.get.legacyTimeParserPolicy == LegacyBehaviorPolicy.LEGACY ||
+ timestampType == TimestampNTZType) &&
+ timestampNTZFormatter.parseWithoutTimeZoneOptional(field,
false).isDefined) {
+ timestampType
} else {
tryParseTimestamp(field)
}
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JsonInferSchema.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JsonInferSchema.scala
index 5385afe8c935..7e4767750fd3 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JsonInferSchema.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JsonInferSchema.scala
@@ -33,6 +33,7 @@ import org.apache.spark.sql.catalyst.util._
import org.apache.spark.sql.catalyst.util.LegacyDateFormats.FAST_DATE_FORMAT
import org.apache.spark.sql.errors.QueryExecutionErrors
import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.internal.SQLConf.LegacyBehaviorPolicy
import org.apache.spark.sql.types._
import org.apache.spark.util.Utils
@@ -148,11 +149,13 @@ private[sql] class JsonInferSchema(options: JSONOptions)
extends Serializable {
val bigDecimal = decimalParser(field)
DecimalType(bigDecimal.precision, bigDecimal.scale)
}
+ val timestampType = SQLConf.get.timestampType
if (options.prefersDecimal && decimalTry.isDefined) {
decimalTry.get
- } else if (options.inferTimestamp &&
+ } else if (options.inferTimestamp &&
(SQLConf.get.legacyTimeParserPolicy ==
+ LegacyBehaviorPolicy.LEGACY || timestampType == TimestampNTZType) &&
timestampNTZFormatter.parseWithoutTimeZoneOptional(field,
false).isDefined) {
- SQLConf.get.timestampType
+ timestampType
} else if (options.inferTimestamp &&
timestampFormatter.parseOptional(field).isDefined) {
TimestampType
diff --git
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchemaSuite.scala
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchemaSuite.scala
index acedf7998c2d..fb91200557a6 100644
---
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchemaSuite.scala
+++
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchemaSuite.scala
@@ -263,4 +263,14 @@ class CSVInferSchemaSuite extends SparkFunSuite with
SQLHelper {
inferSchema = new CSVInferSchema(options)
assert(inferSchema.inferField(DateType, "2012_12_12") == DateType)
}
+
+ test("SPARK-45433: inferring the schema when timestamps do not match
specified timestampFormat" +
+ " with only one row") {
+ val options = new CSVOptions(
+ Map("timestampFormat" -> "yyyy-MM-dd'T'HH:mm:ss"),
+ columnPruning = false,
+ defaultTimeZoneId = "UTC")
+ val inferSchema = new CSVInferSchema(options)
+ assert(inferSchema.inferField(NullType, "2884-06-24T02:45:51.138") ==
StringType)
+ }
}
diff --git
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/json/JsonInferSchemaSuite.scala
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/json/JsonInferSchemaSuite.scala
index 8290b38e3393..81a4858dce82 100644
---
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/json/JsonInferSchemaSuite.scala
+++
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/json/JsonInferSchemaSuite.scala
@@ -112,4 +112,12 @@ class JsonInferSchemaSuite extends SparkFunSuite with
SQLHelper {
checkType(Map("inferTimestamp" -> "true"), json, TimestampType)
checkType(Map("inferTimestamp" -> "false"), json, StringType)
}
+
+ test("SPARK-45433: inferring the schema when timestamps do not match
specified timestampFormat" +
+ " with only one row") {
+ checkType(
+ Map("timestampFormat" -> "yyyy-MM-dd'T'HH:mm:ss", "inferTimestamp" ->
"true"),
+ """{"a": "2884-06-24T02:45:51.138"}""",
+ StringType)
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]