This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 21d8bbdc59d6 [SPARK-47867][FOLLOWUP] Fix variant parsing in JacksonParser 21d8bbdc59d6 is described below commit 21d8bbdc59d6525d0573c7e624c3b2640ac15795 Author: Chenhao Li <chenhao...@databricks.com> AuthorDate: Thu Apr 18 09:17:17 2024 +0800 [SPARK-47867][FOLLOWUP] Fix variant parsing in JacksonParser ### What changes were proposed in this pull request? This PR fixes an issue introduced in https://github.com/apache/spark/pull/46071. When parsing a JSON object as a map or struct, the `JacksonParser` only peeks at the `FIELD_NAME` token without consuming it. `VariantBuilder.parseJson` will then fail because the current token is `FIELD_NAME` rather than the starting token of the value. Previous tests with struct schemas didn't fail because the parsing error was caught and the parser would then consume the field name, and the field value wo [...] ### Why are the changes needed? It is a bug fix and allows Spark to correctly read data using a map schema with variant values (for example, `map<string, variant>`). ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? A new unit test. It would fail without the changes. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #46107 from chenhao-db/fix_json_scan_variant. 
Authored-by: Chenhao Li <chenhao...@databricks.com> Signed-off-by: Wenchen Fan <wenc...@databricks.com> --- .../apache/spark/sql/catalyst/json/JacksonParser.scala | 6 ++++++ .../test/scala/org/apache/spark/sql/VariantSuite.scala | 18 ++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala index f8318aa7ce0a..eadd0a4f8ab9 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala @@ -116,6 +116,12 @@ class JacksonParser( } protected final def parseVariant(parser: JsonParser): VariantVal = { + // Skips `FIELD_NAME` at the beginning. This check is adapted from `parseJsonToken`, but we + // cannot directly use the function here because it also handles the `VALUE_NULL` token and + // returns null (representing a SQL NULL). Instead, we want to return a variant null. 
+ if (parser.getCurrentToken == FIELD_NAME) { + parser.nextToken() + } try { val v = VariantBuilder.parseJson(parser) new VariantVal(v.getValue, v.getMetadata) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/VariantSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/VariantSuite.scala index d2d12920b68a..0dd9d35f9b4a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/VariantSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/VariantSuite.scala @@ -338,6 +338,24 @@ class VariantSuite extends QueryTest with SharedSparkSession { } } + test("json scan with map schema") { + withTempDir { dir => + val file = new File(dir, "file.json") + val content = Seq( + "true", + """{"v": null}""", + """{"v": {"a": 1, "b": null}}""" + ).mkString("\n").getBytes(StandardCharsets.UTF_8) + Files.write(file.toPath, content) + checkAnswer( + spark.read.format("json").schema("v map<string, variant>") + .load(file.getAbsolutePath) + .selectExpr("to_json(v)"), + Seq(Row(null), Row(null), Row("""{"a":1,"b":null}""")) + ) + } + } + test("group/order/join variant are disabled") { var ex = intercept[AnalysisException] { spark.sql("select parse_json('') group by 1") --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org