(spark) branch master updated: [SPARK-47867][FOLLOWUP] Fix variant parsing in JacksonParser

wenchen Wed, 17 Apr 2024 18:18:35 -0700

This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/master by this push:
     new 21d8bbdc59d6 [SPARK-47867][FOLLOWUP] Fix variant parsing in JacksonParser
21d8bbdc59d6 is described below

commit 21d8bbdc59d6525d0573c7e624c3b2640ac15795
Author: Chenhao Li <chenhao...@databricks.com>
AuthorDate: Thu Apr 18 09:17:17 2024 +0800

    [SPARK-47867][FOLLOWUP] Fix variant parsing in JacksonParser
    
    ### What changes were proposed in this pull request?
    
    This PR fixes an issue introduced in 
https://github.com/apache/spark/pull/46071. When parsing a JSON object as a map 
or struct, the `JacksonParser` only peeks at the `FIELD_NAME` token without 
consuming it. `VariantBuilder.parseJson` will then fail because the current 
token is `FIELD_NAME` rather than the starting token of the value. Previous 
tests with struct schemas didn't fail because the parsing error was caught and 
the parser would then consume the field name, and the field value wo [...]
    
    ### Why are the changes needed?
    
    This is a bug fix that allows Spark to correctly read a map schema with 
variant values (for example, `map<string, variant>`).
    
    ### Does this PR introduce _any_ user-facing change?
    
    No.
    
    ### How was this patch tested?
    
    A new unit test. It would fail without the changes.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    No.
    
    Closes #46107 from chenhao-db/fix_json_scan_variant.
    
    Authored-by: Chenhao Li <chenhao...@databricks.com>
    Signed-off-by: Wenchen Fan <wenc...@databricks.com>
---
 .../apache/spark/sql/catalyst/json/JacksonParser.scala |  6 ++++++
 .../test/scala/org/apache/spark/sql/VariantSuite.scala | 18 ++++++++++++++++++
 2 files changed, 24 insertions(+)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala
index f8318aa7ce0a..eadd0a4f8ab9 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala
@@ -116,6 +116,12 @@ class JacksonParser(
   }
 
   protected final def parseVariant(parser: JsonParser): VariantVal = {
+    // Skips `FIELD_NAME` at the beginning. This check is adapted from `parseJsonToken`, but we
+    // cannot directly use the function here because it also handles the `VALUE_NULL` token and
+    // returns null (representing a SQL NULL). Instead, we want to return a variant null.
+    if (parser.getCurrentToken == FIELD_NAME) {
+      parser.nextToken()
+    }
     try {
       val v = VariantBuilder.parseJson(parser)
       new VariantVal(v.getValue, v.getMetadata)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/VariantSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/VariantSuite.scala
index d2d12920b68a..0dd9d35f9b4a 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/VariantSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/VariantSuite.scala
@@ -338,6 +338,24 @@ class VariantSuite extends QueryTest with 
SharedSparkSession {
     }
   }
 
+  test("json scan with map schema") {
+    withTempDir { dir =>
+      val file = new File(dir, "file.json")
+      val content = Seq(
+        "true",
+        """{"v": null}""",
+        """{"v": {"a": 1, "b": null}}"""
+      ).mkString("\n").getBytes(StandardCharsets.UTF_8)
+      Files.write(file.toPath, content)
+      checkAnswer(
+        spark.read.format("json").schema("v map<string, variant>")
+          .load(file.getAbsolutePath)
+          .selectExpr("to_json(v)"),
+        Seq(Row(null), Row(null), Row("""{"a":1,"b":null}"""))
+      )
+    }
+  }
+
   test("group/order/join variant are disabled") {
     var ex = intercept[AnalysisException] {
       spark.sql("select parse_json('') group by 1")


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

(spark) branch master updated: [SPARK-47867][FOLLOWUP] Fix variant parsing in JacksonParser

Reply via email to