(spark) branch master updated: [SPARK-50214][SQL] From json/xml should not change collations in the given schema

maxgekk Tue, 05 Nov 2024 05:04:02 -0800

This is an automated email from the ASF dual-hosted git repository.

maxgekk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/master by this push:
     new 273b02fd6cdc [SPARK-50214][SQL] From json/xml should not change 
collations in the given schema
273b02fd6cdc is described below

commit 273b02fd6cdc8b8f176f78dccc74ae9fc3841fb7
Author: Stefan Kandic <[email protected]>
AuthorDate: Tue Nov 5 14:03:41 2024 +0100

    [SPARK-50214][SQL] From json/xml should not change collations in the given 
schema
    
    ### What changes were proposed in this pull request?
    This fix ensures that `from_json` and `from_xml` return exactly the schema they were given, even when a session-level default collation is set.
    
    ### Why are the changes needed?
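    The `DataType`/`StructType` overloads of `from_json` and `from_xml` used to serialize the given schema with the `sql` method. Parsing that string back can yield a different schema when a session collation is set, because the collations of string fields may not survive the round trip. Serializing the schema with the `json` method instead keeps the schema structure consistent regardless of collation settings.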
    
    ### Does this PR introduce _any_ user-facing change?
    No.
    
    ### How was this patch tested?
    New unit tests.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    No.
    
    Closes #48750 from stefankandic/fixParseSchema.
    
    Lead-authored-by: Stefan Kandic <[email protected]>
    Co-authored-by: Hyukjin Kwon <[email protected]>
    Signed-off-by: Max Gekk <[email protected]>
---
 .../scala/org/apache/spark/sql/functions.scala     |   4 +-
 .../query-tests/queries/function_from_json.json    |   2 +-
 .../queries/function_from_json.proto.bin           | Bin 221 -> 394 bytes
 .../query-tests/queries/function_from_xml.json     |   2 +-
 .../queries/function_from_xml.proto.bin            | Bin 220 -> 393 bytes
 .../sql/collation/CollationSQLFunctionsSuite.scala |  60 +++++++++++++++++++++
 6 files changed, 64 insertions(+), 4 deletions(-)

diff --git a/sql/api/src/main/scala/org/apache/spark/sql/functions.scala 
b/sql/api/src/main/scala/org/apache/spark/sql/functions.scala
index d7b61468b43d..8c49952bc31e 100644
--- a/sql/api/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/api/src/main/scala/org/apache/spark/sql/functions.scala
@@ -6809,7 +6809,7 @@ object functions {
    */
   // scalastyle:on line.size.limit
   def from_json(e: Column, schema: DataType, options: Map[String, String]): 
Column = {
-    from_json(e, lit(schema.sql), options.iterator)
+    from_json(e, lit(schema.json), options.iterator)
   }
 
   // scalastyle:off line.size.limit
@@ -7645,7 +7645,7 @@ object functions {
    */
   // scalastyle:on line.size.limit
   def from_xml(e: Column, schema: StructType, options: java.util.Map[String, 
String]): Column =
-    from_xml(e, lit(schema.sql), options.asScala.iterator)
+    from_xml(e, lit(schema.json), options.asScala.iterator)
 
   // scalastyle:off line.size.limit
   /**
diff --git 
a/sql/connect/common/src/test/resources/query-tests/queries/function_from_json.json
 
b/sql/connect/common/src/test/resources/query-tests/queries/function_from_json.json
index 5af297b17f8b..ddfa91abca05 100644
--- 
a/sql/connect/common/src/test/resources/query-tests/queries/function_from_json.json
+++ 
b/sql/connect/common/src/test/resources/query-tests/queries/function_from_json.json
@@ -20,7 +20,7 @@
           }
         }, {
           "literal": {
-            "string": "STRUCT\u003cid: BIGINT, a: INT, b: DOUBLE\u003e"
+            "string": 
"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"a\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"b\",\"type\":\"double\",\"nullable\":true,\"metadata\":{}}]}"
           }
         }]
       }
diff --git 
a/sql/connect/common/src/test/resources/query-tests/queries/function_from_json.proto.bin
 
b/sql/connect/common/src/test/resources/query-tests/queries/function_from_json.proto.bin
index 1752a847d272..ad95d0f2b343 100644
Binary files 
a/sql/connect/common/src/test/resources/query-tests/queries/function_from_json.proto.bin
 and 
b/sql/connect/common/src/test/resources/query-tests/queries/function_from_json.proto.bin
 differ
diff --git 
a/sql/connect/common/src/test/resources/query-tests/queries/function_from_xml.json
 
b/sql/connect/common/src/test/resources/query-tests/queries/function_from_xml.json
index 3b229f6bc762..cfcd40a74b3a 100644
--- 
a/sql/connect/common/src/test/resources/query-tests/queries/function_from_xml.json
+++ 
b/sql/connect/common/src/test/resources/query-tests/queries/function_from_xml.json
@@ -20,7 +20,7 @@
           }
         }, {
           "literal": {
-            "string": "STRUCT\u003cid: BIGINT, a: INT, b: DOUBLE\u003e"
+            "string": 
"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"a\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"b\",\"type\":\"double\",\"nullable\":true,\"metadata\":{}}]}"
           }
         }]
       }
diff --git 
a/sql/connect/common/src/test/resources/query-tests/queries/function_from_xml.proto.bin
 
b/sql/connect/common/src/test/resources/query-tests/queries/function_from_xml.proto.bin
index 60c1bd68fe33..1cc3a26c254f 100644
Binary files 
a/sql/connect/common/src/test/resources/query-tests/queries/function_from_xml.proto.bin
 and 
b/sql/connect/common/src/test/resources/query-tests/queries/function_from_xml.proto.bin
 differ
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/collation/CollationSQLFunctionsSuite.scala
 
b/sql/core/src/test/scala/org/apache/spark/sql/collation/CollationSQLFunctionsSuite.scala
new file mode 100644
index 000000000000..83ec8c8d1baf
--- /dev/null
+++ 
b/sql/core/src/test/scala/org/apache/spark/sql/collation/CollationSQLFunctionsSuite.scala
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.collation
+
+import org.apache.spark.sql.{Column, Dataset, QueryTest}
+import org.apache.spark.sql.functions.{from_json, from_xml}
+import org.apache.spark.sql.internal.SqlApiConf
+import org.apache.spark.sql.test.SharedSparkSession
+import org.apache.spark.sql.types._
+
+class CollationSQLFunctionsSuite extends QueryTest with SharedSparkSession {
+
+  test("SPARK-50214: from_json and from_xml work correctly with session 
collation") {
+    import testImplicits._
+
+    def checkSchema(
+        dataset: Dataset[String],
+        transformation: Column,
+        expectedSchema: StructType): Unit = {
+      val transformedSchema = 
dataset.select(transformation.as("result")).schema
+      assert(transformedSchema === expectedSchema)
+    }
+
+    withSQLConf(SqlApiConf.DEFAULT_COLLATION -> "UNICODE_CI_AI") {
+      Seq(
+        StringType,
+        StringType("UTF8_BINARY"),
+        StringType("UNICODE"),
+        StringType("UNICODE_CI_AI")).foreach { stringType =>
+        val dataSchema = StructType(Seq(StructField("fieldName", stringType)))
+        val expectedSchema = StructType(Seq(StructField("result", dataSchema)))
+
+        // JSON Test
+        val jsonData = Seq("""{"fieldName": "fieldValue"}""")
+        val jsonDataset = spark.createDataset(jsonData)
+        checkSchema(jsonDataset, from_json($"value", dataSchema), 
expectedSchema)
+
+        // XML Test
+        val xmlData = Seq("<root><fieldName>fieldValue</fieldName></root>")
+        val xmlDataset = spark.createDataset(xmlData)
+        checkSchema(xmlDataset, from_xml($"value", dataSchema), expectedSchema)
+      }
+    }
+  }
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(spark) branch master updated: [SPARK-50214][SQL] From json/xml should not change collations in the given schema

Reply via email to