This is an automated email from the ASF dual-hosted git repository.
maxgekk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 273b02fd6cdc [SPARK-50214][SQL] From json/xml should not change
collations in the given schema
273b02fd6cdc is described below
commit 273b02fd6cdc8b8f176f78dccc74ae9fc3841fb7
Author: Stefan Kandic <[email protected]>
AuthorDate: Tue Nov 5 14:03:41 2024 +0100
[SPARK-50214][SQL] From json/xml should not change collations in the given
schema
### What changes were proposed in this pull request?
This fix ensures that `from_json` and `from_xml` return exactly the schema
provided, even when a session collation is set.
### Why are the changes needed?
When a schema is serialized with the `sql` method, parsing it back can yield a
different schema if a session collation is set. This fix keeps the resulting
schema identical to the given one regardless of collation settings.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
New unit tests.
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #48750 from stefankandic/fixParseSchema.
Lead-authored-by: Stefan Kandic <[email protected]>
Co-authored-by: Hyukjin Kwon <[email protected]>
Signed-off-by: Max Gekk <[email protected]>
---
.../scala/org/apache/spark/sql/functions.scala | 4 +-
.../query-tests/queries/function_from_json.json | 2 +-
.../queries/function_from_json.proto.bin | Bin 221 -> 394 bytes
.../query-tests/queries/function_from_xml.json | 2 +-
.../queries/function_from_xml.proto.bin | Bin 220 -> 393 bytes
.../sql/collation/CollationSQLFunctionsSuite.scala | 60 +++++++++++++++++++++
6 files changed, 64 insertions(+), 4 deletions(-)
diff --git a/sql/api/src/main/scala/org/apache/spark/sql/functions.scala
b/sql/api/src/main/scala/org/apache/spark/sql/functions.scala
index d7b61468b43d..8c49952bc31e 100644
--- a/sql/api/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/api/src/main/scala/org/apache/spark/sql/functions.scala
@@ -6809,7 +6809,7 @@ object functions {
*/
// scalastyle:on line.size.limit
def from_json(e: Column, schema: DataType, options: Map[String, String]):
Column = {
- from_json(e, lit(schema.sql), options.iterator)
+ from_json(e, lit(schema.json), options.iterator)
}
// scalastyle:off line.size.limit
@@ -7645,7 +7645,7 @@ object functions {
*/
// scalastyle:on line.size.limit
def from_xml(e: Column, schema: StructType, options: java.util.Map[String,
String]): Column =
- from_xml(e, lit(schema.sql), options.asScala.iterator)
+ from_xml(e, lit(schema.json), options.asScala.iterator)
// scalastyle:off line.size.limit
/**
diff --git
a/sql/connect/common/src/test/resources/query-tests/queries/function_from_json.json
b/sql/connect/common/src/test/resources/query-tests/queries/function_from_json.json
index 5af297b17f8b..ddfa91abca05 100644
---
a/sql/connect/common/src/test/resources/query-tests/queries/function_from_json.json
+++
b/sql/connect/common/src/test/resources/query-tests/queries/function_from_json.json
@@ -20,7 +20,7 @@
}
}, {
"literal": {
- "string": "STRUCT\u003cid: BIGINT, a: INT, b: DOUBLE\u003e"
+ "string":
"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"a\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"b\",\"type\":\"double\",\"nullable\":true,\"metadata\":{}}]}"
}
}]
}
diff --git
a/sql/connect/common/src/test/resources/query-tests/queries/function_from_json.proto.bin
b/sql/connect/common/src/test/resources/query-tests/queries/function_from_json.proto.bin
index 1752a847d272..ad95d0f2b343 100644
Binary files
a/sql/connect/common/src/test/resources/query-tests/queries/function_from_json.proto.bin
and
b/sql/connect/common/src/test/resources/query-tests/queries/function_from_json.proto.bin
differ
diff --git
a/sql/connect/common/src/test/resources/query-tests/queries/function_from_xml.json
b/sql/connect/common/src/test/resources/query-tests/queries/function_from_xml.json
index 3b229f6bc762..cfcd40a74b3a 100644
---
a/sql/connect/common/src/test/resources/query-tests/queries/function_from_xml.json
+++
b/sql/connect/common/src/test/resources/query-tests/queries/function_from_xml.json
@@ -20,7 +20,7 @@
}
}, {
"literal": {
- "string": "STRUCT\u003cid: BIGINT, a: INT, b: DOUBLE\u003e"
+ "string":
"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"a\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"b\",\"type\":\"double\",\"nullable\":true,\"metadata\":{}}]}"
}
}]
}
diff --git
a/sql/connect/common/src/test/resources/query-tests/queries/function_from_xml.proto.bin
b/sql/connect/common/src/test/resources/query-tests/queries/function_from_xml.proto.bin
index 60c1bd68fe33..1cc3a26c254f 100644
Binary files
a/sql/connect/common/src/test/resources/query-tests/queries/function_from_xml.proto.bin
and
b/sql/connect/common/src/test/resources/query-tests/queries/function_from_xml.proto.bin
differ
diff --git
a/sql/core/src/test/scala/org/apache/spark/sql/collation/CollationSQLFunctionsSuite.scala
b/sql/core/src/test/scala/org/apache/spark/sql/collation/CollationSQLFunctionsSuite.scala
new file mode 100644
index 000000000000..83ec8c8d1baf
--- /dev/null
+++
b/sql/core/src/test/scala/org/apache/spark/sql/collation/CollationSQLFunctionsSuite.scala
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.collation
+
+import org.apache.spark.sql.{Column, Dataset, QueryTest}
+import org.apache.spark.sql.functions.{from_json, from_xml}
+import org.apache.spark.sql.internal.SqlApiConf
+import org.apache.spark.sql.test.SharedSparkSession
+import org.apache.spark.sql.types._
+
+class CollationSQLFunctionsSuite extends QueryTest with SharedSparkSession {
+
+ test("SPARK-50214: from_json and from_xml work correctly with session collation") {
+ import testImplicits._
+
+ def checkSchema(
+ dataset: Dataset[String],
+ transformation: Column,
+ expectedSchema: StructType): Unit = {
+ val transformedSchema =
dataset.select(transformation.as("result")).schema
+ assert(transformedSchema === expectedSchema)
+ }
+
+ withSQLConf(SqlApiConf.DEFAULT_COLLATION -> "UNICODE_CI_AI") {
+ Seq(
+ StringType,
+ StringType("UTF8_BINARY"),
+ StringType("UNICODE"),
+ StringType("UNICODE_CI_AI")).foreach { stringType =>
+ val dataSchema = StructType(Seq(StructField("fieldName", stringType)))
+ val expectedSchema = StructType(Seq(StructField("result", dataSchema)))
+
+ // JSON Test
+ val jsonData = Seq("""{"fieldName": "fieldValue"}""")
+ val jsonDataset = spark.createDataset(jsonData)
+ checkSchema(jsonDataset, from_json($"value", dataSchema),
expectedSchema)
+
+ // XML Test
+ val xmlData = Seq("<root><fieldName>fieldValue</fieldName></root>")
+ val xmlDataset = spark.createDataset(xmlData)
+ checkSchema(xmlDataset, from_xml($"value", dataSchema), expectedSchema)
+ }
+ }
+ }
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]