This is an automated email from the ASF dual-hosted git repository.
yuanzhou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git
The following commit(s) were added to refs/heads/main by this push:
new 0cdcf10e0e [GLUTEN-11088][VL] Add GlutenTests for get_json_object and
schema merging (#11276)
0cdcf10e0e is described below
commit 0cdcf10e0e85a5706e1372a44572229f427ede14
Author: Chang chen <[email protected]>
AuthorDate: Thu Dec 11 19:08:55 2025 +0800
[GLUTEN-11088][VL] Add GlutenTests for get_json_object and schema merging
(#11276)
* Add GlutenTest for schema merging failure in GlutenParquetSchemaSuite
* Add GlutenTest for get_json_object function with GlutenPlan support
* Update TODO comments in VeloxTestSettings to reflect fix
---------
Co-authored-by: Chang chen <[email protected]>
Co-authored-by: Chang chen <[email protected]>
---
.../gluten/utils/velox/VeloxTestSettings.scala | 4 +--
.../spark/sql/GlutenJsonFunctionsSuite.scala | 28 ++++++++++++++++++
.../parquet/GlutenParquetSchemaSuite.scala | 34 ++++++++++++++++++++++
3 files changed, 63 insertions(+), 3 deletions(-)
diff --git
a/gluten-ut/spark40/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
b/gluten-ut/spark40/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
index 75c70fbd6f..a63ca37d3f 100644
---
a/gluten-ut/spark40/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
+++
b/gluten-ut/spark40/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
@@ -177,7 +177,6 @@ class VeloxTestSettings extends BackendTestSettings {
.exclude("SPARK-42782: Hive compatibility check for get_json_object")
// Velox does not support single quotes in get_json_object function.
.exclude("function get_json_object - support single quotes")
- // TODO: fix in Spark-4.0
.exclude("function get_json_object - path is null")
.exclude("function get_json_object - json is null")
.exclude("function get_json_object - Codegen Support")
@@ -527,8 +526,7 @@ class VeloxTestSettings extends BackendTestSettings {
.exclude("schema mismatch failure error message for parquet vectorized
reader")
// https://github.com/apache/incubator-gluten/issues/11220
.excludeByPrefix("SPARK-40819")
- // TODO: fix in Spark-4.0
- .excludeByPrefix("SPARK-46056")
+ .excludeByPrefix("SPARK-46056") // TODO: fix in Spark-4.0
.exclude("CANNOT_MERGE_SCHEMAS: Failed merging schemas")
enableSuite[GlutenParquetThriftCompatibilitySuite]
// Rewrite for file locating.
diff --git
a/gluten-ut/spark40/src/test/scala/org/apache/spark/sql/GlutenJsonFunctionsSuite.scala
b/gluten-ut/spark40/src/test/scala/org/apache/spark/sql/GlutenJsonFunctionsSuite.scala
index 5a28031b6c..51602f144b 100644
---
a/gluten-ut/spark40/src/test/scala/org/apache/spark/sql/GlutenJsonFunctionsSuite.scala
+++
b/gluten-ut/spark40/src/test/scala/org/apache/spark/sql/GlutenJsonFunctionsSuite.scala
@@ -16,6 +16,8 @@
*/
package org.apache.spark.sql
+import org.apache.gluten.execution.GlutenPlan
+
class GlutenJsonFunctionsSuite extends JsonFunctionsSuite with
GlutenSQLTestsTrait {
import testImplicits._
@@ -101,4 +103,30 @@ class GlutenJsonFunctionsSuite extends JsonFunctionsSuite
with GlutenSQLTestsTra
checkAnswer(Seq(json).toDF().selectExpr(s"get_json_object(value,
'$path')"), Row(exp))
}
}
+
+ testGluten("function get_json_object - Codegen Support") {
+ withTempView("GetJsonObjectTable") {
+ val data = Seq(("1", """{"f1": "value1", "f5": 5.23}""")).toDF("key",
"jstring")
+ data.createOrReplaceTempView("GetJsonObjectTable")
+ val df = sql("SELECT key, get_json_object(jstring, '$.f1') FROM
GetJsonObjectTable")
+ val plan = df.queryExecution.executedPlan
+ assert(plan.isInstanceOf[GlutenPlan])
+ checkAnswer(df, Seq(Row("1", "value1")))
+ }
+ }
+ testGluten("function get_json_object - path is null") {
+ val data = Seq(("""{"name": "alice", "age": 5}""", "")).toDF("a", "b")
+ val df = data.selectExpr("get_json_object(a, null)")
+ val plan = df.queryExecution.executedPlan
+ assert(plan.isInstanceOf[GlutenPlan])
+ checkAnswer(df, Row(null))
+ }
+
+ testGluten("function get_json_object - json is null") {
+ val data = Seq(("""{"name": "alice", "age": 5}""", "")).toDF("a", "b")
+ val df = data.selectExpr("get_json_object(null, '$.name')")
+ val plan = df.queryExecution.executedPlan
+ assert(plan.isInstanceOf[GlutenPlan])
+ checkAnswer(df, Row(null))
+ }
}
diff --git
a/gluten-ut/spark40/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/GlutenParquetSchemaSuite.scala
b/gluten-ut/spark40/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/GlutenParquetSchemaSuite.scala
index 9dde5bce27..b4eedeab39 100644
---
a/gluten-ut/spark40/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/GlutenParquetSchemaSuite.scala
+++
b/gluten-ut/spark40/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/GlutenParquetSchemaSuite.scala
@@ -16,7 +16,11 @@
*/
package org.apache.spark.sql.execution.datasources.parquet
+import org.apache.spark.SparkException
import org.apache.spark.sql.GlutenSQLTestsBaseTrait
+import org.apache.spark.sql.Row
+import org.apache.spark.sql.catalyst.expressions.Cast.toSQLType
+import org.apache.spark.sql.types.{IntegerType, LongType, StructField,
StructType}
class GlutenParquetSchemaInferenceSuite
extends ParquetSchemaInferenceSuite
@@ -27,4 +31,34 @@ class GlutenParquetSchemaSuite extends ParquetSchemaSuite
with GlutenSQLTestsBas
override protected def testFile(fileName: String): String = {
getWorkspaceFilePath("sql", "core", "src", "test", "resources").toString +
"/" + fileName
}
+
+ testGluten("CANNOT_MERGE_SCHEMAS: Failed merging schemas") {
+ import testImplicits._
+
+ withTempPath {
+ dir =>
+ val path = dir.getCanonicalPath
+
+ // Note: Velox backend always generates Parquet files with nullable = true,
+ // regardless of whether nullable is set to false or true in the schema.
+ // Before https://github.com/apache/spark/pull/44644, `StructField.sql` would not
+ // return the `NOT NULL` qualifier. This is why this test succeeds in Spark 3.5.
+ val schema1 = StructType(Seq(StructField("id", LongType, nullable =
true)))
+ val df1 = spark.createDataFrame(
+ spark.sparkContext.parallelize(Seq(Row(0L), Row(1L), Row(2L))),
+ schema1)
+ df1.write.parquet(s"$path/p=1")
+ val df2 = df1.select($"id".cast(IntegerType).as(Symbol("id")))
+ df2.write.parquet(s"$path/p=2")
+
+ checkError(
+ exception = intercept[SparkException] {
+ spark.read.option("mergeSchema", "true").parquet(path)
+ },
+ condition = "CANNOT_MERGE_SCHEMAS",
+ sqlState = "42KD9",
+ parameters = Map("left" -> toSQLType(df1.schema), "right" ->
toSQLType(df2.schema))
+ )
+ }
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]