(incubator-gluten) branch main updated: [GLUTEN-11088][VL] Add GlutenTests for get_json_object and schema merging (#11276)

yuanzhou Thu, 11 Dec 2025 03:09:07 -0800

This is an automated email from the ASF dual-hosted git repository.

yuanzhou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git



The following commit(s) were added to refs/heads/main by this push:
     new 0cdcf10e0e [GLUTEN-11088][VL] Add GlutenTests for get_json_object and schema merging (#11276)
0cdcf10e0e is described below

commit 0cdcf10e0e85a5706e1372a44572229f427ede14
Author: Chang chen <[email protected]>
AuthorDate: Thu Dec 11 19:08:55 2025 +0800

    [GLUTEN-11088][VL] Add GlutenTests for get_json_object and schema merging (#11276)
    
    * Add GlutenTest for schema merging failure in GlutenParquetSchemaSuite
    
    * Add GlutenTest for get_json_object function with GlutenPlan support
    
    * Update TODO comments in VeloxTestSettings to reflect fix
    
    ---------
    
    Co-authored-by: Chang chen <[email protected]>
    Co-authored-by: Chang chen <[email protected]>
---
 .../gluten/utils/velox/VeloxTestSettings.scala     |  4 +--
 .../spark/sql/GlutenJsonFunctionsSuite.scala       | 28 ++++++++++++++++++
 .../parquet/GlutenParquetSchemaSuite.scala         | 34 ++++++++++++++++++++++
 3 files changed, 63 insertions(+), 3 deletions(-)

diff --git 
a/gluten-ut/spark40/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
 
b/gluten-ut/spark40/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
index 75c70fbd6f..a63ca37d3f 100644
--- 
a/gluten-ut/spark40/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
+++ 
b/gluten-ut/spark40/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
@@ -177,7 +177,6 @@ class VeloxTestSettings extends BackendTestSettings {
     .exclude("SPARK-42782: Hive compatibility check for get_json_object")
     // Velox does not support single quotes in get_json_object function.
     .exclude("function get_json_object - support single quotes")
-    // TODO: fix in Spark-4.0
     .exclude("function get_json_object - path is null")
     .exclude("function get_json_object - json is null")
     .exclude("function get_json_object - Codegen Support")
@@ -527,8 +526,7 @@ class VeloxTestSettings extends BackendTestSettings {
     .exclude("schema mismatch failure error message for parquet vectorized 
reader")
     // https://github.com/apache/incubator-gluten/issues/11220
     .excludeByPrefix("SPARK-40819")
-    // TODO: fix in Spark-4.0
-    .excludeByPrefix("SPARK-46056")
+    .excludeByPrefix("SPARK-46056") // TODO: fix in Spark-4.0
     .exclude("CANNOT_MERGE_SCHEMAS: Failed merging schemas")
   enableSuite[GlutenParquetThriftCompatibilitySuite]
     // Rewrite for file locating.
diff --git 
a/gluten-ut/spark40/src/test/scala/org/apache/spark/sql/GlutenJsonFunctionsSuite.scala
 
b/gluten-ut/spark40/src/test/scala/org/apache/spark/sql/GlutenJsonFunctionsSuite.scala
index 5a28031b6c..51602f144b 100644
--- 
a/gluten-ut/spark40/src/test/scala/org/apache/spark/sql/GlutenJsonFunctionsSuite.scala
+++ 
b/gluten-ut/spark40/src/test/scala/org/apache/spark/sql/GlutenJsonFunctionsSuite.scala
@@ -16,6 +16,8 @@
  */
 package org.apache.spark.sql
 
+import org.apache.gluten.execution.GlutenPlan
+
 class GlutenJsonFunctionsSuite extends JsonFunctionsSuite with 
GlutenSQLTestsTrait {
   import testImplicits._
 
@@ -101,4 +103,30 @@ class GlutenJsonFunctionsSuite extends JsonFunctionsSuite 
with GlutenSQLTestsTra
       checkAnswer(Seq(json).toDF().selectExpr(s"get_json_object(value, 
'$path')"), Row(exp))
     }
   }
+
+  testGluten("function get_json_object - Codegen Support") {
+    withTempView("GetJsonObjectTable") {
+      val data = Seq(("1", """{"f1": "value1", "f5": 5.23}""")).toDF("key", 
"jstring")
+      data.createOrReplaceTempView("GetJsonObjectTable")
+      val df = sql("SELECT key, get_json_object(jstring, '$.f1') FROM 
GetJsonObjectTable")
+      val plan = df.queryExecution.executedPlan
+      assert(plan.isInstanceOf[GlutenPlan])
+      checkAnswer(df, Seq(Row("1", "value1")))
+    }
+  }
+  testGluten("function get_json_object - path is null") {
+    val data = Seq(("""{"name": "alice", "age": 5}""", "")).toDF("a", "b")
+    val df = data.selectExpr("get_json_object(a, null)")
+    val plan = df.queryExecution.executedPlan
+    assert(plan.isInstanceOf[GlutenPlan])
+    checkAnswer(df, Row(null))
+  }
+
+  testGluten("function get_json_object - json is null") {
+    val data = Seq(("""{"name": "alice", "age": 5}""", "")).toDF("a", "b")
+    val df = data.selectExpr("get_json_object(null, '$.name')")
+    val plan = df.queryExecution.executedPlan
+    assert(plan.isInstanceOf[GlutenPlan])
+    checkAnswer(df, Row(null))
+  }
 }
diff --git 
a/gluten-ut/spark40/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/GlutenParquetSchemaSuite.scala
 
b/gluten-ut/spark40/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/GlutenParquetSchemaSuite.scala
index 9dde5bce27..b4eedeab39 100644
--- 
a/gluten-ut/spark40/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/GlutenParquetSchemaSuite.scala
+++ 
b/gluten-ut/spark40/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/GlutenParquetSchemaSuite.scala
@@ -16,7 +16,11 @@
  */
 package org.apache.spark.sql.execution.datasources.parquet
 
+import org.apache.spark.SparkException
 import org.apache.spark.sql.GlutenSQLTestsBaseTrait
+import org.apache.spark.sql.Row
+import org.apache.spark.sql.catalyst.expressions.Cast.toSQLType
+import org.apache.spark.sql.types.{IntegerType, LongType, StructField, 
StructType}
 
 class GlutenParquetSchemaInferenceSuite
   extends ParquetSchemaInferenceSuite
@@ -27,4 +31,34 @@ class GlutenParquetSchemaSuite extends ParquetSchemaSuite 
with GlutenSQLTestsBas
   override protected def testFile(fileName: String): String = {
     getWorkspaceFilePath("sql", "core", "src", "test", "resources").toString + 
"/" + fileName
   }
+
+  testGluten("CANNOT_MERGE_SCHEMAS: Failed merging schemas") {
+    import testImplicits._
+
+    withTempPath {
+      dir =>
+        val path = dir.getCanonicalPath
+
+        // Note: Velox backend always generates Parquet files with nullable = true,
+        // regardless of whether nullable is set to false or true in the schema.
+        // Before https://github.com/apache/spark/pull/44644, `StructField.sql` would not
+        // return the `NOT NULL` qualifier. This is why this test succeeds in Spark 3.5.
+        val schema1 = StructType(Seq(StructField("id", LongType, nullable = 
true)))
+        val df1 = spark.createDataFrame(
+          spark.sparkContext.parallelize(Seq(Row(0L), Row(1L), Row(2L))),
+          schema1)
+        df1.write.parquet(s"$path/p=1")
+        val df2 = df1.select($"id".cast(IntegerType).as(Symbol("id")))
+        df2.write.parquet(s"$path/p=2")
+
+        checkError(
+          exception = intercept[SparkException] {
+            spark.read.option("mergeSchema", "true").parquet(path)
+          },
+          condition = "CANNOT_MERGE_SCHEMAS",
+          sqlState = "42KD9",
+          parameters = Map("left" -> toSQLType(df1.schema), "right" -> 
toSQLType(df2.schema))
+        )
+    }
+  }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(incubator-gluten) branch main updated: [GLUTEN-11088][VL] Add GlutenTests for get_json_object and schema merging (#11276)

Reply via email to