(incubator-gluten) branch main updated: [GLUTEN-5341][VL] Fix SPARK-42782: Hive compatibility check for get_json_object (#5467)

philo Thu, 25 Apr 2024 06:01:41 -0700

This is an automated email from the ASF dual-hosted git repository.

philo pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git



The following commit(s) were added to refs/heads/main by this push:
     new f1b00547d [GLUTEN-5341][VL] Fix SPARK-42782: Hive compatibility check for get_json_object (#5467)
f1b00547d is described below

commit f1b00547d09cf5b484551e6e06640200ed8e08d0
Author: ayushi-agarwal <[email protected]>
AuthorDate: Thu Apr 25 18:31:00 2024 +0530

    [GLUTEN-5341][VL] Fix SPARK-42782: Hive compatibility check for get_json_object (#5467)
---
 docs/velox-backend-limitations.md                  |  2 +
 .../gluten/utils/velox/VeloxTestSettings.scala     |  2 +-
 .../spark/sql/GlutenJsonFunctionsSuite.scala       | 87 +++++++++++++++++++++-
 3 files changed, 89 insertions(+), 2 deletions(-)

diff --git a/docs/velox-backend-limitations.md b/docs/velox-backend-limitations.md
index 7f58fea88..73bbdf07a 100644
--- a/docs/velox-backend-limitations.md
+++ b/docs/velox-backend-limitations.md
@@ -47,6 +47,8 @@ In certain cases, Gluten result may be different from Vanilla spark.
 #### JSON functions
 Velox only supports double quotes surrounded strings, not single quotes, in JSON data. If single quotes are used, gluten will produce incorrect result.
 
+Velox doesn't support [*] in path when get_json_object function is called and returns null instead.
+
 #### Parquet read conf
 Gluten supports `spark.files.ignoreCorruptFiles` with default false, if true, the behavior is same as config false.
 Gluten ignores `spark.sql.parquet.datetimeRebaseModeInRead`, it only returns what write in parquet file. It does not consider the difference between legacy
diff --git a/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala b/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
index 4e88c40bb..e6a50a56a 100644
--- a/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
+++ b/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
@@ -126,7 +126,7 @@ class VeloxTestSettings extends BackendTestSettings {
   enableSuite[GlutenHigherOrderFunctionsSuite]
   enableSuite[GlutenIntervalExpressionsSuite]
   enableSuite[GlutenJsonFunctionsSuite]
-    // Disable for Spark3.5.
+    // * in get_json_object expression not supported in velox
     .exclude("SPARK-42782: Hive compatibility check for get_json_object")
     // Velox does not support single quotes in get_json_object function.
     .exclude("function get_json_object - support single quotes")
diff --git a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/GlutenJsonFunctionsSuite.scala b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/GlutenJsonFunctionsSuite.scala
index cba4e7a37..24963f89d 100644
--- a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/GlutenJsonFunctionsSuite.scala
+++ b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/GlutenJsonFunctionsSuite.scala
@@ -16,4 +16,89 @@
  */
 package org.apache.spark.sql
 
-class GlutenJsonFunctionsSuite extends JsonFunctionsSuite with GlutenSQLTestsTrait {}
+class GlutenJsonFunctionsSuite extends JsonFunctionsSuite with GlutenSQLTestsTrait {
+  import testImplicits._
+
+  testGluten("SPARK-42782: Hive compatibility check for get_json_object ") {
+    val book0 = "{\"author\":\"Nigel Rees\",\"title\":\"Sayings of the Century\"" +
+      ",\"category\":\"reference\",\"price\":8.95}"
+    val backet0 = "[1,2,{\"b\":\"y\",\"a\":\"x\"}]"
+    val backet = "[" + backet0 + ",[3,4],[5,6]]"
+    val backetFlat = backet0.substring(0, backet0.length() - 1) + ",3,4,5,6]"
+
+    val book = "[" + book0 + ",{\"author\":\"Herman Melville\",\"title\":\"Moby Dick\"," +
+      "\"category\":\"fiction\",\"price\":8.99" +
+      ",\"isbn\":\"0-553-21311-3\"},{\"author\":\"J. R. R. Tolkien\"" +
+      ",\"title\":\"The Lord of the Rings\",\"category\":\"fiction\"" +
+      ",\"reader\":[{\"age\":25,\"name\":\"bob\"},{\"age\":26,\"name\":\"jack\"}]" +
+      ",\"price\":22.99,\"isbn\":\"0-395-19395-8\"}]"
+
+    val json = "{\"store\":{\"fruit\":[{\"weight\":8,\"type\":\"apple\"}," +
+      "{\"weight\":9,\"type\":\"pear\"}],\"basket\":" + backet + ",\"book\":" + book +
+      ",\"bicycle\":{\"price\":19.95,\"color\":\"red\"}}" +
+      ",\"email\":\"amy@only_for_json_udf_test.net\"" +
+      ",\"owner\":\"amy\",\"zip code\":\"94025\",\"fb:testid\":\"1234\"}"
+
+    // Basic test
+    runTest(json, "$.owner", "amy")
+    runTest(json, "$.store.bicycle", "{\"price\":19.95,\"color\":\"red\"}")
+    runTest(json, "$.store.book", book)
+    runTest(json, "$.store.book[0]", book0)
+    // runTest(json, "$.store.book[*]", book) - not supported in velox
+    runTest(json, "$.store.book[0].category", "reference")
+    // runTest(json, "$.store.book[*].category",
+    // "[\"reference\",\"fiction\",\"fiction\"]") - not supported in velox
+    // runTest(json, "$.store.book[*].reader[0].age", "25") - not supported in velox
+    // runTest(json, "$.store.book[*].reader[*].age", "[25,26]") - not supported in velox
+    runTest(json, "$.store.basket[0][1]", "2")
+    // runTest(json, "$.store.basket[*]", backet) - not supported in velox
+    // runTest(json, "$.store.basket[*][0]", "[1,3,5]") - not supported in velox
+    // runTest(json, "$.store.basket[0][*]", backet0) - not supported in velox
+    // runTest(json, "$.store.basket[*][*]", backetFlat) - not supported in velox
+    runTest(json, "$.store.basket[0][2].b", "y")
+    // runTest(json, "$.store.basket[0][*].b", "[\"y\"]") - not supported in velox
+    runTest(json, "$.non_exist_key", null)
+    runTest(json, "$.store.book[10]", null)
+    runTest(json, "$.store.book[0].non_exist_key", null)
+    // runTest(json, "$.store.basket[*].non_exist_key", null) - not supported in velox
+    // runTest(json, "$.store.basket[0][*].non_exist_key", null) - not supported in velox
+    // runTest(json, "$.store.basket[*][*].non_exist_key", null) - not supported in velox
+    runTest(json, "$.zip code", "94025")
+    runTest(json, "$.fb:testid", "1234")
+    // runTest("{\"a\":\"b\nc\"}", "$.a", "b\nc") - not supported in velox
+
+    // Test root array
+    runTest("[1,2,3]", "$[0]", "1")
+    runTest("[1,2,3]", "$.[0]", null) // Not supported in spark and velox
+    runTest("[1,2,3]", "$.[1]", null) // Not supported in spark and velox
+    runTest("[1,2,3]", "$[1]", "2")
+
+    runTest("[1,2,3]", "$[3]", null)
+    runTest("[1,2,3]", "$.[*]", null) // Not supported in spark and velox
+    // runTest("[1,2,3]", "$[*]", "[1,2,3]") - not supported in velox
+    // runTest("[1,2,3]", "$", "[1,2,3]") - not supported in velox
+    runTest("[{\"k1\":\"v1\"},{\"k2\":\"v2\"},{\"k3\":\"v3\"}]", "$[2]", "{\"k3\":\"v3\"}")
+    runTest("[{\"k1\":\"v1\"},{\"k2\":\"v2\"},{\"k3\":\"v3\"}]", "$[2].k3", "v3")
+    runTest("[{\"k1\":[{\"k11\":[1,2,3]}]}]", "$[0].k1[0].k11[1]", "2")
+    runTest("[{\"k1\":[{\"k11\":[1,2,3]}]}]", "$[0].k1[0].k11", "[1,2,3]")
+    runTest("[{\"k1\":[{\"k11\":[1,2,3]}]}]", "$[0].k1[0]", "{\"k11\":[1,2,3]}")
+    runTest("[{\"k1\":[{\"k11\":[1,2,3]}]}]", "$[0].k1", "[{\"k11\":[1,2,3]}]")
+    runTest("[{\"k1\":[{\"k11\":[1,2,3]}]}]", "$[0]", "{\"k1\":[{\"k11\":[1,2,3]}]}")
+    runTest("[[1,2,3],[4,5,6],[7,8,9]]", "$[1]", "[4,5,6]")
+    runTest("[[1,2,3],[4,5,6],[7,8,9]]", "$[1][0]", "4")
+    runTest("[\"a\",\"b\"]", "$[1]", "b")
+    runTest("[[\"a\",\"b\"]]", "$[0][1]", "b")
+
+    runTest("[1,2,3]", "[0]", "1")
+    // runTest("[1,2,3]", "$0", null) crashes in velox
+    runTest("[1,2,3]", "0", null)
+    runTest("[1,2,3]", "$.", null)
+
+    // runTest("[1,2,3]", "$", "[1,2,3]") crashes in velox
+    // runTest("{\"a\":4}", "$", "{\"a\":4}") crashes in velox
+
+    def runTest(json: String, path: String, exp: String): Unit = {
+      checkAnswer(Seq(json).toDF().selectExpr(s"get_json_object(value, '$path')"), Row(exp))
+    }
+  }
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(incubator-gluten) branch main updated: [GLUTEN-5341][VL] Fix SPARK-42782: Hive compatibility check for get_json_object (#5467)

Reply via email to