[spark] branch branch-3.4 updated: [SPARK-42692][CONNECT] Implement `Dataset.toJSON`

hvanhovell Tue, 07 Mar 2023 11:47:03 -0800

This is an automated email from the ASF dual-hosted git repository.

hvanhovell pushed a commit to branch branch-3.4
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/branch-3.4 by this push:
     new c47fc2d8265 [SPARK-42692][CONNECT] Implement `Dataset.toJSON`
c47fc2d8265 is described below

commit c47fc2d82656e569d3ef84a090ea3d2cb25f70eb
Author: yangjie01 <yangji...@baidu.com>
AuthorDate: Tue Mar 7 15:46:35 2023 -0400

    [SPARK-42692][CONNECT] Implement `Dataset.toJSON`
    
    ### What changes were proposed in this pull request?
    This PR implements `Dataset.toJSON` in the Spark Connect JVM client.
    
    ### Why are the changes needed?
    To improve the API coverage of the Spark Connect JVM client.
    
    ### Does this PR introduce _any_ user-facing change?
    No
    
    ### How was this patch tested?
    - Add new test
    - Manually checked Scala 2.13
    
    Closes #40319 from LuciferYang/SPARK-42692.
    
    Authored-by: yangjie01 <yangji...@baidu.com>
    Signed-off-by: Herman van Hovell <her...@databricks.com>
    (cherry picked from commit 51504e4527195f63f33294752f092111d01e5a46)
    Signed-off-by: Herman van Hovell <her...@databricks.com>
---
 .../main/scala/org/apache/spark/sql/Dataset.scala  |   3 ++-
 .../org/apache/spark/sql/ClientE2ETestSuite.scala  |  13 +++++++++
 .../apache/spark/sql/PlanGenerationTestSuite.scala |   4 +++
 .../query-tests/explain-results/toJSON.explain     |   2 ++
 .../test/resources/query-tests/queries/toJSON.json |  29 +++++++++++++++++++++
 .../resources/query-tests/queries/toJSON.proto.bin | Bin 0 -> 185 bytes
 6 files changed, 50 insertions(+), 1 deletion(-)

diff --git 
a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/Dataset.scala
 
b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/Dataset.scala
index 13dff3a874f..ff37614e87d 100644
--- 
a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/Dataset.scala
+++ 
b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/Dataset.scala
@@ -29,6 +29,7 @@ import 
org.apache.spark.sql.catalyst.encoders.AgnosticEncoders.{PrimitiveLongEnc
 import org.apache.spark.sql.catalyst.expressions.RowOrdering
 import org.apache.spark.sql.connect.client.SparkResult
 import org.apache.spark.sql.connect.common.DataTypeProtoConverter
+import org.apache.spark.sql.functions.{struct, to_json}
 import org.apache.spark.sql.types.{Metadata, StructType}
 import org.apache.spark.storage.StorageLevel
 import org.apache.spark.util.Utils
@@ -2777,7 +2778,7 @@ class Dataset[T] private[sql] (
   }
 
   def toJSON: Dataset[String] = {
-    throw new UnsupportedOperationException("toJSON is not implemented.")
+    select(to_json(struct(col("*")))).as(StringEncoder)
   }
 
   private[sql] def analyze: proto.AnalyzePlanResponse = {
diff --git 
a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/ClientE2ETestSuite.scala
 
b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/ClientE2ETestSuite.scala
index abc182c1b8d..780280144b5 100644
--- 
a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/ClientE2ETestSuite.scala
+++ 
b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/ClientE2ETestSuite.scala
@@ -631,6 +631,19 @@ class ClientE2ETestSuite extends RemoteSparkSession with 
SQLHelper {
     val otherPlan = spark.sql("select 1")
     assert(plan.sameSemantics(otherPlan))
   }
+
+  test("toJSON") {
+    val expected = Array(
+      """{"b":0.0,"id":0,"d":"world","a":0}""",
+      """{"b":0.1,"id":1,"d":"world","a":1}""",
+      """{"b":0.2,"id":2,"d":"world","a":0}""")
+    val result = spark
+      .range(3)
+      .select(generateMyTypeColumns: _*)
+      .toJSON
+      .collect()
+    assert(result sameElements expected)
+  }
 }
 
 private[sql] case class MyType(id: Long, a: Double, b: Double)
diff --git 
a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala
 
b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala
index 85523a22d2b..027b7a30246 100755
--- 
a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala
+++ 
b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala
@@ -257,6 +257,10 @@ class PlanGenerationTestSuite
     session.read.json(testDataPath.resolve("people.json").toString)
   }
 
+  test("toJSON") {
+    complex.toJSON
+  }
+
   test("read csv") {
     session.read.csv(testDataPath.resolve("people.csv").toString)
   }
diff --git 
a/connector/connect/common/src/test/resources/query-tests/explain-results/toJSON.explain
 
b/connector/connect/common/src/test/resources/query-tests/explain-results/toJSON.explain
new file mode 100644
index 00000000000..1698c562732
--- /dev/null
+++ 
b/connector/connect/common/src/test/resources/query-tests/explain-results/toJSON.explain
@@ -0,0 +1,2 @@
+Project [to_json(struct(id, id#0L, a, a#0, b, b#0, d, d#0, e, e#0, f, f#0, g, 
g#0), Some(America/Los_Angeles)) AS to_json(struct(id, a, b, d, e, f, g))#0]
++- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0]
diff --git 
a/connector/connect/common/src/test/resources/query-tests/queries/toJSON.json 
b/connector/connect/common/src/test/resources/query-tests/queries/toJSON.json
new file mode 100644
index 00000000000..278767e620a
--- /dev/null
+++ 
b/connector/connect/common/src/test/resources/query-tests/queries/toJSON.json
@@ -0,0 +1,29 @@
+{
+  "common": {
+    "planId": "1"
+  },
+  "project": {
+    "input": {
+      "common": {
+        "planId": "0"
+      },
+      "localRelation": {
+        "schema": 
"struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e"
+      }
+    },
+    "expressions": [{
+      "unresolvedFunction": {
+        "functionName": "to_json",
+        "arguments": [{
+          "unresolvedFunction": {
+            "functionName": "struct",
+            "arguments": [{
+              "unresolvedStar": {
+              }
+            }]
+          }
+        }]
+      }
+    }]
+  }
+}
\ No newline at end of file
diff --git 
a/connector/connect/common/src/test/resources/query-tests/queries/toJSON.proto.bin
 
b/connector/connect/common/src/test/resources/query-tests/queries/toJSON.proto.bin
new file mode 100644
index 00000000000..e08d0fd2180
Binary files /dev/null and 
b/connector/connect/common/src/test/resources/query-tests/queries/toJSON.proto.bin
 differ


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

[spark] branch branch-3.4 updated: [SPARK-42692][CONNECT] Implement `Dataset.toJSON`

Reply via email to