This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch branch-3.5
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.5 by this push:
     new 5a0bc96d5f5e [SPARK-46732][CONNECT][3.5] Make Subquery/Broadcast 
thread work with Connect's artifact management
5a0bc96d5f5e is described below

commit 5a0bc96d5f5e42fa5e6ea9d024da572343f239a9
Author: xieshuaihu <xieshua...@agora.io>
AuthorDate: Wed Jan 17 15:24:58 2024 +0900

    [SPARK-46732][CONNECT][3.5] Make Subquery/Broadcast thread work with 
Connect's artifact management
    
    ### What changes were proposed in this pull request?
    
    Similar to SPARK-44794, propagate the JobArtifactState to the broadcast/subquery 
thread.
    
    This is an example:
    
    ```scala
    val add1 = udf((i: Long) => i + 1)
    val tableA = spark.range(2).alias("a")
    val tableB = 
broadcast(spark.range(2).select(add1(col("id")).alias("id"))).alias("b")
    tableA.join(tableB).
      where(col("a.id")===col("b.id")).
      select(col("a.id").alias("a_id"), col("b.id").alias("b_id")).
      collect().
      mkString("[", ", ", "]")
    ```
    
    Before this PR, this example would throw a `ClassNotFoundException`. 
Subquery and Broadcast execution use a separate ThreadPool which doesn't have the 
`JobArtifactState`.
    
    ### Why are the changes needed?
    Fix bug. Make Subquery/Broadcast thread work with Connect's artifact 
management.
    
    ### Does this PR introduce _any_ user-facing change?
    No
    
    ### How was this patch tested?
    Add a new test to `ReplE2ESuite`
    
    ### Was this patch authored or co-authored using generative AI tooling?
    No
    
    Closes #44763 from xieshuaihu/SPARK-46732backport.
    
    Authored-by: xieshuaihu <xieshua...@agora.io>
    Signed-off-by: Hyukjin Kwon <gurwls...@apache.org>
---
 .../org/apache/spark/sql/application/ReplE2ESuite.scala  | 16 ++++++++++++++++
 .../org/apache/spark/sql/execution/SQLExecution.scala    |  5 +++--
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git 
a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/application/ReplE2ESuite.scala
 
b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/application/ReplE2ESuite.scala
index 5bb8cbf3543b..9d61b4d56e1e 100644
--- 
a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/application/ReplE2ESuite.scala
+++ 
b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/application/ReplE2ESuite.scala
@@ -362,4 +362,20 @@ class ReplE2ESuite extends RemoteSparkSession with 
BeforeAndAfterEach {
     val output = runCommandsInShell(input)
     assertContains("noException: Boolean = true", output)
   }
+
+  test("broadcast works with REPL generated code") {
+    val input =
+      """
+        |val add1 = udf((i: Long) => i + 1)
+        |val tableA = spark.range(2).alias("a")
+        |val tableB = 
broadcast(spark.range(2).select(add1(col("id")).alias("id"))).alias("b")
+        |tableA.join(tableB).
+        |  where(col("a.id")===col("b.id")).
+        |  select(col("a.id").alias("a_id"), col("b.id").alias("b_id")).
+        |  collect().
+        |  mkString("[", ", ", "]")
+        |""".stripMargin
+    val output = runCommandsInShell(input)
+    assertContains("""String = "[[1,1]]"""", output)
+  }
 }
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/SQLExecution.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/SQLExecution.scala
index daeac699c279..b4cbb6135223 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SQLExecution.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SQLExecution.scala
@@ -20,7 +20,7 @@ package org.apache.spark.sql.execution
 import java.util.concurrent.{ConcurrentHashMap, ExecutorService, Future => 
JFuture}
 import java.util.concurrent.atomic.AtomicLong
 
-import org.apache.spark.{ErrorMessageFormat, SparkContext, SparkThrowable, 
SparkThrowableHelper}
+import org.apache.spark.{ErrorMessageFormat, JobArtifactSet, SparkContext, 
SparkThrowable, SparkThrowableHelper}
 import org.apache.spark.internal.config.{SPARK_DRIVER_PREFIX, 
SPARK_EXECUTOR_PREFIX}
 import org.apache.spark.internal.config.Tests.IS_TESTING
 import org.apache.spark.sql.SparkSession
@@ -215,7 +215,8 @@ object SQLExecution {
     val activeSession = sparkSession
     val sc = sparkSession.sparkContext
     val localProps = Utils.cloneProperties(sc.getLocalProperties)
-    exec.submit(() => {
+    val artifactState = JobArtifactSet.getCurrentJobArtifactState.orNull
+    exec.submit(() => JobArtifactSet.withActiveJobArtifactState(artifactState) 
{
       val originalSession = SparkSession.getActiveSession
       val originalLocalProps = sc.getLocalProperties
       SparkSession.setActiveSession(activeSession)


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to