This is an automated email from the ASF dual-hosted git repository.

zsxwing pushed a commit to branch branch-2.4
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-2.4 by this push:
     new bbf61eb  [SPARK-26586][SS] Fix race condition that causes streams to run with unexpected confs
bbf61eb is described below

commit bbf61eb4222d7b46e71dc91eeedf82d27226fc2c
Author: Mukul Murthy <mukul.mur...@gmail.com>
AuthorDate: Fri Jan 11 11:46:14 2019 -0800

    [SPARK-26586][SS] Fix race condition that causes streams to run with unexpected confs
    
    ## What changes were proposed in this pull request?
    
    Fix race condition where streams can have unexpected conf values.
    
    New streaming queries should run with isolated SparkSessions so that they aren't affected by conf updates made after they are started. In StreamExecution, the parent SparkSession is cloned and used to run each batch, but this cloning happens in a separate thread and may happen after DataStreamWriter.start() returns. If a stream is started and a conf key is set immediately afterwards, the stream is likely to run with the new value rather than the value in effect when it was started.
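    
    For illustration only (not part of this commit), a minimal Scala sketch of the race, assuming a session `spark` and a streaming DataFrame `df` (both hypothetical):
    
        // Before this fix: the session clone was taken lazily on the stream's
        // run thread, so a conf update racing with stream startup could leak in.
        spark.conf.set("testKey1", 1)
        val query = df.writeStream.format("console").start() // clone may not exist yet
        spark.conf.set("testKey1", 2) // races with cloneSession() on the run thread
        // Batches of `query` could then observe testKey1 = 2 instead of 1.
    
    Because the fix initializes sparkSessionForStream as a field of StreamExecution, the clone is taken while DataStreamWriter.start() is still on the caller's stack, so conf changes made after start() returns cannot affect the stream.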
    
    ## How was this patch tested?
    
    New unit test that fails prior to the production change and passes with it.
    
    Please review http://spark.apache.org/contributing.html before opening a pull request.
    
    Closes #23513 from mukulmurthy/26586.
    
    Authored-by: Mukul Murthy <mukul.mur...@gmail.com>
    Signed-off-by: Shixiong Zhu <zsxw...@gmail.com>
    (cherry picked from commit ae382c94dd10ff494dde4de44e66182bf6dbe8f8)
    Signed-off-by: Shixiong Zhu <zsxw...@gmail.com>
---
 .../sql/execution/streaming/StreamExecution.scala  |  5 +++--
 .../test/DataStreamReaderWriterSuite.scala         | 24 ++++++++++++++++++++++
 2 files changed, 27 insertions(+), 2 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala
index de33844..c1aa98a 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala
@@ -176,6 +176,9 @@ abstract class StreamExecution(
   lazy val streamMetrics = new MetricsReporter(
     this, s"spark.streaming.${Option(name).getOrElse(id)}")
 
+  /** Isolated spark session to run the batches with. */
+  private val sparkSessionForStream = sparkSession.cloneSession()
+
   /**
   * The thread that runs the micro-batches of this stream. Note that this thread must be
   * [[org.apache.spark.util.UninterruptibleThread]] to workaround KAFKA-1894: interrupting a
@@ -265,8 +268,6 @@ abstract class StreamExecution(
      // force initialization of the logical plan so that the sources can be created
       logicalPlan
 
-      // Isolated spark session to run the batches with.
-      val sparkSessionForStream = sparkSession.cloneSession()
       // Adaptive execution can change num shuffle partitions, disallow
      sparkSessionForStream.conf.set(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key, "false")
      // Disable cost-based join optimization as we do not want stateful operations to be rearranged
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataStreamReaderWriterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataStreamReaderWriterSuite.scala
index 8212fb9..569114a 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataStreamReaderWriterSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataStreamReaderWriterSuite.scala
@@ -18,6 +18,7 @@
 package org.apache.spark.sql.streaming.test
 
 import java.io.File
+import java.util.ConcurrentModificationException
 import java.util.Locale
 import java.util.concurrent.TimeUnit
 
@@ -651,4 +652,27 @@ class DataStreamReaderWriterSuite extends StreamTest with BeforeAndAfter {
 
     LastOptions.clear()
   }
+
+  test("SPARK-26586: Streams should have isolated confs") {
+    import testImplicits._
+    val input = MemoryStream[Int]
+    input.addData(1 to 10)
+    spark.conf.set("testKey1", 0)
+    val queries = (1 to 10).map { i =>
+      spark.conf.set("testKey1", i)
+      input.toDF().writeStream
+        .foreachBatch { (df: Dataset[Row], id: Long) =>
+          val v = df.sparkSession.conf.get("testKey1").toInt
+          if (i != v) {
+            throw new ConcurrentModificationException(s"Stream $i has the wrong conf value $v")
+          }
+        }
+        .start()
+    }
+    try {
+      queries.foreach(_.processAllAvailable())
+    } finally {
+      queries.foreach(_.stop())
+    }
+  }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org
