[spark] branch master updated: [SPARK-26586][SS] Fix race condition that causes streams to run with unexpected confs

zsxwing Fri, 11 Jan 2019 11:47:00 -0800

This is an automated email from the ASF dual-hosted git repository.

zsxwing pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/master by this push:
     new ae382c9  [SPARK-26586][SS] Fix race condition that causes streams to 
run with unexpected confs
ae382c9 is described below

commit ae382c94dd10ff494dde4de44e66182bf6dbe8f8
Author: Mukul Murthy <mukul.mur...@gmail.com>
AuthorDate: Fri Jan 11 11:46:14 2019 -0800

    [SPARK-26586][SS] Fix race condition that causes streams to run with 
unexpected confs
    
    ## What changes were proposed in this pull request?
    
    Fix race condition where streams can have unexpected conf values.
    
    New streaming queries should run with isolated SparkSessions so that they 
aren't affected by conf updates after they are started. In StreamExecution, the 
parent SparkSession is cloned and used to run each batch, but this cloning 
happens in a separate thread and may happen after DataStreamWriter.start() 
returns. If a stream is started and a conf key is set immediately after, the 
stream is likely to have the new value.
    
    ## How was this patch tested?
    
    New unit test that fails prior to the production change and passes with it.
    
    Please review http://spark.apache.org/contributing.html before opening a 
pull request.
    
    Closes #23513 from mukulmurthy/26586.
    
    Authored-by: Mukul Murthy <mukul.mur...@gmail.com>
    Signed-off-by: Shixiong Zhu <zsxw...@gmail.com>
---
 .../sql/execution/streaming/StreamExecution.scala  |  5 +++--
 .../test/DataStreamReaderWriterSuite.scala         | 24 ++++++++++++++++++++++
 2 files changed, 27 insertions(+), 2 deletions(-)

diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala
 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala
index 83824f4..90f7b47 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala
@@ -181,6 +181,9 @@ abstract class StreamExecution(
   lazy val streamMetrics = new MetricsReporter(
     this, s"spark.streaming.${Option(name).getOrElse(id)}")
 
+  /** Isolated spark session to run the batches with. */
+  private val sparkSessionForStream = sparkSession.cloneSession()
+
   /**
    * The thread that runs the micro-batches of this stream. Note that this 
thread must be
    * [[org.apache.spark.util.UninterruptibleThread]] to workaround KAFKA-1894: 
interrupting a
@@ -270,8 +273,6 @@ abstract class StreamExecution(
       // force initialization of the logical plan so that the sources can be 
created
       logicalPlan
 
-      // Isolated spark session to run the batches with.
-      val sparkSessionForStream = sparkSession.cloneSession()
       // Adaptive execution can change num shuffle partitions, disallow
       sparkSessionForStream.conf.set(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key, 
"false")
       // Disable cost-based join optimization as we do not want stateful 
operations to be rearranged
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataStreamReaderWriterSuite.scala
 
b/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataStreamReaderWriterSuite.scala
index 4d3a54a..74ea0bf 100644
--- 
a/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataStreamReaderWriterSuite.scala
+++ 
b/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataStreamReaderWriterSuite.scala
@@ -18,6 +18,7 @@
 package org.apache.spark.sql.streaming.test
 
 import java.io.File
+import java.util.ConcurrentModificationException
 import java.util.Locale
 import java.util.concurrent.TimeUnit
 
@@ -651,4 +652,27 @@ class DataStreamReaderWriterSuite extends StreamTest with 
BeforeAndAfter {
 
     LastOptions.clear()
   }
+
+  test("SPARK-26586: Streams should have isolated confs") {
+    import testImplicits._
+    val input = MemoryStream[Int]
+    input.addData(1 to 10)
+    spark.conf.set("testKey1", 0)
+    val queries = (1 to 10).map { i =>
+      spark.conf.set("testKey1", i)
+      input.toDF().writeStream
+        .foreachBatch { (df: Dataset[Row], id: Long) =>
+          val v = df.sparkSession.conf.get("testKey1").toInt
+          if (i != v) {
+            throw new ConcurrentModificationException(s"Stream $i has the 
wrong conf value $v")
+          }
+        }
+        .start()
+    }
+    try {
+      queries.foreach(_.processAllAvailable())
+    } finally {
+      queries.foreach(_.stop())
+    }
+  }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

[spark] branch master updated: [SPARK-26586][SS] Fix race condition that causes streams to run with unexpected confs

Reply via email to