spark git commit: Revert "[SPARK-17513][SQL] Make StreamExecution garbage-collect its metadata"

wenchen Tue, 20 Sep 2016 01:14:54 -0700

Repository: spark
Updated Branches:
  refs/heads/master be9d57fc9 -> f039d964d



Revert "[SPARK-17513][SQL] Make StreamExecution garbage-collect its metadata"

This reverts commit be9d57fc9d8b10e4234c01c06ed43fd7dd12c07b.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f039d964
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f039d964
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f039d964

Branch: refs/heads/master
Commit: f039d964d152c0aeb5b71eb5188a9a7fd4b5aef3
Parents: be9d57f
Author: Wenchen Fan <[email protected]>
Authored: Tue Sep 20 16:12:35 2016 +0800
Committer: Wenchen Fan <[email protected]>
Committed: Tue Sep 20 16:12:35 2016 +0800

----------------------------------------------------------------------
 .../sql/execution/streaming/MetadataLog.scala   |  1 -
 .../execution/streaming/StreamExecution.scala   |  7 ------
 .../sql/streaming/StreamingQuerySuite.scala     | 24 --------------------
 3 files changed, 32 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/f039d964/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MetadataLog.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MetadataLog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MetadataLog.scala
index 9e2604c..78d6be1 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MetadataLog.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MetadataLog.scala
@@ -24,7 +24,6 @@ package org.apache.spark.sql.execution.streaming
  *  - Allow the user to query the latest batch id.
  *  - Allow the user to query the metadata object of a specified batch id.
  *  - Allow the user to query metadata objects in a range of batch ids.
- *  - Allow the user to remove obsolete metadata
  */
 trait MetadataLog[T] {
 

http://git-wip-us.apache.org/repos/asf/spark/blob/f039d964/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala
index 220f77d..a1aae61 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala
@@ -290,13 +290,6 @@ class StreamExecution(
       assert(offsetLog.add(currentBatchId, availableOffsets.toCompositeOffset(sources)),
         s"Concurrent update to the log. Multiple streaming jobs detected for $currentBatchId")
       logInfo(s"Committed offsets for batch $currentBatchId.")
-
-      // Now that we have logged the new batch, no further processing will happen for
-      // the previous batch, and it is safe to discard the old metadata.
-      // Note that purge is exclusive, i.e. it purges everything before currentBatchId.
-      // NOTE: If StreamExecution implements pipeline parallelism (multiple batches in
-      // flight at the same time), this cleanup logic will need to change.
-      offsetLog.purge(currentBatchId)
     } else {
       awaitBatchLock.lock()
       try {

http://git-wip-us.apache.org/repos/asf/spark/blob/f039d964/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala
index d3e2cab..9d58315 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala
@@ -125,30 +125,6 @@ class StreamingQuerySuite extends StreamTest with BeforeAndAfter {
     )
   }
 
-  testQuietly("StreamExecution metadata garbage collection") {
-    val inputData = MemoryStream[Int]
-    val mapped = inputData.toDS().map(6 / _)
-
-    // Run 3 batches, and then assert that only 1 metadata file is left at the end
-    // since the first 2 should have been purged.
-    testStream(mapped)(
-      AddData(inputData, 1, 2),
-      CheckAnswer(6, 3),
-      AddData(inputData, 1, 2),
-      CheckAnswer(6, 3, 6, 3),
-      AddData(inputData, 4, 6),
-      CheckAnswer(6, 3, 6, 3, 1, 1),
-
-      AssertOnQuery("metadata log should contain only one file") { q =>
-        val metadataLogDir = new java.io.File(q.offsetLog.metadataPath.toString)
-        val logFileNames = metadataLogDir.listFiles().toSeq.map(_.getName())
-        val toTest = logFileNames // Workaround for SPARK-17475
-        assert(toTest.size == 1 && toTest.head == "2")
-        true
-      }
-    )
-  }
-
   /**
    * A [[StreamAction]] to test the behavior of `StreamingQuery.awaitTermination()`.
    *


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

spark git commit: Revert "[SPARK-17513][SQL] Make StreamExecution garbage-collect its metadata"

Reply via email to