Github user brkyvz commented on a diff in the pull request:

    https://github.com/apache/spark/pull/21220#discussion_r185886437
  
    --- Diff: 
sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala
 ---
    @@ -128,40 +130,49 @@ class MicroBatchExecution(
        * Repeatedly attempts to run batches as data arrives.
        */
       protected def runActivatedStream(sparkSessionForStream: SparkSession): 
Unit = {
    -    triggerExecutor.execute(() => {
    -      startTrigger()
     
    +    triggerExecutor.execute(() => {
           if (isActive) {
    +        var currentBatchIsRunnable = false // Whether the current batch is 
runnable / has been run
    +        var currentBatchHadNewData = false // Whether the current batch 
had new data
    +
             reportTimeTaken("triggerExecution") {
    +          startTrigger()
    +
    +          // We'll do this initialization only once every start / restart
               if (currentBatchId < 0) {
    -            // We'll do this initialization only once
                 populateStartOffsets(sparkSessionForStream)
    -            
sparkSession.sparkContext.setJobDescription(getBatchDescriptionString)
    -            logDebug(s"Stream running from $committedOffsets to 
$availableOffsets")
    -          } else {
    -            constructNextBatch()
    +            logInfo(s"Stream started from $committedOffsets")
               }
    -          if (dataAvailable) {
    -            currentStatus = currentStatus.copy(isDataAvailable = true)
    +
    +          
sparkSession.sparkContext.setJobDescription(getBatchDescriptionString)
    +
    +          // Try to construct the next batch. This will return true only 
if the next batch is
    +          // ready and runnable. Note that the current batch may be 
runnable even without
    +          // new data to process as `constructNextBatch` may decide to run 
a batch for
    +          // state cleanup, etc. `isNewDataAvailable` will be updated to 
reflect whether new data
    +          // is available or not.
    +          currentBatchIsRunnable = constructNextBatch()
    +
    +          currentStatus = currentStatus.copy(isDataAvailable = 
isNewDataAvailable)
    --- End diff --
    
    then you can do something like:
    ```scala
    if (currentBatchIsRunnable && currentBatchHadNewData) {
      updateStatusMessage("Processing new data")
      runBatch(sparkSessionForStream)
    } else if (currentBatchIsRunnable) {
      updateStatusMessage("Processing empty trigger to timeout state") // or 
whatever
      runBatch(sparkSessionForStream)
    } else {
      updateStatusMessage("Waiting for data to arrive")
    }
    ```


---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org

Reply via email to