tdas commented on a change in pull request #28040: [SPARK-31278][SS] Fix
StreamingQuery output rows metric
URL: https://github.com/apache/spark/pull/28040#discussion_r404608550
##########
File path:
sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingAggregationSuite.scala
##########
@@ -202,47 +202,55 @@ class StreamingAggregationSuite extends
StateStoreMetricsTest with Assertions {
}
}
- def stateOperatorProgresses: Seq[StateOperatorProgress] = {
- val operatorProgress = mutable.ArrayBuffer[StateOperatorProgress]()
- var progress = query.recentProgress.last
-
- operatorProgress ++= progress.stateOperators.map { op =>
op.copy(op.numRowsUpdated) }
- if (progress.numInputRows == 0) {
- // empty batch, merge metrics from previous batch as well
- progress = query.recentProgress.takeRight(2).head
- operatorProgress.zipWithIndex.foreach { case (sop, index) =>
- // "numRowsUpdated" should be merged, as it could be updated in
both batches.
- // (for now it is only updated from previous batch, but things can
be changed.)
- // other metrics represent current status of state so picking up
the latest values.
- val newOperatorProgress = sop.copy(
- sop.numRowsUpdated +
progress.stateOperators(index).numRowsUpdated)
- operatorProgress(index) = newOperatorProgress
- }
- }
+ // Pick the latest progress that actually ran a batch
+ def lastExecutedBatch: StreamingQueryProgress = {
+ query.recentProgress.filter(_.durationMs.containsKey("addBatch")).last
+ }
- operatorProgress
+ def stateOperatorProgresses: Seq[StateOperatorProgress] = {
+ lastExecutedBatch.stateOperators
}
}
+ val clock = new StreamManualClock()
+
testStream(aggWithWatermark)(
+ StartStream(Trigger.ProcessingTime("interval 1 second"), clock),
AddData(inputData, 15),
- CheckAnswer(), // watermark = 5
+ AdvanceManualClock(1000L), // triggers first batch
+ CheckAnswer(), // watermark = 0
AssertOnQuery { _.stateNodes.size === 1 },
AssertOnQuery { _.stateNodes.head.metrics("numOutputRows").value === 0 },
AssertOnQuery { _.stateOperatorProgresses.head.numRowsUpdated === 1 },
AssertOnQuery { _.stateOperatorProgresses.head.numRowsTotal === 1 },
+ AssertOnQuery { _.lastExecutedBatch.sink.numOutputRows == 0 },
AddData(inputData, 10, 12, 14),
+ AdvanceManualClock(1000L), // watermark = 0, runs with the just added
data
CheckAnswer(), // watermark = 5
AssertOnQuery { _.stateNodes.size === 1 },
AssertOnQuery { _.stateNodes.head.metrics("numOutputRows").value === 0 },
AssertOnQuery { _.stateOperatorProgresses.head.numRowsUpdated === 1 },
AssertOnQuery { _.stateOperatorProgresses.head.numRowsTotal === 2 },
+ AssertOnQuery { _.lastExecutedBatch.sink.numOutputRows == 0 },
AddData(inputData, 25),
- CheckAnswer((10, 3)), // watermark = 15
+ AdvanceManualClock(1000L), // actually runs batch with data
+ CheckAnswer(), // watermark = 5, will update to 15 next batch
AssertOnQuery { _.stateNodes.size === 1 },
- AssertOnQuery { _.stateNodes.head.metrics("numOutputRows").value === 1 },
+ AssertOnQuery { _.stateNodes.head.metrics("numOutputRows").value === 0 },
+ AssertOnQuery { _.stateOperatorProgresses.head.numRowsUpdated === 1 },
+ AssertOnQuery { _.stateOperatorProgresses.head.numRowsTotal === 3 },
+ AssertOnQuery { _.lastExecutedBatch.sink.numOutputRows == 0 },
+ AdvanceManualClock(1000L), // runs batch with no new data and watermark
progresses
+ CheckAnswer(), // watermark = 15, but nothing yet
Review comment:
I feel like this will flaky. CheckAnswer() works reliably only when there is
new data to process because it waits for the new data's offset to be reported
as processed. Here there is no new data in the no-data-batch, so its possible
that this CheckAnswer wont wait for the no-data-batch to finish before starting
the last progress checks.
Instead its more reliable (probably) to use eventually, where you check that
the lastprogress has the expected batchId.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
With regards,
Apache Git Services
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]