lostluck commented on code in PR #36163: URL: https://github.com/apache/beam/pull/36163#discussion_r2349965073
########## sdks/go/pkg/beam/runners/prism/internal/engine/elementmanager.go: ########## @@ -892,70 +892,75 @@ func (em *ElementManager) PersistBundle(rb RunBundle, col2Coders map[string]PCol // Clear out the inprogress elements associated with the completed bundle. // Must be done after adding the new pending elements to avoid an incorrect // watermark advancement. - stage.mu.Lock() - completed := stage.inprogress[rb.BundleID] - em.addPending(-len(completed.es)) - delete(stage.inprogress, rb.BundleID) - for k := range stage.inprogressKeysByBundle[rb.BundleID] { - delete(stage.inprogressKeys, k) - } - delete(stage.inprogressKeysByBundle, rb.BundleID) - - // Adjust holds as needed. - for h, c := range newHolds { - if c > 0 { - stage.watermarkHolds.Add(h, c) - } else if c < 0 { - stage.watermarkHolds.Drop(h, -c) - } - } - for hold, v := range stage.inprogressHoldsByBundle[rb.BundleID] { - stage.watermarkHolds.Drop(hold, v) - } - delete(stage.inprogressHoldsByBundle, rb.BundleID) - - // Clean up OnWindowExpiration bundle accounting, so window state - // may be garbage collected. - if stage.expiryWindowsByBundles != nil { - win, ok := stage.expiryWindowsByBundles[rb.BundleID] - if ok { - stage.inProgressExpiredWindows[win] -= 1 - if stage.inProgressExpiredWindows[win] == 0 { - delete(stage.inProgressExpiredWindows, win) + func() { + stage.mu.Lock() + // Defer unlocking the mutex within an anonymous function to ensure it's released + // even if a panic occurs during `em.addPending`. This prevents potential deadlocks + // if the waitgroup unexpectedly drops below zero due to a runner bug. + defer stage.mu.Unlock() + completed := stage.inprogress[rb.BundleID] + em.addPending(-len(completed.es)) Review Comment: Hmmm. Avoiding the deadlock is good, but it's definitely something that should be causing the pipeline to stop, and error out. I'd recommend, at least adding a log or something to em.addPending, to log when the livePending count returns <= 0. 
Then we can at least log that the subsequent panic indicates a logic error in Prism, without interrupting it. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@beam.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org