m-trieu commented on code in PR #31902: URL: https://github.com/apache/beam/pull/31902#discussion_r1757335502
########## runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/client/grpc/observers/DirectStreamObserver.java: ########## @@ -74,69 +76,125 @@ public void onNext(T value) { while (true) { try { synchronized (lock) { + // If we awaited previously and timed out, wait for the same phase. Otherwise we're + // careful to observe the phase before observing isReady. + if (awaitPhase < 0) { + awaitPhase = isReadyNotifier.getPhase(); + // If getPhase() returns a value less than 0, the phaser has been terminated. + if (awaitPhase < 0) { + return; + } + } + // We only check isReady periodically to effectively allow for increasing the outbound // buffer periodically. This reduces the overhead of blocking while still restricting // memory because there is a limited # of streams, and we have a max messages size of 2MB. if (++messagesSinceReady <= messagesBetweenIsReadyChecks) { - outboundObserver.onNext(value); + tryOnNext(value); return; } - // If we awaited previously and timed out, wait for the same phase. Otherwise we're - // careful to observe the phase before observing isReady. - if (awaitPhase < 0) { - awaitPhase = phaser.getPhase(); - } + if (outboundObserver.isReady()) { messagesSinceReady = 0; - outboundObserver.onNext(value); + tryOnNext(value); return; } } + // A callback has been registered to advance the phaser whenever the observer // transitions to is ready. Since we are waiting for a phase observed before the // outboundObserver.isReady() returned false, we expect it to advance after the // channel has become ready. This doesn't always seem to be the case (despite // documentation stating otherwise) so we poll periodically and enforce an overall // timeout related to the stream deadline. - phaser.awaitAdvanceInterruptibly(awaitPhase, waitSeconds, TimeUnit.SECONDS); + int nextPhase = + isReadyNotifier.awaitAdvanceInterruptibly(awaitPhase, waitSeconds, TimeUnit.SECONDS); + // If nextPhase is a value less than 0, the phaser has been terminated. + if (nextPhase < 0) { + return; + } + synchronized (lock) { messagesSinceReady = 0; - outboundObserver.onNext(value); + tryOnNext(value); return; } } catch (TimeoutException e) { + if (isReadyNotifier.isTerminated()) { + return; + } + totalSecondsWaited += waitSeconds; - if (totalSecondsWaited > deadlineSeconds) { - LOG.error( - "Exceeded timeout waiting for the outboundObserver to become ready meaning " - + "that the stream deadline was not respected."); - throw new RuntimeException(e); + if (hasDeadlineExpired(totalSecondsWaited)) { + String errorMessage = constructStreamCancelledErrorMessage(totalSecondsWaited); + LOG.error(errorMessage); + throw new StreamObserverCancelledException(errorMessage, e); } + if (totalSecondsWaited > 30) { LOG.info( "Output channel stalled for {}s, outbound thread {}.", totalSecondsWaited, Thread.currentThread().getName()); } + waitSeconds = waitSeconds * 2; } catch (InterruptedException e) { Thread.currentThread().interrupt(); - throw new RuntimeException(e); + StreamObserverCancelledException ex = new StreamObserverCancelledException(e); + LOG.error("Interrupted while waiting for outboundObserver to become ready.", ex); + throw ex; + } + } + } + + /** + * Only send the next value if the phaser is not terminated by the time we acquire the lock since + * the phaser can be terminated at any time. + */ + private void tryOnNext(T value) { Review Comment: but if the phaser is terminated, we don't want to call outboundObserver.onNext() right? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@beam.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org