rkhachatryan commented on a change in pull request #12670: URL: https://github.com/apache/flink/pull/12670#discussion_r440915613
##########
File path:
flink-runtime/src/main/java/org/apache/flink/runtime/checkpoint/CheckpointCoordinator.java
##########
@@ -538,51 +542,61 @@ private void
startTriggeringCheckpoint(CheckpointTriggerRequest request) {
coordinatorsToCheckpoint, pendingCheckpoint, timer),
timer);
- FutureUtils.assertNoException(
- CompletableFuture.allOf(masterStatesComplete,
coordinatorCheckpointsComplete)
- .handleAsync(
- (ignored, throwable) -> {
- final PendingCheckpoint
checkpoint =
-
FutureUtils.getWithoutException(pendingCheckpointCompletableFuture);
-
-
Preconditions.checkState(
- checkpoint !=
null || throwable != null,
- "Either the
pending checkpoint needs to be created or an error must have been occurred.");
-
- if (throwable != null) {
- // the
initialization might not be finished yet
- if (checkpoint
== null) {
-
onTriggerFailure(request, throwable);
- } else {
-
onTriggerFailure(checkpoint, throwable);
- }
+ FutureUtils.waitForAll(asList(masterStatesComplete,
coordinatorCheckpointsComplete))
+ .handleAsync(
+ (ignored, throwable) -> {
+ final PendingCheckpoint
checkpoint =
+
FutureUtils.getWithoutException(pendingCheckpointCompletableFuture);
+
+ Preconditions.checkState(
+ checkpoint != null ||
throwable != null,
+ "Either the pending
checkpoint needs to be created or an error must have been occurred.");
+
+ if (throwable != null) {
+ // the initialization
might not be finished yet
+ if (checkpoint == null)
{
+
onTriggerFailure(request, throwable);
} else {
- if
(checkpoint.isDiscarded()) {
-
onTriggerFailure(
-
checkpoint,
-
new CheckpointException(
-
CheckpointFailureReason.TRIGGER_CHECKPOINT_FAILURE,
-
checkpoint.getFailureCause()));
- } else {
- // no
exception, no discarding, everything is OK
- final
long checkpointId = checkpoint.getCheckpointId();
-
snapshotTaskState(
-
timestamp,
-
checkpointId,
-
checkpoint.getCheckpointStorageLocation(),
-
request.props,
-
executions,
-
request.advanceToEndOfTime);
-
-
coordinatorsToCheckpoint.forEach((ctx) ->
ctx.afterSourceBarrierInjection(checkpointId));
-
-
onTriggerSuccess();
- }
+
onTriggerFailure(checkpoint, throwable);
}
+ } else {
+ if
(checkpoint.isDiscarded()) {
+
onTriggerFailure(
+
checkpoint,
+ new
CheckpointException(
+
CheckpointFailureReason.TRIGGER_CHECKPOINT_FAILURE,
+
checkpoint.getFailureCause()));
+ } else {
+ // no
exception, no discarding, everything is OK
+ final long
checkpointId = checkpoint.getCheckpointId();
+
snapshotTaskState(
+
timestamp,
+
checkpointId,
+
checkpoint.getCheckpointStorageLocation(),
+
request.props,
+
executions,
+
request.advanceToEndOfTime);
+
+
coordinatorsToCheckpoint.forEach((ctx) ->
ctx.afterSourceBarrierInjection(checkpointId));
+
+
onTriggerSuccess();
+ }
+ }
- return null;
- },
- timer));
+ return null;
+ },
+ timer)
+ .whenComplete((unused, error) -> {
+ if (error != null) {
+ if (!isShutdown()) {
+
failureManager.handleJobLevelCheckpointException(new
CheckpointException(EXCEPTION, error), Optional.empty());
Review comment:
> Why do you think that it hides error details?
Because `System.exit` can leave log buffers containing error details
unflushed.
Plus
> hides error itself in failover setup?
I.e. if programming error corrupts state then job restart can be undesirable.
Just crashing doesn't prevent restart, but failing the job does.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
