zhuzhurk commented on a change in pull request #9663: 
[WIP][FLINK-12433][runtime] Implement DefaultScheduler stub
URL: https://github.com/apache/flink/pull/9663#discussion_r324477116
 
 

 ##########
 File path: 
flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/DefaultScheduler.java
 ##########
 @@ -75,10 +137,281 @@ public DefaultScheduler(
                        slotRequestTimeout,
                        shuffleMaster,
                        partitionTracker);
+
+               this.restartBackoffTimeStrategy = 
checkNotNull(restartBackoffTimeStrategy);
+               this.slotRequestTimeout = checkNotNull(slotRequestTimeout);
+               this.slotProvider = checkNotNull(slotProvider);
+               this.delayExecutor = checkNotNull(delayExecutor);
+               this.userCodeLoader = checkNotNull(userCodeLoader);
+               this.schedulingStrategyFactory = 
checkNotNull(schedulingStrategyFactory);
+               this.failoverStrategyFactory = 
checkNotNull(failoverStrategyFactory);
+               this.executionVertexOperations = 
checkNotNull(executionVertexOperations);
+               this.executionVertexVersioner = 
checkNotNull(executionVertexVersioner);
+               this.conditionalFutureHandlerFactory = new 
ConditionalFutureHandlerFactory(executionVertexVersioner);
        }
 
+       // 
------------------------------------------------------------------------
+       // SchedulerNG
+       // 
------------------------------------------------------------------------
+
        @Override
-       public void startScheduling() {
-               throw new UnsupportedOperationException();
+       public void startSchedulingInternal() {
+               initializeScheduling();
+               schedulingStrategy.startScheduling();
+       }
+
+       private void initializeScheduling() {
+               executionFailureHandler = new 
ExecutionFailureHandler(failoverStrategyFactory.create(getFailoverTopology()), 
restartBackoffTimeStrategy);
+               schedulingStrategy = 
schedulingStrategyFactory.createInstance(this, getSchedulingTopology(), 
getJobGraph());
+               executionSlotAllocator = new 
DefaultExecutionSlotAllocator(slotProvider, getInputsLocationsRetriever(), 
slotRequestTimeout);
+               setTaskFailureListener(new 
UpdateTaskExecutionStateInDefaultSchedulerListener(this, 
getJobGraph().getJobID()));
+               prepareExecutionGraphForScheduling();
+       }
+
+       @Override
+       public boolean updateTaskExecutionState(final TaskExecutionState 
taskExecutionState) {
+               final Optional<ExecutionVertexID> executionVertexIdOptional = 
getExecutionVertexId(taskExecutionState.getID());
+               if (executionVertexIdOptional.isPresent()) {
+                       final ExecutionVertexID executionVertexId = 
executionVertexIdOptional.get();
+                       updateState(taskExecutionState);
+                       
schedulingStrategy.onExecutionStateChange(executionVertexId, 
taskExecutionState.getExecutionState());
+                       maybeHandleTaskFailure(taskExecutionState, 
executionVertexId);
+                       return true;
+               }
+
+               return false;
+       }
+
+       private void maybeHandleTaskFailure(final TaskExecutionState 
taskExecutionState, final ExecutionVertexID executionVertexId) {
+               if (taskExecutionState.getExecutionState() == 
ExecutionState.FAILED) {
+                       final Throwable error = 
taskExecutionState.getError(userCodeLoader);
+                       handleTaskFailure(executionVertexId, error);
+               }
+       }
+
+       private void handleTaskFailure(final ExecutionVertexID 
executionVertexId, final Throwable error) {
+               final FailureHandlingResult failureHandlingResult = 
executionFailureHandler.getFailureHandlingResult(executionVertexId, error);
+               maybeRestartTasks(failureHandlingResult);
+       }
+
+       private void maybeRestartTasks(final FailureHandlingResult 
failureHandlingResult) {
+               if (failureHandlingResult.canRestart()) {
+                       restartTasksWithDelay(failureHandlingResult);
+               } else {
+                       failJob(failureHandlingResult.getError());
+               }
+       }
+
+       private void restartTasksWithDelay(final FailureHandlingResult 
failureHandlingResult) {
+               final Set<ExecutionVertexID> verticesToRestart = 
failureHandlingResult.getVerticesToRestart();
+
+               final Set<ExecutionVertexVersion> executionVertexVersions =
+                       new 
HashSet<>(executionVertexVersioner.recordVertexModifications(verticesToRestart).values());
+
+               final CompletableFuture<?> cancelFuture = 
cancelTasksAsync(verticesToRestart);
+
+               delayExecutor.schedule(
 
 Review comment:
   This makes the delay the "time span between the task failure happening and 
restarting the tasks".
   Previously it was the "time span between the completion of canceling the 
affected tasks and restarting the tasks".
   I think this change is reasonable, as it better respects the config 
description. However, it changes the existing behavior.
   Is this expected?

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


With regards,
Apache Git Services

Reply via email to