[GitHub] [flink] GJL commented on a change in pull request #9663: [WIP][FLINK-12433][runtime] Implement DefaultScheduler stub

GitBox Fri, 20 Sep 2019 05:30:45 -0700

GJL commented on a change in pull request #9663: [WIP][FLINK-12433][runtime] 
Implement DefaultScheduler stub
URL: https://github.com/apache/flink/pull/9663#discussion_r326604670


 ##########
 File path: 
flink-runtime/src/main/java/org/apache/flink/runtime/scheduler/DefaultScheduler.java
 ##########
 @@ -75,10 +128,293 @@ public DefaultScheduler(
                        slotRequestTimeout,
                        shuffleMaster,
                        partitionTracker);
+
+               this.log = log;
+
+               this.delayExecutor = checkNotNull(delayExecutor);
+               this.userCodeLoader = checkNotNull(userCodeLoader);
+               this.executionVertexOperations = 
checkNotNull(executionVertexOperations);
+               this.executionVertexVersioner = 
checkNotNull(executionVertexVersioner);
+
+               this.executionFailureHandler = new 
ExecutionFailureHandler(failoverStrategyFactory.create(getFailoverTopology()), 
restartBackoffTimeStrategy);
+               this.schedulingStrategy = 
schedulingStrategyFactory.createInstance(this, getSchedulingTopology(), 
getJobGraph());
+               this.executionSlotAllocator = new 
DefaultExecutionSlotAllocator(slotProvider, getInputsLocationsRetriever(), 
slotRequestTimeout);
+               setTaskFailureListener(new 
UpdateSchedulerNgOnInternalTaskFailuresListener(this, 
getJobGraph().getJobID()));
+       }
+
+       // 
------------------------------------------------------------------------
+       // SchedulerNG
+       // 
------------------------------------------------------------------------
+
+       @Override
+       protected void startSchedulingInternal() {
+               prepareExecutionGraphForScheduling();
+               schedulingStrategy.startScheduling();
+       }
+
+       @Override
+       public boolean updateTaskExecutionState(final TaskExecutionState 
taskExecutionState) {
+               final Optional<ExecutionVertexID> executionVertexIdOptional = 
getExecutionVertexId(taskExecutionState.getID());
+               if (executionVertexIdOptional.isPresent()) {
+                       final ExecutionVertexID executionVertexId = 
executionVertexIdOptional.get();
+                       updateState(taskExecutionState);
+                       
schedulingStrategy.onExecutionStateChange(executionVertexId, 
taskExecutionState.getExecutionState());
+                       maybeHandleTaskFailure(taskExecutionState, 
executionVertexId);
+                       return true;
+               }
+
+               return false;
+       }
+
+       private void maybeHandleTaskFailure(final TaskExecutionState 
taskExecutionState, final ExecutionVertexID executionVertexId) {
+               if (taskExecutionState.getExecutionState() == 
ExecutionState.FAILED) {
+                       final Throwable error = 
taskExecutionState.getError(userCodeLoader);
+                       handleTaskFailure(executionVertexId, error);
+               }
+       }
+
+       private void handleTaskFailure(final ExecutionVertexID 
executionVertexId, final Throwable error) {
+               final FailureHandlingResult failureHandlingResult = 
executionFailureHandler.getFailureHandlingResult(executionVertexId, error);
+               maybeRestartTasks(failureHandlingResult);
+       }
+
+       private void maybeRestartTasks(final FailureHandlingResult 
failureHandlingResult) {
+               if (failureHandlingResult.canRestart()) {
+                       restartTasksWithDelay(failureHandlingResult);
+               } else {
+                       failJob(failureHandlingResult.getError());
+               }
+       }
+
+       private void restartTasksWithDelay(final FailureHandlingResult 
failureHandlingResult) {
+               final Set<ExecutionVertexID> verticesToRestart = 
failureHandlingResult.getVerticesToRestart();
+
+               final Set<ExecutionVertexVersion> executionVertexVersions =
+                       new 
HashSet<>(executionVertexVersioner.recordVertexModifications(verticesToRestart).values());
+
+               final CompletableFuture<?> cancelFuture = 
cancelTasksAsync(verticesToRestart);
+
+               delayExecutor.schedule(
+                       () -> FutureUtils.assertNoException(
+                               
cancelFuture.handleAsync(restartTasksOrHandleError(executionVertexVersions), 
getMainThreadExecutor())),
+                       failureHandlingResult.getRestartDelayMS(),
+                       TimeUnit.MILLISECONDS);
+       }
+
+       private BiFunction<Object, Throwable, Void> 
restartTasksOrHandleError(final Set<ExecutionVertexVersion> 
executionVertexVersions) {
+               return (Object ignored, Throwable throwable) -> {
+
+                       if (throwable == null) {
+                               final Set<ExecutionVertexID> verticesToRestart 
= 
executionVertexVersioner.getUnmodifiedExecutionVertices(executionVertexVersions);
+                               
schedulingStrategy.restartTasks(verticesToRestart);
+                       } else {
+                               failJob(throwable);
+                       }
+                       return null;
+               };
+       }
+
+       private CompletableFuture<?> cancelTasksAsync(final 
Set<ExecutionVertexID> verticesToRestart) {
+               final List<CompletableFuture<?>> cancelFutures = 
verticesToRestart.stream()
+                       .map(this::cancelExecutionVertex)
+                       .collect(Collectors.toList());
+
+               return FutureUtils.combineAll(cancelFutures);
+       }
+
+       private CompletableFuture<?> cancelExecutionVertex(final 
ExecutionVertexID executionVertexId) {
+               return 
executionVertexOperations.cancel(getExecutionVertex(executionVertexId));
        }
 
        @Override
-       public void startScheduling() {
-               throw new UnsupportedOperationException();
+       public void scheduleOrUpdateConsumers(final ResultPartitionID 
partitionId) {
+               final Optional<ExecutionVertexID> producerVertexId = 
getExecutionVertexId(partitionId.getProducerId());
+               if (producerVertexId.isPresent()) {
+                       updateConsumers(partitionId);
+                       
schedulingStrategy.onPartitionConsumable(producerVertexId.get(), partitionId);
+               }
+       }
+
+       // 
------------------------------------------------------------------------
+       // SchedulerOperations
+       // 
------------------------------------------------------------------------
+
+       @Override
+       public void allocateSlotsAndDeploy(final 
Collection<ExecutionVertexDeploymentOption> executionVertexDeploymentOptions) {
+               final Map<ExecutionVertexID, ExecutionVertexDeploymentOption> 
deploymentOptionsByVertex = 
groupDeploymentOptionsByVertexId(executionVertexDeploymentOptions);
+               final Set<ExecutionVertexID> verticesToDeploy = 
deploymentOptionsByVertex.keySet();
+               final Map<ExecutionVertexID, ExecutionVertexVersion> 
requiredVersionByVertex = 
executionVertexVersioner.recordVertexModifications(verticesToDeploy);
+
+               prepareToDeployVertices(verticesToDeploy);
+
+               final Collection<SlotExecutionVertexAssignment> 
slotExecutionVertexAssignments = 
allocateSlots(executionVertexDeploymentOptions);
+
+               final Collection<DeploymentHandle> deploymentHandles = 
createDeploymentHandles(
+                       requiredVersionByVertex,
+                       deploymentOptionsByVertex,
+                       slotExecutionVertexAssignments);
+
+               if (isDeployIndividually()) {
+                       deployIndividually(deploymentHandles);
+               } else {
+                       waitForAllSlotsAndDeploy(deploymentHandles);
+               }
+       }
+
+       private static Map<ExecutionVertexID, ExecutionVertexDeploymentOption> 
groupDeploymentOptionsByVertexId(
+                       final Collection<ExecutionVertexDeploymentOption> 
executionVertexDeploymentOptions) {
+               return 
executionVertexDeploymentOptions.stream().collect(Collectors.toMap(
+                               
ExecutionVertexDeploymentOption::getExecutionVertexId,
+                               Function.identity()));
+       }
+
+       private void prepareToDeployVertices(final Set<ExecutionVertexID> 
verticesToDeploy) {
+               cancelSlotAssignments(verticesToDeploy);
+               resetForNewExecution(verticesToDeploy);
+               transitionToScheduled(verticesToDeploy);
+       }
+
+       private void cancelSlotAssignments(final Collection<ExecutionVertexID> 
vertices) {
+               vertices.forEach(executionVertexId -> 
executionSlotAllocator.cancel(executionVertexId));
+       }
+
+       private Collection<SlotExecutionVertexAssignment> allocateSlots(final 
Collection<ExecutionVertexDeploymentOption> executionVertexDeploymentOptions) {
+               return 
executionSlotAllocator.allocateSlotsFor(executionVertexDeploymentOptions
+                       .stream()
+                       
.map(ExecutionVertexDeploymentOption::getExecutionVertexId)
+                       .map(this::getExecutionVertex)
+                       .map(ExecutionVertexSchedulingRequirementsMapper::from)
+                       .collect(Collectors.toList()));
+       }
+
+       private static Collection<DeploymentHandle> createDeploymentHandles(
+               final Map<ExecutionVertexID, ExecutionVertexVersion> 
requiredVersionByVertex,
+               final Map<ExecutionVertexID, ExecutionVertexDeploymentOption> 
deploymentOptionsByVertex,
+               final Collection<SlotExecutionVertexAssignment> 
slotExecutionVertexAssignments) {
+
+               return slotExecutionVertexAssignments
+                       .stream()
+                       .map(slotExecutionVertexAssignment -> {
+                               final ExecutionVertexID executionVertexId = 
slotExecutionVertexAssignment.getExecutionVertexId();
+                               return new DeploymentHandle(
+                                       
requiredVersionByVertex.get(executionVertexId),
+                                       
deploymentOptionsByVertex.get(executionVertexId),
+                                       slotExecutionVertexAssignment);
+                       })
+                       .collect(Collectors.toList());
+       }
+
+       private boolean isDeployIndividually() {
+               return schedulingStrategy instanceof 
LazyFromSourcesSchedulingStrategy;
 
 Review comment:
   > I think this check is not friendly for future scheduling strategies.
   
   I agree. 
   
   `deployIndividually` has the advantage that one can run batch jobs with a 
higher parallelism than there are slots (this would not be possible in the 
`waitForAllSlotsAndDeploy` code path). In theory, it is possible to have a 
single code path for both eager and lazy scheduling. However, I did not have 
time to think it through, so I think this requires a follow-up task.

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services

[GitHub] [flink] GJL commented on a change in pull request #9663: [WIP][FLINK-12433][runtime] Implement DefaultScheduler stub

Reply via email to