zhuzhurk commented on a change in pull request #12375:
URL: https://github.com/apache/flink/pull/12375#discussion_r434987385



##########
File path: 
flink-runtime/src/main/java/org/apache/flink/runtime/jobmaster/slotpool/BulkSlotProviderImpl.java
##########
@@ -0,0 +1,271 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.runtime.jobmaster.slotpool;
+
+import org.apache.flink.annotation.VisibleForTesting;
+import org.apache.flink.api.common.time.Time;
+import org.apache.flink.runtime.clusterframework.types.AllocationID;
+import org.apache.flink.runtime.clusterframework.types.ResourceProfile;
+import org.apache.flink.runtime.clusterframework.types.SlotProfile;
+import org.apache.flink.runtime.concurrent.ComponentMainThreadExecutor;
+import org.apache.flink.runtime.concurrent.FutureUtils;
+import org.apache.flink.runtime.jobmaster.SlotInfo;
+import org.apache.flink.runtime.jobmaster.SlotRequestId;
+import org.apache.flink.util.clock.Clock;
+import org.apache.flink.util.clock.SystemClock;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Optional;
+import java.util.Set;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.TimeoutException;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+import static org.apache.flink.util.Preconditions.checkNotNull;
+
+/**
+ * Default implementation of {@link BulkSlotProvider}.
+ */
+class BulkSlotProviderImpl implements BulkSlotProvider {
+
+       private static final Logger LOG = 
LoggerFactory.getLogger(BulkSlotProviderImpl.class);
+
+       private ComponentMainThreadExecutor componentMainThreadExecutor;
+
+       private final SlotSelectionStrategy slotSelectionStrategy;
+
+       private final SlotPool slotPool;
+
+       private final Clock clock;
+
+       private final PhysicalSlotRequestBulkTracker slotRequestBulkTracker;
+
+       BulkSlotProviderImpl(final SlotSelectionStrategy slotSelectionStrategy, 
final SlotPool slotPool) {
+               this(slotSelectionStrategy, slotPool, 
SystemClock.getInstance());
+       }
+
+       @VisibleForTesting
+       BulkSlotProviderImpl(
+                       final SlotSelectionStrategy slotSelectionStrategy,
+                       final SlotPool slotPool,
+                       final Clock clock) {
+
+               this.slotSelectionStrategy = 
checkNotNull(slotSelectionStrategy);
+               this.slotPool = checkNotNull(slotPool);
+               this.clock = checkNotNull(clock);
+
+               this.slotRequestBulkTracker = new 
PhysicalSlotRequestBulkTracker(clock);
+
+               this.componentMainThreadExecutor = new 
ComponentMainThreadExecutor.DummyComponentMainThreadExecutor(
+                       "Scheduler is not initialized with proper main thread 
executor. " +
+                               "Call to BulkSlotProvider.start(...) 
required.");
+       }
+
+       @Override
+       public void start(final ComponentMainThreadExecutor mainThreadExecutor) 
{
+               this.componentMainThreadExecutor = mainThreadExecutor;
+       }
+
+       @Override
+       public CompletableFuture<Collection<PhysicalSlotRequest.Result>> 
allocatePhysicalSlots(
+                       final Collection<PhysicalSlotRequest> 
physicalSlotRequests,
+                       final Time timeout) {
+
+               componentMainThreadExecutor.assertRunningInMainThread();
+
+               LOG.debug("Received {} slot requests.", 
physicalSlotRequests.size());
+
+               final PhysicalSlotRequestBulk slotRequestBulk = new 
PhysicalSlotRequestBulk(physicalSlotRequests);
+
+               final List<CompletableFuture<PhysicalSlotRequest.Result>> 
resultFutures = new ArrayList<>(physicalSlotRequests.size());
+               for (PhysicalSlotRequest request : physicalSlotRequests) {
+                       final CompletableFuture<PhysicalSlotRequest.Result> 
resultFuture =
+                               allocatePhysicalSlot(request, 
timeout).thenApply(result -> {
+                                       slotRequestBulk.markRequestFulfilled(
+                                               result.getSlotRequestId(),
+                                               
result.getPhysicalSlot().getAllocationId());
+
+                                       return result;
+                               });
+                       resultFutures.add(resultFuture);
+               }
+
+               slotRequestBulkTracker.track(slotRequestBulk);
+               schedulePendingRequestBulkTimeoutCheck(slotRequestBulk, 
timeout);
+
+               return FutureUtils.combineAll(resultFutures)
+                       .whenComplete((ignore, throwable) -> 
slotRequestBulkTracker.untrack(slotRequestBulk));
+       }
+
+       private CompletableFuture<PhysicalSlotRequest.Result> 
allocatePhysicalSlot(
+                       final PhysicalSlotRequest physicalSlotRequest,
+                       final Time timeout) {
+
+               final SlotRequestId slotRequestId = 
physicalSlotRequest.getSlotRequestId();
+               final SlotProfile slotProfile = 
physicalSlotRequest.getSlotProfile();
+               final ResourceProfile resourceProfile = 
slotProfile.getPhysicalSlotResourceProfile();
+
+               LOG.debug("Received slot request [{}] with resource 
requirements: {}", slotRequestId, resourceProfile);
+
+               final Optional<PhysicalSlot> availablePhysicalSlot = 
tryAllocateFromAvailable(slotRequestId, slotProfile);
+
+               final CompletableFuture<PhysicalSlot> slotFuture;
+               if (availablePhysicalSlot.isPresent()) {
+                       slotFuture = 
CompletableFuture.completedFuture(availablePhysicalSlot.get());
+               } else {
+                       slotFuture = 
slotPool.requestNewAllocatedSlotWithoutTimeout(

Review comment:
       I'm wondering whether we should treat the slot pool more like a pool, and 
thus separate the concerns of bulk allocation failures and external request 
failures.
   1. When a bulk allocation fails because it is not fulfillable, we can print 
debug-level logs describing the slot requests and the state of the 
available/allocated slots in the slot pool.
   2. External request failures should be exposed in the logs to show that 
Flink failed to extend the pool, which can be the underlying reason why a bulk of 
slot requests fails. However, an external slot request failure does not mean that 
the internal slot request will definitely fail.
   
   I'm even thinking of enabling auto-retry for failed external slot requests, 
rather than failing the initiating internal slot request, to make the slot pool 
more like a pool. (Related to this 
[comment](https://github.com/apache/flink/pull/12278#discussion_r433607269).)




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to