zhuzhurk commented on a change in pull request #13181: URL: https://github.com/apache/flink/pull/13181#discussion_r474532534
########## File path: flink-runtime/src/test/java/org/apache/flink/runtime/jobmaster/slotpool/TestingPhysicalSlotRequestBulkBuilder.java ########## @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.runtime.jobmaster.slotpool; + +import org.apache.flink.runtime.clusterframework.types.ResourceProfile; +import org.apache.flink.runtime.jobmaster.SlotRequestId; +import org.apache.flink.runtime.jobmaster.slotpool.PhysicalSlotRequestBulkCheckerImpl.PhysicalSlotRequestBulkWithTimestamp; + +import java.util.HashMap; +import java.util.Map; +import java.util.function.BiConsumer; + +class TestingPhysicalSlotRequestBulkBuilder { + private static final BiConsumer<SlotRequestId, Throwable> EMPTY_CANCELLER = (r, t) -> {}; + private Map<SlotRequestId, ResourceProfile> pendingRequests = new HashMap<>(); + private BiConsumer<SlotRequestId, Throwable> canceller = EMPTY_CANCELLER; + + TestingPhysicalSlotRequestBulkBuilder addPendingRequests(SlotRequestId slotRequestId, ResourceProfile resourceProfile) { Review comment: `addPendingRequests` -> `addPendingRequest` ########## File path: 
flink-runtime/src/test/java/org/apache/flink/runtime/jobmaster/slotpool/PhysicalSlotRequestBulkCheckerImplTest.java ########## @@ -0,0 +1,274 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.runtime.jobmaster.slotpool; + +import org.apache.flink.api.common.time.Time; +import org.apache.flink.runtime.clusterframework.types.AllocationID; +import org.apache.flink.runtime.clusterframework.types.ResourceProfile; +import org.apache.flink.runtime.concurrent.ComponentMainThreadExecutor; +import org.apache.flink.runtime.concurrent.ComponentMainThreadExecutorServiceAdapter; +import org.apache.flink.runtime.jobmaster.SlotInfo; +import org.apache.flink.runtime.jobmaster.SlotRequestId; +import org.apache.flink.runtime.jobmaster.slotpool.PhysicalSlotRequestBulkCheckerImpl.PhysicalSlotRequestBulkWithTimestamp; +import org.apache.flink.util.TestLogger; +import org.apache.flink.util.clock.ManualClock; + +import org.junit.AfterClass; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Test; + +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; +import java.util.concurrent.CompletableFuture; 
+import java.util.concurrent.ExecutionException; +import java.util.concurrent.Executors; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; +import java.util.function.Supplier; + +import static org.apache.flink.runtime.jobmaster.slotpool.PhysicalSlotTestUtils.createPhysicalSlot; +import static org.apache.flink.runtime.jobmaster.slotpool.PhysicalSlotTestUtils.occupyPhysicalSlot; +import static org.hamcrest.Matchers.is; +import static org.junit.Assert.assertThat; +import static org.junit.Assert.fail; + +/** + * Tests for {@link PhysicalSlotRequestBulkCheckerImpl}. + */ +public class PhysicalSlotRequestBulkCheckerImplTest extends TestLogger { + + private static final Time TIMEOUT = Time.milliseconds(100L); + + private static ScheduledExecutorService singleThreadScheduledExecutorService; + + private static ComponentMainThreadExecutor mainThreadExecutor; + + private final ManualClock clock = new ManualClock(); + + private PhysicalSlotRequestBulkCheckerImpl bulkChecker; + + private Set<PhysicalSlot> slots; + + private Supplier<Set<SlotInfo>> slotsRetriever; + + @BeforeClass + public static void setupClass() { + singleThreadScheduledExecutorService = Executors.newSingleThreadScheduledExecutor(); + mainThreadExecutor = ComponentMainThreadExecutorServiceAdapter.forSingleThreadExecutor(singleThreadScheduledExecutorService); + } + + @AfterClass + public static void teardownClass() { + if (singleThreadScheduledExecutorService != null) { + singleThreadScheduledExecutorService.shutdownNow(); + } + } + + @Before + public void setup() throws Exception { + slots = new HashSet<>(); + slotsRetriever = () -> new HashSet<>(slots); + bulkChecker = new PhysicalSlotRequestBulkCheckerImpl(slotsRetriever, clock); + bulkChecker.start(mainThreadExecutor); + } + + @Test + public void testPendingBulkIsNotCancelled() throws InterruptedException, ExecutionException { + final 
PhysicalSlotRequestBulkWithCancellationTracking bulk = new PhysicalSlotRequestBulkWithCancellationTracking(ResourceProfile.ANY); + bulkChecker.schedulePendingRequestBulkTimeoutCheck(bulk, TIMEOUT); + bulk.checkNotCancelledAfter(TIMEOUT.toMilliseconds() + 10); + } + + @Test + public void testFulfilledBulkIsNotCancelled() throws InterruptedException, ExecutionException { + final PhysicalSlotRequestBulkWithCancellationTracking bulk = new PhysicalSlotRequestBulkWithCancellationTracking(); + bulkChecker.schedulePendingRequestBulkTimeoutCheck(bulk, TIMEOUT); + bulk.checkNotCancelledAfter(2 * TIMEOUT.toMilliseconds() + 10); + } + + @Test + public void testUnfulfilledBulkIsCancelled() { Review comment: `testUnfulfillableBulkIsCancelled ` seems more accurate because `PENDING` requests are also unfulfilled but fulfillable. ########## File path: flink-runtime/src/test/java/org/apache/flink/runtime/scheduler/SlotSharingExecutionSlotAllocatorTest.java ########## @@ -274,6 +277,73 @@ public void testPhysicalSlotReleaseLogicalSlots() throws ExecutionException, Int assertThat(payloads.stream().allMatch(payload -> payload.getTerminalStateFuture().isDone()), is(true)); } + @Test + public void testSchedulePendingRequestBulkTimeoutCheck() { + TestingPhysicalSlotRequestBulkChecker bulkChecker = new TestingPhysicalSlotRequestBulkChecker(); + AllocationContext context = createBulkCheckerContextWithEv12GroupAndEv3Group(bulkChecker); + + context.allocateSlotsFor(EV1, EV3); + PhysicalSlotRequestBulk bulk = bulkChecker.getBulk(); + + assertThat(bulk.getPendingRequests(), hasSize(2)); + assertThat(bulk.getPendingRequests(), containsInAnyOrder(RESOURCE_PROFILE.multiply(2), RESOURCE_PROFILE)); + assertThat(bulk.getAllocationIdsOfFulfilledRequests(), hasSize(0)); + assertThat(bulkChecker.getTimeout(), is(ALLOCATION_TIMEOUT)); + } + + @Test + public void testRequestFulfilledInBulk() { + TestingPhysicalSlotRequestBulkChecker bulkChecker = new TestingPhysicalSlotRequestBulkChecker(); + 
AllocationContext context = createBulkCheckerContextWithEv12GroupAndEv3Group(bulkChecker); + + context.allocateSlotsFor(EV1, EV3); + AllocationID allocationId = new AllocationID(); + ResourceProfile pendingSlotResourceProfile = fulfilOneOfTwoSlotRequestsAndGetPendingProfile(context, allocationId); + PhysicalSlotRequestBulk bulk = bulkChecker.getBulk(); + + assertThat(bulk.getPendingRequests(), hasSize(1)); + assertThat(bulk.getPendingRequests(), containsInAnyOrder(pendingSlotResourceProfile)); + assertThat(bulk.getAllocationIdsOfFulfilledRequests(), hasSize(1)); + assertThat(bulk.getAllocationIdsOfFulfilledRequests(), containsInAnyOrder(allocationId)); + } + + @Test + public void testRequestBulkCancel() { + TestingPhysicalSlotRequestBulkChecker bulkChecker = new TestingPhysicalSlotRequestBulkChecker(); + AllocationContext context = createBulkCheckerContextWithEv12GroupAndEv3Group(bulkChecker); + + // allocate 2 physical slots for 2 groups + List<SlotExecutionVertexAssignment> assignments1 = context.allocateSlotsFor(EV1, EV3); + fulfilOneOfTwoSlotRequestsAndGetPendingProfile(context, new AllocationID()); + PhysicalSlotRequestBulk bulk1 = bulkChecker.getBulk(); + List<SlotExecutionVertexAssignment> assignments2 = context.allocateSlotsFor(EV2); + // cancelling of (EV1, EV3) releases assignments1 and only one physical slot for EV3 + // the second physical slot is held by sharing EV2 from the next bulk + bulk1.cancel(new Throwable()); + // EV3 needs again a physical slot, therefore there are 3 requests overall + context.allocateSlotsFor(EV1, EV3); + boolean ev1failed = assignments1.get(0).getLogicalSlotFuture().isCompletedExceptionally(); + boolean ev2failed = assignments1.get(0).getLogicalSlotFuture().isCompletedExceptionally(); Review comment: ```suggestion boolean ev3failed = assignments1.get(1).getLogicalSlotFuture().isCompletedExceptionally(); ``` ########## File path: 
flink-runtime/src/test/java/org/apache/flink/runtime/scheduler/SlotSharingExecutionSlotAllocatorTest.java ########## @@ -274,6 +277,73 @@ public void testPhysicalSlotReleaseLogicalSlots() throws ExecutionException, Int assertThat(payloads.stream().allMatch(payload -> payload.getTerminalStateFuture().isDone()), is(true)); } + @Test + public void testSchedulePendingRequestBulkTimeoutCheck() { + TestingPhysicalSlotRequestBulkChecker bulkChecker = new TestingPhysicalSlotRequestBulkChecker(); + AllocationContext context = createBulkCheckerContextWithEv12GroupAndEv3Group(bulkChecker); + + context.allocateSlotsFor(EV1, EV3); + PhysicalSlotRequestBulk bulk = bulkChecker.getBulk(); + + assertThat(bulk.getPendingRequests(), hasSize(2)); + assertThat(bulk.getPendingRequests(), containsInAnyOrder(RESOURCE_PROFILE.multiply(2), RESOURCE_PROFILE)); + assertThat(bulk.getAllocationIdsOfFulfilledRequests(), hasSize(0)); + assertThat(bulkChecker.getTimeout(), is(ALLOCATION_TIMEOUT)); + } + + @Test + public void testRequestFulfilledInBulk() { + TestingPhysicalSlotRequestBulkChecker bulkChecker = new TestingPhysicalSlotRequestBulkChecker(); + AllocationContext context = createBulkCheckerContextWithEv12GroupAndEv3Group(bulkChecker); + + context.allocateSlotsFor(EV1, EV3); + AllocationID allocationId = new AllocationID(); + ResourceProfile pendingSlotResourceProfile = fulfilOneOfTwoSlotRequestsAndGetPendingProfile(context, allocationId); + PhysicalSlotRequestBulk bulk = bulkChecker.getBulk(); + + assertThat(bulk.getPendingRequests(), hasSize(1)); + assertThat(bulk.getPendingRequests(), containsInAnyOrder(pendingSlotResourceProfile)); + assertThat(bulk.getAllocationIdsOfFulfilledRequests(), hasSize(1)); + assertThat(bulk.getAllocationIdsOfFulfilledRequests(), containsInAnyOrder(allocationId)); + } + + @Test + public void testRequestBulkCancel() { + TestingPhysicalSlotRequestBulkChecker bulkChecker = new TestingPhysicalSlotRequestBulkChecker(); + AllocationContext context = 
createBulkCheckerContextWithEv12GroupAndEv3Group(bulkChecker); + + // allocate 2 physical slots for 2 groups + List<SlotExecutionVertexAssignment> assignments1 = context.allocateSlotsFor(EV1, EV3); + fulfilOneOfTwoSlotRequestsAndGetPendingProfile(context, new AllocationID()); + PhysicalSlotRequestBulk bulk1 = bulkChecker.getBulk(); + List<SlotExecutionVertexAssignment> assignments2 = context.allocateSlotsFor(EV2); + // cancelling of (EV1, EV3) releases assignments1 and only one physical slot for EV3 + // the second physical slot is held by sharing EV2 from the next bulk + bulk1.cancel(new Throwable()); + // EV3 needs again a physical slot, therefore there are 3 requests overall + context.allocateSlotsFor(EV1, EV3); + boolean ev1failed = assignments1.get(0).getLogicalSlotFuture().isCompletedExceptionally(); + boolean ev2failed = assignments1.get(0).getLogicalSlotFuture().isCompletedExceptionally(); + + assertThat(context.getSlotProvider().getRequests().values(), hasSize(3)); + // either EV1 or EV3 logical slot future is fulfilled before cancellation + assertThat(ev1failed != ev2failed, is(false)); + assertThat(assignments2.get(0).getLogicalSlotFuture().isCompletedExceptionally(), is(false)); Review comment: maybe also verify that `assignments2.get(0).getLogicalSlotFuture().isDone() == true` ? ########## File path: flink-runtime/src/test/java/org/apache/flink/runtime/jobmaster/slotpool/PhysicalSlotRequestBulkCheckerImplTest.java ########## @@ -0,0 +1,274 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.runtime.jobmaster.slotpool; + +import org.apache.flink.api.common.time.Time; +import org.apache.flink.runtime.clusterframework.types.AllocationID; +import org.apache.flink.runtime.clusterframework.types.ResourceProfile; +import org.apache.flink.runtime.concurrent.ComponentMainThreadExecutor; +import org.apache.flink.runtime.concurrent.ComponentMainThreadExecutorServiceAdapter; +import org.apache.flink.runtime.jobmaster.SlotInfo; +import org.apache.flink.runtime.jobmaster.SlotRequestId; +import org.apache.flink.runtime.jobmaster.slotpool.PhysicalSlotRequestBulkCheckerImpl.PhysicalSlotRequestBulkWithTimestamp; +import org.apache.flink.util.TestLogger; +import org.apache.flink.util.clock.ManualClock; + +import org.junit.AfterClass; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Test; + +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.Executors; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; +import java.util.function.Supplier; + +import static org.apache.flink.runtime.jobmaster.slotpool.PhysicalSlotTestUtils.createPhysicalSlot; +import static org.apache.flink.runtime.jobmaster.slotpool.PhysicalSlotTestUtils.occupyPhysicalSlot; +import static org.hamcrest.Matchers.is; +import static 
org.junit.Assert.assertThat; +import static org.junit.Assert.fail; + +/** + * Tests for {@link PhysicalSlotRequestBulkCheckerImpl}. + */ +public class PhysicalSlotRequestBulkCheckerImplTest extends TestLogger { + + private static final Time TIMEOUT = Time.milliseconds(100L); + + private static ScheduledExecutorService singleThreadScheduledExecutorService; + + private static ComponentMainThreadExecutor mainThreadExecutor; + + private final ManualClock clock = new ManualClock(); + + private PhysicalSlotRequestBulkCheckerImpl bulkChecker; + + private Set<PhysicalSlot> slots; + + private Supplier<Set<SlotInfo>> slotsRetriever; + + @BeforeClass + public static void setupClass() { + singleThreadScheduledExecutorService = Executors.newSingleThreadScheduledExecutor(); + mainThreadExecutor = ComponentMainThreadExecutorServiceAdapter.forSingleThreadExecutor(singleThreadScheduledExecutorService); + } + + @AfterClass + public static void teardownClass() { + if (singleThreadScheduledExecutorService != null) { + singleThreadScheduledExecutorService.shutdownNow(); + } + } + + @Before + public void setup() throws Exception { + slots = new HashSet<>(); + slotsRetriever = () -> new HashSet<>(slots); + bulkChecker = new PhysicalSlotRequestBulkCheckerImpl(slotsRetriever, clock); + bulkChecker.start(mainThreadExecutor); + } + + @Test + public void testPendingBulkIsNotCancelled() throws InterruptedException, ExecutionException { + final PhysicalSlotRequestBulkWithCancellationTracking bulk = new PhysicalSlotRequestBulkWithCancellationTracking(ResourceProfile.ANY); + bulkChecker.schedulePendingRequestBulkTimeoutCheck(bulk, TIMEOUT); + bulk.checkNotCancelledAfter(TIMEOUT.toMilliseconds() + 10); Review comment: I think a small wait time is not very stable. The test would not fail in this case, though; it would just not have executed the fulfillability check yet. I do not have a good idea yet. One workaround is to increase the timeout to several seconds to make it more stable. 
########## File path: flink-runtime/src/test/java/org/apache/flink/runtime/jobmaster/slotpool/PhysicalSlotRequestBulkCheckerImplTest.java ########## @@ -0,0 +1,274 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.runtime.jobmaster.slotpool; + +import org.apache.flink.api.common.time.Time; +import org.apache.flink.runtime.clusterframework.types.AllocationID; +import org.apache.flink.runtime.clusterframework.types.ResourceProfile; +import org.apache.flink.runtime.concurrent.ComponentMainThreadExecutor; +import org.apache.flink.runtime.concurrent.ComponentMainThreadExecutorServiceAdapter; +import org.apache.flink.runtime.jobmaster.SlotInfo; +import org.apache.flink.runtime.jobmaster.SlotRequestId; +import org.apache.flink.runtime.jobmaster.slotpool.PhysicalSlotRequestBulkCheckerImpl.PhysicalSlotRequestBulkWithTimestamp; +import org.apache.flink.util.TestLogger; +import org.apache.flink.util.clock.ManualClock; + +import org.junit.AfterClass; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Test; + +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; +import 
java.util.concurrent.CompletableFuture; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.Executors; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; +import java.util.function.Supplier; + +import static org.apache.flink.runtime.jobmaster.slotpool.PhysicalSlotTestUtils.createPhysicalSlot; +import static org.apache.flink.runtime.jobmaster.slotpool.PhysicalSlotTestUtils.occupyPhysicalSlot; +import static org.hamcrest.Matchers.is; +import static org.junit.Assert.assertThat; +import static org.junit.Assert.fail; + +/** + * Tests for {@link PhysicalSlotRequestBulkCheckerImpl}. + */ +public class PhysicalSlotRequestBulkCheckerImplTest extends TestLogger { + + private static final Time TIMEOUT = Time.milliseconds(100L); + + private static ScheduledExecutorService singleThreadScheduledExecutorService; + + private static ComponentMainThreadExecutor mainThreadExecutor; + + private final ManualClock clock = new ManualClock(); + + private PhysicalSlotRequestBulkCheckerImpl bulkChecker; + + private Set<PhysicalSlot> slots; + + private Supplier<Set<SlotInfo>> slotsRetriever; + + @BeforeClass + public static void setupClass() { + singleThreadScheduledExecutorService = Executors.newSingleThreadScheduledExecutor(); + mainThreadExecutor = ComponentMainThreadExecutorServiceAdapter.forSingleThreadExecutor(singleThreadScheduledExecutorService); + } + + @AfterClass + public static void teardownClass() { + if (singleThreadScheduledExecutorService != null) { + singleThreadScheduledExecutorService.shutdownNow(); + } + } + + @Before + public void setup() throws Exception { + slots = new HashSet<>(); + slotsRetriever = () -> new HashSet<>(slots); + bulkChecker = new PhysicalSlotRequestBulkCheckerImpl(slotsRetriever, clock); + bulkChecker.start(mainThreadExecutor); + } + + @Test + public void testPendingBulkIsNotCancelled() throws InterruptedException, ExecutionException { + 
final PhysicalSlotRequestBulkWithCancellationTracking bulk = new PhysicalSlotRequestBulkWithCancellationTracking(ResourceProfile.ANY); Review comment: I think `ResourceProfile.ANY` is used for slot size only. For resource requirements, we can use `ResourceProfile.UNKNOWN`. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: users@infra.apache.org
