[
https://issues.apache.org/jira/browse/FLINK-22945?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Zhu Zhu updated FLINK-22945:
----------------------------
Description:
The pending requests in ExecutionSlotAllocator are not cleared when a job
transitions to CANCELING or FAILING, while all vertices will be canceled and
assigned slot will be returned. The returned slot is possible to be used to
fulfill the pending request of a CANCELED vertex and the assignment will fail
immediately and the slot will be returned and used to fulfilled another vertex
in a recursive way. StackOverflow can happen in this way when there are many
vertices, and fatal error can happen and lead to JM will crash. A sample call
stack is attached below.
To fix this problem, we should clear the pending requests in
ExecutionSlotAllocator when a job is CANCELING or FAILING. However, I think
it's better to improve the call stack of slot assignment to avoid similar
StackOverflowException to occur.
...
at
org.apache.flink.runtime.scheduler.SharedSlot.returnLogicalSlot(SharedSlot.java:234)
~[flink-dist_2.11-1.13-vvr-4.0-SNAPSHOT.jar:1.13-vvr-4.0-SNAPSHOT]
at
org.apache.flink.runtime.jobmaster.slotpool.SingleLogicalSlot.lambda$returnSlotToOwner$0(SingleLogicalSlot.java:203)
~[flink-dist_2.11-1.13-vvr-4.0-SNAPSHOT.jar:1.13-vvr-4.0-SNAPSHOT]
at
java.util.concurrent.CompletableFuture.uniRun(CompletableFuture.java:705)
~[?:1.8.0_102]
at
java.util.concurrent.CompletableFuture.uniRunStage(CompletableFuture.java:717)
~[?:1.8.0_102]
at
java.util.concurrent.CompletableFuture.thenRun(CompletableFuture.java:2010)
~[?:1.8.0_102]
at
org.apache.flink.runtime.jobmaster.slotpool.SingleLogicalSlot.returnSlotToOwner(SingleLogicalSlot.java:200)
~[flink-dist_2.11-1.13-vvr-4.0-SNAPSHOT.jar:1.13-vvr-4.0-SNAPSHOT]
at
org.apache.flink.runtime.jobmaster.slotpool.SingleLogicalSlot.releaseSlot(SingleLogicalSlot.java:130)
~[flink-dist_2.11-1.13-vvr-4.0-SNAPSHOT.jar:1.13-vvr-4.0-SNAPSHOT]
at
org.apache.flink.runtime.scheduler.DefaultScheduler.releaseSlotIfPresent(DefaultScheduler.java:533)
~[flink-dist_2.11-1.13-vvr-4.0-SNAPSHOT.jar:1.13-vvr-4.0-SNAPSHOT]
at
org.apache.flink.runtime.scheduler.DefaultScheduler.lambda$assignResourceOrHandleError$8(DefaultScheduler.java:512)
~[flink-dist_2.11-1.13-vvr-4.0-SNAPSHOT.jar:1.13-vvr-4.0-SNAPSHOT]
at
java.util.concurrent.CompletableFuture.uniHandle(CompletableFuture.java:822)
~[?:1.8.0_102]
at
java.util.concurrent.CompletableFuture$UniHandle.tryFire(CompletableFuture.java:797)
~[?:1.8.0_102]
at
java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:474)
~[?:1.8.0_102]
at
java.util.concurrent.CompletableFuture.complete(CompletableFuture.java:1962)
~[?:1.8.0_102]
at
org.apache.flink.runtime.jobmaster.slotpool.DeclarativeSlotPoolBridge$PendingRequest.fulfill(DeclarativeSlotPoolBridge.java:552)
~[flink-dist_2.11-1.13-vvr-4.0-SNAPSHOT.jar:1.13-vvr-4.0-SNAPSHOT]
at
org.apache.flink.runtime.jobmaster.slotpool.DeclarativeSlotPoolBridge$PendingRequestSlotMatching.fulfillPendingRequest(DeclarativeSlotPoolBridge.java:587)
~[flink-dist_2.11-1.13-vvr-4.0-SNAPSHOT.jar:1.13-vvr-4.0-SNAPSHOT]
at
org.apache.flink.runtime.jobmaster.slotpool.DeclarativeSlotPoolBridge.newSlotsAreAvailable(DeclarativeSlotPoolBridge.java:171)
~[flink-dist_2.11-1.13-vvr-4.0-SNAPSHOT.jar:1.13-vvr-4.0-SNAPSHOT]
at
org.apache.flink.runtime.jobmaster.slotpool.DefaultDeclarativeSlotPool.lambda$freeReservedSlot$0(DefaultDeclarativeSlotPool.java:316)
~[flink-dist_2.11-1.13-vvr-4.0-SNAPSHOT.jar:1.13-vvr-4.0-SNAPSHOT]
at java.util.Optional.ifPresent(Optional.java:159) ~[?:1.8.0_102]
at
org.apache.flink.runtime.jobmaster.slotpool.DefaultDeclarativeSlotPool.freeReservedSlot(DefaultDeclarativeSlotPool.java:313)
~[flink-dist_2.11-1.13-vvr-4.0-SNAPSHOT.jar:1.13-vvr-4.0-SNAPSHOT]
at
org.apache.flink.runtime.jobmaster.slotpool.DeclarativeSlotPoolBridge.releaseSlot(DeclarativeSlotPoolBridge.java:335)
~[flink-dist_2.11-1.13-vvr-4.0-SNAPSHOT.jar:1.13-vvr-4.0-SNAPSHOT]
at
org.apache.flink.runtime.jobmaster.slotpool.PhysicalSlotProviderImpl.cancelSlotRequest(PhysicalSlotProviderImpl.java:112)
~[flink-dist_2.11-1.13-vvr-4.0-SNAPSHOT.jar:1.13-vvr-4.0-SNAPSHOT]
at
org.apache.flink.runtime.scheduler.SlotSharingExecutionSlotAllocator.releaseSharedSlot(SlotSharingExecutionSlotAllocator.java:242)
~[flink-dist_2.11-1.13-vvr-4.0-SNAPSHOT.jar:1.13-vvr-4.0-SNAPSHOT]
at
org.apache.flink.runtime.scheduler.SharedSlot.releaseExternally(SharedSlot.java:281)
~[flink-dist_2.11-1.13-vvr-4.0-SNAPSHOT.jar:1.13-vvr-4.0-SNAPSHOT]
at
org.apache.flink.runtime.scheduler.SharedSlot.removeLogicalSlotRequest(SharedSlot.java:242)
~[flink-dist_2.11-1.13-vvr-4.0-SNAPSHOT.jar:1.13-vvr-4.0-SNAPSHOT]
at
org.apache.flink.runtime.scheduler.SharedSlot.returnLogicalSlot(SharedSlot.java:234)
~[flink-dist_2.11-1.13-vvr-4.0-SNAPSHOT.jar:1.13-vvr-4.0-SNAPSHOT]
at
org.apache.flink.runtime.jobmaster.slotpool.SingleLogicalSlot.lambda$returnSlotToOwner$0(SingleLogicalSlot.java:203)
~[flink-dist_2.11-1.13-vvr-4.0-SNAPSHOT.jar:1.13-vvr-4.0-SNAPSHOT]
at
java.util.concurrent.CompletableFuture.uniRun(CompletableFuture.java:705)
~[?:1.8.0_102]
at
java.util.concurrent.CompletableFuture.uniRunStage(CompletableFuture.java:717)
~[?:1.8.0_102]
at
java.util.concurrent.CompletableFuture.thenRun(CompletableFuture.java:2010)
~[?:1.8.0_102]
at
org.apache.flink.runtime.jobmaster.slotpool.SingleLogicalSlot.returnSlotToOwner(SingleLogicalSlot.java:200)
~[flink-dist_2.11-1.13-vvr-4.0-SNAPSHOT.jar:1.13-vvr-4.0-SNAPSHOT]
at
org.apache.flink.runtime.jobmaster.slotpool.SingleLogicalSlot.releaseSlot(SingleLogicalSlot.java:130)
~[flink-dist_2.11-1.13-vvr-4.0-SNAPSHOT.jar:1.13-vvr-4.0-SNAPSHOT]
...
was:
The pending requests in ExecutionSlotAllocator are not cleared when a job
transitions to CANCELING or FAILING, while all vertices will be canceled and
assigned slot will be returned. The returned slot is possible to be used to
fulfill the pending request of a CANCELED vertex and the assignment will fail
immediately and the slot will be returned and used to fulfilled another vertex
in a recursive way. StackOverflow can happen in this way when there are many
vertices. A sample call stack is attached below.
To fix this problem, we should clear the pending requests in
ExecutionSlotAllocator when a job is CANCELING or FAILING. However, I think
it's better to improve the call stack of slot assignment to avoid similar
StackOverflowException to occur.
...
at
org.apache.flink.runtime.scheduler.SharedSlot.returnLogicalSlot(SharedSlot.java:234)
~[flink-dist_2.11-1.13-vvr-4.0-SNAPSHOT.jar:1.13-vvr-4.0-SNAPSHOT]
at
org.apache.flink.runtime.jobmaster.slotpool.SingleLogicalSlot.lambda$returnSlotToOwner$0(SingleLogicalSlot.java:203)
~[flink-dist_2.11-1.13-vvr-4.0-SNAPSHOT.jar:1.13-vvr-4.0-SNAPSHOT]
at
java.util.concurrent.CompletableFuture.uniRun(CompletableFuture.java:705)
~[?:1.8.0_102]
at
java.util.concurrent.CompletableFuture.uniRunStage(CompletableFuture.java:717)
~[?:1.8.0_102]
at
java.util.concurrent.CompletableFuture.thenRun(CompletableFuture.java:2010)
~[?:1.8.0_102]
at
org.apache.flink.runtime.jobmaster.slotpool.SingleLogicalSlot.returnSlotToOwner(SingleLogicalSlot.java:200)
~[flink-dist_2.11-1.13-vvr-4.0-SNAPSHOT.jar:1.13-vvr-4.0-SNAPSHOT]
at
org.apache.flink.runtime.jobmaster.slotpool.SingleLogicalSlot.releaseSlot(SingleLogicalSlot.java:130)
~[flink-dist_2.11-1.13-vvr-4.0-SNAPSHOT.jar:1.13-vvr-4.0-SNAPSHOT]
at
org.apache.flink.runtime.scheduler.DefaultScheduler.releaseSlotIfPresent(DefaultScheduler.java:533)
~[flink-dist_2.11-1.13-vvr-4.0-SNAPSHOT.jar:1.13-vvr-4.0-SNAPSHOT]
at
org.apache.flink.runtime.scheduler.DefaultScheduler.lambda$assignResourceOrHandleError$8(DefaultScheduler.java:512)
~[flink-dist_2.11-1.13-vvr-4.0-SNAPSHOT.jar:1.13-vvr-4.0-SNAPSHOT]
at
java.util.concurrent.CompletableFuture.uniHandle(CompletableFuture.java:822)
~[?:1.8.0_102]
at
java.util.concurrent.CompletableFuture$UniHandle.tryFire(CompletableFuture.java:797)
~[?:1.8.0_102]
at
java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:474)
~[?:1.8.0_102]
at
java.util.concurrent.CompletableFuture.complete(CompletableFuture.java:1962)
~[?:1.8.0_102]
at
org.apache.flink.runtime.jobmaster.slotpool.DeclarativeSlotPoolBridge$PendingRequest.fulfill(DeclarativeSlotPoolBridge.java:552)
~[flink-dist_2.11-1.13-vvr-4.0-SNAPSHOT.jar:1.13-vvr-4.0-SNAPSHOT]
at
org.apache.flink.runtime.jobmaster.slotpool.DeclarativeSlotPoolBridge$PendingRequestSlotMatching.fulfillPendingRequest(DeclarativeSlotPoolBridge.java:587)
~[flink-dist_2.11-1.13-vvr-4.0-SNAPSHOT.jar:1.13-vvr-4.0-SNAPSHOT]
at
org.apache.flink.runtime.jobmaster.slotpool.DeclarativeSlotPoolBridge.newSlotsAreAvailable(DeclarativeSlotPoolBridge.java:171)
~[flink-dist_2.11-1.13-vvr-4.0-SNAPSHOT.jar:1.13-vvr-4.0-SNAPSHOT]
at
org.apache.flink.runtime.jobmaster.slotpool.DefaultDeclarativeSlotPool.lambda$freeReservedSlot$0(DefaultDeclarativeSlotPool.java:316)
~[flink-dist_2.11-1.13-vvr-4.0-SNAPSHOT.jar:1.13-vvr-4.0-SNAPSHOT]
at java.util.Optional.ifPresent(Optional.java:159) ~[?:1.8.0_102]
at
org.apache.flink.runtime.jobmaster.slotpool.DefaultDeclarativeSlotPool.freeReservedSlot(DefaultDeclarativeSlotPool.java:313)
~[flink-dist_2.11-1.13-vvr-4.0-SNAPSHOT.jar:1.13-vvr-4.0-SNAPSHOT]
at
org.apache.flink.runtime.jobmaster.slotpool.DeclarativeSlotPoolBridge.releaseSlot(DeclarativeSlotPoolBridge.java:335)
~[flink-dist_2.11-1.13-vvr-4.0-SNAPSHOT.jar:1.13-vvr-4.0-SNAPSHOT]
at
org.apache.flink.runtime.jobmaster.slotpool.PhysicalSlotProviderImpl.cancelSlotRequest(PhysicalSlotProviderImpl.java:112)
~[flink-dist_2.11-1.13-vvr-4.0-SNAPSHOT.jar:1.13-vvr-4.0-SNAPSHOT]
at
org.apache.flink.runtime.scheduler.SlotSharingExecutionSlotAllocator.releaseSharedSlot(SlotSharingExecutionSlotAllocator.java:242)
~[flink-dist_2.11-1.13-vvr-4.0-SNAPSHOT.jar:1.13-vvr-4.0-SNAPSHOT]
at
org.apache.flink.runtime.scheduler.SharedSlot.releaseExternally(SharedSlot.java:281)
~[flink-dist_2.11-1.13-vvr-4.0-SNAPSHOT.jar:1.13-vvr-4.0-SNAPSHOT]
at
org.apache.flink.runtime.scheduler.SharedSlot.removeLogicalSlotRequest(SharedSlot.java:242)
~[flink-dist_2.11-1.13-vvr-4.0-SNAPSHOT.jar:1.13-vvr-4.0-SNAPSHOT]
at
org.apache.flink.runtime.scheduler.SharedSlot.returnLogicalSlot(SharedSlot.java:234)
~[flink-dist_2.11-1.13-vvr-4.0-SNAPSHOT.jar:1.13-vvr-4.0-SNAPSHOT]
at
org.apache.flink.runtime.jobmaster.slotpool.SingleLogicalSlot.lambda$returnSlotToOwner$0(SingleLogicalSlot.java:203)
~[flink-dist_2.11-1.13-vvr-4.0-SNAPSHOT.jar:1.13-vvr-4.0-SNAPSHOT]
at
java.util.concurrent.CompletableFuture.uniRun(CompletableFuture.java:705)
~[?:1.8.0_102]
at
java.util.concurrent.CompletableFuture.uniRunStage(CompletableFuture.java:717)
~[?:1.8.0_102]
at
java.util.concurrent.CompletableFuture.thenRun(CompletableFuture.java:2010)
~[?:1.8.0_102]
at
org.apache.flink.runtime.jobmaster.slotpool.SingleLogicalSlot.returnSlotToOwner(SingleLogicalSlot.java:200)
~[flink-dist_2.11-1.13-vvr-4.0-SNAPSHOT.jar:1.13-vvr-4.0-SNAPSHOT]
at
org.apache.flink.runtime.jobmaster.slotpool.SingleLogicalSlot.releaseSlot(SingleLogicalSlot.java:130)
~[flink-dist_2.11-1.13-vvr-4.0-SNAPSHOT.jar:1.13-vvr-4.0-SNAPSHOT]
...
> StackOverflowException can happen when a large scale job is CANCELED/FAILED
> ---------------------------------------------------------------------------
>
> Key: FLINK-22945
> URL: https://issues.apache.org/jira/browse/FLINK-22945
> Project: Flink
> Issue Type: Bug
> Affects Versions: 1.13.1, 1.12.4
> Reporter: Zhu Zhu
> Priority: Critical
>
> The pending requests in ExecutionSlotAllocator are not cleared when a job
> transitions to CANCELING or FAILING, while all vertices will be canceled and
> assigned slot will be returned. The returned slot is possible to be used to
> fulfill the pending request of a CANCELED vertex and the assignment will fail
> immediately and the slot will be returned and used to fulfilled another
> vertex in a recursive way. StackOverflow can happen in this way when there
> are many vertices, and fatal error can happen and lead to JM will crash. A
> sample call stack is attached below.
> To fix this problem, we should clear the pending requests in
> ExecutionSlotAllocator when a job is CANCELING or FAILING. However, I think
> it's better to improve the call stack of slot assignment to avoid similar
> StackOverflowException to occur.
> ...
> at
> org.apache.flink.runtime.scheduler.SharedSlot.returnLogicalSlot(SharedSlot.java:234)
> ~[flink-dist_2.11-1.13-vvr-4.0-SNAPSHOT.jar:1.13-vvr-4.0-SNAPSHOT]
> at
> org.apache.flink.runtime.jobmaster.slotpool.SingleLogicalSlot.lambda$returnSlotToOwner$0(SingleLogicalSlot.java:203)
> ~[flink-dist_2.11-1.13-vvr-4.0-SNAPSHOT.jar:1.13-vvr-4.0-SNAPSHOT]
> at
> java.util.concurrent.CompletableFuture.uniRun(CompletableFuture.java:705)
> ~[?:1.8.0_102]
> at
> java.util.concurrent.CompletableFuture.uniRunStage(CompletableFuture.java:717)
> ~[?:1.8.0_102]
> at
> java.util.concurrent.CompletableFuture.thenRun(CompletableFuture.java:2010)
> ~[?:1.8.0_102]
> at
> org.apache.flink.runtime.jobmaster.slotpool.SingleLogicalSlot.returnSlotToOwner(SingleLogicalSlot.java:200)
> ~[flink-dist_2.11-1.13-vvr-4.0-SNAPSHOT.jar:1.13-vvr-4.0-SNAPSHOT]
> at
> org.apache.flink.runtime.jobmaster.slotpool.SingleLogicalSlot.releaseSlot(SingleLogicalSlot.java:130)
> ~[flink-dist_2.11-1.13-vvr-4.0-SNAPSHOT.jar:1.13-vvr-4.0-SNAPSHOT]
> at
> org.apache.flink.runtime.scheduler.DefaultScheduler.releaseSlotIfPresent(DefaultScheduler.java:533)
> ~[flink-dist_2.11-1.13-vvr-4.0-SNAPSHOT.jar:1.13-vvr-4.0-SNAPSHOT]
> at
> org.apache.flink.runtime.scheduler.DefaultScheduler.lambda$assignResourceOrHandleError$8(DefaultScheduler.java:512)
> ~[flink-dist_2.11-1.13-vvr-4.0-SNAPSHOT.jar:1.13-vvr-4.0-SNAPSHOT]
> at
> java.util.concurrent.CompletableFuture.uniHandle(CompletableFuture.java:822)
> ~[?:1.8.0_102]
> at
> java.util.concurrent.CompletableFuture$UniHandle.tryFire(CompletableFuture.java:797)
> ~[?:1.8.0_102]
> at
> java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:474)
> ~[?:1.8.0_102]
> at
> java.util.concurrent.CompletableFuture.complete(CompletableFuture.java:1962)
> ~[?:1.8.0_102]
> at
> org.apache.flink.runtime.jobmaster.slotpool.DeclarativeSlotPoolBridge$PendingRequest.fulfill(DeclarativeSlotPoolBridge.java:552)
> ~[flink-dist_2.11-1.13-vvr-4.0-SNAPSHOT.jar:1.13-vvr-4.0-SNAPSHOT]
> at
> org.apache.flink.runtime.jobmaster.slotpool.DeclarativeSlotPoolBridge$PendingRequestSlotMatching.fulfillPendingRequest(DeclarativeSlotPoolBridge.java:587)
> ~[flink-dist_2.11-1.13-vvr-4.0-SNAPSHOT.jar:1.13-vvr-4.0-SNAPSHOT]
> at
> org.apache.flink.runtime.jobmaster.slotpool.DeclarativeSlotPoolBridge.newSlotsAreAvailable(DeclarativeSlotPoolBridge.java:171)
> ~[flink-dist_2.11-1.13-vvr-4.0-SNAPSHOT.jar:1.13-vvr-4.0-SNAPSHOT]
> at
> org.apache.flink.runtime.jobmaster.slotpool.DefaultDeclarativeSlotPool.lambda$freeReservedSlot$0(DefaultDeclarativeSlotPool.java:316)
> ~[flink-dist_2.11-1.13-vvr-4.0-SNAPSHOT.jar:1.13-vvr-4.0-SNAPSHOT]
> at java.util.Optional.ifPresent(Optional.java:159) ~[?:1.8.0_102]
> at
> org.apache.flink.runtime.jobmaster.slotpool.DefaultDeclarativeSlotPool.freeReservedSlot(DefaultDeclarativeSlotPool.java:313)
> ~[flink-dist_2.11-1.13-vvr-4.0-SNAPSHOT.jar:1.13-vvr-4.0-SNAPSHOT]
> at
> org.apache.flink.runtime.jobmaster.slotpool.DeclarativeSlotPoolBridge.releaseSlot(DeclarativeSlotPoolBridge.java:335)
> ~[flink-dist_2.11-1.13-vvr-4.0-SNAPSHOT.jar:1.13-vvr-4.0-SNAPSHOT]
> at
> org.apache.flink.runtime.jobmaster.slotpool.PhysicalSlotProviderImpl.cancelSlotRequest(PhysicalSlotProviderImpl.java:112)
> ~[flink-dist_2.11-1.13-vvr-4.0-SNAPSHOT.jar:1.13-vvr-4.0-SNAPSHOT]
> at
> org.apache.flink.runtime.scheduler.SlotSharingExecutionSlotAllocator.releaseSharedSlot(SlotSharingExecutionSlotAllocator.java:242)
> ~[flink-dist_2.11-1.13-vvr-4.0-SNAPSHOT.jar:1.13-vvr-4.0-SNAPSHOT]
> at
> org.apache.flink.runtime.scheduler.SharedSlot.releaseExternally(SharedSlot.java:281)
> ~[flink-dist_2.11-1.13-vvr-4.0-SNAPSHOT.jar:1.13-vvr-4.0-SNAPSHOT]
> at
> org.apache.flink.runtime.scheduler.SharedSlot.removeLogicalSlotRequest(SharedSlot.java:242)
> ~[flink-dist_2.11-1.13-vvr-4.0-SNAPSHOT.jar:1.13-vvr-4.0-SNAPSHOT]
> at
> org.apache.flink.runtime.scheduler.SharedSlot.returnLogicalSlot(SharedSlot.java:234)
> ~[flink-dist_2.11-1.13-vvr-4.0-SNAPSHOT.jar:1.13-vvr-4.0-SNAPSHOT]
> at
> org.apache.flink.runtime.jobmaster.slotpool.SingleLogicalSlot.lambda$returnSlotToOwner$0(SingleLogicalSlot.java:203)
> ~[flink-dist_2.11-1.13-vvr-4.0-SNAPSHOT.jar:1.13-vvr-4.0-SNAPSHOT]
> at
> java.util.concurrent.CompletableFuture.uniRun(CompletableFuture.java:705)
> ~[?:1.8.0_102]
> at
> java.util.concurrent.CompletableFuture.uniRunStage(CompletableFuture.java:717)
> ~[?:1.8.0_102]
> at
> java.util.concurrent.CompletableFuture.thenRun(CompletableFuture.java:2010)
> ~[?:1.8.0_102]
> at
> org.apache.flink.runtime.jobmaster.slotpool.SingleLogicalSlot.returnSlotToOwner(SingleLogicalSlot.java:200)
> ~[flink-dist_2.11-1.13-vvr-4.0-SNAPSHOT.jar:1.13-vvr-4.0-SNAPSHOT]
> at
> org.apache.flink.runtime.jobmaster.slotpool.SingleLogicalSlot.releaseSlot(SingleLogicalSlot.java:130)
> ~[flink-dist_2.11-1.13-vvr-4.0-SNAPSHOT.jar:1.13-vvr-4.0-SNAPSHOT]
> ...
--
This message was sent by Atlassian Jira
(v8.3.4#803005)