[ 
https://issues.apache.org/jira/browse/FLINK-20547?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17255707#comment-17255707
 ] 

Roman Khachatryan commented on FLINK-20547:
-------------------------------------------

Debugging an unrelated issue FLINK-20654, I see there is probably a memory leak 
when running UnalignedCheckpointITCase multiple times:
{code:java}
122805 [ForkJoinPool.commonPool-worker-5] WARN  
org.apache.flink.runtime.minicluster.MiniClusterJobClient [] - Shutdown of 
MiniCluster failed.
org.apache.flink.util.FlinkException: Error while shutting the TaskExecutor 
down.
        at 
org.apache.flink.runtime.taskexecutor.TaskExecutor.handleOnStopException(TaskExecutor.java:461)
 ~[classes/:?]
        at 
org.apache.flink.runtime.taskexecutor.TaskExecutor.lambda$onStop$2(TaskExecutor.java:443)
 ~[classes/:?]
        at 
java.util.concurrent.CompletableFuture.uniHandle(CompletableFuture.java:836) 
~[?:1.8.0_271]
        at 
java.util.concurrent.CompletableFuture$UniHandle.tryFire(CompletableFuture.java:811)
 ~[?:1.8.0_271]
        at 
java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:488) 
~[?:1.8.0_271]
        at 
java.util.concurrent.CompletableFuture.completeExceptionally(CompletableFuture.java:1990)
 ~[?:1.8.0_271]
        at 
org.apache.flink.runtime.concurrent.FutureUtils.lambda$runAfterwardsAsync$17(FutureUtils.java:678)
 ~[classes/:?]
        at 
java.util.concurrent.CompletableFuture.uniWhenComplete(CompletableFuture.java:774)
 ~[?:1.8.0_271]
        at 
java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(CompletableFuture.java:750)
 ~[?:1.8.0_271]
        at 
java.util.concurrent.CompletableFuture$Completion.run(CompletableFuture.java:456)
 ~[?:1.8.0_271]
        at 
org.apache.flink.runtime.concurrent.DirectExecutorService.execute(DirectExecutorService.java:217)
 ~[classes/:?]
        at 
java.util.concurrent.CompletableFuture$UniCompletion.claim(CompletableFuture.java:543)
 ~[?:1.8.0_271]
        at 
java.util.concurrent.CompletableFuture.uniWhenComplete(CompletableFuture.java:765)
 ~[?:1.8.0_271]
        at 
java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(CompletableFuture.java:750)
 ~[?:1.8.0_271]
        at 
java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:488) 
~[?:1.8.0_271]
        at 
java.util.concurrent.CompletableFuture.complete(CompletableFuture.java:1975) 
~[?:1.8.0_271]
        at 
org.apache.flink.runtime.concurrent.FutureUtils.lambda$forwardTo$22(FutureUtils.java:1323)
 ~[classes/:?]
        at 
java.util.concurrent.CompletableFuture.uniWhenComplete(CompletableFuture.java:774)
 ~[?:1.8.0_271]
        at 
java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(CompletableFuture.java:750)
 ~[?:1.8.0_271]
        at 
java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:488) 
~[?:1.8.0_271]
        at 
java.util.concurrent.CompletableFuture.postFire(CompletableFuture.java:575) 
~[?:1.8.0_271]
        at 
java.util.concurrent.CompletableFuture$UniRun.tryFire(CompletableFuture.java:704)
 ~[?:1.8.0_271]
        at 
java.util.concurrent.CompletableFuture$Completion.run(CompletableFuture.java:456)
 ~[?:1.8.0_271]
        at 
org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRunAsync(AkkaRpcActor.java:442)
 ~[classes/:?]
        at 
org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRpcMessage(AkkaRpcActor.java:209)
 ~[classes/:?]
        at 
org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleMessage(AkkaRpcActor.java:159)
 ~[classes/:?]
        at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:26) 
~[akka-actor_2.11-2.5.21.jar:2.5.21]
        at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:21) 
~[akka-actor_2.11-2.5.21.jar:2.5.21]
        at scala.PartialFunction$class.applyOrElse(PartialFunction.scala:123) 
~[scala-library-2.11.12.jar:?]
        at akka.japi.pf.UnitCaseStatement.applyOrElse(CaseStatements.scala:21) 
~[akka-actor_2.11-2.5.21.jar:2.5.21]
        at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:170) 
~[scala-library-2.11.12.jar:?]
        at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:171) 
~[scala-library-2.11.12.jar:?]
        at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:171) 
~[scala-library-2.11.12.jar:?]
        at akka.actor.Actor$class.aroundReceive(Actor.scala:517) 
~[akka-actor_2.11-2.5.21.jar:2.5.21]
        at akka.actor.AbstractActor.aroundReceive(AbstractActor.scala:225) 
~[akka-actor_2.11-2.5.21.jar:2.5.21]
        at akka.actor.ActorCell.receiveMessage(ActorCell.scala:592) 
~[akka-actor_2.11-2.5.21.jar:2.5.21]
        at akka.actor.ActorCell.invoke(ActorCell.scala:561) 
~[akka-actor_2.11-2.5.21.jar:2.5.21]
        at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:258) 
~[akka-actor_2.11-2.5.21.jar:2.5.21]
        at akka.dispatch.Mailbox.run(Mailbox.scala:225) 
~[akka-actor_2.11-2.5.21.jar:2.5.21]
        at akka.dispatch.Mailbox.exec(Mailbox.scala:235) 
~[akka-actor_2.11-2.5.21.jar:2.5.21]
        at akka.dispatch.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260) 
~[akka-actor_2.11-2.5.21.jar:2.5.21]
        at 
akka.dispatch.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339) 
~[akka-actor_2.11-2.5.21.jar:2.5.21]
        at 
akka.dispatch.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979) 
~[akka-actor_2.11-2.5.21.jar:2.5.21]
        at 
akka.dispatch.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107) 
~[akka-actor_2.11-2.5.21.jar:2.5.21]
Caused by: org.apache.flink.util.FlinkException: Could not properly shut down 
the TaskManager services.
        at 
org.apache.flink.runtime.taskexecutor.TaskManagerServices.shutDown(TaskManagerServices.java:235)
 ~[classes/:?]
        at 
org.apache.flink.runtime.taskexecutor.TaskExecutor.stopTaskExecutorServices(TaskExecutor.java:484)
 ~[classes/:?]
        at 
org.apache.flink.runtime.concurrent.FutureUtils.lambda$runAfterwardsAsync$17(FutureUtils.java:672)
 ~[classes/:?]
        ... 37 more
Caused by: java.lang.IllegalStateException: NetworkBufferPool is not empty 
after destroying all LocalBufferPools
        at 
org.apache.flink.runtime.io.network.buffer.NetworkBufferPool.destroyAllBufferPools(NetworkBufferPool.java:431)
 ~[classes/:?]
        at 
org.apache.flink.runtime.io.network.NettyShuffleEnvironment.close(NettyShuffleEnvironment.java:346)
 ~[classes/:?]
        at 
org.apache.flink.runtime.taskexecutor.TaskManagerServices.shutDown(TaskManagerServices.java:191)
 ~[classes/:?]
        at 
org.apache.flink.runtime.taskexecutor.TaskExecutor.stopTaskExecutorServices(TaskExecutor.java:484)
 ~[classes/:?]
        at 
org.apache.flink.runtime.concurrent.FutureUtils.lambda$runAfterwardsAsync$17(FutureUtils.java:672)
 ~[classes/:?]
        ... 37 mor
{code}

> Batch job fails due to the exception in network stack
> -----------------------------------------------------
>
>                 Key: FLINK-20547
>                 URL: https://issues.apache.org/jira/browse/FLINK-20547
>             Project: Flink
>          Issue Type: Bug
>          Components: Runtime / Network
>    Affects Versions: 1.13.0
>            Reporter: Zhilong Hong
>            Assignee: Yingjie Cao
>            Priority: Major
>         Attachments: inconsistent.tar.gz
>
>
> I run a simple batch job with only two job vertices: a source and a sink.
> The parallelisms of them are both 8000. They are connected via all-to-all 
> blocking edges.
> During the running of sink tasks, an exception raises:
> {code:java}
> 2020-12-09 18:43:48,981 INFO  
> org.apache.flink.runtime.executiongraph.ExecutionGraph       [] - Sink: Sink 
> 1 (1595/8000) (08bd4214d6e0dc144e9654f1faaa3b28) switched from RUNNING to 
> FAILED on [masked container name] @ [masked address] (dataPort=47872).
> java.io.IOException: java.lang.IllegalStateException: Inconsistent 
> availability: expected true
>       at 
> org.apache.flink.runtime.io.network.partition.consumer.InputChannel.checkError(InputChannel.java:232)
>  ~[flink-dist_2.11-1.13-SNAPSHOT.jar:1.13-SNAPSHOT]
>       at 
> org.apache.flink.runtime.io.network.partition.consumer.RecoveredInputChannel.getNextBuffer(RecoveredInputChannel.java:165)
>  ~[flink-dist_2.11-1.13-SNAPSHOT.jar:1.13-SNAPSHOT]
>       at 
> org.apache.flink.runtime.io.network.partition.consumer.SingleInputGate.waitAndGetNextData(SingleInputGate.java:626)
>  ~[flink-dist_2.11-1.13-SNAPSHOT.jar:1.13-SNAPSHOT]
>       at 
> org.apache.flink.runtime.io.network.partition.consumer.SingleInputGate.getNextBufferOrEvent(SingleInputGate.java:603)
>  ~[flink-dist_2.11-1.13-SNAPSHOT.jar:1.13-SNAPSHOT]
>       at 
> org.apache.flink.runtime.io.network.partition.consumer.SingleInputGate.pollNext(SingleInputGate.java:591)
>  ~[flink-dist_2.11-1.13-SNAPSHOT.jar:1.13-SNAPSHOT]
>       at 
> org.apache.flink.runtime.taskmanager.InputGateWithMetrics.pollNext(InputGateWithMetrics.java:109)
>  ~[flink-dist_2.11-1.13-SNAPSHOT.jar:1.13-SNAPSHOT]
>       at 
> org.apache.flink.streaming.runtime.io.CheckpointedInputGate.pollNext(CheckpointedInputGate.java:142)
>  ~[flink-dist_2.11-1.13-SNAPSHOT.jar:1.13-SNAPSHOT]
>       at 
> org.apache.flink.streaming.runtime.io.StreamTaskNetworkInput.emitNext(StreamTaskNetworkInput.java:157)
>  ~[flink-dist_2.11-1.13-SNAPSHOT.jar:1.13-SNAPSHOT]
>       at 
> org.apache.flink.streaming.runtime.io.StreamOneInputProcessor.processInput(StreamOneInputProcessor.java:67)
>  ~[flink-dist_2.11-1.13-SNAPSHOT.jar:1.13-SNAPSHOT]
>       at 
> org.apache.flink.streaming.runtime.tasks.StreamTask.processInput(StreamTask.java:372)
>  ~[flink-dist_2.11-1.13-SNAPSHOT.jar:1.13-SNAPSHOT]
>       at 
> org.apache.flink.streaming.runtime.tasks.mailbox.MailboxProcessor.runMailboxLoop(MailboxProcessor.java:186)
>  ~[flink-dist_2.11-1.13-SNAPSHOT.jar:1.13-SNAPSHOT]
>       at 
> org.apache.flink.streaming.runtime.tasks.StreamTask.runMailboxLoop(StreamTask.java:575)
>  ~[flink-dist_2.11-1.13-SNAPSHOT.jar:1.13-SNAPSHOT]
>       at 
> org.apache.flink.streaming.runtime.tasks.StreamTask.invoke(StreamTask.java:539)
>  ~[flink-dist_2.11-1.13-SNAPSHOT.jar:1.13-SNAPSHOT]
>       at org.apache.flink.runtime.taskmanager.Task.doRun(Task.java:722) 
> ~[flink-dist_2.11-1.13-SNAPSHOT.jar:1.13-SNAPSHOT]
>       at org.apache.flink.runtime.taskmanager.Task.run(Task.java:547) 
> ~[flink-dist_2.11-1.13-SNAPSHOT.jar:1.13-SNAPSHOT]
>       at java.lang.Thread.run(Thread.java:834) ~[?:1.8.0_102]
> Caused by: java.lang.IllegalStateException: Inconsistent availability: 
> expected true
>       at 
> org.apache.flink.util.Preconditions.checkState(Preconditions.java:198) 
> ~[flink-dist_2.11-1.13-SNAPSHOT.jar:1.13-SNAPSHOT]
>       at 
> org.apache.flink.runtime.io.network.buffer.LocalBufferPool.checkConsistentAvailability(LocalBufferPool.java:434)
>  ~[flink-dist_2.11-1.13-SNAPSHOT.jar:1.13-SNAPSHOT]
>       at 
> org.apache.flink.runtime.io.network.buffer.LocalBufferPool.setNumBuffers(LocalBufferPool.java:564)
>  ~[flink-dist_2.11-1.13-SNAPSHOT.jar:1.13-SNAPSHOT]
>       at 
> org.apache.flink.runtime.io.network.buffer.NetworkBufferPool.redistributeBuffers(NetworkBufferPool.java:509)
>  ~[flink-dist_2.11-1.13-SNAPSHOT.jar:1.13-SNAPSHOT]
>       at 
> org.apache.flink.runtime.io.network.buffer.NetworkBufferPool.tryRedistributeBuffers(NetworkBufferPool.java:438)
>  ~[flink-dist_2.11-1.13-SNAPSHOT.jar:1.13-SNAPSHOT]
>       at 
> org.apache.flink.runtime.io.network.buffer.NetworkBufferPool.requestMemorySegments(NetworkBufferPool.java:166)
>  ~[flink-dist_2.11-1.13-SNAPSHOT.jar:1.13-SNAPSHOT]
>       at 
> org.apache.flink.runtime.io.network.buffer.NetworkBufferPool.requestMemorySegments(NetworkBufferPool.java:60)
>  ~[flink-dist_2.11-1.13-SNAPSHOT.jar:1.13-SNAPSHOT]
>       at 
> org.apache.flink.runtime.io.network.partition.consumer.BufferManager.requestExclusiveBuffers(BufferManager.java:131)
>  ~[flink-dist_2.11-1.13-SNAPSHOT.jar:1.13-SNAPSHOT]
>       at 
> org.apache.flink.runtime.io.network.partition.consumer.RemoteInputChannel.setup(RemoteInputChannel.java:148)
>  ~[flink-dist_2.11-1.13-SNAPSHOT.jar:1.13-SNAPSHOT]
>       at 
> org.apache.flink.runtime.io.network.partition.consumer.RemoteRecoveredInputChannel.toInputChannelInternal(RemoteRecoveredInputChannel.java:76)
>  ~[flink-dist_2.11-1.13-SNAPSHOT.jar:1.13-SNAPSHOT]
>       at 
> org.apache.flink.runtime.io.network.partition.consumer.RecoveredInputChannel.toInputChannel(RecoveredInputChannel.java:91)
>  ~[flink-dist_2.11-1.13-SNAPSHOT.jar:1.13-SNAPSHOT]
>       at 
> org.apache.flink.runtime.io.network.partition.consumer.SingleInputGate.convertRecoveredInputChannels(SingleInputGate.java:299)
>  ~[flink-dist_2.11-1.13-SNAPSHOT.jar:1.13-SNAPSHOT]
>       at 
> org.apache.flink.runtime.io.network.partition.consumer.SingleInputGate.requestPartitions(SingleInputGate.java:285)
>  ~[flink-dist_2.11-1.13-SNAPSHOT.jar:1.13-SNAPSHOT]
>       at 
> org.apache.flink.runtime.taskmanager.InputGateWithMetrics.requestPartitions(InputGateWithMetrics.java:94)
>  ~[flink-dist_2.11-1.13-SNAPSHOT.jar:1.13-SNAPSHOT]
>       at 
> org.apache.flink.streaming.runtime.tasks.StreamTaskActionExecutor$1.runThrowing(StreamTaskActionExecutor.java:47)
>  ~[flink-dist_2.11-1.13-SNAPSHOT.jar:1.13-SNAPSHOT]
>       at 
> org.apache.flink.streaming.runtime.tasks.mailbox.Mail.run(Mail.java:78) 
> ~[flink-dist_2.11-1.13-SNAPSHOT.jar:1.13-SNAPSHOT]
>       at 
> org.apache.flink.streaming.runtime.tasks.mailbox.MailboxProcessor.processMail(MailboxProcessor.java:283)
>  ~[flink-dist_2.11-1.13-SNAPSHOT.jar:1.13-SNAPSHOT]
>       at 
> org.apache.flink.streaming.runtime.tasks.mailbox.MailboxProcessor.runMailboxLoop(MailboxProcessor.java:184)
>  ~[flink-dist_2.11-1.13-SNAPSHOT.jar:1.13-SNAPSHOT]
>       ... 5 more
> {code}
>  It seems to be an exception in network stack.
> The full log of the job is attached below.
>  



--
This message was sent by Atlassian Jira
(v8.3.4#803005)

Reply via email to