从报错来看,是 TM 向 JM 注册超时了。你使用的是 HA 部署还是非 HA 部署?
如果是 HA 的话,TM 是直接使用 JM 的 Pod IP 进行通信的,这个时候需要登录 Pod 确认一下网络是否是通的。如果是非 HA 的话,TM 是使用 Service 来向 JM 注册的,你需要检查一下 K8s 的 kube-proxy 是否正常。另外,是所有 TM 都注册不上来,还是只有个别的?这一点也可以帮助确认或排除网络问题。 Best, Yang yanzhibo <[email protected]> 于2020年9月16日周三 下午5:25写道: > 一个job manager pod 提交job后,申请taskmanager失败 > > > Taskmanager 的异常 > > Fatal error occurred in TaskExecutor akka.tcp:// > [email protected]:6122/user/rpc/taskmanager_0. > org.apache.flink.runtime.taskexecutor.exceptions.RegistrationTimeoutException: > Could not register at the ResourceManager within the specified maximum > registration duration 300000 ms. This indicates a problem with this > instance. Terminating now. > at > org.apache.flink.runtime.taskexecutor.TaskExecutor.registrationTimeout(TaskExecutor.java:1251) > ~[flink-dist_2.12-1.11.1.jar:1.11.1] > at > org.apache.flink.runtime.taskexecutor.TaskExecutor.lambda$startRegistrationTimeout$18(TaskExecutor.java:1237) > ~[flink-dist_2.12-1.11.1.jar:1.11.1] > at > org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRunAsync(AkkaRpcActor.java:402) > ~[flink-dist_2.12-1.11.1.jar:1.11.1] > at > org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRpcMessage(AkkaRpcActor.java:195) > ~[flink-dist_2.12-1.11.1.jar:1.11.1] > at > org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleMessage(AkkaRpcActor.java:152) > ~[flink-dist_2.12-1.11.1.jar:1.11.1] > at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:26) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:21) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at scala.PartialFunction.applyOrElse(PartialFunction.scala:123) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at scala.PartialFunction.applyOrElse$(PartialFunction.scala:122) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at akka.japi.pf.UnitCaseStatement.applyOrElse(CaseStatements.scala:21) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at > scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:171) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at > 
scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:172) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at > scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:172) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at akka.actor.Actor.aroundReceive(Actor.scala:517) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at akka.actor.Actor.aroundReceive$(Actor.scala:515) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at akka.actor.AbstractActor.aroundReceive(AbstractActor.scala:225) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at akka.actor.ActorCell.receiveMessage(ActorCell.scala:592) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at akka.actor.ActorCell.invoke(ActorCell.scala:561) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:258) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at akka.dispatch.Mailbox.run(Mailbox.scala:225) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at akka.dispatch.Mailbox.exec(Mailbox.scala:235) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at > akka.dispatch.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at > akka.dispatch.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at > akka.dispatch.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at > akka.dispatch.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107) > [flink-dist_2.12-1.11.1.jar:1.11.1] > 2020-09-16 09:14:39,077 ERROR > org.apache.flink.runtime.taskexecutor.TaskManagerRunner [] - Fatal > error occurred while executing the TaskManager. Shutting it down... > org.apache.flink.runtime.taskexecutor.exceptions.RegistrationTimeoutException: > Could not register at the ResourceManager within the specified maximum > registration duration 300000 ms. This indicates a problem with this > instance. Terminating now. 
> at > org.apache.flink.runtime.taskexecutor.TaskExecutor.registrationTimeout(TaskExecutor.java:1251) > ~[flink-dist_2.12-1.11.1.jar:1.11.1] > at > org.apache.flink.runtime.taskexecutor.TaskExecutor.lambda$startRegistrationTimeout$18(TaskExecutor.java:1237) > ~[flink-dist_2.12-1.11.1.jar:1.11.1] > at > org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRunAsync(AkkaRpcActor.java:402) > ~ > > > Jobmanager 异常 > > 0d5f8478a2ab4e17d816810752f669eb) switched from SCHEDULED to FAILED on not > deployed. > org.apache.flink.runtime.jobmanager.scheduler.NoResourceAvailableException: > Could not allocate the required slot within slot request timeout. Please > make sure that the cluster has enough resources. > at > org.apache.flink.runtime.scheduler.DefaultScheduler.maybeWrapWithNoResourceAvailableException(DefaultScheduler.java:441) > ~[flink-dist_2.12-1.11.1.jar:1.11.1] > at > org.apache.flink.runtime.scheduler.DefaultScheduler.lambda$assignResourceOrHandleError$6(DefaultScheduler.java:422) > ~[flink-dist_2.12-1.11.1.jar:1.11.1] > at > java.util.concurrent.CompletableFuture.uniHandle(CompletableFuture.java:836) > ~[?:1.8.0_265] > at > java.util.concurrent.CompletableFuture$UniHandle.tryFire(CompletableFuture.java:811) > ~[?:1.8.0_265] > at > java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:488) > ~[?:1.8.0_265] > at > java.util.concurrent.CompletableFuture.completeExceptionally(CompletableFuture.java:1990) > ~[?:1.8.0_265] > at > org.apache.flink.runtime.jobmaster.slotpool.SchedulerImpl.lambda$internalAllocateSlot$0(SchedulerImpl.java:168) > ~[flink-dist_2.12-1.11.1.jar:1.11.1] > at > java.util.concurrent.CompletableFuture.uniWhenComplete(CompletableFuture.java:774) > ~[?:1.8.0_265] > at > java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(CompletableFuture.java:750) > ~[?:1.8.0_265] > at > java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:488) > ~[?:1.8.0_265] > at > 
java.util.concurrent.CompletableFuture.completeExceptionally(CompletableFuture.java:1990) > ~[?:1.8.0_265] > at > org.apache.flink.runtime.jobmaster.slotpool.SlotSharingManager$SingleTaskSlot.release(SlotSharingManager.java:726) > ~[flink-dist_2.12-1.11.1.jar:1.11.1] > at > org.apache.flink.runtime.jobmaster.slotpool.SlotSharingManager$MultiTaskSlot.release(SlotSharingManager.java:537) > ~[flink-dist_2.12-1.11.1.jar:1.11.1] > at > org.apache.flink.runtime.jobmaster.slotpool.SlotSharingManager$MultiTaskSlot.lambda$new$0(SlotSharingManager.java:432) > ~[flink-dist_2.12-1.11.1.jar:1.11.1] > at > java.util.concurrent.CompletableFuture.uniHandle(CompletableFuture.java:836) > ~[?:1.8.0_265] > at > java.util.concurrent.CompletableFuture$UniHandle.tryFire(CompletableFuture.java:811) > ~[?:1.8.0_265] > at > java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:488) > ~[?:1.8.0_265] > at > java.util.concurrent.CompletableFuture.completeExceptionally(CompletableFuture.java:1990) > ~[?:1.8.0_265] > at > org.apache.flink.runtime.concurrent.FutureUtils.lambda$forwardTo$21(FutureUtils.java:1120) > ~[flink-dist_2.12-1.11.1.jar:1.11.1] > at > java.util.concurrent.CompletableFuture.uniWhenComplete(CompletableFuture.java:774) > ~[?:1.8.0_265] > at > java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(CompletableFuture.java:750) > ~[?:1.8.0_265] > at > java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:488) > ~[?:1.8.0_265] > at > java.util.concurrent.CompletableFuture.completeExceptionally(CompletableFuture.java:1990) > ~[?:1.8.0_265] > at > org.apache.flink.runtime.concurrent.FutureUtils$Timeout.run(FutureUtils.java:1036) > ~[flink-dist_2.12-1.11.1.jar:1.11.1] > at > org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRunAsync(AkkaRpcActor.java:402) > ~[flink-dist_2.12-1.11.1.jar:1.11.1] > at > org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRpcMessage(AkkaRpcActor.java:195) > ~[flink-dist_2.12-1.11.1.jar:1.11.1] > at > 
org.apache.flink.runtime.rpc.akka.FencedAkkaRpcActor.handleRpcMessage(FencedAkkaRpcActor.java:74) > ~[flink-dist_2.12-1.11.1.jar:1.11.1] > at > org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleMessage(AkkaRpcActor.java:152) > ~[flink-dist_2.12-1.11.1.jar:1.11.1] > at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:26) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:21) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at scala.PartialFunction.applyOrElse(PartialFunction.scala:123) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at scala.PartialFunction.applyOrElse$(PartialFunction.scala:122) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at akka.japi.pf.UnitCaseStatement.applyOrElse(CaseStatements.scala:21) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:171) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:172) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:172) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at akka.actor.Actor.aroundReceive(Actor.scala:517) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at akka.actor.Actor.aroundReceive$(Actor.scala:515) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at akka.actor.AbstractActor.aroundReceive(AbstractActor.scala:225) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at akka.actor.ActorCell.receiveMessage(ActorCell.scala:592) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at akka.actor.ActorCell.invoke(ActorCell.scala:561) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:258) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at akka.dispatch.Mailbox.run(Mailbox.scala:225) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at akka.dispatch.Mailbox.exec(Mailbox.scala:235) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at akka.dispatch.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260) > [flink-dist_2.12-1.11.1.jar:1.11.1] > 
at > akka.dispatch.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at > akka.dispatch.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979) > [flink-dist_2.12-1.11.1.jar:1.11.1] > at > akka.dispatch.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107) > [flink-dist_2.12-1.11.1.jar:1.11.1] > Caused by: java.util.concurrent.CompletionException: > java.util.concurrent.TimeoutException > at > java.util.concurrent.CompletableFuture.encodeThrowable(CompletableFuture.java:292) > ~[?:1.8.0_265] > at > java.util.concurrent.CompletableFuture.completeThrowable(CompletableFuture.java:308) > ~[?:1.8.0_265] > at > java.util.concurrent.CompletableFuture.uniApply(CompletableFuture.java:607) > ~[?:1.8.0_265] > at > java.util.concurrent.CompletableFuture$UniApply.tryFire(CompletableFuture.java:591) > ~[?:1.8.0_265] > ... 27 more > Caused by: java.util.concurrent.TimeoutException > ... 25 more > 2020-09-16 09:14:35,603 INFO > org.apache.flink.runtime.executiongraph.failover.flip1.RestartPipelinedRegionFailoverStrategy > [] - Calculating tasks to restart to recover the failed task > cbc357ccb763df2852fee8c4fc7d55f2_0. > 2020-09-16 09:14:35,604 INFO > org.apache.flink.runtime.executiongraph.failover.flip1.RestartPipelinedRegionFailoverStrategy > [] - 7 tasks should be restarted to recover the failed task > cbc357ccb763df2852fee8c4fc7d55f2_0. > 2020-09-16 09:14:35,606 INFO > org.apache.flink.runtime.executiongraph.ExecutionGraph [] - Job > Dragonsnake Indicator Accumulator Job (c60f4f384f904467e4c54416d634852f) > switched from state RUNNING to FAILING. > org.apache.flink.runtime.JobException: Recovery is suppressed by > NoRestartBackoffTimeStrategy > at > org.apache.flink.runtime.executiongraph.failover.flip1.ExecutionFailureHandler.handleFailure(ExecutionFailureHandler.java:116) > ~[flink-dist_2.12-1.11.1.jar:1.11.1] > at org.apache.flink.runtime.executiongraph.failover.flip1. > > >
