[
https://issues.apache.org/jira/browse/FLINK-31064?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Tobias Hofer updated FLINK-31064:
---------------------------------
Description:
The JobManager enters a corrupt state after a restart (e.g. one triggered by increasing restartNonce).
{code:java}
WARN | flink-akka.actor.default-dispatcher-5 | RpcGatewayRetriever
| Error while retrieving the leader gateway. Retrying to connect to
akka.tcp://flink@my-session.flink:6123/user/rpc/resourcemanager_*.
org.apache.flink.util.concurrent.FutureUtils$RetryException: Could not complete
the operation. Number of retries has been exhausted.
at
org.apache.flink.util.concurrent.FutureUtils.lambda$retryOperationWithDelay$6(FutureUtils.java:293)
at java.base/java.util.concurrent.CompletableFuture.uniWhenComplete(Unknown
Source)
at
java.base/java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(Unknown
Source)
at java.base/java.util.concurrent.CompletableFuture.postComplete(Unknown
Source)
at
java.base/java.util.concurrent.CompletableFuture.completeExceptionally(Unknown
Source)
at
org.apache.flink.util.concurrent.FutureUtils.doForward(FutureUtils.java:1275)
at
org.apache.flink.runtime.concurrent.akka.ClassLoadingUtils.lambda$null$1(ClassLoadingUtils.java:93)
at
org.apache.flink.runtime.concurrent.akka.ClassLoadingUtils.runWithContextClassLoader(ClassLoadingUtils.java:68)
at
org.apache.flink.runtime.concurrent.akka.ClassLoadingUtils.lambda$guardCompletionWithContextClassLoader$2(ClassLoadingUtils.java:92)
...
Caused by: java.util.concurrent.CompletionException:
org.apache.flink.runtime.rpc.exceptions.RpcConnectionException: Could not
connect to rpc endpoint under address
akka.tcp://flink@my-session.flink:6123/user/rpc/resourcemanager_*.
at
org.apache.flink.runtime.rpc.akka.AkkaRpcService.lambda$resolveActorAddress$11(AkkaRpcService.java:604)
at
scala.concurrent.java8.FuturesConvertersImpl$CF$$anon$1.accept(FutureConvertersImpl.scala:59)
... 5 more
Caused by: org.apache.flink.runtime.rpc.exceptions.RpcConnectionException:
Could not connect to rpc endpoint under address
akka.tcp://flink@my-session.flink:6123/user/rpc/resourcemanager_*.
... 7 more
Caused by: akka.actor.ActorNotFound: Actor not found for:
ActorSelection[Anchor(akka://flink/), Path(/user/rpc/resourcemanager_*)]
at akka.actor.ActorSelection.$anonfun$resolveOne$1(ActorSelection.scala:74)
at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:60)
at
akka.dispatch.BatchingExecutor$AbstractBatch.processBatch(BatchingExecutor.scala:63)
at akka.dispatch.BatchingExecutor$Batch.run(BatchingExecutor.scala:81)
at
akka.dispatch.internal.SameThreadExecutionContext$$anon$1.unbatchedExecute(SameThreadExecutionContext.scala:21)
at akka.dispatch.BatchingExecutor.execute(BatchingExecutor.scala:130)
at akka.dispatch.BatchingExecutor.execute$(BatchingExecutor.scala:124)
at
akka.dispatch.internal.SameThreadExecutionContext$$anon$1.execute(SameThreadExecutionContext.scala:20)
at scala.concurrent.impl.CallbackRunnable.executeWithValue(Promise.scala:68)
at
scala.concurrent.impl.Promise$DefaultPromise.dispatchOrAddCallback(Promise.scala:312)
at
scala.concurrent.impl.Promise$DefaultPromise.onComplete(Promise.scala:303)
at akka.actor.ActorSelection.resolveOne(ActorSelection.scala:72)
at akka.actor.ActorSelection.resolveOne(ActorSelection.scala:89)
at akka.actor.ActorSelection.resolveOne(ActorSelection.scala:130)
at
org.apache.flink.runtime.rpc.akka.AkkaRpcService.resolveActorAddress(AkkaRpcService.java:598)
at
org.apache.flink.runtime.rpc.akka.AkkaRpcService.connectInternal(AkkaRpcService.java:549)
at
org.apache.flink.runtime.rpc.akka.AkkaRpcService.connect(AkkaRpcService.java:232)
at
org.apache.flink.runtime.webmonitor.retriever.impl.RpcGatewayRetriever.lambda$null$0(RpcGatewayRetriever.java:66)
at java.base/java.util.concurrent.CompletableFuture.uniComposeStage(Unknown
Source)
at java.base/java.util.concurrent.CompletableFuture.thenCompose(Unknown
Source)
at
org.apache.flink.runtime.webmonitor.retriever.impl.RpcGatewayRetriever.lambda$createGateway$1(RpcGatewayRetriever.java:64)
at
org.apache.flink.util.concurrent.FutureUtils.retryOperationWithDelay(FutureUtils.java:259)
at
org.apache.flink.util.concurrent.FutureUtils.lambda$null$4(FutureUtils.java:279)
at java.base/java.util.concurrent.Executors$RunnableAdapter.call(Unknown
Source)
at java.base/java.util.concurrent.FutureTask.run(Unknown Source)
at
org.apache.flink.runtime.concurrent.akka.ActorSystemScheduledExecutorAdapter$ScheduledFutureTask.run(ActorSystemScheduledExecutorAdapter.java:171)
at
org.apache.flink.runtime.concurrent.akka.ClassLoadingUtils.runWithContextClassLoader(ClassLoadingUtils.java:68)
at
org.apache.flink.runtime.concurrent.akka.ClassLoadingUtils.lambda$withContextClassLoader$0(ClassLoadingUtils.java:41)
at akka.dispatch.TaskInvocation.run(AbstractDispatcher.scala:49)
at
akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(ForkJoinExecutorConfigurator.scala:48)
{code}
I run Flink v1.16 on GKE v1.24. The hostname ‘my-session.flink’ resolves
successfully within the Kubernetes cluster.
was:
JobManager enters corrupt state after restart (e.g. increasing restartNonce).
{code:java}
WARN | flink-akka.actor.default-dispatcher-5 | RpcGatewayRetriever
| Errorwhile retrieving the leader gateway. Retrying to connect to
akka.tcp://flink@my-session.flink:6123/user/rpc/resourcemanager_*.org.apache.flink.util.concurrent.FutureUtils$RetryException:
Could not complete the operation. Number of retries has been exhausted. at
org.apache.flink.util.concurrent.FutureUtils.lambda$retryOperationWithDelay$6(FutureUtils.java:293)
at java.base/java.util.concurrent.CompletableFuture.uniWhenComplete(Unknown
Source) at
java.base/java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(UnknownSource)
at java.base/java.util.concurrent.CompletableFuture.postComplete(Unknown
Source) at
java.base/java.util.concurrent.CompletableFuture.completeExceptionally(Unknown
Source) at
org.apache.flink.util.concurrent.FutureUtils.doForward(FutureUtils.java:1275)
at
org.apache.flink.runtime.concurrent.akka.ClassLoadingUtils.lambda$null$1(ClassLoadingUtils.java:93)
at
org.apache.flink.runtime.concurrent.akka.ClassLoadingUtils.runWithContextClassLoader(ClassLoadingUtils.java:68)
at
org.apache.flink.runtime.concurrent.akka.ClassLoadingUtils.lambda$guardCompletionWithContextClassLoader$2(ClassLoadingUtils.java:92)
at java.base/java.util.concurrent.CompletableFuture.uniWhenComplete(Unknown
Source) at
java.base/java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(UnknownSource)
at java.base/java.util.concurrent.CompletableFuture.postComplete(Unknown
Source) at
java.base/java.util.concurrent.CompletableFuture.completeExceptionally(Unknown
Source) at
scala.concurrent.java8.FuturesConvertersImpl$CF$$anon$1.accept(FutureConvertersImpl.scala:61)
at
scala.concurrent.java8.FuturesConvertersImpl$CF$$anon$1.accept(FutureConvertersImpl.scala:53)
at java.base/java.util.concurrent.CompletableFuture.uniWhenComplete(Unknown
Source) at
java.base/java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(UnknownSource)
at java.base/java.util.concurrent.CompletableFuture$Completion.run(Unknown
Source) at java.base/java.lang.Thread.run(Unknown Source)Caused by:
java.util.concurrent.CompletionException:
org.apache.flink.runtime.rpc.exceptions.RpcConnectionException: Could not
connect to rpc endpoint under address
akka.tcp://flink@my-session.flink:6123/user/rpc/resourcemanager_*. at
org.apache.flink.runtime.rpc.akka.AkkaRpcService.lambda$resolveActorAddress$11(AkkaRpcService.java:604)
at
scala.concurrent.java8.FuturesConvertersImpl$CF$$anon$1.accept(FutureConvertersImpl.scala:59)
... 5 moreCaused by:
org.apache.flink.runtime.rpc.exceptions.RpcConnectionException: Could not
connect to rpc endpoint under address
akka.tcp://flink@my-session.flink:6123/user/rpc/resourcemanager_*. ... 7
moreCaused by: akka.actor.ActorNotFound: Actor not found for:
ActorSelection[Anchor(akka://flink/), Path(/user/rpc/resourcemanager_*)] at
akka.actor.ActorSelection.$anonfun$resolveOne$1(ActorSelection.scala:74) at
scala.concurrent.impl.CallbackRunnable.run(Promise.scala:60) at
akka.dispatch.BatchingExecutor$AbstractBatch.processBatch(BatchingExecutor.scala:63)
at akka.dispatch.BatchingExecutor$Batch.run(BatchingExecutor.scala:81)
at
akka.dispatch.internal.SameThreadExecutionContext$$anon$1.unbatchedExecute(SameThreadExecutionContext.scala:21)
at akka.dispatch.BatchingExecutor.execute(BatchingExecutor.scala:130) at
akka.dispatch.BatchingExecutor.execute$(BatchingExecutor.scala:124) at
akka.dispatch.internal.SameThreadExecutionContext$$anon$1.execute(SameThreadExecutionContext.scala:20)
at
scala.concurrent.impl.CallbackRunnable.executeWithValue(Promise.scala:68) at
scala.concurrent.impl.Promise$DefaultPromise.dispatchOrAddCallback(Promise.scala:312)
at
scala.concurrent.impl.Promise$DefaultPromise.onComplete(Promise.scala:303)
at akka.actor.ActorSelection.resolveOne(ActorSelection.scala:72) at
akka.actor.ActorSelection.resolveOne(ActorSelection.scala:89) at
akka.actor.ActorSelection.resolveOne(ActorSelection.scala:130) at
org.apache.flink.runtime.rpc.akka.AkkaRpcService.resolveActorAddress(AkkaRpcService.java:598)
at
org.apache.flink.runtime.rpc.akka.AkkaRpcService.connectInternal(AkkaRpcService.java:549)
at
org.apache.flink.runtime.rpc.akka.AkkaRpcService.connect(AkkaRpcService.java:232)
at
org.apache.flink.runtime.webmonitor.retriever.impl.RpcGatewayRetriever.lambda$null$0(RpcGatewayRetriever.java:66)
at java.base/java.util.concurrent.CompletableFuture.uniComposeStage(Unknown
Source) at
java.base/java.util.concurrent.CompletableFuture.thenCompose(Unknown Source)
at
org.apache.flink.runtime.webmonitor.retriever.impl.RpcGatewayRetriever.lambda$createGateway$1(RpcGatewayRetriever.java:64)
at
org.apache.flink.util.concurrent.FutureUtils.retryOperationWithDelay(FutureUtils.java:259)
at
org.apache.flink.util.concurrent.FutureUtils.lambda$null$4(FutureUtils.java:279)
at java.base/java.util.concurrent.Executors$RunnableAdapter.call(Unknown
Source) at java.base/java.util.concurrent.FutureTask.run(Unknown Source)
at
org.apache.flink.runtime.concurrent.akka.ActorSystemScheduledExecutorAdapter$ScheduledFutureTask.run(ActorSystemScheduledExecutorAdapter.java:171)
at
org.apache.flink.runtime.concurrent.akka.ClassLoadingUtils.runWithContextClassLoader(ClassLoadingUtils.java:68)
at
org.apache.flink.runtime.concurrent.akka.ClassLoadingUtils.lambda$withContextClassLoader$0(ClassLoadingUtils.java:41)
at akka.dispatch.TaskInvocation.run(AbstractDispatcher.scala:49) at
akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(ForkJoinExecutorConfigurator.scala:48)
at java.base/java.util.concurrent.ForkJoinTask.doExec(Unknown Source) at
java.base/java.util.concurrent.ForkJoinPool$WorkQueue.topLevelExec(Unknown
Source) at java.base/java.util.concurrent.ForkJoinPool.scan(Unknown Source)
at java.base/java.util.concurrent.ForkJoinPool.runWorker(Unknown Source)
at java.base/java.util.concurrent.ForkJoinWorkerThread.run(Unknown Source)
{code}
I run Flink v1.16 on GKE v1.24. The hostname ‘my-session.flink’ can get
resolved successfully in k8s.
> Error while retrieving the leader gateway
> -----------------------------------------
>
> Key: FLINK-31064
> URL: https://issues.apache.org/jira/browse/FLINK-31064
> Project: Flink
> Issue Type: Bug
> Components: Kubernetes Operator
> Affects Versions: kubernetes-operator-1.3.1
> Environment:
>
> Reporter: Tobias Hofer
> Priority: Major
> Attachments: jobmanager_log.txt
>
>
> The JobManager enters a corrupt state after a restart (e.g. one triggered by increasing restartNonce).
> {code:java}
> WARN | flink-akka.actor.default-dispatcher-5 | RpcGatewayRetriever
> | Error while retrieving the leader gateway. Retrying to connect to
> akka.tcp://flink@my-session.flink:6123/user/rpc/resourcemanager_*.
> org.apache.flink.util.concurrent.FutureUtils$RetryException: Could not
> complete the operation. Number of retries has been exhausted.
> at
> org.apache.flink.util.concurrent.FutureUtils.lambda$retryOperationWithDelay$6(FutureUtils.java:293)
> at
> java.base/java.util.concurrent.CompletableFuture.uniWhenComplete(Unknown
> Source)
> at
> java.base/java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(Unknown
> Source)
> at java.base/java.util.concurrent.CompletableFuture.postComplete(Unknown
> Source)
> at
> java.base/java.util.concurrent.CompletableFuture.completeExceptionally(Unknown
> Source)
> at
> org.apache.flink.util.concurrent.FutureUtils.doForward(FutureUtils.java:1275)
> at
> org.apache.flink.runtime.concurrent.akka.ClassLoadingUtils.lambda$null$1(ClassLoadingUtils.java:93)
> at
> org.apache.flink.runtime.concurrent.akka.ClassLoadingUtils.runWithContextClassLoader(ClassLoadingUtils.java:68)
> at
> org.apache.flink.runtime.concurrent.akka.ClassLoadingUtils.lambda$guardCompletionWithContextClassLoader$2(ClassLoadingUtils.java:92)
> ...
> Caused by: java.util.concurrent.CompletionException:
> org.apache.flink.runtime.rpc.exceptions.RpcConnectionException: Could not
> connect to rpc endpoint under address
> akka.tcp://flink@my-session.flink:6123/user/rpc/resourcemanager_*.
> at
> org.apache.flink.runtime.rpc.akka.AkkaRpcService.lambda$resolveActorAddress$11(AkkaRpcService.java:604)
> at
> scala.concurrent.java8.FuturesConvertersImpl$CF$$anon$1.accept(FutureConvertersImpl.scala:59)
> ... 5 more
> Caused by: org.apache.flink.runtime.rpc.exceptions.RpcConnectionException:
> Could not connect to rpc endpoint under address
> akka.tcp://flink@my-session.flink:6123/user/rpc/resourcemanager_*.
> ... 7 more
> Caused by: akka.actor.ActorNotFound: Actor not found for:
> ActorSelection[Anchor(akka://flink/), Path(/user/rpc/resourcemanager_*)]
> at
> akka.actor.ActorSelection.$anonfun$resolveOne$1(ActorSelection.scala:74)
> at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:60)
> at
> akka.dispatch.BatchingExecutor$AbstractBatch.processBatch(BatchingExecutor.scala:63)
> at akka.dispatch.BatchingExecutor$Batch.run(BatchingExecutor.scala:81)
> at
> akka.dispatch.internal.SameThreadExecutionContext$$anon$1.unbatchedExecute(SameThreadExecutionContext.scala:21)
> at akka.dispatch.BatchingExecutor.execute(BatchingExecutor.scala:130)
> at akka.dispatch.BatchingExecutor.execute$(BatchingExecutor.scala:124)
> at
> akka.dispatch.internal.SameThreadExecutionContext$$anon$1.execute(SameThreadExecutionContext.scala:20)
> at
> scala.concurrent.impl.CallbackRunnable.executeWithValue(Promise.scala:68)
> at
> scala.concurrent.impl.Promise$DefaultPromise.dispatchOrAddCallback(Promise.scala:312)
> at
> scala.concurrent.impl.Promise$DefaultPromise.onComplete(Promise.scala:303)
> at akka.actor.ActorSelection.resolveOne(ActorSelection.scala:72)
> at akka.actor.ActorSelection.resolveOne(ActorSelection.scala:89)
> at akka.actor.ActorSelection.resolveOne(ActorSelection.scala:130)
> at
> org.apache.flink.runtime.rpc.akka.AkkaRpcService.resolveActorAddress(AkkaRpcService.java:598)
> at
> org.apache.flink.runtime.rpc.akka.AkkaRpcService.connectInternal(AkkaRpcService.java:549)
> at
> org.apache.flink.runtime.rpc.akka.AkkaRpcService.connect(AkkaRpcService.java:232)
> at
> org.apache.flink.runtime.webmonitor.retriever.impl.RpcGatewayRetriever.lambda$null$0(RpcGatewayRetriever.java:66)
> at
> java.base/java.util.concurrent.CompletableFuture.uniComposeStage(Unknown
> Source)
> at java.base/java.util.concurrent.CompletableFuture.thenCompose(Unknown
> Source)
> at
> org.apache.flink.runtime.webmonitor.retriever.impl.RpcGatewayRetriever.lambda$createGateway$1(RpcGatewayRetriever.java:64)
> at
> org.apache.flink.util.concurrent.FutureUtils.retryOperationWithDelay(FutureUtils.java:259)
> at
> org.apache.flink.util.concurrent.FutureUtils.lambda$null$4(FutureUtils.java:279)
> at java.base/java.util.concurrent.Executors$RunnableAdapter.call(Unknown
> Source)
> at java.base/java.util.concurrent.FutureTask.run(Unknown Source)
> at
> org.apache.flink.runtime.concurrent.akka.ActorSystemScheduledExecutorAdapter$ScheduledFutureTask.run(ActorSystemScheduledExecutorAdapter.java:171)
> at
> org.apache.flink.runtime.concurrent.akka.ClassLoadingUtils.runWithContextClassLoader(ClassLoadingUtils.java:68)
> at
> org.apache.flink.runtime.concurrent.akka.ClassLoadingUtils.lambda$withContextClassLoader$0(ClassLoadingUtils.java:41)
> at akka.dispatch.TaskInvocation.run(AbstractDispatcher.scala:49)
> at
> akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(ForkJoinExecutorConfigurator.scala:48)
> {code}
> I run Flink v1.16 on GKE v1.24. The hostname ‘my-session.flink’ resolves
> successfully within the Kubernetes cluster.
--
This message was sent by Atlassian Jira
(v8.20.10#820010)