[
https://issues.apache.org/jira/browse/FLINK-22084?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17316132#comment-17316132
]
Till Rohrmann commented on FLINK-22084:
---------------------------------------
I think this is one of those cases where we try to be super smart instead of
requiring explicit configuration and failing if the job is not properly
configured.
Unfortunately, this behaviour is now supported by Flink and ideally we don't
break it. If we decouple the state restore from the {{ExecutionGraph}}
creation (or at least move it out of the EG and do it before we create the EG),
then it should be easy to fix. For now, we probably need a workaround until
that is possible.
> RescalingITCase fails with adaptive scheduler
> ---------------------------------------------
>
> Key: FLINK-22084
> URL: https://issues.apache.org/jira/browse/FLINK-22084
> Project: Flink
> Issue Type: Bug
> Components: Runtime / Checkpointing, Runtime / Coordination
> Affects Versions: 1.13.0
> Reporter: Dawid Wysakowicz
> Assignee: Austin Cawley-Edwards
> Priority: Blocker
> Labels: pull-request-available, test-stability
> Fix For: 1.13.0
>
>
> https://dev.azure.com/apache-flink/apache-flink/_build/results?buildId=15934&view=logs&j=8fd9202e-fd17-5b26-353c-ac1ff76c8f28&t=a0a633b8-47ef-5c5a-2806-3c13b9e48228&l=4472
> {code}
> 2021-03-31T22:16:07.8416407Z [ERROR]
> testSavepointRescalingOutKeyedStateDerivedMaxParallelism[backend =
> rocksdb](org.apache.flink.test.checkpointing.RescalingITCase) Time elapsed:
> 9.945 s <<< ERROR!
> 2021-03-31T22:16:07.8417534Z
> org.apache.flink.runtime.client.JobExecutionException: Job execution failed.
> 2021-03-31T22:16:07.8418516Z at
> org.apache.flink.runtime.jobmaster.JobResult.toJobExecutionResult(JobResult.java:144)
> 2021-03-31T22:16:07.8419281Z at
> org.apache.flink.test.util.TestUtils.submitJobAndWaitForResult(TestUtils.java:63)
> 2021-03-31T22:16:07.8420142Z at
> org.apache.flink.test.checkpointing.RescalingITCase.testSavepointRescalingKeyedState(RescalingITCase.java:251)
> 2021-03-31T22:16:07.8421173Z at
> org.apache.flink.test.checkpointing.RescalingITCase.testSavepointRescalingOutKeyedStateDerivedMaxParallelism(RescalingITCase.java:168)
> 2021-03-31T22:16:07.8421985Z at
> sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
> 2021-03-31T22:16:07.8422651Z at
> sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
> 2021-03-31T22:16:07.8423649Z at
> sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
> 2021-03-31T22:16:07.8424231Z at
> java.lang.reflect.Method.invoke(Method.java:498)
> 2021-03-31T22:16:07.8424657Z at
> org.junit.runners.model.FrameworkMethod$1.runReflectiveCall(FrameworkMethod.java:50)
> 2021-03-31T22:16:07.8425147Z at
> org.junit.internal.runners.model.ReflectiveCallable.run(ReflectiveCallable.java:12)
> 2021-03-31T22:16:07.8425609Z at
> org.junit.runners.model.FrameworkMethod.invokeExplosively(FrameworkMethod.java:47)
> 2021-03-31T22:16:07.8426183Z at
> org.junit.internal.runners.statements.InvokeMethod.evaluate(InvokeMethod.java:17)
> 2021-03-31T22:16:07.8569060Z at
> org.junit.internal.runners.statements.RunBefores.evaluate(RunBefores.java:26)
> 2021-03-31T22:16:07.8569781Z at
> org.apache.flink.util.TestNameProvider$1.evaluate(TestNameProvider.java:45)
> 2021-03-31T22:16:07.8570451Z at
> org.junit.rules.TestWatcher$1.evaluate(TestWatcher.java:55)
> 2021-03-31T22:16:07.8571040Z at
> org.junit.rules.RunRules.evaluate(RunRules.java:20)
> 2021-03-31T22:16:07.8571604Z at
> org.junit.runners.ParentRunner.runLeaf(ParentRunner.java:325)
> 2021-03-31T22:16:07.8572303Z at
> org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:78)
> 2021-03-31T22:16:07.8573259Z at
> org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:57)
> 2021-03-31T22:16:07.8573975Z at
> org.junit.runners.ParentRunner$3.run(ParentRunner.java:290)
> 2021-03-31T22:16:07.8574660Z at
> org.junit.runners.ParentRunner$1.schedule(ParentRunner.java:71)
> 2021-03-31T22:16:07.8575359Z at
> org.junit.runners.ParentRunner.runChildren(ParentRunner.java:288)
> 2021-03-31T22:16:07.8576037Z at
> org.junit.runners.ParentRunner.access$000(ParentRunner.java:58)
> 2021-03-31T22:16:07.8576728Z at
> org.junit.runners.ParentRunner$2.evaluate(ParentRunner.java:268)
> 2021-03-31T22:16:07.8577588Z at
> org.junit.runners.ParentRunner.run(ParentRunner.java:363)
> 2021-03-31T22:16:07.8578181Z at
> org.junit.runners.Suite.runChild(Suite.java:128)
> 2021-03-31T22:16:07.8578771Z at
> org.junit.runners.Suite.runChild(Suite.java:27)
> 2021-03-31T22:16:07.8579402Z at
> org.junit.runners.ParentRunner$3.run(ParentRunner.java:290)
> 2021-03-31T22:16:07.8580061Z at
> org.junit.runners.ParentRunner$1.schedule(ParentRunner.java:71)
> 2021-03-31T22:16:07.8580774Z at
> org.junit.runners.ParentRunner.runChildren(ParentRunner.java:288)
> 2021-03-31T22:16:07.8581480Z at
> org.junit.runners.ParentRunner.access$000(ParentRunner.java:58)
> 2021-03-31T22:16:07.8582148Z at
> org.junit.runners.ParentRunner$2.evaluate(ParentRunner.java:268)
> 2021-03-31T22:16:07.8582896Z at
> org.junit.internal.runners.statements.RunAfters.evaluate(RunAfters.java:27)
> 2021-03-31T22:16:07.8583762Z at
> org.junit.rules.ExternalResource$1.evaluate(ExternalResource.java:48)
> 2021-03-31T22:16:07.8584427Z at
> org.junit.rules.RunRules.evaluate(RunRules.java:20)
> 2021-03-31T22:16:07.8585069Z at
> org.junit.runners.ParentRunner.run(ParentRunner.java:363)
> 2021-03-31T22:16:07.8585671Z at
> org.junit.runners.Suite.runChild(Suite.java:128)
> 2021-03-31T22:16:07.8586254Z at
> org.junit.runners.Suite.runChild(Suite.java:27)
> 2021-03-31T22:16:07.8586875Z at
> org.junit.runners.ParentRunner$3.run(ParentRunner.java:290)
> 2021-03-31T22:16:07.8587643Z at
> org.junit.runners.ParentRunner$1.schedule(ParentRunner.java:71)
> 2021-03-31T22:16:07.8779731Z at
> org.junit.runners.ParentRunner.runChildren(ParentRunner.java:288)
> 2021-03-31T22:16:07.8780398Z at
> org.junit.runners.ParentRunner.access$000(ParentRunner.java:58)
> 2021-03-31T22:16:07.8781024Z at
> org.junit.runners.ParentRunner$2.evaluate(ParentRunner.java:268)
> 2021-03-31T22:16:07.8781702Z at
> org.junit.runners.ParentRunner.run(ParentRunner.java:363)
> 2021-03-31T22:16:07.8782346Z at
> org.apache.maven.surefire.junitcore.JUnitCore.run(JUnitCore.java:55)
> 2021-03-31T22:16:07.8783166Z at
> org.apache.maven.surefire.junitcore.JUnitCoreWrapper.createRequestAndRun(JUnitCoreWrapper.java:137)
> 2021-03-31T22:16:07.8784006Z at
> org.apache.maven.surefire.junitcore.JUnitCoreWrapper.executeEager(JUnitCoreWrapper.java:107)
> 2021-03-31T22:16:07.8784796Z at
> org.apache.maven.surefire.junitcore.JUnitCoreWrapper.execute(JUnitCoreWrapper.java:83)
> 2021-03-31T22:16:07.8785556Z at
> org.apache.maven.surefire.junitcore.JUnitCoreWrapper.execute(JUnitCoreWrapper.java:75)
> 2021-03-31T22:16:07.8786346Z at
> org.apache.maven.surefire.junitcore.JUnitCoreProvider.invoke(JUnitCoreProvider.java:158)
> 2021-03-31T22:16:07.8787299Z at
> org.apache.maven.surefire.booter.ForkedBooter.invokeProviderInSameClassLoader(ForkedBooter.java:384)
> 2021-03-31T22:16:07.8788104Z at
> org.apache.maven.surefire.booter.ForkedBooter.runSuitesInProcess(ForkedBooter.java:345)
> 2021-03-31T22:16:07.8815851Z at
> org.apache.maven.surefire.booter.ForkedBooter.execute(ForkedBooter.java:126)
> 2021-03-31T22:16:07.8816576Z at
> org.apache.maven.surefire.booter.ForkedBooter.main(ForkedBooter.java:418)
> 2021-03-31T22:16:07.8819737Z Caused by:
> java.util.concurrent.CompletionException:
> java.util.concurrent.CompletionException: java.lang.IllegalStateException:
> Failed to rollback to checkpoint/savepoint Checkpoint Metadata. Max
> parallelism mismatch between checkpoint/savepoint state and new program.
> Cannot map operator 20ba6b65f97481d5570070de90e4e791 with max parallelism 13
> to new program with max parallelism 128. This indicates that the program has
> been changed in a non-compatible way after the checkpoint/savepoint.
> 2021-03-31T22:16:07.8821554Z at
> org.apache.flink.runtime.concurrent.FutureUtils.lambda$switchExecutor$23(FutureUtils.java:1362)
> 2021-03-31T22:16:07.8822349Z at
> java.util.concurrent.CompletableFuture.uniHandle(CompletableFuture.java:836)
> 2021-03-31T22:16:07.8823178Z at
> java.util.concurrent.CompletableFuture$UniHandle.tryFire(CompletableFuture.java:811)
> 2021-03-31T22:16:07.8823948Z at
> java.util.concurrent.CompletableFuture$Completion.run(CompletableFuture.java:456)
> 2021-03-31T22:16:07.8824698Z at
> org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRunAsync(AkkaRpcActor.java:440)
> 2021-03-31T22:16:07.8825485Z at
> org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRpcMessage(AkkaRpcActor.java:208)
> 2021-03-31T22:16:07.8826318Z at
> org.apache.flink.runtime.rpc.akka.FencedAkkaRpcActor.handleRpcMessage(FencedAkkaRpcActor.java:77)
> 2021-03-31T22:16:07.8827203Z at
> org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleMessage(AkkaRpcActor.java:158)
> 2021-03-31T22:16:07.8827925Z at
> akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:26)
> 2021-03-31T22:16:07.8828561Z at
> akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:21)
> 2021-03-31T22:16:07.8829192Z at
> scala.PartialFunction$class.applyOrElse(PartialFunction.scala:123)
> 2021-03-31T22:16:07.8829860Z at
> akka.japi.pf.UnitCaseStatement.applyOrElse(CaseStatements.scala:21)
> 2021-03-31T22:16:07.8830536Z at
> scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:170)
> 2021-03-31T22:16:07.8831187Z at
> scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:171)
> 2021-03-31T22:16:07.8831853Z at
> scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:171)
> 2021-03-31T22:16:07.8832477Z at
> akka.actor.Actor$class.aroundReceive(Actor.scala:517)
> 2021-03-31T22:16:07.8833155Z at
> akka.actor.AbstractActor.aroundReceive(AbstractActor.scala:225)
> 2021-03-31T22:16:07.8833784Z at
> akka.actor.ActorCell.receiveMessage(ActorCell.scala:592)
> 2021-03-31T22:16:07.8834351Z at
> akka.actor.ActorCell.invoke(ActorCell.scala:561)
> 2021-03-31T22:16:07.8835134Z at
> akka.dispatch.Mailbox.processMailbox(Mailbox.scala:258)
> 2021-03-31T22:16:07.8835693Z at akka.dispatch.Mailbox.run(Mailbox.scala:225)
> 2021-03-31T22:16:07.8836208Z at akka.dispatch.Mailbox.exec(Mailbox.scala:235)
> 2021-03-31T22:16:07.8836805Z at
> akka.dispatch.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
> 2021-03-31T22:16:07.8837571Z at
> akka.dispatch.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
> 2021-03-31T22:16:07.8838263Z at
> akka.dispatch.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
> 2021-03-31T22:16:07.8838962Z at
> akka.dispatch.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
> 2021-03-31T22:16:07.8841344Z Caused by:
> java.util.concurrent.CompletionException: java.lang.IllegalStateException:
> Failed to rollback to checkpoint/savepoint Checkpoint Metadata. Max
> parallelism mismatch between checkpoint/savepoint state and new program.
> Cannot map operator 20ba6b65f97481d5570070de90e4e791 with max parallelism 13
> to new program with max parallelism 128. This indicates that the program has
> been changed in a non-compatible way after the checkpoint/savepoint.
> 2021-03-31T22:16:07.8843188Z at
> org.apache.flink.runtime.scheduler.adaptive.BackgroundTask.lambda$new$0(BackgroundTask.java:59)
> 2021-03-31T22:16:07.8843956Z at
> java.util.concurrent.CompletableFuture.uniApply(CompletableFuture.java:616)
> 2021-03-31T22:16:07.8844829Z at
> java.util.concurrent.CompletableFuture$UniApply.tryFire(CompletableFuture.java:591)
> 2021-03-31T22:16:07.8845596Z at
> java.util.concurrent.CompletableFuture$Completion.run(CompletableFuture.java:456)
> 2021-03-31T22:16:07.8846306Z at
> java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
> 2021-03-31T22:16:07.8846964Z at
> java.util.concurrent.FutureTask.run(FutureTask.java:266)
> 2021-03-31T22:16:07.8847833Z at
> java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$201(ScheduledThreadPoolExecutor.java:180)
> 2021-03-31T22:16:07.8848782Z at
> java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:293)
> 2021-03-31T22:16:07.8849613Z at
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
> 2021-03-31T22:16:07.8850360Z at
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
> 2021-03-31T22:16:07.8850983Z at java.lang.Thread.run(Thread.java:748)
> 2021-03-31T22:16:07.8852976Z Caused by: java.lang.IllegalStateException:
> Failed to rollback to checkpoint/savepoint Checkpoint Metadata. Max
> parallelism mismatch between checkpoint/savepoint state and new program.
> Cannot map operator 20ba6b65f97481d5570070de90e4e791 with max parallelism 13
> to new program with max parallelism 128. This indicates that the program has
> been changed in a non-compatible way after the checkpoint/savepoint.
> 2021-03-31T22:16:07.8854549Z at
> org.apache.flink.runtime.checkpoint.Checkpoints.loadAndValidateCheckpoint(Checkpoints.java:181)
> 2021-03-31T22:16:07.8855424Z at
> org.apache.flink.runtime.checkpoint.CheckpointCoordinator.restoreSavepoint(CheckpointCoordinator.java:1630)
> 2021-03-31T22:16:07.8856440Z at
> org.apache.flink.runtime.scheduler.DefaultExecutionGraphFactory.tryRestoreExecutionGraphFromSavepoint(DefaultExecutionGraphFactory.java:163)
> 2021-03-31T22:16:07.8862100Z at
> org.apache.flink.runtime.scheduler.DefaultExecutionGraphFactory.createAndRestoreExecutionGraph(DefaultExecutionGraphFactory.java:138)
> 2021-03-31T22:16:07.8863316Z at
> org.apache.flink.runtime.scheduler.adaptive.AdaptiveScheduler.createExecutionGraphAndRestoreState(AdaptiveScheduler.java:971)
> 2021-03-31T22:16:07.8864391Z at
> org.apache.flink.runtime.scheduler.adaptive.AdaptiveScheduler.lambda$createExecutionGraphAndRestoreStateAsync$24(AdaptiveScheduler.java:961)
> 2021-03-31T22:16:07.8865366Z at
> org.apache.flink.runtime.scheduler.adaptive.BackgroundTask.lambda$new$0(BackgroundTask.java:57)
> 2021-03-31T22:16:07.8865913Z ... 10 more
> {code}
--
This message was sent by Atlassian Jira
(v8.3.4#803005)