[ 
https://issues.apache.org/jira/browse/FLINK-22084?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17316132#comment-17316132
 ] 

Till Rohrmann commented on FLINK-22084:
---------------------------------------

I think this is one of those cases where we try to be super smart instead of 
requiring explicit configuration and failing if the job is not properly 
configured.

Unfortunately, it is now supported by Flink and ideally we don't break this 
behaviour. If we decouple the state restore from the {{ExecutionGraph}} 
creation (or at least move it out of the EG and do it before we create the EG), 
then it should be easy to fix. For now, we probably need a workaround until 
this is possible.

> RescalingITCase fails with adaptive scheduler
> ---------------------------------------------
>
>                 Key: FLINK-22084
>                 URL: https://issues.apache.org/jira/browse/FLINK-22084
>             Project: Flink
>          Issue Type: Bug
>          Components: Runtime / Checkpointing, Runtime / Coordination
>    Affects Versions: 1.13.0
>            Reporter: Dawid Wysakowicz
>            Assignee: Austin Cawley-Edwards
>            Priority: Blocker
>              Labels: pull-request-available, test-stability
>             Fix For: 1.13.0
>
>
> https://dev.azure.com/apache-flink/apache-flink/_build/results?buildId=15934&view=logs&j=8fd9202e-fd17-5b26-353c-ac1ff76c8f28&t=a0a633b8-47ef-5c5a-2806-3c13b9e48228&l=4472
> {code}
> 2021-03-31T22:16:07.8416407Z [ERROR] 
> testSavepointRescalingOutKeyedStateDerivedMaxParallelism[backend = 
> rocksdb](org.apache.flink.test.checkpointing.RescalingITCase)  Time elapsed: 
> 9.945 s  <<< ERROR!
> 2021-03-31T22:16:07.8417534Z 
> org.apache.flink.runtime.client.JobExecutionException: Job execution failed.
> 2021-03-31T22:16:07.8418516Z  at 
> org.apache.flink.runtime.jobmaster.JobResult.toJobExecutionResult(JobResult.java:144)
> 2021-03-31T22:16:07.8419281Z  at 
> org.apache.flink.test.util.TestUtils.submitJobAndWaitForResult(TestUtils.java:63)
> 2021-03-31T22:16:07.8420142Z  at 
> org.apache.flink.test.checkpointing.RescalingITCase.testSavepointRescalingKeyedState(RescalingITCase.java:251)
> 2021-03-31T22:16:07.8421173Z  at 
> org.apache.flink.test.checkpointing.RescalingITCase.testSavepointRescalingOutKeyedStateDerivedMaxParallelism(RescalingITCase.java:168)
> 2021-03-31T22:16:07.8421985Z  at 
> sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
> 2021-03-31T22:16:07.8422651Z  at 
> sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
> 2021-03-31T22:16:07.8423649Z  at 
> sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
> 2021-03-31T22:16:07.8424231Z  at 
> java.lang.reflect.Method.invoke(Method.java:498)
> 2021-03-31T22:16:07.8424657Z  at 
> org.junit.runners.model.FrameworkMethod$1.runReflectiveCall(FrameworkMethod.java:50)
> 2021-03-31T22:16:07.8425147Z  at 
> org.junit.internal.runners.model.ReflectiveCallable.run(ReflectiveCallable.java:12)
> 2021-03-31T22:16:07.8425609Z  at 
> org.junit.runners.model.FrameworkMethod.invokeExplosively(FrameworkMethod.java:47)
> 2021-03-31T22:16:07.8426183Z  at 
> org.junit.internal.runners.statements.InvokeMethod.evaluate(InvokeMethod.java:17)
> 2021-03-31T22:16:07.8569060Z  at 
> org.junit.internal.runners.statements.RunBefores.evaluate(RunBefores.java:26)
> 2021-03-31T22:16:07.8569781Z  at 
> org.apache.flink.util.TestNameProvider$1.evaluate(TestNameProvider.java:45)
> 2021-03-31T22:16:07.8570451Z  at 
> org.junit.rules.TestWatcher$1.evaluate(TestWatcher.java:55)
> 2021-03-31T22:16:07.8571040Z  at 
> org.junit.rules.RunRules.evaluate(RunRules.java:20)
> 2021-03-31T22:16:07.8571604Z  at 
> org.junit.runners.ParentRunner.runLeaf(ParentRunner.java:325)
> 2021-03-31T22:16:07.8572303Z  at 
> org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:78)
> 2021-03-31T22:16:07.8573259Z  at 
> org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:57)
> 2021-03-31T22:16:07.8573975Z  at 
> org.junit.runners.ParentRunner$3.run(ParentRunner.java:290)
> 2021-03-31T22:16:07.8574660Z  at 
> org.junit.runners.ParentRunner$1.schedule(ParentRunner.java:71)
> 2021-03-31T22:16:07.8575359Z  at 
> org.junit.runners.ParentRunner.runChildren(ParentRunner.java:288)
> 2021-03-31T22:16:07.8576037Z  at 
> org.junit.runners.ParentRunner.access$000(ParentRunner.java:58)
> 2021-03-31T22:16:07.8576728Z  at 
> org.junit.runners.ParentRunner$2.evaluate(ParentRunner.java:268)
> 2021-03-31T22:16:07.8577588Z  at 
> org.junit.runners.ParentRunner.run(ParentRunner.java:363)
> 2021-03-31T22:16:07.8578181Z  at 
> org.junit.runners.Suite.runChild(Suite.java:128)
> 2021-03-31T22:16:07.8578771Z  at 
> org.junit.runners.Suite.runChild(Suite.java:27)
> 2021-03-31T22:16:07.8579402Z  at 
> org.junit.runners.ParentRunner$3.run(ParentRunner.java:290)
> 2021-03-31T22:16:07.8580061Z  at 
> org.junit.runners.ParentRunner$1.schedule(ParentRunner.java:71)
> 2021-03-31T22:16:07.8580774Z  at 
> org.junit.runners.ParentRunner.runChildren(ParentRunner.java:288)
> 2021-03-31T22:16:07.8581480Z  at 
> org.junit.runners.ParentRunner.access$000(ParentRunner.java:58)
> 2021-03-31T22:16:07.8582148Z  at 
> org.junit.runners.ParentRunner$2.evaluate(ParentRunner.java:268)
> 2021-03-31T22:16:07.8582896Z  at 
> org.junit.internal.runners.statements.RunAfters.evaluate(RunAfters.java:27)
> 2021-03-31T22:16:07.8583762Z  at 
> org.junit.rules.ExternalResource$1.evaluate(ExternalResource.java:48)
> 2021-03-31T22:16:07.8584427Z  at 
> org.junit.rules.RunRules.evaluate(RunRules.java:20)
> 2021-03-31T22:16:07.8585069Z  at 
> org.junit.runners.ParentRunner.run(ParentRunner.java:363)
> 2021-03-31T22:16:07.8585671Z  at 
> org.junit.runners.Suite.runChild(Suite.java:128)
> 2021-03-31T22:16:07.8586254Z  at 
> org.junit.runners.Suite.runChild(Suite.java:27)
> 2021-03-31T22:16:07.8586875Z  at 
> org.junit.runners.ParentRunner$3.run(ParentRunner.java:290)
> 2021-03-31T22:16:07.8587643Z  at 
> org.junit.runners.ParentRunner$1.schedule(ParentRunner.java:71)
> 2021-03-31T22:16:07.8779731Z  at 
> org.junit.runners.ParentRunner.runChildren(ParentRunner.java:288)
> 2021-03-31T22:16:07.8780398Z  at 
> org.junit.runners.ParentRunner.access$000(ParentRunner.java:58)
> 2021-03-31T22:16:07.8781024Z  at 
> org.junit.runners.ParentRunner$2.evaluate(ParentRunner.java:268)
> 2021-03-31T22:16:07.8781702Z  at 
> org.junit.runners.ParentRunner.run(ParentRunner.java:363)
> 2021-03-31T22:16:07.8782346Z  at 
> org.apache.maven.surefire.junitcore.JUnitCore.run(JUnitCore.java:55)
> 2021-03-31T22:16:07.8783166Z  at 
> org.apache.maven.surefire.junitcore.JUnitCoreWrapper.createRequestAndRun(JUnitCoreWrapper.java:137)
> 2021-03-31T22:16:07.8784006Z  at 
> org.apache.maven.surefire.junitcore.JUnitCoreWrapper.executeEager(JUnitCoreWrapper.java:107)
> 2021-03-31T22:16:07.8784796Z  at 
> org.apache.maven.surefire.junitcore.JUnitCoreWrapper.execute(JUnitCoreWrapper.java:83)
> 2021-03-31T22:16:07.8785556Z  at 
> org.apache.maven.surefire.junitcore.JUnitCoreWrapper.execute(JUnitCoreWrapper.java:75)
> 2021-03-31T22:16:07.8786346Z  at 
> org.apache.maven.surefire.junitcore.JUnitCoreProvider.invoke(JUnitCoreProvider.java:158)
> 2021-03-31T22:16:07.8787299Z  at 
> org.apache.maven.surefire.booter.ForkedBooter.invokeProviderInSameClassLoader(ForkedBooter.java:384)
> 2021-03-31T22:16:07.8788104Z  at 
> org.apache.maven.surefire.booter.ForkedBooter.runSuitesInProcess(ForkedBooter.java:345)
> 2021-03-31T22:16:07.8815851Z  at 
> org.apache.maven.surefire.booter.ForkedBooter.execute(ForkedBooter.java:126)
> 2021-03-31T22:16:07.8816576Z  at 
> org.apache.maven.surefire.booter.ForkedBooter.main(ForkedBooter.java:418)
> 2021-03-31T22:16:07.8819737Z Caused by: 
> java.util.concurrent.CompletionException: 
> java.util.concurrent.CompletionException: java.lang.IllegalStateException: 
> Failed to rollback to checkpoint/savepoint Checkpoint Metadata. Max 
> parallelism mismatch between checkpoint/savepoint state and new program. 
> Cannot map operator 20ba6b65f97481d5570070de90e4e791 with max parallelism 13 
> to new program with max parallelism 128. This indicates that the program has 
> been changed in a non-compatible way after the checkpoint/savepoint.
> 2021-03-31T22:16:07.8821554Z  at 
> org.apache.flink.runtime.concurrent.FutureUtils.lambda$switchExecutor$23(FutureUtils.java:1362)
> 2021-03-31T22:16:07.8822349Z  at 
> java.util.concurrent.CompletableFuture.uniHandle(CompletableFuture.java:836)
> 2021-03-31T22:16:07.8823178Z  at 
> java.util.concurrent.CompletableFuture$UniHandle.tryFire(CompletableFuture.java:811)
> 2021-03-31T22:16:07.8823948Z  at 
> java.util.concurrent.CompletableFuture$Completion.run(CompletableFuture.java:456)
> 2021-03-31T22:16:07.8824698Z  at 
> org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRunAsync(AkkaRpcActor.java:440)
> 2021-03-31T22:16:07.8825485Z  at 
> org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRpcMessage(AkkaRpcActor.java:208)
> 2021-03-31T22:16:07.8826318Z  at 
> org.apache.flink.runtime.rpc.akka.FencedAkkaRpcActor.handleRpcMessage(FencedAkkaRpcActor.java:77)
> 2021-03-31T22:16:07.8827203Z  at 
> org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleMessage(AkkaRpcActor.java:158)
> 2021-03-31T22:16:07.8827925Z  at 
> akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:26)
> 2021-03-31T22:16:07.8828561Z  at 
> akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:21)
> 2021-03-31T22:16:07.8829192Z  at 
> scala.PartialFunction$class.applyOrElse(PartialFunction.scala:123)
> 2021-03-31T22:16:07.8829860Z  at 
> akka.japi.pf.UnitCaseStatement.applyOrElse(CaseStatements.scala:21)
> 2021-03-31T22:16:07.8830536Z  at 
> scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:170)
> 2021-03-31T22:16:07.8831187Z  at 
> scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:171)
> 2021-03-31T22:16:07.8831853Z  at 
> scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:171)
> 2021-03-31T22:16:07.8832477Z  at 
> akka.actor.Actor$class.aroundReceive(Actor.scala:517)
> 2021-03-31T22:16:07.8833155Z  at 
> akka.actor.AbstractActor.aroundReceive(AbstractActor.scala:225)
> 2021-03-31T22:16:07.8833784Z  at 
> akka.actor.ActorCell.receiveMessage(ActorCell.scala:592)
> 2021-03-31T22:16:07.8834351Z  at 
> akka.actor.ActorCell.invoke(ActorCell.scala:561)
> 2021-03-31T22:16:07.8835134Z  at 
> akka.dispatch.Mailbox.processMailbox(Mailbox.scala:258)
> 2021-03-31T22:16:07.8835693Z  at akka.dispatch.Mailbox.run(Mailbox.scala:225)
> 2021-03-31T22:16:07.8836208Z  at akka.dispatch.Mailbox.exec(Mailbox.scala:235)
> 2021-03-31T22:16:07.8836805Z  at 
> akka.dispatch.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
> 2021-03-31T22:16:07.8837571Z  at 
> akka.dispatch.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
> 2021-03-31T22:16:07.8838263Z  at 
> akka.dispatch.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
> 2021-03-31T22:16:07.8838962Z  at 
> akka.dispatch.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
> 2021-03-31T22:16:07.8841344Z Caused by: 
> java.util.concurrent.CompletionException: java.lang.IllegalStateException: 
> Failed to rollback to checkpoint/savepoint Checkpoint Metadata. Max 
> parallelism mismatch between checkpoint/savepoint state and new program. 
> Cannot map operator 20ba6b65f97481d5570070de90e4e791 with max parallelism 13 
> to new program with max parallelism 128. This indicates that the program has 
> been changed in a non-compatible way after the checkpoint/savepoint.
> 2021-03-31T22:16:07.8843188Z  at 
> org.apache.flink.runtime.scheduler.adaptive.BackgroundTask.lambda$new$0(BackgroundTask.java:59)
> 2021-03-31T22:16:07.8843956Z  at 
> java.util.concurrent.CompletableFuture.uniApply(CompletableFuture.java:616)
> 2021-03-31T22:16:07.8844829Z  at 
> java.util.concurrent.CompletableFuture$UniApply.tryFire(CompletableFuture.java:591)
> 2021-03-31T22:16:07.8845596Z  at 
> java.util.concurrent.CompletableFuture$Completion.run(CompletableFuture.java:456)
> 2021-03-31T22:16:07.8846306Z  at 
> java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
> 2021-03-31T22:16:07.8846964Z  at 
> java.util.concurrent.FutureTask.run(FutureTask.java:266)
> 2021-03-31T22:16:07.8847833Z  at 
> java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$201(ScheduledThreadPoolExecutor.java:180)
> 2021-03-31T22:16:07.8848782Z  at 
> java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:293)
> 2021-03-31T22:16:07.8849613Z  at 
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
> 2021-03-31T22:16:07.8850360Z  at 
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
> 2021-03-31T22:16:07.8850983Z  at java.lang.Thread.run(Thread.java:748)
> 2021-03-31T22:16:07.8852976Z Caused by: java.lang.IllegalStateException: 
> Failed to rollback to checkpoint/savepoint Checkpoint Metadata. Max 
> parallelism mismatch between checkpoint/savepoint state and new program. 
> Cannot map operator 20ba6b65f97481d5570070de90e4e791 with max parallelism 13 
> to new program with max parallelism 128. This indicates that the program has 
> been changed in a non-compatible way after the checkpoint/savepoint.
> 2021-03-31T22:16:07.8854549Z  at 
> org.apache.flink.runtime.checkpoint.Checkpoints.loadAndValidateCheckpoint(Checkpoints.java:181)
> 2021-03-31T22:16:07.8855424Z  at 
> org.apache.flink.runtime.checkpoint.CheckpointCoordinator.restoreSavepoint(CheckpointCoordinator.java:1630)
> 2021-03-31T22:16:07.8856440Z  at 
> org.apache.flink.runtime.scheduler.DefaultExecutionGraphFactory.tryRestoreExecutionGraphFromSavepoint(DefaultExecutionGraphFactory.java:163)
> 2021-03-31T22:16:07.8862100Z  at 
> org.apache.flink.runtime.scheduler.DefaultExecutionGraphFactory.createAndRestoreExecutionGraph(DefaultExecutionGraphFactory.java:138)
> 2021-03-31T22:16:07.8863316Z  at 
> org.apache.flink.runtime.scheduler.adaptive.AdaptiveScheduler.createExecutionGraphAndRestoreState(AdaptiveScheduler.java:971)
> 2021-03-31T22:16:07.8864391Z  at 
> org.apache.flink.runtime.scheduler.adaptive.AdaptiveScheduler.lambda$createExecutionGraphAndRestoreStateAsync$24(AdaptiveScheduler.java:961)
> 2021-03-31T22:16:07.8865366Z  at 
> org.apache.flink.runtime.scheduler.adaptive.BackgroundTask.lambda$new$0(BackgroundTask.java:57)
> 2021-03-31T22:16:07.8865913Z  ... 10 more
> {code}



--
This message was sent by Atlassian Jira
(v8.3.4#803005)

Reply via email to