[ 
https://issues.apache.org/jira/browse/FLINK-22173?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17338400#comment-17338400
 ] 

Roman Khachatryan edited comment on FLINK-22173 at 5/3/21, 2:48 PM:
--------------------------------------------------------------------

A recent failure (different stacktrace though):
[https://dev.azure.com/apache-flink/apache-flink/_build/results?buildId=17468&view=logs&j=34f41360-6c0d-54d3-11a1-0292a2def1d9&t=2d56e022-1ace-542f-bf1a-b37dd63243f2&l=9772]
{code}
Apr 30 14:29:59 Caused by: org.apache.flink.shaded.netty4.io.netty.util.IllegalReferenceCountException: refCnt: 0, increment: 1
Apr 30 14:29:59    at org.apache.flink.shaded.netty4.io.netty.util.internal.ReferenceCountUpdater.retain0(ReferenceCountUpdater.java:123)
Apr 30 14:29:59    at org.apache.flink.shaded.netty4.io.netty.util.internal.ReferenceCountUpdater.retain(ReferenceCountUpdater.java:110)
Apr 30 14:29:59    at org.apache.flink.shaded.netty4.io.netty.buffer.AbstractReferenceCountedByteBuf.retain(AbstractReferenceCountedByteBuf.java:80)
Apr 30 14:29:59    at org.apache.flink.runtime.io.network.buffer.NetworkBuffer.retainBuffer(NetworkBuffer.java:166)
Apr 30 14:29:59    at org.apache.flink.runtime.io.network.buffer.NetworkBuffer.retainBuffer(NetworkBuffer.java:47)
Apr 30 14:29:59    at org.apache.flink.runtime.io.network.buffer.BufferConsumer.copy(BufferConsumer.java:143)
Apr 30 14:29:59    at org.apache.flink.runtime.io.network.buffer.BufferConsumer.toDebugString(BufferConsumer.java:202)
Apr 30 14:29:59    at org.apache.flink.runtime.io.network.logger.NetworkActionsLogger.traceRecover(NetworkActionsLogger.java:94)
Apr 30 14:29:59    at org.apache.flink.runtime.io.network.partition.PipelinedSubpartition.addRecovered(PipelinedSubpartition.java:142)
Apr 30 14:29:59    at org.apache.flink.runtime.checkpoint.channel.ResultSubpartitionRecoveredStateHandler.recover(RecoveredChannelStateHandler.java:195)
Apr 30 14:29:59    at org.apache.flink.runtime.checkpoint.channel.ResultSubpartitionRecoveredStateHandler.recover(RecoveredChannelStateHandler.java:144)
Apr 30 14:29:59    at org.apache.flink.runtime.checkpoint.channel.ChannelStateChunkReader.readChunk(SequentialChannelStateReaderImpl.java:207)
Apr 30 14:29:59    at org.apache.flink.runtime.checkpoint.channel.SequentialChannelStateReaderImpl.readSequentially(SequentialChannelStateReaderImpl.java:107)
Apr 30 14:29:59    at org.apache.flink.runtime.checkpoint.channel.SequentialChannelStateReaderImpl.read(SequentialChannelStateReaderImpl.java:93)
Apr 30 14:29:59    at org.apache.flink.runtime.checkpoint.channel.SequentialChannelStateReaderImpl.readOutputData(SequentialChannelStateReaderImpl.java:79)
Apr 30 14:29:59    at org.apache.flink.streaming.runtime.tasks.StreamTask.restoreGates(StreamTask.java:571)
Apr 30 14:29:59    at org.apache.flink.streaming.runtime.tasks.StreamTaskActionExecutor$1.call(StreamTaskActionExecutor.java:55)
Apr 30 14:29:59    at org.apache.flink.streaming.runtime.tasks.StreamTask.restore(StreamTask.java:554)
Apr 30 14:29:59    at org.apache.flink.runtime.taskmanager.Task.doRun(Task.java:757)
Apr 30 14:29:59    at org.apache.flink.runtime.taskmanager.Task.run(Task.java:564)
Apr 30 14:29:59    at java.lang.Thread.run(Thread.java:748)
{code}

The failing build included commits from master up to 89c6c03660a88a648bbd13b4e6696124fe46d013.


was (Author: roman_khachatryan):
A recent failure (with commits in master up to 
89c6c03660a88a648bbd13b4e6696124fe46d013):
[https://dev.azure.com/apache-flink/apache-flink/_build/results?buildId=17468&view=logs&j=34f41360-6c0d-54d3-11a1-0292a2def1d9&t=2d56e022-1ace-542f-bf1a-b37dd63243f2&l=9772]
{code}
Apr 30 14:29:59 Caused by: org.apache.flink.shaded.netty4.io.netty.util.IllegalReferenceCountException: refCnt: 0, increment: 1
Apr 30 14:29:59    at org.apache.flink.shaded.netty4.io.netty.util.internal.ReferenceCountUpdater.retain0(ReferenceCountUpdater.java:123)
Apr 30 14:29:59    at org.apache.flink.shaded.netty4.io.netty.util.internal.ReferenceCountUpdater.retain(ReferenceCountUpdater.java:110)
Apr 30 14:29:59    at org.apache.flink.shaded.netty4.io.netty.buffer.AbstractReferenceCountedByteBuf.retain(AbstractReferenceCountedByteBuf.java:80)
Apr 30 14:29:59    at org.apache.flink.runtime.io.network.buffer.NetworkBuffer.retainBuffer(NetworkBuffer.java:166)
Apr 30 14:29:59    at org.apache.flink.runtime.io.network.buffer.NetworkBuffer.retainBuffer(NetworkBuffer.java:47)
Apr 30 14:29:59    at org.apache.flink.runtime.io.network.buffer.BufferConsumer.copy(BufferConsumer.java:143)
Apr 30 14:29:59    at org.apache.flink.runtime.io.network.buffer.BufferConsumer.toDebugString(BufferConsumer.java:202)
Apr 30 14:29:59    at org.apache.flink.runtime.io.network.logger.NetworkActionsLogger.traceRecover(NetworkActionsLogger.java:94)
Apr 30 14:29:59    at org.apache.flink.runtime.io.network.partition.PipelinedSubpartition.addRecovered(PipelinedSubpartition.java:142)
Apr 30 14:29:59    at org.apache.flink.runtime.checkpoint.channel.ResultSubpartitionRecoveredStateHandler.recover(RecoveredChannelStateHandler.java:195)
Apr 30 14:29:59    at org.apache.flink.runtime.checkpoint.channel.ResultSubpartitionRecoveredStateHandler.recover(RecoveredChannelStateHandler.java:144)
Apr 30 14:29:59    at org.apache.flink.runtime.checkpoint.channel.ChannelStateChunkReader.readChunk(SequentialChannelStateReaderImpl.java:207)
Apr 30 14:29:59    at org.apache.flink.runtime.checkpoint.channel.SequentialChannelStateReaderImpl.readSequentially(SequentialChannelStateReaderImpl.java:107)
Apr 30 14:29:59    at org.apache.flink.runtime.checkpoint.channel.SequentialChannelStateReaderImpl.read(SequentialChannelStateReaderImpl.java:93)
Apr 30 14:29:59    at org.apache.flink.runtime.checkpoint.channel.SequentialChannelStateReaderImpl.readOutputData(SequentialChannelStateReaderImpl.java:79)
Apr 30 14:29:59    at org.apache.flink.streaming.runtime.tasks.StreamTask.restoreGates(StreamTask.java:571)
Apr 30 14:29:59    at org.apache.flink.streaming.runtime.tasks.StreamTaskActionExecutor$1.call(StreamTaskActionExecutor.java:55)
Apr 30 14:29:59    at org.apache.flink.streaming.runtime.tasks.StreamTask.restore(StreamTask.java:554)
Apr 30 14:29:59    at org.apache.flink.runtime.taskmanager.Task.doRun(Task.java:757)
Apr 30 14:29:59    at org.apache.flink.runtime.taskmanager.Task.run(Task.java:564)
Apr 30 14:29:59    at java.lang.Thread.run(Thread.java:748)
{code}


> UnalignedCheckpointRescaleITCase fails on azure
> -----------------------------------------------
>
>                 Key: FLINK-22173
>                 URL: https://issues.apache.org/jira/browse/FLINK-22173
>             Project: Flink
>          Issue Type: Bug
>          Components: Runtime / Checkpointing
>    Affects Versions: 1.13.0
>            Reporter: Dawid Wysakowicz
>            Assignee: Arvid Heise
>            Priority: Critical
>              Labels: test-stability
>             Fix For: 1.13.0
>
>
> https://dev.azure.com/apache-flink/apache-flink/_build/results?buildId=16232&view=logs&j=d8d26c26-7ec2-5ed2-772e-7a1a1eb8317c&t=be5fb08e-1ad7-563c-4f1a-a97ad4ce4865&l=9628
> {code}
> 2021-04-08T23:25:56.3131361Z [ERROR] Tests run: 31, Failures: 0, Errors: 1, 
> Skipped: 0, Time elapsed: 839.623 s <<< FAILURE! - in 
> org.apache.flink.test.checkpointing.UnalignedCheckpointRescaleITCase
> 2021-04-08T23:25:56.3132784Z [ERROR] shouldRescaleUnalignedCheckpoint[no 
> scale union from 7 to 
> 7](org.apache.flink.test.checkpointing.UnalignedCheckpointRescaleITCase)  
> Time elapsed: 607.467 s  <<< ERROR!
> 2021-04-08T23:25:56.3133586Z 
> org.apache.flink.runtime.client.JobExecutionException: Job execution failed.
> 2021-04-08T23:25:56.3134070Z  at 
> org.apache.flink.runtime.jobmaster.JobResult.toJobExecutionResult(JobResult.java:144)
> 2021-04-08T23:25:56.3134643Z  at 
> org.apache.flink.test.checkpointing.UnalignedCheckpointTestBase.execute(UnalignedCheckpointTestBase.java:168)
> 2021-04-08T23:25:56.3135577Z  at 
> org.apache.flink.test.checkpointing.UnalignedCheckpointRescaleITCase.shouldRescaleUnalignedCheckpoint(UnalignedCheckpointRescaleITCase.java:368)
> 2021-04-08T23:25:56.3138843Z  at 
> sun.reflect.GeneratedMethodAccessor93.invoke(Unknown Source)
> 2021-04-08T23:25:56.3139402Z  at 
> sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
> 2021-04-08T23:25:56.3139880Z  at 
> java.lang.reflect.Method.invoke(Method.java:498)
> 2021-04-08T23:25:56.3140328Z  at 
> org.junit.runners.model.FrameworkMethod$1.runReflectiveCall(FrameworkMethod.java:50)
> 2021-04-08T23:25:56.3140844Z  at 
> org.junit.internal.runners.model.ReflectiveCallable.run(ReflectiveCallable.java:12)
> 2021-04-08T23:25:56.3141768Z  at 
> org.junit.runners.model.FrameworkMethod.invokeExplosively(FrameworkMethod.java:47)
> 2021-04-08T23:25:56.3142272Z  at 
> org.junit.internal.runners.statements.InvokeMethod.evaluate(InvokeMethod.java:17)
> 2021-04-08T23:25:56.3142706Z  at 
> org.junit.rules.Verifier$1.evaluate(Verifier.java:35)
> 2021-04-08T23:25:56.3143142Z  at 
> org.junit.rules.ExternalResource$1.evaluate(ExternalResource.java:48)
> 2021-04-08T23:25:56.3143608Z  at 
> org.apache.flink.util.TestNameProvider$1.evaluate(TestNameProvider.java:45)
> 2021-04-08T23:25:56.3144039Z  at 
> org.junit.rules.TestWatcher$1.evaluate(TestWatcher.java:55)
> 2021-04-08T23:25:56.3144434Z  at 
> org.junit.rules.RunRules.evaluate(RunRules.java:20)
> 2021-04-08T23:25:56.3145027Z  at 
> org.junit.runners.ParentRunner.runLeaf(ParentRunner.java:325)
> 2021-04-08T23:25:56.3145484Z  at 
> org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:78)
> 2021-04-08T23:25:56.3145981Z  at 
> org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:57)
> 2021-04-08T23:25:56.3146421Z  at 
> org.junit.runners.ParentRunner$3.run(ParentRunner.java:290)
> 2021-04-08T23:25:56.3146843Z  at 
> org.junit.runners.ParentRunner$1.schedule(ParentRunner.java:71)
> 2021-04-08T23:25:56.3147274Z  at 
> org.junit.runners.ParentRunner.runChildren(ParentRunner.java:288)
> 2021-04-08T23:25:56.3147692Z  at 
> org.junit.runners.ParentRunner.access$000(ParentRunner.java:58)
> 2021-04-08T23:25:56.3148116Z  at 
> org.junit.runners.ParentRunner$2.evaluate(ParentRunner.java:268)
> 2021-04-08T23:25:56.3148543Z  at 
> org.junit.runners.ParentRunner.run(ParentRunner.java:363)
> 2021-04-08T23:25:56.3148930Z  at 
> org.junit.runners.Suite.runChild(Suite.java:128)
> 2021-04-08T23:25:56.3149298Z  at 
> org.junit.runners.Suite.runChild(Suite.java:27)
> 2021-04-08T23:25:56.3149663Z  at 
> org.junit.runners.ParentRunner$3.run(ParentRunner.java:290)
> 2021-04-08T23:25:56.3150075Z  at 
> org.junit.runners.ParentRunner$1.schedule(ParentRunner.java:71)
> 2021-04-08T23:25:56.3150488Z  at 
> org.junit.runners.ParentRunner.runChildren(ParentRunner.java:288)
> 2021-04-08T23:25:56.3151148Z  at 
> org.junit.runners.ParentRunner.access$000(ParentRunner.java:58)
> 2021-04-08T23:25:56.3151691Z  at 
> org.junit.runners.ParentRunner$2.evaluate(ParentRunner.java:268)
> 2021-04-08T23:25:56.3152115Z  at 
> org.junit.rules.ExternalResource$1.evaluate(ExternalResource.java:48)
> 2021-04-08T23:25:56.3152534Z  at 
> org.junit.rules.RunRules.evaluate(RunRules.java:20)
> 2021-04-08T23:25:56.3152919Z  at 
> org.junit.runners.ParentRunner.run(ParentRunner.java:363)
> 2021-04-08T23:25:56.3153349Z  at 
> org.apache.maven.surefire.junit4.JUnit4Provider.execute(JUnit4Provider.java:365)
> 2021-04-08T23:25:56.3154029Z  at 
> org.apache.maven.surefire.junit4.JUnit4Provider.executeWithRerun(JUnit4Provider.java:273)
> 2021-04-08T23:25:56.3154670Z  at 
> org.apache.maven.surefire.junit4.JUnit4Provider.executeTestSet(JUnit4Provider.java:238)
> 2021-04-08T23:25:56.3155183Z  at 
> org.apache.maven.surefire.junit4.JUnit4Provider.invoke(JUnit4Provider.java:159)
> 2021-04-08T23:25:56.3155715Z  at 
> org.apache.maven.surefire.booter.ForkedBooter.invokeProviderInSameClassLoader(ForkedBooter.java:384)
> 2021-04-08T23:25:56.3156250Z  at 
> org.apache.maven.surefire.booter.ForkedBooter.runSuitesInProcess(ForkedBooter.java:345)
> 2021-04-08T23:25:56.3156749Z  at 
> org.apache.maven.surefire.booter.ForkedBooter.execute(ForkedBooter.java:126)
> 2021-04-08T23:25:56.3157343Z  at 
> org.apache.maven.surefire.booter.ForkedBooter.main(ForkedBooter.java:418)
> 2021-04-08T23:25:56.3157928Z Caused by: 
> org.apache.flink.runtime.JobException: Recovery is suppressed by 
> FixedDelayRestartBackoffTimeStrategy(maxNumberRestartAttempts=1, 
> backoffTimeMS=100)
> 2021-04-08T23:25:56.3158627Z  at 
> org.apache.flink.runtime.executiongraph.failover.flip1.ExecutionFailureHandler.handleFailure(ExecutionFailureHandler.java:138)
> 2021-04-08T23:25:56.3159356Z  at 
> org.apache.flink.runtime.executiongraph.failover.flip1.ExecutionFailureHandler.getGlobalFailureHandlingResult(ExecutionFailureHandler.java:101)
> 2021-04-08T23:25:56.3160053Z  at 
> org.apache.flink.runtime.scheduler.DefaultScheduler.handleGlobalFailure(DefaultScheduler.java:227)
> 2021-04-08T23:25:56.3160721Z  at 
> org.apache.flink.runtime.scheduler.UpdateSchedulerNgOnInternalFailuresListener.notifyGlobalFailure(UpdateSchedulerNgOnInternalFailuresListener.java:57)
> 2021-04-08T23:25:56.3161721Z  at 
> org.apache.flink.runtime.executiongraph.DefaultExecutionGraph.failGlobal(DefaultExecutionGraph.java:973)
> 2021-04-08T23:25:56.3162331Z  at 
> org.apache.flink.runtime.executiongraph.DefaultExecutionGraph$1.lambda$failJob$0(DefaultExecutionGraph.java:412)
> 2021-04-08T23:25:56.3162910Z  at 
> org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRunAsync(AkkaRpcActor.java:440)
> 2021-04-08T23:25:56.3163435Z  at 
> org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRpcMessage(AkkaRpcActor.java:208)
> 2021-04-08T23:25:56.3164057Z  at 
> org.apache.flink.runtime.rpc.akka.FencedAkkaRpcActor.handleRpcMessage(FencedAkkaRpcActor.java:77)
> 2021-04-08T23:25:56.3164599Z  at 
> org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleMessage(AkkaRpcActor.java:158)
> 2021-04-08T23:25:56.3165052Z  at 
> akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:26)
> 2021-04-08T23:25:56.3165471Z  at 
> akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:21)
> 2021-04-08T23:25:56.3165895Z  at 
> scala.PartialFunction$class.applyOrElse(PartialFunction.scala:123)
> 2021-04-08T23:25:56.3166322Z  at 
> akka.japi.pf.UnitCaseStatement.applyOrElse(CaseStatements.scala:21)
> 2021-04-08T23:25:56.3166767Z  at 
> scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:170)
> 2021-04-08T23:25:56.3167194Z  at 
> scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:171)
> 2021-04-08T23:25:56.3167636Z  at 
> scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:171)
> 2021-04-08T23:25:56.3168056Z  at 
> akka.actor.Actor$class.aroundReceive(Actor.scala:517)
> 2021-04-08T23:25:56.3168447Z  at 
> akka.actor.AbstractActor.aroundReceive(AbstractActor.scala:225)
> 2021-04-08T23:25:56.3168867Z  at 
> akka.actor.ActorCell.receiveMessage(ActorCell.scala:592)
> 2021-04-08T23:25:56.3169254Z  at 
> akka.actor.ActorCell.invoke(ActorCell.scala:561)
> 2021-04-08T23:25:56.3169616Z  at 
> akka.dispatch.Mailbox.processMailbox(Mailbox.scala:258)
> 2021-04-08T23:25:56.3169991Z  at akka.dispatch.Mailbox.run(Mailbox.scala:225)
> 2021-04-08T23:25:56.3170328Z  at akka.dispatch.Mailbox.exec(Mailbox.scala:235)
> 2021-04-08T23:25:56.3170716Z  at 
> akka.dispatch.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
> 2021-04-08T23:25:56.3171420Z  at 
> akka.dispatch.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
> 2021-04-08T23:25:56.3171886Z  at 
> akka.dispatch.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
> 2021-04-08T23:25:56.3172349Z  at 
> akka.dispatch.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
> 2021-04-08T23:25:56.3172851Z Caused by: 
> org.apache.flink.util.FlinkRuntimeException: Exceeded checkpoint tolerable 
> failure threshold.
> 2021-04-08T23:25:56.3173453Z  at 
> org.apache.flink.runtime.checkpoint.CheckpointFailureManager.handleCheckpointException(CheckpointFailureManager.java:98)
> 2021-04-08T23:25:56.3174129Z  at 
> org.apache.flink.runtime.checkpoint.CheckpointFailureManager.handleJobLevelCheckpointException(CheckpointFailureManager.java:67)
> 2021-04-08T23:25:56.3174771Z  at 
> org.apache.flink.runtime.checkpoint.CheckpointCoordinator.abortPendingCheckpoint(CheckpointCoordinator.java:1935)
> 2021-04-08T23:25:56.3175481Z  at 
> org.apache.flink.runtime.checkpoint.CheckpointCoordinator.abortPendingCheckpoint(CheckpointCoordinator.java:1907)
> 2021-04-08T23:25:56.3176078Z  at 
> org.apache.flink.runtime.checkpoint.CheckpointCoordinator.access$600(CheckpointCoordinator.java:95)
> 2021-04-08T23:25:56.3176658Z  at 
> org.apache.flink.runtime.checkpoint.CheckpointCoordinator$CheckpointCanceller.run(CheckpointCoordinator.java:1991)
> 2021-04-08T23:25:56.3177209Z  at 
> java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
> 2021-04-08T23:25:56.3177627Z  at 
> java.util.concurrent.FutureTask.run(FutureTask.java:266)
> 2021-04-08T23:25:56.3178141Z  at 
> java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$201(ScheduledThreadPoolExecutor.java:180)
> 2021-04-08T23:25:56.3178766Z  at 
> java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:293)
> 2021-04-08T23:25:56.3179308Z  at 
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
> 2021-04-08T23:25:56.3179799Z  at 
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
> 2021-04-08T23:25:56.3180204Z  at java.lang.Thread.run(Thread.java:748)
> {code}



--
This message was sent by Atlassian Jira
(v8.3.4#803005)

Reply via email to