[
https://issues.apache.org/jira/browse/FLINK-31138?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17691725#comment-17691725
]
Roman Khachatryan commented on FLINK-31138:
-------------------------------------------
Thanks [~mapohl] and [~fanrui],
In the artifacts
(logs-cron_azure-test_cron_azure_finegrained_resource_management-1676692614/mvn-1.log),
I've found the same problem with PartiallyFinishedSourcesITCase at 04:46 as in
FLINK-31133: checkpoint failure -> running for too long -> no space left on
device.
So I'd close this ticket as a duplicate.
However, there are some strange exceptions before that:
[https://dev.azure.com/apache-flink/apache-flink/_build/results?buildId=46283&view=logs&j=4d4a0d10-fca2-5507-8eed-c07f0bdf4887&t=7b25afdf-cc6c-566f-5459-359dc2585798&l=5584]
{code:java}
Feb 18 03:58:47 [INFO] Running
org.apache.flink.core.memory.OffHeapUnsafeMemorySegmentTest
Exception in thread "Thread-13" java.lang.IllegalStateException: MemorySegment
can be freed only once!
at org.apache.flink.core.memory.MemorySegment.free(MemorySegment.java:244)
at java.lang.Thread.run(Thread.java:748)
Exception in thread "Thread-15" java.lang.IllegalStateException: MemorySegment
can be freed only once!
at org.apache.flink.core.memory.MemorySegment.free(MemorySegment.java:244)
at java.lang.Thread.run(Thread.java:748)
Exception in thread "Thread-17" java.lang.IllegalStateException: MemorySegment
can be freed only once!
at org.apache.flink.core.memory.MemorySegment.free(MemorySegment.java:244)
at java.lang.Thread.run(Thread.java:748){code}
Starting at 04:08:
{code:java}
04:32:18,352 [flink-akka.actor.default-dispatcher-8] INFO
org.apache.flink.runtime.dispatcher.StandaloneDispatcher [] - Job
f1ec611893996fc2fc1830697195194b reached terminal state FAILED.
org.apache.flink.runtime.JobException: Recovery is suppressed by
NoRestartBackoffTimeStrategy
at
org.apache.flink.runtime.executiongraph.failover.flip1.ExecutionFailureHandler.handleFailure(ExecutionFailureHandler.java:138)
at
org.apache.flink.runtime.executiongraph.failover.flip1.ExecutionFailureHandler.getFailureHandlingResult(ExecutionFailureHandler.java:82)
at
org.apache.flink.runtime.scheduler.DefaultScheduler.handleTaskFailure(DefaultScheduler.java:301)
at
org.apache.flink.runtime.scheduler.DefaultScheduler.maybeHandleTaskFailure(DefaultScheduler.java:291)
at
org.apache.flink.runtime.scheduler.DefaultScheduler.updateTaskExecutionStateInternal(DefaultScheduler.java:282)
at
org.apache.flink.runtime.scheduler.SchedulerBase.updateTaskExecutionState(SchedulerBase.java:739)
at
org.apache.flink.runtime.scheduler.SchedulerNG.updateTaskExecutionState(SchedulerNG.java:78)
at
org.apache.flink.runtime.jobmaster.JobMaster.updateTaskExecutionState(JobMaster.java:443)
at sun.reflect.GeneratedMethodAccessor11.invoke(Unknown Source)
at
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at
org.apache.flink.runtime.rpc.akka.AkkaRpcActor.lambda$handleRpcInvocation$1(AkkaRpcActor.java:304)
at
org.apache.flink.runtime.concurrent.akka.ClassLoadingUtils.runWithContextClassLoader(ClassLoadingUtils.java:83)
at
org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRpcInvocation(AkkaRpcActor.java:302)
at
org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRpcMessage(AkkaRpcActor.java:217)
at
org.apache.flink.runtime.rpc.akka.FencedAkkaRpcActor.handleRpcMessage(FencedAkkaRpcActor.java:78)
at
org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleMessage(AkkaRpcActor.java:163)
at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:24)
at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:20)
at scala.PartialFunction.applyOrElse(PartialFunction.scala:123)
at scala.PartialFunction.applyOrElse$(PartialFunction.scala:122)
at akka.japi.pf.UnitCaseStatement.applyOrElse(CaseStatements.scala:20)
at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:171)
at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:172)
at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:172)
at akka.actor.Actor.aroundReceive(Actor.scala:537)
at akka.actor.Actor.aroundReceive$(Actor.scala:535)
at akka.actor.AbstractActor.aroundReceive(AbstractActor.scala:220)
at akka.actor.ActorCell.receiveMessage(ActorCell.scala:580)
at akka.actor.ActorCell.invoke(ActorCell.scala:548)
at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:270)
at akka.dispatch.Mailbox.run(Mailbox.scala:231)
at akka.dispatch.Mailbox.exec(Mailbox.scala:243)
at java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:289)
at
java.util.concurrent.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1056)
at java.util.concurrent.ForkJoinPool.runWorker(ForkJoinPool.java:1692)
at
java.util.concurrent.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:175)
Caused by: java.lang.ArrayIndexOutOfBoundsException: 2
at
org.apache.flink.runtime.taskmanager.RuntimeEnvironment.getInputGate(RuntimeEnvironment.java:274)
at
org.apache.flink.runtime.jobmanager.Tasks.consumeInputs(Tasks.java:111)
at org.apache.flink.runtime.jobmanager.Tasks.access$000(Tasks.java:30)
at
org.apache.flink.runtime.jobmanager.Tasks$AgnosticTertiaryReceiver.invoke(Tasks.java:102)
at
org.apache.flink.runtime.taskmanager.Task.runWithSystemExitMonitoring(Task.java:948)
at
org.apache.flink.runtime.taskmanager.Task.restoreAndInvoke(Task.java:927)
at org.apache.flink.runtime.taskmanager.Task.doRun(Task.java:741)
at org.apache.flink.runtime.taskmanager.Task.run(Task.java:563)
{code}
{code:java}
05:45:35,489 [CHAIN DataSource (at
testIncorrectSerializer2(CustomSerializationITCase.java:104)
(org.apache.flink.api.java.io.ParallelIteratorInputFormat)) -> Map (Map at
testIncorrectSerializer2(CustomSerializationITCase.java:105)) (5/5)#0] INFO
org.apache.flink.runtime.taskmanager.Task [] - Freeing task
resources for CHAIN DataSource (at
testIncorrectSerializer2(CustomSerializationITCase.java:104)
(org.apache.flink.api.java.io.ParallelIteratorInputFormat)) -> Map (Map at
testIncorrectSerializer2(CustomSerializationITCase.java:105)) (5/5)#0
(2288df9d2da462bd61d2000e88d726ab).
05:45:35,490 [ Partition (4/5)#0] ERROR
org.apache.flink.runtime.operators.BatchTask [] - Error in task
code: Partition (4/5)
java.io.IOException: Serializer consumed more bytes than the record had. This
indicates broken serialization. If you are using custom serialization types
(Value or Writable), check their serialization methods. If you are using a
Kryo-serialized type, check the corresponding Kryo serializer.
at
org.apache.flink.runtime.io.network.api.serialization.NonSpanningWrapper.readInto(NonSpanningWrapper.java:339)
~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT]
at
org.apache.flink.runtime.io.network.api.serialization.SpillingAdaptiveSpanningRecordDeserializer.readNonSpanningRecord(SpillingAdaptiveSpanningRecordDeserializer.java:128)
~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT]
at
org.apache.flink.runtime.io.network.api.serialization.SpillingAdaptiveSpanningRecordDeserializer.readNextRecord(SpillingAdaptiveSpanningRecordDeserializer.java:103)
~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT]
at
org.apache.flink.runtime.io.network.api.serialization.SpillingAdaptiveSpanningRecordDeserializer.getNextRecord(SpillingAdaptiveSpanningRecordDeserializer.java:93)
~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT]
at
org.apache.flink.runtime.io.network.api.reader.AbstractRecordReader.getNextRecord(AbstractRecordReader.java:118)
~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT]
at
org.apache.flink.runtime.io.network.api.reader.MutableRecordReader.next(MutableRecordReader.java:48)
~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT]
at
org.apache.flink.runtime.operators.util.ReaderIterator.next(ReaderIterator.java:73)
~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT]
at
org.apache.flink.runtime.operators.NoOpDriver.run(NoOpDriver.java:100)
~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT]
at org.apache.flink.runtime.operators.BatchTask.run(BatchTask.java:514)
~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT]
at
org.apache.flink.runtime.operators.BatchTask.invoke(BatchTask.java:357)
~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT]
at
org.apache.flink.runtime.taskmanager.Task.runWithSystemExitMonitoring(Task.java:948)
[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT]
at
org.apache.flink.runtime.taskmanager.Task.restoreAndInvoke(Task.java:927)
[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT]
at org.apache.flink.runtime.taskmanager.Task.doRun(Task.java:741)
[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT]
at org.apache.flink.runtime.taskmanager.Task.run(Task.java:563)
[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT]
at java.lang.Thread.run(Thread.java:748) [?:1.8.0_292]
Caused by: java.lang.IndexOutOfBoundsException: pos: 140625469934036, length:
32941, index: 4, offset: 0
at
org.apache.flink.core.memory.MemorySegment.get(MemorySegment.java:453)
~[flink-core-1.15-SNAPSHOT.jar:1.15-SNAPSHOT]
at
org.apache.flink.runtime.io.network.api.serialization.NonSpanningWrapper.readFully(NonSpanningWrapper.java:101)
~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT]
at
org.apache.flink.runtime.io.network.api.serialization.NonSpanningWrapper.readFully(NonSpanningWrapper.java:92)
~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT]
at
org.apache.flink.test.misc.CustomSerializationITCase$ConsumesTooMuchSpanning.read(CustomSerializationITCase.java:214)
~[test-classes/:?]
at
org.apache.flink.api.java.typeutils.runtime.ValueSerializer.deserialize(ValueSerializer.java:123)
~[flink-core-1.15-SNAPSHOT.jar:1.15-SNAPSHOT]
at
org.apache.flink.api.java.typeutils.runtime.ValueSerializer.deserialize(ValueSerializer.java:118)
~[flink-core-1.15-SNAPSHOT.jar:1.15-SNAPSHOT]
at
org.apache.flink.api.java.typeutils.runtime.ValueSerializer.deserialize(ValueSerializer.java:46)
~[flink-core-1.15-SNAPSHOT.jar:1.15-SNAPSHOT]
at
org.apache.flink.runtime.plugable.NonReusingDeserializationDelegate.read(NonReusingDeserializationDelegate.java:53)
~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT]
at
org.apache.flink.runtime.io.network.api.serialization.NonSpanningWrapper.readInto(NonSpanningWrapper.java:337)
~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT]
... 14 more{code}
cc: [~pnowojski]
> Either StreamCheckpointingITCase/StreamFaultToleranceTestBase or
> EventTimeWindowCheckpointingITCase are timing out
> -------------------------------------------------------------------------------------------------------------------
>
> Key: FLINK-31138
> URL: https://issues.apache.org/jira/browse/FLINK-31138
> Project: Flink
> Issue Type: Bug
> Components: Runtime / Checkpointing, Runtime / State Backends
> Affects Versions: 1.15.3
> Reporter: Matthias Pohl
> Priority: Critical
> Labels: test-stability
> Attachments:
> logs-cron_azure-test_cron_azure_finegrained_resource_management-1676692614.zip
>
>
> https://dev.azure.com/apache-flink/apache-flink/_build/results?buildId=46283&view=logs&j=4d4a0d10-fca2-5507-8eed-c07f0bdf4887&t=7b25afdf-cc6c-566f-5459-359dc2585798
> This build timed out. The stacktraces revealed multiple tests that might have
> caused this:
> * {{StreamCheckpointingITCase}} through
> {{StreamFaultToleranceTestBase.runCheckpointedProgram}}
> {code}
> [...]
> 2023-02-18T07:37:47.6861582Z - locked <0x0000000083c85250> (a
> java.lang.Object)
> 2023-02-18T07:37:47.6862179Z at
> org.apache.flink.streaming.api.operators.StreamSourceContexts$SwitchingOnClose.collect(StreamSourceContexts.java:103)
> 2023-02-18T07:37:47.6862981Z at
> org.apache.flink.test.checkpointing.StreamCheckpointingITCase$StringGeneratingSourceFunction.run(StreamCheckpointingITCase.java:169)
> 2023-02-18T07:37:47.6863762Z - locked <0x0000000083c85250> (a
> java.lang.Object)
> [...]
> 2023-02-18T07:37:47.7904307Z "main" #1 prio=5 os_prio=0
> tid=0x00007fca5000b800 nid=0x56636 waiting on condition [0x00007fca57c58000]
> 2023-02-18T07:37:47.7904803Z java.lang.Thread.State: WAITING (parking)
> 2023-02-18T07:37:47.7905160Z at sun.misc.Unsafe.park(Native Method)
> 2023-02-18T07:37:47.7905932Z - parking to wait for <0x0000000083c9df48>
> (a java.util.concurrent.CompletableFuture$Signaller)
> 2023-02-18T07:37:47.7906498Z at
> java.util.concurrent.locks.LockSupport.park(LockSupport.java:175)
> 2023-02-18T07:37:47.7907074Z at
> java.util.concurrent.CompletableFuture$Signaller.block(CompletableFuture.java:1707)
> 2023-02-18T07:37:47.7907764Z at
> java.util.concurrent.ForkJoinPool.managedBlock(ForkJoinPool.java:3323)
> 2023-02-18T07:37:47.7908457Z at
> java.util.concurrent.CompletableFuture.waitingGet(CompletableFuture.java:1742)
> 2023-02-18T07:37:47.7909019Z at
> java.util.concurrent.CompletableFuture.get(CompletableFuture.java:1908)
> 2023-02-18T07:37:47.7909605Z at
> org.apache.flink.test.util.TestUtils.submitJobAndWaitForResult(TestUtils.java:93)
> 2023-02-18T07:37:47.7910413Z at
> org.apache.flink.test.checkpointing.StreamFaultToleranceTestBase.runCheckpointedProgram(StreamFaultToleranceTestBase.java:134)
> [...]
> {code}
> or {{LocalRecoveryITCase}}/{{EventTimeWindowCheckpointingITCase}}:
> {code}
> [...]
> 2023-02-18T07:37:51.6744983Z "main" #1 prio=5 os_prio=0
> tid=0x00007efc4000b800 nid=0x5645a waiting on condition [0x00007efc49b4e000]
> 2023-02-18T07:37:51.6745471Z java.lang.Thread.State: WAITING (parking)
> 2023-02-18T07:37:51.6745823Z at sun.misc.Unsafe.park(Native Method)
> 2023-02-18T07:37:51.6746482Z - parking to wait for <0x000000008718cce8>
> (a java.util.concurrent.CompletableFuture$Signaller)
> 2023-02-18T07:37:51.6747147Z at
> java.util.concurrent.locks.LockSupport.park(LockSupport.java:175)
> 2023-02-18T07:37:51.6747725Z at
> java.util.concurrent.CompletableFuture$Signaller.block(CompletableFuture.java:1707)
> 2023-02-18T07:37:51.6748313Z at
> java.util.concurrent.ForkJoinPool.managedBlock(ForkJoinPool.java:3323)
> 2023-02-18T07:37:51.6748892Z at
> java.util.concurrent.CompletableFuture.waitingGet(CompletableFuture.java:1742)
> 2023-02-18T07:37:51.6749457Z at
> java.util.concurrent.CompletableFuture.get(CompletableFuture.java:1908)
> 2023-02-18T07:37:51.6750118Z at
> org.apache.flink.streaming.api.environment.StreamExecutionEnvironment.execute(StreamExecutionEnvironment.java:1989)
> 2023-02-18T07:37:51.6750881Z at
> org.apache.flink.streaming.api.environment.StreamExecutionEnvironment.execute(StreamExecutionEnvironment.java:1969)
> 2023-02-18T07:37:51.6751694Z at
> org.apache.flink.test.checkpointing.EventTimeWindowCheckpointingITCase.testSlidingTimeWindow(EventTimeWindowCheckpointingITCase.java:524)
> 2023-02-18T07:37:51.6752476Z at
> org.apache.flink.test.checkpointing.LocalRecoveryITCase.executeTest(LocalRecoveryITCase.java:84)
> 2023-02-18T07:37:51.6753157Z at
> org.apache.flink.test.checkpointing.LocalRecoveryITCase.executeTest(LocalRecoveryITCase.java:66)
> 2023-02-18T07:37:51.6753727Z at
> sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
> [...]
> {code}
--
This message was sent by Atlassian Jira
(v8.20.10#820010)