java.lang.Exception: The slot in which the task was scheduled has been killed (probably loss of TaskManager). at org.apache.flink.runtime.instance.SimpleSlot.cancel(SimpleSlot.java:98) at org.apache.flink.runtime.jobmanager.scheduler.SlotSharingGroupAssignment.releaseSimpleSlot(SlotSharingGroupAssignment.java:320) at org.apache.flink.runtime.jobmanager.scheduler.SlotSharingGroupAssignment.releaseSharedSlot(SlotSharingGroupAssignment.java:304) at org.apache.flink.runtime.instance.SharedSlot.releaseSlot(SharedSlot.java:106) at org.apache.flink.runtime.instance.Instance.markDead(Instance.java:148) at org.apache.flink.runtime.instance.InstanceManager.shutdown(InstanceManager.java:111) at org.apache.flink.runtime.jobmanager.JobManager.postStop(JobManager.scala:132) at org.apache.flink.runtime.jobmanager.JobManager$$anonfun$main$1$$anon$1.org$apache$flink$runtime$jobmanager$WithWebServer$$super$postStop(JobManager.scala:559) at org.apache.flink.runtime.jobmanager.WithWebServer$class.postStop(WithWebServer.scala:38) at org.apache.flink.runtime.jobmanager.JobManager$$anonfun$main$1$$anon$1.postStop(JobManager.scala:559)
at akka.actor.Actor$class.preRestart(Actor.scala:533)
at
org.apache.flink.runtime.jobmanager.JobManager.preRestart(JobManager.scala:80)
at akka.actor.Actor$class.aroundPreRestart(Actor.scala:480)
at
org.apache.flink.runtime.jobmanager.JobManager.aroundPreRestart(JobManager.scala:80)
at
akka.actor.dungeon.FaultHandling$class.faultRecreate(FaultHandling.scala:67)
at akka.actor.ActorCell.faultRecreate(ActorCell.scala:369)
at akka.actor.ActorCell.invokeAll$1(ActorCell.scala:459)
at akka.actor.ActorCell.systemInvoke(ActorCell.scala:478)
at akka.dispatch.Mailbox.processAllSystemMessages(Mailbox.scala:279)
at akka.dispatch.Mailbox.run(Mailbox.scala:220)
at akka.dispatch.Mailbox.exec(Mailbox.scala:231)
at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
at
scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
at
scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
at
scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
The following is an excerpt from the jobmanager log: 10:47:13,567 ERROR akka.actor.OneForOneStrategy - Received unknown message RequestArchivedJobs
java.lang.RuntimeException: Received unknown message RequestArchivedJobs at org.apache.flink.runtime.jobmanager.JobManager.unhandled(JobManager.scala:510)
at akka.actor.Actor$$anonfun$aroundReceive$1.apply(Actor.scala:465)
at akka.actor.Actor$$anonfun$aroundReceive$1.apply(Actor.scala:465)
at scala.PartialFunction$class.applyOrElse(PartialFunction.scala:118)
at
org.apache.flink.runtime.ActorLogMessages$$anon$1.applyOrElse(ActorLogMessages.scala:30)
at akka.actor.Actor$class.aroundReceive(Actor.scala:465)
at
org.apache.flink.runtime.jobmanager.JobManager.aroundReceive(JobManager.scala:80)
at akka.actor.ActorCell.receiveMessage(ActorCell.scala:516)
at akka.actor.ActorCell.invoke(ActorCell.scala:487)
at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:254)
at akka.dispatch.Mailbox.run(Mailbox.scala:221)
at akka.dispatch.Mailbox.exec(Mailbox.scala:231)
at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
at
scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
at
scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
at
scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
10:47:13,569 INFO
org.apache.flink.runtime.jobmanager.JobManager$$anonfun$main$1$$anon$1 -
Stopping webserver.
10:47:13,620 WARN
org.eclipse.jetty.util.log - /jobsInfo
org.eclipse.jetty.io.RuntimeIOException: org.eclipse.jetty.io.EofException at org.eclipse.jetty.io.UncheckedPrintWriter.setError(UncheckedPrintWriter.java:107) at org.eclipse.jetty.io.UncheckedPrintWriter.write(UncheckedPrintWriter.java:280) at org.eclipse.jetty.io.UncheckedPrintWriter.write(UncheckedPrintWriter.java:295) at org.eclipse.jetty.io.UncheckedPrintWriter.print(UncheckedPrintWriter.java:460) at org.apache.flink.runtime.jobmanager.web.JobManagerInfoServlet.doGet(JobManagerInfoServlet.java:158)
at javax.servlet.http.HttpServlet.service(HttpServlet.java:734)
at javax.servlet.http.HttpServlet.service(HttpServlet.java:847)
at
org.eclipse.jetty.servlet.ServletHolder.handle(ServletHolder.java:532)
at
org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:453)
at
org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:227)
at
org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:965)
at
org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:388)
at
org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:187)
at
org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:901)
at
org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:117)
at
org.eclipse.jetty.server.handler.HandlerList.handle(HandlerList.java:47)
at
org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:113)
at org.eclipse.jetty.server.Server.handle(Server.java:352)
at
org.eclipse.jetty.server.HttpConnection.handleRequest(HttpConnection.java:596)
at
org.eclipse.jetty.server.HttpConnection$RequestHandler.headerComplete(HttpConnection.java:1048)
at org.eclipse.jetty.http.HttpParser.parseNext(HttpParser.java:549)
at
org.eclipse.jetty.http.HttpParser.parseAvailable(HttpParser.java:211)
at
org.eclipse.jetty.server.HttpConnection.handle(HttpConnection.java:425)
at
org.eclipse.jetty.io.nio.SelectChannelEndPoint.run(SelectChannelEndPoint.java:489)
at
org.eclipse.jetty.util.thread.QueuedThreadPool$2.run(QueuedThreadPool.java:436)
at java.lang.Thread.run(Thread.java:745)
Caused by: org.eclipse.jetty.io.EofException
at org.eclipse.jetty.server.HttpOutput.write(HttpOutput.java:142)
at org.eclipse.jetty.server.HttpOutput.write(HttpOutput.java:86)
at
java.io.ByteArrayOutputStream.writeTo(ByteArrayOutputStream.java:167)
at org.eclipse.jetty.server.HttpWriter.write(HttpWriter.java:258)
at org.eclipse.jetty.server.HttpWriter.write(HttpWriter.java:107)
at
org.eclipse.jetty.io.UncheckedPrintWriter.write(UncheckedPrintWriter.java:271)
... 24 more
10:47:13,623 INFO
org.apache.flink.runtime.jobmanager.JobManager$$anonfun$main$1$$anon$1 -
Stopped webserver.
10:47:13,624 INFO
org.apache.flink.runtime.jobmanager.JobManager$$anonfun$main$1$$anon$1 -
Stopping job manager akka://flink/user/jobmanager.
10:47:13,624 INFO
org.apache.flink.runtime.jobmanager.JobManager$$anonfun$main$1$$anon$1 -
Starting job manager at akka://flink/user/jobmanager.
10:47:13,625 INFO
org.apache.flink.runtime.blob.BlobServer - Started
BLOB server on port 34038
10:47:13,625 INFO
org.apache.flink.runtime.blob.BlobServer - Created
BLOB server storage directory
/tmp/blobStore-88f5ebb0-15e2-47a6-ad56-fb2970d83ee2
10:47:13,626 INFO
org.apache.flink.runtime.jobmanager.web.WebInfoServer - Setting
up web info server, using web-root directory jar:file: ...
10:47:13,626 INFO
org.apache.flink.runtime.jobmanager.JobManager$$anonfun$main$1$$anon$1 -
Started job manager. Waiting for incoming messages.
10:47:13,626 INFO
org.apache.flink.runtime.jobmanager.web.WebInfoServer - Web info
server will display information about flink job-manager on ...
10:47:13,627 INFO
org.apache.flink.runtime.jobmanager.web.WebInfoServer - Starting
web info server for JobManager on port ...
10:47:13,627 INFO
org.eclipse.jetty.util.log -
jetty-0.9-SNAPSHOT
10:47:13,738 INFO
org.eclipse.jetty.util.log - Started
[email protected]:8082
10:47:14,032 ERROR
org.apache.flink.runtime.jobmanager.JobManager$$anonfun$main$1$$anon$1 -
Cannot find execution graph for job ID 1e0c0e5a1a7a741a1182791ea597ba7e.
10:47:14,068 ERROR
org.apache.flink.runtime.jobmanager.JobManager$$anonfun$main$1$$anon$1 -
Cannot find execution graph for job ID 1e0c0e5a1a7a741a1182791ea597ba7e.
10:47:14,069 ERROR
org.apache.flink.runtime.jobmanager.JobManager$$anonfun$main$1$$anon$1 -
Cannot find execution graph for job ID 1e0c0e5a1a7a741a1182791ea597ba7e.
10:47:14,107 ERROR
org.apache.flink.runtime.jobmanager.JobManager$$anonfun$main$1$$anon$1 -
Cannot find execution graph for job ID 1e0c0e5a1a7a741a1182791ea597ba7e.
On 05.02.2015 11:09, Till Rohrmann wrote:
I checked and indeed the scheduleOrUpdateConsumers method can throw an IllegalStateException without such an exception being properly handled at the JobManager level. It is a design decision of Scala not to complain about unhandled exceptions which are otherwise properly annotated in Java code. We should definitely pay attention in Scala to properly handle thrown exceptions of Java code. On Thu, Feb 5, 2015 at 10:42 AM, Stephan Ewen <[email protected]> wrote: I suspect that this is one of the cases where an exception in an actor causes the actor to die (here the job manager). On Thu, Feb 5, 2015 at 10:40 AM, Till Rohrmann <[email protected]> wrote: It looks to me that the TaskManager does not receive a ConsumerNotificationResult after having sent the ScheduleOrUpdateConsumers message. This can either mean that something went wrong in the ExecutionGraph.scheduleOrUpdateConsumers method or the connection was disassociated for some reason. The logs would indeed be very helpful to understand what happened. On Thu, Feb 5, 2015 at 10:36 AM, Ufuk Celebi <[email protected]> wrote: Hey Chesnay, I will look into it. Can you share the complete LOGs? – Ufuk On 04 Feb 2015, at 14:49, Chesnay Schepler <[email protected]> wrote: Hello, I'm trying to run python jobs with the latest master on a cluster and get the following exception: Error: The program execution failed: JobManager not reachable anymore. Terminate waiting for job answer. org.apache.flink.client.program.ProgramInvocationException: The program execution failed: JobManager not reachable anymore.
Terminate waiting for job answer. at org.apache.flink.client.program.Client.run(Client.java:345) at org.apache.flink.client.program.Client.run(Client.java:304) at org.apache.flink.client.program.Client.run(Client.java:298) at org.apache.flink.client.program.ContextEnvironment.execute(ContextEnvironment.java:55) at org.apache.flink.api.java.ExecutionEnvironment.execute(ExecutionEnvironment.java:677) at org.apache.flink.languagebinding.api.java.python.PythonPlanBinder.runPlan(PythonPlanBinder.java:106) at org.apache.flink.languagebinding.api.java.python.PythonPlanBinder.main(PythonPlanBinder.java:79) at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57) at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) at java.lang.reflect.Method.invoke(Method.java:606) at org.apache.flink.client.program.PackagedProgram.callMainMethod(PackagedProgram.java:437) at org.apache.flink.client.program.PackagedProgram.invokeInteractiveModeForExecution(PackagedProgram.java:353) at org.apache.flink.client.program.Client.run(Client.java:250) at org.apache.flink.client.CliFrontend.executeProgram(CliFrontend.java:387) at org.apache.flink.client.CliFrontend.run(CliFrontend.java:356) at org.apache.flink.client.CliFrontend.parseParameters(CliFrontend.java:1066) at org.apache.flink.client.CliFrontend.main(CliFrontend.java:1090) In the jobmanager log file I find this exception: java.lang.IllegalStateException: Buffer has already been recycled.
at org.apache.flink.shaded.com.google.common.base.Preconditions.checkState(Preconditions.java:176) at org.apache.flink.runtime.io.network.buffer.Buffer.ensureNotRecycled(Buffer.java:131) at org.apache.flink.runtime.io.network.buffer.Buffer.setSize(Buffer.java:95) at org.apache.flink.runtime.io.network.api.serialization.SpanningRecordSerializer.getCurrentBuffer(SpanningRecordSerializer.java:151) at org.apache.flink.runtime.io.network.api.writer.RecordWriter.clearBuffers(RecordWriter.java:158) at org.apache.flink.runtime.operators.RegularPactTask.clearWriters(RegularPactTask.java:1533) at org.apache.flink.runtime.operators.RegularPactTask.invoke(RegularPactTask.java:367) at org.apache.flink.runtime.execution.RuntimeEnvironment.run(RuntimeEnvironment.java:204) at java.lang.Thread.run(Thread.java:745) The same exception is in the task manager logs, along with the following one: java.util.concurrent.TimeoutException: Futures timed out after [100 seconds] at scala.concurrent.impl.Promise$DefaultPromise.ready(Promise.scala:219) at scala.concurrent.impl.Promise$DefaultPromise.result(Promise.scala:223) at scala.concurrent.Await$$anonfun$result$1.apply(package.scala:107) at scala.concurrent.BlockContext$DefaultBlockContext$.blockOn(BlockContext.scala:53) at scala.concurrent.Await$.result(package.scala:107)
at org.apache.flink.runtime.akka.AkkaUtils$.ask(AkkaUtils.scala:265) at org.apache.flink.runtime.akka.AkkaUtils.ask(AkkaUtils.scala) at org.apache.flink.runtime.io.network.partition.IntermediateResultPartition.scheduleOrUpdateConsumers(IntermediateResultPartition.java:247) at org.apache.flink.runtime.io.network.partition.IntermediateResultPartition.maybeNotifyConsumers(IntermediateResultPartition.java:240) at org.apache.flink.runtime.io.network.partition.IntermediateResultPartition.add(IntermediateResultPartition.java:144) at org.apache.flink.runtime.io.network.api.writer.BufferWriter.writeBuffer(BufferWriter.java:74) at org.apache.flink.runtime.io.network.api.writer.RecordWriter.emit(RecordWriter.java:91) at org.apache.flink.runtime.operators.shipping.OutputCollector.collect(OutputCollector.java:88) at org.apache.flink.languagebinding.api.java.common.streaming.Receiver.collectBuffer(Receiver.java:253) at org.apache.flink.languagebinding.api.java.common.streaming.Streamer.streamBufferWithoutGroups(Streamer.java:193) at org.apache.flink.languagebinding.api.java.python.functions.PythonMapPartition.mapPartition(PythonMapPartition.java:54) at org.apache.flink.runtime.operators.MapPartitionDriver.run(MapPartitionDriver.java:98) at org.apache.flink.runtime.operators.RegularPactTask.run(RegularPactTask.java:496) at org.apache.flink.runtime.operators.RegularPactTask.invoke(RegularPactTask.java:360) at org.apache.flink.runtime.execution.RuntimeEnvironment.run(RuntimeEnvironment.java:204) at java.lang.Thread.run(Thread.java:745)
