It can also fail with the following message:

Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 133 in stage 33.1 failed 4 times, most recent failure: Lost task 133.3 in stage 33.1 (TID 172737, ip-10-0-25-2.ec2.internal): java.io.IOException: Failed to connect to ip-10-0-25-2.ec2.internal/10.0.25.2:48048
    at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:193)
    at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:156)
    at org.apache.spark.network.netty.NettyBlockTransferService$$anon$1.createAndStart(NettyBlockTransferService.scala:88)
    at org.apache.spark.network.shuffle.RetryingBlockFetcher.fetchAllOutstanding(RetryingBlockFetcher.java:140)
    at org.apache.spark.network.shuffle.RetryingBlockFetcher.access$200(RetryingBlockFetcher.java:43)
    at org.apache.spark.network.shuffle.RetryingBlockFetcher$1.run(RetryingBlockFetcher.java:170)
    at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:471)
    at java.util.concurrent.FutureTask.run(FutureTask.java:262)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
    at java.lang.Thread.run(Thread.java:745)
Caused by: java.net.ConnectException: Connection refused: ip-10-0-25-2.ec2.internal/10.0.25.2:48048
    at sun.nio.ch.SocketChannelImpl.checkConnect(Native Method)
    at sun.nio.ch.SocketChannelImpl.finishConnect(SocketChannelImpl.java:744)
    at io.netty.channel.socket.nio.NioSocketChannel.doFinishConnect(NioSocketChannel.java:224)
    at io.netty.channel.nio.AbstractNioChannel$AbstractNioUnsafe.finishConnect(AbstractNioChannel.java:289)
    at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:528)
    at io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:468)
    at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:382)
    at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:354)
    at io.netty.util.concurrent.SingleThreadEventExecutor$2.run(SingleThreadEventExecutor.java:111)
    ... 1 more
Driver stacktrace:
    at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1283)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1271)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1270)
    at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
    at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
    at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1270)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:697)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:697)
    at scala.Option.foreach(Option.scala:236)
    at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:697)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1496)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1458)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1447)
    at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
    at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:567)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1824)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1837)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1914)
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply$mcV$sp(PairRDDFunctions.scala:1124)
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1065)
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1065)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:310)
    at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopDataset(PairRDDFunctions.scala:1065)
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply$mcV$sp(PairRDDFunctions.scala:989)
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:965)
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:965)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:310)
    at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:965)
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply$mcV$sp(PairRDDFunctions.scala:897)
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply(PairRDDFunctions.scala:897)
    at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply(PairRDDFunctions.scala:897)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:310)
    at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:896)
    at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply$mcV$sp(RDD.scala:1430)
    at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply(RDD.scala:1409)
    at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply(RDD.scala:1409)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:310)
    at org.apache.spark.rdd.RDD.saveAsTextFile(RDD.scala:1409)
    at com.radius.distiller.components.CondenseRecords.saveValidationQa(CondenseRecords.scala:65)
    at com.radius.distiller.Distiller.runCondenseRecords(Distiller.scala:49)
    at com.radius.distiller.Execute$.run(Execute.scala:56)
    at com.radius.distiller.Execute$.main(Execute.scala:33)
    at com.radius.distiller.Execute.main(Execute.scala)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:606)
    at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:674)
    at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:180)
    at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:205)
    at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:120)
    at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Caused by: java.io.IOException: Failed to connect to ip-10-0-25-2.ec2.internal/10.0.25.2:48048
    at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:193)
    at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:156)
    at org.apache.spark.network.netty.NettyBlockTransferService$$anon$1.createAndStart(NettyBlockTransferService.scala:88)
    at org.apache.spark.network.shuffle.RetryingBlockFetcher.fetchAllOutstanding(RetryingBlockFetcher.java:140)
    at org.apache.spark.network.shuffle.RetryingBlockFetcher.access$200(RetryingBlockFetcher.java:43)
    at org.apache.spark.network.shuffle.RetryingBlockFetcher$1.run(RetryingBlockFetcher.java:170)
    at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:471)
    at java.util.concurrent.FutureTask.run(FutureTask.java:262)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
    at java.lang.Thread.run(Thread.java:745)
Caused by: java.net.ConnectException: Connection refused: ip-10-0-25-2.ec2.internal/10.0.25.2:48048
    at sun.nio.ch.SocketChannelImpl.checkConnect(Native Method)
    at sun.nio.ch.SocketChannelImpl.finishConnect(SocketChannelImpl.java:744)
    at io.netty.channel.socket.nio.NioSocketChannel.doFinishConnect(NioSocketChannel.java:224)
    at io.netty.channel.nio.AbstractNioChannel$AbstractNioUnsafe.finishConnect(AbstractNioChannel.java:289)
    at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:528)
    at io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:468)
    at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:382)
    at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:354)
    at io.netty.util.concurrent.SingleThreadEventExecutor$2.run(SingleThreadEventExecutor.java:111)
    ... 1 more
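A "Connection refused" from RetryingBlockFetcher usually means the executor that owned those shuffle blocks has already gone away, so the interesting error is typically earlier in that executor's own log (often an OOM or a YARN container kill). If the executor is only briefly unreachable, making the fetch side more patient can help. Here is a minimal Scala sketch using standard Spark 1.x networking keys; the values and app name are illustrative, not taken from this thread:

    import org.apache.spark.{SparkConf, SparkContext}

    // Retry shuffle fetches longer before failing the task, and give
    // slow endpoints more time before the connection is declared dead.
    val conf = new SparkConf()
      .setAppName("distiller")                  // placeholder
      .set("spark.shuffle.io.maxRetries", "10") // default 3
      .set("spark.shuffle.io.retryWait", "30s") // default 5s
      .set("spark.network.timeout", "600s")     // default 120s
    val sc = new SparkContext(conf)

These settings only buy time; if the executor was actually killed, the fix has to address the underlying cause on that node.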
On Sun, Dec 20, 2015 at 10:42 AM, Alexander Pivovarov <apivova...@gmail.com> wrote:

> I run Spark 1.5.2 on YARN (EMR).
>
> I noticed that my long-running jobs always failed after 1h 40 min (6000s)
> with the exceptions below.
>
> Then I found that Spark has spark.akka.heartbeat.pauses=6000s by default.
>
> I changed the settings to the following and it solved my issue:
>
> "spark.akka.heartbeat.pauses": "60000s",
> "spark.akka.heartbeat.interval": "10000s"
>
> ERROR ErrorMonitor - Uncaught fatal error from thread [sparkDriver-akka.remote.default-remote-dispatcher-6] shutting down ActorSystem [sparkDriver]
> java.lang.OutOfMemoryError: Java heap space
>     at com.google.protobuf.ByteString.copyFrom(ByteString.java:192)
>     at com.google.protobuf.ByteString.copyFrom(ByteString.java:204)
>     at akka.remote.MessageSerializer$.serialize(MessageSerializer.scala:36)
>     at akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843)
>     at akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843)
>     at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)
>     at akka.remote.EndpointWriter.serializeMessage(Endpoint.scala:842)
>     at akka.remote.EndpointWriter.writeSend(Endpoint.scala:743)
>     at akka.remote.EndpointWriter$$anonfun$2.applyOrElse(Endpoint.scala:718)
>     at akka.actor.Actor$class.aroundReceive(Actor.scala:467)
>     at akka.remote.EndpointActor.aroundReceive(Endpoint.scala:411)
>     at akka.actor.ActorCell.receiveMessage(ActorCell.scala:516)
>     at akka.actor.ActorCell.invoke(ActorCell.scala:487)
>     at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:238)
>     at akka.dispatch.Mailbox.run(Mailbox.scala:220)
>     at akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:397)
>     at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
>     at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
>     at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
>     at scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
>
> ERROR ActorSystemImpl - Uncaught fatal error from thread [sparkDriver-akka.remote.default-remote-dispatcher-6] shutting down ActorSystem [sparkDriver]
> java.lang.OutOfMemoryError: Java heap space
>     at com.google.protobuf.ByteString.copyFrom(ByteString.java:192)
>     at com.google.protobuf.ByteString.copyFrom(ByteString.java:204)
>     at akka.remote.MessageSerializer$.serialize(MessageSerializer.scala:36)
>     at akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843)
>     at akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843)
>     at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)
>     at akka.remote.EndpointWriter.serializeMessage(Endpoint.scala:842)
>     at akka.remote.EndpointWriter.writeSend(Endpoint.scala:743)
>     at akka.remote.EndpointWriter$$anonfun$2.applyOrElse(Endpoint.scala:718)
>     at akka.actor.Actor$class.aroundReceive(Actor.scala:467)
>     at akka.remote.EndpointActor.aroundReceive(Endpoint.scala:411)
>     at akka.actor.ActorCell.receiveMessage(ActorCell.scala:516)
>     at akka.actor.ActorCell.invoke(ActorCell.scala:487)
>     at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:238)
>     at akka.dispatch.Mailbox.run(Mailbox.scala:220)
>     at akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:397)
>     at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
>     at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
>     at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
>     at scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
>
> ERROR ActorSystemImpl - Uncaught fatal error from thread [sparkDriver-akka.remote.default-remote-dispatcher-5] shutting down ActorSystem [sparkDriver]
> java.lang.OutOfMemoryError: Java heap space
>     at java.util.Arrays.copyOf(Arrays.java:2271)
>     at java.io.ByteArrayOutputStream.grow(ByteArrayOutputStream.java:118)
>     at java.io.ByteArrayOutputStream.ensureCapacity(ByteArrayOutputStream.java:93)
>     at java.io.ByteArrayOutputStream.write(ByteArrayOutputStream.java:153)
>     at java.io.ObjectOutputStream$BlockDataOutputStream.drain(ObjectOutputStream.java:1876)
>     at java.io.ObjectOutputStream$BlockDataOutputStream.setBlockDataMode(ObjectOutputStream.java:1785)
>     at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1188)
>     at java.io.ObjectOutputStream.writeObject(ObjectOutputStream.java:347)
>     at akka.serialization.JavaSerializer$$anonfun$toBinary$1.apply$mcV$sp(Serializer.scala:129)
>     at akka.serialization.JavaSerializer$$anonfun$toBinary$1.apply(Serializer.scala:129)
>     at akka.serialization.JavaSerializer$$anonfun$toBinary$1.apply(Serializer.scala:129)
>     at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)
>     at akka.serialization.JavaSerializer.toBinary(Serializer.scala:129)
>     at akka.remote.MessageSerializer$.serialize(MessageSerializer.scala:36)
>     at akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843)
>     at akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843)
>     at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)
>     at akka.remote.EndpointWriter.serializeMessage(Endpoint.scala:842)
>     at akka.remote.EndpointWriter.writeSend(Endpoint.scala:743)
>     at akka.remote.EndpointWriter$$anonfun$2.applyOrElse(Endpoint.scala:718)
>     at akka.actor.Actor$class.aroundReceive(Actor.scala:467)
>     at akka.remote.EndpointActor.aroundReceive(Endpoint.scala:411)
>     at akka.actor.ActorCell.receiveMessage(ActorCell.scala:516)
>     at akka.actor.ActorCell.invoke(ActorCell.scala:487)
>     at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:238)
>     at akka.dispatch.Mailbox.run(Mailbox.scala:220)
>     at akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:397)
>     at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
>     at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
>     at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
>     at scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
>
> ERROR ActorSystemImpl - Uncaught fatal error from thread [sparkDriver-akka.remote.default-remote-dispatcher-6] shutting down ActorSystem [sparkDriver]
> java.lang.OutOfMemoryError: Java heap space
>     at com.google.protobuf.AbstractMessageLite.toByteArray(AbstractMessageLite.java:62)
>     at akka.remote.transport.AkkaPduProtobufCodec$.constructMessage(AkkaPduCodec.scala:138)
>     at akka.remote.EndpointWriter.writeSend(Endpoint.scala:740)
>     at akka.remote.EndpointWriter$$anonfun$2.applyOrElse(Endpoint.scala:718)
>     at akka.actor.Actor$class.aroundReceive(Actor.scala:467)
>     at akka.remote.EndpointActor.aroundReceive(Endpoint.scala:411)
>     at akka.actor.ActorCell.receiveMessage(ActorCell.scala:516)
>     at akka.actor.ActorCell.invoke(ActorCell.scala:487)
>     at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:238)
>     at akka.dispatch.Mailbox.run(Mailbox.scala:220)
>     at akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:397)
>     at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
>     at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
>     at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
>     at scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
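For reference, the two settings Alexander quotes can be passed on the command line (spark-submit --conf spark.akka.heartbeat.pauses=60000s --conf spark.akka.heartbeat.interval=10000s) or set in code. A minimal Scala sketch, assuming you build the context yourself; only the two spark.akka.* values come from his message, the app name is a placeholder:

    import org.apache.spark.{SparkConf, SparkContext}

    // Relax the Akka failure detector so a driver stalled by long GC
    // pauses is not declared dead (Spark 1.x only; the spark.akka.*
    // keys went away when Spark 2.x dropped Akka).
    val conf = new SparkConf()
      .setAppName("long-running-job")                 // placeholder
      .set("spark.akka.heartbeat.pauses", "60000s")   // default 6000s
      .set("spark.akka.heartbeat.interval", "10000s")
    val sc = new SparkContext(conf)

Note that all of the OutOfMemoryError frames above are in the driver's ActorSystem, so if the failures persist it may also be worth giving the driver more heap (spark-submit --driver-memory) rather than only relaxing the heartbeats.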