it can also fail with the following message

Exception in thread "main" org.apache.spark.SparkException: Job
aborted due to stage failure: Task 133 in stage 33.1 failed 4 times,
most recent failure: Lost task 133.3 in stage 33.1 (TID 172737,
ip-10-0-25-2.ec2.internal): java.io.IOException: Failed to connect to
ip-10-0-25-2.ec2.internal/10.0.25.2:48048
        at 
org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:193)
        at 
org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:156)
        at 
org.apache.spark.network.netty.NettyBlockTransferService$$anon$1.createAndStart(NettyBlockTransferService.scala:88)
        at 
org.apache.spark.network.shuffle.RetryingBlockFetcher.fetchAllOutstanding(RetryingBlockFetcher.java:140)
        at 
org.apache.spark.network.shuffle.RetryingBlockFetcher.access$200(RetryingBlockFetcher.java:43)
        at 
org.apache.spark.network.shuffle.RetryingBlockFetcher$1.run(RetryingBlockFetcher.java:170)
        at 
java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:471)
        at java.util.concurrent.FutureTask.run(FutureTask.java:262)
        at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
        at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
        at java.lang.Thread.run(Thread.java:745)
Caused by: java.net.ConnectException: Connection refused:
ip-10-0-25-2.ec2.internal/10.0.25.2:48048
        at sun.nio.ch.SocketChannelImpl.checkConnect(Native Method)
        at 
sun.nio.ch.SocketChannelImpl.finishConnect(SocketChannelImpl.java:744)
        at 
io.netty.channel.socket.nio.NioSocketChannel.doFinishConnect(NioSocketChannel.java:224)
        at 
io.netty.channel.nio.AbstractNioChannel$AbstractNioUnsafe.finishConnect(AbstractNioChannel.java:289)
        at 
io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:528)
        at 
io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:468)
        at 
io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:382)
        at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:354)
        at 
io.netty.util.concurrent.SingleThreadEventExecutor$2.run(SingleThreadEventExecutor.java:111)
        ... 1 more

Driver stacktrace:
        at 
org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1283)
        at 
org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1271)
        at 
org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1270)
        at 
scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
        at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
        at 
org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1270)
        at 
org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:697)
        at 
org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:697)
        at scala.Option.foreach(Option.scala:236)
        at 
org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:697)
        at 
org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1496)
        at 
org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1458)
        at 
org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1447)
        at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
        at 
org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:567)
        at org.apache.spark.SparkContext.runJob(SparkContext.scala:1824)
        at org.apache.spark.SparkContext.runJob(SparkContext.scala:1837)
        at org.apache.spark.SparkContext.runJob(SparkContext.scala:1914)
        at 
org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply$mcV$sp(PairRDDFunctions.scala:1124)
        at 
org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1065)
        at 
org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1065)
        at 
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
        at 
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
        at org.apache.spark.rdd.RDD.withScope(RDD.scala:310)
        at 
org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopDataset(PairRDDFunctions.scala:1065)
        at 
org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply$mcV$sp(PairRDDFunctions.scala:989)
        at 
org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:965)
        at 
org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:965)
        at 
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
        at 
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
        at org.apache.spark.rdd.RDD.withScope(RDD.scala:310)
        at 
org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:965)
        at 
org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply$mcV$sp(PairRDDFunctions.scala:897)
        at 
org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply(PairRDDFunctions.scala:897)
        at 
org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply(PairRDDFunctions.scala:897)
        at 
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
        at 
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
        at org.apache.spark.rdd.RDD.withScope(RDD.scala:310)
        at 
org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:896)
        at 
org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply$mcV$sp(RDD.scala:1430)
        at 
org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply(RDD.scala:1409)
        at 
org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply(RDD.scala:1409)
        at 
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
        at 
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
        at org.apache.spark.rdd.RDD.withScope(RDD.scala:310)
        at org.apache.spark.rdd.RDD.saveAsTextFile(RDD.scala:1409)
        at 
com.radius.distiller.components.CondenseRecords.saveValidationQa(CondenseRecords.scala:65)
        at com.radius.distiller.Distiller.runCondenseRecords(Distiller.scala:49)
        at com.radius.distiller.Execute$.run(Execute.scala:56)
        at com.radius.distiller.Execute$.main(Execute.scala:33)
        at com.radius.distiller.Execute.main(Execute.scala)
        at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
        at 
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
        at 
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
        at java.lang.reflect.Method.invoke(Method.java:606)
        at 
org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:674)
        at 
org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:180)
        at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:205)
        at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:120)
        at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Caused by: java.io.IOException: Failed to connect to
ip-10-0-25-2.ec2.internal/10.0.25.2:48048
        at 
org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:193)
        at 
org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:156)
        at 
org.apache.spark.network.netty.NettyBlockTransferService$$anon$1.createAndStart(NettyBlockTransferService.scala:88)
        at 
org.apache.spark.network.shuffle.RetryingBlockFetcher.fetchAllOutstanding(RetryingBlockFetcher.java:140)
        at 
org.apache.spark.network.shuffle.RetryingBlockFetcher.access$200(RetryingBlockFetcher.java:43)
        at 
org.apache.spark.network.shuffle.RetryingBlockFetcher$1.run(RetryingBlockFetcher.java:170)
        at 
java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:471)
        at java.util.concurrent.FutureTask.run(FutureTask.java:262)
        at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
        at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
        at java.lang.Thread.run(Thread.java:745)
Caused by: java.net.ConnectException: Connection refused:
ip-10-0-25-2.ec2.internal/10.0.25.2:48048
        at sun.nio.ch.SocketChannelImpl.checkConnect(Native Method)
        at 
sun.nio.ch.SocketChannelImpl.finishConnect(SocketChannelImpl.java:744)
        at 
io.netty.channel.socket.nio.NioSocketChannel.doFinishConnect(NioSocketChannel.java:224)
        at 
io.netty.channel.nio.AbstractNioChannel$AbstractNioUnsafe.finishConnect(AbstractNioChannel.java:289)
        at 
io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:528)
        at 
io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:468)
        at 
io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:382)
        at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:354)
        at 
io.netty.util.concurrent.SingleThreadEventExecutor$2.run(SingleThreadEventExecutor.java:111)
        ... 1 more


On Sun, Dec 20, 2015 at 10:42 AM, Alexander Pivovarov <apivova...@gmail.com>
wrote:

> I run Spark 1.5.2 on YARN (EMR)
>
> I noticed that my long running jobs always failed after 1h 40 min  (6000s)
> with the exceptions below.
>
> Then I found that Spark has spark.akka.heartbeat.pauses=6000s by default
>
> I changed the settings to the following and it solve my issue.
>
> "spark.akka.heartbeat.pauses": "60000s",
> "spark.akka.heartbeat.interval": "10000s"
>
>
>
> RROR ErrorMonitor - Uncaught fatal error from thread 
> [sparkDriver-akka.remote.default-remote-dispatcher-6] shutting down 
> ActorSystem [sparkDriver]
> java.lang.OutOfMemoryError: Java heap space
>       at com.google.protobuf.ByteString.copyFrom(ByteString.java:192)
>       at com.google.protobuf.ByteString.copyFrom(ByteString.java:204)
>       at akka.remote.MessageSerializer$.serialize(MessageSerializer.scala:36)
>       at 
> akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843)
>       at 
> akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843)
>       at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)
>       at akka.remote.EndpointWriter.serializeMessage(Endpoint.scala:842)
>       at akka.remote.EndpointWriter.writeSend(Endpoint.scala:743)
>       at akka.remote.EndpointWriter$$anonfun$2.applyOrElse(Endpoint.scala:718)
>       at akka.actor.Actor$class.aroundReceive(Actor.scala:467)
>       at akka.remote.EndpointActor.aroundReceive(Endpoint.scala:411)
>       at akka.actor.ActorCell.receiveMessage(ActorCell.scala:516)
>       at akka.actor.ActorCell.invoke(ActorCell.scala:487)
>       at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:238)
>       at akka.dispatch.Mailbox.run(Mailbox.scala:220)
>       at 
> akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:397)
>       at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
>       at 
> scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
>       at 
> scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
>       at 
> scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
> ERROR ActorSystemImpl - Uncaught fatal error from thread 
> [sparkDriver-akka.remote.default-remote-dispatcher-6] shutting down 
> ActorSystem [sparkDriver]
> java.lang.OutOfMemoryError: Java heap space
>       at com.google.protobuf.ByteString.copyFrom(ByteString.java:192)
>       at com.google.protobuf.ByteString.copyFrom(ByteString.java:204)
>       at akka.remote.MessageSerializer$.serialize(MessageSerializer.scala:36)
>       at 
> akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843)
>       at 
> akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843)
>       at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)
>       at akka.remote.EndpointWriter.serializeMessage(Endpoint.scala:842)
>       at akka.remote.EndpointWriter.writeSend(Endpoint.scala:743)
>       at akka.remote.EndpointWriter$$anonfun$2.applyOrElse(Endpoint.scala:718)
>       at akka.actor.Actor$class.aroundReceive(Actor.scala:467)
>       at akka.remote.EndpointActor.aroundReceive(Endpoint.scala:411)
>       at akka.actor.ActorCell.receiveMessage(ActorCell.scala:516)
>       at akka.actor.ActorCell.invoke(ActorCell.scala:487)
>       at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:238)
>       at akka.dispatch.Mailbox.run(Mailbox.scala:220)
>       at 
> akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:397)
>       at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
>       at 
> scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
>       at 
> scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
>       at 
> scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
> ERROR ActorSystemImpl - Uncaught fatal error from thread 
> [sparkDriver-akka.remote.default-remote-dispatcher-5] shutting down 
> ActorSystem [sparkDriver]
> java.lang.OutOfMemoryError: Java heap space
>       at java.util.Arrays.copyOf(Arrays.java:2271)
>       at java.io.ByteArrayOutputStream.grow(ByteArrayOutputStream.java:118)
>       at 
> java.io.ByteArrayOutputStream.ensureCapacity(ByteArrayOutputStream.java:93)
>       at java.io.ByteArrayOutputStream.write(ByteArrayOutputStream.java:153)
>       at 
> java.io.ObjectOutputStream$BlockDataOutputStream.drain(ObjectOutputStream.java:1876)
>       at 
> java.io.ObjectOutputStream$BlockDataOutputStream.setBlockDataMode(ObjectOutputStream.java:1785)
>       at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1188)
>       at java.io.ObjectOutputStream.writeObject(ObjectOutputStream.java:347)
>       at 
> akka.serialization.JavaSerializer$$anonfun$toBinary$1.apply$mcV$sp(Serializer.scala:129)
>       at 
> akka.serialization.JavaSerializer$$anonfun$toBinary$1.apply(Serializer.scala:129)
>       at 
> akka.serialization.JavaSerializer$$anonfun$toBinary$1.apply(Serializer.scala:129)
>       at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)
>       at akka.serialization.JavaSerializer.toBinary(Serializer.scala:129)
>       at akka.remote.MessageSerializer$.serialize(MessageSerializer.scala:36)
>       at 
> akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843)
>       at 
> akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843)
>       at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)
>       at akka.remote.EndpointWriter.serializeMessage(Endpoint.scala:842)
>       at akka.remote.EndpointWriter.writeSend(Endpoint.scala:743)
>       at akka.remote.EndpointWriter$$anonfun$2.applyOrElse(Endpoint.scala:718)
>       at akka.actor.Actor$class.aroundReceive(Actor.scala:467)
>       at akka.remote.EndpointActor.aroundReceive(Endpoint.scala:411)
>       at akka.actor.ActorCell.receiveMessage(ActorCell.scala:516)
>       at akka.actor.ActorCell.invoke(ActorCell.scala:487)
>       at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:238)
>       at akka.dispatch.Mailbox.run(Mailbox.scala:220)
>       at 
> akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:397)
>       at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
>       at 
> scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
>       at 
> scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
>       at 
> scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
> ERROR ActorSystemImpl - Uncaught fatal error from thread 
> [sparkDriver-akka.remote.default-remote-dispatcher-6] shutting down 
> ActorSystem [sparkDriver]
> java.lang.OutOfMemoryError: Java heap space
>       at 
> com.google.protobuf.AbstractMessageLite.toByteArray(AbstractMessageLite.java:62)
>       at 
> akka.remote.transport.AkkaPduProtobufCodec$.constructMessage(AkkaPduCodec.scala:138)
>       at akka.remote.EndpointWriter.writeSend(Endpoint.scala:740)
>       at akka.remote.EndpointWriter$$anonfun$2.applyOrElse(Endpoint.scala:718)
>       at akka.actor.Actor$class.aroundReceive(Actor.scala:467)
>       at akka.remote.EndpointActor.aroundReceive(Endpoint.scala:411)
>       at akka.actor.ActorCell.receiveMessage(ActorCell.scala:516)
>       at akka.actor.ActorCell.invoke(ActorCell.scala:487)
>       at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:238)
>       at akka.dispatch.Mailbox.run(Mailbox.scala:220)
>       at 
> akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:397)
>       at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
>       at 
> scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
>       at 
> scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
>
> at
> scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
>
>

Reply via email to