Would you mind copying this information into a JIRA ticket to make it easier to discover / track? Thanks!
On Sun, Dec 20, 2015 at 11:35 AM Alexander Pivovarov <apivova...@gmail.com> wrote: > Usually Spark EMR job fails with the following exception in 1 hour 40 min > - Job cancelled because SparkContext was shut down > > java.util.concurrent.RejectedExecutionException: Task > scala.concurrent.impl.CallbackRunnable@2d602a14 rejected from > java.util.concurrent.ThreadPoolExecutor@46a9e52[Terminated, pool size = 0, > active threads = 0, queued tasks = 0, completed tasks = 6294] > at > java.util.concurrent.ThreadPoolExecutor$AbortPolicy.rejectedExecution(ThreadPoolExecutor.java:2048) > at > java.util.concurrent.ThreadPoolExecutor.reject(ThreadPoolExecutor.java:821) > at > java.util.concurrent.ThreadPoolExecutor.execute(ThreadPoolExecutor.java:1372) > at > scala.concurrent.impl.ExecutionContextImpl$$anon$1.execute(ExecutionContextImpl.scala:133) > at > scala.concurrent.impl.CallbackRunnable.executeWithValue(Promise.scala:40) > at > scala.concurrent.impl.Promise$DefaultPromise.tryComplete(Promise.scala:248) > at scala.concurrent.Promise$class.complete(Promise.scala:55) > at > scala.concurrent.impl.Promise$DefaultPromise.complete(Promise.scala:153) > at scala.concurrent.Future$$anonfun$recover$1.apply(Future.scala:324) > at scala.concurrent.Future$$anonfun$recover$1.apply(Future.scala:324) > at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:32) > at > org.spark-project.guava.util.concurrent.MoreExecutors$SameThreadExecutorService.execute(MoreExecutors.java:293) > at > scala.concurrent.impl.ExecutionContextImpl$$anon$1.execute(ExecutionContextImpl.scala:133) > at > scala.concurrent.impl.CallbackRunnable.executeWithValue(Promise.scala:40) > at > scala.concurrent.impl.Promise$DefaultPromise.tryComplete(Promise.scala:248) > at scala.concurrent.Promise$class.complete(Promise.scala:55) > at > scala.concurrent.impl.Promise$DefaultPromise.complete(Promise.scala:153) > at scala.concurrent.Future$$anonfun$map$1.apply(Future.scala:235) > at scala.concurrent.Future$$anonfun$map$1.apply(Future.scala:235) > at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:32) > at > scala.concurrent.Future$InternalCallbackExecutor$Batch$$anonfun$run$1.processBatch$1(Future.scala:643) > at > scala.concurrent.Future$InternalCallbackExecutor$Batch$$anonfun$run$1.apply$mcV$sp(Future.scala:658) > at > scala.concurrent.Future$InternalCallbackExecutor$Batch$$anonfun$run$1.apply(Future.scala:635) > at > scala.concurrent.Future$InternalCallbackExecutor$Batch$$anonfun$run$1.apply(Future.scala:635) > at > scala.concurrent.BlockContext$.withBlockContext(BlockContext.scala:72) > at > scala.concurrent.Future$InternalCallbackExecutor$Batch.run(Future.scala:634) > at > scala.concurrent.Future$InternalCallbackExecutor$.scala$concurrent$Future$InternalCallbackExecutor$$unbatchedExecute(Future.scala:694) > at > scala.concurrent.Future$InternalCallbackExecutor$.execute(Future.scala:685) > at > scala.concurrent.impl.CallbackRunnable.executeWithValue(Promise.scala:40) > at > scala.concurrent.impl.Promise$DefaultPromise.tryComplete(Promise.scala:248) > at scala.concurrent.Promise$class.complete(Promise.scala:55) > at > scala.concurrent.impl.Promise$DefaultPromise.complete(Promise.scala:153) > at scala.concurrent.Future$$anonfun$flatMap$1.apply(Future.scala:249) > at scala.concurrent.Future$$anonfun$flatMap$1.apply(Future.scala:249) > at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:32) > at > org.spark-project.guava.util.concurrent.MoreExecutors$SameThreadExecutorService.execute(MoreExecutors.java:293) > at > scala.concurrent.impl.ExecutionContextImpl$$anon$1.execute(ExecutionContextImpl.scala:133) > at > scala.concurrent.impl.CallbackRunnable.executeWithValue(Promise.scala:40) > at > scala.concurrent.impl.Promise$DefaultPromise.tryComplete(Promise.scala:248) > at > akka.pattern.PromiseActorRef$$anonfun$1.apply$mcV$sp(AskSupport.scala:334) > at akka.actor.Scheduler$$anon$7.run(Scheduler.scala:117) > at > scala.concurrent.Future$InternalCallbackExecutor$.scala$concurrent$Future$InternalCallbackExecutor$$unbatchedExecute(Future.scala:694) > at > scala.concurrent.Future$InternalCallbackExecutor$.execute(Future.scala:691) > at > akka.actor.LightArrayRevolverScheduler$TaskHolder.executeTask(Scheduler.scala:467) > at > akka.actor.LightArrayRevolverScheduler$$anon$8.executeBucket$1(Scheduler.scala:419) > at > akka.actor.LightArrayRevolverScheduler$$anon$8.nextTick(Scheduler.scala:423) > at > akka.actor.LightArrayRevolverScheduler$$anon$8.run(Scheduler.scala:375) > at java.lang.Thread.run(Thread.java:745) > Exception in thread "main" org.apache.spark.SparkException: Job cancelled > because SparkContext was shut down > at > org.apache.spark.scheduler.DAGScheduler$$anonfun$cleanUpAfterSchedulerStop$1.apply(DAGScheduler.scala:703) > at > org.apache.spark.scheduler.DAGScheduler$$anonfun$cleanUpAfterSchedulerStop$1.apply(DAGScheduler.scala:702) > at scala.collection.mutable.HashSet.foreach(HashSet.scala:79) > at > org.apache.spark.scheduler.DAGScheduler.cleanUpAfterSchedulerStop(DAGScheduler.scala:702) > at > org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onStop(DAGScheduler.scala:1514) > at org.apache.spark.util.EventLoop.stop(EventLoop.scala:84) > at org.apache.spark.scheduler.DAGScheduler.stop(DAGScheduler.scala:1438) > at > org.apache.spark.SparkContext$$anonfun$stop$7.apply$mcV$sp(SparkContext.scala:1724) > at org.apache.spark.util.Utils$.tryLogNonFatalError(Utils.scala:1185) > at org.apache.spark.SparkContext.stop(SparkContext.scala:1723) > at > org.apache.spark.scheduler.cluster.YarnClientSchedulerBackend$MonitorThread.run(YarnClientSchedulerBackend.scala:146) > at > org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:567) > at org.apache.spark.SparkContext.runJob(SparkContext.scala:1824) > at org.apache.spark.SparkContext.runJob(SparkContext.scala:1944) > at org.apache.spark.rdd.RDD$$anonfun$fold$1.apply(RDD.scala:1063) > at > org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147) > at > org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108) > at org.apache.spark.rdd.RDD.withScope(RDD.scala:310) > at org.apache.spark.rdd.RDD.fold(RDD.scala:1057) > at > org.apache.spark.rdd.DoubleRDDFunctions$$anonfun$sum$1.apply$mcD$sp(DoubleRDDFunctions.scala:34) > at > org.apache.spark.rdd.DoubleRDDFunctions$$anonfun$sum$1.apply(DoubleRDDFunctions.scala:34) > at > org.apache.spark.rdd.DoubleRDDFunctions$$anonfun$sum$1.apply(DoubleRDDFunctions.scala:34) > at > org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147) > at > org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108) > at org.apache.spark.rdd.RDD.withScope(RDD.scala:310) > at > org.apache.spark.rdd.DoubleRDDFunctions.sum(DoubleRDDFunctions.scala:33) > at > com.radius.core.util.SparkUtils$.estimateNewPartitionsNum(SparkUtils.scala:41) > at com.radius.core.util.SparkUtils$.coalesceRdd(SparkUtils.scala:35) > at com.radius.distiller.Distiller.saveExtract(Distiller.scala:75) > at com.radius.distiller.Execute$.run(Execute.scala:55) > at com.radius.distiller.Execute$.main(Execute.scala:29) > at com.radius.distiller.Execute.main(Execute.scala) > at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) > at > sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57) > at > sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) > at java.lang.reflect.Method.invoke(Method.java:606) > at > org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:674) > at > org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:180) > at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:205) > at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:120) > at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala) > Command exiting with ret '1' > > > On Sun, Dec 20, 2015 at 11:29 AM, Alexander Pivovarov < > apivova...@gmail.com> wrote: > >> Or this message >> >> Exception in thread "main" org.apache.spark.SparkException: Job cancelled >> because SparkContext was shut down >> at >> org.apache.spark.scheduler.DAGScheduler$$anonfun$cleanUpAfterSchedulerStop$1.apply(DAGScheduler.scala:703) >> at >> org.apache.spark.scheduler.DAGScheduler$$anonfun$cleanUpAfterSchedulerStop$1.apply(DAGScheduler.scala:702) >> at scala.collection.mutable.HashSet.foreach(HashSet.scala:79) >> at >> org.apache.spark.scheduler.DAGScheduler.cleanUpAfterSchedulerStop(DAGScheduler.scala:702) >> at >> org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onStop(DAGScheduler.scala:1514) >> at org.apache.spark.util.EventLoop.stop(EventLoop.scala:84) >> at org.apache.spark.scheduler.DAGScheduler.stop(DAGScheduler.scala:1438) >> at >> org.apache.spark.SparkContext$$anonfun$stop$7.apply$mcV$sp(SparkContext.scala:1724) >> at org.apache.spark.util.Utils$.tryLogNonFatalError(Utils.scala:1185) >> at org.apache.spark.SparkContext.stop(SparkContext.scala:1723) >> at >> org.apache.spark.scheduler.cluster.YarnClientSchedulerBackend$MonitorThread.run(YarnClientSchedulerBackend.scala:146) >> at >> org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:567) >> at org.apache.spark.SparkContext.runJob(SparkContext.scala:1824) >> at org.apache.spark.SparkContext.runJob(SparkContext.scala:1837) >> at org.apache.spark.SparkContext.runJob(SparkContext.scala:1914) >> at >> org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply$mcV$sp(PairRDDFunctions.scala:1124) >> at >> org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1065) >> at >> org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1065) >> at >> org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147) >> at >> org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108) >> at org.apache.spark.rdd.RDD.withScope(RDD.scala:310) >> at >> org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopDataset(PairRDDFunctions.scala:1065) >> at >> org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply$mcV$sp(PairRDDFunctions.scala:989) >> at >> org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:965) >> at >> org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:965) >> at >> org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147) >> at >> org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108) >> at org.apache.spark.rdd.RDD.withScope(RDD.scala:310) >> at >> org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:965) >> at >> org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply$mcV$sp(PairRDDFunctions.scala:897) >> at >> org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply(PairRDDFunctions.scala:897) >> at >> org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply(PairRDDFunctions.scala:897) >> at >> org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147) >> at >> org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108) >> at org.apache.spark.rdd.RDD.withScope(RDD.scala:310) >> at >> org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:896) >> at >> org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply$mcV$sp(RDD.scala:1430) >> at >> org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply(RDD.scala:1409) >> at >> org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply(RDD.scala:1409) >> at >> org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147) >> at >> org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108) >> at org.apache.spark.rdd.RDD.withScope(RDD.scala:310) >> at org.apache.spark.rdd.RDD.saveAsTextFile(RDD.scala:1409) >> at >> com.radius.distiller.components.CondenseRecords.saveValidationQa(CondenseRecords.scala:65) >> at com.radius.distiller.Distiller.runCondenseRecords(Distiller.scala:49) >> at com.radius.distiller.Execute$.run(Execute.scala:56) >> at com.radius.distiller.Execute$.main(Execute.scala:33) >> at com.radius.distiller.Execute.main(Execute.scala) >> at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) >> at >> sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57) >> at >> sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) >> at java.lang.reflect.Method.invoke(Method.java:606) >> at >> org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:674) >> at >> org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:180) >> at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:205) >> at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:120) >> at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala) >> >> >> On Sun, Dec 20, 2015 at 11:28 AM, Alexander Pivovarov < >> apivova...@gmail.com> wrote: >> >>> it can also fail with the following message >>> >>> Exception in thread "main" org.apache.spark.SparkException: Job aborted due >>> to stage failure: Task 133 in stage 33.1 failed 4 times, most recent >>> failure: Lost task 133.3 in stage 33.1 (TID 172737, >>> ip-10-0-25-2.ec2.internal): java.io.IOException: Failed to connect to >>> ip-10-0-25-2.ec2.internal/10.0.25.2:48048 >>> at >>> org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:193) >>> at >>> org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:156) >>> at >>> org.apache.spark.network.netty.NettyBlockTransferService$$anon$1.createAndStart(NettyBlockTransferService.scala:88) >>> at >>> org.apache.spark.network.shuffle.RetryingBlockFetcher.fetchAllOutstanding(RetryingBlockFetcher.java:140) >>> at >>> org.apache.spark.network.shuffle.RetryingBlockFetcher.access$200(RetryingBlockFetcher.java:43) >>> at >>> org.apache.spark.network.shuffle.RetryingBlockFetcher$1.run(RetryingBlockFetcher.java:170) >>> at >>> java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:471) >>> at java.util.concurrent.FutureTask.run(FutureTask.java:262) >>> at >>> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145) >>> at >>> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615) >>> at java.lang.Thread.run(Thread.java:745) >>> Caused by: java.net.ConnectException: Connection refused: >>> ip-10-0-25-2.ec2.internal/10.0.25.2:48048 >>> at sun.nio.ch.SocketChannelImpl.checkConnect(Native Method) >>> at >>> sun.nio.ch.SocketChannelImpl.finishConnect(SocketChannelImpl.java:744) >>> at >>> io.netty.channel.socket.nio.NioSocketChannel.doFinishConnect(NioSocketChannel.java:224) >>> at >>> io.netty.channel.nio.AbstractNioChannel$AbstractNioUnsafe.finishConnect(AbstractNioChannel.java:289) >>> at >>> io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:528) >>> at >>> io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:468) >>> at >>> io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:382) >>> at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:354) >>> at >>> io.netty.util.concurrent.SingleThreadEventExecutor$2.run(SingleThreadEventExecutor.java:111) >>> ... 1 more >>> >>> Driver stacktrace: >>> at >>> org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1283) >>> at >>> org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1271) >>> at >>> org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1270) >>> at >>> scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59) >>> at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47) >>> at >>> org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1270) >>> at >>> org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:697) >>> at >>> org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:697) >>> at scala.Option.foreach(Option.scala:236) >>> at >>> org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:697) >>> at >>> org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1496) >>> at >>> org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1458) >>> at >>> org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1447) >>> at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48) >>> at >>> org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:567) >>> at org.apache.spark.SparkContext.runJob(SparkContext.scala:1824) >>> at org.apache.spark.SparkContext.runJob(SparkContext.scala:1837) >>> at org.apache.spark.SparkContext.runJob(SparkContext.scala:1914) >>> at >>> org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply$mcV$sp(PairRDDFunctions.scala:1124) >>> at >>> org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1065) >>> at >>> org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1065) >>> at >>> org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147) >>> at >>> org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108) >>> at org.apache.spark.rdd.RDD.withScope(RDD.scala:310) >>> at >>> org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopDataset(PairRDDFunctions.scala:1065) >>> at >>> org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply$mcV$sp(PairRDDFunctions.scala:989) >>> at >>> org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:965) >>> at >>> org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:965) >>> at >>> org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147) >>> at >>> org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108) >>> at org.apache.spark.rdd.RDD.withScope(RDD.scala:310) >>> at >>> org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:965) >>> at >>> org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply$mcV$sp(PairRDDFunctions.scala:897) >>> at >>> org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply(PairRDDFunctions.scala:897) >>> at >>> org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply(PairRDDFunctions.scala:897) >>> at >>> org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147) >>> at >>> org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108) >>> at org.apache.spark.rdd.RDD.withScope(RDD.scala:310) >>> at >>> org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:896) >>> at >>> org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply$mcV$sp(RDD.scala:1430) >>> at >>> org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply(RDD.scala:1409) >>> at >>> org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply(RDD.scala:1409) >>> at >>> org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147) >>> at >>> org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108) >>> at org.apache.spark.rdd.RDD.withScope(RDD.scala:310) >>> at org.apache.spark.rdd.RDD.saveAsTextFile(RDD.scala:1409) >>> at >>> com.radius.distiller.components.CondenseRecords.saveValidationQa(CondenseRecords.scala:65) >>> at com.radius.distiller.Distiller.runCondenseRecords(Distiller.scala:49) >>> at com.radius.distiller.Execute$.run(Execute.scala:56) >>> at com.radius.distiller.Execute$.main(Execute.scala:33) >>> at com.radius.distiller.Execute.main(Execute.scala) >>> at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) >>> at >>> sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57) >>> at >>> sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) >>> at java.lang.reflect.Method.invoke(Method.java:606) >>> at >>> org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:674) >>> at >>> org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:180) >>> at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:205) >>> at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:120) >>> at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala) >>> Caused by: java.io.IOException: Failed to connect to >>> ip-10-0-25-2.ec2.internal/10.0.25.2:48048 >>> at >>> org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:193) >>> at >>> org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:156) >>> at >>> org.apache.spark.network.netty.NettyBlockTransferService$$anon$1.createAndStart(NettyBlockTransferService.scala:88) >>> at >>> org.apache.spark.network.shuffle.RetryingBlockFetcher.fetchAllOutstanding(RetryingBlockFetcher.java:140) >>> at >>> org.apache.spark.network.shuffle.RetryingBlockFetcher.access$200(RetryingBlockFetcher.java:43) >>> at >>> org.apache.spark.network.shuffle.RetryingBlockFetcher$1.run(RetryingBlockFetcher.java:170) >>> at >>> java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:471) >>> at java.util.concurrent.FutureTask.run(FutureTask.java:262) >>> at >>> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145) >>> at >>> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615) >>> at java.lang.Thread.run(Thread.java:745) >>> Caused by: java.net.ConnectException: Connection refused: >>> ip-10-0-25-2.ec2.internal/10.0.25.2:48048 >>> at sun.nio.ch.SocketChannelImpl.checkConnect(Native Method) >>> at >>> sun.nio.ch.SocketChannelImpl.finishConnect(SocketChannelImpl.java:744) >>> at >>> io.netty.channel.socket.nio.NioSocketChannel.doFinishConnect(NioSocketChannel.java:224) >>> at >>> io.netty.channel.nio.AbstractNioChannel$AbstractNioUnsafe.finishConnect(AbstractNioChannel.java:289) >>> at >>> io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:528) >>> at >>> io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:468) >>> at >>> io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:382) >>> at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:354) >>> at >>> io.netty.util.concurrent.SingleThreadEventExecutor$2.run(SingleThreadEventExecutor.java:111) >>> ... 1 more >>> >>> >>> On Sun, Dec 20, 2015 at 10:42 AM, Alexander Pivovarov < >>> apivova...@gmail.com> wrote: >>> >>>> I run Spark 1.5.2 on YARN (EMR) >>>> >>>> I noticed that my long running jobs always failed after 1h 40 min >>>> (6000s) with the exceptions below. >>>> >>>> Then I found that Spark has spark.akka.heartbeat.pauses=6000s by default >>>> >>>> I changed the settings to the following and it solve my issue. >>>> >>>> "spark.akka.heartbeat.pauses": "60000s", >>>> "spark.akka.heartbeat.interval": "10000s" >>>> >>>> >>>> >>>> RROR ErrorMonitor - Uncaught fatal error from thread >>>> [sparkDriver-akka.remote.default-remote-dispatcher-6] shutting down >>>> ActorSystem [sparkDriver] >>>> java.lang.OutOfMemoryError: Java heap space >>>> at com.google.protobuf.ByteString.copyFrom(ByteString.java:192) >>>> at com.google.protobuf.ByteString.copyFrom(ByteString.java:204) >>>> at akka.remote.MessageSerializer$.serialize(MessageSerializer.scala:36) >>>> at >>>> akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843) >>>> at >>>> akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843) >>>> at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57) >>>> at akka.remote.EndpointWriter.serializeMessage(Endpoint.scala:842) >>>> at akka.remote.EndpointWriter.writeSend(Endpoint.scala:743) >>>> at akka.remote.EndpointWriter$$anonfun$2.applyOrElse(Endpoint.scala:718) >>>> at akka.actor.Actor$class.aroundReceive(Actor.scala:467) >>>> at akka.remote.EndpointActor.aroundReceive(Endpoint.scala:411) >>>> at akka.actor.ActorCell.receiveMessage(ActorCell.scala:516) >>>> at akka.actor.ActorCell.invoke(ActorCell.scala:487) >>>> at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:238) >>>> at akka.dispatch.Mailbox.run(Mailbox.scala:220) >>>> at >>>> akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:397) >>>> at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260) >>>> at >>>> scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339) >>>> at >>>> scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979) >>>> at >>>> scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107) >>>> ERROR ActorSystemImpl - Uncaught fatal error from thread >>>> [sparkDriver-akka.remote.default-remote-dispatcher-6] shutting down >>>> ActorSystem [sparkDriver] >>>> java.lang.OutOfMemoryError: Java heap space >>>> at com.google.protobuf.ByteString.copyFrom(ByteString.java:192) >>>> at com.google.protobuf.ByteString.copyFrom(ByteString.java:204) >>>> at akka.remote.MessageSerializer$.serialize(MessageSerializer.scala:36) >>>> at >>>> akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843) >>>> at >>>> akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843) >>>> at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57) >>>> at akka.remote.EndpointWriter.serializeMessage(Endpoint.scala:842) >>>> at akka.remote.EndpointWriter.writeSend(Endpoint.scala:743) >>>> at akka.remote.EndpointWriter$$anonfun$2.applyOrElse(Endpoint.scala:718) >>>> at akka.actor.Actor$class.aroundReceive(Actor.scala:467) >>>> at akka.remote.EndpointActor.aroundReceive(Endpoint.scala:411) >>>> at akka.actor.ActorCell.receiveMessage(ActorCell.scala:516) >>>> at akka.actor.ActorCell.invoke(ActorCell.scala:487) >>>> at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:238) >>>> at akka.dispatch.Mailbox.run(Mailbox.scala:220) >>>> at >>>> akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:397) >>>> at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260) >>>> at >>>> scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339) >>>> at >>>> scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979) >>>> at >>>> scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107) >>>> ERROR ActorSystemImpl - Uncaught fatal error from thread >>>> [sparkDriver-akka.remote.default-remote-dispatcher-5] shutting down >>>> ActorSystem [sparkDriver] >>>> java.lang.OutOfMemoryError: Java heap space >>>> at java.util.Arrays.copyOf(Arrays.java:2271) >>>> at java.io.ByteArrayOutputStream.grow(ByteArrayOutputStream.java:118) >>>> at >>>> java.io.ByteArrayOutputStream.ensureCapacity(ByteArrayOutputStream.java:93) >>>> at java.io.ByteArrayOutputStream.write(ByteArrayOutputStream.java:153) >>>> at >>>> java.io.ObjectOutputStream$BlockDataOutputStream.drain(ObjectOutputStream.java:1876) >>>> at >>>> java.io.ObjectOutputStream$BlockDataOutputStream.setBlockDataMode(ObjectOutputStream.java:1785) >>>> at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1188) >>>> at java.io.ObjectOutputStream.writeObject(ObjectOutputStream.java:347) >>>> at >>>> akka.serialization.JavaSerializer$$anonfun$toBinary$1.apply$mcV$sp(Serializer.scala:129) >>>> at >>>> akka.serialization.JavaSerializer$$anonfun$toBinary$1.apply(Serializer.scala:129) >>>> at >>>> akka.serialization.JavaSerializer$$anonfun$toBinary$1.apply(Serializer.scala:129) >>>> at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57) >>>> at akka.serialization.JavaSerializer.toBinary(Serializer.scala:129) >>>> at akka.remote.MessageSerializer$.serialize(MessageSerializer.scala:36) >>>> at >>>> akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843) >>>> at >>>> akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843) >>>> at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57) >>>> at akka.remote.EndpointWriter.serializeMessage(Endpoint.scala:842) >>>> at akka.remote.EndpointWriter.writeSend(Endpoint.scala:743) >>>> at akka.remote.EndpointWriter$$anonfun$2.applyOrElse(Endpoint.scala:718) >>>> at akka.actor.Actor$class.aroundReceive(Actor.scala:467) >>>> at akka.remote.EndpointActor.aroundReceive(Endpoint.scala:411) >>>> at akka.actor.ActorCell.receiveMessage(ActorCell.scala:516) >>>> at akka.actor.ActorCell.invoke(ActorCell.scala:487) >>>> at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:238) >>>> at akka.dispatch.Mailbox.run(Mailbox.scala:220) >>>> at >>>> akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:397) >>>> at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260) >>>> at >>>> scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339) >>>> at >>>> scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979) >>>> at >>>> scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107) >>>> ERROR ActorSystemImpl - Uncaught fatal error from thread >>>> [sparkDriver-akka.remote.default-remote-dispatcher-6] shutting down >>>> ActorSystem [sparkDriver] >>>> java.lang.OutOfMemoryError: Java heap space >>>> at >>>> com.google.protobuf.AbstractMessageLite.toByteArray(AbstractMessageLite.java:62) >>>> at >>>> akka.remote.transport.AkkaPduProtobufCodec$.constructMessage(AkkaPduCodec.scala:138) >>>> at akka.remote.EndpointWriter.writeSend(Endpoint.scala:740) >>>> at akka.remote.EndpointWriter$$anonfun$2.applyOrElse(Endpoint.scala:718) >>>> at akka.actor.Actor$class.aroundReceive(Actor.scala:467) >>>> at akka.remote.EndpointActor.aroundReceive(Endpoint.scala:411) >>>> at akka.actor.ActorCell.receiveMessage(ActorCell.scala:516) >>>> at akka.actor.ActorCell.invoke(ActorCell.scala:487) >>>> at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:238) >>>> at akka.dispatch.Mailbox.run(Mailbox.scala:220) >>>> at >>>> akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:397) >>>> at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260) >>>> at >>>> scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339) >>>> at >>>> scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979) >>>> >>>> at >>>> scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107) >>>> >>>> >>> >>> >> >