[ 
https://issues.apache.org/jira/browse/SPARK-3687?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14147504#comment-14147504
 ] 

Ziv Huang commented on SPARK-3687:
----------------------------------

The following is the jstack dump of one executor when it hangs:

"File appending thread for 
/opt/spark-1.1.0-bin-hadoop2.4/work/app-20140925150845-0007/2/stderr" daemon 
prio=10 tid=0x00007ffe0c002800 nid=0x18a3 runnable [0x00007ffebc402000]
   java.lang.Thread.State: RUNNABLE
        at java.io.FileInputStream.readBytes(Native Method)
        at java.io.FileInputStream.read(FileInputStream.java:272)
        at java.io.BufferedInputStream.read1(BufferedInputStream.java:273)
        at java.io.BufferedInputStream.read(BufferedInputStream.java:334)
        - locked <0x00000000faeee1d0> (a 
java.lang.UNIXProcess$ProcessPipeInputStream)
        at java.io.FilterInputStream.read(FilterInputStream.java:107)
        at 
org.apache.spark.util.logging.FileAppender.appendStreamToFile(FileAppender.scala:70)
        at 
org.apache.spark.util.logging.FileAppender$$anon$1$$anonfun$run$1.apply$mcV$sp(FileAppender.scala:39)
        at 
org.apache.spark.util.logging.FileAppender$$anon$1$$anonfun$run$1.apply(FileAppender.scala:39)
        at 
org.apache.spark.util.logging.FileAppender$$anon$1$$anonfun$run$1.apply(FileAppender.scala:39)
        at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1311)
        at 
org.apache.spark.util.logging.FileAppender$$anon$1.run(FileAppender.scala:38)

"File appending thread for 
/opt/spark-1.1.0-bin-hadoop2.4/work/app-20140925150845-0007/2/stdout" daemon 
prio=10 tid=0x00007ffe0c004000 nid=0x18a2 runnable [0x00007ffebc503000]
   java.lang.Thread.State: RUNNABLE
        at java.io.FileInputStream.readBytes(Native Method)
        at java.io.FileInputStream.read(FileInputStream.java:272)
        at java.io.BufferedInputStream.read1(BufferedInputStream.java:273)
        at java.io.BufferedInputStream.read(BufferedInputStream.java:334)
        - locked <0x00000000faeec108> (a 
java.lang.UNIXProcess$ProcessPipeInputStream)
        at java.io.FilterInputStream.read(FilterInputStream.java:107)
        at 
org.apache.spark.util.logging.FileAppender.appendStreamToFile(FileAppender.scala:70)
        at 
org.apache.spark.util.logging.FileAppender$$anon$1$$anonfun$run$1.apply$mcV$sp(FileAppender.scala:39)
        at 
org.apache.spark.util.logging.FileAppender$$anon$1$$anonfun$run$1.apply(FileAppender.scala:39)
        at 
org.apache.spark.util.logging.FileAppender$$anon$1$$anonfun$run$1.apply(FileAppender.scala:39)
        at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1311)
        at 
org.apache.spark.util.logging.FileAppender$$anon$1.run(FileAppender.scala:38)

"process reaper" daemon prio=10 tid=0x00007ffe0c001000 nid=0x1868 runnable 
[0x00007ffecc0c7000]
   java.lang.Thread.State: RUNNABLE
        at java.lang.UNIXProcess.waitForProcessExit(Native Method)
        at java.lang.UNIXProcess.access$500(UNIXProcess.java:54)
        at java.lang.UNIXProcess$4.run(UNIXProcess.java:227)
        at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
        at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
        at java.lang.Thread.run(Thread.java:745)

"ExecutorRunner for app-20140925150845-0007/2" daemon prio=10 
tid=0x00007ffe7011b800 nid=0x1866 in Object.wait() [0x00007ffebc705000]
   java.lang.Thread.State: WAITING (on object monitor)
        at java.lang.Object.wait(Native Method)
        - waiting on <0x00000000faee9df8> (a java.lang.UNIXProcess)
        at java.lang.Object.wait(Object.java:503)
        at java.lang.UNIXProcess.waitFor(UNIXProcess.java:263)
        - locked <0x00000000faee9df8> (a java.lang.UNIXProcess)
        at 
org.apache.spark.deploy.worker.ExecutorRunner.fetchAndRunExecutor(ExecutorRunner.scala:164)
        at 
org.apache.spark.deploy.worker.ExecutorRunner$$anon$1.run(ExecutorRunner.scala:63)

"Attach Listener" daemon prio=10 tid=0x00007ffe84001000 nid=0x170f waiting on 
condition [0x0000000000000000]
   java.lang.Thread.State: RUNNABLE

"sparkWorker-akka.actor.default-dispatcher-16" daemon prio=10 
tid=0x00007ffe68214800 nid=0x13a3 waiting on condition [0x00007ffebc806000]
   java.lang.Thread.State: WAITING (parking)
        at sun.misc.Unsafe.park(Native Method)
        - parking to wait for  <0x00000000fd614a78> (a 
akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinPool)
        at scala.concurrent.forkjoin.ForkJoinPool.scan(ForkJoinPool.java:2075)
        at 
scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
        at 
scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)

"sparkWorker-akka.actor.default-dispatcher-15" daemon prio=10 
tid=0x00007ffe7011e000 nid=0x13a2 waiting on condition [0x00007ffebc604000]
   java.lang.Thread.State: WAITING (parking)
        at sun.misc.Unsafe.park(Native Method)
        - parking to wait for  <0x00000000fd614a78> (a 
akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinPool)
        at scala.concurrent.forkjoin.ForkJoinPool.scan(ForkJoinPool.java:2075)
        at 
scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
        at 
scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)

"sparkWorker-akka.actor.default-dispatcher-14" daemon prio=10 
tid=0x00007ffe68213800 nid=0x13a1 waiting on condition [0x00007ffebca08000]
   java.lang.Thread.State: WAITING (parking)
        at sun.misc.Unsafe.park(Native Method)
        - parking to wait for  <0x00000000fd614a78> (a 
akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinPool)
        at scala.concurrent.forkjoin.ForkJoinPool.scan(ForkJoinPool.java:2075)
        at 
scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
        at 
scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)

"sparkWorker-akka.actor.default-dispatcher-13" daemon prio=10 
tid=0x00007ffe70116000 nid=0x7216 waiting on condition [0x00007ffebcb09000]
   java.lang.Thread.State: WAITING (parking)
        at sun.misc.Unsafe.park(Native Method)
        - parking to wait for  <0x00000000fd614a78> (a 
akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinPool)
        at scala.concurrent.forkjoin.ForkJoinPool.scan(ForkJoinPool.java:2075)
        at 
scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
        at 
scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)

"Hashed wheel timer #1" daemon prio=10 tid=0x00007ffe58001000 nid=0x7215 
waiting on condition [0x00007ffebcc0a000]
   java.lang.Thread.State: TIMED_WAITING (sleeping)
        at java.lang.Thread.sleep(Native Method)
        at 
org.jboss.netty.util.HashedWheelTimer$Worker.waitForNextTick(HashedWheelTimer.java:503)
        at 
org.jboss.netty.util.HashedWheelTimer$Worker.run(HashedWheelTimer.java:401)
        at 
org.jboss.netty.util.ThreadRenamingRunnable.run(ThreadRenamingRunnable.java:108)
        at java.lang.Thread.run(Thread.java:745)

"qtp627106512-29" daemon prio=10 tid=0x00007ffe6812a800 nid=0x7214 waiting on 
condition [0x00007ffebcd0b000]
   java.lang.Thread.State: TIMED_WAITING (parking)
        at sun.misc.Unsafe.park(Native Method)
        - parking to wait for  <0x00000000fd8617c0> (a 
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject)
        at 
java.util.concurrent.locks.LockSupport.parkNanos(LockSupport.java:226)
        at 
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitNanos(AbstractQueuedSynchronizer.java:2082)
        at 
org.eclipse.jetty.util.BlockingArrayQueue.poll(BlockingArrayQueue.java:342)
        at 
org.eclipse.jetty.util.thread.QueuedThreadPool.idleJobPoll(QueuedThreadPool.java:526)
        at 
org.eclipse.jetty.util.thread.QueuedThreadPool.access$600(QueuedThreadPool.java:44)
        at 
org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:572)
        at java.lang.Thread.run(Thread.java:745)

"qtp627106512-28" daemon prio=10 tid=0x00007ffe68128800 nid=0x7213 waiting on 
condition [0x00007ffebce0c000]
   java.lang.Thread.State: TIMED_WAITING (parking)
        at sun.misc.Unsafe.park(Native Method)
        - parking to wait for  <0x00000000fd8617c0> (a 
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject)
        at 
java.util.concurrent.locks.LockSupport.parkNanos(LockSupport.java:226)
        at 
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitNanos(AbstractQueuedSynchronizer.java:2082)
        at 
org.eclipse.jetty.util.BlockingArrayQueue.poll(BlockingArrayQueue.java:342)
        at 
org.eclipse.jetty.util.thread.QueuedThreadPool.idleJobPoll(QueuedThreadPool.java:526)
        at 
org.eclipse.jetty.util.thread.QueuedThreadPool.access$600(QueuedThreadPool.java:44)
        at 
org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:572)
        at java.lang.Thread.run(Thread.java:745)

"qtp627106512-27" daemon prio=10 tid=0x00007ffe68126000 nid=0x7212 waiting on 
condition [0x00007ffebcf0d000]
   java.lang.Thread.State: TIMED_WAITING (parking)
        at sun.misc.Unsafe.park(Native Method)
        - parking to wait for  <0x00000000fd8617c0> (a 
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject)
        at 
java.util.concurrent.locks.LockSupport.parkNanos(LockSupport.java:226)
        at 
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitNanos(AbstractQueuedSynchronizer.java:2082)
        at 
org.eclipse.jetty.util.BlockingArrayQueue.poll(BlockingArrayQueue.java:342)
        at 
org.eclipse.jetty.util.thread.QueuedThreadPool.idleJobPoll(QueuedThreadPool.java:526)
        at 
org.eclipse.jetty.util.thread.QueuedThreadPool.access$600(QueuedThreadPool.java:44)
        at 
org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:572)
        at java.lang.Thread.run(Thread.java:745)

"qtp627106512-26" daemon prio=10 tid=0x00007ffe68124800 nid=0x7211 waiting on 
condition [0x00007ffebd00e000]
   java.lang.Thread.State: TIMED_WAITING (parking)
        at sun.misc.Unsafe.park(Native Method)
        - parking to wait for  <0x00000000fd8617c0> (a 
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject)
        at 
java.util.concurrent.locks.LockSupport.parkNanos(LockSupport.java:226)
        at 
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitNanos(AbstractQueuedSynchronizer.java:2082)
        at 
org.eclipse.jetty.util.BlockingArrayQueue.poll(BlockingArrayQueue.java:342)
        at 
org.eclipse.jetty.util.thread.QueuedThreadPool.idleJobPoll(QueuedThreadPool.java:526)
        at 
org.eclipse.jetty.util.thread.QueuedThreadPool.access$600(QueuedThreadPool.java:44)
        at 
org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:572)
        at java.lang.Thread.run(Thread.java:745)

"qtp627106512-25 Acceptor1 SelectChannelConnector@0.0.0.0:8081" daemon prio=10 
tid=0x00007ffe68123000 nid=0x7210 runnable [0x00007ffebd10f000]
   java.lang.Thread.State: RUNNABLE
        at sun.nio.ch.ServerSocketChannelImpl.accept0(Native Method)
        at 
sun.nio.ch.ServerSocketChannelImpl.accept(ServerSocketChannelImpl.java:241)
        - locked <0x00000000fd861518> (a java.lang.Object)
        at 
org.eclipse.jetty.server.nio.SelectChannelConnector.accept(SelectChannelConnector.java:109)
        at 
org.eclipse.jetty.server.AbstractConnector$Acceptor.run(AbstractConnector.java:938)
        at 
org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:608)
        at 
org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:543)
        at java.lang.Thread.run(Thread.java:745)

"qtp627106512-24 Acceptor0 SelectChannelConnector@0.0.0.0:8081" daemon prio=10 
tid=0x00007ffe68122000 nid=0x720f waiting for monitor entry [0x00007ffebd210000]
   java.lang.Thread.State: BLOCKED (on object monitor)
        at 
sun.nio.ch.ServerSocketChannelImpl.accept(ServerSocketChannelImpl.java:225)
        - waiting to lock <0x00000000fd861518> (a java.lang.Object)
        at 
org.eclipse.jetty.server.nio.SelectChannelConnector.accept(SelectChannelConnector.java:109)
        at 
org.eclipse.jetty.server.AbstractConnector$Acceptor.run(AbstractConnector.java:938)
        at 
org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:608)
        at 
org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:543)
        at java.lang.Thread.run(Thread.java:745)

"qtp627106512-23 Selector1" daemon prio=10 tid=0x00007ffe68120000 nid=0x720e 
runnable [0x00007ffebd311000]
   java.lang.Thread.State: RUNNABLE
        at sun.nio.ch.EPollArrayWrapper.epollWait(Native Method)
        at sun.nio.ch.EPollArrayWrapper.poll(EPollArrayWrapper.java:269)
        at sun.nio.ch.EPollSelectorImpl.doSelect(EPollSelectorImpl.java:79)
        at sun.nio.ch.SelectorImpl.lockAndDoSelect(SelectorImpl.java:87)
        - locked <0x00000000fd85ff28> (a sun.nio.ch.Util$2)
        - locked <0x00000000fd85ff18> (a java.util.Collections$UnmodifiableSet)
        - locked <0x00000000fd85fdf0> (a sun.nio.ch.EPollSelectorImpl)
        at sun.nio.ch.SelectorImpl.select(SelectorImpl.java:98)
        at 
org.eclipse.jetty.io.nio.SelectorManager$SelectSet.doSelect(SelectorManager.java:569)
        at 
org.eclipse.jetty.io.nio.SelectorManager$1.run(SelectorManager.java:290)
        at 
org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:608)
        at 
org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:543)
        at java.lang.Thread.run(Thread.java:745)

"qtp627106512-22 Selector0" daemon prio=10 tid=0x00007ffe6811e800 nid=0x720d 
runnable [0x00007ffebd412000]
   java.lang.Thread.State: RUNNABLE
        at sun.nio.ch.EPollArrayWrapper.epollWait(Native Method)
        at sun.nio.ch.EPollArrayWrapper.poll(EPollArrayWrapper.java:269)
        at sun.nio.ch.EPollSelectorImpl.doSelect(EPollSelectorImpl.java:79)
        at sun.nio.ch.SelectorImpl.lockAndDoSelect(SelectorImpl.java:87)
        - locked <0x00000000fd85ea88> (a sun.nio.ch.Util$2)
        - locked <0x00000000fd85ea78> (a java.util.Collections$UnmodifiableSet)
        - locked <0x00000000fd85e950> (a sun.nio.ch.EPollSelectorImpl)
        at sun.nio.ch.SelectorImpl.select(SelectorImpl.java:98)
        at 
org.eclipse.jetty.io.nio.SelectorManager$SelectSet.doSelect(SelectorManager.java:569)
        at 
org.eclipse.jetty.io.nio.SelectorManager$1.run(SelectorManager.java:290)
        at 
org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:608)
        at 
org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:543)
        at java.lang.Thread.run(Thread.java:745)

"New I/O server boss #6" daemon prio=10 tid=0x00007ffe70085000 nid=0x720c 
runnable [0x00007ffebd719000]
   java.lang.Thread.State: RUNNABLE
        at sun.nio.ch.EPollArrayWrapper.epollWait(Native Method)
        at sun.nio.ch.EPollArrayWrapper.poll(EPollArrayWrapper.java:269)
        at sun.nio.ch.EPollSelectorImpl.doSelect(EPollSelectorImpl.java:79)
        at sun.nio.ch.SelectorImpl.lockAndDoSelect(SelectorImpl.java:87)
        - locked <0x00000000fd72bae0> (a sun.nio.ch.Util$2)
        - locked <0x00000000fd72bad0> (a java.util.Collections$UnmodifiableSet)
        - locked <0x00000000fd72b6a0> (a sun.nio.ch.EPollSelectorImpl)
        at sun.nio.ch.SelectorImpl.select(SelectorImpl.java:98)
        at sun.nio.ch.SelectorImpl.select(SelectorImpl.java:102)
        at 
org.jboss.netty.channel.socket.nio.NioServerBoss.select(NioServerBoss.java:163)
        at 
org.jboss.netty.channel.socket.nio.AbstractNioSelector.run(AbstractNioSelector.java:206)
        at 
org.jboss.netty.channel.socket.nio.NioServerBoss.run(NioServerBoss.java:42)
        at 
org.jboss.netty.util.ThreadRenamingRunnable.run(ThreadRenamingRunnable.java:108)
        at 
org.jboss.netty.util.internal.DeadLockProofWorker$1.run(DeadLockProofWorker.java:42)
        at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
        at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
        at java.lang.Thread.run(Thread.java:745)

"New I/O worker #5" daemon prio=10 tid=0x00007ffe7007f000 nid=0x720b runnable 
[0x00007ffebd81a000]
   java.lang.Thread.State: RUNNABLE
        at sun.nio.ch.EPollArrayWrapper.epollWait(Native Method)
        at sun.nio.ch.EPollArrayWrapper.poll(EPollArrayWrapper.java:269)
        at sun.nio.ch.EPollSelectorImpl.doSelect(EPollSelectorImpl.java:79)
        at sun.nio.ch.SelectorImpl.lockAndDoSelect(SelectorImpl.java:87)
        - locked <0x00000000fd729b38> (a sun.nio.ch.Util$2)
        - locked <0x00000000fd729b28> (a java.util.Collections$UnmodifiableSet)
        - locked <0x00000000fd729900> (a sun.nio.ch.EPollSelectorImpl)
        at sun.nio.ch.SelectorImpl.select(SelectorImpl.java:98)
        at 
org.jboss.netty.channel.socket.nio.SelectorUtil.select(SelectorUtil.java:64)
        at 
org.jboss.netty.channel.socket.nio.AbstractNioSelector.select(AbstractNioSelector.java:409)
        at 
org.jboss.netty.channel.socket.nio.AbstractNioSelector.run(AbstractNioSelector.java:206)
        at 
org.jboss.netty.channel.socket.nio.AbstractNioWorker.run(AbstractNioWorker.java:90)
        at org.jboss.netty.channel.socket.nio.NioWorker.run(NioWorker.java:178)
        at 
org.jboss.netty.util.ThreadRenamingRunnable.run(ThreadRenamingRunnable.java:108)
        at 
org.jboss.netty.util.internal.DeadLockProofWorker$1.run(DeadLockProofWorker.java:42)
        at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
        at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
        at java.lang.Thread.run(Thread.java:745)

"New I/O worker #4" daemon prio=10 tid=0x00007ffe70081800 nid=0x720a runnable 
[0x00007ffebd91b000]
   java.lang.Thread.State: RUNNABLE
        at sun.nio.ch.EPollArrayWrapper.epollWait(Native Method)
        at sun.nio.ch.EPollArrayWrapper.poll(EPollArrayWrapper.java:269)
        at sun.nio.ch.EPollSelectorImpl.doSelect(EPollSelectorImpl.java:79)
        at sun.nio.ch.SelectorImpl.lockAndDoSelect(SelectorImpl.java:87)
        - locked <0x00000000fd7284b0> (a sun.nio.ch.Util$2)
        - locked <0x00000000fd7284a0> (a java.util.Collections$UnmodifiableSet)
        - locked <0x00000000fd728268> (a sun.nio.ch.EPollSelectorImpl)
        at sun.nio.ch.SelectorImpl.select(SelectorImpl.java:98)
        at 
org.jboss.netty.channel.socket.nio.SelectorUtil.select(SelectorUtil.java:64)
        at 
org.jboss.netty.channel.socket.nio.AbstractNioSelector.select(AbstractNioSelector.java:409)
        at 
org.jboss.netty.channel.socket.nio.AbstractNioSelector.run(AbstractNioSelector.java:206)
        at 
org.jboss.netty.channel.socket.nio.AbstractNioWorker.run(AbstractNioWorker.java:90)
        at org.jboss.netty.channel.socket.nio.NioWorker.run(NioWorker.java:178)
        at 
org.jboss.netty.util.ThreadRenamingRunnable.run(ThreadRenamingRunnable.java:108)
        at 
org.jboss.netty.util.internal.DeadLockProofWorker$1.run(DeadLockProofWorker.java:42)
        at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
        at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
        at java.lang.Thread.run(Thread.java:745)

"New I/O boss #3" daemon prio=10 tid=0x00007ffe7007c000 nid=0x7209 runnable 
[0x00007ffebda1c000]
   java.lang.Thread.State: RUNNABLE
        at sun.nio.ch.EPollArrayWrapper.epollWait(Native Method)
        at sun.nio.ch.EPollArrayWrapper.poll(EPollArrayWrapper.java:269)
        at sun.nio.ch.EPollSelectorImpl.doSelect(EPollSelectorImpl.java:79)
        at sun.nio.ch.SelectorImpl.lockAndDoSelect(SelectorImpl.java:87)
        - locked <0x00000000fd72e408> (a sun.nio.ch.Util$2)
        - locked <0x00000000fd72e3f8> (a java.util.Collections$UnmodifiableSet)
        - locked <0x00000000fd72e1c0> (a sun.nio.ch.EPollSelectorImpl)
        at sun.nio.ch.SelectorImpl.select(SelectorImpl.java:98)
        at 
org.jboss.netty.channel.socket.nio.SelectorUtil.select(SelectorUtil.java:64)
        at 
org.jboss.netty.channel.socket.nio.AbstractNioSelector.select(AbstractNioSelector.java:409)
        at 
org.jboss.netty.channel.socket.nio.AbstractNioSelector.run(AbstractNioSelector.java:206)
        at 
org.jboss.netty.channel.socket.nio.NioClientBoss.run(NioClientBoss.java:42)
        at 
org.jboss.netty.util.ThreadRenamingRunnable.run(ThreadRenamingRunnable.java:108)
        at 
org.jboss.netty.util.internal.DeadLockProofWorker$1.run(DeadLockProofWorker.java:42)
        at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
        at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
        at java.lang.Thread.run(Thread.java:745)

"New I/O worker #2" daemon prio=10 tid=0x00007ffe7004b800 nid=0x7208 runnable 
[0x00007ffebdb1d000]
   java.lang.Thread.State: RUNNABLE
        at sun.nio.ch.EPollArrayWrapper.epollWait(Native Method)
        at sun.nio.ch.EPollArrayWrapper.poll(EPollArrayWrapper.java:269)
        at sun.nio.ch.EPollSelectorImpl.doSelect(EPollSelectorImpl.java:79)
        at sun.nio.ch.SelectorImpl.lockAndDoSelect(SelectorImpl.java:87)
        - locked <0x00000000fd82af68> (a sun.nio.ch.Util$2)
        - locked <0x00000000fd82af58> (a java.util.Collections$UnmodifiableSet)
        - locked <0x00000000fd82ae40> (a sun.nio.ch.EPollSelectorImpl)
        at sun.nio.ch.SelectorImpl.select(SelectorImpl.java:98)
        at 
org.jboss.netty.channel.socket.nio.SelectorUtil.select(SelectorUtil.java:64)
        at 
org.jboss.netty.channel.socket.nio.AbstractNioSelector.select(AbstractNioSelector.java:409)
        at 
org.jboss.netty.channel.socket.nio.AbstractNioSelector.run(AbstractNioSelector.java:206)
        at 
org.jboss.netty.channel.socket.nio.AbstractNioWorker.run(AbstractNioWorker.java:90)
        at org.jboss.netty.channel.socket.nio.NioWorker.run(NioWorker.java:178)
        at 
org.jboss.netty.util.ThreadRenamingRunnable.run(ThreadRenamingRunnable.java:108)
        at 
org.jboss.netty.util.internal.DeadLockProofWorker$1.run(DeadLockProofWorker.java:42)
        at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
        at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
        at java.lang.Thread.run(Thread.java:745)

"New I/O worker #1" daemon prio=10 tid=0x00007ffe70034000 nid=0x7207 runnable 
[0x00007ffebdc1e000]
   java.lang.Thread.State: RUNNABLE
        at sun.nio.ch.EPollArrayWrapper.epollWait(Native Method)
        at sun.nio.ch.EPollArrayWrapper.poll(EPollArrayWrapper.java:269)
        at sun.nio.ch.EPollSelectorImpl.doSelect(EPollSelectorImpl.java:79)
        at sun.nio.ch.SelectorImpl.lockAndDoSelect(SelectorImpl.java:87)
        - locked <0x00000000fd829a20> (a sun.nio.ch.Util$2)
        - locked <0x00000000fd829a10> (a java.util.Collections$UnmodifiableSet)
        - locked <0x00000000fd8295f0> (a sun.nio.ch.EPollSelectorImpl)
        at sun.nio.ch.SelectorImpl.select(SelectorImpl.java:98)
        at 
org.jboss.netty.channel.socket.nio.SelectorUtil.select(SelectorUtil.java:64)
        at 
org.jboss.netty.channel.socket.nio.AbstractNioSelector.select(AbstractNioSelector.java:409)
        at 
org.jboss.netty.channel.socket.nio.AbstractNioSelector.run(AbstractNioSelector.java:206)
        at 
org.jboss.netty.channel.socket.nio.AbstractNioWorker.run(AbstractNioWorker.java:90)
        at org.jboss.netty.channel.socket.nio.NioWorker.run(NioWorker.java:178)
        at 
org.jboss.netty.util.ThreadRenamingRunnable.run(ThreadRenamingRunnable.java:108)
        at 
org.jboss.netty.util.internal.DeadLockProofWorker$1.run(DeadLockProofWorker.java:42)
        at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
        at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
        at java.lang.Thread.run(Thread.java:745)

"sparkWorker-akka.actor.default-dispatcher-5" daemon prio=10 
tid=0x00007ffe70012800 nid=0x7206 waiting on condition [0x00007ffebdd1f000]
   java.lang.Thread.State: TIMED_WAITING (parking)
        at sun.misc.Unsafe.park(Native Method)
        - parking to wait for  <0x00000000fd614a78> (a 
akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinPool)
        at 
scala.concurrent.forkjoin.ForkJoinPool.idleAwaitWork(ForkJoinPool.java:2135)
        at scala.concurrent.forkjoin.ForkJoinPool.scan(ForkJoinPool.java:2067)
        at 
scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
        at 
scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)

"sparkWorker-akka.actor.default-dispatcher-4" daemon prio=10 
tid=0x00007ffe64001800 nid=0x7205 waiting on condition [0x00007ffebde20000]
   java.lang.Thread.State: WAITING (parking)
        at sun.misc.Unsafe.park(Native Method)
        - parking to wait for  <0x00000000fd614a78> (a 
akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinPool)
        at scala.concurrent.forkjoin.ForkJoinPool.scan(ForkJoinPool.java:2075)
        at 
scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
        at 
scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)

"sparkWorker-akka.actor.default-dispatcher-3" daemon prio=10 
tid=0x00007ffed0465800 nid=0x7204 waiting on condition [0x00007ffebdf21000]
   java.lang.Thread.State: WAITING (parking)
        at sun.misc.Unsafe.park(Native Method)
        - parking to wait for  <0x00000000fd614a78> (a 
akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinPool)
        at scala.concurrent.forkjoin.ForkJoinPool.scan(ForkJoinPool.java:2075)
        at 
scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
        at 
scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)

"sparkWorker-akka.actor.default-dispatcher-2" daemon prio=10 
tid=0x00007ffed0461000 nid=0x7203 waiting on condition [0x00007ffebe022000]
   java.lang.Thread.State: WAITING (parking)
        at sun.misc.Unsafe.park(Native Method)
        - parking to wait for  <0x00000000fd614a78> (a 
akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinPool)
        at scala.concurrent.forkjoin.ForkJoinPool.scan(ForkJoinPool.java:2075)
        at 
scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
        at 
scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)

"sparkWorker-scheduler-1" daemon prio=10 tid=0x00007ffed03e8000 nid=0x7202 
sleeping[0x00007ffebe123000]
   java.lang.Thread.State: TIMED_WAITING (sleeping)
        at java.lang.Thread.sleep(Native Method)
        at akka.actor.LightArrayRevolverScheduler.waitNanos(Scheduler.scala:226)
        at 
akka.actor.LightArrayRevolverScheduler$$anon$12.nextTick(Scheduler.scala:393)
        at 
akka.actor.LightArrayRevolverScheduler$$anon$12.run(Scheduler.scala:363)
        at java.lang.Thread.run(Thread.java:745)

"Service Thread" daemon prio=10 tid=0x00007ffed009d800 nid=0x7200 runnable 
[0x0000000000000000]
   java.lang.Thread.State: RUNNABLE

"C2 CompilerThread1" daemon prio=10 tid=0x00007ffed009b800 nid=0x71ff waiting 
on condition [0x0000000000000000]
   java.lang.Thread.State: RUNNABLE

"C2 CompilerThread0" daemon prio=10 tid=0x00007ffed0098800 nid=0x71fe waiting 
on condition [0x0000000000000000]
   java.lang.Thread.State: RUNNABLE

"Signal Dispatcher" daemon prio=10 tid=0x00007ffed0097000 nid=0x71fd runnable 
[0x0000000000000000]
   java.lang.Thread.State: RUNNABLE

"Finalizer" daemon prio=10 tid=0x00007ffed0077800 nid=0x71fc in Object.wait() 
[0x00007ffebf6f5000]
   java.lang.Thread.State: WAITING (on object monitor)
        at java.lang.Object.wait(Native Method)
        - waiting on <0x00000000fd623240> (a java.lang.ref.ReferenceQueue$Lock)
        at java.lang.ref.ReferenceQueue.remove(ReferenceQueue.java:135)
        - locked <0x00000000fd623240> (a java.lang.ref.ReferenceQueue$Lock)
        at java.lang.ref.ReferenceQueue.remove(ReferenceQueue.java:151)
        at java.lang.ref.Finalizer$FinalizerThread.run(Finalizer.java:209)

"Reference Handler" daemon prio=10 tid=0x00007ffed0075800 nid=0x71fb in 
Object.wait() [0x00007ffebf7f6000]
   java.lang.Thread.State: WAITING (on object monitor)
        at java.lang.Object.wait(Native Method)
        - waiting on <0x00000000fd622ca0> (a java.lang.ref.Reference$Lock)
        at java.lang.Object.wait(Object.java:503)
        at java.lang.ref.Reference$ReferenceHandler.run(Reference.java:133)
        - locked <0x00000000fd622ca0> (a java.lang.ref.Reference$Lock)

"main" prio=10 tid=0x00007ffed0008800 nid=0x71f1 waiting on condition 
[0x00007ffed731a000]
   java.lang.Thread.State: WAITING (parking)
        at sun.misc.Unsafe.park(Native Method)
        - parking to wait for  <0x00000000fd83fd60> (a 
java.util.concurrent.CountDownLatch$Sync)
        at java.util.concurrent.locks.LockSupport.park(LockSupport.java:186)
        at 
java.util.concurrent.locks.AbstractQueuedSynchronizer.parkAndCheckInterrupt(AbstractQueuedSynchronizer.java:834)
        at 
java.util.concurrent.locks.AbstractQueuedSynchronizer.doAcquireSharedInterruptibly(AbstractQueuedSynchronizer.java:994)
        at 
java.util.concurrent.locks.AbstractQueuedSynchronizer.acquireSharedInterruptibly(AbstractQueuedSynchronizer.java:1303)
        at java.util.concurrent.CountDownLatch.await(CountDownLatch.java:236)
        at 
akka.actor.ActorSystemImpl$TerminationCallbacks.ready(ActorSystem.scala:760)
        at 
akka.actor.ActorSystemImpl$TerminationCallbacks.ready(ActorSystem.scala:729)
        at scala.concurrent.Await$$anonfun$ready$1.apply(package.scala:86)
        at scala.concurrent.Await$$anonfun$ready$1.apply(package.scala:86)
        at 
scala.concurrent.BlockContext$DefaultBlockContext$.blockOn(BlockContext.scala:53)
        at scala.concurrent.Await$.ready(package.scala:86)
        at akka.actor.ActorSystemImpl.awaitTermination(ActorSystem.scala:598)
        at akka.actor.ActorSystemImpl.awaitTermination(ActorSystem.scala:599)
        at org.apache.spark.deploy.worker.Worker$.main(Worker.scala:379)
        at org.apache.spark.deploy.worker.Worker.main(Worker.scala)

"VM Thread" prio=10 tid=0x00007ffed0071000 nid=0x71fa runnable 

"GC task thread#0 (ParallelGC)" prio=10 tid=0x00007ffed001e800 nid=0x71f2 
runnable 

"GC task thread#1 (ParallelGC)" prio=10 tid=0x00007ffed0020800 nid=0x71f3 
runnable 

"GC task thread#2 (ParallelGC)" prio=10 tid=0x00007ffed0022000 nid=0x71f4 
runnable 

"GC task thread#3 (ParallelGC)" prio=10 tid=0x00007ffed0024000 nid=0x71f5 
runnable 

"GC task thread#4 (ParallelGC)" prio=10 tid=0x00007ffed0026000 nid=0x71f6 
runnable 

"GC task thread#5 (ParallelGC)" prio=10 tid=0x00007ffed0027800 nid=0x71f7 
runnable 

"GC task thread#6 (ParallelGC)" prio=10 tid=0x00007ffed0029800 nid=0x71f8 
runnable 

"GC task thread#7 (ParallelGC)" prio=10 tid=0x00007ffed002b800 nid=0x71f9 
runnable 

"VM Periodic Task Thread" prio=10 tid=0x00007ffed00a8800 nid=0x7201 waiting on 
condition 

JNI global references: 147

> Spark hang while processing more than 100 sequence files
> --------------------------------------------------------
>
>                 Key: SPARK-3687
>                 URL: https://issues.apache.org/jira/browse/SPARK-3687
>             Project: Spark
>          Issue Type: Bug
>          Components: Spark Core
>    Affects Versions: 1.0.2, 1.1.0
>            Reporter: Ziv Huang
>
> In my application, I read more than 100 sequence files into a JavaPairRDD, 
> perform a flatMap to get another JavaRDD, and then use takeOrdered to get the 
> result.
> Quite often (but not always), Spark hangs while executing some of the 
> 110th-130th tasks.
> The job can hang for several hours, maybe forever (I can't wait for its 
> completion).
> When the Spark job hangs, I can't find any error message anywhere, and I 
> can't kill the job from the web UI.
> The current workaround is to use coalesce to reduce the number of partitions 
> to be processed.
> The job never hangs if the number of partitions to be processed is no 
> greater than 80.



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org
For additional commands, e-mail: issues-h...@spark.apache.org

Reply via email to