Hung MR child when closing file systems
---------------------------------------

                 Key: HADOOP-5707
                 URL: https://issues.apache.org/jira/browse/HADOOP-5707
             Project: Hadoop Core
          Issue Type: Bug
            Reporter: Ben Maurer


I recently found a number of MapReduce (MR) child processes that had been 
launched days earlier and were still stuck in the following state:

{quote}
Full thread dump Java HotSpot(TM) 64-Bit Server VM (11.0-b16 mixed mode):

"Attach Listener" daemon prio=10 tid=0x00000000559ef400 nid=0x2d22 waiting on 
condition [0x0000000000000000..0x0000000000000000]
   java.lang.Thread.State: RUNNABLE

"SIGTERM handler" daemon prio=10 tid=0x00000000563d8400 nid=0x5651 waiting for 
monitor entry [0x00000000410a5000..0x00000000410a5d10]
   java.lang.Thread.State: BLOCKED (on object monitor)
        at java.lang.Shutdown.exit(Shutdown.java:178)
        - waiting to lock <0x00002aaaae3a8ed0> (a java.lang.Class for java.lang.Shutdown)
        at java.lang.Terminator$1.handle(Terminator.java:35)
        at sun.misc.Signal$1.run(Signal.java:195)
        at java.lang.Thread.run(Thread.java:619)

"Thread-2" daemon prio=10 tid=0x00000000563d5000 nid=0x53e5 
sleeping[0x00000000419db000..0x00000000419dbd90]
   java.lang.Thread.State: TIMED_WAITING (sleeping)
        at java.lang.Thread.sleep(Native Method)
        at org.apache.hadoop.ipc.Client.stop(Client.java:668)
        at org.apache.hadoop.ipc.RPC$ClientCache.stopClient(RPC.java:189)
        at org.apache.hadoop.ipc.RPC$ClientCache.access$400(RPC.java:138)
        at org.apache.hadoop.ipc.RPC$Invoker.close(RPC.java:229)
        - locked <0x00002aaab9b0eee0> (a org.apache.hadoop.ipc.RPC$Invoker)
        at org.apache.hadoop.ipc.RPC$Invoker.access$500(RPC.java:196)
        at org.apache.hadoop.ipc.RPC.stopProxy(RPC.java:382)
        at org.apache.hadoop.hdfs.DFSClient.close(DFSClient.java:212)
        - locked <0x00002aaab9b0ee10> (a org.apache.hadoop.hdfs.DFSClient)
        at org.apache.hadoop.hdfs.DistributedFileSystem.close(DistributedFileSystem.java:264)
        at org.apache.hadoop.fs.FileSystem$Cache.closeAll(FileSystem.java:1413)
        - locked <0x00002aaab9aa9100> (a org.apache.hadoop.fs.FileSystem$Cache)
        at org.apache.hadoop.fs.FileSystem.closeAll(FileSystem.java:236)
        at org.apache.hadoop.fs.FileSystem$ClientFinalizer.run(FileSystem.java:221)
        - locked <0x00002aaab9a90698> (a org.apache.hadoop.fs.FileSystem$ClientFinalizer)

"DestroyJavaVM" prio=10 tid=0x000000005589f400 nid=0x4c40 in Object.wait() 
[0x0000000041cc9000..0x0000000041cc9d40]
   java.lang.Thread.State: WAITING (on object monitor)
        at java.lang.Object.wait(Native Method)
        - waiting on <0x00002aaab9a90698> (a org.apache.hadoop.fs.FileSystem$ClientFinalizer)
        at java.lang.Thread.join(Thread.java:1143)
        - locked <0x00002aaab9a90698> (a org.apache.hadoop.fs.FileSystem$ClientFinalizer)
        at java.lang.Thread.join(Thread.java:1196)
        at java.lang.ApplicationShutdownHooks.run(ApplicationShutdownHooks.java:79)
        at java.lang.Shutdown.runHooks(Shutdown.java:89)
        at java.lang.Shutdown.sequence(Shutdown.java:133)
        at java.lang.Shutdown.shutdown(Shutdown.java:200)
        - locked <0x00002aaaae3a8ed0> (a java.lang.Class for java.lang.Shutdown)

"SpillThread" daemon prio=10 tid=0x00002aaac440dc00 nid=0x4c81 waiting on 
condition [0x0000000041adc000..0x0000000041adcb90]
   java.lang.Thread.State: WAITING (parking)
        at sun.misc.Unsafe.park(Native Method)
        - parking to wait for  <0x00002aaab9aaf860> (a java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject)
        at java.util.concurrent.locks.LockSupport.park(LockSupport.java:158)
        at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:1925)
        at org.apache.hadoop.mapred.MapTask$MapOutputBuffer$SpillThread.run(MapTask.java:882)

"Comm thread for attempt_200904081633_0288_m_000430_2" daemon prio=10 
tid=0x0000000055e12000 nid=0x4c80 waiting for monitor entry 
[0x0000000042a40000..0x0000000042a40a10]
   java.lang.Thread.State: BLOCKED (on object monitor)
        at java.lang.Shutdown.exit(Shutdown.java:178)
        - waiting to lock <0x00002aaaae3a8ed0> (a java.lang.Class for java.lang.Shutdown)
        at java.lang.Runtime.exit(Runtime.java:90)
        at java.lang.System.exit(System.java:906)
        at org.apache.hadoop.mapred.Task$1.run(Task.java:441)
        at java.lang.Thread.run(Thread.java:619)

"CompilerThread1" daemon prio=10 tid=0x000000005593a400 nid=0x4c4e waiting on 
condition [0x0000000000000000..0x0000000040fa3450]
   java.lang.Thread.State: RUNNABLE

"CompilerThread0" daemon prio=10 tid=0x0000000055936400 nid=0x4c4d waiting on 
condition [0x0000000000000000..0x0000000040ea24d0]
   java.lang.Thread.State: RUNNABLE

"Signal Dispatcher" daemon prio=10 tid=0x0000000055934800 nid=0x4c4c runnable 
[0x0000000000000000..0x0000000040da2a60]
   java.lang.Thread.State: RUNNABLE

"Finalizer" daemon prio=10 tid=0x0000000055911800 nid=0x4c4b in Object.wait() 
[0x000000004283e000..0x000000004283eb90]
   java.lang.Thread.State: WAITING (on object monitor)
        at java.lang.Object.wait(Native Method)
        - waiting on <0x00002aaab9a72d40> (a java.lang.ref.ReferenceQueue$Lock)
        at java.lang.ref.ReferenceQueue.remove(ReferenceQueue.java:116)
        - locked <0x00002aaab9a72d40> (a java.lang.ref.ReferenceQueue$Lock)
        at java.lang.ref.ReferenceQueue.remove(ReferenceQueue.java:132)
        at java.lang.ref.Finalizer$FinalizerThread.run(Finalizer.java:159)

"Reference Handler" daemon prio=10 tid=0x000000005590fc00 nid=0x4c4a in 
Object.wait() [0x000000004273d000..0x000000004273da10]
   java.lang.Thread.State: WAITING (on object monitor)
        at java.lang.Object.wait(Native Method)
        - waiting on <0x00002aaab9a78de0> (a java.lang.ref.Reference$Lock)
        at java.lang.Object.wait(Object.java:485)
        at java.lang.ref.Reference$ReferenceHandler.run(Reference.java:116)
        - locked <0x00002aaab9a78de0> (a java.lang.ref.Reference$Lock)

"VM Thread" prio=10 tid=0x000000005590a400 nid=0x4c49 runnable 

"GC task thread#0 (ParallelGC)" prio=10 tid=0x00000000558a9c00 nid=0x4c41 
runnable 

"GC task thread#1 (ParallelGC)" prio=10 tid=0x00000000558ab800 nid=0x4c42 
runnable 

"GC task thread#2 (ParallelGC)" prio=10 tid=0x00000000558ad000 nid=0x4c43 
runnable 

"GC task thread#3 (ParallelGC)" prio=10 tid=0x00000000558ae800 nid=0x4c44 
runnable 

"GC task thread#4 (ParallelGC)" prio=10 tid=0x00000000558b0400 nid=0x4c45 
runnable 

"GC task thread#5 (ParallelGC)" prio=10 tid=0x00000000558b1c00 nid=0x4c46 
runnable 

"GC task thread#6 (ParallelGC)" prio=10 tid=0x00000000558b3400 nid=0x4c47 
runnable 

"GC task thread#7 (ParallelGC)" prio=10 tid=0x00000000558b5000 nid=0x4c48 
runnable 

"VM Periodic Task Thread" prio=10 tid=0x000000005593f400 nid=0x4c50 waiting on 
condition 

JNI global references: 1012
{quote}

-- 
This message is automatically generated by JIRA.
-
You can reply to this email to add a comment to the issue online.

Reply via email to