I did a jstack
and found all the threads for 1 mapper are stuck, I guess the other mappers are like this too yyang15@yyang15-VirtualBox:~/work/CIReco/title_flow/java_code$ cat /tmp/yyang Deadlock Detection: No deadlocks found. Thread 2381: (state = BLOCKED) - java.lang.Object.wait(long) @bci=0 (Interpreted frame) - org.apache.hadoop.ipc.Client$Connection.waitForWork() @bci=59, line=903 (Interpreted frame) - org.apache.hadoop.ipc.Client$Connection.run() @bci=55, line=948 (Interpreted frame) Thread 10422: (state = IN_NATIVE) - sun.nio.ch.EPollArrayWrapper.epollWait(long, int, long, int) @bci=0 (Interpreted frame) - sun.nio.ch.EPollArrayWrapper.poll(long) @bci=18, line=269 (Interpreted frame) - sun.nio.ch.EPollSelectorImpl.doSelect(long) @bci=28, line=79 (Interpreted frame) - sun.nio.ch.SelectorImpl.lockAndDoSelect(long) @bci=37, line=87 (Interpreted frame) - sun.nio.ch.SelectorImpl.select(long) @bci=30, line=98 (Interpreted frame) - org.apache.hadoop.net.SocketIOWithTimeout$SelectorPool.select(java.nio.channels.SelectableChannel, int, long) @bci=46, line=335 (Interpreted frame) - org.apache.hadoop.net.SocketIOWithTimeout.doIO(java.nio.ByteBuffer, int) @bci=80, line=157 (Interpreted frame) - org.apache.hadoop.net.SocketInputStream.read(java.nio.ByteBuffer) @bci=6, line=161 (Interpreted frame) - org.apache.hadoop.net.SocketInputStream.read(byte[], int, int) @bci=7, line=131 (Interpreted frame) - org.apache.hadoop.net.SocketInputStream.read() @bci=8, line=118 (Interpreted frame) - java.io.FilterInputStream.read() @bci=4, line=83 (Interpreted frame) - java.io.FilterInputStream.read() @bci=4, line=83 (Interpreted frame) - org.apache.hadoop.hdfs.protocolPB.PBHelper.vintPrefixed(java.io.InputStream) @bci=1, line=1988 (Interpreted frame) - org.apache.hadoop.hdfs.protocol.datatransfer.PipelineAck.readFields(java.io.InputStream) @bci=2, line=176 (Interpreted frame) - org.apache.hadoop.hdfs.DFSOutputStream$DataStreamer$ResponseProcessor.run() @bci=75, line=796 (Interpreted frame) Thread 
10407: (state = BLOCKED) - java.lang.Thread.sleep(long) @bci=0 (Interpreted frame) - org.apache.hadoop.hdfs.LeaseRenewer.run(int) @bci=429, line=438 (Interpreted frame) - org.apache.hadoop.hdfs.LeaseRenewer.access$700(org.apache.hadoop.hdfs.LeaseRenewer, int) @bci=2, line=71 (Interpreted frame) - org.apache.hadoop.hdfs.LeaseRenewer$1.run() @bci=69, line=298 (Interpreted frame) - java.lang.Thread.run() @bci=11, line=745 (Interpreted frame) Thread 10406: (state = BLOCKED) - java.lang.Object.wait(long) @bci=0 (Interpreted frame) - org.apache.hadoop.hdfs.DFSOutputStream$DataStreamer.run() @bci=265, line=502 (Interpreted frame) Thread 6486: (state = BLOCKED) - sun.misc.Unsafe.park(boolean, long) @bci=0 (Interpreted frame) - java.util.concurrent.locks.LockSupport.park(java.lang.Object) @bci=14, line=186 (Interpreted frame) - java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await() @bci=42, line=2043 (Interpreted frame) - org.apache.hadoop.mapred.MapTask$MapOutputBuffer$SpillThread.run() @bci=47, line=1501 (Interpreted frame) Thread 6453: (state = BLOCKED) - java.lang.Thread.sleep(long) @bci=0 (Interpreted frame) - org.apache.hadoop.hdfs.PeerCache.run() @bci=41, line=245 (Interpreted frame) - org.apache.hadoop.hdfs.PeerCache.access$000(org.apache.hadoop.hdfs.PeerCache) @bci=1, line=41 (Interpreted frame) - org.apache.hadoop.hdfs.PeerCache$1.run() @bci=4, line=119 (Interpreted frame) - java.lang.Thread.run() @bci=11, line=745 (Interpreted frame) Thread 6423: (state = BLOCKED) - java.lang.Object.wait(long) @bci=0 (Interpreted frame) - org.apache.hadoop.mapred.Task$TaskReporter.run() @bci=86, line=719 (Interpreted frame) - java.lang.Thread.run() @bci=11, line=745 (Interpreted frame) Thread 6422: (state = IN_NATIVE) - org.apache.hadoop.net.unix.DomainSocketWatcher.doPoll0(int, org.apache.hadoop.net.unix.DomainSocketWatcher$FdSet) @bci=0 (Interpreted frame) - org.apache.hadoop.net.unix.DomainSocketWatcher.access$800(int, 
org.apache.hadoop.net.unix.DomainSocketWatcher$FdSet) @bci=2, line=52 (Interpreted frame) - org.apache.hadoop.net.unix.DomainSocketWatcher$1.run() @bci=551, line=457 (Interpreted frame) - java.lang.Thread.run() @bci=11, line=745 (Interpreted frame) Thread 6421: (state = BLOCKED) - sun.misc.Unsafe.park(boolean, long) @bci=0 (Interpreted frame) - java.util.concurrent.locks.LockSupport.parkNanos(java.lang.Object, long) @bci=20, line=226 (Interpreted frame) - java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitNanos(long) @bci=68, line=2082 (Interpreted frame) - java.util.concurrent.ScheduledThreadPoolExecutor$DelayedWorkQueue.take() @bci=122, line=1090 (Interpreted frame) - java.util.concurrent.ScheduledThreadPoolExecutor$DelayedWorkQueue.take() @bci=1, line=807 (Interpreted frame) - java.util.concurrent.ThreadPoolExecutor.getTask() @bci=156, line=1068 (Interpreted frame) - java.util.concurrent.ThreadPoolExecutor.runWorker(java.util.concurrent.ThreadPoolExecutor$Worker) @bci=26, line=1130 (Interpreted frame) - java.util.concurrent.ThreadPoolExecutor$Worker.run() @bci=5, line=615 (Interpreted frame) - java.lang.Thread.run() @bci=11, line=745 (Interpreted frame) Thread 6414: (state = BLOCKED) - sun.misc.Unsafe.park(boolean, long) @bci=0 (Interpreted frame) - java.util.concurrent.locks.LockSupport.parkNanos(java.lang.Object, long) @bci=20, line=226 (Compiled frame) - java.util.concurrent.SynchronousQueue$TransferStack.awaitFulfill(java.util.concurrent.SynchronousQueue$TransferStack$SNode, boolean, long) @bci=174, line=460 (Compiled frame) - java.util.concurrent.SynchronousQueue$TransferStack.transfer(java.lang.Object, boolean, long) @bci=102, line=359 (Interpreted frame) - java.util.concurrent.SynchronousQueue.poll(long, java.util.concurrent.TimeUnit) @bci=11, line=942 (Interpreted frame) - java.util.concurrent.ThreadPoolExecutor.getTask() @bci=141, line=1068 (Interpreted frame) - 
java.util.concurrent.ThreadPoolExecutor.runWorker(java.util.concurrent.ThreadPoolExecutor$Worker) @bci=26, line=1130 (Interpreted frame) - java.util.concurrent.ThreadPoolExecutor$Worker.run() @bci=5, line=615 (Interpreted frame) - java.lang.Thread.run() @bci=11, line=745 (Interpreted frame) Thread 6413: (state = BLOCKED) - java.lang.Object.wait(long) @bci=0 (Interpreted frame) - org.apache.hadoop.ipc.Client$Connection.waitForWork() @bci=59, line=903 (Interpreted frame) - org.apache.hadoop.ipc.Client$Connection.run() @bci=55, line=948 (Interpreted frame) Thread 6408: (state = BLOCKED) - java.lang.Object.wait(long) @bci=0 (Interpreted frame) - java.util.TimerThread.mainLoop() @bci=201, line=552 (Interpreted frame) - java.util.TimerThread.run() @bci=1, line=505 (Interpreted frame) Thread 6406: (state = BLOCKED) - java.lang.Object.wait(long) @bci=0 (Interpreted frame) - java.lang.Object.wait() @bci=2, line=503 (Interpreted frame) - org.apache.hadoop.metrics2.impl.SinkQueue.waitForData() @bci=13, line=114 (Interpreted frame) - org.apache.hadoop.metrics2.impl.SinkQueue.consumeAll(org.apache.hadoop.metrics2.impl.SinkQueue$Consumer) @bci=1, line=83 (Interpreted frame) - org.apache.hadoop.metrics2.impl.MetricsSinkAdapter.publishMetricsFromQueue() @bci=46, line=127 (Interpreted frame) - org.apache.hadoop.metrics2.impl.MetricsSinkAdapter$1.run() @bci=4, line=86 (Interpreted frame) Thread 6394: (state = BLOCKED) Thread 6393: (state = BLOCKED) - java.lang.Object.wait(long) @bci=0 (Interpreted frame) - java.lang.ref.ReferenceQueue.remove(long) @bci=44, line=135 (Interpreted frame) - java.lang.ref.ReferenceQueue.remove() @bci=2, line=151 (Interpreted frame) - java.lang.ref.Finalizer$FinalizerThread.run() @bci=36, line=209 (Interpreted frame) Thread 6392: (state = BLOCKED) - java.lang.Object.wait(long) @bci=0 (Interpreted frame) - java.lang.Object.wait() @bci=2, line=503 (Interpreted frame) - java.lang.ref.Reference$ReferenceHandler.run() @bci=46, line=133 (Interpreted frame) 
Thread 6372: (state = IN_NATIVE) - sun.nio.ch.EPollArrayWrapper.epollWait(long, int, long, int) @bci=0 (Interpreted frame) - sun.nio.ch.EPollArrayWrapper.poll(long) @bci=18, line=269 (Interpreted frame) - sun.nio.ch.EPollSelectorImpl.doSelect(long) @bci=28, line=79 (Interpreted frame) - sun.nio.ch.SelectorImpl.lockAndDoSelect(long) @bci=37, line=87 (Interpreted frame) - sun.nio.ch.SelectorImpl.select(long) @bci=30, line=98 (Interpreted frame) - org.apache.hadoop.net.SocketIOWithTimeout$SelectorPool.select(java.nio.channels.SelectableChannel, int, long) @bci=46, line=335 (Interpreted frame) - org.apache.hadoop.net.SocketIOWithTimeout.doIO(java.nio.ByteBuffer, int) @bci=80, line=157 (Interpreted frame) - org.apache.hadoop.net.SocketInputStream.read(java.nio.ByteBuffer) @bci=6, line=161 (Interpreted frame) - org.apache.hadoop.hdfs.protocol.datatransfer.PacketReceiver.readChannelFully(java.nio.channels.ReadableByteChannel, java.nio.ByteBuffer) @bci=9, line=258 (Interpreted frame) - org.apache.hadoop.hdfs.protocol.datatransfer.PacketReceiver.doReadFully(java.nio.channels.ReadableByteChannel, java.io.InputStream, java.nio.ByteBuffer) @bci=6, line=209 (Interpreted frame) - org.apache.hadoop.hdfs.protocol.datatransfer.PacketReceiver.doRead(java.nio.channels.ReadableByteChannel, java.io.InputStream) @bci=293, line=171 (Interpreted frame) - org.apache.hadoop.hdfs.protocol.datatransfer.PacketReceiver.receiveNextPacket(java.nio.channels.ReadableByteChannel) @bci=3, line=102 (Interpreted frame) - org.apache.hadoop.hdfs.RemoteBlockReader2.readNextPacket() @bci=8, line=173 (Interpreted frame) - org.apache.hadoop.hdfs.RemoteBlockReader2.read(byte[], int, int) @bci=27, line=138 (Interpreted frame) - org.apache.hadoop.hdfs.DFSInputStream$ByteArrayStrategy.doRead(org.apache.hadoop.hdfs.BlockReader, int, int, org.apache.hadoop.hdfs.DFSInputStream$ReadStatistics) @bci=7, line=683 (Interpreted frame) - 
org.apache.hadoop.hdfs.DFSInputStream.readBuffer(org.apache.hadoop.hdfs.DFSInputStream$ReaderStrategy, int, int, java.util.Map) @bci=14, line=739 (Interpreted frame) - org.apache.hadoop.hdfs.DFSInputStream.readWithStrategy(org.apache.hadoop.hdfs.DFSInputStream$ReaderStrategy, int, int) @bci=141, line=796 (Interpreted frame) - org.apache.hadoop.hdfs.DFSInputStream.read(byte[], int, int) @bci=15, line=837 (Interpreted frame) - java.io.DataInputStream.readFully(byte[], int, int) @bci=34, line=195 (Compiled frame) - org.apache.hadoop.io.DataOutputBuffer$Buffer.write(java.io.DataInput, int) @bci=62, line=70 (Interpreted frame) - org.apache.hadoop.io.DataOutputBuffer.write(java.io.DataInput, int) @bci=6, line=120 (Interpreted frame) - org.apache.hadoop.io.SequenceFile$Reader.next(org.apache.hadoop.io.DataOutputBuffer) @bci=43, line=2358 (Interpreted frame) - org.apache.hadoop.io.SequenceFile$Reader.next(org.apache.hadoop.io.Writable) @bci=77, line=2257 (Interpreted frame) - org.apache.hadoop.io.SequenceFile$Reader.next(org.apache.hadoop.io.Writable, org.apache.hadoop.io.Writable) @bci=52, line=2303 (Interpreted frame) - org.apache.mahout.common.iterator.sequencefile.SequenceFileValueIterator.computeNext() @bci=44, line=81 (Interpreted frame) - org.apache.mahout.common.iterator.sequencefile.SequenceFileValueIterator.computeNext() @bci=1, line=37 (Interpreted frame) - com.google.common.collect.AbstractIterator.tryToComputeNext() @bci=9, line=143 (Interpreted frame) - com.google.common.collect.AbstractIterator.hasNext() @bci=61, line=138 (Interpreted frame) - org.apache.mahout.math.hadoop.stochasticsvd.qr.QRLastStep.loadNextQt() @bci=4, line=86 (Interpreted frame) - org.apache.mahout.math.hadoop.stochasticsvd.qr.QRLastStep.hasNext() @bci=36, line=112 (Compiled frame) - org.apache.mahout.math.hadoop.stochasticsvd.qr.QRLastStep.next() @bci=16, line=123 (Compiled frame) - org.apache.mahout.math.hadoop.stochasticsvd.BtJob$BtMapper.map(org.apache.hadoop.io.Writable, 
org.apache.mahout.math.VectorWritable, org.apache.hadoop.mapreduce.Mapper$Context) @bci=15, line=134 (Compiled frame) - org.apache.mahout.math.hadoop.stochasticsvd.BtJob$BtMapper.map(java.lang.Object, java.lang.Object, org.apache.hadoop.mapreduce.Mapper$Context) @bci=10, line=103 (Compiled frame) - org.apache.hadoop.mapreduce.Mapper.run(org.apache.hadoop.mapreduce.Mapper$Context) @bci=22, line=145 (Compiled frame) - org.apache.hadoop.mapred.MapTask.runNewMapper(org.apache.hadoop.mapred.JobConf, org.apache.hadoop.mapreduce.split.JobSplit$TaskSplitIndex, org.apache.hadoop.mapred.TaskUmbilicalProtocol, org.apache.hadoop.mapred.Task$TaskReporter) @bci=228, line=764 (Interpreted frame) - org.apache.hadoop.mapred.MapTask.run(org.apache.hadoop.mapred.JobConf, org.apache.hadoop.mapred.TaskUmbilicalProtocol) @bci=148, line=340 (Interpreted frame) - org.apache.hadoop.mapred.YarnChild$2.run() @bci=29, line=167 (Interpreted frame) - java.security.AccessController.doPrivileged(java.security.PrivilegedExceptionAction, java.security.AccessControlContext) @bci=0 (Interpreted frame) - javax.security.auth.Subject.doAs(javax.security.auth.Subject, java.security.PrivilegedExceptionAction) @bci=42, line=415 (Interpreted frame) - org.apache.hadoop.security.UserGroupInformation.doAs(java.security.PrivilegedExceptionAction) @bci=14, line=1650 (Interpreted frame) - org.apache.hadoop.mapred.YarnChild.main(java.lang.String[]) @bci=514, line=162 (Interpreted frame) This sounds like a bug? 
On Wed, Oct 15, 2014 at 3:24 PM, Yang <[email protected]> wrote: > > > > attempt_1413267265041_14045_m_000008_166.70RUNNINGmap > map > phxaishdc9dn1896.stratus.phx.ebay.com:50060logs > <https://phxaishdc9dn1896.stratus.phx.ebay.com:50060/node/containerlogs/container_1413267265041_14045_01_000113/yyang15>Wed, > 15 Oct 2014 20:54:44 GMTN/A1hrs, 27mins, 29sec > attempt_1413267265041_14045_m_000031_066.70RUNNINGmap > map > phxaishdc9dn1440.stratus.phx.ebay.com:50060logs > <https://phxaishdc9dn1440.stratus.phx.ebay.com:50060/node/containerlogs/container_1413267265041_14045_01_000075/yyang15>Wed, > 15 Oct 2014 15:49:19 GMTN/A6hrs, 32mins, 53sec > attempt_1413267265041_14045_m_000036_066.70RUNNINGmap > map > phxaishdc9dn0440.phx.ebay.com:50060logs > <https://phxaishdc9dn0440.phx.ebay.com:50060/node/containerlogs/container_1413267265041_14045_01_000079/yyang15>Wed, > 15 Oct 2014 15:49:19 GMTN/A6hrs, 32mins, 53sec > attempt_1413267265041_14045_m_000070_066.70RUNNINGmap > map > phxaishdc9dn1137.stratus.phx.ebay.com:50060logs > <https://phxaishdc9dn1137.stratus.phx.ebay.com:50060/node/containerlogs/container_1413267265041_14045_01_000081/yyang15>Wed, > 15 Oct 2014 15:49:19 GMTN/A6hrs, 32mins, 54sec > attempt_1413267265041_14045_m_000018_066.70RUNNINGmap > map > phxaishdc9dn1278.stratus.phx.ebay.com:50060 > > > > > logs > <https://phxaishdc9dn1278.stratus.phx.ebay.com:50060/node/containerlogs/container_1413267265041_14045_01_000012/yyang15>Wed, > 15 Oct 2014 15:49:19 GMTN/A6hrs, 32mins, 54sec > > > > > > the logs show > > 2014-10-15 13:55:01,130 INFO [main] org.apache.hadoop.mapred.MapTask: kvstart > = 268435452; length = 67108864 > 2014-10-15 13:55:01,278 INFO [main] > org.apache.hadoop.io.compress.zlib.ZlibFactory: Successfully loaded & > initialized native-zlib library > 2014-10-15 13:55:01,279 INFO [main] org.apache.hadoop.io.compress.CodecPool: > Got brand-new decompressor [.gz] > 2014-10-15 13:55:01,288 INFO [main] org.apache.hadoop.io.compress.CodecPool: > Got brand-new 
decompressor [.gz] > 2014-10-15 13:55:01,289 INFO [main] org.apache.hadoop.io.compress.CodecPool: > Got brand-new decompressor [.gz] > 2014-10-15 13:55:01,289 INFO [main] org.apache.hadoop.io.compress.CodecPool: > Got brand-new decompressor [.gz] > 2014-10-15 13:55:01,389 INFO [main] org.apache.hadoop.io.compress.CodecPool: > Got brand-new decompressor [.deflate] > 2014-10-15 13:55:01,501 INFO [main] > org.apache.hadoop.conf.Configuration.deprecation: fs.default.name is > deprecated. Instead, use fs.defaultFS > 2014-10-15 13:55:01,557 INFO [main] org.apache.hadoop.io.compress.CodecPool: > Got brand-new decompressor [.deflate] > 2014-10-15 13:57:54,066 INFO [main] org.apache.hadoop.ipc.Client: Retrying > connect to server: apollo-phx-nn.vip.ebay.com/10.115.201.75:8020. Already > tried 0 time(s); maxRetries=45 > 2014-10-15 13:58:14,086 INFO [main] org.apache.hadoop.ipc.Client: Retrying > connect to server: apollo-phx-nn.vip.ebay.com/10.115.201.75:8020. Already > tried 1 time(s); maxRetries=45 > 2014-10-15 13:58:14,123 INFO [main] org.apache.hadoop.io.compress.CodecPool: > Got brand-new compressor [.deflate] > 2014-10-15 14:23:59,883 INFO > [LeaseRenewer:[email protected]:8020] > org.apache.hadoop.ipc.Client: Retrying connect to server: > apollo-phx-nn.vip.ebay.com/10.115.201.75:8020. Already tried 0 time(s); > maxRetries=45 > 2014-10-15 14:24:19,903 INFO > [LeaseRenewer:[email protected]:8020] > org.apache.hadoop.ipc.Client: Retrying connect to server: > apollo-phx-nn.vip.ebay.com/10.115.201.75:8020. Already tried 1 time(s); > maxRetries=45 > 2014-10-15 14:24:39,924 INFO > [LeaseRenewer:[email protected]:8020] > org.apache.hadoop.ipc.Client: Retrying connect to server: > apollo-phx-nn.vip.ebay.com/10.115.201.75:8020. Already tried 2 time(s); > maxRetries=45 > > > > I actually killed one of the attempts and it restarted, but again froze at > 66.70% > >
