[
https://issues.apache.org/jira/browse/MAPREDUCE-4801?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=13498300#comment-13498300
]
Jason Lowe commented on MAPREDUCE-4801:
---------------------------------------
Some sample exceptions:
{noformat}
2012-11-15 12:47:02,365 [New I/O server worker #1-14] ERROR
org.apache.hadoop.mapred.ShuffleHandler: Shuffle error:
java.io.IOException: Connection reset by peer
at sun.nio.ch.FileDispatcher.read0(Native Method)
at sun.nio.ch.SocketDispatcher.read(SocketDispatcher.java:21)
at sun.nio.ch.IOUtil.readIntoNativeBuffer(IOUtil.java:237)
at sun.nio.ch.IOUtil.read(IOUtil.java:204)
at sun.nio.ch.SocketChannelImpl.read(SocketChannelImpl.java:236)
at org.jboss.netty.channel.socket.nio.NioWorker.read(NioWorker.java:321)
at
org.jboss.netty.channel.socket.nio.NioWorker.processSelectedKeys(NioWorker.java:280)
at org.jboss.netty.channel.socket.nio.NioWorker.run(NioWorker.java:200)
at
java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:886)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:908)
at java.lang.Thread.run(Thread.java:619)
2012-11-15 12:47:02,366 [New I/O server worker #1-14] ERROR
org.apache.hadoop.mapred.ShuffleHandler: Shuffle error [id: 0x01901fc1,
/xx.xx.xx.xx:xx => /xx.xx.xx.xx:xx] EXCEPTION: java.io.IOException: Connection
reset by peer
2012-11-15 12:47:02,366 [New I/O server worker #1-14] ERROR
org.apache.hadoop.mapred.ShuffleHandler: Shuffle error:
java.nio.channels.ClosedChannelException
at
org.jboss.netty.channel.socket.nio.NioWorker.cleanUpWriteBuffer(NioWorker.java:616)
at
org.jboss.netty.channel.socket.nio.NioWorker.close(NioWorker.java:592)
at org.jboss.netty.channel.socket.nio.NioWorker.read(NioWorker.java:355)
at
org.jboss.netty.channel.socket.nio.NioWorker.processSelectedKeys(NioWorker.java:280)
at org.jboss.netty.channel.socket.nio.NioWorker.run(NioWorker.java:200)
at
java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:886)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:908)
at java.lang.Thread.run(Thread.java:619)
...
2012-11-15 12:47:02,367 [New I/O server worker #1-15] ERROR
org.apache.hadoop.mapred.ShuffleHandler: Shuffle error:
java.io.IOException: Broken pipe
at sun.nio.ch.FileDispatcher.write0(Native Method)
at sun.nio.ch.SocketDispatcher.write(SocketDispatcher.java:29)
at sun.nio.ch.IOUtil.writeFromNativeBuffer(IOUtil.java:100)
at sun.nio.ch.IOUtil.write(IOUtil.java:56)
at sun.nio.ch.SocketChannelImpl.write(SocketChannelImpl.java:334)
at
org.jboss.netty.channel.socket.nio.SocketSendBufferPool$PooledSendBuffer.transferTo(SocketSendBufferPool.java:239)
at
org.jboss.netty.channel.socket.nio.NioWorker.write0(NioWorker.java:469)
at
org.jboss.netty.channel.socket.nio.NioWorker.writeFromUserCode(NioWorker.java:387)
at
org.jboss.netty.channel.socket.nio.NioServerSocketPipelineSink.handleAcceptedSocket(NioServerSocketPipelineSink.java:137)
at
org.jboss.netty.channel.socket.nio.NioServerSocketPipelineSink.eventSunk(NioServerSocketPipelineSink.java:76)
at
org.jboss.netty.handler.codec.oneone.OneToOneEncoder.handleDownstream(OneToOneEncoder.java:68)
at
org.jboss.netty.handler.stream.ChunkedWriteHandler.flush(ChunkedWriteHandler.java:255)
at
org.jboss.netty.handler.stream.ChunkedWriteHandler.handleDownstream(ChunkedWriteHandler.java:124)
at org.jboss.netty.channel.Channels.write(Channels.java:611)
at org.jboss.netty.channel.Channels.write(Channels.java:578)
at
org.jboss.netty.channel.AbstractChannel.write(AbstractChannel.java:259)
at
org.apache.hadoop.mapred.ShuffleHandler$Shuffle.sendMapOutput(ShuffleHandler.java:477)
at
org.apache.hadoop.mapred.ShuffleHandler$Shuffle.messageReceived(ShuffleHandler.java:397)
at
org.jboss.netty.handler.stream.ChunkedWriteHandler.handleUpstream(ChunkedWriteHandler.java:148)
at
org.jboss.netty.handler.codec.http.HttpChunkAggregator.messageReceived(HttpChunkAggregator.java:116)
at
org.jboss.netty.channel.Channels.fireMessageReceived(Channels.java:302)
at
org.jboss.netty.handler.codec.replay.ReplayingDecoder.unfoldAndfireMessageReceived(ReplayingDecoder.java:522)
at
org.jboss.netty.handler.codec.replay.ReplayingDecoder.callDecode(ReplayingDecoder.java:506)
at
org.jboss.netty.handler.codec.replay.ReplayingDecoder.messageReceived(ReplayingDecoder.java:443)
at
org.jboss.netty.channel.Channels.fireMessageReceived(Channels.java:274)
at
org.jboss.netty.channel.Channels.fireMessageReceived(Channels.java:261)
at org.jboss.netty.channel.socket.nio.NioWorker.read(NioWorker.java:349)
at
org.jboss.netty.channel.socket.nio.NioWorker.processSelectedKeys(NioWorker.java:280)
at org.jboss.netty.channel.socket.nio.NioWorker.run(NioWorker.java:200)
at
java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:886)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:908)
at java.lang.Thread.run(Thread.java:619)
2012-11-15 12:47:02,367 [New I/O server worker #1-15] ERROR
org.apache.hadoop.mapred.ShuffleHandler: Shuffle error [id: 0x01fd225b,
/xx.xx.xx.xx:xx => /xx.xx.xx.xx:xx] EXCEPTION: java.io.IOException: Broken pipe
2012-11-15 12:47:02,367 [New I/O server worker #1-15] ERROR
org.apache.hadoop.mapred.ShuffleHandler: Shuffle error:
java.nio.channels.ClosedChannelException
at
org.jboss.netty.channel.socket.nio.NioWorker.cleanUpWriteBuffer(NioWorker.java:636)
at
org.jboss.netty.channel.socket.nio.NioWorker.close(NioWorker.java:592)
at
org.jboss.netty.channel.socket.nio.NioWorker.write0(NioWorker.java:512)
at
org.jboss.netty.channel.socket.nio.NioWorker.writeFromUserCode(NioWorker.java:387)
at
org.jboss.netty.channel.socket.nio.NioServerSocketPipelineSink.handleAcceptedSocket(NioServerSocketPipelineSink.java:137)
at
org.jboss.netty.channel.socket.nio.NioServerSocketPipelineSink.eventSunk(NioServerSocketPipelineSink.java:76)
at
org.jboss.netty.handler.codec.oneone.OneToOneEncoder.handleDownstream(OneToOneEncoder.java:68)
at
org.jboss.netty.handler.stream.ChunkedWriteHandler.flush(ChunkedWriteHandler.java:255)
at
org.jboss.netty.handler.stream.ChunkedWriteHandler.handleDownstream(ChunkedWriteHandler.java:124)
at org.jboss.netty.channel.Channels.write(Channels.java:611)
at org.jboss.netty.channel.Channels.write(Channels.java:578)
at
org.jboss.netty.channel.AbstractChannel.write(AbstractChannel.java:259)
at
org.apache.hadoop.mapred.ShuffleHandler$Shuffle.sendMapOutput(ShuffleHandler.java:477)
at
org.apache.hadoop.mapred.ShuffleHandler$Shuffle.messageReceived(ShuffleHandler.java:397)
at
org.jboss.netty.handler.stream.ChunkedWriteHandler.handleUpstream(ChunkedWriteHandler.java:148)
at
org.jboss.netty.handler.codec.http.HttpChunkAggregator.messageReceived(HttpChunkAggregator.java:116)
at
org.jboss.netty.channel.Channels.fireMessageReceived(Channels.java:302)
at
org.jboss.netty.handler.codec.replay.ReplayingDecoder.unfoldAndfireMessageReceived(ReplayingDecoder.java:522)
at
org.jboss.netty.handler.codec.replay.ReplayingDecoder.callDecode(ReplayingDecoder.java:506)
at
org.jboss.netty.handler.codec.replay.ReplayingDecoder.messageReceived(ReplayingDecoder.java:443)
at
org.jboss.netty.channel.Channels.fireMessageReceived(Channels.java:274)
at
org.jboss.netty.channel.Channels.fireMessageReceived(Channels.java:261)
at org.jboss.netty.channel.socket.nio.NioWorker.read(NioWorker.java:349)
at
org.jboss.netty.channel.socket.nio.NioWorker.processSelectedKeys(NioWorker.java:280)
at org.jboss.netty.channel.socket.nio.NioWorker.run(NioWorker.java:200)
at
java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:886)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:908)
at java.lang.Thread.run(Thread.java:619)
{noformat}
> ShuffleHandler can generate large logs due to prematurely closed channels
> -------------------------------------------------------------------------
>
> Key: MAPREDUCE-4801
> URL: https://issues.apache.org/jira/browse/MAPREDUCE-4801
> Project: Hadoop Map/Reduce
> Issue Type: Bug
> Affects Versions: 0.23.3, 2.0.1-alpha
> Reporter: Jason Lowe
> Priority: Critical
>
> We ran into an instance where many nodes on a cluster ran out of disk space
> because the nodemanager logs were huge. Examining the logs showed many, many
> shuffle errors due to either ClosedChannelException or IOException from
> "Connection reset by peer" or "Broken pipe".
--
This message is automatically generated by JIRA.
If you think it was sent incorrectly, please contact your JIRA administrators
For more information on JIRA, see: http://www.atlassian.com/software/jira