Hi all,
We use hbase 0.9.2. We recently started to experience region servers
crashed under heavy load (2-3 different servers crashes eah load).
Seems like missing block in HDFS causes a full GC and regions are being
closed.
Following logs sample from the region server (gc log, region server log)
and data node log.
gc.log:
82619.081: [Full GC 82619.081: [CMS: 7973415K->7973415K(8005248K),
8.7188750 secs] 8304143K->8304130K(8350272K), [CMS Perm :
20159K->20153K(33976K)] icms_dc=100 , 8.7189890 secs] [Times: user=8.71
sys=0.00, real=8.72 secs]
82627.801: [Full GC 82627.801: [CMS: 7973415K->7973414K(8005248K),
12.2467710 secs] 8305494K->8304129K(8350272K), [CMS Perm :
20153K->20153K(33976K)] icms_dc=100 , 12.2468820 secs] [Times: user=12.24
sys=0.00, real=12.25 secs]
82640.048: [Full GC 82640.048: [CMS: 7973414K->7973414K(8005248K),
8.3197090 secs] 8304129K->8304129K(8350272K), [CMS Perm :
20153K->20153K(33976K)] icms_dc=100 , 8.3198190 secs] [Times: user=8.32
sys=0.00, real=8.32 secs]
82648.369: [Full GC 82648.369: [CMS: 7973414K->7973414K(8005248K),
8.2264360 secs] 8304237K->8304130K(8350272K), [CMS Perm :
20153K->20153K(33976K)] icms_dc=100 , 8.2265410 secs] [Times: user=8.22
sys=0.01, real=8.22 secs]
82656.596: [Full GC 82656.596: [CMS: 7973414K->7973414K(8005248K),
8.4928260 secs] 8304130K->8304130K(8350272K), [CMS Perm :
20153K->20153K(33976K)] icms_dc=100 , 8.4929270 secs] [Times: user=8.48
sys=0.01, real=8.49 secs]
82665.089: [Full GC 82665.089: [CMS: 7973414K->7973414K(8005248K),
8.3110610 secs] 8304132K->8304130K(8350272K), [CMS Perm :
20153K->20153K(33976K)] icms_dc=100 , 8.3111690 secs] [Times: user=8.29
sys=0.02, real=8.31 secs]
82673.400: [Full GC 82673.400: [CMS: 7973414K->7973414K(8005248K),
8.5270500 secs] 8304130K->8304130K(8350272K), [CMS Perm :
20153K->20153K(33976K)] icms_dc=100 , 8.5271560 secs] [Times: user=8.52
sys=0.00, real=8.53 secs]
82681.929: [Full GC 82681.929: [CMS (concurrent mode failure):
7973414K->7973414K(8005248K), 12.0992320 secs]
8305391K->8304128K(8350272K), [CMS Perm : 20153K->20153K(33976K)]
icms_dc=100 , 12.0993380 secs] [Times: user=12.08 sys=0.03, real=12.10 secs]
region server log:
2012-03-25 07:25:28,729 WARN org.apache.hadoop.ipc.HBaseServer: IPC Server
handler 8 on 8041 caught: java.nio.channels.ClosedChannelException
at
sun.nio.ch.SocketChannelImpl.ensureWriteOpen(SocketChannelImpl.java:133)
at sun.nio.ch.SocketChannelImpl.write(SocketChannelImpl.java:324)
at
org.apache.hadoop.hbase.ipc.HBaseServer.channelIO(HBaseServer.java:1387)
at
org.apache.hadoop.hbase.ipc.HBaseServer.channelWrite(HBaseServer.java:1339)
at
org.apache.hadoop.hbase.ipc.HBaseServer$Responder.processResponse(HBaseServer.java:727)
at
org.apache.hadoop.hbase.ipc.HBaseServer$Responder.doRespond(HBaseServer.java:792)
at
org.apache.hadoop.hbase.ipc.HBaseServer$Handler.run(HBaseServer.java:1083)
2012-03-25 07:31:36,931 WARN org.apache.hadoop.hdfs.DFSClient:
DFSOutputStream ResponseProcessor exception for block
blk_9110507610649672616_2064873java.io.EOFException
at java.io.DataInputStream.readFully(DataInputStream.java:180)
at java.io.DataInputStream.readLong(DataInputStream.java:399)
at
org.apache.hadoop.hdfs.protocol.DataTransferProtocol$PipelineAck.readFields(DataTransferProtocol.java:122)
at
org.apache.hadoop.hdfs.DFSClient$DFSOutputStream$ResponseProcessor.run(DFSClient.java:2548)
org.apache.hadoop.ipc.RemoteException:
org.apache.hadoop.ipc.RemoteException: java.io.IOException:
blk_-6219291421501721811_2065530 is already commited, storedBlock == null.
at
org.apache.hadoop.hdfs.server.namenode.FSNamesystem.nextGenerationStampForBlock(FSNamesystem.java:4877)
at
org.apache.hadoop.hdfs.server.namenode.NameNode.nextGenerationStamp(NameNode.java:501)
at sun.reflect.GeneratedMethodAccessor24.invoke(Unknown Source)
at
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:25)
at java.lang.reflect.Method.invoke(Method.java:597)
at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:508)
at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:961)
at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:957)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:396)
at org.apache.hadoop.ipc.Server$Handler.run(Server.java:955)
at org.apache.hadoop.ipc.Client.call(Client.java:740)
at org.apache.hadoop.ipc.RPC$Invoker.invoke(RPC.java:220)
at $Proxy4.nextGenerationStamp(Unknown Source)
at
org.apache.hadoop.hdfs.server.datanode.DataNode.syncBlock(DataNode.java:1577)
at
org.apache.hadoop.hdfs.server.datanode.DataNode.recoverBlock(DataNode.java:1551)
at
org.apache.hadoop.hdfs.server.datanode.DataNode.recoverBlock(DataNode.java:1617)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:39)
at
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:25)
at java.lang.reflect.Method.invoke(Method.java:597)
at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:508)
at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:961)
at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:957)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:396)
at org.apache.hadoop.ipc.Server$Handler.run(Server.java:955)
at org.apache.hadoop.ipc.Client.call(Client.java:740)
at org.apache.hadoop.ipc.RPC$Invoker.invoke(RPC.java:220)
at $Proxy9.recoverBlock(Unknown Source)
at
org.apache.hadoop.hdfs.DFSClient$DFSOutputStream.processDatanodeError(DFSClient.java:2706)
at
org.apache.hadoop.hdfs.DFSClient$DFSOutputStream.access$1500(DFSClient.java:2173)
at
org.apache.hadoop.hdfs.DFSClient$DFSOutputStream$DataStreamer.run(DFSClient.java:2372)
2012-03-25 07:32:20,167 INFO org.apache.hadoop.hbase.regionserver.HRegion:
Closed URLS,20120117_www.u,1326899258350.9f23cb209671c5f85020a7bb6f31800e.
.... //closing all regions (Lior)
2012-03-25 07:32:21,192 WARN org.apache.hadoop.hdfs.DFSClient: Failed
recovery attempt #2 from primary datanode 10.11.87.71:50010
org.apache.hadoop.ipc.RemoteException:
org.apache.hadoop.ipc.RemoteException: java.io.IOException: Block
(=blk_9110507610649672616_2064873) not found
at
org.apache.hadoop.hdfs.server.namenode.FSNamesystem.commitBlockSynchronization(FSNamesystem.java:2038)
at
org.apache.hadoop.hdfs.server.namenode.NameNode.commitBlockSynchronization(NameNode.java:509)
at sun.reflect.GeneratedMethodAccessor22.invoke(Unknown Source)
at
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:25)
at java.lang.reflect.Method.invoke(Method.java:597)
at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:508)
at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:961)
at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:957)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:396)
at org.apache.hadoop.ipc.Server$Handler.run(Server.java:955)
at org.apache.hadoop.ipc.Client.call(Client.java:740)
at org.apache.hadoop.ipc.RPC$Invoker.invoke(RPC.java:220)
at $Proxy4.commitBlockSynchronization(Unknown Source)
at
org.apache.hadoop.hdfs.server.datanode.DataNode.syncBlock(DataNode.java:1570)
at
org.apache.hadoop.hdfs.server.datanode.DataNode.recoverBlock(DataNode.java:1551)
at
org.apache.hadoop.hdfs.server.datanode.DataNode.recoverBlock(DataNode.java:1617)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:39)
at
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:25)
at java.lang.reflect.Method.invoke(Method.java:597)
at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:508)
at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:961)
at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:957)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:396)
at org.apache.hadoop.ipc.Server$Handler.run(Server.java:955)
at org.apache.hadoop.ipc.Client.call(Client.java:740)
at org.apache.hadoop.ipc.RPC$Invoker.invoke(RPC.java:220)
at $Proxy9.recoverBlock(Unknown Source)
at
org.apache.hadoop.hdfs.DFSClient$DFSOutputStream.processDatanodeError(DFSClient.java:2706)
at
org.apache.hadoop.hdfs.DFSClient$DFSOutputStream.access$1500(DFSClient.java:2173)
at
org.apache.hadoop.hdfs.DFSClient$DFSOutputStream$DataStreamer.run(DFSClient.java:2372)
data node log:
2012-03-25 07:32:19,301 ERROR
org.apache.hadoop.hdfs.server.datanode.DataNode: DatanodeRegistration(
10.11.87.60:50010, storageID=DS-713809212-10.11.87.61-50010-1303380680379,
infoPort=8011, ipcPort=50020):DataXceiver
java.io.IOException: Connection reset by peer
at sun.nio.ch.FileDispatcher.read0(Native Method)
at sun.nio.ch.SocketDispatcher.read(SocketDispatcher.java:21)
at sun.nio.ch.IOUtil.readIntoNativeBuffer(IOUtil.java:202)
at sun.nio.ch.IOUtil.read(IOUtil.java:175)
at sun.nio.ch.SocketChannelImpl.read(SocketChannelImpl.java:243)
at
org.apache.hadoop.net.SocketInputStream$Reader.performIO(SocketInputStream.java:55)
at
org.apache.hadoop.net.SocketIOWithTimeout.doIO(SocketIOWithTimeout.java:142)
at
org.apache.hadoop.net.SocketInputStream.read(SocketInputStream.java:155)
at
org.apache.hadoop.net.SocketInputStream.read(SocketInputStream.java:128)
at java.io.BufferedInputStream.read1(BufferedInputStream.java:256)
at java.io.BufferedInputStream.read(BufferedInputStream.java:317)
at java.io.DataInputStream.read(DataInputStream.java:132)
at
org.apache.hadoop.hdfs.server.datanode.BlockReceiver.readToBuf(BlockReceiver.java:265)
at
org.apache.hadoop.hdfs.server.datanode.BlockReceiver.readNextPacket(BlockReceiver.java:312)
at
org.apache.hadoop.hdfs.server.datanode.BlockReceiver.receivePacket(BlockReceiver.java:376)
at
org.apache.hadoop.hdfs.server.datanode.BlockReceiver.receiveBlock(BlockReceiver.java:533)
at
org.apache.hadoop.hdfs.server.datanode.DataXceiver.writeBlock(DataXceiver.java:358)
at
org.apache.hadoop.hdfs.server.datanode.DataXceiver.run(DataXceiver.java:103)
at java.lang.Thread.run(Thread.java:662)
2012-03-25 07:32:19,620 INFO
org.apache.hadoop.hdfs.server.datanode.DataNode: Client calls
recoverBlock(block=blk_-6219291421501721811_2065530, targets=[
10.11.87.60:50010, 10.11.87.79:50010])
2012-03-25 07:32:19,625 INFO org.apache.hadoop.ipc.Server: IPC Server
handler 2 on 50020, call recoverBlock(blk_-6219291421501721811_2065530,
false, [Lorg.apache.hadoop.hdfs.protocol.DatanodeInfo;@7ed06553) from
10.11.87.60:58991: error:
org.apache.hadoop.ipc.RemoteException: java.io.IOException:
blk_-6219291421501721811_2065530 is already commited, storedBlock == null.
at
org.apache.hadoop.hdfs.server.namenode.FSNamesystem.nextGenerationStampForBlock(FSNamesystem.java:4877)
at
org.apache.hadoop.hdfs.server.namenode.NameNode.nextGenerationStamp(NameNode.java:501)
at sun.reflect.GeneratedMethodAccessor24.invoke(Unknown Source)
at
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:25)
at java.lang.reflect.Method.invoke(Method.java:597)
at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:508)
at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:961)
at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:957)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:396)
at org.apache.hadoop.ipc.Server$Handler.run(Server.java:955)