When using HFileOutputFormat, HMaster often crashes. I guess this critical problem is caused by the timeout of Zookeeper.
Does anyone know what may have caused this? How might I prevent this from happening again? HBase Version : 0.90.3, r1100350 Hadoop Version : 0.20.3-SNAPSHOT, r1057313 Some snippets from the logs are pasted below Thanks 2011-06-29 02:26:40,454 INFO org.apache.zookeeper.ClientCnxn: Client session timed out, have not heard from server in 200000ms for sessionid 0x130d6fba3cc0001, closing socket connection and attempting reconnect 2011-06-29 02:26:40,557 WARN org.apache.hadoop.hbase.zookeeper.ZKUtil: master:60000-0x130d6fba3cc0001-0x130d6fba3cc0001-0x130d6fba3cc0001-0x130d6fba3cc0001-0x130d6fba3cc0001 Unable to get data of znode /hbase/unassigned/3624b719752997d87cbba6529edcd2a7 org.apache.zookeeper.KeeperException$ConnectionLossException: KeeperErrorCode = ConnectionLoss for /hbase/unassigned/3624b719752997d87cbba6529edcd2a7 at org.apache.zookeeper.KeeperException.create(KeeperException.java:90) at org.apache.zookeeper.KeeperException.create(KeeperException.java:42) at org.apache.zookeeper.ZooKeeper.getData(ZooKeeper.java:921) at org.apache.hadoop.hbase.zookeeper.ZKUtil.getDataNoWatch(ZKUtil.java:586) at org.apache.hadoop.hbase.zookeeper.ZKAssign.getDataNoWatch(ZKAssign.java:770) at org.apache.hadoop.hbase.master.AssignmentManager$TimeoutMonitor.chore(AssignmentManager.java:1709) at org.apache.hadoop.hbase.Chore.run(Chore.java:66) 2011-06-29 02:26:40,557 FATAL org.apache.hadoop.hbase.master.HMaster: Unexpected ZK exception creating/setting node OFFLINE org.apache.zookeeper.KeeperException$ConnectionLossException: KeeperErrorCode = ConnectionLoss for /hbase/unassigned/f3523f7dd3eeaae3bb3c66907227e8b0 at org.apache.zookeeper.KeeperException.create(KeeperException.java:90) at org.apache.zookeeper.KeeperException.create(KeeperException.java:42) at org.apache.zookeeper.ZooKeeper.create(ZooKeeper.java:637) at org.apache.hadoop.hbase.zookeeper.ZKUtil.createAndWatch(ZKUtil.java:856) at org.apache.hadoop.hbase.zookeeper.ZKAssign.createOrForceNodeOffline(ZKAssign.java:246) at org.apache.hadoop.hbase.master.AssignmentManager.setOfflineInZooKeeper(AssignmentManager.java:967) at org.apache.hadoop.hbase.master.AssignmentManager.assign(AssignmentManager.java:918) at org.apache.hadoop.hbase.master.AssignmentManager.assign(AssignmentManager.java:746) at org.apache.hadoop.hbase.master.AssignmentManager.assign(AssignmentManager.java:726) at org.apache.hadoop.hbase.master.handler.ServerShutdownHandler.process(ServerShutdownHandler.java:154) at org.apache.hadoop.hbase.executor.EventHandler.run(EventHandler.java:156) at java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:886) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:908) at java.lang.Thread.run(Thread.java:662) 2011-06-29 02:26:40,557 INFO org.apache.hadoop.hbase.master.HMaster: Aborting 2011-06-29 02:26:40,557 ERROR org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher: master:60000-0x130d6fba3cc0001-0x130d6fba3cc0001-0x130d6fba3cc0001-0x130d6fba3cc0001-0x130d6fba3cc0001 Received unexpected KeeperException, re-throwing exception org.apache.zookeeper.KeeperException$ConnectionLossException: KeeperErrorCode = ConnectionLoss for /hbase/unassigned/3624b719752997d87cbba6529edcd2a7 at org.apache.zookeeper.KeeperException.create(KeeperException.java:90) at org.apache.zookeeper.KeeperException.create(KeeperException.java:42) at org.apache.zookeeper.ZooKeeper.getData(ZooKeeper.java:921) at org.apache.hadoop.hbase.zookeeper.ZKUtil.getDataNoWatch(ZKUtil.java:586) at org.apache.hadoop.hbase.zookeeper.ZKAssign.getDataNoWatch(ZKAssign.java:770) at org.apache.hadoop.hbase.master.AssignmentManager$TimeoutMonitor.chore(AssignmentManager.java:1709) at org.apache.hadoop.hbase.Chore.run(Chore.java:66) 2011-06-29 02:26:40,558 ERROR org.apache.hadoop.hbase.master.AssignmentManager: Unexpected ZK exception timing out CLOSING region org.apache.zookeeper.KeeperException$ConnectionLossException: KeeperErrorCode = ConnectionLoss for /hbase/unassigned/3624b719752997d87cbba6529edcd2a7 at org.apache.zookeeper.KeeperException.create(KeeperException.java:90) at org.apache.zookeeper.KeeperException.create(KeeperException.java:42) at org.apache.zookeeper.ZooKeeper.getData(ZooKeeper.java:921) at org.apache.hadoop.hbase.zookeeper.ZKUtil.getDataNoWatch(ZKUtil.java:586) at org.apache.hadoop.hbase.zookeeper.ZKAssign.getDataNoWatch(ZKAssign.java:770) at org.apache.hadoop.hbase.master.AssignmentManager$TimeoutMonitor.chore(AssignmentManager.java:1709) at org.apache.hadoop.hbase.Chore.run(Chore.java:66) 2011-06-29 02:26:40,558 INFO org.apache.hadoop.hbase.master.AssignmentManager: Regions in transition timed out: BaseLog,C110,1309156850234.bfcc3150f5b6ebd169de3266ce49e764. state=OPENING, ts=1309285400162 2011-06-29 02:26:40,558 INFO org.apache.hadoop.hbase.master.AssignmentManager: Region has been OPENING for too long, reassigning region=BaseLog,C110,1309156850234.bfcc3150f5b6ebd169de3266ce49e764. 2011-06-29 02:26:41,063 DEBUG org.apache.hadoop.hbase.master.HMaster: Stopping service threads 2011-06-29 02:26:41,064 INFO org.apache.hadoop.ipc.HBaseServer: Stopping server on 60000 2011-06-29 02:26:41,064 INFO org.apache.hadoop.hbase.master.HMaster$1: dev-199-121:60000-BalancerChore exiting 2011-06-29 02:26:41,064 INFO org.apache.hadoop.ipc.HBaseServer: IPC Server handler 3 on 60000: exiting 2011-06-29 02:26:41,064 INFO org.apache.hadoop.hbase.master.CatalogJanitor: dev-199-121:60000-CatalogJanitor exiting 2011-06-29 02:26:41,066 INFO org.apache.hadoop.ipc.HBaseServer: IPC Server handler 7 on 60000: exiting 2011-06-29 02:26:41,066 INFO org.apache.hadoop.ipc.HBaseServer: IPC Server handler 6 on 60000: exiting 2011-06-29 02:26:41,066 INFO org.apache.hadoop.ipc.HBaseServer: Stopping IPC Server listener on 60000 2011-06-29 02:26:41,066 INFO org.apache.hadoop.ipc.HBaseServer: IPC Server handler 9 on 60000: exiting 2011-06-29 02:26:41,066 INFO org.apache.hadoop.ipc.HBaseServer: IPC Server handler 1 on 60000: exiting 2011-06-29 02:26:41,066 INFO org.apache.hadoop.ipc.HBaseServer: IPC Server handler 4 on 60000: exiting 2011-06-29 02:26:41,066 INFO org.apache.hadoop.ipc.HBaseServer: IPC Server handler 0 on 60000: exiting 2011-06-29 02:26:41,065 INFO org.apache.hadoop.hbase.master.LogCleaner: master-dev-199-121:60000.oldLogCleaner exiting 2011-06-29 02:26:41,065 INFO org.apache.hadoop.hbase.master.HMaster: Stopping infoServer 2011-06-29 02:26:41,065 INFO org.apache.hadoop.ipc.HBaseServer: Stopping IPC Server Responder 2011-06-29 02:26:41,065 INFO org.apache.hadoop.ipc.HBaseServer: IPC Server handler 5 on 60000: exiting 2011-06-29 02:26:41,064 INFO org.apache.hadoop.ipc.HBaseServer: IPC Server handler 2 on 60000: exiting 2011-06-29 02:26:41,066 INFO org.apache.hadoop.ipc.HBaseServer: IPC Server handler 8 on 60000: exiting 2011-06-29 02:26:41,073 INFO org.mortbay.log: Stopped [email protected]:60010 2011-06-29 02:26:41,390 INFO org.apache.zookeeper.ClientCnxn: Opening socket connection to server dev-197-149/10.249.197.149:61000 2011-06-29 02:26:41,391 INFO org.apache.zookeeper.ClientCnxn: Socket connection established to dev-197-149/10.249.197.149:61000, initiating session 2011-06-29 02:26:41,394 INFO org.apache.zookeeper.ClientCnxn: Session establishment complete on server dev-197-149/10.249.197.149:61000, sessionid = 0x130d6fba3cc0001, negotiated timeout = 300000 2011-06-29 02:27:21,996 DEBUG org.apache.hadoop.hbase.catalog.CatalogTracker: Stopping catalog tracker org.apache.hadoop.hbase.catalog.CatalogTracker@7bd33a6b 2011-06-29 02:27:21,997 WARN org.apache.hadoop.hbase.zookeeper.ZKUtil: master:60000-0x130d6fba3cc0001-0x130d6fba3cc0001-0x130d6fba3cc0001-0x130d6fba3cc0001-0x130d6fba3cc0001-0x130d6fba3cc0001 Unable to get data of znode /hbase/unassigned/bfcc3150f5b6ebd169de3266ce49e764 java.lang.InterruptedException at java.lang.Object.wait(Native Method) at java.lang.Object.wait(Object.java:485) at org.apache.zookeeper.ClientCnxn.submitRequest(ClientCnxn.java:1317) at org.apache.zookeeper.ZooKeeper.getData(ZooKeeper.java:919) at org.apache.hadoop.hbase.zookeeper.ZKUtil.getDataAndWatch(ZKUtil.java:549) at org.apache.hadoop.hbase.master.AssignmentManager$TimeoutMonitor.chore(AssignmentManager.java:1734) at org.apache.hadoop.hbase.Chore.run(Chore.java:66) 2011-06-29 02:27:21,997 INFO org.apache.hadoop.hbase.master.AssignmentManager: Successfully transitioned region=BaseLog,C110,1309156850234.bfcc3150f5b6ebd169de3266ce49e764. into OFFLINE and forcing a new assignment 2011-06-29 02:27:21,997 INFO org.apache.hadoop.hbase.master.AssignmentManager: Regions in transition timed out: BaseLog,C2,1309156850235.f3523f7dd3eeaae3bb3c66907227e8b0. state=OFFLINE, ts=1309285399926 2011-06-29 02:27:21,997 INFO org.apache.hadoop.hbase.client.HConnectionManager$HConnectionImplementation: Closed zookeeper sessionid=0x30d6fc37f40006 2011-06-29 02:27:21,997 INFO org.apache.hadoop.hbase.master.AssignmentManager: Region has been OFFLINE for too long, reassigning BaseLog,C2,1309156850235.f3523f7dd3eeaae3bb3c66907227e8b0. to a random server 2011-06-29 02:27:22,001 DEBUG org.apache.hadoop.hbase.master.AssignmentManager: Handling transition=RS_ZK_REGION_OPENING, server=dev-195-151,60020,1309276923241, region=6aec4056cb58be626bc1686cb77b5557, which is more than 15 seconds late 2011-06-29 02:27:22,001 INFO org.apache.zookeeper.ZooKeeper: Session: 0x30d6fc37f40006 closed 2011-06-29 02:27:22,002 INFO org.apache.zookeeper.ClientCnxn: EventThread shut down 2011-06-29 02:27:22,200 INFO org.apache.hadoop.hbase.zookeeper.RegionServerTracker: RegionServer ephemeral node deleted, processing expiration [dev-192-19,60020,1309277114541] 2011-06-29 02:27:22,200 DEBUG org.apache.hadoop.hbase.zookeeper.ZKAssign: master:60000-0x130d6fba3cc0001-0x130d6fba3cc0001-0x130d6fba3cc0001-0x130d6fba3cc0001-0x130d6fba3cc0001-0x130d6fba3cc0001 Creating (or updating) unassigned node for 4510ea062f3aee7fea9c8c034e6085af with OFFLINE state 2011-06-29 02:27:22,200 DEBUG org.apache.hadoop.hbase.master.AssignmentManager: Forcing OFFLINE; was=BaseLog,C110,1309156850234.bfcc3150f5b6ebd169de3266ce49e764. state=OPENING, ts=1309285400162 2011-06-29 02:27:22,201 DEBUG org.apache.hadoop.hbase.master.AssignmentManager: Server stopped; skipping assign of BaseLog,C110,1309156850234.bfcc3150f5b6ebd169de3266ce49e764. state=OFFLINE, ts=1309285642201 2011-06-29 02:27:22,201 DEBUG org.apache.hadoop.hbase.master.AssignmentManager: Forcing OFFLINE; was=BaseLog,C2,1309156850235.f3523f7dd3eeaae3bb3c66907227e8b0. state=OFFLINE, ts=1309285399926 2011-06-29 02:27:22,201 WARN org.apache.hadoop.hbase.zookeeper.ZKUtil: master:60000-0x130d6fba3cc0001-0x130d6fba3cc0001-0x130d6fba3cc0001-0x130d6fba3cc0001-0x130d6fba3cc0001-0x130d6fba3cc0001 Unable to set watcher on znode (/hbase/unassigned/4510ea062f3aee7fea9c8c034e6085af) org.apache.zookeeper.KeeperException$ConnectionLossException: KeeperErrorCode = ConnectionLoss for /hbase/unassigned/4510ea062f3aee7fea9c8c034e6085af at org.apache.zookeeper.KeeperException.create(KeeperException.java:90) at org.apache.zookeeper.KeeperException.create(KeeperException.java:42) at org.apache.zookeeper.ZooKeeper.exists(ZooKeeper.java:809) at org.apache.hadoop.hbase.zookeeper.ZKUtil.checkExists(ZKUtil.java:260) at org.apache.hadoop.hbase.zookeeper.ZKAssign.createOrForceNodeOffline(ZKAssign.java:244) at org.apache.hadoop.hbase.master.AssignmentManager.setOfflineInZooKeeper(AssignmentManager.java:967) at org.apache.hadoop.hbase.master.AssignmentManager.assign(AssignmentManager.java:918) at org.apache.hadoop.hbase.master.AssignmentManager.assign(AssignmentManager.java:746) at org.apache.hadoop.hbase.master.AssignmentManager.assign(AssignmentManager.java:726) at org.apache.hadoop.hbase.master.handler.ServerShutdownHandler.process(ServerShutdownHandler.java:154) at org.apache.hadoop.hbase.executor.EventHandler.run(EventHandler.java:156) at java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:886) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:908) at java.lang.Thread.run(Thread.java:662) 2011-06-29 02:27:22,201 ERROR org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher: master:60000-0x130d6fba3cc0001-0x130d6fba3cc0001-0x130d6fba3cc0001-0x130d6fba3cc0001-0x130d6fba3cc0001-0x130d6fba3cc0001 Received unexpected KeeperException, re-throwing exception org.apache.zookeeper.KeeperException$ConnectionLossException: KeeperErrorCode = ConnectionLoss for /hbase/unassigned/4510ea062f3aee7fea9c8c034e6085af at org.apache.zookeeper.KeeperException.create(KeeperException.java:90) at org.apache.zookeeper.KeeperException.create(KeeperException.java:42) at org.apache.zookeeper.ZooKeeper.exists(ZooKeeper.java:809) at org.apache.hadoop.hbase.zookeeper.ZKUtil.checkExists(ZKUtil.java:260) at org.apache.hadoop.hbase.zookeeper.ZKAssign.createOrForceNodeOffline(ZKAssign.java:244) at org.apache.hadoop.hbase.master.AssignmentManager.setOfflineInZooKeeper(AssignmentManager.java:967) at org.apache.hadoop.hbase.master.AssignmentManager.assign(AssignmentManager.java:918) at org.apache.hadoop.hbase.master.AssignmentManager.assign(AssignmentManager.java:746) at org.apache.hadoop.hbase.master.AssignmentManager.assign(AssignmentManager.java:726) at org.apache.hadoop.hbase.master.handler.ServerShutdownHandler.process(ServerShutdownHandler.java:154) at org.apache.hadoop.hbase.executor.EventHandler.run(EventHandler.java:156) at java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:886) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:908) -- Best wishes Gan, Xiyun
