Andrey Stepachev created HBASE-13083:
----------------------------------------

             Summary: Master can be dead-locked while assigning META.
                 Key: HBASE-13083
                 URL: https://issues.apache.org/jira/browse/HBASE-13083
             Project: HBase
          Issue Type: Bug
          Components: master, Region Assignment
    Affects Versions: 1.1.0
            Reporter: Andrey Stepachev
            Assignee: Andrey Stepachev


We hit a situation where the master is deadlocked.
It seems we have a deadlock in the master code. The ServerShutdownHandler (SSH) calls 
RegionStates#serverOffline, which in turn
acquires synchronized(this), effectively blocking all requests to RegionStates. 
In another thread, the master processes assignMeta, which tries to access region states 
and blocks.
As a result, all assignment operations try to access meta for table states and 
region operations, but
cannot do so because the RegionStates instance is locked.

serverOffline() is waiting for meta to become available while holding the RegionStates lock:
{code}
Thread 17019: (state = BLOCKED)
 - sun.misc.Unsafe.park(boolean, long) @bci=0 (Compiled frame; information may 
be imprecise)
 - java.util.concurrent.locks.LockSupport.park(java.lang.Object) @bci=14, 
line=186 (Interpreted frame)
 - 
java.util.concurrent.SynchronousQueue$TransferStack.awaitFulfill(java.util.concurrent.SynchronousQueue$TransferStack$SNode,
 boolean, long) @bci=158, line=458 (Compiled frame)
/serverOffline
 - java.lang.Thread.sleep(long) @bci=0 (Interpreted frame)
 - 
org.apache.hadoop.hbase.zookeeper.MetaTableLocator.blockUntilAvailable(org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher,
 int, long) @bci=74, line=605 (Interpreted frame)
 - 
org.apache.hadoop.hbase.zookeeper.MetaTableLocator.blockUntilAvailable(org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher,
 long) @bci=4, line=580 (Interpreted frame)
 - 
org.apache.hadoop.hbase.zookeeper.MetaTableLocator.blockUntilAvailable(org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher,
 long, org.apache.hadoop.conf.Configuration) @bci=65, line=559 (Interpreted 
frame)
 - org.apache.hadoop.hbase.client.ZooKeeperRegistry.getMetaRegionLocation() 
@bci=69, line=58 (Interpreted frame)
 - 
org.apache.hadoop.hbase.client.ConnectionManager$HConnectionImplementation.locateMeta(org.apache.hadoop.hbase.TableName,
 boolean, int) @bci=83, line=1131 (Compiled frame)
 - 
org.apache.hadoop.hbase.client.ConnectionManager$HConnectionImplementation.locateRegion(org.apache.hadoop.hbase.TableName,
 byte[], boolean, boolean, int) @bci=74, line=1098 (Compiled frame)
 - 
org.apache.hadoop.hbase.client.AsyncProcess$AsyncRequestFutureImpl.findAllLocationsOrFail(org.apache.hadoop.hbase.client.Action,
 boolean) @bci=73, line=940 (Compiled frame)
 - 
org.apache.hadoop.hbase.client.AsyncProcess$AsyncRequestFutureImpl.groupAndSendMultiAction(java.util.List,
 int) @bci=48, line=857 (Compiled frame)
 - 
org.apache.hadoop.hbase.client.AsyncProcess$AsyncRequestFutureImpl.access$100(org.apache.hadoop.hbase.client.AsyncProcess$AsyncRequestFutureImpl,
 java.util.List, int) @bci=3, line=575 (Compiled frame)
 - 
org.apache.hadoop.hbase.client.AsyncProcess.submitAll(java.util.concurrent.ExecutorService,
 org.apache.hadoop.hbase.TableName, java.util.List, 
org.apache.hadoop.hbase.client.coprocessor.Batch$Callback, java.lang.Object[]) 
@bci=195, line=557 (Compiled frame)
 - 
org.apache.hadoop.hbase.client.ConnectionManager$HConnectionImplementation.processBatchCallback(java.util.List,
 org.apache.hadoop.hbase.TableName, java.util.concurrent.ExecutorService, 
java.lang.Object[], org.apache.hadoop.hbase.client.coprocessor.Batch$Callback) 
@bci=11, line=2136 (Compiled frame)
 - 
org.apache.hadoop.hbase.util.MultiHConnection.processBatchCallback(java.util.List,
 org.apache.hadoop.hbase.TableName, java.lang.Object[], 
org.apache.hadoop.hbase.client.coprocessor.Batch$Callback) @bci=24, line=125 
(Compiled frame)
 - org.apache.hadoop.hbase.master.RegionStateStore.updateRegionState(long, 
org.apache.hadoop.hbase.master.RegionState, 
org.apache.hadoop.hbase.master.RegionState) @bci=421, line=244 (Compiled frame)
 - 
org.apache.hadoop.hbase.master.RegionStates.updateRegionState(org.apache.hadoop.hbase.HRegionInfo,
 org.apache.hadoop.hbase.master.RegionState$State, 
org.apache.hadoop.hbase.ServerName, long) @bci=149, line=1109 (Compiled frame)
 - 
org.apache.hadoop.hbase.master.RegionStates.updateRegionState(org.apache.hadoop.hbase.HRegionInfo,
 org.apache.hadoop.hbase.master.RegionState$State, 
org.apache.hadoop.hbase.ServerName) @bci=7, line=425 (Compiled frame)
 - 
org.apache.hadoop.hbase.master.RegionStates.updateRegionState(org.apache.hadoop.hbase.HRegionInfo,
 org.apache.hadoop.hbase.master.RegionState$State) @bci=24, line=383 (Compiled 
frame)
 - 
org.apache.hadoop.hbase.master.RegionStates.regionOffline(org.apache.hadoop.hbase.HRegionInfo,
 org.apache.hadoop.hbase.master.RegionState$State) @bci=83, line=586 
(Interpreted frame)
 - 
org.apache.hadoop.hbase.master.RegionStates.regionOffline(org.apache.hadoop.hbase.HRegionInfo)
 @bci=3, line=566 (Interpreted frame)
 - 
org.apache.hadoop.hbase.master.RegionStates.serverOffline(org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher,
 org.apache.hadoop.hbase.ServerName) @bci=494, line=667 (Interpreted frame)
 - 
org.apache.hadoop.hbase.master.AssignmentManager.processServerShutdown(org.apache.hadoop.hbase.ServerName)
 @bci=101, line=3334 (Interpreted frame)
 - org.apache.hadoop.hbase.master.handler.ServerShutdownHandler.process() 
@bci=626, line=237 (Interpreted frame)
 - org.apache.hadoop.hbase.executor.EventHandler.run() @bci=33, line=128 
(Interpreted frame)
 - 
java.util.concurrent.ThreadPoolExecutor.runWorker(java.util.concurrent.ThreadPoolExecutor$Worker)
 @bci=95, line=1145 (Interpreted frame)
 - java.util.concurrent.ThreadPoolExecutor$Worker.run() @bci=5, line
{code}
The thread assigning meta is blocked on RegionStates and looks like:
{code}
Thread 18357: (state = BLOCKED)
 - org.apache.hadoop.hbase.master.RegionStates.getRegionState(java.lang.String) 
@bci=0, line=1053 (Compiled frame)
 - 
org.apache.hadoop.hbase.master.RegionStates.getRegionState(org.apache.hadoop.hbase.HRegionInfo)
 @bci=5, line=1036 (Compiled frame)
 - 
org.apache.hadoop.hbase.master.AssignmentManager.forceRegionStateToOffline(org.apache.hadoop.hbase.HRegionInfo,
 boolean) @bci=5, line=1915 (Interpreted frame)
 - 
org.apache.hadoop.hbase.master.AssignmentManager.assign(org.apache.hadoop.hbase.HRegionInfo,
 boolean, boolean) @bci=29, line=1564 (Interpreted frame)
 - 
org.apache.hadoop.hbase.master.AssignmentManager.assign(org.apache.hadoop.hbase.HRegionInfo,
 boolean) @bci=4, line=1550 (Interpreted frame)
 - 
org.apache.hadoop.hbase.master.AssignmentManager.assignMeta(org.apache.hadoop.hbase.HRegionInfo)
 @bci=23, line=2636 (Interpreted frame)
 - 
org.apache.hadoop.hbase.master.handler.MetaServerShutdownHandler.verifyAndAssignMeta()
 @bci=64, line=159 (Interpreted frame)
 - 
org.apache.hadoop.hbase.master.handler.MetaServerShutdownHandler.verifyAndAssignMetaWithRetries()
 @bci=39, line=184 (Interpreted frame)
 - org.apache.hadoop.hbase.master.handler.MetaServerShutdownHandler.process() 
@bci=276, line=93 (Interpreted frame)
 - org.apache.hadoop.hbase.executor.EventHandler.run() @bci=33, line=128 
(Interpreted frame)
 - 
java.util.concurrent.ThreadPoolExecutor.runWorker(java.util.concurrent.ThreadPoolExecutor$Worker)
 @bci=95, line=1145 (Compiled frame)
 - java.util.concurrent.ThreadPoolExecutor$Worker.run() @bci=5, line=615 
(Interpreted frame)
 - java.lang.Thread.run() @bci=11, line=745 (Interpreted frame)
{code}



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

Reply via email to