HBASE-12480 Regions in FAILED_OPEN/FAILED_CLOSE should be processed on master failover
Project: http://git-wip-us.apache.org/repos/asf/hbase/repo
Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/3b4b1de3
Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/3b4b1de3
Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/3b4b1de3

Branch: refs/heads/0.98
Commit: 3b4b1de3ca387a0b720bf4c61d8f5a9ba08da78f
Parents: 780f6f5
Author: Virag Kothari <[email protected]>
Authored: Tue Jan 13 11:18:00 2015 -0800
Committer: Virag Kothari <[email protected]>
Committed: Tue Jan 13 11:18:00 2015 -0800

----------------------------------------------------------------------
 .../hadoop/hbase/master/AssignmentManager.java | 22 +++++++---
 .../hadoop/hbase/master/TestMasterFailover.java | 42 ++++++++++++++++++--
 2 files changed, 55 insertions(+), 9 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/hbase/blob/3b4b1de3/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
index f0fe635..6d10327 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
@@ -569,8 +569,9 @@ public class AssignmentManager extends ZooKeeperListener {
       if (!regionsInTransition.isEmpty()) {
         Set<ServerName> onlineServers = serverManager.getOnlineServers().keySet();
         for (RegionState regionState : regionsInTransition.values()) {
+          ServerName serverName = regionState.getServerName();
           if (!regionState.getRegion().isMetaRegion()
-              && onlineServers.contains(regionState.getServerName())) {
+              && serverName != null && onlineServers.contains(serverName)) {
             LOG.debug("Found " + regionState + " in RITs");
             failover = true;
             break;
@@ -2995,14 +2996,21 @@ public class AssignmentManager extends ZooKeeperListener {
     // to the region if the master dies right after the RPC call is out.
     Map<String, RegionState> rits = regionStates.getRegionsInTransition();
     for (RegionState regionState : rits.values()) {
-      if (!serverManager.isServerOnline(regionState.getServerName())) {
-        continue; // SSH will handle it
+      LOG.info("Processing " + regionState);
+      ServerName serverName = regionState.getServerName();
+      // Server could be null in case of FAILED_OPEN when master cannot find a region plan. In that
+      // case, try assigning it here.
+      if (serverName != null
+          && !serverManager.getOnlineServers().containsKey(serverName)) {
+        LOG.info("Server " + serverName + " isn't online. SSH will handle this");
+        continue;
       }
+      HRegionInfo regionInfo = regionState.getRegion();
       State state = regionState.getState();
-      LOG.info("Processing " + regionState);
+
       switch (state) {
       case CLOSED:
-        invokeAssign(regionState.getRegion());
+        invokeAssign(regionInfo);
         break;
       case PENDING_OPEN:
         retrySendRegionOpen(regionState);
@@ -3010,6 +3018,10 @@ public class AssignmentManager extends ZooKeeperListener {
       case PENDING_CLOSE:
         retrySendRegionClose(regionState);
         break;
+      case FAILED_CLOSE:
+      case FAILED_OPEN:
+        invokeUnassign(regionInfo);
+        break;
       default:
         // No process for other states
       }

http://git-wip-us.apache.org/repos/asf/hbase/blob/3b4b1de3/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailover.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailover.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailover.java
index 0a0e3d9..83ad29d 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailover.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailover.java
@@ -1053,8 +1053,8 @@ public class TestMasterFailover {
     RegionState newState = regionStates.getRegionState(hri);
     assertTrue(newState.isOpened());
   }
-
-  /**
+
+  /**
    * Simple test of master failover.
    * <p>
    * Starts with three masters. Kills a backup master. Then kills the active
@@ -1165,7 +1165,7 @@ public class TestMasterFailover {
   }
 
   /**
-   * Test region in pending_open/close when master failover
+   * Test region in pending_open/close and failed_open/close when master failover
    */
   @Test (timeout=180000)
   public void testPendingOpenOrCloseWhenMasterFailover() throws Exception {
@@ -1230,6 +1230,37 @@ public class TestMasterFailover {
     oldState = new RegionState(hriOffline, State.OFFLINE);
     newState = new RegionState(hriOffline, State.PENDING_OPEN, newState.getServerName());
     stateStore.updateRegionState(HConstants.NO_SEQNUM, newState, oldState);
+
+    HRegionInfo failedClose = new HRegionInfo(offlineTable.getTableName(), null, null);
+    createRegion(failedClose, rootdir, conf, offlineTable);
+    MetaEditor.addRegionToMeta(master.getCatalogTracker(), failedClose);
+
+    oldState = new RegionState(failedClose, State.PENDING_CLOSE);
+    newState = new RegionState(failedClose, State.FAILED_CLOSE, newState.getServerName());
+    stateStore.updateRegionState(HConstants.NO_SEQNUM, newState, oldState);
+
+
+    HRegionInfo failedOpen = new HRegionInfo(offlineTable.getTableName(), null, null);
+    createRegion(failedOpen, rootdir, conf, offlineTable);
+    MetaEditor.addRegionToMeta(master.getCatalogTracker(), failedOpen);
+
+    // Simulate a region transitioning to failed open when the region server reports the
+    // transition as FAILED_OPEN
+    oldState = new RegionState(failedOpen, State.PENDING_OPEN);
+    newState = new RegionState(failedOpen, State.FAILED_OPEN, newState.getServerName());
+    stateStore.updateRegionState(HConstants.NO_SEQNUM, newState, oldState);
+
+    HRegionInfo failedOpenNullServer = new HRegionInfo(offlineTable.getTableName(), null, null);
+    createRegion(failedOpenNullServer, rootdir, conf, offlineTable);
+    MetaEditor.addRegionToMeta(master.getCatalogTracker(), failedOpenNullServer);
+
+    // Simulate a region transitioning to failed open when the master couldn't find a plan for
+    // the region
+    oldState = new RegionState(failedOpenNullServer, State.OFFLINE);
+    newState = new RegionState(failedOpenNullServer, State.FAILED_OPEN, null);
+    stateStore.updateRegionState(HConstants.NO_SEQNUM, newState, oldState);
+
+
 
     // Stop the master
     log("Aborting master");
@@ -1253,7 +1284,10 @@ public class TestMasterFailover {
     // Both pending_open (RPC sent/not yet) regions should be online
     assertTrue(regionStates.isRegionOnline(hriOffline));
     assertTrue(regionStates.isRegionOnline(hriOnline));
-
+    assertTrue(regionStates.isRegionOnline(failedClose));
+    assertTrue(regionStates.isRegionOnline(failedOpenNullServer));
+    assertTrue(regionStates.isRegionOnline(failedOpen));
+
     log("Done with verification, shutting down cluster");
 
     // Done, shutdown the cluster
