Repository: hbase Updated Branches: refs/heads/master 4cb76aa92 -> 3a82cf238
HBASE-11760 Tighten up region state transition Project: http://git-wip-us.apache.org/repos/asf/hbase/repo Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/3a82cf23 Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/3a82cf23 Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/3a82cf23 Branch: refs/heads/master Commit: 3a82cf238ba287a6dc555e31c01c2dd43e880a1f Parents: 4cb76aa Author: Jimmy Xiang <[email protected]> Authored: Thu Aug 14 16:39:12 2014 -0700 Committer: Jimmy Xiang <[email protected]> Committed: Thu Sep 11 13:21:12 2014 -0700 ---------------------------------------------------------------------- .../apache/hadoop/hbase/master/RegionState.java | 56 +- .../RegionAlreadyInTransitionException.java | 39 - .../hadoop/hbase/master/AssignCallable.java | 6 +- .../hadoop/hbase/master/AssignmentManager.java | 958 +++++++++++-------- .../hadoop/hbase/master/RegionStates.java | 9 +- .../hadoop/hbase/master/ServerManager.java | 6 +- .../master/handler/ServerShutdownHandler.java | 2 +- .../hbase/regionserver/HRegionServer.java | 14 +- .../hbase/regionserver/RSRpcServices.java | 55 +- .../handler/CloseRegionHandler.java | 2 +- .../master/TestAssignmentManagerOnCluster.java | 26 +- .../regionserver/TestRegionServerNoMaster.java | 27 +- 12 files changed, 630 insertions(+), 570 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hbase/blob/3a82cf23/hbase-client/src/main/java/org/apache/hadoop/hbase/master/RegionState.java ---------------------------------------------------------------------- diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/master/RegionState.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/master/RegionState.java index 0a9c123..e7af336 100644 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/master/RegionState.java +++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/master/RegionState.java @@ -36,10 +36,10 @@ public class RegionState { @InterfaceStability.Evolving public enum State { OFFLINE, // region is in an offline state - PENDING_OPEN, // sent rpc to server to open but has not begun + PENDING_OPEN, // same as OPENING, to be removed OPENING, // server has begun to open but not yet done OPEN, // server opened region and updated meta - PENDING_CLOSE, // sent rpc to server to close but has not begun + PENDING_CLOSE, // same as CLOSING, to be removed CLOSING, // server has begun to close but not yet done CLOSED, // server closed region and updated meta SPLITTING, // server started split of a region @@ -210,30 +210,28 @@ public class RegionState { return serverName; } + /** + * PENDING_CLOSE (to be removed) is the same as CLOSING + */ public boolean isClosing() { - return state == State.CLOSING; + return state == State.PENDING_CLOSE || state == State.CLOSING; } public boolean isClosed() { return state == State.CLOSED; } - public boolean isPendingClose() { - return state == State.PENDING_CLOSE; - } - + /** + * PENDING_OPEN (to be removed) is the same as OPENING + */ public boolean isOpening() { - return state == State.OPENING; + return state == State.PENDING_OPEN || state == State.OPENING; } public boolean isOpened() { return state == State.OPEN; } - public boolean isPendingOpen() { - return state == State.PENDING_OPEN; - } - public boolean isOffline() { return state == State.OFFLINE; } @@ -270,40 +268,6 @@ public class RegionState { return state == State.MERGING_NEW; } - public boolean isOpenOrMergingOnServer(final ServerName sn) { - return isOnServer(sn) && (isOpened() || isMerging()); - } - - public boolean isOpenOrMergingNewOnServer(final ServerName sn) { - return isOnServer(sn) && (isOpened() || isMergingNew()); - } - - public boolean isOpenOrSplittingOnServer(final ServerName sn) { - return isOnServer(sn) && (isOpened() || isSplitting()); - } - - public boolean isOpenOrSplittingNewOnServer(final ServerName sn) { - return isOnServer(sn) && (isOpened() || isSplittingNew()); - } - - public boolean isPendingOpenOrOpeningOnServer(final ServerName sn) { - return isOnServer(sn) && isPendingOpenOrOpening(); - } - - // Failed open is also kind of pending open - public boolean isPendingOpenOrOpening() { - return isPendingOpen() || isOpening() || isFailedOpen(); - } - - public boolean isPendingCloseOrClosingOnServer(final ServerName sn) { - return isOnServer(sn) && isPendingCloseOrClosing(); - } - - // Failed close is also kind of pending close - public boolean isPendingCloseOrClosing() { - return isPendingClose() || isClosing() || isFailedClose(); - } - public boolean isOnServer(final ServerName sn) { return serverName != null && serverName.equals(sn); } http://git-wip-us.apache.org/repos/asf/hbase/blob/3a82cf23/hbase-client/src/main/java/org/apache/hadoop/hbase/regionserver/RegionAlreadyInTransitionException.java ---------------------------------------------------------------------- diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/regionserver/RegionAlreadyInTransitionException.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/regionserver/RegionAlreadyInTransitionException.java deleted file mode 100644 index 6cad068..0000000 --- a/hbase-client/src/main/java/org/apache/hadoop/hbase/regionserver/RegionAlreadyInTransitionException.java +++ /dev/null @@ -1,39 +0,0 @@ -/** - * - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.hadoop.hbase.regionserver; - -import org.apache.hadoop.classification.InterfaceAudience; -import org.apache.hadoop.classification.InterfaceStability; - -import java.io.IOException; - -/** - * This exception is thrown when a region server is asked to open or close - * a region but it's already processing it - */ -@SuppressWarnings("serial") [email protected] [email protected] -public class RegionAlreadyInTransitionException extends IOException { - - public RegionAlreadyInTransitionException(String s) { - super(s); - } - -} http://git-wip-us.apache.org/repos/asf/hbase/blob/3a82cf23/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignCallable.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignCallable.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignCallable.java index 88e5c2a..e2e729d 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignCallable.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignCallable.java @@ -34,18 +34,16 @@ public class AssignCallable implements Callable<Object> { private AssignmentManager assignmentManager; private HRegionInfo hri; - private boolean newPlan; public AssignCallable( - AssignmentManager assignmentManager, HRegionInfo hri, boolean newPlan) { + AssignmentManager assignmentManager, HRegionInfo hri) { this.assignmentManager = assignmentManager; - this.newPlan = newPlan; this.hri = hri; } @Override public Object call() throws Exception { - assignmentManager.assign(hri, newPlan); + assignmentManager.assign(hri); return null; } } http://git-wip-us.apache.org/repos/asf/hbase/blob/3a82cf23/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java index 5ecbe98..5b9ff36 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java @@ -77,7 +77,7 @@ import org.apache.hadoop.hbase.master.handler.EnableTableHandler; import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.RegionStateTransition; import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode; import org.apache.hadoop.hbase.protobuf.generated.ZooKeeperProtos; -import org.apache.hadoop.hbase.regionserver.RegionAlreadyInTransitionException; +import org.apache.hadoop.hbase.protobuf.generated.ZooKeeperProtos.Table; import org.apache.hadoop.hbase.regionserver.RegionOpeningState; import org.apache.hadoop.hbase.regionserver.RegionServerStoppedException; import org.apache.hadoop.hbase.regionserver.wal.HLog; @@ -101,10 +101,6 @@ import com.google.common.annotations.VisibleForTesting; public class AssignmentManager { private static final Log LOG = LogFactory.getLog(AssignmentManager.class); - static final String ALREADY_IN_TRANSITION_WAITTIME - = "hbase.assignment.already.intransition.waittime"; - static final int DEFAULT_ALREADY_IN_TRANSITION_WAITTIME = 60000; // 1 minute - protected final Server server; private ServerManager serverManager; @@ -136,12 +132,6 @@ public class AssignmentManager { private final int maximumAttempts; /** - * Map of two merging regions from the region to be created. - */ - private final Map<String, PairOfSameType<HRegionInfo>> mergingRegions - = new HashMap<String, PairOfSameType<HRegionInfo>>(); - - /** * The sleep time for which the assignment will wait before retrying in case of hbase:meta assignment * failure due to lack of availability of region plan */ @@ -751,22 +741,21 @@ public class AssignmentManager { try { // Send OPEN RPC. If it fails on a IOE or RemoteException, // regions will be assigned individually. + Configuration conf = server.getConfiguration(); long maxWaitTime = System.currentTimeMillis() + - this.server.getConfiguration(). - getLong("hbase.regionserver.rpc.startup.waittime", 60000); + conf.getLong("hbase.regionserver.rpc.startup.waittime", 60000); for (int i = 1; i <= maximumAttempts && !server.isStopped(); i++) { try { List<RegionOpeningState> regionOpeningStateList = serverManager .sendRegionOpen(destination, regionOpenInfos); - if (regionOpeningStateList == null) { - // Failed getting RPC connection to this server - return false; - } for (int k = 0, n = regionOpeningStateList.size(); k < n; k++) { RegionOpeningState openingState = regionOpeningStateList.get(k); if (openingState != RegionOpeningState.OPENED) { HRegionInfo region = regionOpenInfos.get(k).getFirst(); + LOG.info("Got opening state " + openingState + + ", will reassign later: " + region); // Failed opening this region, reassign it later + forceRegionStateToOffline(region, true); failedToOpenRegions.add(region); } } @@ -782,8 +771,10 @@ public class AssignmentManager { } else if (e instanceof ServerNotRunningYetException) { long now = System.currentTimeMillis(); if (now < maxWaitTime) { - LOG.debug("Server is not yet up; waiting up to " + - (maxWaitTime - now) + "ms", e); + if (LOG.isDebugEnabled()) { + LOG.debug("Server is not yet up; waiting up to " + + (maxWaitTime - now) + "ms", e); + } Thread.sleep(100); i--; // reset the try count continue; @@ -803,6 +794,17 @@ public class AssignmentManager { Thread.sleep(100); i--; continue; + } else if (e instanceof FailedServerException && i < maximumAttempts) { + // In case the server is in the failed server list, no point to + // retry too soon. Retry after the failed_server_expiry time + long sleepTime = 1 + conf.getInt(RpcClient.FAILED_SERVER_EXPIRY_KEY, + RpcClient.FAILED_SERVER_EXPIRY_DEFAULT); + if (LOG.isDebugEnabled()) { + LOG.debug(destination + " is on failed server list; waiting " + + sleepTime + "ms", e); + } + Thread.sleep(sleepTime); + continue; } throw e; } @@ -811,6 +813,10 @@ public class AssignmentManager { // Can be a socket timeout, EOF, NoRouteToHost, etc LOG.info("Unable to communicate with " + destination + " in order to assign regions, ", e); + for (RegionState state: states) { + HRegionInfo region = state.getRegion(); + forceRegionStateToOffline(region, true); + } return false; } } finally { @@ -844,9 +850,7 @@ public class AssignmentManager { * on an unexpected server scenario, for an example) */ private void unassign(final HRegionInfo region, - final RegionState state, final ServerName dest) { - ServerName server = state.getServerName(); - long maxWaitTime = -1; + final ServerName server, final ServerName dest) { for (int i = 1; i <= this.maximumAttempts; i++) { if (this.server.isStopped() || this.server.isAborted()) { LOG.debug("Server stopped/aborted; skipping unassign of " + region); @@ -873,7 +877,6 @@ public class AssignmentManager { if (t instanceof RemoteException) { t = ((RemoteException)t).unwrapRemoteException(); } - boolean logRetries = true; if (t instanceof NotServingRegionException || t instanceof RegionServerStoppedException || t instanceof ServerNotRunningYetException) { @@ -881,56 +884,34 @@ public class AssignmentManager { + ", it's not any more on " + server, t); regionStates.updateRegionState(region, State.OFFLINE); return; - } else if (t instanceof FailedServerException - || t instanceof RegionAlreadyInTransitionException) { - long sleepTime = 0; - Configuration conf = this.server.getConfiguration(); - if(t instanceof FailedServerException) { - sleepTime = 1 + conf.getInt(RpcClient.FAILED_SERVER_EXPIRY_KEY, - RpcClient.FAILED_SERVER_EXPIRY_DEFAULT); - } else { - if (maxWaitTime < 0) { - maxWaitTime = - EnvironmentEdgeManager.currentTime() - + conf.getLong(ALREADY_IN_TRANSITION_WAITTIME, - DEFAULT_ALREADY_IN_TRANSITION_WAITTIME); - } - long now = EnvironmentEdgeManager.currentTime(); - if (now < maxWaitTime) { - LOG.debug("Region is already in transition; " - + "waiting up to " + (maxWaitTime - now) + "ms", t); - sleepTime = 100; - i--; // reset the try count - logRetries = false; - } - } + } else if (t instanceof FailedServerException && i < maximumAttempts) { + // In case the server is in the failed server list, no point to + // retry too soon. Retry after the failed_server_expiry time try { - if (sleepTime > 0) { - Thread.sleep(sleepTime); + Configuration conf = this.server.getConfiguration(); + long sleepTime = 1 + conf.getInt(RpcClient.FAILED_SERVER_EXPIRY_KEY, + RpcClient.FAILED_SERVER_EXPIRY_DEFAULT); + if (LOG.isDebugEnabled()) { + LOG.debug(server + " is on failed server list; waiting " + + sleepTime + "ms", t); } + Thread.sleep(sleepTime); } catch (InterruptedException ie) { LOG.warn("Failed to unassign " + region.getRegionNameAsString() + " since interrupted", ie); + regionStates.updateRegionState(region, State.FAILED_CLOSE); Thread.currentThread().interrupt(); - if (state != null) { - regionStates.updateRegionState(region, State.FAILED_CLOSE); - } return; } } - if (logRetries) { - LOG.info("Server " + server + " returned " + t + " for " - + region.getRegionNameAsString() + ", try=" + i - + " of " + this.maximumAttempts, t); - // Presume retry or server will expire. - } + LOG.info("Server " + server + " returned " + t + " for " + + region.getRegionNameAsString() + ", try=" + i + + " of " + this.maximumAttempts, t); } } // Run out of attempts - if (state != null) { - regionStates.updateRegionState(region, State.FAILED_CLOSE); - } + regionStates.updateRegionState(region, State.FAILED_CLOSE); } /** @@ -961,13 +942,13 @@ public class AssignmentManager { } case FAILED_CLOSE: case FAILED_OPEN: - unassign(region, state, null); + regionStates.updateRegionState(region, State.PENDING_CLOSE); + unassign(region, state.getServerName(), null); state = regionStates.getRegionState(region); - if (state.isFailedClose()) { - // If we can't close the region, we can't re-assign - // it so as to avoid possible double assignment/data loss. - LOG.info("Skip assigning " + - region + ", we couldn't close it: " + state); + if (!state.isOffline() && !state.isClosed()) { + // If the region isn't offline, we can't re-assign + // it now. It will be assigned automatically after + // the regionserver reports it's closed. return null; } case OFFLINE: @@ -993,7 +974,6 @@ public class AssignmentManager { RegionPlan plan = null; long maxWaitTime = -1; HRegionInfo region = state.getRegion(); - RegionOpeningState regionOpenState; Throwable previousException = null; for (int i = 1; i <= maximumAttempts; i++) { if (server.isStopped() || server.isAborted()) { @@ -1023,25 +1003,25 @@ public class AssignmentManager { regionStates.updateRegionState(region, State.FAILED_OPEN); return; } - // In case of assignment from EnableTableHandler table state is ENABLING. Any how - // EnableTableHandler will set ENABLED after assigning all the table regions. If we - // try to set to ENABLED directly then client API may think table is enabled. - // When we have a case such as all the regions are added directly into hbase:meta and we call - // assignRegion then we need to make the table ENABLED. Hence in such case the table - // will not be in ENABLING or ENABLED state. - TableName tableName = region.getTable(); - if (!tableStateManager.isTableState(tableName, - ZooKeeperProtos.Table.State.ENABLED, ZooKeeperProtos.Table.State.ENABLING)) { - LOG.debug("Setting table " + tableName + " to ENABLED state."); - setEnabledTable(tableName); - } + // In case of assignment from EnableTableHandler table state is ENABLING. Any how + // EnableTableHandler will set ENABLED after assigning all the table regions. If we + // try to set to ENABLED directly then client API may think table is enabled. + // When we have a case such as all the regions are added directly into hbase:meta and we call + // assignRegion then we need to make the table ENABLED. Hence in such case the table + // will not be in ENABLING or ENABLED state. + TableName tableName = region.getTable(); + if (!tableStateManager.isTableState(tableName, + ZooKeeperProtos.Table.State.ENABLED, ZooKeeperProtos.Table.State.ENABLING)) { + LOG.debug("Setting table " + tableName + " to ENABLED state."); + setEnabledTable(tableName); + } LOG.info("Assigning " + region.getRegionNameAsString() + " to " + plan.getDestination().toString()); // Transition RegionState to PENDING_OPEN regionStates.updateRegionState(region, State.PENDING_OPEN, plan.getDestination()); - boolean needNewPlan; + boolean needNewPlan = false; final String assignMsg = "Failed assignment of " + region.getRegionNameAsString() + " to " + plan.getDestination(); try { @@ -1049,20 +1029,8 @@ public class AssignmentManager { if (this.shouldAssignRegionsWithFavoredNodes) { favoredNodes = ((FavoredNodeLoadBalancer)this.balancer).getFavoredNodes(region); } - regionOpenState = serverManager.sendRegionOpen( - plan.getDestination(), region, favoredNodes); - - if (regionOpenState == RegionOpeningState.FAILED_OPENING) { - // Failed opening this region, looping again on a new server. - needNewPlan = true; - LOG.warn(assignMsg + ", regionserver says 'FAILED_OPENING', " + - " trying to assign elsewhere instead; " + - "try=" + i + " of " + this.maximumAttempts); - } else { - // we're done - return; - } - + serverManager.sendRegionOpen(plan.getDestination(), region, favoredNodes); + return; // we're done } catch (Throwable t) { if (t instanceof RemoteException) { t = ((RemoteException) t).unwrapRemoteException(); @@ -1070,44 +1038,34 @@ public class AssignmentManager { previousException = t; // Should we wait a little before retrying? If the server is starting it's yes. - // If the region is already in transition, it's yes as well: we want to be sure that - // the region will get opened but we don't want a double assignment. - boolean hold = (t instanceof RegionAlreadyInTransitionException || - t instanceof ServerNotRunningYetException); + boolean hold = (t instanceof ServerNotRunningYetException); // In case socket is timed out and the region server is still online, // the openRegion RPC could have been accepted by the server and // just the response didn't go through. So we will retry to - // open the region on the same server to avoid possible - // double assignment. + // open the region on the same server. boolean retry = !hold && (t instanceof java.net.SocketTimeoutException && this.serverManager.isServerOnline(plan.getDestination())); - if (hold) { LOG.warn(assignMsg + ", waiting a little before trying on the same region server " + "try=" + i + " of " + this.maximumAttempts, t); if (maxWaitTime < 0) { - if (t instanceof RegionAlreadyInTransitionException) { - maxWaitTime = EnvironmentEdgeManager.currentTime() - + this.server.getConfiguration().getLong(ALREADY_IN_TRANSITION_WAITTIME, - DEFAULT_ALREADY_IN_TRANSITION_WAITTIME); - } else { - maxWaitTime = EnvironmentEdgeManager.currentTime() - + this.server.getConfiguration().getLong( - "hbase.regionserver.rpc.startup.waittime", 60000); - } + maxWaitTime = EnvironmentEdgeManager.currentTime() + + this.server.getConfiguration().getLong( + "hbase.regionserver.rpc.startup.waittime", 60000); } try { - needNewPlan = false; long now = EnvironmentEdgeManager.currentTime(); if (now < maxWaitTime) { - LOG.debug("Server is not yet up or region is already in transition; " - + "waiting up to " + (maxWaitTime - now) + "ms", t); + if (LOG.isDebugEnabled()) { + LOG.debug("Server is not yet up; waiting up to " + + (maxWaitTime - now) + "ms", t); + } Thread.sleep(100); i--; // reset the try count - } else if (!(t instanceof RegionAlreadyInTransitionException)) { + } else { LOG.debug("Server is not up for a while; try a new one", t); needNewPlan = true; } @@ -1119,9 +1077,10 @@ public class AssignmentManager { return; } } else if (retry) { - needNewPlan = false; i--; // we want to retry as many times as needed as long as the RS is not dead. - LOG.warn(assignMsg + ", trying to assign to the same region server due ", t); + if (LOG.isDebugEnabled()) { + LOG.debug(assignMsg + ", trying to assign to the same region server due ", t); + } } else { needNewPlan = true; LOG.warn(assignMsg + ", trying to assign elsewhere instead;" + @@ -1200,29 +1159,17 @@ public class AssignmentManager { /** * @param region the region to assign - * @return Plan for passed <code>region</code> (If none currently, it creates one or - * if no servers to assign, it returns null). - */ - private RegionPlan getRegionPlan(final HRegionInfo region, - final boolean forceNewPlan) throws HBaseIOException { - return getRegionPlan(region, null, forceNewPlan); - } - - /** - * @param region the region to assign - * @param serverToExclude Server to exclude (we know its bad). Pass null if - * all servers are thought to be assignable. * @param forceNewPlan If true, then if an existing plan exists, a new plan * will be generated. * @return Plan for passed <code>region</code> (If none currently, it creates one or * if no servers to assign, it returns null). */ private RegionPlan getRegionPlan(final HRegionInfo region, - final ServerName serverToExclude, final boolean forceNewPlan) throws HBaseIOException { + final boolean forceNewPlan) throws HBaseIOException { // Pickup existing plan or make a new one final String encodedName = region.getEncodedName(); final List<ServerName> destServers = - serverManager.createDestinationServersList(serverToExclude); + serverManager.createDestinationServersList(); if (destServers.isEmpty()){ LOG.warn("Can't move " + encodedName + @@ -1268,15 +1215,19 @@ public class AssignmentManager { LOG.warn("Can't find a destination for " + encodedName); return null; } - LOG.debug("No previous transition plan found (or ignoring " + - "an existing plan) for " + region.getRegionNameAsString() + - "; generated random plan=" + randomPlan + "; " + destServers.size() + - " (online=" + serverManager.getOnlineServers().size() + - ") available servers, forceNewPlan=" + forceNewPlan); - return randomPlan; + if (LOG.isDebugEnabled()) { + LOG.debug("No previous transition plan found (or ignoring " + + "an existing plan) for " + region.getRegionNameAsString() + + "; generated random plan=" + randomPlan + "; " + destServers.size() + + " (online=" + serverManager.getOnlineServers().size() + + ") available servers, forceNewPlan=" + forceNewPlan); } - LOG.debug("Using pre-existing plan for " + - region.getRegionNameAsString() + "; plan=" + existingPlan); + return randomPlan; + } + if (LOG.isDebugEnabled()) { + LOG.debug("Using pre-existing plan for " + + region.getRegionNameAsString() + "; plan=" + existingPlan); + } return existingPlan; } @@ -1356,7 +1307,7 @@ public class AssignmentManager { return; } - unassign(region, state, dest); + unassign(region, state.getServerName(), dest); } finally { lock.unlock(); @@ -1481,7 +1432,7 @@ public class AssignmentManager { " region(s) to " + servers + " server(s)"); } for (Map.Entry<ServerName, List<HRegionInfo>> plan: bulkPlan.entrySet()) { - if (!assign(plan.getKey(), plan.getValue())) { + if (!assign(plan.getKey(), plan.getValue()) && !server.isStopped()) { for (HRegionInfo region: plan.getValue()) { if (!regionStates.isRegionOnline(region)) { invokeAssign(region); @@ -1727,6 +1678,9 @@ public class AssignmentManager { State state = regionState.getState(); LOG.info("Processing " + regionState); switch (state) { + case CLOSED: + invokeAssign(regionState.getRegion()); + break; case PENDING_OPEN: retrySendRegionOpen(regionState); break; @@ -1755,41 +1709,55 @@ public class AssignmentManager { if (!regionState.equals(regionStates.getRegionState(hri))) { return; // Region is not in the expected state any more } - while (serverManager.isServerOnline(serverName) - && !server.isStopped() && !server.isAborted()) { + for (int i = 1; i <= maximumAttempts; i++) { + if (!serverManager.isServerOnline(serverName) + || server.isStopped() || server.isAborted()) { + return; // No need any more + } try { List<ServerName> favoredNodes = ServerName.EMPTY_SERVER_LIST; if (shouldAssignRegionsWithFavoredNodes) { favoredNodes = ((FavoredNodeLoadBalancer)balancer).getFavoredNodes(hri); } - RegionOpeningState regionOpenState = serverManager.sendRegionOpen( - serverName, hri, favoredNodes); - - if (regionOpenState == RegionOpeningState.FAILED_OPENING) { - // Failed opening this region, this means the target server didn't get - // the original region open RPC, so re-assign it with a new plan - LOG.debug("Got failed_opening in retry sendRegionOpen for " - + regionState + ", re-assign it"); - invokeAssign(hri, true); - } - return; // Done. + serverManager.sendRegionOpen(serverName, hri, favoredNodes); + return; // we're done } catch (Throwable t) { if (t instanceof RemoteException) { t = ((RemoteException) t).unwrapRemoteException(); } - // In case SocketTimeoutException/FailedServerException, retry - if (t instanceof java.net.SocketTimeoutException - || t instanceof FailedServerException) { - Threads.sleep(100); - continue; + if (t instanceof FailedServerException && i < maximumAttempts) { + // In case the server is in the failed server list, no point to + // retry too soon. Retry after the failed_server_expiry time + try { + Configuration conf = this.server.getConfiguration(); + long sleepTime = 1 + conf.getInt(RpcClient.FAILED_SERVER_EXPIRY_KEY, + RpcClient.FAILED_SERVER_EXPIRY_DEFAULT); + if (LOG.isDebugEnabled()) { + LOG.debug(serverName + " is on failed server list; waiting " + + sleepTime + "ms", t); + } + Thread.sleep(sleepTime); + continue; + } catch (InterruptedException ie) { + LOG.warn("Failed to assign " + + hri.getRegionNameAsString() + " since interrupted", ie); + regionStates.updateRegionState(hri, State.FAILED_OPEN); + Thread.currentThread().interrupt(); + return; + } } - // For other exceptions, re-assign it - LOG.debug("Got exception in retry sendRegionOpen for " - + regionState + ", re-assign it", t); - invokeAssign(hri); - return; // Done. + if (serverManager.isServerOnline(serverName) + && t instanceof java.net.SocketTimeoutException) { + i--; // reset the try count + } else { + LOG.info("Got exception in retrying sendRegionOpen for " + + regionState + "; try=" + i + " of " + maximumAttempts, t); + } + Threads.sleep(100); } } + // Run out of attempts + regionStates.updateRegionState(hri, State.FAILED_OPEN); } finally { lock.unlock(); } @@ -1813,38 +1781,51 @@ public class AssignmentManager { if (!regionState.equals(regionStates.getRegionState(hri))) { return; // Region is not in the expected state any more } - while (serverManager.isServerOnline(serverName) - && !server.isStopped() && !server.isAborted()) { + for (int i = 1; i <= maximumAttempts; i++) { + if (!serverManager.isServerOnline(serverName) + || server.isStopped() || server.isAborted()) { + return; // No need any more + } try { - if (!serverManager.sendRegionClose(serverName, hri, null)) { - // This means the region is still on the target server - LOG.debug("Got false in retry sendRegionClose for " - + regionState + ", re-close it"); - invokeUnAssign(hri); - } + serverManager.sendRegionClose(serverName, hri, null); return; // Done. } catch (Throwable t) { if (t instanceof RemoteException) { t = ((RemoteException) t).unwrapRemoteException(); } - // In case SocketTimeoutException/FailedServerException, retry - if (t instanceof java.net.SocketTimeoutException - || t instanceof FailedServerException) { - Threads.sleep(100); - continue; + if (t instanceof FailedServerException && i < maximumAttempts) { + // In case the server is in the failed server list, no point to + // retry too soon. Retry after the failed_server_expiry time + try { + Configuration conf = this.server.getConfiguration(); + long sleepTime = 1 + conf.getInt(RpcClient.FAILED_SERVER_EXPIRY_KEY, + RpcClient.FAILED_SERVER_EXPIRY_DEFAULT); + if (LOG.isDebugEnabled()) { + LOG.debug(serverName + " is on failed server list; waiting " + + sleepTime + "ms", t); + } + Thread.sleep(sleepTime); + continue; + } catch (InterruptedException ie) { + LOG.warn("Failed to unassign " + + hri.getRegionNameAsString() + " since interrupted", ie); + regionStates.updateRegionState(hri, State.FAILED_CLOSE); + Thread.currentThread().interrupt(); + return; + } } - if (!(t instanceof NotServingRegionException - || t instanceof RegionAlreadyInTransitionException)) { - // NotServingRegionException/RegionAlreadyInTransitionException - // means the target server got the original region close request. - // For other exceptions, re-close it - LOG.debug("Got exception in retry sendRegionClose for " - + regionState + ", re-close it", t); - invokeUnAssign(hri); + if (serverManager.isServerOnline(serverName) + && t instanceof java.net.SocketTimeoutException) { + i--; // reset the try count + } else { + LOG.info("Got exception in retrying sendRegionClose for " + + regionState + "; try=" + i + " of " + maximumAttempts, t); } - return; // Done. + Threads.sleep(100); } } + // Run out of attempts + regionStates.updateRegionState(hri, State.FAILED_CLOSE); } finally { lock.unlock(); } @@ -1886,7 +1867,7 @@ public class AssignmentManager { /** * @param region Region whose plan we are to clear. */ - void clearRegionPlan(final HRegionInfo region) { + private void clearRegionPlan(final HRegionInfo region) { synchronized (this.regionPlans) { this.regionPlans.remove(region.getEncodedName()); } @@ -1933,11 +1914,7 @@ public class AssignmentManager { } void invokeAssign(HRegionInfo regionInfo) { - invokeAssign(regionInfo, true); - } - - void invokeAssign(HRegionInfo regionInfo, boolean newPlan) { - threadPoolExecutorService.submit(new AssignCallable(this, regionInfo, newPlan)); + threadPoolExecutorService.submit(new AssignCallable(this, regionInfo)); } void invokeUnAssign(HRegionInfo regionInfo) { @@ -1991,8 +1968,8 @@ public class AssignmentManager { } } } - List<HRegionInfo> regions = regionStates.serverOffline(sn); - for (Iterator<HRegionInfo> it = regions.iterator(); it.hasNext(); ) { + List<HRegionInfo> rits = regionStates.serverOffline(sn); + for (Iterator<HRegionInfo> it = rits.iterator(); it.hasNext(); ) { HRegionInfo hri = it.next(); String encodedName = hri.getEncodedName(); @@ -2003,8 +1980,8 @@ public class AssignmentManager { regionStates.getRegionTransitionState(encodedName); if (regionState == null || (regionState.getServerName() != null && !regionState.isOnServer(sn)) - || !(regionState.isFailedClose() || regionState.isOffline() - || regionState.isPendingOpenOrOpening())) { + || !RegionStates.isOneOfStates(regionState, State.PENDING_OPEN, + State.OPENING, State.FAILED_OPEN, State.FAILED_CLOSE, State.OFFLINE)) { LOG.info("Skip " + regionState + " since it is not opening/failed_close" + " on the dead server any more: " + sn); it.remove(); @@ -2022,7 +1999,7 @@ public class AssignmentManager { lock.unlock(); } } - return regions; + return rits; } /** @@ -2076,8 +2053,20 @@ public class AssignmentManager { } } - private void onRegionFailedOpen( - final HRegionInfo hri, final ServerName sn) { + private String onRegionFailedOpen(final RegionState current, + final HRegionInfo hri, final ServerName serverName) { + // The region must be opening on this server. + // If current state is failed_open on the same server, + // it could be a reportRegionTransition RPC retry. + if (current == null || !current.isOnServer(serverName) + || !(current.isOpening() || current.isFailedOpen())) { + return hri.getShortNameToLog() + " is not opening on " + serverName; + } + + if (current.isFailedOpen()) { + return null; + } + String encodedName = hri.getEncodedName(); AtomicInteger failedOpenCount = failedOpenTracker.get(encodedName); if (failedOpenCount == null) { @@ -2098,166 +2087,386 @@ public class AssignmentManager { if (regionState != null) { // When there are more than one region server a new RS is selected as the // destination and the same is updated in the region plan. (HBASE-5546) - if (getTableStateManager().isTableState(hri.getTable(), - ZooKeeperProtos.Table.State.DISABLED, ZooKeeperProtos.Table.State.DISABLING) || - replicasToClose.contains(hri)) { + if (getTableStateManager().isTableState(hri.getTable(), Table.State.DISABLED, + Table.State.DISABLING) || replicasToClose.contains(hri)) { offlineDisabledRegion(hri); - return; + return null; } - // ZK Node is in CLOSED state, assign it. - regionStates.updateRegionState(hri, RegionState.State.CLOSED); + regionStates.updateRegionState(hri, RegionState.State.CLOSED); // This below has to do w/ online enable/disable of a table removeClosedRegion(hri); try { - getRegionPlan(hri, sn, true); + getRegionPlan(hri, true); } catch (HBaseIOException e) { LOG.warn("Failed to get region plan", e); } - invokeAssign(hri, false); + invokeAssign(hri); } } + // Null means no error + return null; } - private void onRegionOpen( - final HRegionInfo hri, final ServerName sn, long openSeqNum) { - regionOnline(hri, sn, openSeqNum); + private String onRegionOpen(final RegionState current, final HRegionInfo hri, + final ServerName serverName, final RegionStateTransition transition) { + // The region must be opening on this server. + // If current state is already opened on the same server, + // it could be a reportRegionTransition RPC retry. + if (current == null || !current.isOnServer(serverName) + || !(current.isOpening() || current.isOpened())) { + return hri.getShortNameToLog() + " is not opening on " + serverName; + } + + if (current.isOpened()) { + return null; + } + + long openSeqNum = transition.hasOpenSeqNum() + ? transition.getOpenSeqNum() : HConstants.NO_SEQNUM; + if (openSeqNum < 0) { + return "Newly opened region has invalid open seq num " + openSeqNum; + } + regionOnline(hri, serverName, openSeqNum); // reset the count, if any failedOpenTracker.remove(hri.getEncodedName()); if (getTableStateManager().isTableState(hri.getTable(), - ZooKeeperProtos.Table.State.DISABLED, ZooKeeperProtos.Table.State.DISABLING)) { + Table.State.DISABLED, Table.State.DISABLING)) { invokeUnAssign(hri); } + return null; } - private void onRegionClosed(final HRegionInfo hri) { - if (getTableStateManager().isTableState(hri.getTable(), - ZooKeeperProtos.Table.State.DISABLED, ZooKeeperProtos.Table.State.DISABLING) || - replicasToClose.contains(hri)) { + private String onRegionClosed(final RegionState current, + final HRegionInfo hri, final ServerName serverName) { + // We didn't check if the region is already closed/offline on the server + // as we did for other transitions to handle reportRegionTransition RPC retry. + // There are two reasons. 1. Closed/offline states are transient. Region will be + // usually assigned right after closed. When a RPC retry comes in, the region may + // already have moved away from closed state. 2. On the region server side, we + // don't care much about the response for this transition. We only make sure + // master has got and processed this report, either successfully or not. + if (current == null || !current.isOnServer(serverName) || !current.isClosing()) { + return hri.getShortNameToLog() + " is not closing on " + serverName; + } + if (getTableStateManager().isTableState(hri.getTable(), Table.State.DISABLED, + Table.State.DISABLING) || replicasToClose.contains(hri)) { offlineDisabledRegion(hri); - return; + return null; } + regionStates.updateRegionState(hri, RegionState.State.CLOSED); sendRegionClosedNotification(hri); // This below has to do w/ online enable/disable of a table removeClosedRegion(hri); - invokeAssign(hri, false); + invokeAssign(hri); + return null; } - private String onRegionSplit(ServerName sn, TransitionCode code, - final HRegionInfo p, final HRegionInfo a, final HRegionInfo b) { - final RegionState rs_p = regionStates.getRegionState(p); + private String onRegionReadyToSplit(final RegionState current, final HRegionInfo hri, + final ServerName serverName, final RegionStateTransition transition) { + // The region must be opened on this server. + // If current state is already splitting on the same server, + // it could be a reportRegionTransition RPC retry. + if (current == null || !current.isOnServer(serverName) + || !(current.isOpened() || current.isSplitting())) { + return hri.getShortNameToLog() + " is not opening on " + serverName; + } + + if (current.isSplitting()) { + return null; + } + + final HRegionInfo a = HRegionInfo.convert(transition.getRegionInfo(1)); + final HRegionInfo b = HRegionInfo.convert(transition.getRegionInfo(2)); RegionState rs_a = regionStates.getRegionState(a); RegionState rs_b = regionStates.getRegionState(b); - if (!(rs_p.isOpenOrSplittingOnServer(sn) - && (rs_a == null || rs_a.isOpenOrSplittingNewOnServer(sn)) - && (rs_b == null || rs_b.isOpenOrSplittingNewOnServer(sn)))) { - return "Not in state good for split"; + if (rs_a != null || rs_b != null) { + return "Some daughter is already existing. " + + "a=" + rs_a + ", b=" + rs_b; + } + + // Server holding is not updated at this stage. + // It is done after PONR. + regionStates.updateRegionState(hri, State.SPLITTING); + regionStates.createRegionState( + a, State.SPLITTING_NEW, serverName, null); + regionStates.createRegionState( + b, State.SPLITTING_NEW, serverName, null); + return null; + } + + private String onRegionSplitPONR(final RegionState current, final HRegionInfo hri, + final ServerName serverName, final RegionStateTransition transition) { + // The region must be splitting on this server, and the daughters must be in + // splitting_new state. To check RPC retry, we use server holding info. + if (current == null || !current.isOnServer(serverName) || !current.isSplitting()) { + return hri.getShortNameToLog() + " is not splitting on " + serverName; } - regionStates.updateRegionState(a, State.SPLITTING_NEW, sn); - regionStates.updateRegionState(b, State.SPLITTING_NEW, sn); - regionStates.updateRegionState(p, State.SPLITTING); + final HRegionInfo a = HRegionInfo.convert(transition.getRegionInfo(1)); + final HRegionInfo b = HRegionInfo.convert(transition.getRegionInfo(2)); + RegionState rs_a = regionStates.getRegionState(a); + RegionState rs_b = regionStates.getRegionState(b); + if (rs_a == null || rs_b == null || !rs_a.isOnServer(serverName) + || !rs_b.isOnServer(serverName) || !rs_a.isSplittingNew() + || !rs_b.isSplittingNew()) { + return "Some daughter is not known to be splitting on " + serverName + + ", a=" + rs_a + ", b=" + rs_b; + } - if (code == TransitionCode.SPLIT) { - if (TEST_SKIP_SPLIT_HANDLING) { - return "Skipping split message, TEST_SKIP_SPLIT_HANDLING is set"; - } - regionOffline(p, State.SPLIT); - regionOnline(a, sn, 1); - regionOnline(b, sn, 1); - - // User could disable the table before master knows the new region. - if (getTableStateManager().isTableState(p.getTable(), - ZooKeeperProtos.Table.State.DISABLED, ZooKeeperProtos.Table.State.DISABLING)) { - invokeUnAssign(a); - invokeUnAssign(b); - } else { - Callable<Object> splitReplicasCallable = new Callable<Object>() { - @Override - public Object call() { - doSplittingOfReplicas(p, a, b); - return null; - } - }; - threadPoolExecutorService.submit(splitReplicasCallable); - } - } else if (code == TransitionCode.SPLIT_PONR) { - try { - regionStates.splitRegion(p, a, b, sn); - } catch (IOException ioe) { - LOG.info("Failed to record split region " + p.getShortNameToLog()); - return "Failed to record the splitting in meta"; - } - } else if (code == TransitionCode.SPLIT_REVERTED) { - regionOnline(p, sn); - regionOffline(a); - regionOffline(b); - - if (getTableStateManager().isTableState(p.getTable(), - ZooKeeperProtos.Table.State.DISABLED, ZooKeeperProtos.Table.State.DISABLING)) { - invokeUnAssign(p); - } + if (!regionStates.isRegionOnServer(hri, serverName)) { + return null; + } + + try { + regionStates.splitRegion(hri, a, b, serverName); + } catch (IOException ioe) { + LOG.info("Failed to record split region " + hri.getShortNameToLog()); + return "Failed to record the splitting in meta"; } return null; } - private String onRegionMerge(ServerName sn, TransitionCode code, - final HRegionInfo p, final HRegionInfo a, final HRegionInfo b) { - RegionState rs_p = regionStates.getRegionState(p); + private String onRegionSplit(final RegionState current, final HRegionInfo hri, + final ServerName serverName, final RegionStateTransition transition) { + // The region must be splitting on this server, and the daughters must be in + // splitting_new state. + // If current state is already split on the same server, + // it could be a reportRegionTransition RPC retry. + if (current == null || !current.isOnServer(serverName) + || !(current.isSplitting() || current.isSplit())) { + return hri.getShortNameToLog() + " is not splitting on " + serverName; + } + + if (current.isSplit()) { + return null; + } + + final HRegionInfo a = HRegionInfo.convert(transition.getRegionInfo(1)); + final HRegionInfo b = HRegionInfo.convert(transition.getRegionInfo(2)); RegionState rs_a = regionStates.getRegionState(a); RegionState rs_b = regionStates.getRegionState(b); - if (!(rs_a.isOpenOrMergingOnServer(sn) && rs_b.isOpenOrMergingOnServer(sn) - && (rs_p == null || rs_p.isOpenOrMergingNewOnServer(sn)))) { - return "Not in state good for merge"; - } - - regionStates.updateRegionState(a, State.MERGING); - regionStates.updateRegionState(b, State.MERGING); - regionStates.updateRegionState(p, State.MERGING_NEW, sn); - - String encodedName = p.getEncodedName(); - if (code == TransitionCode.READY_TO_MERGE) { - mergingRegions.put(encodedName, - new PairOfSameType<HRegionInfo>(a, b)); - } else if (code == TransitionCode.MERGED) { - mergingRegions.remove(encodedName); - regionOffline(a, State.MERGED); - regionOffline(b, State.MERGED); - regionOnline(p, sn, 1); - - // User could disable the table before master knows the new region. - if (getTableStateManager().isTableState(p.getTable(), - ZooKeeperProtos.Table.State.DISABLED, ZooKeeperProtos.Table.State.DISABLING)) { - invokeUnAssign(p); - } else { - Callable<Object> mergeReplicasCallable = new Callable<Object>() { - @Override - public Object call() { - doMergingOfReplicas(p, a, b); - return null; - } - }; - threadPoolExecutorService.submit(mergeReplicasCallable); + if (rs_a == null || rs_b == null || !rs_a.isOnServer(serverName) + || !rs_b.isOnServer(serverName) || !rs_a.isSplittingNew() + || !rs_b.isSplittingNew()) { + return "Some daughter is not known to be splitting on " + serverName + + ", a=" + rs_a + ", b=" + rs_b; + } + + if (TEST_SKIP_SPLIT_HANDLING) { + return "Skipping split message, TEST_SKIP_SPLIT_HANDLING is set"; + } + regionOffline(hri, State.SPLIT); + regionOnline(a, serverName, 1); + regionOnline(b, serverName, 1); + + // User could disable the table before master knows the new region. + if (getTableStateManager().isTableState(hri.getTable(), + Table.State.DISABLED, Table.State.DISABLING)) { + invokeUnAssign(a); + invokeUnAssign(b); + } else { + Callable<Object> splitReplicasCallable = new Callable<Object>() { + @Override + public Object call() { + doSplittingOfReplicas(hri, a, b); + return null; + } + }; + threadPoolExecutorService.submit(splitReplicasCallable); + } + return null; + } + + private String onRegionSplitReverted(final RegionState current, final HRegionInfo hri, + final ServerName serverName, final RegionStateTransition transition) { + // The region must be splitting on this server, and the daughters must be in + // splitting_new state. + // If the region is in open state, it could be an RPC retry. + if (current == null || !current.isOnServer(serverName) + || !(current.isSplitting() || current.isOpened())) { + return hri.getShortNameToLog() + " is not splitting on " + serverName; + } + + if (current.isOpened()) { + return null; + } + + final HRegionInfo a = HRegionInfo.convert(transition.getRegionInfo(1)); + final HRegionInfo b = HRegionInfo.convert(transition.getRegionInfo(2)); + RegionState rs_a = regionStates.getRegionState(a); + RegionState rs_b = regionStates.getRegionState(b); + if (rs_a == null || rs_b == null || !rs_a.isOnServer(serverName) + || !rs_b.isOnServer(serverName) || !rs_a.isSplittingNew() + || !rs_b.isSplittingNew()) { + return "Some daughter is not known to be splitting on " + serverName + + ", a=" + rs_a + ", b=" + rs_b; + } + + regionOnline(hri, serverName); + regionOffline(a); + regionOffline(b); + if (getTableStateManager().isTableState(hri.getTable(), + Table.State.DISABLED, Table.State.DISABLING)) { + invokeUnAssign(hri); + } + return null; + } + + private String onRegionReadyToMerge(final RegionState current, final HRegionInfo hri, + final ServerName serverName, final RegionStateTransition transition) { + // The region must be new, and the daughters must be open on this server. + // If the region is in merge_new state, it could be an RPC retry. + if (current != null && (!current.isOnServer(serverName) + || !current.isMergingNew())) { + return "Merging daughter region already exists, p=" + current; + } + + if (current != null) { + return null; + } + + final HRegionInfo a = HRegionInfo.convert(transition.getRegionInfo(1)); + final HRegionInfo b = HRegionInfo.convert(transition.getRegionInfo(2)); + Set<String> encodedNames = new HashSet<String>(2); + encodedNames.add(a.getEncodedName()); + encodedNames.add(b.getEncodedName()); + Map<String, Lock> locks = locker.acquireLocks(encodedNames); + try { + RegionState rs_a = regionStates.getRegionState(a); + RegionState rs_b = regionStates.getRegionState(b); + if (rs_a == null || rs_b == null || !rs_a.isOnServer(serverName) + || !rs_b.isOnServer(serverName) || !rs_a.isOpened() + || !rs_b.isOpened()) { + return "Some daughter is not in a state to merge on " + serverName + + ", a=" + rs_a + ", b=" + rs_b; } - } else if (code == TransitionCode.MERGE_PONR) { - try { - regionStates.mergeRegions(p, a, b, sn); - } catch (IOException ioe) { - LOG.info("Failed to record merged region " + p.getShortNameToLog()); - return "Failed to record the merging in meta"; + + regionStates.updateRegionState(a, State.MERGING); + regionStates.updateRegionState(b, State.MERGING); + regionStates.createRegionState( + hri, State.MERGING_NEW, serverName, null); + return null; + } finally { + for (Lock lock: locks.values()) { + lock.unlock(); } + } + } + + private String onRegionMergePONR(final RegionState current, final HRegionInfo hri, + final ServerName serverName, final RegionStateTransition transition) { + // The region must be in merging_new state, and the daughters must be + // merging. To check RPC retry, we use server holding info. + if (current == null || !current.isOnServer(serverName) || !current.isMergingNew()) { + return hri.getShortNameToLog() + " is not merging on " + serverName; + } + + final HRegionInfo a = HRegionInfo.convert(transition.getRegionInfo(1)); + final HRegionInfo b = HRegionInfo.convert(transition.getRegionInfo(2)); + RegionState rs_a = regionStates.getRegionState(a); + RegionState rs_b = regionStates.getRegionState(b); + if (rs_a == null || rs_b == null || !rs_a.isOnServer(serverName) + || !rs_b.isOnServer(serverName) || !rs_a.isMerging() + || !rs_b.isMerging()) { + return "Some daughter is not known to be merging on " + serverName + + ", a=" + rs_a + ", b=" + rs_b; + } + + if (regionStates.isRegionOnServer(hri, serverName)) { + return null; + } + + try { + regionStates.mergeRegions(hri, a, b, serverName); + } catch (IOException ioe) { + LOG.info("Failed to record merged region " + hri.getShortNameToLog()); + return "Failed to record the merging in meta"; + } + return null; + } + + private String onRegionMerged(final RegionState current, final HRegionInfo hri, + final ServerName serverName, final RegionStateTransition transition) { + // The region must be in merging_new state, and the daughters must be + // merging on this server. + // If current state is already opened on the same server, + // it could be a reportRegionTransition RPC retry. + if (current == null || !current.isOnServer(serverName) + || !(current.isMergingNew() || current.isOpened())) { + return hri.getShortNameToLog() + " is not merging on " + serverName; + } + + if (current.isOpened()) { + return null; + } + + final HRegionInfo a = HRegionInfo.convert(transition.getRegionInfo(1)); + final HRegionInfo b = HRegionInfo.convert(transition.getRegionInfo(2)); + RegionState rs_a = regionStates.getRegionState(a); + RegionState rs_b = regionStates.getRegionState(b); + if (rs_a == null || rs_b == null || !rs_a.isOnServer(serverName) + || !rs_b.isOnServer(serverName) || !rs_a.isMerging() + || !rs_b.isMerging()) { + return "Some daughter is not known to be merging on " + serverName + + ", a=" + rs_a + ", b=" + rs_b; + } + + regionOffline(a, State.MERGED); + regionOffline(b, State.MERGED); + regionOnline(hri, serverName, 1); + + // User could disable the table before master knows the new region. + if (getTableStateManager().isTableState(hri.getTable(), + Table.State.DISABLED, Table.State.DISABLING)) { + invokeUnAssign(hri); } else { - mergingRegions.remove(encodedName); - regionOnline(a, sn); - regionOnline(b, sn); - regionOffline(p); - - if (getTableStateManager().isTableState(p.getTable(), - ZooKeeperProtos.Table.State.DISABLED, ZooKeeperProtos.Table.State.DISABLING)) { - invokeUnAssign(a); - invokeUnAssign(b); - } + Callable<Object> mergeReplicasCallable = new Callable<Object>() { + @Override + public Object call() { + doMergingOfReplicas(hri, a, b); + return null; + } + }; + threadPoolExecutorService.submit(mergeReplicasCallable); + } + return null; + } + + private String onRegionMergeReverted(final RegionState current, final HRegionInfo hri, + final ServerName serverName, final RegionStateTransition transition) { + // The region must be in merging_new state, and the daughters must be + // merging on this server. + // If the region is in offline state, it could be an RPC retry. + if (current == null || !current.isOnServer(serverName) + || !(current.isMergingNew() || current.isOffline())) { + return hri.getShortNameToLog() + " is not merging on " + serverName; + } + + if (current.isOffline()) { + return null; + } + + final HRegionInfo a = HRegionInfo.convert(transition.getRegionInfo(1)); + final HRegionInfo b = HRegionInfo.convert(transition.getRegionInfo(2)); + RegionState rs_a = regionStates.getRegionState(a); + RegionState rs_b = regionStates.getRegionState(b); + if (rs_a == null || rs_b == null || !rs_a.isOnServer(serverName) + || !rs_b.isOnServer(serverName) || !rs_a.isMerging() + || !rs_b.isMerging()) { + return "Some daughter is not known to be merging on " + serverName + + ", a=" + rs_a + ", b=" + rs_b; + } + + regionOnline(a, serverName); + regionOnline(b, serverName); + regionOffline(hri); + + if (getTableStateManager().isTableState(hri.getTable(), + Table.State.DISABLED, Table.State.DISABLING)) { + invokeUnAssign(a); + invokeUnAssign(b); } return null; } @@ -2466,76 +2675,61 @@ public class AssignmentManager { final RegionStateTransition transition) { TransitionCode code = transition.getTransitionCode(); HRegionInfo hri = HRegionInfo.convert(transition.getRegionInfo(0)); - RegionState current = regionStates.getRegionState(hri); - if (LOG.isDebugEnabled()) { - LOG.debug("Got transition " + code + " for " - + (current != null ? current.toString() : hri.getShortNameToLog()) - + " from " + serverName); - } - String errorMsg = null; - switch (code) { - case OPENED: - if (current != null && current.isOpened() && current.isOnServer(serverName)) { - LOG.info("Region " + hri.getShortNameToLog() + " is already " + current.getState() + " on " - + serverName); - break; - } - case FAILED_OPEN: - if (current == null - || !current.isPendingOpenOrOpeningOnServer(serverName)) { - errorMsg = hri.getShortNameToLog() - + " is not pending open on " + serverName; - } else if (code == TransitionCode.FAILED_OPEN) { - onRegionFailedOpen(hri, serverName); - } else { - long openSeqNum = HConstants.NO_SEQNUM; - if (transition.hasOpenSeqNum()) { - openSeqNum = transition.getOpenSeqNum(); - } - if (openSeqNum < 0) { - errorMsg = "Newly opened region has invalid open seq num " + openSeqNum; - } else { - onRegionOpen(hri, serverName, openSeqNum); - } + Lock lock = locker.acquireLock(hri.getEncodedName()); + try { + RegionState current = regionStates.getRegionState(hri); + if (LOG.isDebugEnabled()) { + LOG.debug("Got transition " + code + " for " + + (current != null ? current.toString() : hri.getShortNameToLog()) + + " from " + serverName); } - break; + String errorMsg = null; + switch (code) { + case OPENED: + errorMsg = onRegionOpen(current, hri, serverName, transition); + break; + case FAILED_OPEN: + errorMsg = onRegionFailedOpen(current, hri, serverName); + break; + case CLOSED: + errorMsg = onRegionClosed(current, hri, serverName); + break; + case READY_TO_SPLIT: + errorMsg = onRegionReadyToSplit(current, hri, serverName, transition); + break; + case SPLIT_PONR: + errorMsg = onRegionSplitPONR(current, hri, serverName, transition); + break; + case SPLIT: + errorMsg = onRegionSplit(current, hri, serverName, transition); + break; + case SPLIT_REVERTED: + errorMsg = onRegionSplitReverted(current, hri, serverName, transition); + break; + case READY_TO_MERGE: + errorMsg = onRegionReadyToMerge(current, hri, serverName, transition); + break; + case MERGE_PONR: + errorMsg = onRegionMergePONR(current, hri, serverName, transition); + break; + case MERGED: + errorMsg = onRegionMerged(current, hri, serverName, transition); + break; + case MERGE_REVERTED: + errorMsg = onRegionMergeReverted(current, hri, serverName, transition); + break; - case CLOSED: - if (current == null - || !current.isPendingCloseOrClosingOnServer(serverName)) { - errorMsg = hri.getShortNameToLog() - + " is not pending close on " + serverName; - } else { - onRegionClosed(hri); + default: + errorMsg = "Unexpected transition code " + code; } - break; - - case READY_TO_SPLIT: - case SPLIT_PONR: - case SPLIT: - case SPLIT_REVERTED: - errorMsg = onRegionSplit(serverName, code, hri, - HRegionInfo.convert(transition.getRegionInfo(1)), - HRegionInfo.convert(transition.getRegionInfo(2))); - break; - - case READY_TO_MERGE: - case MERGE_PONR: - case MERGED: - case MERGE_REVERTED: - errorMsg = onRegionMerge(serverName, code, hri, - HRegionInfo.convert(transition.getRegionInfo(1)), - HRegionInfo.convert(transition.getRegionInfo(2))); - break; - - default: - errorMsg = "Unexpected transition code " + code; - } - if (errorMsg != null) { - LOG.error("Failed to transition region from " + current + " to " - + code + " by " + serverName + ": " + errorMsg); + if (errorMsg != null) { + LOG.error("Failed to transition region from " + current + " on " + + code + " by " + serverName + ": " + errorMsg); + } + return errorMsg; + } finally { + lock.unlock(); } - return errorMsg; } /** http://git-wip-us.apache.org/repos/asf/hbase/blob/3a82cf23/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java index 60296fa..6a0882c 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/RegionStates.java @@ -561,7 +561,8 @@ public class RegionStates { // Offline state is also kind of pending open if the region is in // transition. The region could be in failed_close state too if we have // tried several times to open it while this region server is not reachable) - if (state.isPendingOpenOrOpening() || state.isFailedClose() || state.isOffline()) { + if (isOneOfStates(state, State.OPENING, State.PENDING_OPEN, + State.FAILED_OPEN, State.FAILED_CLOSE, State.OFFLINE)) { LOG.info("Found region in " + state + " to be reassigned by SSH for " + sn); rits.add(hri); } else { @@ -724,6 +725,12 @@ public class RegionStates { lastAssignments.put(encodedName, serverName); } + synchronized boolean isRegionOnServer( + final HRegionInfo hri, final ServerName serverName) { + Set<HRegionInfo> regions = serverHoldings.get(serverName); + return regions == null ? false : regions.contains(hri); + } + void splitRegion(HRegionInfo p, HRegionInfo a, HRegionInfo b, ServerName sn) throws IOException { regionStateStore.splitRegion(p, a, b, sn); http://git-wip-us.apache.org/repos/asf/hbase/blob/3a82cf23/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java index 9390eba..60fb2bb 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java @@ -724,9 +724,8 @@ public class ServerManager { throws IOException { AdminService.BlockingInterface admin = getRsAdmin(server); if (admin == null) { - LOG.warn("Attempting to send OPEN RPC to server " + server.toString() + + throw new IOException("Attempting to send OPEN RPC to server " + server.toString() + " failed because no RPC connection found to this server"); - return RegionOpeningState.FAILED_OPENING; } OpenRegionRequest request = RequestConverter.buildOpenRegionRequest(server, region, favoredNodes, @@ -753,9 +752,8 @@ public class ServerManager { throws IOException { AdminService.BlockingInterface admin = getRsAdmin(server); if (admin == null) { - LOG.warn("Attempting to send OPEN RPC to server " + server.toString() + + throw new IOException("Attempting to send OPEN RPC to server " + server.toString() + " failed because no RPC connection found to this server"); - return null; } OpenRegionRequest request = RequestConverter.buildOpenRegionRequest(regionOpenInfos, http://git-wip-us.apache.org/repos/asf/hbase/blob/3a82cf23/hbase-server/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java index 7898edc..cae6142 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java @@ -238,7 +238,7 @@ public class ServerShutdownHandler extends EventHandler { } toAssignRegions.add(hri); } else if (rit != null) { - if ((rit.isPendingCloseOrClosing() || rit.isOffline()) + if ((rit.isClosing() || rit.isFailedClose() || rit.isOffline()) && am.getTableStateManager().isTableState(hri.getTable(), ZooKeeperProtos.Table.State.DISABLED, ZooKeeperProtos.Table.State.DISABLING) || am.getReplicasToClose().contains(hri)) { http://git-wip-us.apache.org/repos/asf/hbase/blob/3a82cf23/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java index c5cdacf..b60befd 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java @@ -2480,10 +2480,9 @@ public class HRegionServer extends HasThread implements * @param abort True if we are aborting * @return True if closed a region. * @throws NotServingRegionException if the region is not online - * @throws RegionAlreadyInTransitionException if the region is already closing */ protected boolean closeRegion(String encodedName, final boolean abort, final ServerName sn) - throws NotServingRegionException, RegionAlreadyInTransitionException { + throws NotServingRegionException { //Check for permissions to close. HRegion actualRegion = this.getFromOnlineRegions(encodedName); if ((actualRegion != null) && (actualRegion.getCoprocessorHost() != null)) { @@ -2518,15 +2517,8 @@ public class HRegionServer extends HasThread implements } } else if (Boolean.FALSE.equals(previous)) { LOG.info("Received CLOSE for the region: " + encodedName + - " ,which we are already trying to CLOSE, but not completed yet"); - // The master will retry till the region is closed. We need to do this since - // the region could fail to close somehow. If we mark the region closed in master - // while it is not, there could be data loss. - // If the region stuck in closing for a while, and master runs out of retries, - // master will move the region to failed_to_close. Later on, if the region - // is indeed closed, master can properly re-assign it. - throw new RegionAlreadyInTransitionException("The region " + encodedName + - " was already closing. New CLOSE request is ignored."); + ", which we are already trying to CLOSE, but not completed yet"); + return true; } if (actualRegion == null) { http://git-wip-us.apache.org/repos/asf/hbase/blob/3a82cf23/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/RSRpcServices.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/RSRpcServices.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/RSRpcServices.java index 22e4d88..3e15a2e 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/RSRpcServices.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/RSRpcServices.java @@ -51,7 +51,6 @@ import org.apache.hadoop.hbase.HRegionInfo; import org.apache.hadoop.hbase.HTableDescriptor; import org.apache.hadoop.hbase.KeyValue; import org.apache.hadoop.hbase.KeyValueUtil; -import org.apache.hadoop.hbase.MetaTableAccessor; import org.apache.hadoop.hbase.NotServingRegionException; import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.TableName; @@ -1180,7 +1179,6 @@ public class RSRpcServices implements HBaseRPCErrorHandler, * @throws ServiceException */ @Override - @SuppressWarnings("deprecation") @QosPriority(priority=HConstants.HIGH_QOS) public OpenRegionResponse openRegion(final RpcController controller, final OpenRegionRequest request) throws ServiceException { @@ -1236,35 +1234,15 @@ public class RSRpcServices implements HBaseRPCErrorHandler, final HRegionInfo region = HRegionInfo.convert(regionOpenInfo.getRegion()); HTableDescriptor htd; try { - final HRegion onlineRegion = regionServer.getFromOnlineRegions(region.getEncodedName()); + String encodedName = region.getEncodedName(); + byte[] encodedNameBytes = region.getEncodedNameAsBytes(); + final HRegion onlineRegion = regionServer.getFromOnlineRegions(encodedName); if (onlineRegion != null) { - //Check if the region can actually be opened. - if (onlineRegion.getCoprocessorHost() != null) { - onlineRegion.getCoprocessorHost().preOpen(); - } - // See HBASE-5094. Cross check with hbase:meta if still this RS is owning - // the region. - Pair<HRegionInfo, ServerName> p = MetaTableAccessor.getRegion( - regionServer.getShortCircuitConnection(), region.getRegionName()); - if (regionServer.serverName.equals(p.getSecond())) { - Boolean closing = regionServer.regionsInTransitionInRS.get(region.getEncodedNameAsBytes()); - // Map regionsInTransitionInRSOnly has an entry for a region only if the region - // is in transition on this RS, so here closing can be null. If not null, it can - // be true or false. True means the region is opening on this RS; while false - // means the region is closing. Only return ALREADY_OPENED if not closing (i.e. - // not in transition any more, or still transition to open. - if (!Boolean.FALSE.equals(closing) - && regionServer.getFromOnlineRegions(region.getEncodedName()) != null) { - LOG.warn("Attempted open of " + region.getEncodedName() - + " but already online on this server"); - builder.addOpeningState(RegionOpeningState.ALREADY_OPENED); - continue; - } - } else { - LOG.warn("The region " + region.getEncodedName() + " is online on this server" - + " but hbase:meta does not have this server - continue opening."); - regionServer.removeFromOnlineRegions(onlineRegion, null); - } + // The region is already online. This should not happen any more. + String error = "Received OPEN for the region:" + + region.getRegionNameAsString() + ", which is already online"; + regionServer.abort(error); + throw new IOException(error); } LOG.info("Open " + region.getRegionNameAsString()); htd = htds.get(region.getTable()); @@ -1274,18 +1252,23 @@ public class RSRpcServices implements HBaseRPCErrorHandler, } final Boolean previous = regionServer.regionsInTransitionInRS.putIfAbsent( - region.getEncodedNameAsBytes(), Boolean.TRUE); + encodedNameBytes, Boolean.TRUE); if (Boolean.FALSE.equals(previous)) { - // There is a close in progress. This should not happen any more. - throw new RegionAlreadyInTransitionException("Received OPEN for the region:" - + region.getRegionNameAsString() + " , which we are already trying to CLOSE "); + if (regionServer.getFromOnlineRegions(encodedName) != null) { + // There is a close in progress. This should not happen any more. + String error = "Received OPEN for the region:" + + region.getRegionNameAsString() + ", which we are already trying to CLOSE"; + regionServer.abort(error); + throw new IOException(error); + } + regionServer.regionsInTransitionInRS.put(encodedNameBytes, Boolean.TRUE); } if (Boolean.TRUE.equals(previous)) { // An open is in progress. This is supported, but let's log this. LOG.info("Receiving OPEN for the region:" + - region.getRegionNameAsString() + " , which we are already trying to OPEN" + region.getRegionNameAsString() + ", which we are already trying to OPEN" + " - ignoring this new request for this region."); } @@ -1293,7 +1276,7 @@ public class RSRpcServices implements HBaseRPCErrorHandler, // want to keep returning the stale moved record while we are opening/if we close again. regionServer.removeFromMovedRegions(region.getEncodedName()); - if (previous == null) { + if (previous == null || !previous.booleanValue()) { // check if the region to be opened is marked in recovering state in ZK if (ZKSplitLog.isRegionMarkedRecoveringInZK(regionServer.getZooKeeper(), region.getEncodedName())) { http://git-wip-us.apache.org/repos/asf/hbase/blob/3a82cf23/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/CloseRegionHandler.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/CloseRegionHandler.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/CloseRegionHandler.java index 77d8167..1c886d8 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/CloseRegionHandler.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/CloseRegionHandler.java @@ -122,7 +122,7 @@ public class CloseRegionHandler extends EventHandler { LOG.debug("Closed " + region.getRegionNameAsString()); } finally { this.rsServices.getRegionsInTransitionInRS(). - remove(this.regionInfo.getEncodedNameAsBytes()); + remove(this.regionInfo.getEncodedNameAsBytes(), Boolean.FALSE); } } } http://git-wip-us.apache.org/repos/asf/hbase/blob/3a82cf23/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java index 4b5556e..c001468 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java @@ -191,12 +191,10 @@ public class TestAssignmentManagerOnCluster { // Region is assigned now. Let's assign it again. // Master should not abort, and region should be assigned. - RegionState oldState = regionStates.getRegionState(hri); TEST_UTIL.getHBaseAdmin().assign(hri.getRegionName()); master.getAssignmentManager().waitForAssignment(hri); RegionState newState = regionStates.getRegionState(hri); - assertTrue(newState.isOpened() - && newState.getStamp() != oldState.getStamp()); + assertTrue(newState.isOpened()); } finally { TEST_UTIL.deleteTable(Bytes.toBytes(table)); } @@ -231,7 +229,7 @@ public class TestAssignmentManagerOnCluster { // Use the first server as the destination server ServerName destServer = onlineServers.iterator().next(); - // Created faked dead server + // Created faked dead server that is still online in master deadServer = ServerName.valueOf(destServer.getHostname(), destServer.getPort(), destServer.getStartcode() - 100L); master.serverManager.recordNewServerWithLock(deadServer, ServerLoad.EMPTY_SERVERLOAD); @@ -415,14 +413,11 @@ public class TestAssignmentManagerOnCluster { } /** - * This test should not be flaky. If it is flaky, it means something - * wrong with AssignmentManager which should be reported and fixed - * - * This tests forcefully assign a region while it's closing and re-assigned. + * This tests assign a region while it's closing. */ @Test (timeout=60000) - public void testForceAssignWhileClosing() throws Exception { - String table = "testForceAssignWhileClosing"; + public void testAssignWhileClosing() throws Exception { + String table = "testAssignWhileClosing"; try { HTableDescriptor desc = new HTableDescriptor(TableName.valueOf(table)); desc.addFamily(new HColumnDescriptor(FAMILY)); @@ -664,14 +659,6 @@ public class TestAssignmentManagerOnCluster { MyRegionObserver.postCloseEnabled.set(true); am.unassign(hri); - // Now region should pending_close or closing - // Unassign it again so that we can trigger already - // in transition exception. This test is to make sure this scenario - // is handled properly. - am.server.getConfiguration().setLong( - AssignmentManager.ALREADY_IN_TRANSITION_WAITTIME, 1000); - am.getRegionStates().updateRegionState(hri, RegionState.State.FAILED_CLOSE); - am.unassign(hri); // Let region closing move ahead. The region should be closed // properly and re-assigned automatically @@ -727,7 +714,7 @@ public class TestAssignmentManagerOnCluster { am.unassign(hri); RegionState state = am.getRegionStates().getRegionState(hri); ServerName oldServerName = state.getServerName(); - assertTrue(state.isPendingOpenOrOpening() && oldServerName != null); + assertTrue(state.isOpening() && oldServerName != null); // Now the region is stuck in opening // Let's forcefully re-assign it to trigger closing/opening @@ -816,6 +803,7 @@ public class TestAssignmentManagerOnCluster { // You can't unassign a dead region before SSH either am.unassign(hri); + state = regionStates.getRegionState(hri); assertTrue(state.isFailedClose()); // Enable SSH so that log can be split http://git-wip-us.apache.org/repos/asf/hbase/blob/3a82cf23/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerNoMaster.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerNoMaster.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerNoMaster.java index ed5dfbf..e494819 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerNoMaster.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionServerNoMaster.java @@ -26,7 +26,6 @@ import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.HRegionInfo; import org.apache.hadoop.hbase.HTableDescriptor; import org.apache.hadoop.hbase.MediumTests; -import org.apache.hadoop.hbase.MetaTableAccessor; import org.apache.hadoop.hbase.NotServingRegionException; import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.client.HTable; @@ -247,29 +246,6 @@ public class TestRegionServerNoMaster { checkRegionIsOpened(HTU, getRS(), hri); } - @Test - public void testOpenClosingRegion() throws Exception { - Assert.assertTrue(getRS().getRegion(regionName).isAvailable()); - - try { - // we re-opened meta so some of its data is lost - ServerName sn = getRS().getServerName(); - MetaTableAccessor.updateRegionLocation(getRS().getShortCircuitConnection(), - hri, sn, getRS().getRegion(regionName).getOpenSeqNum()); - // fake region to be closing now, need to clear state afterwards - getRS().regionsInTransitionInRS.put(hri.getEncodedNameAsBytes(), Boolean.FALSE); - AdminProtos.OpenRegionRequest orr = - RequestConverter.buildOpenRegionRequest(sn, hri, null, null); - getRS().rpcServices.openRegion(null, orr); - Assert.fail("The closing region should not be opened"); - } catch (ServiceException se) { - Assert.assertTrue("The region should be already in transition", - se.getCause() instanceof RegionAlreadyInTransitionException); - } finally { - getRS().regionsInTransitionInRS.remove(hri.getEncodedNameAsBytes()); - } - } - @Test(timeout = 60000) public void testMultipleCloseFromMaster() throws Exception { for (int i = 0; i < 10; i++) { @@ -277,11 +253,10 @@ public class TestRegionServerNoMaster { RequestConverter.buildCloseRegionRequest(getRS().getServerName(), regionName, null); try { AdminProtos.CloseRegionResponse responseClose = getRS().rpcServices.closeRegion(null, crr); - Assert.assertEquals("The first request should succeeds", 0, i); Assert.assertTrue("request " + i + " failed", responseClose.getClosed() || responseClose.hasClosed()); } catch (ServiceException se) { - Assert.assertTrue("The next queries should throw an exception.", i > 0); + Assert.assertTrue("The next queries may throw an exception.", i > 0); } }
