Author: stack
Date: Tue Oct 26 16:52:31 2010
New Revision: 1027646
URL: http://svn.apache.org/viewvc?rev=1027646&view=rev
Log:
HBASE-3147 Regions stuck in transition after rolling restart, perpetual timeout
handling but nothing happens
Modified:
hbase/trunk/CHANGES.txt
hbase/trunk/src/main/java/org/apache/hadoop/hbase/catalog/CatalogTracker.java
hbase/trunk/src/main/java/org/apache/hadoop/hbase/catalog/MetaReader.java
hbase/trunk/src/main/java/org/apache/hadoop/hbase/executor/EventHandler.java
hbase/trunk/src/main/java/org/apache/hadoop/hbase/executor/ExecutorService.java
hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java
hbase/trunk/src/main/java/org/apache/hadoop/hbase/zookeeper/MetaNodeTracker.java
hbase/trunk/src/main/java/org/apache/hadoop/hbase/zookeeper/ZKAssign.java
hbase/trunk/src/test/java/org/apache/hadoop/hbase/catalog/TestCatalogTracker.java
hbase/trunk/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailover.java
Modified: hbase/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/hbase/trunk/CHANGES.txt?rev=1027646&r1=1027645&r2=1027646&view=diff
==============================================================================
--- hbase/trunk/CHANGES.txt (original)
+++ hbase/trunk/CHANGES.txt Tue Oct 26 16:52:31 2010
@@ -613,6 +613,8 @@ Release 0.21.0 - Unreleased
HBASE-3136 Stale reads from ZK can break the atomic CAS operations we
have in ZKAssign
HBASE-2753 Remove sorted() methods from Result now that Gets are Scans
+ HBASE-3147 Regions stuck in transition after rolling restart, perpetual
+ timeout handling but nothing happens
IMPROVEMENTS
HBASE-1760 Cleanup TODOs in HTable
Modified:
hbase/trunk/src/main/java/org/apache/hadoop/hbase/catalog/CatalogTracker.java
URL:
http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/catalog/CatalogTracker.java?rev=1027646&r1=1027645&r2=1027646&view=diff
==============================================================================
---
hbase/trunk/src/main/java/org/apache/hadoop/hbase/catalog/CatalogTracker.java
(original)
+++
hbase/trunk/src/main/java/org/apache/hadoop/hbase/catalog/CatalogTracker.java
Tue Oct 26 16:52:31 2010
@@ -29,19 +29,16 @@ import org.apache.commons.logging.LogFac
import org.apache.hadoop.hbase.Abortable;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.HServerAddress;
-import org.apache.hadoop.hbase.HServerInfo;
import org.apache.hadoop.hbase.NotAllMetaRegionsOnlineException;
import org.apache.hadoop.hbase.NotServingRegionException;
import org.apache.hadoop.hbase.client.HConnection;
import org.apache.hadoop.hbase.client.RetriesExhaustedException;
import org.apache.hadoop.hbase.ipc.HRegionInterface;
import org.apache.hadoop.hbase.util.Bytes;
-import org.apache.hadoop.hbase.util.Pair;
import org.apache.hadoop.hbase.zookeeper.MetaNodeTracker;
import org.apache.hadoop.hbase.zookeeper.RootRegionTracker;
import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
import org.apache.hadoop.ipc.RemoteException;
-import org.apache.zookeeper.KeeperException;
/**
* Tracks the availability of the catalog tables <code>-ROOT-</code> and
@@ -63,6 +60,12 @@ public class CatalogTracker {
private final RootRegionTracker rootRegionTracker;
private final MetaNodeTracker metaNodeTracker;
private final AtomicBoolean metaAvailable = new AtomicBoolean(false);
+ /**
+ * Do not clear this address once set. Let it be cleared by
+ * {@link #setMetaLocation(HServerAddress)} only. Its needed when we do
+ * server shutdown processing -- we need to know who had .META. last. If you
+ * want to know if the address is good, rely on {@link #metaAvailable} value.
+ */
private HServerAddress metaLocation;
private final int defaultTimeout;
private boolean stopped = false;
@@ -365,7 +368,6 @@ public class CatalogTracker {
private void resetMetaLocation() {
LOG.info("Current cached META location is not valid, resetting");
this.metaAvailable.set(false);
- this.metaLocation = null;
}
private void setMetaLocation(HServerAddress metaLocation) {
@@ -471,37 +473,6 @@ public class CatalogTracker {
return getMetaServerConnection(true) != null;
}
- /**
- * Check if <code>hsi</code> was carrying <code>-ROOT-</code> or
- * <code>.META.</code> and if so, clear out old locations.
- * @param hsi Server that has crashed/shutdown.
- * @throws InterruptedException
- * @throws KeeperException
- * @return Pair of booleans; if this server was carrying root, then first
- * boolean is set, if server was carrying meta, then second boolean set.
- */
- public Pair<Boolean, Boolean> processServerShutdown(final HServerInfo hsi)
- throws InterruptedException, KeeperException {
- Pair<Boolean, Boolean> result = new Pair<Boolean, Boolean>(false, false);
- HServerAddress rootHsa = getRootLocation();
- if (rootHsa == null) {
- LOG.info("-ROOT- is not assigned; continuing");
- } else if (hsi.getServerAddress().equals(rootHsa)) {
- result.setFirst(true);
- LOG.info(hsi.getServerName() + " carrying -ROOT-; unsetting");
- }
- HServerAddress metaHsa = getMetaLocation();
- if (metaHsa == null) {
- LOG.info(".META. is not assigned; continuing");
- } else if (hsi.getServerAddress().equals(metaHsa)) {
- LOG.info(hsi.getServerName() + " carrying .META.; unsetting " +
- ".META. location");
- result.setSecond(true);
- resetMetaLocation();
- }
- return result;
- }
-
MetaNodeTracker getMetaNodeTracker() {
return this.metaNodeTracker;
}
Modified:
hbase/trunk/src/main/java/org/apache/hadoop/hbase/catalog/MetaReader.java
URL:
http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/catalog/MetaReader.java?rev=1027646&r1=1027645&r2=1027646&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/catalog/MetaReader.java
(original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/catalog/MetaReader.java
Tue Oct 26 16:52:31 2010
@@ -246,9 +246,12 @@ public class MetaReader {
throw e;
}
} catch (RemoteException re) {
- if (re.unwrapRemoteException() instanceof NotServingRegionException) {
+ IOException ioe = re.unwrapRemoteException();
+ if (ioe instanceof NotServingRegionException) {
// Treat this NSRE as unavailable table. Catch and fall through to
// return null below
+ } else if (ioe.getMessage().contains("Server not running")) {
+ // Treat as unavailable table.
} else {
throw re;
}
Modified:
hbase/trunk/src/main/java/org/apache/hadoop/hbase/executor/EventHandler.java
URL:
http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/executor/EventHandler.java?rev=1027646&r1=1027645&r2=1027646&view=diff
==============================================================================
---
hbase/trunk/src/main/java/org/apache/hadoop/hbase/executor/EventHandler.java
(original)
+++
hbase/trunk/src/main/java/org/apache/hadoop/hbase/executor/EventHandler.java
Tue Oct 26 16:52:31 2010
@@ -127,7 +127,8 @@ public abstract class EventHandler imple
M_ZK_REGION_OFFLINE (50), // Master adds this region as offline in
ZK
// Master controlled events to be executed on the master
- M_SERVER_SHUTDOWN (70); // Master is processing shutdown of a RS
+ M_SERVER_SHUTDOWN (70), // Master is processing shutdown of a RS
+ M_META_SERVER_SHUTDOWN (72); // Master is processing shutdown of RS
hosting a meta region (-ROOT- or .META.).
/**
* Constructor
Modified:
hbase/trunk/src/main/java/org/apache/hadoop/hbase/executor/ExecutorService.java
URL:
http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/executor/ExecutorService.java?rev=1027646&r1=1027645&r2=1027646&view=diff
==============================================================================
---
hbase/trunk/src/main/java/org/apache/hadoop/hbase/executor/ExecutorService.java
(original)
+++
hbase/trunk/src/main/java/org/apache/hadoop/hbase/executor/ExecutorService.java
Tue Oct 26 16:52:31 2010
@@ -27,7 +27,6 @@ import java.util.concurrent.ConcurrentHa
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
-import java.util.concurrent.atomic.AtomicInteger;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
@@ -77,6 +76,7 @@ public class ExecutorService {
MASTER_SERVER_OPERATIONS (3),
MASTER_TABLE_OPERATIONS (4),
MASTER_RS_SHUTDOWN (5),
+ MASTER_META_SERVER_OPERATIONS (6),
// RegionServer executor services
RS_OPEN_REGION (20),
@@ -115,6 +115,9 @@ public class ExecutorService {
case M_SERVER_SHUTDOWN:
return ExecutorType.MASTER_SERVER_OPERATIONS;
+ case M_META_SERVER_SHUTDOWN:
+ return ExecutorType.MASTER_META_SERVER_OPERATIONS;
+
case C_M_DELETE_TABLE:
case C_M_DISABLE_TABLE:
case C_M_ENABLE_TABLE:
Modified:
hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
URL:
http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java?rev=1027646&r1=1027645&r2=1027646&view=diff
==============================================================================
---
hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
(original)
+++
hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
Tue Oct 26 16:52:31 2010
@@ -46,6 +46,7 @@ import org.apache.hadoop.hbase.Chore;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.HServerInfo;
import org.apache.hadoop.hbase.HTableDescriptor;
+import org.apache.hadoop.hbase.NotServingRegionException;
import org.apache.hadoop.hbase.Server;
import org.apache.hadoop.hbase.Stoppable;
import org.apache.hadoop.hbase.catalog.CatalogTracker;
@@ -96,13 +97,13 @@ public class AssignmentManager extends Z
private TimeoutMonitor timeoutMonitor;
/** Regions currently in transition. */
- private final ConcurrentSkipListMap<String, RegionState> regionsInTransition
=
+ final ConcurrentSkipListMap<String, RegionState> regionsInTransition =
new ConcurrentSkipListMap<String, RegionState>();
/** Plans for region movement. Key is the encoded version of a region name*/
// TODO: When do plans get cleaned out? Ever? In server open and in server
// shutdown processing -- St.Ack
- protected final ConcurrentNavigableMap<String, RegionPlan> regionPlans =
+ final ConcurrentNavigableMap<String, RegionPlan> regionPlans =
new ConcurrentSkipListMap<String, RegionPlan>();
/** Set of tables that have been disabled. */
@@ -315,7 +316,7 @@ public class AssignmentManager extends Z
if (!serverManager.isServerOnline(data.getServerName()) &&
!this.master.getServerName().equals(data.getServerName())) {
LOG.warn("Attempted to handle region transition for server but " +
- "server is not online: " + data);
+ "server is not online: " + data.getRegionName());
return;
}
String encodedName = HRegionInfo.encodeRegionName(data.getRegionName());
@@ -597,9 +598,8 @@ public class AssignmentManager extends Z
ZKAssign.deleteOfflineNode(watcher, regionInfo.getEncodedName());
}
} catch (KeeperException.NoNodeException nne) {
- LOG.warn("Tried to delete closed node for " + regionInfo + " but it " +
- "does not exist");
- return;
+ LOG.debug("Tried to delete closed node for " + regionInfo + " but it " +
+ "does not exist so just offlining");
} catch (KeeperException e) {
this.master.abort("Error deleting CLOSED node in ZK", e);
}
@@ -976,15 +976,29 @@ public class AssignmentManager extends Z
}
// Send CLOSE RPC
try {
- serverManager.sendRegionClose(regions.get(region), state.getRegion());
+ if(!serverManager.sendRegionClose(regions.get(region),
+ state.getRegion())) {
+ throw new NotServingRegionException("Server failed to close region");
+ }
+ } catch (NotServingRegionException nsre) {
+ // Did not CLOSE, so set region offline and assign it
+ LOG.debug("Attempted to send CLOSE for region " +
+ region.getRegionNameAsString() + " but failed, setting region as " +
+ "OFFLINE and reassigning");
+ synchronized (regionsInTransition) {
+ forceRegionStateToOffline(region);
+ assign(region);
+ }
} catch (IOException e) {
// For now call abort if unexpected exception -- radical, but will get
fellas attention.
// St.Ack 20101012
+ // I don't think IOE can happen anymore, only NSRE IOE is used here
+ // should be able to remove this at least. jgray 20101024
this.master.abort("Remote unexpected exception", e);
} catch (Throwable t) {
// For now call abort if unexpected exception -- radical, but will get
fellas attention.
// St.Ack 20101012
- this.master.abort("Unexpected exception", t);
+ this.master.abort("Remote unexpected exception", t);
}
}
@@ -1435,14 +1449,40 @@ public class AssignmentManager extends Z
assign(regionState.getRegion());
break;
case PENDING_OPEN:
+ LOG.info("Region has been PENDING_OPEN for too " +
+ "long, reassigning region=" +
+ regionInfo.getRegionNameAsString());
+ // Should have a ZK node in OFFLINE state or no node at all
+ try {
+ if (ZKUtil.watchAndCheckExists(watcher,
+ ZKAssign.getNodeName(watcher,
+ regionInfo.getEncodedName())) &&
+ !ZKAssign.verifyRegionState(watcher, regionInfo,
+ EventType.M_ZK_REGION_OFFLINE)) {
+ LOG.info("Region exists and not in expected OFFLINE " +
+ "state so skipping timeout, region=" +
+ regionInfo.getRegionNameAsString());
+ break;
+ }
+ } catch (KeeperException ke) {
+ LOG.error("Unexpected ZK exception timing out " +
+ "PENDING_CLOSE region",
+ ke);
+ break;
+ }
+ AssignmentManager.this.setOffline(regionState.getRegion());
+ regionState.update(RegionState.State.OFFLINE);
+ assign(regionState.getRegion());
+ break;
case OPENING:
- LOG.info("Region has been PENDING_OPEN or OPENING for too " +
+ LOG.info("Region has been OPENING for too " +
"long, reassigning region=" +
regionInfo.getRegionNameAsString());
- // There could be two cases. No ZK node or ZK in CLOSING.
+ // Should have a ZK node in OPENING state
try {
- if (ZKUtil.checkExists(watcher, watcher.assignmentZNode)
- != -1 &&
+ if (ZKUtil.watchAndCheckExists(watcher,
+ ZKAssign.getNodeName(watcher,
+ regionInfo.getEncodedName())) &&
ZKAssign.transitionNode(watcher, regionInfo,
HMaster.MASTER, EventType.RS_ZK_REGION_OPENING,
EventType.M_ZK_REGION_OFFLINE, -1) == -1) {
@@ -1465,8 +1505,27 @@ public class AssignmentManager extends Z
"not happen; region=" + regionInfo.getRegionNameAsString());
break;
case PENDING_CLOSE:
+ LOG.info("Region has been PENDING_CLOSE for too " +
+ "long, running forced unassign again on region=" +
+ regionInfo.getRegionNameAsString());
+ try {
+ // If the server got the RPC, it will transition the node
+ // to CLOSING, so only do something here if no node exists
+ if (!ZKUtil.watchAndCheckExists(watcher,
+ ZKAssign.getNodeName(watcher,
+ regionInfo.getEncodedName()))) {
+ unassign(regionInfo, true);
+ }
+ } catch (NoNodeException e) {
+ LOG.debug("Node no longer existed so not forcing another "
+
+ "unassignment");
+ } catch (KeeperException e) {
+ LOG.warn("Unexpected ZK exception timing out a region " +
+ "close", e);
+ }
+ break;
case CLOSING:
- LOG.info("Region has been PENDING_CLOSE or CLOSING for too " +
+ LOG.info("Region has been CLOSING for too " +
"long, running forced unassign again on region=" +
regionInfo.getRegionNameAsString());
try {
@@ -1500,6 +1559,7 @@ public class AssignmentManager extends Z
Map.Entry<String, RegionPlan> e = i.next();
if (e.getValue().getDestination().equals(hsi)) {
// Use iterator's remove else we'll get CME
+ LOG.info("REMOVING PLAN " + e.getValue());
i.remove();
}
}
Modified: hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
URL:
http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/HMaster.java?rev=1027646&r1=1027645&r2=1027646&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
(original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/HMaster.java Tue
Oct 26 16:52:31 2010
@@ -413,7 +413,8 @@ implements HMasterInterface, HMasterRegi
this.catalogTracker.waitForRoot();
assigned++;
}
- LOG.info("-ROOT- assigned=" + assigned + ", rit=" + rit);
+ LOG.info("-ROOT- assigned=" + assigned + ", rit=" + rit +
+ ", location=" + catalogTracker.getRootLocation());
// Work on meta region
rit = this.assignmentManager.
@@ -426,7 +427,8 @@ implements HMasterInterface, HMasterRegi
this.assignmentManager.waitForAssignment(HRegionInfo.FIRST_META_REGIONINFO);
assigned++;
}
- LOG.info(".META. assigned=" + assigned + ", rit=" + rit);
+ LOG.info(".META. assigned=" + assigned + ", rit=" + rit +
+ ", location=" + catalogTracker.getMetaLocation());
return assigned;
}
@@ -502,6 +504,8 @@ implements HMasterInterface, HMasterRegi
conf.getInt("hbase.master.executor.closeregion.threads", 5));
this.executorService.startExecutorService(ExecutorType.MASTER_SERVER_OPERATIONS,
conf.getInt("hbase.master.executor.serverops.threads", 3));
+
this.executorService.startExecutorService(ExecutorType.MASTER_META_SERVER_OPERATIONS,
+ conf.getInt("hbase.master.executor.serverops.threads", 2));
this.executorService.startExecutorService(ExecutorType.MASTER_TABLE_OPERATIONS,
conf.getInt("hbase.master.executor.tableops.threads", 3));
Modified:
hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
URL:
http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java?rev=1027646&r1=1027645&r2=1027646&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
(original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
Tue Oct 26 16:52:31 2010
@@ -41,15 +41,19 @@ import org.apache.hadoop.hbase.PleaseHol
import org.apache.hadoop.hbase.Server;
import org.apache.hadoop.hbase.Stoppable;
import org.apache.hadoop.hbase.YouAreDeadException;
+import org.apache.hadoop.hbase.catalog.CatalogTracker;
import org.apache.hadoop.hbase.client.HConnection;
import org.apache.hadoop.hbase.client.HConnectionManager;
import org.apache.hadoop.hbase.client.RetriesExhaustedException;
import org.apache.hadoop.hbase.ipc.HRegionInterface;
+import org.apache.hadoop.hbase.master.handler.MetaServerShutdownHandler;
import org.apache.hadoop.hbase.master.handler.ServerShutdownHandler;
import org.apache.hadoop.hbase.master.metrics.MasterMetrics;
import org.apache.hadoop.hbase.regionserver.Leases.LeaseStillHeldException;
+import org.apache.hadoop.hbase.util.Pair;
import org.apache.hadoop.hbase.util.Threads;
import org.apache.hadoop.util.StringUtils;
+import org.apache.zookeeper.KeeperException;
/**
* The ServerManager class manages info about region servers - HServerInfo,
@@ -490,10 +494,36 @@ public class ServerManager {
}
return;
}
- this.services.getExecutorService().submit(new
ServerShutdownHandler(this.master,
+ CatalogTracker ct = this.master.getCatalogTracker();
+ // Was this server carrying root?
+ boolean carryingRoot;
+ try {
+ HServerAddress address = ct.getRootLocation();
+ carryingRoot = address != null &&
+ hsi.getServerAddress().equals(address);
+ } catch (InterruptedException e) {
+ Thread.currentThread().interrupt();
+ LOG.info("Interrupted");
+ return;
+ }
+ // Was this server carrying meta? Can't ask CatalogTracker because it
+ // may have reset the meta location as null already (it may have already
+ // run into fact that meta is dead). I can ask assignment manager. It
+ // has an inmemory list of who has what. This list will be cleared as we
+ // process the dead server but should be find asking it now.
+ HServerAddress address = ct.getMetaLocation();
+ boolean carryingMeta =
+ address != null && hsi.getServerAddress().equals(address);
+ if (carryingRoot || carryingMeta) {
+ this.services.getExecutorService().submit(new
MetaServerShutdownHandler(this.master,
+ this.services, this.deadservers, info, carryingRoot, carryingMeta));
+ } else {
+ this.services.getExecutorService().submit(new
ServerShutdownHandler(this.master,
this.services, this.deadservers, info));
+ }
LOG.debug("Added=" + serverName +
- " to dead servers, submitted shutdown handler to be executed");
+ " to dead servers, submitted shutdown handler to be executed, root=" +
+ carryingRoot + ", meta=" + carryingMeta);
}
// RPC methods to region servers
@@ -546,16 +576,17 @@ public class ServerManager {
* @return true if server acknowledged close, false if not
* @throws IOException
*/
- public void sendRegionClose(HServerInfo server, HRegionInfo region)
+ public boolean sendRegionClose(HServerInfo server, HRegionInfo region)
throws IOException {
+ if (server == null) return false;
HRegionInterface hri = getServerConnection(server);
if(hri == null) {
LOG.warn("Attempting to send CLOSE RPC to server " +
server.getServerName() + " failed because no RPC connection found " +
"to this server");
- return;
+ return false;
}
- hri.closeRegion(region);
+ return hri.closeRegion(region);
}
/**
Modified:
hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java
URL:
http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java?rev=1027646&r1=1027645&r2=1027646&view=diff
==============================================================================
---
hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java
(original)
+++
hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java
Tue Oct 26 16:52:31 2010
@@ -57,7 +57,12 @@ public class ServerShutdownHandler exten
public ServerShutdownHandler(final Server server, final MasterServices
services,
final DeadServer deadServers, final HServerInfo hsi) {
- super(server, EventType.M_SERVER_SHUTDOWN);
+ this(server, services, deadServers, hsi, EventType.M_SERVER_SHUTDOWN);
+ }
+
+ ServerShutdownHandler(final Server server, final MasterServices services,
+ final DeadServer deadServers, final HServerInfo hsi, EventType type) {
+ super(server, type);
this.hsi = hsi;
this.server = server;
this.services = services;
@@ -67,19 +72,22 @@ public class ServerShutdownHandler exten
}
}
+ /**
+ * @return True if the server we are processing was carrying
<code>-ROOT-</code>
+ */
+ boolean isCarryingRoot() {
+ return false;
+ }
+
+ /**
+ * @return True if the server we are processing was carrying
<code>.META.</code>
+ */
+ boolean isCarryingMeta() {
+ return false;
+ }
+
@Override
public void process() throws IOException {
- Pair<Boolean, Boolean> carryingCatalog = null;
- try {
- carryingCatalog =
- this.server.getCatalogTracker().processServerShutdown(this.hsi);
- } catch (InterruptedException e) {
- Thread.currentThread().interrupt();
- throw new IOException("Interrupted", e);
- } catch (KeeperException e) {
- this.server.abort("In server shutdown processing", e);
- throw new IOException("Aborting", e);
- }
final String serverName = this.hsi.getServerName();
LOG.info("Splitting logs for " + serverName);
@@ -92,7 +100,7 @@ public class ServerShutdownHandler exten
this.services.getAssignmentManager().processServerShutdown(this.hsi);
// Assign root and meta if we were carrying them.
- if (carryingCatalog.getFirst()) { // -ROOT-
+ if (isCarryingRoot()) { // -ROOT-
try {
this.services.getAssignmentManager().assignRoot();
} catch (KeeperException e) {
@@ -100,9 +108,9 @@ public class ServerShutdownHandler exten
throw new IOException("Aborting", e);
}
}
- if (carryingCatalog.getSecond()) { // .META.
- this.services.getAssignmentManager().assignMeta();
- }
+
+ // Carrying meta?
+ if (isCarryingMeta()) this.services.getAssignmentManager().assignMeta();
// Wait on meta to come online; we need it to progress.
try {
Modified:
hbase/trunk/src/main/java/org/apache/hadoop/hbase/zookeeper/MetaNodeTracker.java
URL:
http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/zookeeper/MetaNodeTracker.java?rev=1027646&r1=1027645&r2=1027646&view=diff
==============================================================================
---
hbase/trunk/src/main/java/org/apache/hadoop/hbase/zookeeper/MetaNodeTracker.java
(original)
+++
hbase/trunk/src/main/java/org/apache/hadoop/hbase/zookeeper/MetaNodeTracker.java
Tue Oct 26 16:52:31 2010
@@ -57,6 +57,7 @@ public class MetaNodeTracker extends Zoo
@Override
public void nodeDeleted(String path) {
+ super.nodeDeleted(path);
if (!path.equals(node)) return;
LOG.info("Detected completed assignment of META, notifying catalog
tracker");
try {
Modified:
hbase/trunk/src/main/java/org/apache/hadoop/hbase/zookeeper/ZKAssign.java
URL:
http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/zookeeper/ZKAssign.java?rev=1027646&r1=1027645&r2=1027646&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/zookeeper/ZKAssign.java
(original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/zookeeper/ZKAssign.java
Tue Oct 26 16:52:31 2010
@@ -97,7 +97,7 @@ public class ZKAssign {
* @param regionName region name
* @return full path node name
*/
- private static String getNodeName(ZooKeeperWatcher zkw, String regionName) {
+ public static String getNodeName(ZooKeeperWatcher zkw, String regionName) {
return ZKUtil.joinZNode(zkw.assignmentZNode, regionName);
}
@@ -762,4 +762,44 @@ public class ZKAssign {
Thread.sleep(200);
}
}
+
+ /**
+ * Verifies that the specified region is in the specified state in ZooKeeper.
+ * <p>
+ * Returns true if region is in transition and in the specified state in
+ * ZooKeeper. Returns false if the region does not exist in ZK or is in
+ * a different state.
+ * <p>
+ * Method synchronizes() with ZK so will yield an up-to-date result but is
+ * a slow read.
+ * @param watcher
+ * @param region
+ * @param expectedState
+ * @return true if region exists and is in expected state
+ */
+ public static boolean verifyRegionState(ZooKeeperWatcher zkw,
+ HRegionInfo region, EventType expectedState)
+ throws KeeperException {
+ String encoded = region.getEncodedName();
+
+ String node = getNodeName(zkw, encoded);
+ zkw.sync(node);
+
+ // Read existing data of the node
+ byte [] existingBytes = null;
+ try {
+ existingBytes = ZKUtil.getDataAndWatch(zkw, node);
+ } catch (KeeperException.NoNodeException nne) {
+ return false;
+ } catch (KeeperException e) {
+ throw e;
+ }
+ if (existingBytes == null) return false;
+ RegionTransitionData existingData =
+ RegionTransitionData.fromBytes(existingBytes);
+ if (existingData.getEventType() == expectedState){
+ return true;
+ }
+ return false;
+ }
}
Modified:
hbase/trunk/src/test/java/org/apache/hadoop/hbase/catalog/TestCatalogTracker.java
URL:
http://svn.apache.org/viewvc/hbase/trunk/src/test/java/org/apache/hadoop/hbase/catalog/TestCatalogTracker.java?rev=1027646&r1=1027645&r2=1027646&view=diff
==============================================================================
---
hbase/trunk/src/test/java/org/apache/hadoop/hbase/catalog/TestCatalogTracker.java
(original)
+++
hbase/trunk/src/test/java/org/apache/hadoop/hbase/catalog/TestCatalogTracker.java
Tue Oct 26 16:52:31 2010
@@ -35,7 +35,6 @@ import org.apache.hadoop.hbase.Abortable
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HRegionInfo;
-import org.apache.hadoop.hbase.HRegionLocation;
import org.apache.hadoop.hbase.HServerAddress;
import org.apache.hadoop.hbase.HServerInfo;
import org.apache.hadoop.hbase.KeyValue;
@@ -105,6 +104,26 @@ public class TestCatalogTracker {
}
/**
+ * Test that we get notification if .META. moves.
+ * @throws IOException
+ * @throws InterruptedException
+ * @throws KeeperException
+ */
+ @Test public void testThatIfMETAMovesWeAreNotified()
+ throws IOException, InterruptedException, KeeperException {
+ HConnection connection = Mockito.mock(HConnection.class);
+ final CatalogTracker ct = constructAndStartCatalogTracker(connection);
+ try {
+ RootLocationEditor.setRootLocation(this.watcher,
+ new HServerAddress("example.com:1234"));
+ } finally {
+ // Clean out root location or later tests will be confused... they
presume
+ // start fresh in zk.
+ RootLocationEditor.deleteRootLocation(this.watcher);
+ }
+ }
+
+ /**
* Test interruptable while blocking wait on root and meta.
* @throws IOException
* @throws InterruptedException
Modified:
hbase/trunk/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailover.java
URL:
http://svn.apache.org/viewvc/hbase/trunk/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailover.java?rev=1027646&r1=1027645&r2=1027646&view=diff
==============================================================================
---
hbase/trunk/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailover.java
(original)
+++
hbase/trunk/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailover.java
Tue Oct 26 16:52:31 2010
@@ -41,6 +41,7 @@ import org.apache.hadoop.hbase.HTableDes
import org.apache.hadoop.hbase.MiniHBaseCluster;
import org.apache.hadoop.hbase.executor.RegionTransitionData;
import org.apache.hadoop.hbase.executor.EventHandler.EventType;
+import org.apache.hadoop.hbase.master.AssignmentManager.RegionState;
import org.apache.hadoop.hbase.master.LoadBalancer.RegionPlan;
import org.apache.hadoop.hbase.regionserver.HRegionServer;
import org.apache.hadoop.hbase.util.Bytes;
@@ -794,10 +795,35 @@ public class TestMasterFailover {
cluster.waitForActiveAndReadyMaster();
log("Master is ready");
+ // Let's add some weird states to master in-memory state
+
+ // PENDING_OPEN and enabled
+ region = enabledRegions.remove(0);
+ regionsThatShouldBeOnline.add(region);
+ master.assignmentManager.regionsInTransition.put(region.getEncodedName(),
+ new RegionState(region, RegionState.State.PENDING_OPEN));
+ // PENDING_OPEN and disabled
+ region = disabledRegions.remove(0);
+ regionsThatShouldBeOffline.add(region);
+ master.assignmentManager.regionsInTransition.put(region.getEncodedName(),
+ new RegionState(region, RegionState.State.PENDING_OPEN));
+ // PENDING_CLOSE and enabled
+ region = enabledRegions.remove(0);
+ regionsThatShouldBeOnline.add(region);
+ master.assignmentManager.regionsInTransition.put(region.getEncodedName(),
+ new RegionState(region, RegionState.State.PENDING_CLOSE));
+ // PENDING_CLOSE and disabled
+ region = disabledRegions.remove(0);
+ regionsThatShouldBeOffline.add(region);
+ master.assignmentManager.regionsInTransition.put(region.getEncodedName(),
+ new RegionState(region, RegionState.State.PENDING_CLOSE));
+
// Failover should be completed, now wait for no RIT
log("Waiting for no more RIT");
ZKAssign.blockUntilNoRIT(zkw);
- log("No more RIT in ZK, now doing final test verification");
+ log("No more RIT in ZK");
+ master.assignmentManager.waitUntilNoRegionsInTransition(120000);
+ log("No more RIT in RIT map, doing final test verification");
// Grab all the regions that are online across RSs
Set<HRegionInfo> onlineRegions = new TreeSet<HRegionInfo>();