Author: stack
Date: Sat Sep 18 00:51:05 2010
New Revision: 998380
URL: http://svn.apache.org/viewvc?rev=998380&view=rev
Log:
HBASE-3010 Can't start/stop/start... cluster using new master
Removed:
hbase/trunk/src/test/java/org/apache/hadoop/hbase/master/TestMinimumServerCount.java
Modified:
hbase/trunk/CHANGES.txt
hbase/trunk/bin/stop-hbase.sh
hbase/trunk/src/main/java/org/apache/hadoop/hbase/catalog/CatalogTracker.java
hbase/trunk/src/main/java/org/apache/hadoop/hbase/ipc/HBaseServer.java
hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/ActiveMasterManager.java
hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/HMasterCommandLine.java
hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java
hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
hbase/trunk/src/test/java/org/apache/hadoop/hbase/master/TestActiveMasterManager.java
Modified: hbase/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/hbase/trunk/CHANGES.txt?rev=998380&r1=998379&r2=998380&view=diff
==============================================================================
--- hbase/trunk/CHANGES.txt (original)
+++ hbase/trunk/CHANGES.txt Sat Sep 18 00:51:05 2010
@@ -526,6 +526,7 @@ Release 0.21.0 - Unreleased
HBASE-3006 Reading compressed HFile blocks causes way too many DFS RPC
calls severly impacting performance
(Kannan Muthukkaruppan via Stack)
+ HBASE-3010 Can't start/stop/start... cluster using new master
IMPROVEMENTS
HBASE-1760 Cleanup TODOs in HTable
Modified: hbase/trunk/bin/stop-hbase.sh
URL:
http://svn.apache.org/viewvc/hbase/trunk/bin/stop-hbase.sh?rev=998380&r1=998379&r2=998380&view=diff
==============================================================================
--- hbase/trunk/bin/stop-hbase.sh (original)
+++ hbase/trunk/bin/stop-hbase.sh Sat Sep 18 00:51:05 2010
@@ -55,6 +55,8 @@ while kill -0 `cat $pid` > /dev/null 2>&
echo -n "."
sleep 1;
done
+# Add a CR after we're done w/ dots.
+echo
# distributed == false means that the HMaster will kill ZK when it exits
distMode=`$bin/hbase org.apache.hadoop.hbase.util.HBaseConfTool
hbase.cluster.distributed`
Modified:
hbase/trunk/src/main/java/org/apache/hadoop/hbase/catalog/CatalogTracker.java
URL:
http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/catalog/CatalogTracker.java?rev=998380&r1=998379&r2=998380&view=diff
==============================================================================
---
hbase/trunk/src/main/java/org/apache/hadoop/hbase/catalog/CatalogTracker.java
(original)
+++
hbase/trunk/src/main/java/org/apache/hadoop/hbase/catalog/CatalogTracker.java
Sat Sep 18 00:51:05 2010
@@ -114,9 +114,6 @@ public class CatalogTracker {
public void start() throws IOException, InterruptedException {
this.rootRegionTracker.start();
this.metaNodeTracker.start();
- // Determine meta assignment; may not work because root and meta not yet
- // deployed. Calling the below will set {@link #metaLocation}.
- getMetaServerConnection(true);
}
/**
@@ -205,7 +202,7 @@ public class CatalogTracker {
*/
private HRegionInterface getRootServerConnection()
throws IOException, InterruptedException {
- HServerAddress address = rootRegionTracker.getRootRegionLocation();
+ HServerAddress address = this.rootRegionTracker.getRootRegionLocation();
if (address == null) {
return null;
}
Modified: hbase/trunk/src/main/java/org/apache/hadoop/hbase/ipc/HBaseServer.java
URL:
http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/ipc/HBaseServer.java?rev=998380&r1=998379&r2=998380&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/ipc/HBaseServer.java
(original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/ipc/HBaseServer.java Sat
Sep 18 00:51:05 2010
@@ -20,18 +20,6 @@
package org.apache.hadoop.hbase.ipc;
-import com.google.common.base.Function;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.hbase.util.Bytes;
-import org.apache.hadoop.io.ObjectWritable;
-import org.apache.hadoop.io.Writable;
-import org.apache.hadoop.io.WritableUtils;
-import org.apache.hadoop.security.UserGroupInformation;
-import org.apache.hadoop.util.ReflectionUtils;
-import org.apache.hadoop.util.StringUtils;
-
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
@@ -64,6 +52,18 @@ import java.util.concurrent.ExecutorServ
import java.util.concurrent.Executors;
import java.util.concurrent.LinkedBlockingQueue;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.ObjectWritable;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.WritableUtils;
+import org.apache.hadoop.security.UserGroupInformation;
+import org.apache.hadoop.util.ReflectionUtils;
+import org.apache.hadoop.util.StringUtils;
+
+import com.google.common.base.Function;
+
/** An abstract IPC service. IPC calls take a single {@link Writable} as a
* parameter, and return a {@link Writable} as their value. A service runs on
* a port and is defined by a parameter class and a value class.
@@ -151,7 +151,6 @@ public abstract class HBaseServer {
protected Configuration conf;
- @SuppressWarnings({"FieldCanBeLocal"})
private int maxQueueSize;
protected int socketSendBufferSize;
protected final boolean tcpNoDelay; // if T then disable Nagle's Algorithm
@@ -285,7 +284,6 @@ public abstract class HBaseServer {
this.readSelector = readSelector;
}
public void run() {
- LOG.info("Starting SocketReader");
synchronized(this) {
while (running) {
SelectionKey key = null;
Modified:
hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/ActiveMasterManager.java
URL:
http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/ActiveMasterManager.java?rev=998380&r1=998379&r2=998380&view=diff
==============================================================================
---
hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/ActiveMasterManager.java
(original)
+++
hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/ActiveMasterManager.java
Sat Sep 18 00:51:05 2010
@@ -95,7 +95,7 @@ class ActiveMasterManager extends ZooKee
clusterHasActiveMaster.set(true);
} else {
// Node is no longer there, cluster does not have an active master
- LOG.debug("No master available. notifying waiting threads");
+ LOG.debug("No master available. Notifying waiting threads");
clusterHasActiveMaster.set(false);
// Notify any thread waiting to become the active master
clusterHasActiveMaster.notifyAll();
@@ -114,46 +114,56 @@ class ActiveMasterManager extends ZooKee
*
* This also makes sure that we are watching the master znode so will be
* notified if another master dies.
- * @return False if we did not start up this cluster, another
- * master did, or if a problem (zookeeper, stop flag has been set on this
- * Master)
+ * @return True if no issue becoming active master else false if another
+ * master was running or if some other problem (zookeeper, stop flag has been
+ * set on this Master)
*/
boolean blockUntilBecomingActiveMaster() {
- boolean thisMasterStartedCluster = true;
+ boolean cleanSetOfActiveMaster = true;
// Try to become the active master, watch if there is another master
try {
- if(ZKUtil.setAddressAndWatch(watcher, watcher.masterAddressZNode,
- address)) {
+ if (ZKUtil.setAddressAndWatch(this.watcher,
+ this.watcher.masterAddressZNode, this.address)) {
// We are the master, return
- clusterHasActiveMaster.set(true);
- return thisMasterStartedCluster;
+ this.clusterHasActiveMaster.set(true);
+ return cleanSetOfActiveMaster;
+ }
+
+ // There is another active master running elsewhere or this is a restart
+ // and the master ephemeral node has not expired yet.
+ this.clusterHasActiveMaster.set(true);
+ cleanSetOfActiveMaster = false;
+ HServerAddress currentMaster =
+ ZKUtil.getDataAsAddress(this.watcher, this.watcher.masterAddressZNode);
+ if (currentMaster != null && currentMaster.equals(this.address)) {
+ LOG.info("Current master has this master's address, " + currentMaster +
+ "; master was restarted? Waiting on znode to expire...");
+ // Hurry along the expiration of the znode.
+ ZKUtil.deleteNode(this.watcher, this.watcher.masterAddressZNode);
+ } else {
+ LOG.info("Another master is the active master, " + currentMaster +
+ "; waiting to become the next active master");
}
} catch (KeeperException ke) {
master.abort("Received an unexpected KeeperException, aborting", ke);
return false;
}
- // There is another active master, this is not a cluster startup
- // and we must wait until the active master dies
- LOG.info("Another master is already the active master, waiting to become "
+
- "the next active master");
- clusterHasActiveMaster.set(true);
- thisMasterStartedCluster = false;
- synchronized(clusterHasActiveMaster) {
- while(clusterHasActiveMaster.get() && !master.isStopped()) {
+ synchronized (this.clusterHasActiveMaster) {
+ while (this.clusterHasActiveMaster.get() && !this.master.isStopped()) {
try {
- clusterHasActiveMaster.wait();
+ this.clusterHasActiveMaster.wait();
} catch (InterruptedException e) {
// We expect to be interrupted when a master dies, will fall out if so
LOG.debug("Interrupted waiting for master to die", e);
}
}
- if(master.isStopped()) {
- return thisMasterStartedCluster;
+ if (this.master.isStopped()) {
+ return cleanSetOfActiveMaster;
}
// Try to become active master again now that there is no active master
blockUntilBecomingActiveMaster();
}
- return thisMasterStartedCluster;
+ return cleanSetOfActiveMaster;
}
/**
Modified:
hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
URL:
http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java?rev=998380&r1=998379&r2=998380&view=diff
==============================================================================
---
hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
(original)
+++
hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
Sat Sep 18 00:51:05 2010
@@ -157,8 +157,8 @@ public class AssignmentManager extends Z
void cleanoutUnassigned() throws IOException, KeeperException {
// Cleanup any existing ZK nodes and start watching
ZKAssign.deleteAllNodes(watcher);
- ZKUtil.listChildrenAndWatchForNewChildren(watcher,
- watcher.assignmentZNode);
+ ZKUtil.listChildrenAndWatchForNewChildren(this.watcher,
+ this.watcher.assignmentZNode);
}
/**
@@ -545,7 +545,7 @@ public class AssignmentManager extends Z
if (plan == null) {
LOG.debug("No previous transition plan for " +
state.getRegion().getRegionNameAsString() +
- " so generating a random one from " + serverManager.numServers() +
+ " so generating a random one from " +
serverManager.countOfRegionServers() +
" ( " + serverManager.getOnlineServers().size() + ") available
servers");
plan = new RegionPlan(state.getRegion(), null,
LoadBalancer.randomAssignment(serverManager.getOnlineServersList()));
Modified: hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
URL:
http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/HMaster.java?rev=998380&r1=998379&r2=998380&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
(original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/HMaster.java Sat
Sep 18 00:51:05 2010
@@ -51,6 +51,7 @@ import org.apache.hadoop.hbase.UnknownRe
import org.apache.hadoop.hbase.catalog.CatalogTracker;
import org.apache.hadoop.hbase.catalog.MetaEditor;
import org.apache.hadoop.hbase.catalog.MetaReader;
+import org.apache.hadoop.hbase.catalog.RootLocationEditor;
import org.apache.hadoop.hbase.client.HConnection;
import org.apache.hadoop.hbase.client.HConnectionManager;
import org.apache.hadoop.hbase.client.MetaScanner;
@@ -145,12 +146,14 @@ implements HMasterInterface, HMasterRegi
// Cluster status zk tracker and local setter
private ClusterStatusTracker clusterStatusTracker;
- // True if this is the master that started the cluster.
- boolean clusterStarter;
-
- // This flag is for stopping this Master instance.
- private boolean stopped = false;
- // Set on abort -- usually failure of our zk session
+ // True if this is a cluster startup where there are no already running servers
+ // as opposed to a master joining an already running cluster
+ boolean freshClusterStartup;
+
+ // This flag is for stopping this Master instance. It's set when we are
+ // stopping or aborting
+ private volatile boolean stopped = false;
+ // Set on abort -- usually failure of our zk session.
private volatile boolean abort = false;
// Instance of the hbase executor service.
@@ -178,17 +181,17 @@ implements HMasterInterface, HMasterRegi
this.conf = conf;
/*
* 1. Determine address and initialize RPC server (but do not start).
- * The RPC server ports can be ephemeral.
+ * The RPC server ports can be ephemeral. Create a ZKW instance.
*/
HServerAddress a = new HServerAddress(getMyAddress(this.conf));
int numHandlers = conf.getInt("hbase.regionserver.handler.count", 10);
this.rpcServer = HBaseRPC.getServer(this,
- new Class<?>[]{HMasterInterface.class, HMasterRegionInterface.class},
- a.getBindAddress(), a.getPort(),
- numHandlers,
- 0, // we dont use high priority handlers in master
- false, conf,
- 0); // this is a DNC w/o high priority handlers
+ new Class<?>[]{HMasterInterface.class, HMasterRegionInterface.class},
+ a.getBindAddress(), a.getPort(),
+ numHandlers,
+ 0, // we dont use high priority handlers in master
+ false, conf,
+ 0); // this is a DNC w/o high priority handlers
this.address = new HServerAddress(rpcServer.getListenerAddress());
// set the thread name now we have an address
@@ -201,24 +204,11 @@ implements HMasterInterface, HMasterRegi
"_" + System.currentTimeMillis());
}
- /*
- * 2. Determine if this is a fresh cluster startup or failed over master.
- * This is done by checking for the existence of any ephemeral
- * RegionServer nodes in ZooKeeper. These nodes are created by RSs on
- * their initialization but only after they find the primary master. As
- * long as this check is done before we write our address into ZK, this
- * will work. Note that multiple masters could find this to be true on
- * startup (none have become active master yet), which is why there is an
- * additional check if this master does not become primary on its first
attempt.
- */
this.zooKeeper =
new ZooKeeperWatcher(conf, MASTER + "-" + getMasterAddress(), this);
- this.clusterStarter = 0 ==
- ZKUtil.getNumberOfChildren(zooKeeper, zooKeeper.rsZNode);
-
/*
- * 3. Block on becoming the active master.
+ * 2. Block on becoming the active master.
* We race with other masters to write our address into ZooKeeper. If we
* succeed, we are the primary/active master and finish initialization.
*
@@ -228,32 +218,25 @@ implements HMasterInterface, HMasterRegi
*/
this.activeMasterManager = new ActiveMasterManager(zooKeeper, address,
this);
this.zooKeeper.registerListener(activeMasterManager);
+ stallIfBackupMaster(this.conf, this.activeMasterManager);
+ activeMasterManager.blockUntilBecomingActiveMaster();
+ /*
+ * 3. Determine if this is a fresh cluster startup or failed over master.
+ * This is done by checking for the existence of any ephemeral
+ * RegionServer nodes in ZooKeeper. These nodes are created by RSs on
+ * their initialization but initialization will not happen unless clusterup
+ * flag is set -- see ClusterStatusTracker below.
+ */
+ this.freshClusterStartup =
+ 0 == ZKUtil.getNumberOfChildren(zooKeeper, zooKeeper.rsZNode);
- // If we're a backup master, stall until a primary to writes his address
- if (conf.getBoolean(HConstants.MASTER_TYPE_BACKUP,
- HConstants.DEFAULT_MASTER_TYPE_BACKUP)) {
- // This will only be a minute or so while the cluster starts up,
- // so don't worry about setting watches on the parent znode
- while (!this.activeMasterManager.isActiveMaster()) {
- try {
- LOG.debug("Waiting for master address ZNode to be written " +
- "(Also watching cluster state node)");
- Thread.sleep(conf.getInt("zookeeper.session.timeout", 60 * 1000));
- } catch (InterruptedException e) {
- // interrupted = user wants to kill us. Don't continue
- throw new IOException("Interrupted waiting for master address");
- }
- }
- }
-
- // Wait here until we are the active master
- clusterStarter = activeMasterManager.blockUntilBecomingActiveMaster();
-
- /**
+ /*
* 4. We are active master now... go initialize components we need to run.
+ * Note, there may be dross in zk from previous runs; it'll get addressed
+ * when we enter {@link #run()} below.
*/
- // TODO: Do this using Dependency Injection, using PicoContainer or Spring.
+ // TODO: Do this using Dependency Injection, using PicoContainer, Guice or Spring.
this.fileSystemManager = new MasterFileSystem(this);
this.connection = HConnectionManager.getConnection(conf);
this.executorService = new ExecutorService(getServerName());
@@ -270,18 +253,40 @@ implements HMasterInterface, HMasterRegi
this.regionServerTracker = new RegionServerTracker(zooKeeper, this,
this.serverManager);
- regionServerTracker.start();
+ this.regionServerTracker.start();
- // Set the cluster as up.
+ // Set the cluster as up. If new RSs, they'll be waiting on this before
+ // going ahead with their startup.
this.clusterStatusTracker = new ClusterStatusTracker(getZooKeeper(), this);
this.clusterStatusTracker.setClusterUp();
this.clusterStatusTracker.start();
LOG.info("Server active/primary master; " + this.address +
- "; clusterStarter=" + this.clusterStarter + ", sessionid=0x" +
+ "; freshClusterStart=" + this.freshClusterStartup + ", sessionid=0x" +
Long.toHexString(this.zooKeeper.getZooKeeper().getSessionId()));
}
+ /*
+ * Stall startup if we are designated a backup master.
+ * @param c
+ * @param amm
+ * @throws InterruptedException
+ */
+ private static void stallIfBackupMaster(final Configuration c,
+ final ActiveMasterManager amm)
+ throws InterruptedException {
+ // If we're a backup master, stall until a primary writes its address
+ if (!c.getBoolean(HConstants.MASTER_TYPE_BACKUP,
+ HConstants.DEFAULT_MASTER_TYPE_BACKUP)) return;
+ // This will only be a minute or so while the cluster starts up,
+ // so don't worry about setting watches on the parent znode
+ while (!amm.isActiveMaster()) {
+ LOG.debug("Waiting for master address ZNode to be written " +
+ "(Also watching cluster state node)");
+ Thread.sleep(c.getInt("zookeeper.session.timeout", 60 * 1000));
+ }
+ }
+
/**
* Main processing loop for the HMaster.
* 1. Handle both fresh cluster start as well as failed over initialization of
@@ -295,22 +300,24 @@ implements HMasterInterface, HMasterRegi
try {
// start up all service threads.
startServiceThreads();
- // wait for minimum number of region servers to be up
- this.serverManager.waitForMinServers();
- // start assignment of user regions, startup or failure
- if (this.clusterStarter) {
- clusterStarterInitializations(this.fileSystemManager,
+ // Wait for minimum number of region servers to report in
+ this.serverManager.waitForRegionServers();
+
+ // Start assignment of user regions, startup or failure
+ if (!this.stopped) {
+ if (this.freshClusterStartup) {
+ clusterStarterInitializations(this.fileSystemManager,
this.serverManager, this.catalogTracker, this.assignmentManager);
- } else {
- // Process existing unassigned nodes in ZK, read all regions from META,
- // rebuild in-memory state.
- this.assignmentManager.processFailover();
+ } else {
+ // Process existing unassigned nodes in ZK, read all regions from META,
+ // rebuild in-memory state.
+ this.assignmentManager.processFailover();
+ }
}
+
// Check if we should stop every second.
Sleeper sleeper = new Sleeper(1000, this);
- while (!this.stopped && !this.abort) {
- sleeper.sleep();
- }
+ while (!this.stopped) sleeper.sleep();
} catch (Throwable t) {
abort("Unhandled exception. Starting shutdown.", t);
}
@@ -341,22 +348,17 @@ implements HMasterInterface, HMasterRegi
/*
* Initializations we need to do if we are cluster starter.
- * @param starter
* @param mfs
+ * @param sm
+ * @param ct
+ * @param am
* @throws IOException
*/
private static void clusterStarterInitializations(final MasterFileSystem mfs,
final ServerManager sm, final CatalogTracker ct, final AssignmentManager
am)
throws IOException, InterruptedException, KeeperException {
- // This master is starting the cluster (its not a preexisting cluster
- // that this master is joining).
- // Initialize the filesystem, which does the following:
- // - Creates the root hbase directory in the FS if DNE
- // - If fresh start, create first ROOT and META regions (bootstrap)
- // - Checks the FS to make sure the root directory is readable
- // - Creates the archive directory for logs
+ // Check filesystem has required basics
mfs.initialize();
- // Do any log splitting necessary
// TODO: Should do this in background rather than block master startup
// TODO: Do we want to do this before/while/after RSs check in?
// It seems that this method looks at active RSs but happens
@@ -795,6 +797,7 @@ implements HMasterInterface, HMasterRegi
if (t != null) LOG.fatal(msg, t);
else LOG.fatal(msg);
this.abort = true;
+ stop("Aborting");
}
@Override
Modified:
hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/HMasterCommandLine.java
URL:
http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/HMasterCommandLine.java?rev=998380&r1=998379&r2=998380&view=diff
==============================================================================
---
hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/HMasterCommandLine.java
(original)
+++
hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/HMasterCommandLine.java
Sat Sep 18 00:51:05 2010
@@ -19,8 +19,8 @@
*/
package org.apache.hadoop.hbase.master;
-import java.io.IOException;
import java.io.File;
+import java.io.IOException;
import java.util.List;
import org.apache.commons.cli.CommandLine;
@@ -30,11 +30,10 @@ import org.apache.commons.cli.ParseExcep
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.hbase.ZooKeeperConnectionException;
-import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.LocalHBaseCluster;
import org.apache.hadoop.hbase.MasterNotRunningException;
+import org.apache.hadoop.hbase.ZooKeeperConnectionException;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.regionserver.HRegionServer;
import org.apache.hadoop.hbase.util.ServerCommandLine;
Modified:
hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java
URL:
http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java?rev=998380&r1=998379&r2=998380&view=diff
==============================================================================
---
hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java
(original)
+++
hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java
Sat Sep 18 00:51:05 2010
@@ -83,8 +83,11 @@ public class MasterFileSystem {
}
/**
+ * Create initial layout in filesystem.
* <ol>
- * <li>Check if the root region exists and is readable, if not create it</li>
+ * <li>Check if the root region exists and is readable, if not create it.
+ * Create hbase.version and the -ROOT- directory if not one.
+ * </li>
* <li>Create a log archive directory for RS to put archived logs</li>
* </ol>
*/
Modified:
hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
URL:
http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java?rev=998380&r1=998379&r2=998380&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
(original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
Sat Sep 18 00:51:05 2010
@@ -86,8 +86,6 @@ public class ServerManager {
private final ServerMonitor serverMonitorThread;
- private int minimumServerCount;
-
private final LogCleaner logCleaner;
// Reporting to track master metrics.
@@ -106,7 +104,7 @@ public class ServerManager {
@Override
protected void chore() {
- int numServers = numServers();
+ int numServers = countOfRegionServers();
int numDeadServers = deadservers.size();
double averageLoad = getAverageLoad();
String deadServersList = deadservers.toString();
@@ -127,7 +125,6 @@ public class ServerManager {
this.services = services;
Configuration c = master.getConfiguration();
int monitorInterval = c.getInt("hbase.master.monitor.interval", 60 * 1000);
- this.minimumServerCount = c.getInt("hbase.regions.server.count.min", 1);
this.metrics = new MasterMetrics(master.getServerName());
this.serverMonitorThread = new ServerMonitor(monitorInterval, master);
String n = Thread.currentThread().getName();
@@ -220,8 +217,8 @@ public class ServerManager {
info.setLoad(load);
// TODO: Why did we update the RS location ourself? Shouldn't RS do this?
// masterStatus.getZooKeeper().updateRSLocationGetWatch(info, watcher);
- onlineServers.put(serverName, info);
- if(hri == null) {
+ this.onlineServers.put(serverName, info);
+ if (hri == null) {
serverConnections.remove(serverName);
} else {
serverConnections.put(serverName, hri);
@@ -277,7 +274,7 @@ public class ServerManager {
}
HMsg [] reply = null;
- int numservers = numServers();
+ int numservers = countOfRegionServers();
if (this.clusterShutdown) {
if (numservers <= 2) {
// Shutdown needs to be staggered; the meta regions need to close last
@@ -362,14 +359,10 @@ public class ServerManager {
return averageLoad;
}
- /** @return the number of active servers */
- public int numServers() {
- int num = -1;
- // This synchronized seems gratuitous.
- synchronized (this.onlineServers) {
- num = this.onlineServers.size();
- }
- return num;
+ /** @return the count of active regionservers */
+ int countOfRegionServers() {
+ // Presumes onlineServers is a concurrent map
+ return this.onlineServers.size();
}
/**
@@ -476,17 +469,6 @@ public class ServerManager {
" to dead servers, submitted shutdown handler to be executed");
}
- public boolean canAssignUserRegions() {
- if (minimumServerCount == 0) {
- return true;
- }
- return (numServers() >= minimumServerCount);
- }
-
- public void setMinimumServerCount(int minimumServerCount) {
- this.minimumServerCount = minimumServerCount;
- }
-
// RPC methods to region servers
/**
@@ -546,18 +528,25 @@ public class ServerManager {
}
/**
- * Waits for the minimum number of servers to be running.
+ * Waits for the regionservers to report in.
+ * @throws InterruptedException
*/
- public void waitForMinServers() {
- while(numServers() < minimumServerCount) {
-// !masterStatus.getShutdownRequested().get()) {
- LOG.info("Waiting for enough servers to check in. Currently have " +
- numServers() + " but need at least " + minimumServerCount);
- try {
- Thread.sleep(1000);
- } catch (InterruptedException e) {
- LOG.warn("Got interrupted waiting for servers to check in, looping");
+ public void waitForRegionServers()
+ throws InterruptedException {
+ long interval = this.master.getConfiguration().
+ getLong("hbase.master.wait.on.regionservers.interval", 3000);
+ // If the count of regionservers is > 0 and has not changed since the last
+ // check (one interval ago), break; else just stall here
+ for (int oldcount = countOfRegionServers(); !this.master.isStopped();) {
+ Thread.sleep(interval);
+ int count = countOfRegionServers();
+ if (count == oldcount && count > 0) break;
+ if (count == 0) {
+ LOG.info("Waiting on regionserver(s) to checkin");
+ } else {
+ LOG.info("Waiting on regionserver(s) count to settle; currently=" +
count);
}
+ oldcount = count;
}
}
@@ -571,8 +560,8 @@ public class ServerManager {
}
public void shutdownCluster() {
- LOG.info("Cluster shutdown requested");
this.clusterShutdown = true;
+ this.master.stop("Cluster shutdown requested");
}
public boolean isClusterShutdown() {
Modified:
hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
URL:
http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java?rev=998380&r1=998379&r2=998380&view=diff
==============================================================================
---
hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
(original)
+++
hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
Sat Sep 18 00:51:05 2010
@@ -265,7 +265,7 @@ public class HRegionServer implements HR
* @throws InterruptedException
*/
public HRegionServer(Configuration conf) throws IOException,
InterruptedException {
- machineName = DNS.getDefaultHost(conf.get(
+ this.machineName = DNS.getDefaultHost(conf.get(
"hbase.regionserver.dns.interface", "default"), conf.get(
"hbase.regionserver.dns.nameserver", "default"));
String addressStr = machineName
@@ -434,18 +434,18 @@ public class HRegionServer implements HR
zooKeeper = new ZooKeeperWatcher(conf, REGIONSERVER + "-"
+ serverInfo.getServerName(), this);
+ this.clusterStatusTracker = new ClusterStatusTracker(this.zooKeeper, this);
+ this.clusterStatusTracker.start();
+ this.clusterStatusTracker.blockUntilAvailable();
+
// create the master address manager, register with zk, and start it
masterAddressManager = new MasterAddressTracker(zooKeeper, this);
masterAddressManager.start();
- // create the catalog tracker and start it
+ // Create the catalog tracker and start it;
this.catalogTracker = new CatalogTracker(this.zooKeeper, this.connection,
this, this.conf.getInt("hbase.regionserver.catalog.timeout",
Integer.MAX_VALUE));
catalogTracker.start();
-
- this.clusterStatusTracker = new ClusterStatusTracker(this.zooKeeper, this);
- this.clusterStatusTracker.start();
- this.clusterStatusTracker.blockUntilAvailable();
}
/**
Modified:
hbase/trunk/src/test/java/org/apache/hadoop/hbase/master/TestActiveMasterManager.java
URL:
http://svn.apache.org/viewvc/hbase/trunk/src/test/java/org/apache/hadoop/hbase/master/TestActiveMasterManager.java?rev=998380&r1=998379&r2=998380&view=diff
==============================================================================
---
hbase/trunk/src/test/java/org/apache/hadoop/hbase/master/TestActiveMasterManager.java
(original)
+++
hbase/trunk/src/test/java/org/apache/hadoop/hbase/master/TestActiveMasterManager.java
Sat Sep 18 00:51:05 2010
@@ -23,6 +23,7 @@ import static org.junit.Assert.assertFal
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
+import java.io.IOException;
import java.util.concurrent.Semaphore;
import org.apache.commons.logging.Log;
@@ -57,6 +58,39 @@ public class TestActiveMasterManager {
TEST_UTIL.shutdownMiniZKCluster();
}
+ @Test public void testRestartMaster() throws IOException, KeeperException {
+ ZooKeeperWatcher zk = new ZooKeeperWatcher(TEST_UTIL.getConfiguration(),
+ "testActiveMasterManagerFromZK", null);
+ ZKUtil.createAndFailSilent(zk, zk.baseZNode);
+ try {
+ ZKUtil.deleteNode(zk, zk.masterAddressZNode);
+ } catch(KeeperException.NoNodeException nne) {}
+
+ // Create the master node with a dummy address
+ HServerAddress master = new HServerAddress("localhost", 1);
+ // Should not have a master yet
+ DummyMaster dummyMaster = new DummyMaster();
+ ActiveMasterManager activeMasterManager = new ActiveMasterManager(zk,
+ master, dummyMaster);
+ zk.registerListener(activeMasterManager);
+ assertFalse(activeMasterManager.clusterHasActiveMaster.get());
+
+ // First test becoming the active master uninterrupted
+ activeMasterManager.blockUntilBecomingActiveMaster();
+ assertTrue(activeMasterManager.clusterHasActiveMaster.get());
+ assertMaster(zk, master);
+
+ // Now pretend master restart
+ DummyMaster secondDummyMaster = new DummyMaster();
+ ActiveMasterManager secondActiveMasterManager = new ActiveMasterManager(zk,
+ master, secondDummyMaster);
+ zk.registerListener(secondActiveMasterManager);
+ assertFalse(secondActiveMasterManager.clusterHasActiveMaster.get());
+ activeMasterManager.blockUntilBecomingActiveMaster();
+ assertTrue(activeMasterManager.clusterHasActiveMaster.get());
+ assertMaster(zk, master);
+ }
+
/**
* Unit tests that uses ZooKeeper but does not use the master-side methods
* but rather acts directly on ZK.
@@ -64,22 +98,21 @@ public class TestActiveMasterManager {
*/
@Test
public void testActiveMasterManagerFromZK() throws Exception {
-
ZooKeeperWatcher zk = new ZooKeeperWatcher(TEST_UTIL.getConfiguration(),
- "testActiveMasterManagerFromZK", null);
+ "testActiveMasterManagerFromZK", null);
ZKUtil.createAndFailSilent(zk, zk.baseZNode);
try {
ZKUtil.deleteNode(zk, zk.masterAddressZNode);
} catch(KeeperException.NoNodeException nne) {}
// Create the master node with a dummy address
- HServerAddress firstMasterAddress = new HServerAddress("firstMaster",
1234);
- HServerAddress secondMasterAddress = new HServerAddress("secondMaster",
1234);
+ HServerAddress firstMasterAddress = new HServerAddress("localhost", 1);
+ HServerAddress secondMasterAddress = new HServerAddress("localhost", 2);
// Should not have a master yet
DummyMaster ms1 = new DummyMaster();
ActiveMasterManager activeMasterManager = new ActiveMasterManager(zk,
- firstMasterAddress, ms1);
+ firstMasterAddress, ms1);
zk.registerListener(activeMasterManager);
assertFalse(activeMasterManager.clusterHasActiveMaster.get());
@@ -132,6 +165,9 @@ public class TestActiveMasterManager {
assertTrue(t.manager.clusterHasActiveMaster.get());
assertTrue(t.isActiveMaster);
+
+ LOG.info("Deleting master node");
+ ZKUtil.deleteNode(zk, zk.masterAddressZNode);
}
/**