Author: stack
Date: Wed Sep 29 22:37:50 2010
New Revision: 1002880

URL: http://svn.apache.org/viewvc?rev=1002880&view=rev
Log:
HBASE-3047 If new master crashes, restart is messy

Modified:
    hbase/trunk/CHANGES.txt
    hbase/trunk/src/main/java/org/apache/hadoop/hbase/RemoteExceptionHandler.java
    hbase/trunk/src/main/java/org/apache/hadoop/hbase/catalog/CatalogTracker.java
    hbase/trunk/src/main/java/org/apache/hadoop/hbase/ipc/HRegionInterface.java
    hbase/trunk/src/main/java/org/apache/hadoop/hbase/mapreduce/package-info.java
    hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
    hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
    hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java
    hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
    hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java
    hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
    hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/Leases.java
    hbase/trunk/src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperNodeTracker.java
    hbase/trunk/src/main/resources/hbase-default.xml
    hbase/trunk/src/test/java/org/apache/hadoop/hbase/catalog/TestCatalogTracker.java

Modified: hbase/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hbase/trunk/CHANGES.txt?rev=1002880&r1=1002879&r2=1002880&view=diff
==============================================================================
--- hbase/trunk/CHANGES.txt (original)
+++ hbase/trunk/CHANGES.txt Wed Sep 29 22:37:50 2010
@@ -552,6 +552,7 @@ Release 0.21.0 - Unreleased
    HBASE-2995  Incorrect dependency on Log class from Jetty
    HBASE-3038  WALReaderFSDataInputStream.getPos() fails if Filesize > MAX_INT
                (Nicolas Spiegelberg via Stack)
+   HBASE-3047  If new master crashes, restart is messy
 
   IMPROVEMENTS
    HBASE-1760  Cleanup TODOs in HTable

Modified: hbase/trunk/src/main/java/org/apache/hadoop/hbase/RemoteExceptionHandler.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/RemoteExceptionHandler.java?rev=1002880&r1=1002879&r2=1002880&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/RemoteExceptionHandler.java (original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/RemoteExceptionHandler.java Wed Sep 29 22:37:50 2010
@@ -79,6 +79,8 @@ public class RemoteExceptionHandler {
    * @throws IOException indicating a server error ocurred if the decoded
    *         exception is not an IOException. The decoded exception is set as
    *         the cause.
+   * @deprecated Use {@link RemoteException#unwrapRemoteException()} instead.
+   * In fact we should look into deprecating this whole class - St.Ack 20100929
    */
   public static IOException decodeRemoteException(final RemoteException re)
   throws IOException {

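The deprecation note above points callers at Hadoop's RemoteException#unwrapRemoteException() rather than decodeRemoteException(). A minimal sketch of that unwrap-and-inspect pattern, assuming a caller that only wants to treat NotServingRegionException as recoverable (the helper class and method names are illustrative, not part of this patch):

    import java.io.IOException;
    import org.apache.hadoop.hbase.NotServingRegionException;
    import org.apache.hadoop.ipc.RemoteException;

    public class UnwrapSketch {
      // Unwrap the server-side exception carried by an RPC RemoteException and
      // decide whether it is the "region not serving" case the caller can retry.
      static boolean isNotServing(final RemoteException re) throws IOException {
        IOException ioe = re.unwrapRemoteException();
        if (ioe instanceof NotServingRegionException) {
          return true;   // region moved or closed; caller may reassign/retry
        }
        throw ioe;       // anything else is unexpected; rethrow the unwrapped cause
      }
    }

This is the same shape as the new catch block in CatalogTracker.verifyRegionLocation() below.
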
Modified: hbase/trunk/src/main/java/org/apache/hadoop/hbase/catalog/CatalogTracker.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/catalog/CatalogTracker.java?rev=1002880&r1=1002879&r2=1002880&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/catalog/CatalogTracker.java (original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/catalog/CatalogTracker.java Wed Sep 29 22:37:50 2010
@@ -20,7 +20,6 @@
 package org.apache.hadoop.hbase.catalog;
 
 import java.io.IOException;
-import java.lang.reflect.UndeclaredThrowableException;
 import java.net.ConnectException;
 import java.util.concurrent.atomic.AtomicBoolean;
 
@@ -40,6 +39,7 @@ import org.apache.hadoop.hbase.util.Pair
 import org.apache.hadoop.hbase.zookeeper.MetaNodeTracker;
 import org.apache.hadoop.hbase.zookeeper.RootRegionTracker;
 import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
+import org.apache.hadoop.ipc.RemoteException;
 import org.apache.zookeeper.KeeperException;
 
 /**
@@ -271,7 +271,9 @@ public class CatalogTracker {
   /**
    * Gets the current location for <code>.META.</code> if available and waits
    * for up to the specified timeout if not immediately available.  Throws an
-   * exception if timed out waiting.
+   * exception if timed out waiting.  This method differs from {@link #waitForMeta()}
+   * in that it will go ahead and verify the location gotten from ZooKeeper by
+   * trying to use the returned connection.
    * @param timeout maximum time to wait for meta availability, in milliseconds
    * @return location of meta
    * @throws InterruptedException if interrupted while waiting
@@ -282,15 +284,15 @@ public class CatalogTracker {
   public HServerAddress waitForMeta(long timeout)
   throws InterruptedException, IOException, NotAllMetaRegionsOnlineException {
     long stop = System.currentTimeMillis() + timeout;
-    synchronized(metaAvailable) {
-      if(getMetaServerConnection(true) != null) {
+    synchronized (metaAvailable) {
+      if (getMetaServerConnection(true) != null) {
         return metaLocation;
       }
       while(!metaAvailable.get() &&
           (timeout == 0 || System.currentTimeMillis() < stop)) {
         metaAvailable.wait(timeout);
       }
-      if(getMetaServerConnection(true) == null) {
+      if (getMetaServerConnection(true) == null) {
         throw new NotAllMetaRegionsOnlineException(
             "Timed out (" + timeout + "ms)");
       }
@@ -336,7 +338,6 @@ public class CatalogTracker {
   }
 
   private void setMetaLocation(HServerAddress metaLocation) {
-    LOG.info("Found new META location, " + metaLocation);
     metaAvailable.set(true);
     this.metaLocation = metaLocation;
     // no synchronization because these are private and already under lock
@@ -359,24 +360,70 @@ public class CatalogTracker {
   }
 
   private boolean verifyRegionLocation(HRegionInterface metaServer,
-      byte [] regionName) {
+      byte [] regionName)
+  throws IOException {
+    if (metaServer == null) {
+      LOG.info("Passed metaserver is null");
+      return false;
+    }
     Throwable t = null;
     try {
+      // Am expecting only two possible exceptions here; unable
+      // to connect to the regionserver or NotServingRegionException wrapped
+      // in the hadoop rpc RemoteException.
       return metaServer.getRegionInfo(regionName) != null;
-    } catch (NotServingRegionException e) {
+    } catch (ConnectException e) {
       t = e;
-    } catch (UndeclaredThrowableException e) {
-      // We can get a ConnectException wrapped by a UTE if client fails connect
-      // If not a ConnectException, rethrow.
-      if (!(e.getCause() instanceof ConnectException)) throw e;
-      t = e.getCause();
+    } catch (RemoteException e) {
+      IOException ioe = e.unwrapRemoteException();
+      if (ioe instanceof NotServingRegionException) {
+        t = ioe;
+      } else {
+        throw e;
+      }
     }
     LOG.info("Failed verification of " + Bytes.toString(regionName) +
-      ": " + t.getMessage());
+      ", assigning anew: " + t);
     return false;
   }
 
   /**
+   * Verify <code>-ROOT-</code> is deployed and accessible.
+   * @param timeout How long to wait on zk for root address (passed through to
+   * the internal call to {@link #waitForRootServerConnection(long)}).
+   * @return True if the <code>-ROOT-</code> location is healthy.
+   * @throws IOException
+   * @throws InterruptedException 
+   */
+  public boolean verifyRootRegionLocation(final long timeout)
+  throws InterruptedException, IOException {
+    HRegionInterface connection = null;
+    try {
+      connection = waitForRootServerConnection(timeout);
+    } catch (NotAllMetaRegionsOnlineException e) {
+      // Pass
+    } catch (IOException e) {
+      // Unexpected exception
+      throw e;
+    }
+    return (connection == null)? false:
+      verifyRegionLocation(connection, HRegionInfo.ROOT_REGIONINFO.getRegionName());
+  }
+
+  /**
+   * Verify <code>.META.</code> is deployed and accessible.
+   * @param timeout How long to wait on zk for <code>.META.</code> address
+   * (passed through to the internal call to {@link #waitForMetaServerConnection(long)}).
+   * @return True if the <code>.META.</code> location is healthy.
+   * @throws IOException Some unexpected IOE.
+   * @throws InterruptedException
+   */
+  public boolean verifyMetaRegionLocation(final long timeout)
+  throws InterruptedException, IOException {
+    return getMetaServerConnection(true) != null;
+  }
+
+  /**
    * Check if <code>hsi</code> was carrying <code>-ROOT-</code> or
    * <code>.META.</code> and if so, clear out old locations.
    * @param hsi Server that has crashed/shutdown.

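The two verify methods added here give a restarted master a cheap way to check that the catalog regions are actually being served before it decides to reassign them. A hedged sketch of the intended call pattern, condensing the assignRootAndMeta() change made to HMaster later in this commit (the class name, the hard-coded timeout, and the checked-exception list are illustrative assumptions):

    package org.apache.hadoop.hbase.master;

    import java.io.IOException;
    import org.apache.hadoop.hbase.catalog.CatalogTracker;
    import org.apache.zookeeper.KeeperException;

    // Sketch only: re-deploy -ROOT- and .META. when their current locations
    // fail verification.  Assumes ct and am are already constructed and started.
    class CatalogCheckSketch {
      static void ensureCatalogDeployed(final CatalogTracker ct, final AssignmentManager am)
      throws IOException, InterruptedException, KeeperException {
        final long timeout = 1000;  // illustrative; HMaster reads hbase.catalog.verification.timeout
        if (!ct.verifyRootRegionLocation(timeout)) {
          am.assignRoot();          // pre-existing assignment calls
          ct.waitForRoot();
        }
        if (!ct.verifyMetaRegionLocation(timeout)) {
          am.assignMeta();
          ct.waitForMeta();
        }
      }
    }
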
Modified: hbase/trunk/src/main/java/org/apache/hadoop/hbase/ipc/HRegionInterface.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/ipc/HRegionInterface.java?rev=1002880&r1=1002879&r2=1002880&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/ipc/HRegionInterface.java (original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/ipc/HRegionInterface.java Wed Sep 29 22:37:50 2010
@@ -20,6 +20,7 @@
 package org.apache.hadoop.hbase.ipc;
 
 import java.io.IOException;
+import java.net.ConnectException;
 import java.util.List;
 import java.util.NavigableSet;
 
@@ -38,6 +39,7 @@ import org.apache.hadoop.hbase.client.Pu
 import org.apache.hadoop.hbase.client.Result;
 import org.apache.hadoop.hbase.client.Scan;
 import org.apache.hadoop.hbase.regionserver.wal.HLog;
+import org.apache.hadoop.ipc.RemoteException;
 
 /**
  * Clients interact with HRegionServers using a handle to the HRegionInterface.
@@ -51,10 +53,12 @@ public interface HRegionInterface extend
    *
    * @param regionName name of the region
    * @return HRegionInfo object for region
-   * @throws NotServingRegionException e
+   * @throws NotServingRegionException
+   * @throws ConnectException
+   * @throws IOException This can manifest as an Hadoop ipc {@link RemoteException}
    */
   public HRegionInfo getRegionInfo(final byte [] regionName)
-  throws NotServingRegionException;
+  throws NotServingRegionException, ConnectException, IOException;
 
   /**
    * Return all the data for the row that matches <i>row</i> exactly,

Modified: hbase/trunk/src/main/java/org/apache/hadoop/hbase/mapreduce/package-info.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/mapreduce/package-info.java?rev=1002880&r1=1002879&r2=1002880&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/mapreduce/package-info.java (original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/mapreduce/package-info.java Wed Sep 29 22:37:50 2010
@@ -1,5 +1,5 @@
 /*
- * Copyright 20010 The Apache Software Foundation
+ * Copyright 2010 The Apache Software Foundation
  *
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file

Modified: hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java?rev=1002880&r1=1002879&r2=1002880&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java (original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java Wed Sep 29 22:37:50 2010
@@ -160,7 +160,7 @@ public class AssignmentManager extends Z
 
   /**
    * Handle failover.  Restore state from META and ZK.  Handle any regions in
-   * transition.
+   * transition.  Presumes <code>.META.</code> and <code>-ROOT-</code> deployed.
    * @throws KeeperException
    * @throws IOException
    */
@@ -170,6 +170,12 @@ public class AssignmentManager extends Z
     // synchronized.  The presumption is that in this case it is safe since this
     // method is being played by a single thread on startup.
 
+    // TODO: Check list of user regions and their assignments against regionservers.
+    // TODO: Regions that have a null location and are not in regionsInTransitions
+    // need to be handled.
+    // TODO: Regions that are on servers that are not in our online list need
+    // reassigning.
+
     // Scan META to build list of existing regions, servers, and assignment
     rebuildUserRegions();
     // Pickup any disabled tables
@@ -183,46 +189,90 @@ public class AssignmentManager extends Z
     }
     LOG.info("Failed-over master needs to process " + nodes.size() +
         " regions in transition");
-    for (String regionName: nodes) {
-      RegionTransitionData data = ZKAssign.getData(watcher, regionName);
-      HRegionInfo regionInfo =
-        MetaReader.getRegion(catalogTracker, data.getRegionName()).getFirst();
-      String encodedName = regionInfo.getEncodedName();
-      switch(data.getEventType()) {
-        case RS_ZK_REGION_CLOSING:
-          // Just insert region into RIT.
-          // If this never updates the timeout will trigger new assignment
-          regionsInTransition.put(encodedName,
-              new RegionState(regionInfo, RegionState.State.CLOSING,
-                  data.getStamp()));
-          break;
-
-        case RS_ZK_REGION_CLOSED:
-          // Region is closed, insert into RIT and handle it
-          regionsInTransition.put(encodedName,
-              new RegionState(regionInfo, RegionState.State.CLOSED,
-                  data.getStamp()));
-          new ClosedRegionHandler(master, this, data, regionInfo).process();
-          break;
-
-        case RS_ZK_REGION_OPENING:
-          // Just insert region into RIT
-          // If this never updates the timeout will trigger new assignment
-          regionsInTransition.put(encodedName,
-              new RegionState(regionInfo, RegionState.State.OPENING,
-                  data.getStamp()));
-          break;
+    for (String encodedRegionName: nodes) {
+      processRegionInTransition(encodedRegionName, null);
+    }
+  }
 
-        case RS_ZK_REGION_OPENED:
-          // Region is opened, insert into RIT and handle it
-          regionsInTransition.put(encodedName,
-              new RegionState(regionInfo, RegionState.State.OPENING,
-                  data.getStamp()));
-          new OpenedRegionHandler(master, this, data, regionInfo,
-              serverManager.getServerInfo(data.getServerName())).process();
-          break;
+  /**
+   * If region is up in zk in transition, then do fixup and block and wait until
+   * the region is assigned and out of transition.  Used on startup for
+   * catalog regions.
+   * @param hri Region to look for.
+   * @return True if we processed a region in transition else false if region
+   * was not up in zk in transition.
+   * @throws InterruptedException
+   * @throws KeeperException
+   * @throws IOException
+   */
+  boolean processRegionInTransitionAndBlockUntilAssigned(final HRegionInfo hri)
+  throws InterruptedException, KeeperException, IOException {
+    boolean intransistion = processRegionInTransition(hri.getEncodedName(), hri);
+    if (!intransistion) return intransistion;
+    synchronized(this.regionsInTransition) {
+      while (!this.master.isStopped() &&
+          this.regionsInTransition.containsKey(hri.getEncodedName())) {
+        this.regionsInTransition.wait();
       }
     }
+    return intransistion;
+  }
+
+  /**
+   * Process failover of <code>encodedRegionName</code>.  Look in ZK for the
+   * region's transition state.
+   * @param encodedRegionName Region to process failover for.
+   * @param regionInfo RegionInfo.  If null we'll go get it from meta table.
+   * @return True if the region was in transition, else false.
+   * @throws KeeperException 
+   * @throws IOException 
+   */
+  boolean processRegionInTransition(final String encodedRegionName,
+      final HRegionInfo regionInfo)
+  throws KeeperException, IOException {
+    RegionTransitionData data = ZKAssign.getData(watcher, encodedRegionName);
+    if (data == null) return false;
+    HRegionInfo hri = (regionInfo != null)? regionInfo:
+      MetaReader.getRegion(catalogTracker, data.getRegionName()).getFirst();
+    processRegionsInTransition(data, hri);
+    return true;
+  }
+
+  void processRegionsInTransition(final RegionTransitionData data,
+      final HRegionInfo regionInfo)
+  throws KeeperException {
+    String encodedRegionName = regionInfo.getEncodedName();
+    LOG.info("Processing region " + regionInfo.getRegionNameAsString() +
+      " in state " + data.getEventType());
+    switch (data.getEventType()) {
+    case RS_ZK_REGION_CLOSING:
+      // Just insert region into RIT.
+      // If this never updates the timeout will trigger new assignment
+      regionsInTransition.put(encodedRegionName, new RegionState(
+          regionInfo, RegionState.State.CLOSING, data.getStamp()));
+      break;
+
+    case RS_ZK_REGION_CLOSED:
+      // Region is closed, insert into RIT and handle it
+      regionsInTransition.put(encodedRegionName, new RegionState(
+          regionInfo, RegionState.State.CLOSED, data.getStamp()));
+      new ClosedRegionHandler(master, this, data, regionInfo).process();
+      break;
+
+    case RS_ZK_REGION_OPENING:
+      // Just insert region into RIT
+      // If this never updates the timeout will trigger new assignment
+      regionsInTransition.put(encodedRegionName, new RegionState(
+          regionInfo, RegionState.State.OPENING, data.getStamp()));
+      break;
+
+    case RS_ZK_REGION_OPENED:
+      // Region is opened, insert into RIT and handle it
+      regionsInTransition.put(encodedRegionName, new RegionState(
+          regionInfo, RegionState.State.OPENING, data.getStamp()));
+      new OpenedRegionHandler(master, this, data, regionInfo,
+          serverManager.getServerInfo(data.getServerName())).process();
+      break;
+    }
   }
 
   /**
@@ -752,11 +802,11 @@ public class AssignmentManager extends Z
   private void rebuildUserRegions() throws IOException {
     Map<HRegionInfo,HServerAddress> allRegions =
       MetaReader.fullScan(catalogTracker);
-    for (Map.Entry<HRegionInfo,HServerAddress> region : allRegions.entrySet()) {
+    for (Map.Entry<HRegionInfo,HServerAddress> region: allRegions.entrySet()) {
       HServerAddress regionLocation = region.getValue();
       HRegionInfo regionInfo = region.getKey();
       if (regionLocation == null) {
-        regions.put(regionInfo, null);
+        this.regions.put(regionInfo, null);
         continue;
       }
       HServerInfo serverInfo = serverManager.getHServerInfo(regionLocation);

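The new processRegionInTransitionAndBlockUntilAssigned() parks the caller on the regionsInTransition map and relies on whoever finishes the transition to remove the entry and notify (the real loop also bails out when the master is stopped). A self-contained sketch of that wait/notify idiom in plain Java, with illustrative names, not HBase code:

    import java.util.HashMap;
    import java.util.Map;

    // Minimal reproduction of the "wait until the key leaves the map" idiom.
    public class BlockUntilRemoved {
      private final Map<String, String> inTransition = new HashMap<String, String>();

      // Startup thread: blocks until another thread removes the entry.
      public void blockWhilePresent(final String key) throws InterruptedException {
        synchronized (inTransition) {
          while (inTransition.containsKey(key)) {
            inTransition.wait();
          }
        }
      }

      // Handler thread: marks the transition finished and wakes any waiters.
      public void markDone(final String key) {
        synchronized (inTransition) {
          inTransition.remove(key);
          inTransition.notifyAll();
        }
      }
    }
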
Modified: hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/HMaster.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/HMaster.java?rev=1002880&r1=1002879&r2=1002880&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/HMaster.java (original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/HMaster.java Wed Sep 29 22:37:50 2010
@@ -145,10 +145,6 @@ implements HMasterInterface, HMasterRegi
   // Cluster status zk tracker and local setter
   private ClusterStatusTracker clusterStatusTracker;
 
-  // True if this a cluster startup where there are no already running servers
-  // as opposed to a master joining an already running cluster
-  boolean freshClusterStartup;
-
   // This flag is for stopping this Master instance.  Its set when we are
   // stopping or aborting
   private volatile boolean stopped = false;
@@ -170,8 +166,7 @@ implements HMasterInterface, HMasterRegi
    *
    * <ol>
    * <li>Initialize HMaster RPC and address
-   * <li>Connect to ZooKeeper and figure out if this is a fresh cluster start or
-   *     a failed over master
+   * <li>Connect to ZooKeeper.  Get count of regionservers still up.
    * <li>Block until becoming active master
    * <li>Initialize master components - server manager, region manager,
    *     region server queue, file system manager, etc
@@ -209,7 +204,12 @@ implements HMasterInterface, HMasterRegi
     this.zooKeeper = new ZooKeeperWatcher(conf, MASTER, this);
 
     /*
-     * 2. Block on becoming the active master.
+     * 2. Count of regionservers that are up.
+     */
+    int count = ZKUtil.getNumberOfChildren(zooKeeper, zooKeeper.rsZNode);
+
+    /*
+     * 3. Block on becoming the active master.
      * We race with other masters to write our address into ZooKeeper.  If we
      * succeed, we are the primary/active master and finish initialization.
      *
@@ -223,16 +223,6 @@ implements HMasterInterface, HMasterRegi
     activeMasterManager.blockUntilBecomingActiveMaster();
 
     /*
-     * 3. Determine if this is a fresh cluster startup or failed over master.
-     * This is done by checking for the existence of any ephemeral
-     * RegionServer nodes in ZooKeeper.  These nodes are created by RSs on
-     * their initialization but initialization will not happen unless clusterup
-     * flag is set -- see ClusterStatusTracker below.
-     */
-    this.freshClusterStartup =
-      0 == ZKUtil.getNumberOfChildren(zooKeeper, zooKeeper.rsZNode);
-
-    /*
      * 4. We are active master now... go initialize components we need to run.
      * Note, there may be dross in zk from previous runs; it'll get addressed
      * when we enter {...@link #run()} below.
@@ -242,7 +232,7 @@ implements HMasterInterface, HMasterRegi
     this.connection = HConnectionManager.getConnection(conf);
     this.executorService = new ExecutorService(getServerName());
 
-    this.serverManager = new ServerManager(this, this, this.freshClusterStartup);
+    this.serverManager = new ServerManager(this, this);
 
     this.catalogTracker = new CatalogTracker(this.zooKeeper, this.connection,
       this, conf.getInt("hbase.master.catalog.timeout", Integer.MAX_VALUE));
@@ -259,16 +249,20 @@ implements HMasterInterface, HMasterRegi
     // Set the cluster as up.  If new RSs, they'll be waiting on this before
     // going ahead with their startup.
     this.clusterStatusTracker = new ClusterStatusTracker(getZooKeeper(), this);
-    this.clusterStatusTracker.setClusterUp();
+    boolean wasUp = this.clusterStatusTracker.isClusterUp();
+    if (!wasUp) this.clusterStatusTracker.setClusterUp();
     this.clusterStatusTracker.start();
 
     LOG.info("Server active/primary master; " + this.address +
-      "; freshClusterStart=" + this.freshClusterStartup + ", sessionid=0x" +
-      Long.toHexString(this.zooKeeper.getZooKeeper().getSessionId()));
+      ", sessionid=0x" +
+      Long.toHexString(this.zooKeeper.getZooKeeper().getSessionId()) +
+      ", ephemeral nodes still up in zk=" + count +
+      ", cluster-up flag was=" + wasUp);
   }
 
   /*
-   * Stall startup if we are designated a backup master.
+   * Stall startup if we are designated a backup master; i.e. we want someone
+   * else to become the master before proceeding.
    * @param c
    * @param amm
    * @throws InterruptedException
@@ -290,27 +284,42 @@ implements HMasterInterface, HMasterRegi
 
   /**
    * Main processing loop for the HMaster.
-   * 1. Handle both fresh cluster start as well as failed over initialization of
-   *    the HMaster.
-   * 2. Start the necessary services
-   * 3. Reassign the root region
-   * 4. The master is no longer closed - set "closed" to false
+   * <ol>
+   * <li>Handle both fresh cluster start as well as failed over initialization of
+   *    the HMaster</li>
+   * <li>Start the necessary services</li>
+   * <li>Reassign the root region</li>
+   * <li>The master is no longer closed - set "closed" to false</li>
+   * </ol>
    */
   @Override
   public void run() {
     try {
       // start up all service threads.
       startServiceThreads();
-      // Wait for minimum number of region servers to report in
-      this.serverManager.waitForRegionServers();
 
-      // Start assignment of user regions, startup or failure
-      if (this.freshClusterStartup) {
-        clusterStarterInitializations(this.fileSystemManager,
-          this.serverManager, this.catalogTracker, this.assignmentManager);
+      // Wait for region servers to report in.  Returns count of regions.
+      int regionCount = this.serverManager.waitForRegionServers();
+
+      // TODO: Should do this in background rather than block master startup
+      // TODO: Do we want to do this before/while/after RSs check in?
+      // It seems that this method looks at active RSs but happens concurrently
+      // with when we expect them to be checking in
+      this.fileSystemManager.
+        splitLogAfterStartup(this.serverManager.getOnlineServers());
+
+      // Make sure root and meta assigned before proceeding.
+      assignRootAndMeta();
+
+      // Is this fresh start with no regions assigned or are we a master joining
+      // an already-running cluster?  If regionsCount == 0, then for sure a
+      // fresh start.  TODO: Be fancier.  If regionsCount == 2, perhaps the
+      // 2 are .META. and -ROOT- and we should fall into the fresh startup
+      // branch below.  For now, do processFailover.
+      if (regionCount == 0) {
+        this.assignmentManager.cleanoutUnassigned();
+        this.assignmentManager.assignAllUserRegions();
       } else {
-        // Process existing unassigned nodes in ZK, read all regions from META,
-        // rebuild in-memory state.
         this.assignmentManager.processFailover();
       }
 
@@ -343,36 +352,42 @@ implements HMasterInterface, HMasterRegi
     LOG.info("HMaster main thread exiting");
   }
 
-  /*
-   * Initializations we need to do if we are cluster starter.
-   * @param mfs
-   * @param sm
-   * @param ct
-   * @param am
+  /**
+   * Check <code>-ROOT-</code> and <code>.META.</code> are assigned.  If not,
+   * assign them.
+   * @throws InterruptedException
    * @throws IOException
+   * @throws KeeperException
+   * @return Count of regions we assigned.
    */
-  private static void clusterStarterInitializations(final MasterFileSystem mfs,
-    final ServerManager sm, final CatalogTracker ct, final AssignmentManager am)
-  throws IOException, InterruptedException, KeeperException {
-      // Check filesystem has required basics
-      mfs.initialize();
-      // TODO: Should do this in background rather than block master startup
-      // TODO: Do we want to do this before/while/after RSs check in?
-      //       It seems that this method looks at active RSs but happens
-      //       concurrently with when we expect them to be checking in
-      mfs.splitLogAfterStartup(sm.getOnlineServers());
-      // Clean out current state of unassigned
-      am.cleanoutUnassigned();
-      // assign the root region
-      am.assignRoot();
-      ct.waitForRoot();
-      // assign the meta region
-      am.assignMeta();
-      ct.waitForMeta();
-      // above check waits for general meta availability but this does not
+  int assignRootAndMeta()
+  throws InterruptedException, IOException, KeeperException {
+    int assigned = 0;
+    long timeout = this.conf.getLong("hbase.catalog.verification.timeout", 1000);
+
+    // Work on ROOT region.  Is it in zk in transition?
+    boolean rit = this.assignmentManager.
+      processRegionInTransitionAndBlockUntilAssigned(HRegionInfo.ROOT_REGIONINFO);
+    if (!catalogTracker.verifyRootRegionLocation(timeout)) {
+      this.assignmentManager.assignRoot();
+      this.catalogTracker.waitForRoot();
+      assigned++;
+    }
+    LOG.info("-ROOT- assigned=" + assigned + ", rit=" + rit);
+ 
+    // Work on meta region
+    rit = this.assignmentManager.
+      processRegionInTransitionAndBlockUntilAssigned(HRegionInfo.FIRST_META_REGIONINFO);
+    if (!this.catalogTracker.verifyMetaRegionLocation(timeout)) {
+      this.assignmentManager.assignMeta();
+      this.catalogTracker.waitForMeta();
+      // Above check waits for general meta availability but this does not
       // guarantee that the transition has completed
-      am.waitForAssignment(HRegionInfo.FIRST_META_REGIONINFO);
-      am.assignAllUserRegions();
+      this.assignmentManager.waitForAssignment(HRegionInfo.FIRST_META_REGIONINFO);
+      assigned++;
+    }
+    LOG.info(".META. assigned=" + assigned + ", rit=" + rit);
+    return assigned;
   }
 
   /*

Modified: hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java?rev=1002880&r1=1002879&r2=1002880&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java (original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java Wed Sep 29 22:37:50 2010
@@ -81,6 +81,7 @@ public class MasterFileSystem {
     this.fs = FileSystem.get(conf);
     // set up the archived logs path
     this.oldLogDir = new Path(this.rootdir, HConstants.HREGION_OLDLOGDIR_NAME);
+    createInitialFileSystemLayout();
   }
 
   /**
@@ -91,8 +92,9 @@ public class MasterFileSystem {
    * </li>
    * <li>Create a log archive directory for RS to put archived logs</li>
    * </ol>
+   * Idempotent.
    */
-  public void initialize() throws IOException {
+  private void createInitialFileSystemLayout() throws IOException {
     // check if the root directory exists
     checkRootDir(this.rootdir, conf, this.fs);
 

Modified: hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java?rev=1002880&r1=1002879&r2=1002880&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java (original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java Wed Sep 29 22:37:50 2010
@@ -84,7 +84,6 @@ public class ServerManager {
 
   private final Server master;
   private final MasterServices services;
-  private final boolean freshClusterStartup;
 
   private final ServerMonitor serverMonitorThread;
 
@@ -123,11 +122,9 @@ public class ServerManager {
    * @param freshClusterStartup True if we are original master on a fresh
    * cluster startup else if false, we are joining an already running cluster.
    */
-  public ServerManager(final Server master, final MasterServices services,
-      final boolean freshClusterStartup) {
+  public ServerManager(final Server master, final MasterServices services) {
     this.master = master;
     this.services = services;
-    this.freshClusterStartup = freshClusterStartup;
     Configuration c = master.getConfiguration();
     int monitorInterval = c.getInt("hbase.master.monitor.interval", 60 * 1000);
     this.metrics = new MasterMetrics(master.getServerName());
@@ -170,8 +167,7 @@ public class ServerManager {
       throw new PleaseHoldException(message);
     }
     checkIsDead(info.getServerName(), "STARTUP");
-    LOG.info("Received start message from: " + info.getServerName());
-    recordNewServer(info);
+    recordNewServer(info, false, null);
   }
 
  private HServerInfo haveServerWithSameHostAndPortAlready(final String hostnamePort) {
@@ -201,26 +197,24 @@ public class ServerManager {
   }
 
   /**
-   * Adds the HSI to the RS list and creates an empty load
-   * @param info The region server informations
-   */
-  public void recordNewServer(HServerInfo info) {
-    recordNewServer(info, false, null);
-  }
-
-  /**
    * Adds the HSI to the RS list
    * @param info The region server informations
-   * @param useInfoLoad True if the load from the info should be used
-   *                    like under a master failover
+   * @param useInfoLoad True if the load from the info should be used; e.g.
+   * under a master failover
+   * @param hri Region interface.  Can be null.
    */
   void recordNewServer(HServerInfo info, boolean useInfoLoad,
       HRegionInterface hri) {
-    HServerLoad load = useInfoLoad ? info.getLoad() : new HServerLoad();
+    HServerLoad load = useInfoLoad? info.getLoad(): new HServerLoad();
     String serverName = info.getServerName();
+    LOG.info("Registering server=" + serverName + ", regionCount=" +
+      load.getLoad() + ", userLoad=" + useInfoLoad);
     info.setLoad(load);
     // TODO: Why did we update the RS location ourself?  Shouldn't RS do this?
     // masterStatus.getZooKeeper().updateRSLocationGetWatch(info, watcher);
+    // -- If I understand the question, the RS does not update the location
+    // because there could be disagreement over locations due to DNS issues; only
+    // master does DNS now -- St.Ack 20100929.
     this.onlineServers.put(serverName, info);
     if (hri == null) {
       serverConnections.remove(serverName);
@@ -254,15 +248,16 @@ public class ServerManager {
     // If we don't know this server, tell it shutdown.
     HServerInfo storedInfo = this.onlineServers.get(info.getServerName());
     if (storedInfo == null) {
-      if (!this.freshClusterStartup) {
-        // If we are joining an existing cluster, then soon as we come up we'll
-        // be getting reports from already running regionservers.
-        LOG.info("Registering new server: " + info.getServerName());
+      if (this.deadservers.contains(storedInfo)) {
+        LOG.warn("Report from deadserver " + storedInfo);
+        return HMsg.STOP_REGIONSERVER_ARRAY;
+      } else {
+        // Just let the server in.  Presume master joining a running cluster.
         // recordNewServer is what happens at the end of reportServerStartup.
         // The only thing we are skipping is passing back to the regionserver
         // the HServerInfo to use.  Here we presume a master has already done
         // that so we'll press on with whatever it gave us for HSI.
-        recordNewServer(info);
+        recordNewServer(info, true, null);
         // If msgs, put off their processing but this is not enough because
         // its possible that the next time the server reports in, we'll still
         // not be up and serving.  For example, if a split, we'll need the
@@ -271,11 +266,6 @@ public class ServerManager {
        if (msgs.length > 0) throw new PleaseHoldException("FIX! Putting off " +
          "message processing because not yet rwady but possible we won't be " +
          "ready next on next report");
-      } else {
-        LOG.warn("Received report from unknown server, a server calling " +
-          " regionServerReport w/o having first called regionServerStartup; " +
-          "telling it " + HMsg.Type.STOP_REGIONSERVER + ": " + 
info.getServerName());
-        return HMsg.STOP_REGIONSERVER_ARRAY;
       }
     }
 
@@ -474,7 +464,10 @@ public class ServerManager {
           " but server shutdown is already in progress");
       return;
     }
-    // Remove the server from the known servers lists and update load info
+    // Remove the server from the known servers lists and update load info BUT
+    // add to deadservers first; do this so it'll show in dead servers list if
+    // not in online servers list.
+    this.deadservers.add(serverName);
     this.onlineServers.remove(serverName);
     this.serverConnections.remove(serverName);
     // If cluster is going down, yes, servers are going to be expiring; don't
@@ -488,7 +481,7 @@ public class ServerManager {
       return;
     }
     this.services.getExecutorService().submit(new ServerShutdownHandler(this.master,
-        this.services, deadservers, info));
+        this.services, this.deadservers, info));
     LOG.debug("Added=" + serverName +
       " to dead servers, submitted shutdown handler to be executed");
   }
@@ -554,9 +547,10 @@ public class ServerManager {
 
   /**
    * Waits for the regionservers to report in.
+   * @return Count of regions out on cluster
    * @throws InterruptedException 
    */
-  public void waitForRegionServers()
+  public int waitForRegionServers()
   throws InterruptedException {
     long interval = this.master.getConfiguration().
       getLong("hbase.master.wait.on.regionservers.interval", 3000);
@@ -574,8 +568,20 @@ public class ServerManager {
       }
       oldcount = count;
     }
+    // Count how many regions deployed out on cluster.  If fresh start, it'll
+    // be none but if not a fresh start, we'll have registered servers when
+    // they came in on the {@link #regionServerReport(HServerInfo)} as opposed to
+    // {@link #regionServerStartup(HServerInfo)} and it'll be carrying an
+    // actual server load.
+    int regionCount = 0;
+    for (Map.Entry<String, HServerInfo> e: this.onlineServers.entrySet()) {
+      HServerLoad load = e.getValue().getLoad();
+      if (load != null) regionCount += load.getLoad();
+    }
     LOG.info("Exiting wait on regionserver(s) to checkin; count=" + count +
-      ", stopped=" + this.master.isStopped());
+      ", stopped=" + this.master.isStopped() +
+      ", count of regions out on cluster=" + regionCount);
+    return regionCount;
   }
 
   public List<HServerInfo> getOnlineServersList() {

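With freshClusterStartup gone, ServerManager no longer refuses reports from servers it has not seen before; it admits them (the master may simply be joining a running cluster) unless they are on the dead-servers list. A rough distillation of that decision in plain Java, with illustrative types and names rather than the real ServerManager fields:

    import java.util.Map;
    import java.util.Set;
    import java.util.concurrent.ConcurrentHashMap;
    import java.util.concurrent.ConcurrentSkipListSet;

    // Sketch of the new regionServerReport() admission policy.
    public class ReportPolicy {
      private final Map<String, Integer> onlineServers = new ConcurrentHashMap<String, Integer>();
      private final Set<String> deadServers = new ConcurrentSkipListSet<String>();

      /** @return true if the reporting server may keep running, false if it must stop. */
      public boolean handleReport(final String serverName, final int regionCount) {
        if (onlineServers.containsKey(serverName)) {
          return true;                              // already registered; normal report
        }
        if (deadServers.contains(serverName)) {
          return false;                             // we already expired this server
        }
        onlineServers.put(serverName, regionCount); // admit it, keeping its reported load
        return true;
      }
    }
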
Modified: hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java?rev=1002880&r1=1002879&r2=1002880&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java (original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/handler/ServerShutdownHandler.java Wed Sep 29 22:37:50 2010
@@ -36,11 +36,16 @@ import org.apache.hadoop.hbase.client.Re
 import org.apache.hadoop.hbase.executor.EventHandler;
 import org.apache.hadoop.hbase.master.DeadServer;
 import org.apache.hadoop.hbase.master.MasterServices;
+import org.apache.hadoop.hbase.master.ServerManager;
 import org.apache.hadoop.hbase.util.Pair;
 import org.apache.hadoop.hbase.util.Writables;
 import org.apache.zookeeper.KeeperException;
 
-
+/**
+ * Process server shutdown.
+ * Server-to-handle must be already in the deadservers list.  See
+ * {@link ServerManager#expireServer(HServerInfo)}.
+ */
 public class ServerShutdownHandler extends EventHandler {
  private static final Log LOG = LogFactory.getLog(ServerShutdownHandler.class);
   private final HServerInfo hsi;
@@ -55,8 +60,9 @@ public class ServerShutdownHandler exten
     this.server = server;
     this.services = services;
     this.deadServers = deadServers;
-    // Add to dead servers.
-    this.deadServers.add(hsi.getServerName());
+    if (!this.deadServers.contains(hsi.getServerName())) {
+      LOG.warn(hsi.getServerName() + " is NOT in deadservers; it should be!");
+    }
   }
 
   @Override

Modified: hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java?rev=1002880&r1=1002879&r2=1002880&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java (original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java Wed Sep 29 22:37:50 2010
@@ -48,7 +48,6 @@ import java.util.concurrent.atomic.Atomi
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.concurrent.locks.ReentrantReadWriteLock;
 
-import com.google.common.base.Function;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
@@ -57,7 +56,6 @@ import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hbase.Chore;
 import org.apache.hadoop.hbase.HBaseConfiguration;
 import org.apache.hadoop.hbase.HConstants;
-import org.apache.hadoop.hbase.HConstants.OperationStatusCode;
 import org.apache.hadoop.hbase.HMsg;
 import org.apache.hadoop.hbase.HRegionInfo;
 import org.apache.hadoop.hbase.HServerAddress;
@@ -72,6 +70,7 @@ import org.apache.hadoop.hbase.Stoppable
 import org.apache.hadoop.hbase.UnknownRowLockException;
 import org.apache.hadoop.hbase.UnknownScannerException;
 import org.apache.hadoop.hbase.YouAreDeadException;
+import org.apache.hadoop.hbase.HConstants.OperationStatusCode;
 import org.apache.hadoop.hbase.catalog.CatalogTracker;
 import org.apache.hadoop.hbase.catalog.MetaEditor;
 import org.apache.hadoop.hbase.catalog.RootLocationEditor;
@@ -120,9 +119,10 @@ import org.apache.hadoop.hbase.zookeeper
 import org.apache.hadoop.io.MapWritable;
 import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.net.DNS;
-import org.apache.hadoop.util.StringUtils;
 import org.apache.zookeeper.KeeperException;
 
+import com.google.common.base.Function;
+
 /**
 * HRegionServer makes a set of HRegions available to clients. It checks in with
  * the HMaster. There are many HRegionServers in a single HBase deployment.
@@ -396,10 +396,6 @@ public class HRegionServer implements HR
     this.abortRequested = false;
     this.stopped = false;
 
-
-    //HRegionInterface,
-    //HBaseRPCErrorHandler, Runnable, Watcher, Stoppable, OnlineRegions
-
     // Server to handle client requests
     this.server = HBaseRPC.getServer(this,
         new Class<?>[]{HRegionInterface.class, HBaseRPCErrorHandler.class,
@@ -429,19 +425,32 @@ public class HRegionServer implements HR
     }
   }
 
+  /**
+   * Bring up connection to zk ensemble and then wait until a master is available
+   * for this cluster; after that, wait until the cluster 'up' flag has been set.
+   * This is the order in which master does things.
+   * Finally put up a catalog tracker.
+   * @throws IOException
+   * @throws InterruptedException
+   */
   private void initializeZooKeeper() throws IOException, InterruptedException {
-    // open connection to zookeeper and set primary watcher
+    // Open connection to zookeeper and set primary watcher
     zooKeeper = new ZooKeeperWatcher(conf, REGIONSERVER +
       serverInfo.getServerAddress().getPort(), this);
 
+    // Create the master address manager, register with zk, and start it.  Then
+    // block until a master is available.  No point in starting up if no master
+    // running.
+    this.masterAddressManager = new MasterAddressTracker(zooKeeper, this);
+    this.masterAddressManager.start();
+    this.masterAddressManager.blockUntilAvailable();
+
+    // Wait on cluster being up.  Master will set this flag up in zookeeper
+    // when ready.
     this.clusterStatusTracker = new ClusterStatusTracker(this.zooKeeper, this);
     this.clusterStatusTracker.start();
     this.clusterStatusTracker.blockUntilAvailable();
 
-    // create the master address manager, register with zk, and start it
-    masterAddressManager = new MasterAddressTracker(zooKeeper, this);
-    masterAddressManager.start();
-
     // Create the catalog tracker and start it; 
     this.catalogTracker = new CatalogTracker(this.zooKeeper, this.connection,
       this, this.conf.getInt("hbase.regionserver.catalog.timeout", 
Integer.MAX_VALUE));

Modified: hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/Leases.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/Leases.java?rev=1002880&r1=1002879&r2=1002880&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/Leases.java (original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/Leases.java Wed Sep 29 22:37:50 2010
@@ -68,6 +68,7 @@ public class Leases extends Thread {
   public Leases(final int leasePeriod, final int leaseCheckFrequency) {
     this.leasePeriod = leasePeriod;
     this.leaseCheckFrequency = leaseCheckFrequency;
+    setDaemon(true);
   }
 
   /**

Modified: hbase/trunk/src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperNodeTracker.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperNodeTracker.java?rev=1002880&r1=1002879&r2=1002880&view=diff
==============================================================================
--- hbase/trunk/src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperNodeTracker.java (original)
+++ hbase/trunk/src/main/java/org/apache/hadoop/hbase/zookeeper/ZooKeeperNodeTracker.java Wed Sep 29 22:37:50 2010
@@ -32,11 +32,6 @@ import org.apache.zookeeper.KeeperExcept
  * RegionServers.
  */
 public abstract class ZooKeeperNodeTracker extends ZooKeeperListener {
-  /**
-   * Pass this if you do not want a timeout.
-   */
-  public final static long NO_TIMEOUT = -1;
-
   /** Path of node being tracked */
   protected final String node;
 
@@ -94,7 +89,7 @@ public abstract class ZooKeeperNodeTrack
    */
   public synchronized byte [] blockUntilAvailable()
   throws InterruptedException {
-    return blockUntilAvailable(NO_TIMEOUT);
+    return blockUntilAvailable(0);
   }
 
   /**
@@ -102,18 +97,22 @@ public abstract class ZooKeeperNodeTrack
    * specified timeout has elapsed.
    *
    * @param timeout maximum time to wait for the node data to be available,
-   *                in milliseconds.  Pass {@link #NO_TIMEOUT} for no timeout.
+   * in milliseconds.  Pass 0 for no timeout.
    * @return data of the node
    * @throws InterruptedException if the waiting thread is interrupted
    */
   public synchronized byte [] blockUntilAvailable(long timeout)
   throws InterruptedException {
-    if (timeout != NO_TIMEOUT && timeout < 0) throw new IllegalArgumentException();
+    if (timeout < 0) throw new IllegalArgumentException();
+    boolean notimeout = timeout == 0;
     long startTime = System.currentTimeMillis();
     long remaining = timeout;
-    while ((remaining == NO_TIMEOUT || remaining > 0) && this.data == null) {
-      if (remaining == NO_TIMEOUT) wait();
-      else wait(remaining);
+    while ((notimeout || remaining > 0) && this.data == null) {
+      if (notimeout) {
+        wait();
+        continue;
+      }
+      wait(remaining);
       remaining = timeout - (System.currentTimeMillis() - startTime);
     }
     return data;

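With NO_TIMEOUT removed, a timeout of 0 now means "wait indefinitely" and any positive value is a wall-clock budget in milliseconds. A self-contained sketch of the same deadline-based wait loop together with the notifying side, in plain Java (an illustrative class, not the tracker itself):

    // Sketch of the rewritten blockUntilAvailable(long) semantics.
    public class TimedWait {
      private byte [] data;   // set by another thread via setData()

      public synchronized byte [] blockUntilAvailable(final long timeout)
      throws InterruptedException {
        if (timeout < 0) throw new IllegalArgumentException("timeout must be >= 0");
        final boolean noTimeout = timeout == 0;
        final long start = System.currentTimeMillis();
        long remaining = timeout;
        while ((noTimeout || remaining > 0) && this.data == null) {
          if (noTimeout) {
            wait();                 // no budget; wake only on notify
          } else {
            wait(remaining);        // wake on notify or when the budget elapses
            remaining = timeout - (System.currentTimeMillis() - start);
          }
        }
        return this.data;           // may still be null if the timeout elapsed
      }

      public synchronized void setData(final byte [] d) {
        this.data = d;
        notifyAll();
      }
    }
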
Modified: hbase/trunk/src/main/resources/hbase-default.xml
URL: http://svn.apache.org/viewvc/hbase/trunk/src/main/resources/hbase-default.xml?rev=1002880&r1=1002879&r2=1002880&view=diff
==============================================================================
--- hbase/trunk/src/main/resources/hbase-default.xml (original)
+++ hbase/trunk/src/main/resources/hbase-default.xml Wed Sep 29 22:37:50 2010
@@ -157,7 +157,7 @@
   </property>
   <property>
     <name>hbase.regionserver.msginterval</name>
-    <value>5000</value>
+    <value>3000</value>
     <description>Interval between messages from the RegionServer to HMaster
     in milliseconds.
     </description>

Modified: hbase/trunk/src/test/java/org/apache/hadoop/hbase/catalog/TestCatalogTracker.java
URL: http://svn.apache.org/viewvc/hbase/trunk/src/test/java/org/apache/hadoop/hbase/catalog/TestCatalogTracker.java?rev=1002880&r1=1002879&r2=1002880&view=diff
==============================================================================
--- hbase/trunk/src/test/java/org/apache/hadoop/hbase/catalog/TestCatalogTracker.java (original)
+++ hbase/trunk/src/test/java/org/apache/hadoop/hbase/catalog/TestCatalogTracker.java Wed Sep 29 22:37:50 2010
@@ -20,6 +20,7 @@
 package org.apache.hadoop.hbase.catalog;
 
 import java.io.IOException;
+import java.net.ConnectException;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.concurrent.atomic.AtomicInteger;
@@ -36,13 +37,13 @@ import org.apache.hadoop.hbase.HServerAd
 import org.apache.hadoop.hbase.HServerInfo;
 import org.apache.hadoop.hbase.KeyValue;
 import org.apache.hadoop.hbase.NotAllMetaRegionsOnlineException;
+import org.apache.hadoop.hbase.NotServingRegionException;
 import org.apache.hadoop.hbase.client.Get;
 import org.apache.hadoop.hbase.client.HConnection;
 import org.apache.hadoop.hbase.client.Result;
 import org.apache.hadoop.hbase.ipc.HRegionInterface;
 import org.apache.hadoop.hbase.util.Bytes;
 import org.apache.hadoop.hbase.util.Threads;
-import org.apache.hadoop.hbase.util.Writables;
 import org.apache.hadoop.hbase.zookeeper.ZKUtil;
 import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
 import org.apache.hadoop.util.Progressable;
@@ -52,6 +53,7 @@ import org.junit.AfterClass;
 import org.junit.Before;
 import org.junit.BeforeClass;
 import org.junit.Test;
+import org.mockito.Matchers;
 import org.mockito.Mockito;
 
 /**
@@ -100,6 +102,61 @@ public class TestCatalogTracker {
     return ct;
   }
 
+  @Test public void testGetMetaServerConnectionFails()
+  throws IOException, InterruptedException, KeeperException {
+    HConnection connection = Mockito.mock(HConnection.class);
+    ConnectException connectException =
+      new ConnectException("Connection refused");
+    final HRegionInterface implementation =
+      Mockito.mock(HRegionInterface.class);
+    Mockito.when(implementation.get((byte [])Mockito.any(), (Get)Mockito.any())).
+      thenThrow(connectException);
+    Mockito.when(connection.getHRegionConnection((HServerAddress)Matchers.anyObject(), Matchers.anyBoolean())).
+      thenReturn(implementation);
+    Assert.assertNotNull(connection.getHRegionConnection(new HServerAddress(), false));
+    final CatalogTracker ct = constructAndStartCatalogTracker(connection);
+    try {
+      RootLocationEditor.setRootLocation(this.watcher,
+        new HServerAddress("example.com:1234"));
+      Assert.assertFalse(ct.verifyMetaRegionLocation(100));
+    } finally {
+      // Clean out root location or later tests will be confused... they presume
+      // start fresh in zk.
+      RootLocationEditor.deleteRootLocation(this.watcher);
+    }
+  }
+
+  /**
+   * Test get of root region fails properly if nothing to connect to.
+   * @throws IOException
+   * @throws InterruptedException
+   * @throws KeeperException
+   */
+  @Test
+  public void testVerifyRootRegionLocationFails()
+  throws IOException, InterruptedException, KeeperException {
+    HConnection connection = Mockito.mock(HConnection.class);
+    ConnectException connectException =
+      new ConnectException("Connection refused");
+    final HRegionInterface implementation =
+      Mockito.mock(HRegionInterface.class);
+    Mockito.when(implementation.getRegionInfo((byte [])Mockito.any())).
+      thenThrow(connectException);
+    Mockito.when(connection.getHRegionConnection((HServerAddress)Matchers.anyObject(), Matchers.anyBoolean())).
+      thenReturn(implementation);
+    Assert.assertNotNull(connection.getHRegionConnection(new HServerAddress(), false));
+    final CatalogTracker ct = constructAndStartCatalogTracker(connection);
+    try {
+      RootLocationEditor.setRootLocation(this.watcher,
+        new HServerAddress("example.com:1234"));
+      Assert.assertFalse(ct.verifyRootRegionLocation(100));
+    } finally {
+      // Clean out root location or later tests will be confused... they presume
+      // start fresh in zk.
+      RootLocationEditor.deleteRootLocation(this.watcher);
+    }
+  }
+
   @Test (expected = NotAllMetaRegionsOnlineException.class)
   public void testTimeoutWaitForRoot()
   throws IOException, InterruptedException {

