Author: stack
Date: Fri Apr 29 03:33:27 2011
New Revision: 1097676

URL: http://svn.apache.org/viewvc?rev=1097676&view=rev
Log:
HBASE-3829 TestMasterFailover failures in jenkins

Modified:
    hbase/trunk/CHANGES.txt
    
hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java

Modified: hbase/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/hbase/trunk/CHANGES.txt?rev=1097676&r1=1097675&r2=1097676&view=diff
==============================================================================
--- hbase/trunk/CHANGES.txt (original)
+++ hbase/trunk/CHANGES.txt Fri Apr 29 03:33:27 2011
@@ -91,6 +91,7 @@ Release 0.91.0 - Unreleased
    HBASE-3210  HBASE-1921 for the new master
    HBASE-3827  hbase-1502, removing heartbeats, broke master joining a running
                cluster and was returning master hostname for rs to use
+   HBASE-3829  TestMasterFailover failures in jenkins
 
   IMPROVEMENTS
    HBASE-3290  Max Compaction Size (Nicolas Spiegelberg via Stack)  

Modified: 
hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
URL: 
http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java?rev=1097676&r1=1097675&r2=1097676&view=diff
==============================================================================
--- 
hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
 (original)
+++ 
hbase/trunk/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
 Fri Apr 29 03:33:27 2011
@@ -599,11 +599,17 @@ public class HRegionServer implements HR
     }
 
     try {
-      // Try and register with the Master; tell it we are here.
-      while (!this.stopped) {
-        if (tryReportForDuty()) break;
-        LOG.warn("No response on reportForDuty. Sleeping and then retrying.");
-        this.sleeper.sleep();
+      // Try and register with the Master; tell it we are here.  Break if
+      // server is stopped or the clusterup flag is down or hdfs went wacky.
+      while (keepLooping()) {
+        MapWritable w = reportForDuty();
+        if (w == null) {
+          LOG.warn("reportForDuty failed; sleeping and then retrying.");
+          this.sleeper.sleep();
+        } else {
+          handleReportForDutyResponse(w);
+          break;
+        }
       }
 
       // We registered with the Master.  Go into run mode.
@@ -617,8 +623,10 @@ public class HRegionServer implements HR
             stop("Exiting; cluster shutdown set and not carrying any regions");
           } else if (!this.stopping) {
             this.stopping = true;
+            LOG.info("Closing user regions");
             closeUserRegions(this.abortRequested);
           } else if (this.stopping && LOG.isDebugEnabled()) {
+            LOG.info("Only meta regions remain open");
             if (!onlyMetaRegionsRemaining) {
               onlyMetaRegionsRemaining = isOnlyMetaRegionsRemaining();
             }
@@ -730,22 +738,19 @@ public class HRegionServer implements HR
     HServerLoad hsl = buildServerLoad();
     // Why we do this?
     this.requestCount.set(0);
-    while (!this.stopped) {
-      try {
-        
this.hbaseMaster.regionServerReport(this.serverNameFromMasterPOV.getBytes(), 
hsl);
-        break;
-      } catch (IOException ioe) {
-        if (ioe instanceof RemoteException) {
-          ioe = ((RemoteException)ioe).unwrapRemoteException();
-        }
-        if (ioe instanceof YouAreDeadException) {
-          // This will be caught and handled as a fatal error in run()
-          throw ioe;
-        }
-        // Couldn't connect to the master, get location from zk and reconnect
-        // Method blocks until new master is found or we are stopped
-        getMaster();
-      }
+    try {
+      
this.hbaseMaster.regionServerReport(this.serverNameFromMasterPOV.getBytes(), 
hsl);
+    } catch (IOException ioe) {
+      if (ioe instanceof RemoteException) {
+        ioe = ((RemoteException)ioe).unwrapRemoteException();
+      }
+      if (ioe instanceof YouAreDeadException) {
+        // This will be caught and handled as a fatal error in run()
+        throw ioe;
+      }
+      // Couldn't connect to the master, get location from zk and reconnect
+      // Method blocks until new master is found or we are stopped
+      getMaster();
     }
   }
 
@@ -1431,7 +1436,7 @@ public class HRegionServer implements HR
     Threads.shutdown(this.cacheFlusher);
     Threads.shutdown(this.compactSplitThread);
     Threads.shutdown(this.hlogRoller);
-    this.service.shutdown();
+    if (this.service != null) this.service.shutdown();
     if (this.replicationHandler != null) {
       this.replicationHandler.join();
     }
@@ -1448,16 +1453,14 @@ public class HRegionServer implements HR
   private ServerName getMaster() {
     ServerName masterServerName = null;
     while ((masterServerName = this.masterAddressManager.getMasterAddress()) 
== null) {
-      if (stopped) {
-        return null;
-      }
-      LOG.debug("No master found, will retry");
+      if (!keepLooping()) return null;
+      LOG.debug("No master found; retry");
       sleeper.sleep();
     }
     InetSocketAddress isa =
       new InetSocketAddress(masterServerName.getHostname(), 
masterServerName.getPort());
     HMasterRegionInterface master = null;
-    while (!stopped && master == null) {
+    while (keepLooping() && master == null) {
       LOG.info("Attempting connect to Master server at " +
         this.masterAddressManager.getMasterAddress());
       try {
@@ -1484,16 +1487,11 @@ public class HRegionServer implements HR
   }
 
   /**
-   * @return True if successfully invoked {@link #reportForDuty()}
-   * @throws IOException
+   * @return True if we should keep looping; false if this server has been
+   * stopped or the cluster is not up (cluster going down or hdfs has gone bad).
    */
-  private boolean tryReportForDuty() throws IOException {
-    MapWritable w = reportForDuty();
-    if (w != null) {
-      handleReportForDutyResponse(w);
-      return true;
-    }
-    return false;
+  private boolean keepLooping() {
+    return !this.stopped && isClusterUp();
   }
 
   /*
@@ -1504,36 +1502,27 @@ public class HRegionServer implements HR
    * @throws IOException
    */
   private MapWritable reportForDuty() throws IOException {
-    ServerName masterServerName = null;
-    while (!stopped && (masterServerName = getMaster()) == null) {
-      LOG.warn("Unable to get master for initialization -- sleeping");
-      sleeper.sleep();
-    }
     MapWritable result = null;
-    long lastMsg = 0;
-    while (!stopped) {
-      try {
-        this.requestCount.set(0);
-        LOG.info("Telling master at " + masterServerName + " that we are up " +
-          "with port=" + this.isa.getPort() + ", startcode=" + this.startcode);
-        lastMsg = EnvironmentEdgeManager.currentTimeMillis();
-        int port = this.isa.getPort();
-        result = this.hbaseMaster.regionServerStartup(port, this.startcode, 
lastMsg);
-        break;
-      } catch (RemoteException e) {
-        IOException ioe = e.unwrapRemoteException();
-        if (ioe instanceof ClockOutOfSyncException) {
-          LOG.fatal("Master rejected startup because clock is out of sync",
-              ioe);
-          // Re-throw IOE will cause RS to abort
-          throw ioe;
-        } else {
-          LOG.warn("remote error telling master we are up", e);
-        }
-      } catch (IOException e) {
-        LOG.warn("error telling master we are up", e);
+    ServerName masterServerName = getMaster();
+    if (masterServerName == null) return result;
+    try {
+      this.requestCount.set(0);
+      LOG.info("Telling master at " + masterServerName + " that we are up " +
+        "with port=" + this.isa.getPort() + ", startcode=" + this.startcode);
+      long now = EnvironmentEdgeManager.currentTimeMillis();
+      int port = this.isa.getPort();
+      result = this.hbaseMaster.regionServerStartup(port, this.startcode, now);
+    } catch (RemoteException e) {
+      IOException ioe = e.unwrapRemoteException();
+      if (ioe instanceof ClockOutOfSyncException) {
+        LOG.fatal("Master rejected startup because clock is out of sync", ioe);
+        // Re-throw IOE will cause RS to abort
+        throw ioe;
+      } else {
+        LOG.warn("remote error telling master we are up", e);
       }
-      sleeper.sleep(lastMsg);
+    } catch (IOException e) {
+      LOG.warn("error telling master we are up", e);
     }
     return result;
   }


Reply via email to