Author: stack
Date: Fri Oct  1 23:02:18 2010
New Revision: 1003699

URL: http://svn.apache.org/viewvc?rev=1003699&view=rev
Log:
HBASE-3068 IllegalStateException when new server comes online, is given 200 
regions to open and 200th region gets timed out of regions in transition

Modified:
    hbase/trunk/CHANGES.txt
    
hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java

Modified: hbase/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/hbase/trunk/CHANGES.txt?rev=1003699&r1=1003698&r2=1003699&view=diff
==============================================================================
--- hbase/trunk/CHANGES.txt (original)
+++ hbase/trunk/CHANGES.txt Fri Oct  1 23:02:18 2010
@@ -559,6 +559,9 @@ Release 0.21.0 - Unreleased
    HBASE-3057  Race condition when closing regions that causes flakiness in
                TestRestartCluster
    HBASE-3058  Fix REST tests on trunk
+   HBASE-3068  IllegalStateException when new server comes online, is given
+               200 regions to open and 200th region gets timed out of regions
+               in transition
 
   IMPROVEMENTS
    HBASE-1760  Cleanup TODOs in HTable

Modified: 
hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
URL: 
http://svn.apache.org/viewvc/hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java?rev=1003699&r1=1003698&r2=1003699&view=diff
==============================================================================
--- 
hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java 
(original)
+++ 
hbase/trunk/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java 
Fri Oct  1 23:02:18 2010
@@ -34,6 +34,8 @@ import java.util.NavigableMap;
 import java.util.Set;
 import java.util.SortedMap;
 import java.util.TreeMap;
+import java.util.concurrent.ConcurrentNavigableMap;
+import java.util.concurrent.ConcurrentSkipListMap;
 import java.util.concurrent.Executors;
 
 import org.apache.commons.logging.Log;
@@ -92,10 +94,10 @@ public class AssignmentManager extends Z
     new TreeMap<String, RegionState>();
 
   /** Plans for region movement. Key is the encoded version of a region name*/
-  // TODO: When do plans get cleaned out?  Ever?
-  // Its cleaned on server shutdown processing -- St.Ack
-  private final Map<String, RegionPlan> regionPlans =
-    new TreeMap<String, RegionPlan>();
+  // TODO: When do plans get cleaned out?  Ever? In server open and in server
+  // shutdown processing -- St.Ack
+  private final ConcurrentNavigableMap<String, RegionPlan> regionPlans =
+    new ConcurrentSkipListMap<String, RegionPlan>();
 
   /** Set of tables that have been disabled. */
   private final Set<String> disabledTables =
@@ -494,6 +496,40 @@ public class AssignmentManager extends Z
       this.regions.put(regionInfo, serverInfo);
       addToServers(serverInfo, regionInfo);
     }
+    // Remove plan if one.
+    this.regionPlans.remove(regionInfo.getEncodedName());
+    // Update timers for all regions in transition going against this server.
+    updateTimers(serverInfo);
+  }
+
+  /**
+   * Touch timers for all regions in transition that have the passed
+   * <code>hsi</code> in common.
+   * Call this method whenever a server checks in.  Doing so helps the case 
where
+   * a new regionserver has joined the cluster and its been given 1k regions to
+   * open.  If this method is tickled every time the region reports in a
+   * successful open then the 1k-th region won't be timed out just because its
+   * sitting behind the open of 999 other regions.  This method is NOT used
+   * as part of bulk assign -- there we have a different mechanism for 
extending
+   * the regions in transition timer (we turn it off temporarily -- because
+   * there is no regionplan involved when bulk assigning.
+   * @param hsi
+   */
+  private void updateTimers(final HServerInfo hsi) {
+    // This loop could be expensive
+    for (Map.Entry<String, RegionPlan> e: this.regionPlans.entrySet()) {
+      if (e.getValue().getDestination().equals(hsi)) {
+        RegionState rs = null;
+        synchronized (this.regionsInTransition) {
+          rs = this.regionsInTransition.get(e.getKey());
+        }
+        if (rs != null) {
+          synchronized (rs) {
+            rs.update(rs.getState());
+          }
+        }
+      }
+    }
   }
 
   /**
@@ -510,15 +546,7 @@ public class AssignmentManager extends Z
         this.regionsInTransition.notifyAll();
       }
     }
-    synchronized(this.regions) {
-      HServerInfo serverInfo = this.regions.remove(regionInfo);
-      if (serverInfo != null) {
-        List<HRegionInfo> serverRegions = this.servers.get(serverInfo);
-        serverRegions.remove(regionInfo);
-      } else {
-        LOG.warn("Asked offline a region that was not online: " + regionInfo);
-      }
-    }
+    setOffline(regionInfo);
   }
 
   /**
@@ -529,10 +557,14 @@ public class AssignmentManager extends Z
    * @param regionInfo
    */
   public void setOffline(HRegionInfo regionInfo) {
-    synchronized(regions) {
-      HServerInfo serverInfo = regions.remove(regionInfo);
-      List<HRegionInfo> serverRegions = servers.get(serverInfo);
-      serverRegions.remove(regionInfo);
+    synchronized (this.regions) {
+      HServerInfo serverInfo = this.regions.remove(regionInfo);
+      if (serverInfo != null) {
+        List<HRegionInfo> serverRegions = this.servers.get(serverInfo);
+        serverRegions.remove(regionInfo);
+      } else {
+        LOG.warn("Asked offline a region that was not online: " + regionInfo);
+      }
     }
   }
 
@@ -602,6 +634,10 @@ public class AssignmentManager extends Z
     LOG.debug("Bulk assigning done for " + destination.getServerName());
   }
 
+  /**
+   * @param region
+   * @return
+   */
   private RegionState addToRegionsInTransition(final HRegionInfo region) {
     synchronized (regionsInTransition) {
       return forceRegionStateToOffline(region);
@@ -620,6 +656,9 @@ public class AssignmentManager extends Z
     if (state == null) {
       state = new RegionState(region, RegionState.State.OFFLINE);
       this.regionsInTransition.put(encodedName, state);
+    } else {
+      LOG.debug("Forcing OFFLINE; was=" + state);
+      state.update(RegionState.State.OFFLINE);
     }
     return state;
   }
@@ -644,9 +683,7 @@ public class AssignmentManager extends Z
         plan.getDestination(), t);
       // Clean out plan we failed execute and one that doesn't look like it'll
       // succeed anyways; we need a new plan!
-      synchronized(regionPlans) {
-        this.regionPlans.remove(state.getRegion().getEncodedName());
-      }
+      this.regionPlans.remove(state.getRegion().getEncodedName());
     }
   }
 
@@ -662,9 +699,8 @@ public class AssignmentManager extends Z
       this.master.abort("Unexpected state trying to OFFLINE; " + state,
         new IllegalStateException());
       return false;
-    } else {
-      state.update(RegionState.State.OFFLINE);
     }
+    state.update(RegionState.State.OFFLINE);
     try {
       if(!ZKAssign.createOrForceNodeOffline(master.getZooKeeper(),
           state.getRegion(), master.getServerName())) {
@@ -679,23 +715,28 @@ public class AssignmentManager extends Z
     return true;
   }
 
+  /**
+   * @param state
+   * @return Plan for passed <code>state</code> (If none currently, it creates 
one)
+   */
   RegionPlan getRegionPlan(final RegionState state) {
     // Pickup existing plan or make a new one
     String encodedName = state.getRegion().getEncodedName();
-    RegionPlan plan;
-    synchronized (regionPlans) {
-      plan = regionPlans.get(encodedName);
-      if (plan == null) {
-        LOG.debug("No previous transition plan for " +
-          state.getRegion().getRegionNameAsString() +
-          " so generating a random one; " + 
serverManager.countOfRegionServers() +
-          " (online=" + serverManager.getOnlineServers().size() + ") available 
servers");
-        plan = new RegionPlan(state.getRegion(), null,
-          LoadBalancer.randomAssignment(serverManager.getOnlineServersList()));
-        regionPlans.put(encodedName, plan);
-      } else {
-        LOG.debug("Using preexisting plan=" + plan);
-      }
+    RegionPlan newPlan = new RegionPlan(state.getRegion(), null,
+      LoadBalancer.randomAssignment(serverManager.getOnlineServersList()));
+    RegionPlan existingPlan = regionPlans.putIfAbsent(encodedName, newPlan);
+    RegionPlan plan = null;
+    if (existingPlan == null) {
+      LOG.debug("No previous transition plan for " +
+        state.getRegion().getRegionNameAsString() +
+        " so generated a random one; " + newPlan + "; " +
+        serverManager.countOfRegionServers() +
+        " (online=" + serverManager.getOnlineServers().size() +
+        ") available servers");
+      plan = newPlan;
+    } else {
+      LOG.debug("Using preexisting plan=" + existingPlan);
+      plan = existingPlan;
     }
     return plan;
   }
@@ -1096,9 +1137,16 @@ public class AssignmentManager extends Z
                 break;
               case PENDING_OPEN:
               case OPENING:
-                LOG.info("Region has been PENDING_OPEN or OPENING for too " +
+                LOG.info("Region has been PENDING_OPEN  or OPENING for too " +
                   "long, reassigning region=" +
-                   regionInfo.getRegionNameAsString());
+                  regionInfo.getRegionNameAsString());
+                // TODO: Possible RACE in here if RS is right now sending us an
+                // OPENED to handle.  Otherwise, after our call to assign, 
which
+                // forces zk state to OFFLINE, any actions by RS should cause
+                // it abort its open w/ accompanying LOG.warns coming out of 
the
+                // handleRegion method below.
+                AssignmentManager.this.setOffline(regionState.getRegion());
+                regionState.update(RegionState.State.OFFLINE);
                 assign(regionState.getRegion());
                 break;
               case OPEN:
@@ -1125,14 +1173,12 @@ public class AssignmentManager extends Z
    */
   public void processServerShutdown(final HServerInfo hsi) {
     // Clean out any exisiting assignment plans for this server
-    synchronized (this.regionPlans) {
-      for (Iterator <Map.Entry<String, RegionPlan>> i =
-          this.regionPlans.entrySet().iterator(); i.hasNext();) {
-        Map.Entry<String, RegionPlan> e = i.next();
-        if (e.getValue().getDestination().equals(hsi)) {
-          // Use iterator's remove else we'll get CME
-          i.remove();
-        }
+    for (Iterator <Map.Entry<String, RegionPlan>> i =
+        this.regionPlans.entrySet().iterator(); i.hasNext();) {
+      Map.Entry<String, RegionPlan> e = i.next();
+      if (e.getValue().getDestination().equals(hsi)) {
+        // Use iterator's remove else we'll get CME
+        i.remove();
       }
     }
     // Remove assignment info related to the downed server.  Remove the downed
@@ -1234,9 +1280,7 @@ public class AssignmentManager extends Z
    * @param plan Plan to execute.
    */
   void balance(final RegionPlan plan) {
-    synchronized (this.regionPlans) {
-      this.regionPlans.put(plan.getRegionName(), plan);
-    }
+    this.regionPlans.put(plan.getRegionName(), plan);
     unassign(plan.getRegionInfo());
   }
 


Reply via email to