Author: jimk
Date: Sun Jan 13 15:39:01 2008
New Revision: 611681

URL: http://svn.apache.org/viewvc?rev=611681&view=rev
Log:
HADOOP-2500 Unreadable region kills region servers
HADOOP-2587 Splits blocked by compactions cause region to be offline for 
duration of compaction. 
Fix bug in TestCompaction in which two mini dfs clusters were being started for 
the same test.
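
For readers of this diff: a minimal, illustrative sketch of the close-with-listener path that the HADOOP-2587 change below introduces. The region tells a RegionUnavailableListener when it starts closing (no further updates accepted) and again once it is fully closed, so a split no longer has to take the region offline up front. The callback shapes, the Text-typed region name, and the listener's package are assumed from the surrounding code, not copied from the repository.

  // Illustrative only -- not part of this commit. Signatures assumed from
  // the HRegion diff below (closing()/closed() callbacks keyed by Text name).
  import org.apache.hadoop.io.Text;
  import org.apache.hadoop.hbase.RegionUnavailableListener;

  public class LoggingRegionListener implements RegionUnavailableListener {
    /** Region holds all row locks and will accept no further updates. */
    public void closing(final Text regionName) {
      System.out.println("closing " + regionName);
    }

    /** Region's stores are closed; split or compaction work may proceed. */
    public void closed(final Text regionName) {
      System.out.println("closed " + regionName);
    }
  }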

Modified:
    lucene/hadoop/trunk/src/contrib/hbase/CHANGES.txt
    lucene/hadoop/trunk/src/contrib/hbase/src/java/org/apache/hadoop/hbase/HMaster.java
    lucene/hadoop/trunk/src/contrib/hbase/src/java/org/apache/hadoop/hbase/HRegion.java
    lucene/hadoop/trunk/src/contrib/hbase/src/java/org/apache/hadoop/hbase/HRegionServer.java
    lucene/hadoop/trunk/src/contrib/hbase/src/test/org/apache/hadoop/hbase/TestTimestamp.java

Modified: lucene/hadoop/trunk/src/contrib/hbase/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/contrib/hbase/CHANGES.txt?rev=611681&r1=611680&r2=611681&view=diff
==============================================================================
--- lucene/hadoop/trunk/src/contrib/hbase/CHANGES.txt (original)
+++ lucene/hadoop/trunk/src/contrib/hbase/CHANGES.txt Sun Jan 13 15:39:01 2008
@@ -118,6 +118,9 @@
    HADOOP-2530 Missing type in new hbase custom RPC serializer
    HADOOP-2490 Failure in nightly #346 (Added debugging of hudson failures).
    HADOOP-2558 fixes for build up on hudson (part 1, part 2, part 3, part 4)
+   HADOOP-2500 Unreadable region kills region servers
+   HADOOP-2587 Splits blocked by compactions cause region to be offline for
+               duration of compaction. 
    
   IMPROVEMENTS
    HADOOP-2401 Add convenience put method that takes writable

Modified: lucene/hadoop/trunk/src/contrib/hbase/src/java/org/apache/hadoop/hbase/HMaster.java
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/contrib/hbase/src/java/org/apache/hadoop/hbase/HMaster.java?rev=611681&r1=611680&r2=611681&view=diff
==============================================================================
--- lucene/hadoop/trunk/src/contrib/hbase/src/java/org/apache/hadoop/hbase/HMaster.java (original)
+++ lucene/hadoop/trunk/src/contrib/hbase/src/java/org/apache/hadoop/hbase/HMaster.java Sun Jan 13 15:39:01 2008
@@ -464,10 +464,10 @@
           )
         ) {
 
-        // The current assignment is no good
+        // The current assignment is invalid
         if (LOG.isDebugEnabled()) {
           LOG.debug("Current assignment of " + info.getRegionName() +
-            " is no good: storedInfo: " + storedInfo + ", startCode: " +
+            " is not valid: storedInfo: " + storedInfo + ", startCode: " +
             startCode + ", storedInfo.startCode: " +
             ((storedInfo != null)? storedInfo.getStartCode(): -1) +
             ", unassignedRegions: " + unassignedRegions.containsKey(info) +
@@ -963,7 +963,9 @@
    */
   void unassignRootRegion() {
     this.rootRegionLocation.set(null);
-    this.unassignedRegions.put(HRegionInfo.rootRegionInfo, ZERO_L);
+    if (!this.shutdownRequested) {
+      this.unassignedRegions.put(HRegionInfo.rootRegionInfo, ZERO_L);
+    }
   }
 
   /**
@@ -1622,10 +1624,15 @@
 
           // Root region
 
+          if (region.isOffline()) {
+            // Can't proceed without root region. Shutdown.
+            LOG.fatal("root region is marked offline");
+            shutdown();
+          }
           unassignRootRegion();
 
         } else {
-          boolean reassignRegion = true;
+          boolean reassignRegion = !region.isOffline();
           boolean deleteRegion = false;
 
           if (killedRegions.remove(region.getRegionName())) {

Modified: lucene/hadoop/trunk/src/contrib/hbase/src/java/org/apache/hadoop/hbase/HRegion.java
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/contrib/hbase/src/java/org/apache/hadoop/hbase/HRegion.java?rev=611681&r1=611680&r2=611681&view=diff
==============================================================================
--- lucene/hadoop/trunk/src/contrib/hbase/src/java/org/apache/hadoop/hbase/HRegion.java (original)
+++ lucene/hadoop/trunk/src/contrib/hbase/src/java/org/apache/hadoop/hbase/HRegion.java Sun Jan 13 15:39:01 2008
@@ -353,7 +353,7 @@
    * @throws IOException
    */
   public List<HStoreFile> close() throws IOException {
-    return close(false);
+    return close(false, null);
   }
   
   /**
@@ -364,13 +364,15 @@
    * time-sensitive thread.
    * 
    * @param abort true if server is aborting (only during testing)
+   * @param listener call back to alert caller on close status
    * @return Vector of all the storage files that the HRegion's component 
    * HStores make use of.  It's a list of HStoreFile objects.  Can be null if
    * we are not to close at this time or we are already closed.
    * 
    * @throws IOException
    */
-  List<HStoreFile> close(boolean abort) throws IOException {
+  List<HStoreFile> close(boolean abort,
+      final RegionUnavailableListener listener) throws IOException {
     if (isClosed()) {
       LOG.info("region " + this.regionInfo.getRegionName() + " already 
closed");
       return null;
@@ -410,6 +412,13 @@
         // outstanding updates.
         waitOnRowLocks();
 
+        if (listener != null) {
+          // If there is a listener, let them know that we have now
+          // acquired all the necessary locks and are starting to
+          // do the close
+          listener.closing(getRegionName());
+        }
+        
         // Don't flush the cache if we are aborting
         if (!abort) {
           internalFlushcache(snapshotMemcaches());
@@ -420,6 +429,13 @@
           result.addAll(store.close());
         }
         this.closed.set(true);
+        
+        if (listener != null) {
+          // If there is a listener, tell them that the region is now 
+          // closed.
+          listener.closed(getRegionName());
+        }
+        
         LOG.info("closed " + this.regionInfo.getRegionName());
         return result;
       } finally {
@@ -553,17 +569,10 @@
         throw new IOException("Cannot split; target file collision at " + 
dirB);
       }
 
-      // Notify the caller that we are about to close the region. This moves
-      // us to the 'retiring' queue. Means no more updates coming in -- just
-      // whatever is outstanding.
-      if (listener != null) {
-        listener.closing(getRegionName());
-      }
-
       // Now close the HRegion.  Close returns all store files or null if not
      // supposed to close (? What to do in this case? Implement abort of close?)
      // Close also does wait on outstanding rows and calls a flush just-in-case.
-      List<HStoreFile> hstoreFilesToSplit = close();
+      List<HStoreFile> hstoreFilesToSplit = close(false, listener);
       if (hstoreFilesToSplit == null) {
         LOG.warn("Close came back null (Implement abort of close?)");
         throw new RuntimeException("close returned empty vector of 
HStoreFiles");

Modified: lucene/hadoop/trunk/src/contrib/hbase/src/java/org/apache/hadoop/hbase/HRegionServer.java
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/contrib/hbase/src/java/org/apache/hadoop/hbase/HRegionServer.java?rev=611681&r1=611680&r2=611681&view=diff
==============================================================================
--- lucene/hadoop/trunk/src/contrib/hbase/src/java/org/apache/hadoop/hbase/HRegionServer.java (original)
+++ lucene/hadoop/trunk/src/contrib/hbase/src/java/org/apache/hadoop/hbase/HRegionServer.java Sun Jan 13 15:39:01 2008
@@ -1091,13 +1091,13 @@
   }
 
   /** Add to the outbound message buffer */
-  private void reportOpen(HRegion region) {
-    outboundMsgs.add(new HMsg(HMsg.MSG_REPORT_OPEN, region.getRegionInfo()));
+  private void reportOpen(HRegionInfo region) {
+    outboundMsgs.add(new HMsg(HMsg.MSG_REPORT_OPEN, region));
   }
 
   /** Add to the outbound message buffer */
-  private void reportClose(HRegion region) {
-    outboundMsgs.add(new HMsg(HMsg.MSG_REPORT_CLOSE, region.getRegionInfo()));
+  private void reportClose(HRegionInfo region) {
+    outboundMsgs.add(new HMsg(HMsg.MSG_REPORT_CLOSE, region));
   }
   
   /**
@@ -1222,7 +1222,14 @@
         
       } catch (IOException e) {
         LOG.error("error opening region " + regionInfo.getRegionName(), e);
-        reportClose(region);
+        
+        // Mark the region offline.
+        // TODO: add an extra field in HRegionInfo to indicate that there is
+        // an error. We can't do that now because that would be an incompatible
+        // change that would require a migration
+        
+        regionInfo.setOffline(true);
+        reportClose(regionInfo);
         return;
       }
       this.lock.writeLock().lock();
@@ -1232,7 +1239,7 @@
       } finally {
         this.lock.writeLock().unlock();
       }
-      reportOpen(region); 
+      reportOpen(regionInfo); 
     }
   }
 
@@ -1249,7 +1256,7 @@
     if(region != null) {
       region.close();
       if(reportWhenCompleted) {
-        reportClose(region);
+        reportClose(hri);
       }
     }
   }
@@ -1269,7 +1276,7 @@
         LOG.debug("closing region " + region.getRegionName());
       }
       try {
-        region.close(abortRequested);
+        region.close(abortRequested, null);
       } catch (IOException e) {
         LOG.error("error closing region " + region.getRegionName(),
           RemoteExceptionHandler.checkIOException(e));
@@ -1303,7 +1310,7 @@
         LOG.debug("closing region " + region.getRegionName());
       }
       try {
-        region.close(false);
+        region.close();
       } catch (IOException e) {
         LOG.error("error closing region " + region.getRegionName(),
           RemoteExceptionHandler.checkIOException(e));

Modified: lucene/hadoop/trunk/src/contrib/hbase/src/test/org/apache/hadoop/hbase/TestTimestamp.java
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/contrib/hbase/src/test/org/apache/hadoop/hbase/TestTimestamp.java?rev=611681&r1=611680&r2=611681&view=diff
==============================================================================
--- lucene/hadoop/trunk/src/contrib/hbase/src/test/org/apache/hadoop/hbase/TestTimestamp.java (original)
+++ lucene/hadoop/trunk/src/contrib/hbase/src/test/org/apache/hadoop/hbase/TestTimestamp.java Sun Jan 13 15:39:01 2008
@@ -105,7 +105,8 @@
    * @throws IOException
    */
   public void testTimestamps() throws IOException {
-    final MiniHBaseCluster cluster = new MiniHBaseCluster(this.conf, 1);
+    final MiniHBaseCluster cluster =
+      new MiniHBaseCluster(this.conf, 1, this.cluster, true);
     try {
       HTable t = createTable();
       Incommon incommon = new HTableIncommon(t);
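
An aside on the MiniHBaseCluster change above: the four-argument constructor lets a test hand in the MiniDFSCluster it already started, so MiniHBaseCluster does not spin up a second DFS (the "two mini dfs clusters" bug mentioned in the log). Below is a rough, hypothetical sketch of that pattern; the MiniDFSCluster constructor arguments, the meaning of the trailing boolean, and the shutdown() call are assumptions about the era's test classes, not taken from this patch.

  // Hypothetical sketch only -- not part of this commit.
  import java.io.IOException;
  import junit.framework.TestCase;
  import org.apache.hadoop.dfs.MiniDFSCluster;
  import org.apache.hadoop.hbase.HBaseConfiguration;
  import org.apache.hadoop.hbase.MiniHBaseCluster;

  public class TestSharedDfs extends TestCase {
    public void testSharedDfs() throws IOException {
      HBaseConfiguration conf = new HBaseConfiguration();
      // Assumed MiniDFSCluster constructor: (conf, numDataNodes, format, racks).
      MiniDFSCluster dfs = new MiniDFSCluster(conf, 2, true, (String[]) null);
      MiniHBaseCluster hbase = null;
      try {
        // Reuse the running DFS instead of letting MiniHBaseCluster start
        // its own; the trailing boolean is assumed to request cleanup of
        // test directories on shutdown.
        hbase = new MiniHBaseCluster(conf, 1, dfs, true);
        // ... test body ...
      } finally {
        if (hbase != null) {
          hbase.shutdown();
        }
      }
    }
  }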

