Author: jimk Date: Sun Jan 13 15:39:01 2008 New Revision: 611681 URL: http://svn.apache.org/viewvc?rev=611681&view=rev Log: HADOOP-2500 Unreadable region kills region servers. HADOOP-2587 Splits blocked by compactions cause region to be offline for duration of compaction. Fix bug in TestTimestamp (the log originally named TestCompaction, but TestTimestamp is the only test modified in this revision) in which two mini dfs clusters were being started for the same test.
Modified: lucene/hadoop/trunk/src/contrib/hbase/CHANGES.txt lucene/hadoop/trunk/src/contrib/hbase/src/java/org/apache/hadoop/hbase/HMaster.java lucene/hadoop/trunk/src/contrib/hbase/src/java/org/apache/hadoop/hbase/HRegion.java lucene/hadoop/trunk/src/contrib/hbase/src/java/org/apache/hadoop/hbase/HRegionServer.java lucene/hadoop/trunk/src/contrib/hbase/src/test/org/apache/hadoop/hbase/TestTimestamp.java Modified: lucene/hadoop/trunk/src/contrib/hbase/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/contrib/hbase/CHANGES.txt?rev=611681&r1=611680&r2=611681&view=diff ============================================================================== --- lucene/hadoop/trunk/src/contrib/hbase/CHANGES.txt (original) +++ lucene/hadoop/trunk/src/contrib/hbase/CHANGES.txt Sun Jan 13 15:39:01 2008 @@ -118,6 +118,9 @@ HADOOP-2530 Missing type in new hbase custom RPC serializer HADOOP-2490 Failure in nightly #346 (Added debugging of hudson failures). HADOOP-2558 fixes for build up on hudson (part 1, part 2, part 3, part 4) + HADOOP-2500 Unreadable region kills region servers + HADOOP-2587 Splits blocked by compactions cause region to be offline for + duration of compaction. 
IMPROVEMENTS HADOOP-2401 Add convenience put method that takes writable Modified: lucene/hadoop/trunk/src/contrib/hbase/src/java/org/apache/hadoop/hbase/HMaster.java URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/contrib/hbase/src/java/org/apache/hadoop/hbase/HMaster.java?rev=611681&r1=611680&r2=611681&view=diff ============================================================================== --- lucene/hadoop/trunk/src/contrib/hbase/src/java/org/apache/hadoop/hbase/HMaster.java (original) +++ lucene/hadoop/trunk/src/contrib/hbase/src/java/org/apache/hadoop/hbase/HMaster.java Sun Jan 13 15:39:01 2008 @@ -464,10 +464,10 @@ ) ) { - // The current assignment is no good + // The current assignment is invalid if (LOG.isDebugEnabled()) { LOG.debug("Current assignment of " + info.getRegionName() + - " is no good: storedInfo: " + storedInfo + ", startCode: " + + " is not valid: storedInfo: " + storedInfo + ", startCode: " + startCode + ", storedInfo.startCode: " + ((storedInfo != null)? storedInfo.getStartCode(): -1) + ", unassignedRegions: " + unassignedRegions.containsKey(info) + @@ -963,7 +963,9 @@ */ void unassignRootRegion() { this.rootRegionLocation.set(null); - this.unassignedRegions.put(HRegionInfo.rootRegionInfo, ZERO_L); + if (!this.shutdownRequested) { + this.unassignedRegions.put(HRegionInfo.rootRegionInfo, ZERO_L); + } } /** @@ -1622,10 +1624,15 @@ // Root region + if (region.isOffline()) { + // Can't proceed without root region. Shutdown. 
+ LOG.fatal("root region is marked offline"); + shutdown(); + } unassignRootRegion(); } else { - boolean reassignRegion = true; + boolean reassignRegion = !region.isOffline(); boolean deleteRegion = false; if (killedRegions.remove(region.getRegionName())) { Modified: lucene/hadoop/trunk/src/contrib/hbase/src/java/org/apache/hadoop/hbase/HRegion.java URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/contrib/hbase/src/java/org/apache/hadoop/hbase/HRegion.java?rev=611681&r1=611680&r2=611681&view=diff ============================================================================== --- lucene/hadoop/trunk/src/contrib/hbase/src/java/org/apache/hadoop/hbase/HRegion.java (original) +++ lucene/hadoop/trunk/src/contrib/hbase/src/java/org/apache/hadoop/hbase/HRegion.java Sun Jan 13 15:39:01 2008 @@ -353,7 +353,7 @@ * @throws IOException */ public List<HStoreFile> close() throws IOException { - return close(false); + return close(false, null); } /** @@ -364,13 +364,15 @@ * time-sensitive thread. * * @param abort true if server is aborting (only during testing) + * @param listener call back to alert caller on close status * @return Vector of all the storage files that the HRegion's component * HStores make use of. It's a list of HStoreFile objects. Can be null if * we are not to close at this time or we are already closed. * * @throws IOException */ - List<HStoreFile> close(boolean abort) throws IOException { + List<HStoreFile> close(boolean abort, + final RegionUnavailableListener listener) throws IOException { if (isClosed()) { LOG.info("region " + this.regionInfo.getRegionName() + " already closed"); return null; @@ -410,6 +412,13 @@ // outstanding updates. 
waitOnRowLocks(); + if (listener != null) { + // If there is a listener, let them know that we have now + // acquired all the necessary locks and are starting to + // do the close + listener.closing(getRegionName()); + } + // Don't flush the cache if we are aborting if (!abort) { internalFlushcache(snapshotMemcaches()); @@ -420,6 +429,13 @@ result.addAll(store.close()); } this.closed.set(true); + + if (listener != null) { + // If there is a listener, tell them that the region is now + // closed. + listener.closed(getRegionName()); + } + LOG.info("closed " + this.regionInfo.getRegionName()); return result; } finally { @@ -553,17 +569,10 @@ throw new IOException("Cannot split; target file collision at " + dirB); } - // Notify the caller that we are about to close the region. This moves - // us to the 'retiring' queue. Means no more updates coming in -- just - // whatever is outstanding. - if (listener != null) { - listener.closing(getRegionName()); - } - // Now close the HRegion. Close returns all store files or null if not // supposed to close (? What to do in this case? Implement abort of close?) // Close also does wait on outstanding rows and calls a flush just-in-case. 
- List<HStoreFile> hstoreFilesToSplit = close(); + List<HStoreFile> hstoreFilesToSplit = close(false, listener); if (hstoreFilesToSplit == null) { LOG.warn("Close came back null (Implement abort of close?)"); throw new RuntimeException("close returned empty vector of HStoreFiles"); Modified: lucene/hadoop/trunk/src/contrib/hbase/src/java/org/apache/hadoop/hbase/HRegionServer.java URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/contrib/hbase/src/java/org/apache/hadoop/hbase/HRegionServer.java?rev=611681&r1=611680&r2=611681&view=diff ============================================================================== --- lucene/hadoop/trunk/src/contrib/hbase/src/java/org/apache/hadoop/hbase/HRegionServer.java (original) +++ lucene/hadoop/trunk/src/contrib/hbase/src/java/org/apache/hadoop/hbase/HRegionServer.java Sun Jan 13 15:39:01 2008 @@ -1091,13 +1091,13 @@ } /** Add to the outbound message buffer */ - private void reportOpen(HRegion region) { - outboundMsgs.add(new HMsg(HMsg.MSG_REPORT_OPEN, region.getRegionInfo())); + private void reportOpen(HRegionInfo region) { + outboundMsgs.add(new HMsg(HMsg.MSG_REPORT_OPEN, region)); } /** Add to the outbound message buffer */ - private void reportClose(HRegion region) { - outboundMsgs.add(new HMsg(HMsg.MSG_REPORT_CLOSE, region.getRegionInfo())); + private void reportClose(HRegionInfo region) { + outboundMsgs.add(new HMsg(HMsg.MSG_REPORT_CLOSE, region)); } /** @@ -1222,7 +1222,14 @@ } catch (IOException e) { LOG.error("error opening region " + regionInfo.getRegionName(), e); - reportClose(region); + + // Mark the region offline. + // TODO: add an extra field in HRegionInfo to indicate that there is + // an error. 
We can't do that now because that would be an incompatible + // change that would require a migration + + regionInfo.setOffline(true); + reportClose(regionInfo); return; } this.lock.writeLock().lock(); @@ -1232,7 +1239,7 @@ } finally { this.lock.writeLock().unlock(); } - reportOpen(region); + reportOpen(regionInfo); } } @@ -1249,7 +1256,7 @@ if(region != null) { region.close(); if(reportWhenCompleted) { - reportClose(region); + reportClose(hri); } } } @@ -1269,7 +1276,7 @@ LOG.debug("closing region " + region.getRegionName()); } try { - region.close(abortRequested); + region.close(abortRequested, null); } catch (IOException e) { LOG.error("error closing region " + region.getRegionName(), RemoteExceptionHandler.checkIOException(e)); @@ -1303,7 +1310,7 @@ LOG.debug("closing region " + region.getRegionName()); } try { - region.close(false); + region.close(); } catch (IOException e) { LOG.error("error closing region " + region.getRegionName(), RemoteExceptionHandler.checkIOException(e)); Modified: lucene/hadoop/trunk/src/contrib/hbase/src/test/org/apache/hadoop/hbase/TestTimestamp.java URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/contrib/hbase/src/test/org/apache/hadoop/hbase/TestTimestamp.java?rev=611681&r1=611680&r2=611681&view=diff ============================================================================== --- lucene/hadoop/trunk/src/contrib/hbase/src/test/org/apache/hadoop/hbase/TestTimestamp.java (original) +++ lucene/hadoop/trunk/src/contrib/hbase/src/test/org/apache/hadoop/hbase/TestTimestamp.java Sun Jan 13 15:39:01 2008 @@ -105,7 +105,8 @@ * @throws IOException */ public void testTimestamps() throws IOException { - final MiniHBaseCluster cluster = new MiniHBaseCluster(this.conf, 1); + final MiniHBaseCluster cluster = + new MiniHBaseCluster(this.conf, 1, this.cluster, true); try { HTable t = createTable(); Incommon incommon = new HTableIncommon(t);