Author: dhruba
Date: Thu Nov 29 06:01:46 2007
New Revision: 599447

URL: http://svn.apache.org/viewvc?rev=599447&view=rev
Log:
HADOOP-2256. Fix a bug in the namenode that could cause it to encounter
an infinite loop while deleting excess replicas that were created by
block rebalancing. (Hairong Kuang via dhruba)
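The fix makes the delete-hint path of the excess-replica loop apply only on the first iteration, and only when the hint is actually among the candidates, so every pass is guaranteed to shrink the working set. Below is a minimal standalone Java sketch of that control flow (this is not the Hadoop code: the class and method names, the use of plain strings for datanodes, and the "pick the first candidate" rule standing in for the least-free-space choice are illustrative assumptions):

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;

/**
 * Sketch of the excess-replica selection loop fixed by HADOOP-2256.
 * Node names stand in for DatanodeDescriptors; the control flow is
 * what matters, not the selection policy.
 */
public class ExcessReplicaSketch {

  static List<String> chooseExcessReplicas(Collection<String> nonExcess,
                                           int replication,
                                           String delNodeHint) {
    List<String> toDelete = new ArrayList<String>();
    boolean firstOne = true;                   // the HADOOP-2256 guard
    while (nonExcess.size() - replication > 0) {
      String cur;
      // Before the fix, the hint could be chosen on every iteration even when
      // it was not in nonExcess; nonExcess.remove(cur) then removed nothing
      // and the loop never made progress.
      if (firstOne && delNodeHint != null && nonExcess.contains(delNodeHint)) {
        cur = delNodeHint;
      } else {
        cur = nonExcess.iterator().next();     // stand-in for "least free space"
      }
      firstOne = false;
      nonExcess.remove(cur);                   // the candidate set now shrinks on every pass
      toDelete.add(cur);
    }
    return toDelete;
  }

  public static void main(String[] args) {
    Collection<String> nonExcess =
        new ArrayList<String>(Arrays.asList("dn1", "dn2", "dn3", "dn4"));
    // Hint a node that is NOT among the candidates -- the case that used to spin forever.
    System.out.println(chooseExcessReplicas(nonExcess, 3, "dn9"));
  }
}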
Modified:
    lucene/hadoop/trunk/CHANGES.txt
    lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/FSNamesystem.java
    lucene/hadoop/trunk/src/test/org/apache/hadoop/dfs/TestBlockReplacement.java

Modified: lucene/hadoop/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/CHANGES.txt?rev=599447&r1=599446&r2=599447&view=diff
==============================================================================
--- lucene/hadoop/trunk/CHANGES.txt (original)
+++ lucene/hadoop/trunk/CHANGES.txt Thu Nov 29 06:01:46 2007
@@ -165,6 +165,10 @@
     fails to allocate any datanodes for newly allocated block.
     (Dhruba Borthakur via dhruba)
 
+    HADOOP-2256. Fix a bug in the namenode that could cause it to encounter
+    an infinite loop while deleting excess replicas that were created by
+    block rebalancing. (Hairong Kuang via dhruba)
+
 Branch 0.15 (unreleased)
 
   BUG FIXES

Modified: lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/FSNamesystem.java
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/FSNamesystem.java?rev=599447&r1=599446&r2=599447&view=diff
==============================================================================
--- lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/FSNamesystem.java (original)
+++ lucene/hadoop/trunk/src/java/org/apache/hadoop/dfs/FSNamesystem.java Thu Nov 29 06:01:46 2007
@@ -2431,13 +2431,14 @@
     // pick one node to delete that favors the delete hint
     // otherwise pick one with least space from priSet if it is not empty
     // otherwise one node with least space from remains
+    boolean firstOne = true;
     while (nonExcess.size() - replication > 0) {
       DatanodeInfo cur = null;
       long minSpace = Long.MAX_VALUE;
 
       // check if we can del delNodeHint
-      if( delNodeHint !=null && (priSet.contains(delNodeHint) ||
-          (addedNode != null && !priSet.contains(addedNode))) ) {
+      if (firstOne && delNodeHint !=null && nonExcess.contains(delNodeHint) &&
+          (priSet.contains(delNodeHint) || (addedNode != null && !priSet.contains(addedNode))) ) {
         cur = delNodeHint;
       } else { // regular excessive replica removal
         Iterator<DatanodeDescriptor> iter =
@@ -2453,6 +2454,7 @@
         }
       }
 
+      firstOne = false;
       // adjust rackmap, priSet, and remains
       String rack = cur.getNetworkLocation();
       ArrayList<DatanodeDescriptor> datanodes = rackMap.get(rack);

Modified: lucene/hadoop/trunk/src/test/org/apache/hadoop/dfs/TestBlockReplacement.java
URL: http://svn.apache.org/viewvc/lucene/hadoop/trunk/src/test/org/apache/hadoop/dfs/TestBlockReplacement.java?rev=599447&r1=599446&r2=599447&view=diff
==============================================================================
--- lucene/hadoop/trunk/src/test/org/apache/hadoop/dfs/TestBlockReplacement.java (original)
+++ lucene/hadoop/trunk/src/test/org/apache/hadoop/dfs/TestBlockReplacement.java Thu Nov 29 06:01:46 2007
@@ -26,6 +26,8 @@
 import java.util.List;
 import java.util.Random;
 
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.dfs.FSConstants.DatanodeReportType;
 import org.apache.hadoop.fs.FileSystem;
@@ -37,6 +39,9 @@
  * This class tests if block replacement request to data nodes work correctly.
  */
 public class TestBlockReplacement extends TestCase {
+  private static final Log LOG = LogFactory.getLog(
+      "org.apache.hadoop.dfs.TestBlockReplacement");
+
   MiniDFSCluster cluster;
   public void testThrottler() throws IOException {
     Configuration conf = new Configuration();
@@ -136,15 +141,22 @@
 
     // start to replace the block
     // case 1: proxySource does not contain the block
+    LOG.info("Testcase 1: Proxy " + newNode.getName() +
+        " does not contain the block " + b.getBlockName() );
     assertFalse(replaceBlock(b, source, newNode, proxies.get(0)));
     // case 2: destination contains the block
+    LOG.info("Testcase 2: Destination " + proxies.get(1).getName() +
+        " contains the block " + b.getBlockName() );
     assertFalse(replaceBlock(b, source, proxies.get(0), proxies.get(1)));
     // case 3: correct case
+    LOG.info("Testcase 3: Proxy=" + source.getName() + " source=" +
+        proxies.get(0).getName() + " destination=" + newNode.getName() );
     assertTrue(replaceBlock(b, source, proxies.get(0), newNode));
     // block locations should contain two proxies and newNode
     checkBlocks(source, fileName.toString(),
         DEFAULT_BLOCK_SIZE, REPLICATION_FACTOR);
     // case 4: proxies.get(1) is not a valid del hint
+    LOG.info("Testcase 4: invalid del hint " + proxies.get(0).getName() );
     assertTrue(replaceBlock(b, proxies.get(1), proxies.get(0), source));
     /* block locations should contain two proxies and source;
      * newNode was deleted.
@@ -161,22 +173,26 @@
       long fileLen, short replFactor) throws IOException {
     Boolean notDone;
     do {
+      try {
+        Thread.sleep(100);
+      } catch(InterruptedException e) {
+      }
       List<LocatedBlock> blocks = cluster.getNameNode().
         getBlockLocations(fileName, 0, fileLen).getLocatedBlocks();
       assertEquals(1, blocks.size());
       DatanodeInfo[] nodes = blocks.get(0).getLocations();
       notDone = (nodes.length != replFactor);
-      if(!notDone) {
-        for(DatanodeInfo node:nodes) {
-          if(node.equals(excludedNode) ) {
-            notDone=true;
-            try {
-              Thread.sleep(100);
-            } catch(InterruptedException e) {
-            }
-            break;
-          }
-        }
-      }
+      if (notDone) {
+        LOG.info("Expected replication factor is " + replFactor +
+            " but the real replication factor is " + nodes.length );
+      } else {
+        for( DatanodeInfo node : nodes) {
+          if (node.equals(excludedNode) ) {
+            notDone=true;
+            LOG.info("Unexpected block location " + excludedNode.getName() );
+            break;
+          }
+        }
+      }
     } while(notDone);
   }
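The reworked checkBlocks() loop in the test now sleeps before every poll of the block locations and logs why it is still waiting, instead of sleeping only when the excluded node is seen. A minimal standalone sketch of that wait-and-recheck pattern follows (not Hadoop code; the class name and the simulated location source are assumptions made for illustration):

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

/**
 * Sketch of the polling pattern: back off, re-read the block locations,
 * and keep waiting while the replication factor is wrong or the excluded
 * node still appears among the locations.
 */
public class WaitForReplicationSketch {

  // Stand-in for cluster.getNameNode().getBlockLocations(...): here the
  // simulated cluster converges by dropping the excluded node on the first poll.
  private static List<String> locations =
      new ArrayList<String>(Arrays.asList("dn1", "dn2", "dn3", "dn4"));

  private static List<String> getLocations() {
    if (locations.size() > 3) {
      locations.remove("dn4");        // simulate the excess replica being deleted
    }
    return locations;
  }

  static void waitForReplication(int replFactor, String excludedNode)
      throws InterruptedException {
    boolean notDone;
    do {
      Thread.sleep(100);              // back off before every check, as the fixed test does
      List<String> nodes = getLocations();
      notDone = nodes.size() != replFactor || nodes.contains(excludedNode);
    } while (notDone);
  }

  public static void main(String[] args) throws InterruptedException {
    waitForReplication(3, "dn4");
    System.out.println("replication settled at " + locations);
  }
}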