Author: hairong
Date: Thu Jan 8 10:37:55 2009
New Revision: 732788
URL: http://svn.apache.org/viewvc?rev=732788&view=rev
Log:
Merge -r 732776:732777 from trunk to move the change of HADOOP-4910 into branch 0.18.
Added:
hadoop/core/branches/branch-0.18/src/test/org/apache/hadoop/dfs/TestOverReplicatedBlocks.java
Modified:
hadoop/core/branches/branch-0.18/CHANGES.txt
hadoop/core/branches/branch-0.18/src/hdfs/org/apache/hadoop/dfs/FSNamesystem.java
hadoop/core/branches/branch-0.18/src/test/org/apache/hadoop/dfs/MiniDFSCluster.java
hadoop/core/branches/branch-0.18/src/test/org/apache/hadoop/dfs/TestDatanodeBlockScanner.java
Modified: hadoop/core/branches/branch-0.18/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/core/branches/branch-0.18/CHANGES.txt?rev=732788&r1=732787&r2=732788&view=diff
==============================================================================
--- hadoop/core/branches/branch-0.18/CHANGES.txt (original)
+++ hadoop/core/branches/branch-0.18/CHANGES.txt Thu Jan 8 10:37:55 2009
@@ -133,6 +133,9 @@
HADOOP-4971. A long (unexpected) delay at datanodes could make subsequent
block reports from many datanodes arrive at the same time. (Raghu Angadi)
+ HADOOP-4910. NameNode should exclude corrupt replicas when choosing
+ excessive replicas to delete to avoid data loss. (hairong)
+
Release 0.18.2 - 2008-11-03
BUG FIXES
Modified: hadoop/core/branches/branch-0.18/src/hdfs/org/apache/hadoop/dfs/FSNamesystem.java
URL: http://svn.apache.org/viewvc/hadoop/core/branches/branch-0.18/src/hdfs/org/apache/hadoop/dfs/FSNamesystem.java?rev=732788&r1=732787&r2=732788&view=diff
==============================================================================
--- hadoop/core/branches/branch-0.18/src/hdfs/org/apache/hadoop/dfs/FSNamesystem.java (original)
+++ hadoop/core/branches/branch-0.18/src/hdfs/org/apache/hadoop/dfs/FSNamesystem.java Thu Jan 8 10:37:55 2009
@@ -2963,13 +2963,17 @@
delNodeHint = null;
}
Collection<DatanodeDescriptor> nonExcess = new ArrayList<DatanodeDescriptor>();
+ Collection<DatanodeDescriptor> corruptNodes = corruptReplicas.getNodes(block);
for (Iterator<DatanodeDescriptor> it = blocksMap.nodeIterator(block);
it.hasNext();) {
DatanodeDescriptor cur = it.next();
Collection<Block> excessBlocks =
excessReplicateMap.get(cur.getStorageID());
if (excessBlocks == null || !excessBlocks.contains(block)) {
if (!cur.isDecommissionInProgress() && !cur.isDecommissioned()) {
- nonExcess.add(cur);
+ // exclude corrupt replicas
+ if (corruptNodes == null || !corruptNodes.contains(cur)) {
+ nonExcess.add(cur);
+ }
}
}
}
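
Viewed in isolation, the patched loop now filters a block's replica list through three checks before excess-replica deletion: the replica is not already marked excess, its node is not decommissioning or decommissioned, and (the HADOOP-4910 fix) it is not a known-corrupt replica. Below is a minimal sketch of that selection logic; the Node type is a hypothetical stand-in for DatanodeDescriptor, and the real code walks blocksMap, excessReplicateMap, and corruptReplicas rather than the plain collections used here.

    import java.util.ArrayList;
    import java.util.Collection;
    import java.util.List;
    import java.util.Set;

    class NonExcessSelection {
      /** Hypothetical stand-in for DatanodeDescriptor; illustration only. */
      static class Node {
        final String id;
        final boolean decommissioning; // collapses both decommission states
        Node(String id, boolean decommissioning) {
          this.id = id;
          this.decommissioning = decommissioning;
        }
      }

      /** Mirrors the patched loop: only healthy, non-excess replicas become
       *  deletion candidates, so a corrupt replica can no longer crowd a
       *  valid one out of the block's replica set. */
      static List<Node> selectNonExcess(Collection<Node> replicas,
                                        Set<Node> excess,
                                        Set<Node> corrupt) {
        List<Node> nonExcess = new ArrayList<Node>();
        for (Node cur : replicas) {
          if (excess != null && excess.contains(cur)) continue;   // already excess
          if (cur.decommissioning) continue;                      // node being drained
          if (corrupt != null && corrupt.contains(cur)) continue; // the new check
          nonExcess.add(cur);
        }
        return nonExcess;
      }
    }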
Modified: hadoop/core/branches/branch-0.18/src/test/org/apache/hadoop/dfs/MiniDFSCluster.java
URL: http://svn.apache.org/viewvc/hadoop/core/branches/branch-0.18/src/test/org/apache/hadoop/dfs/MiniDFSCluster.java?rev=732788&r1=732787&r2=732788&view=diff
==============================================================================
--- hadoop/core/branches/branch-0.18/src/test/org/apache/hadoop/dfs/MiniDFSCluster.java (original)
+++ hadoop/core/branches/branch-0.18/src/test/org/apache/hadoop/dfs/MiniDFSCluster.java Thu Jan 8 10:37:55 2009
@@ -624,7 +624,7 @@
/*
* Restart a particular datanode
*/
- synchronized boolean restartDataNode(int i) throws IOException {
+ public synchronized boolean restartDataNode(int i) throws IOException {
DataNodeProperties dnprop = stopDataNode(i);
if (dnprop == null) {
return false;
Modified: hadoop/core/branches/branch-0.18/src/test/org/apache/hadoop/dfs/TestDatanodeBlockScanner.java
URL: http://svn.apache.org/viewvc/hadoop/core/branches/branch-0.18/src/test/org/apache/hadoop/dfs/TestDatanodeBlockScanner.java?rev=732788&r1=732787&r2=732788&view=diff
==============================================================================
--- hadoop/core/branches/branch-0.18/src/test/org/apache/hadoop/dfs/TestDatanodeBlockScanner.java (original)
+++ hadoop/core/branches/branch-0.18/src/test/org/apache/hadoop/dfs/TestDatanodeBlockScanner.java Thu Jan 8 10:37:55 2009
@@ -141,7 +141,7 @@
cluster.shutdown();
}
- boolean corruptReplica(String blockName, int replica) throws IOException {
+ public static boolean corruptReplica(String blockName, int replica) throws IOException {
Random random = new Random();
File baseDir = new File(System.getProperty("test.build.data"), "dfs/data");
boolean corrupted = false;
@@ -420,7 +420,7 @@
}
}
- private void truncateReplica(String blockName, int dnIndex) throws IOException {
+ private static void truncateReplica(String blockName, int dnIndex) throws IOException {
File baseDir = new File(System.getProperty("test.build.data"), "dfs/data");
for (int i=dnIndex*2; i<dnIndex*2+2; i++) {
File blockFile = new File(baseDir, "data" + (i+1)+ "/current/" +
@@ -434,7 +434,7 @@
}
}
- private void waitForBlockDeleted(String blockName, int dnIndex)
+ private static void waitForBlockDeleted(String blockName, int dnIndex)
throws IOException, InterruptedException {
File baseDir = new File(System.getProperty("test.build.data"), "dfs/data");
File blockFile1 = new File(baseDir, "data" + (2*dnIndex+1)+ "/current/" +
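
With corruptReplica now public and static, other tests can flip bytes in a stored replica without instantiating this test case. The added TestOverReplicatedBlocks below uses exactly this call pattern:

    // from another test in the same tree (see the added file below)
    Block block = DFSTestUtil.getFirstBlock(fs, fileName);
    TestDatanodeBlockScanner.corruptReplica(block.getBlockName(), 0);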
Added: hadoop/core/branches/branch-0.18/src/test/org/apache/hadoop/dfs/TestOverReplicatedBlocks.java
URL: http://svn.apache.org/viewvc/hadoop/core/branches/branch-0.18/src/test/org/apache/hadoop/dfs/TestOverReplicatedBlocks.java?rev=732788&view=auto
==============================================================================
--- hadoop/core/branches/branch-0.18/src/test/org/apache/hadoop/dfs/TestOverReplicatedBlocks.java (added)
+++ hadoop/core/branches/branch-0.18/src/test/org/apache/hadoop/dfs/TestOverReplicatedBlocks.java Thu Jan 8 10:37:55 2009
@@ -0,0 +1,68 @@
+package org.apache.hadoop.dfs;
+
+import java.io.File;
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+import junit.framework.TestCase;
+
+public class TestOverReplicatedBlocks extends TestCase {
+ /** Test that processOverReplicatedBlock handles corrupt replicas correctly.
+ * It makes sure that corrupt replicas are not treated as valid ones,
+ * which would otherwise let the NN delete valid replicas while keeping
+ * corrupt ones.
+ */
+ public void testProcessOverReplicatedBlock() throws IOException {
+ Configuration conf = new Configuration();
+ conf.setLong("dfs.blockreport.intervalMsec", 1000L);
+ conf.set("dfs.replication.pending.timeout.sec", Integer.toString(2));
+ MiniDFSCluster cluster = new MiniDFSCluster(conf, 3, true, null);
+ FileSystem fs = cluster.getFileSystem();
+
+ try {
+ final Path fileName = new Path("/foo1");
+ DFSTestUtil.createFile(fs, fileName, 2, (short)3, 0L);
+ DFSTestUtil.waitReplication(fs, fileName, (short)3);
+
+ // corrupt the block on datanode 0
+ Block block = DFSTestUtil.getFirstBlock(fs, fileName);
+ TestDatanodeBlockScanner.corruptReplica(block.getBlockName(), 0);
+ File scanLog = new File(System.getProperty("test.build.data"),
+ "dfs/data/data1/current/dncp_block_verification.log.curr");
+ assertTrue(scanLog.delete());
+ // restart the datanode so the corrupt replica will be detected
+ cluster.restartDataNode(0);
+ DFSTestUtil.waitReplication(fs, fileName, (short)2);
+
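+ // after the restart, MiniDFSCluster re-appended the bounced (corrupt)
+ // datanode to its datanode list, so it now sits at index 2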
+ final DatanodeID corruptDataNode = cluster.getDataNodes().get(2).dnRegistration;
+ final FSNamesystem namesystem = FSNamesystem.getFSNamesystem();
+ synchronized (namesystem.heartbeats) {
+ // set each remaining live datanode's free space to 0 so those
+ // nodes will be chosen for deletion when over-replication occurs
+ for (DatanodeDescriptor datanode : namesystem.heartbeats) {
+ if (!corruptDataNode.equals(datanode)) {
+ datanode.updateHeartbeat(100L, 100L, 0L, 0);
+ }
+ }
+
+ // decrease the replication factor to 1
+ namesystem.setReplication(fileName.toString(), (short)1);
+
+ // the corrupt replica should not be chosen as the excess one;
+ // without HADOOP-4910 the live replica count would drop to 0 and
+ // the block would be lost
+ assertEquals(1, namesystem.countNodes(block).liveReplicas());
+ }
+ } finally {
+ cluster.shutdown();
+ }
+ }
+}
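
For reference, the new test can be run on its own with the branch's Ant build; assuming the stock build.xml test target and its testcase property, something like:

    ant test -Dtestcase=TestOverReplicatedBlocks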