Author: szetszwo
Date: Mon Jun 3 07:12:06 2013
New Revision: 1488865
URL: http://svn.apache.org/r1488865
Log:
HDFS-4261. Fix bugs in Balancer causing infinite loop and
TestBalancerWithNodeGroup timing out. Contributed by Junping Du
Modified:
hadoop/common/branches/branch-1/CHANGES.txt
hadoop/common/branches/branch-1/src/hdfs/org/apache/hadoop/hdfs/server/balancer/Balancer.java
hadoop/common/branches/branch-1/src/test/org/apache/hadoop/hdfs/server/balancer/TestBalancerWithNodeGroup.java
Modified: hadoop/common/branches/branch-1/CHANGES.txt
URL:
http://svn.apache.org/viewvc/hadoop/common/branches/branch-1/CHANGES.txt?rev=1488865&r1=1488864&r2=1488865&view=diff
==============================================================================
--- hadoop/common/branches/branch-1/CHANGES.txt (original)
+++ hadoop/common/branches/branch-1/CHANGES.txt Mon Jun 3 07:12:06 2013
@@ -58,6 +58,9 @@ Release 1.3.0 - unreleased
HADOOP-8981. TestMetricsSystemImpl fails on Windows. (Xuan Gong, backported
by Chris Nauroth via suresh)
+ HDFS-4261. Fix bugs in Balancer causing infinite loop and
+ TestBalancerWithNodeGroup timing out. (Junping Du via szetszwo)
+
Release 1.2.1 - Unreleased
INCOMPATIBLE CHANGES
Modified:
hadoop/common/branches/branch-1/src/hdfs/org/apache/hadoop/hdfs/server/balancer/Balancer.java
URL:
http://svn.apache.org/viewvc/hadoop/common/branches/branch-1/src/hdfs/org/apache/hadoop/hdfs/server/balancer/Balancer.java?rev=1488865&r1=1488864&r2=1488865&view=diff
==============================================================================
---
hadoop/common/branches/branch-1/src/hdfs/org/apache/hadoop/hdfs/server/balancer/Balancer.java
(original)
+++
hadoop/common/branches/branch-1/src/hdfs/org/apache/hadoop/hdfs/server/balancer/Balancer.java
Mon Jun 3 07:12:06 2013
@@ -193,6 +193,8 @@ public class Balancer implements Tool {
*/
public static final int MAX_NUM_CONCURRENT_MOVES = 5;
+ public static final int MAX_NO_PENDING_BLOCK_ITERATIONS = 5;
+
private Configuration conf;
private double threshold = 10D;
@@ -746,6 +748,7 @@ public class Balancer implements Tool {
long startTime = Util.now();
this.blocksToReceive = 2*scheduledSize;
boolean isTimeUp = false;
+ int noPendingBlockIteration = 0;
while(!isTimeUp && scheduledSize > 0 &&
(!srcBlockList.isEmpty() || blocksToReceive > 0)) {
PendingBlockMove pendingBlock = chooseNextBlockToMove();
@@ -769,7 +772,15 @@ public class Balancer implements Tool {
LOG.warn(StringUtils.stringifyException(e));
return;
}
- }
+ } else {
+ // source node cannot find a pendingBlockToMove, iteration +1
+ noPendingBlockIteration++;
+ // in case no blocks can be moved for source node's task,
+ // jump out of while-loop after 5 iterations.
+ if (noPendingBlockIteration >= MAX_NO_PENDING_BLOCK_ITERATIONS) {
+ scheduledSize = 0;
+ }
+ }
// check if time is up or not
if (Util.now()-startTime > MAX_ITERATION_TIME) {
@@ -1496,7 +1507,11 @@ public class Balancer implements Tool {
Formatter formatter = new Formatter(System.out);
System.out.println("Time Stamp Iteration# Bytes Already Moved Bytes Left To Move Bytes Being Moved");
int iterations = 0;
+
while (true) {
+ // clean all lists at the beginning of balancer iteration.
+ resetData();
+
/* get all live datanodes of a cluster and their disk usage
* decide the number of bytes need to be moved
*/
@@ -1547,9 +1562,6 @@ public class Balancer implements Tool {
return NO_MOVE_PROGRESS;
}
}
-
- // clean all lists
- resetData();
try {
Thread.sleep(2*conf.getLong("dfs.heartbeat.interval", 3));
Modified:
hadoop/common/branches/branch-1/src/test/org/apache/hadoop/hdfs/server/balancer/TestBalancerWithNodeGroup.java
URL:
http://svn.apache.org/viewvc/hadoop/common/branches/branch-1/src/test/org/apache/hadoop/hdfs/server/balancer/TestBalancerWithNodeGroup.java?rev=1488865&r1=1488864&r2=1488865&view=diff
==============================================================================
---
hadoop/common/branches/branch-1/src/test/org/apache/hadoop/hdfs/server/balancer/TestBalancerWithNodeGroup.java
(original)
+++
hadoop/common/branches/branch-1/src/test/org/apache/hadoop/hdfs/server/balancer/TestBalancerWithNodeGroup.java
Mon Jun 3 07:12:06 2013
@@ -216,7 +216,7 @@ public class TestBalancerWithNodeGroup {
* to n0 or n1 as balancer policy with node group. Thus, we expect the balancer
* to end in 5 iterations without move block process.
*/
- @Test
+ @Test(timeout=60000)
public void testBalancerEndInNoMoveProgress() throws Exception {
Configuration conf = createConf();
long[] capacities = new long[]{CAPACITY, CAPACITY, CAPACITY, CAPACITY};
@@ -255,7 +255,7 @@ public class TestBalancerWithNodeGroup {
* Create a cluster with even distribution, and a new empty node is added to
* the cluster, then test rack locality for balancer policy.
*/
- @Test
+ @Test(timeout=60000)
public void testBalancerWithRackLocality() throws Exception {
Configuration conf = createConf();
long[] capacities = new long[]{CAPACITY, CAPACITY};
@@ -294,7 +294,7 @@ public class TestBalancerWithNodeGroup {
totalCapacity += newCapacity;
// run balancer and validate results
- runBalancer(conf, totalUsedSpace, totalCapacity);
+ runBalancerCanFinish(conf, totalUsedSpace, totalCapacity);
DatanodeInfo[] datanodeReport =
client.getDatanodeReport(DatanodeReportType.ALL);
@@ -321,7 +321,7 @@ public class TestBalancerWithNodeGroup {
/** Create a cluster with even distribution, and a new empty node is added to
* the cluster, then test rack locality for balancer policy.
**/
- @Test
+ @Test(timeout=60000)
public void testBalancerWithNodeGroup() throws Exception {
Configuration conf = createConf();
long[] capacities = new long[]{CAPACITY, CAPACITY};