[02/50] [abbrv] hadoop git commit: HDFS-11499. Decommissioning stuck because of failing recovery. Contributed by Lukas Majercak and Manoj Govindassamy.

2017-03-16 Thread inigoiri
HDFS-11499. Decommissioning stuck because of failing recovery. Contributed by 
Lukas Majercak and Manoj Govindassamy.


Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/385d2cb7
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/385d2cb7
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/385d2cb7

Branch: refs/heads/HDFS-10467
Commit: 385d2cb777a0272ac20c62336c944fad295d5d12
Parents: 570827a
Author: Masatake Iwasaki 
Authored: Thu Mar 9 13:30:33 2017 +0900
Committer: Masatake Iwasaki 
Committed: Thu Mar 9 21:13:50 2017 +0900

--
 .../server/blockmanagement/BlockManager.java| 10 +++-
 .../apache/hadoop/hdfs/TestDecommission.java| 48 ++
 .../hadoop/hdfs/TestMaintenanceState.java   | 51 
 3 files changed, 108 insertions(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/hadoop/blob/385d2cb7/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
--
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
index 9ec28f9..5dc40fa 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
@@ -891,7 +891,15 @@ public class BlockManager implements BlockStatsMXBean {
       lastBlock.getUnderConstructionFeature()
           .updateStorageScheduledSize((BlockInfoStriped) lastBlock);
     }
-    if (hasMinStorage(lastBlock)) {
+
+    // Count replicas on decommissioning nodes, as these will not be
+    // decommissioned unless recovery/completing last block has finished
+    NumberReplicas numReplicas = countNodes(lastBlock);
+    int numUsableReplicas = numReplicas.liveReplicas() +
+        numReplicas.decommissioning() +
+        numReplicas.liveEnteringMaintenanceReplicas();
+
+    if (hasMinStorage(lastBlock, numUsableReplicas)) {
       if (committed) {
         addExpectedReplicasToPending(lastBlock);
       }
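
The guard change reads more clearly in isolation. The following is a minimal,
self-contained model of the fix -- the enum and class here are hypothetical
stand-ins, not the real HDFS types -- showing why replicas on decommissioning
(and, on trunk, entering-maintenance) nodes are safe to count: those nodes
cannot finish leaving service until the last block is committed, so excluding
them produces exactly the deadlock HDFS-11499 describes.

import java.util.List;

/** Hypothetical stand-in for the states tracked by HDFS's NumberReplicas. */
enum ReplicaState { LIVE, DECOMMISSIONING, ENTERING_MAINTENANCE, DEAD }

public class UsableReplicaModel {

  /** Pre-HDFS-11499 guard: only fully live replicas count. */
  static boolean canCompleteOld(List<ReplicaState> replicas, int minReplication) {
    long live = replicas.stream()
        .filter(r -> r == ReplicaState.LIVE).count();
    return live >= minReplication;
  }

  /** Post-HDFS-11499 guard: decommissioning and entering-maintenance
   *  replicas are usable too; their nodes stay in service until the
   *  last block is committed/completed. */
  static boolean canCompleteNew(List<ReplicaState> replicas, int minReplication) {
    long usable = replicas.stream()
        .filter(r -> r == ReplicaState.LIVE
            || r == ReplicaState.DECOMMISSIONING
            || r == ReplicaState.ENTERING_MAINTENANCE)
        .count();
    return usable >= minReplication;
  }

  public static void main(String[] args) {
    // Every replica of the open file's last block sits on a node being
    // decommissioned -- the scenario the new regression test reproduces.
    List<ReplicaState> replicas = List.of(ReplicaState.DECOMMISSIONING,
        ReplicaState.DECOMMISSIONING, ReplicaState.DECOMMISSIONING);
    System.out.println("old guard: " + canCompleteOld(replicas, 1)); // false -> stuck
    System.out.println("new guard: " + canCompleteNew(replicas, 1)); // true  -> completes
  }
}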

http://git-wip-us.apache.org/repos/asf/hadoop/blob/385d2cb7/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java
--
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java
index 94e8946..dc0edcc 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java
@@ -33,6 +33,7 @@ import java.util.concurrent.ExecutionException;
 import com.google.common.base.Supplier;
 import com.google.common.collect.Lists;
 import org.apache.hadoop.fs.BlockLocation;
+import org.apache.hadoop.fs.CommonConfigurationKeys;
 import org.apache.hadoop.fs.FSDataOutputStream;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
@@ -646,6 +647,53 @@ public class TestDecommission extends AdminStatesBaseTest {
 
     fdos.close();
   }
+
+  @Test(timeout = 360000)
+  public void testDecommissionWithOpenFileAndBlockRecovery()
+      throws IOException, InterruptedException {
+    startCluster(1, 6);
+    getCluster().waitActive();
+
+    Path file = new Path("/testRecoveryDecommission");
+
+    // Create a file and never close the output stream to trigger recovery
+    DistributedFileSystem dfs = getCluster().getFileSystem();
+    FSDataOutputStream out = dfs.create(file, true,
+        getConf().getInt(CommonConfigurationKeys.IO_FILE_BUFFER_SIZE_KEY, 4096),
+        (short) 3, blockSize);
+
+    // Write data to the file
+    long writtenBytes = 0;
+    while (writtenBytes < fileSize) {
+      out.writeLong(writtenBytes);
+      writtenBytes += 8;
+    }
+    out.hsync();
+
+    DatanodeInfo[] lastBlockLocations = NameNodeAdapter.getBlockLocations(
+      getCluster().getNameNode(), "/testRecoveryDecommission", 0, fileSize)
+      .getLastLocatedBlock().getLocations();
+
+    // Decommission all nodes of the last block
+    ArrayList<String> toDecom = new ArrayList<>();
+    for (DatanodeInfo dnDecom : lastBlockLocations) {
+      toDecom.add(dnDecom.getXferAddr());
+    }
+    initExcludeHosts(toDecom);
+    refreshNodes(0);
+
+    // Make sure hard lease expires to trigger replica recovery
+    getCluster().setLeasePeriod(300L, 300L);
+
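
A note on the idiom at the end of the snippet above: setLeasePeriod(300L, 300L)
shrinks the NameNode's soft and hard lease limits to 300 ms, so the writer of
the still-open file is treated as expired almost immediately and the NameNode
starts lease recovery -- and with it block recovery on the uncommitted last
block. A minimal sketch of that trigger outside the test harness, assuming only
a MiniDFSCluster (the path, payload, and sleep interval are illustrative):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.MiniDFSCluster;

public class LeaseRecoveryTrigger {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    MiniDFSCluster cluster =
        new MiniDFSCluster.Builder(conf).numDataNodes(3).build();
    try {
      cluster.waitActive();
      DistributedFileSystem dfs = cluster.getFileSystem();

      // Open a file, sync some data, and deliberately never close the
      // stream: the last block stays under construction.
      FSDataOutputStream out = dfs.create(new Path("/open-file"));
      out.writeLong(42L);
      out.hsync();

      // Expire the writer's lease almost immediately so the NameNode's
      // lease monitor kicks off block recovery on the open last block.
      cluster.setLeasePeriod(300L, 300L);
      Thread.sleep(2000); // give the lease monitor a chance to run
    } finally {
      cluster.shutdown();
    }
  }
}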

hadoop git commit: HDFS-11499. Decommissioning stuck because of failing recovery. Contributed by Lukas Majercak and Manoj Govindassamy.

2017-03-13 Thread weichiu
Repository: hadoop
Updated Branches:
  refs/heads/branch-2.7 ef99e5ed8 -> 830a60237


HDFS-11499. Decommissioning stuck because of failing recovery. Contributed by 
Lukas Majercak and Manoj Govindassamy.

(cherry picked from commit 385d2cb777a0272ac20c62336c944fad295d5d12)

 Conflicts:

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java

(cherry picked from commit 60be2e5d8a1a6a8921c68f8b0f428b55152d05db)

 Conflicts:

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java


Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/830a6023
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/830a6023
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/830a6023

Branch: refs/heads/branch-2.7
Commit: 830a602375ee4055c84b998734290ded78b68d70
Parents: ef99e5e
Author: Wei-Chiu Chuang 
Authored: Mon Mar 13 13:45:12 2017 -0700
Committer: Wei-Chiu Chuang 
Committed: Mon Mar 13 13:45:12 2017 -0700

--
 hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt |  3 ++
 .../server/blockmanagement/BlockManager.java|  7 ++-
 .../apache/hadoop/hdfs/TestDecommission.java| 48 
 3 files changed, 57 insertions(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/hadoop/blob/830a6023/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
--
diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
index 3234fc2..fb3186f 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
+++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
@@ -229,6 +229,9 @@ Release 2.7.4 - UNRELEASED
 
     HDFS-11379. DFSInputStream may infinite loop requesting block locations. Contributed by Daryn Sharp.
 
+    HDFS-11499. Decommissioning stuck because of failing recovery.
+    Contributed by Lukas Majercak and Manoj Govindassamy.
+
 Release 2.7.3 - 2016-08-25
 
   INCOMPATIBLE CHANGES

http://git-wip-us.apache.org/repos/asf/hadoop/blob/830a6023/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
--
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
index b4b5b5f..cc6c881 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
@@ -640,7 +640,12 @@ public class BlockManager {
 
     final boolean b = commitBlock(
         (BlockInfoContiguousUnderConstruction) lastBlock, commitBlock);
-    if(countNodes(lastBlock).liveReplicas() >= minReplication)
+
+    // Count replicas on decommissioning nodes, as these will not be
+    // decommissioned unless recovery/completing last block has finished
+    NumberReplicas numReplicas = countNodes(lastBlock);
+    if(numReplicas.liveReplicas() + numReplicas.decommissioning() >=
+        minReplication)
       completeBlock(bc, bc.numBlocks()-1, iip, false);
     return b;
   }
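
Note that this branch-2.7 variant counts only live and decommissioning replicas
and compares directly against minReplication; the liveEnteringMaintenanceReplicas()
term and the hasMinStorage() helper from the trunk patch are absent, presumably
because branch-2.7 predates the datanode maintenance state and that helper.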

http://git-wip-us.apache.org/repos/asf/hadoop/blob/830a6023/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java
--
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java
index 5e892d7..7d8cc59 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java
@@ -869,6 +869,54 @@ public class TestDecommission {
 
     fdos.close();
   }
+
+  @Test(timeout = 360000)
+  public void testDecommissionWithOpenFileAndBlockRecovery()
+      throws IOException, InterruptedException {
+    startCluster(1, 6, conf);
+    cluster.waitActive();
+
+    Path file = new Path("/testRecoveryDecommission");
+
+    // Create a file and never close the output stream to trigger recovery
+    DistributedFileSystem dfs = cluster.getFileSystem();
+    FSNamesystem ns = cluster.getNamesystem(0);
+    FSDataOutputStream out = dfs.create(file, true,
+        conf.getInt(CommonConfigurationKeys.IO_FILE_BUFFER_SIZE_KEY, 4096),
+        (short) 3, blockSize);
+
+    // Write data to the file
+    long 

hadoop git commit: HDFS-11499. Decommissioning stuck because of failing recovery. Contributed by Lukas Majercak and Manoj Govindassamy.

2017-03-13 Thread weichiu
Repository: hadoop
Updated Branches:
  refs/heads/branch-2.8 72fc7e052 -> 851ba7d9d


HDFS-11499. Decommissioning stuck because of failing recovery. Contributed by 
Lukas Majercak and Manoj Govindassamy.

(cherry picked from commit 385d2cb777a0272ac20c62336c944fad295d5d12)

 Conflicts:

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java

(cherry picked from commit 60be2e5d8a1a6a8921c68f8b0f428b55152d05db)

 Conflicts:

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java


Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/851ba7d9
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/851ba7d9
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/851ba7d9

Branch: refs/heads/branch-2.8
Commit: 851ba7d9d1a7a0b8a0bd86d3ad14bffc781a0316
Parents: 72fc7e0
Author: Wei-Chiu Chuang 
Authored: Mon Mar 13 13:41:13 2017 -0700
Committer: Wei-Chiu Chuang 
Committed: Mon Mar 13 13:43:00 2017 -0700

--
 .../server/blockmanagement/BlockManager.java|  7 ++-
 .../apache/hadoop/hdfs/TestDecommission.java| 48 
 2 files changed, 54 insertions(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/hadoop/blob/851ba7d9/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
--
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
index a929c43..858a54f 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
@@ -736,7 +736,12 @@ public class BlockManager implements BlockStatsMXBean {
       return false; // already completed (e.g. by syncBlock)
 
     final boolean b = commitBlock(lastBlock, commitBlock);
-    if (countNodes(lastBlock).liveReplicas() >= minReplication) {
+
+    // Count replicas on decommissioning nodes, as these will not be
+    // decommissioned unless recovery/completing last block has finished
+    NumberReplicas numReplicas = countNodes(lastBlock);
+    if (numReplicas.liveReplicas() + numReplicas.decommissioning() >=
+        minReplication) {
       if (b) {
         addExpectedReplicasToPending(lastBlock, bc);
       }

http://git-wip-us.apache.org/repos/asf/hadoop/blob/851ba7d9/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java
--
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java
index 1d5ebbf..78f6221 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java
@@ -873,6 +873,54 @@ public class TestDecommission {
 
     fdos.close();
   }
+
+  @Test(timeout = 360000)
+  public void testDecommissionWithOpenFileAndBlockRecovery()
+      throws IOException, InterruptedException {
+    startCluster(1, 6, conf);
+    cluster.waitActive();
+
+    Path file = new Path("/testRecoveryDecommission");
+
+    // Create a file and never close the output stream to trigger recovery
+    DistributedFileSystem dfs = cluster.getFileSystem();
+    FSNamesystem ns = cluster.getNamesystem(0);
+    FSDataOutputStream out = dfs.create(file, true,
+        conf.getInt(CommonConfigurationKeys.IO_FILE_BUFFER_SIZE_KEY, 4096),
+        (short) 3, blockSize);
+
+    // Write data to the file
+    long writtenBytes = 0;
+    while (writtenBytes < fileSize) {
+      out.writeLong(writtenBytes);
+      writtenBytes += 8;
+    }
+    out.hsync();
+
+    DatanodeInfo[] lastBlockLocations = NameNodeAdapter.getBlockLocations(
+      cluster.getNameNode(), "/testRecoveryDecommission", 0, fileSize)
+      .getLastLocatedBlock().getLocations();
+
+    // Decommission all nodes of the last block
+    ArrayList<String> toDecom = new ArrayList<>();
+    for (DatanodeInfo dnDecom : lastBlockLocations) {
+      toDecom.add(dnDecom.getXferAddr());
+    }
+    writeConfigFile(excludeFile, toDecom);
+    refreshNodes(ns, conf);
+
+    // Make sure hard lease expires to trigger replica recovery
+    cluster.setLeasePeriod(300L, 300L);
+    Thread.sleep(2 * 

[30/50] [abbrv] hadoop git commit: HDFS-11499. Decommissioning stuck because of failing recovery. Contributed by Lukas Majercak and Manoj Govindassamy.

2017-03-13 Thread stevel
HDFS-11499. Decommissioning stuck because of failing recovery. Contributed by 
Lukas Majercak and Manoj Govindassamy.


Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/385d2cb7
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/385d2cb7
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/385d2cb7

Branch: refs/heads/HADOOP-13345
Commit: 385d2cb777a0272ac20c62336c944fad295d5d12
Parents: 570827a
Author: Masatake Iwasaki 
Authored: Thu Mar 9 13:30:33 2017 +0900
Committer: Masatake Iwasaki 
Committed: Thu Mar 9 21:13:50 2017 +0900

--
 .../server/blockmanagement/BlockManager.java| 10 +++-
 .../apache/hadoop/hdfs/TestDecommission.java| 48 ++
 .../hadoop/hdfs/TestMaintenanceState.java   | 51 
 3 files changed, 108 insertions(+), 1 deletion(-)
--



[28/33] hadoop git commit: HDFS-11499. Decommissioning stuck because of failing recovery. Contributed by Lukas Majercak and Manoj Govindassamy.

2017-03-09 Thread jhung
HDFS-11499. Decommissioning stuck because of failing recovery. Contributed by 
Lukas Majercak and Manoj Govindassamy.


Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/385d2cb7
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/385d2cb7
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/385d2cb7

Branch: refs/heads/YARN-5734
Commit: 385d2cb777a0272ac20c62336c944fad295d5d12
Parents: 570827a
Author: Masatake Iwasaki 
Authored: Thu Mar 9 13:30:33 2017 +0900
Committer: Masatake Iwasaki 
Committed: Thu Mar 9 21:13:50 2017 +0900

--
 .../server/blockmanagement/BlockManager.java| 10 +++-
 .../apache/hadoop/hdfs/TestDecommission.java| 48 ++
 .../hadoop/hdfs/TestMaintenanceState.java   | 51 
 3 files changed, 108 insertions(+), 1 deletion(-)
--



hadoop git commit: HDFS-11499. Decommissioning stuck because of failing recovery. Contributed by Lukas Majercak and Manoj Govindassamy.

2017-03-09 Thread iwasakims
Repository: hadoop
Updated Branches:
  refs/heads/branch-2 4a1187238 -> 60be2e5d8


HDFS-11499. Decommissioning stuck because of failing recovery. Contributed by 
Lukas Majercak and Manoj Govindassamy.

(cherry picked from commit 385d2cb777a0272ac20c62336c944fad295d5d12)

 Conflicts:

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java


Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/60be2e5d
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/60be2e5d
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/60be2e5d

Branch: refs/heads/branch-2
Commit: 60be2e5d8a1a6a8921c68f8b0f428b55152d05db
Parents: 4a11872
Author: Masatake Iwasaki 
Authored: Thu Mar 9 13:30:33 2017 +0900
Committer: Masatake Iwasaki 
Committed: Thu Mar 9 23:37:04 2017 +0900

--
 .../server/blockmanagement/BlockManager.java| 10 +++-
 .../apache/hadoop/hdfs/TestDecommission.java| 48 ++
 .../hadoop/hdfs/TestMaintenanceState.java   | 51 
 3 files changed, 108 insertions(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/hadoop/blob/60be2e5d/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
--
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
index dad82d2..5d5706d 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java
@@ -801,7 +801,15 @@ public class BlockManager implements BlockStatsMXBean {
       return false; // already completed (e.g. by syncBlock)
 
     final boolean committed = commitBlock(lastBlock, commitBlock);
-    if (countNodes(lastBlock).liveReplicas() >= minReplication) {
+
+    // Count replicas on decommissioning nodes, as these will not be
+    // decommissioned unless recovery/completing last block has finished
+    NumberReplicas numReplicas = countNodes(lastBlock);
+    int numUsableReplicas = numReplicas.liveReplicas() +
+        numReplicas.decommissioning() +
+        numReplicas.liveEnteringMaintenanceReplicas();
+
+    if (numUsableReplicas >= minReplication) {
       if (committed) {
         addExpectedReplicasToPending(lastBlock);
       }

http://git-wip-us.apache.org/repos/asf/hadoop/blob/60be2e5d/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java
--
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java
index 5551782..b34f047 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java
@@ -33,6 +33,7 @@ import java.util.concurrent.ExecutionException;
 import com.google.common.base.Supplier;
 import com.google.common.collect.Lists;
 import org.apache.hadoop.fs.BlockLocation;
+import org.apache.hadoop.fs.CommonConfigurationKeys;
 import org.apache.hadoop.fs.FSDataOutputStream;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
@@ -647,6 +648,53 @@ public class TestDecommission extends AdminStatesBaseTest {
 
     fdos.close();
   }
+
+  @Test(timeout = 360000)
+  public void testDecommissionWithOpenFileAndBlockRecovery()
+      throws IOException, InterruptedException {
+    startCluster(1, 6);
+    getCluster().waitActive();
+
+    Path file = new Path("/testRecoveryDecommission");
+
+    // Create a file and never close the output stream to trigger recovery
+    DistributedFileSystem dfs = getCluster().getFileSystem();
+    FSDataOutputStream out = dfs.create(file, true,
+        getConf().getInt(CommonConfigurationKeys.IO_FILE_BUFFER_SIZE_KEY, 4096),
+        (short) 3, blockSize);
+
+    // Write data to the file
+    long writtenBytes = 0;
+    while (writtenBytes < fileSize) {
+      out.writeLong(writtenBytes);
+      writtenBytes += 8;
+    }
+    out.hsync();
+
+    DatanodeInfo[] lastBlockLocations = NameNodeAdapter.getBlockLocations(
+      getCluster().getNameNode(), "/testRecoveryDecommission", 0, fileSize)
+      .getLastLocatedBlock().getLocations();
+
+    // Decommission all nodes of the last block
+

hadoop git commit: HDFS-11499. Decommissioning stuck because of failing recovery. Contributed by Lukas Majercak and Manoj Govindassamy.

2017-03-09 Thread iwasakims
Repository: hadoop
Updated Branches:
  refs/heads/trunk 570827a81 -> 385d2cb77


HDFS-11499. Decommissioning stuck because of failing recovery. Contributed by 
Lukas Majercak and Manoj Govindassamy.


Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/385d2cb7
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/385d2cb7
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/385d2cb7

Branch: refs/heads/trunk
Commit: 385d2cb777a0272ac20c62336c944fad295d5d12
Parents: 570827a
Author: Masatake Iwasaki 
Authored: Thu Mar 9 13:30:33 2017 +0900
Committer: Masatake Iwasaki 
Committed: Thu Mar 9 21:13:50 2017 +0900

--
 .../server/blockmanagement/BlockManager.java| 10 +++-
 .../apache/hadoop/hdfs/TestDecommission.java| 48 ++
 .../hadoop/hdfs/TestMaintenanceState.java   | 51 
 3 files changed, 108 insertions(+), 1 deletion(-)
--

