[02/50] [abbrv] hadoop git commit: HDFS-11499. Decommissioning stuck because of failing recovery. Contributed by Lukas Majercak and Manoj Govindassamy.
HDFS-11499. Decommissioning stuck because of failing recovery. Contributed by Lukas Majercak and Manoj Govindassamy. Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/385d2cb7 Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/385d2cb7 Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/385d2cb7 Branch: refs/heads/HDFS-10467 Commit: 385d2cb777a0272ac20c62336c944fad295d5d12 Parents: 570827a Author: Masatake Iwasaki Authored: Thu Mar 9 13:30:33 2017 +0900 Committer: Masatake Iwasaki Committed: Thu Mar 9 21:13:50 2017 +0900 -- .../server/blockmanagement/BlockManager.java| 10 +++- .../apache/hadoop/hdfs/TestDecommission.java| 48 ++ .../hadoop/hdfs/TestMaintenanceState.java | 51 3 files changed, 108 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/hadoop/blob/385d2cb7/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java -- diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java index 9ec28f9..5dc40fa 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java @@ -891,7 +891,15 @@ public class BlockManager implements BlockStatsMXBean { lastBlock.getUnderConstructionFeature() .updateStorageScheduledSize((BlockInfoStriped) lastBlock); } -if (hasMinStorage(lastBlock)) { + +// Count replicas on decommissioning nodes, as these will not be +// decommissioned unless recovery/completing last block has finished +NumberReplicas numReplicas = countNodes(lastBlock); +int numUsableReplicas = numReplicas.liveReplicas() + +numReplicas.decommissioning() +
+numReplicas.liveEnteringMaintenanceReplicas(); + +if (hasMinStorage(lastBlock, numUsableReplicas)) { if (committed) { addExpectedReplicasToPending(lastBlock); } http://git-wip-us.apache.org/repos/asf/hadoop/blob/385d2cb7/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java -- diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java index 94e8946..dc0edcc 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java @@ -33,6 +33,7 @@ import java.util.concurrent.ExecutionException; import com.google.common.base.Supplier; import com.google.common.collect.Lists; import org.apache.hadoop.fs.BlockLocation; +import org.apache.hadoop.fs.CommonConfigurationKeys; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -646,6 +647,53 @@ public class TestDecommission extends AdminStatesBaseTest { fdos.close(); } + + @Test(timeout = 36) + public void testDecommissionWithOpenFileAndBlockRecovery() + throws IOException, InterruptedException { +startCluster(1, 6); +getCluster().waitActive(); + +Path file = new Path("/testRecoveryDecommission"); + +// Create a file and never close the output stream to trigger recovery +DistributedFileSystem dfs = getCluster().getFileSystem(); +FSDataOutputStream out = dfs.create(file, true, +getConf().getInt(CommonConfigurationKeys.IO_FILE_BUFFER_SIZE_KEY, 4096), +(short) 3, blockSize); + +// Write data to the file +long writtenBytes = 0; +while (writtenBytes < fileSize) { + out.writeLong(writtenBytes); + writtenBytes += 8; +} +out.hsync(); + +DatanodeInfo[] lastBlockLocations = NameNodeAdapter.getBlockLocations( + getCluster().getNameNode(), 
"/testRecoveryDecommission", 0, fileSize) + .getLastLocatedBlock().getLocations(); + +// Decommission all nodes of the last block +ArrayList toDecom = new ArrayList<>(); +for (DatanodeInfo dnDecom : lastBlockLocations) { + toDecom.add(dnDecom.getXferAddr()); +} +initExcludeHosts(toDecom); +refreshNodes(0); + +// Make sure hard lease expires to trigger replica recovery +getCluster().setLeasePeriod(300L, 300L); +
hadoop git commit: HDFS-11499. Decommissioning stuck because of failing recovery. Contributed by Lukas Majercak and Manoj Govindassamy.
Repository: hadoop Updated Branches: refs/heads/branch-2.7 ef99e5ed8 -> 830a60237 HDFS-11499. Decommissioning stuck because of failing recovery. Contributed by Lukas Majercak and Manoj Govindassamy. (cherry picked from commit 385d2cb777a0272ac20c62336c944fad295d5d12) Conflicts: hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java (cherry picked from commit 60be2e5d8a1a6a8921c68f8b0f428b55152d05db) Conflicts: hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/830a6023 Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/830a6023 Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/830a6023 Branch: refs/heads/branch-2.7 Commit: 830a602375ee4055c84b998734290ded78b68d70 Parents: ef99e5e Author: Wei-Chiu Chuang Authored: Mon Mar 13 13:45:12 2017 -0700 Committer: Wei-Chiu Chuang Committed: Mon Mar 13 13:45:12 2017 -0700 -- hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt | 3 ++ .../server/blockmanagement/BlockManager.java| 7 ++- .../apache/hadoop/hdfs/TestDecommission.java| 48 3 files changed, 57 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/hadoop/blob/830a6023/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt -- diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt index 3234fc2..fb3186f 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt +++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt @@ -229,6 +229,9 @@ Release 2.7.4 - UNRELEASED HDFS-11379. DFSInputStream may infinite loop requesting block locations. Contributed by Daryn Sharp. +HDFS-11499. Decommissioning stuck because of failing recovery. +Contributed by Lukas Majercak and Manoj Govindassamy.
+ Release 2.7.3 - 2016-08-25 INCOMPATIBLE CHANGES http://git-wip-us.apache.org/repos/asf/hadoop/blob/830a6023/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java -- diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java index b4b5b5f..cc6c881 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java @@ -640,7 +640,12 @@ public class BlockManager { final boolean b = commitBlock( (BlockInfoContiguousUnderConstruction) lastBlock, commitBlock); -if(countNodes(lastBlock).liveReplicas() >= minReplication) + +// Count replicas on decommissioning nodes, as these will not be +// decommissioned unless recovery/completing last block has finished +NumberReplicas numReplicas = countNodes(lastBlock); +if(numReplicas.liveReplicas() + numReplicas.decommissioning() >= +minReplication) completeBlock(bc, bc.numBlocks()-1, iip, false); return b; } http://git-wip-us.apache.org/repos/asf/hadoop/blob/830a6023/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java -- diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java index 5e892d7..7d8cc59 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java @@ -869,6 +869,54 @@ public class TestDecommission { fdos.close(); } + + @Test(timeout = 36) + public void testDecommissionWithOpenFileAndBlockRecovery() + throws 
IOException, InterruptedException { +startCluster(1, 6, conf); +cluster.waitActive(); + +Path file = new Path("/testRecoveryDecommission"); + +// Create a file and never close the output stream to trigger recovery +DistributedFileSystem dfs = cluster.getFileSystem(); +FSNamesystem ns = cluster.getNamesystem(0); +FSDataOutputStream out = dfs.create(file, true, +conf.getInt(CommonConfigurationKeys.IO_FILE_BUFFER_SIZE_KEY, 4096), +(short) 3, blockSize); + +// Write data to the file +long
hadoop git commit: HDFS-11499. Decommissioning stuck because of failing recovery. Contributed by Lukas Majercak and Manoj Govindassamy.
Repository: hadoop Updated Branches: refs/heads/branch-2.8 72fc7e052 -> 851ba7d9d HDFS-11499. Decommissioning stuck because of failing recovery. Contributed by Lukas Majercak and Manoj Govindassamy. (cherry picked from commit 385d2cb777a0272ac20c62336c944fad295d5d12) Conflicts: hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java (cherry picked from commit 60be2e5d8a1a6a8921c68f8b0f428b55152d05db) Conflicts: hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/851ba7d9 Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/851ba7d9 Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/851ba7d9 Branch: refs/heads/branch-2.8 Commit: 851ba7d9d1a7a0b8a0bd86d3ad14bffc781a0316 Parents: 72fc7e0 Author: Wei-Chiu Chuang Authored: Mon Mar 13 13:41:13 2017 -0700 Committer: Wei-Chiu Chuang Committed: Mon Mar 13 13:43:00 2017 -0700 -- .../server/blockmanagement/BlockManager.java| 7 ++- .../apache/hadoop/hdfs/TestDecommission.java| 48 2 files changed, 54 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/hadoop/blob/851ba7d9/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java -- diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java index a929c43..858a54f 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java @@ -736,7 +736,12 @@ public class BlockManager implements BlockStatsMXBean { return false; // already
completed (e.g. by syncBlock) final boolean b = commitBlock(lastBlock, commitBlock); -if (countNodes(lastBlock).liveReplicas() >= minReplication) { + +// Count replicas on decommissioning nodes, as these will not be +// decommissioned unless recovery/completing last block has finished +NumberReplicas numReplicas = countNodes(lastBlock); +if (numReplicas.liveReplicas() + numReplicas.decommissioning() >= +minReplication) { if (b) { addExpectedReplicasToPending(lastBlock, bc); } http://git-wip-us.apache.org/repos/asf/hadoop/blob/851ba7d9/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java -- diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java index 1d5ebbf..78f6221 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java @@ -873,6 +873,54 @@ public class TestDecommission { fdos.close(); } + + @Test(timeout = 36) + public void testDecommissionWithOpenFileAndBlockRecovery() + throws IOException, InterruptedException { +startCluster(1, 6, conf); +cluster.waitActive(); + +Path file = new Path("/testRecoveryDecommission"); + +// Create a file and never close the output stream to trigger recovery +DistributedFileSystem dfs = cluster.getFileSystem(); +FSNamesystem ns = cluster.getNamesystem(0); +FSDataOutputStream out = dfs.create(file, true, +conf.getInt(CommonConfigurationKeys.IO_FILE_BUFFER_SIZE_KEY, 4096), +(short) 3, blockSize); + +// Write data to the file +long writtenBytes = 0; +while (writtenBytes < fileSize) { + out.writeLong(writtenBytes); + writtenBytes += 8; +} +out.hsync(); + +DatanodeInfo[] lastBlockLocations = NameNodeAdapter.getBlockLocations( + cluster.getNameNode(), "/testRecoveryDecommission", 0, fileSize) + 
.getLastLocatedBlock().getLocations(); + +// Decommission all nodes of the last block +ArrayList toDecom = new ArrayList<>(); +for (DatanodeInfo dnDecom : lastBlockLocations) { + toDecom.add(dnDecom.getXferAddr()); +} +writeConfigFile(excludeFile, toDecom); +refreshNodes(ns, conf); + +// Make sure hard lease expires to trigger replica recovery +cluster.setLeasePeriod(300L, 300L); +Thread.sleep(2 *
[30/50] [abbrv] hadoop git commit: HDFS-11499. Decommissioning stuck because of failing recovery. Contributed by Lukas Majercak and Manoj Govindassamy.
HDFS-11499. Decommissioning stuck because of failing recovery. Contributed by Lukas Majercak and Manoj Govindassamy. Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/385d2cb7 Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/385d2cb7 Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/385d2cb7 Branch: refs/heads/HADOOP-13345 Commit: 385d2cb777a0272ac20c62336c944fad295d5d12 Parents: 570827a Author: Masatake Iwasaki Authored: Thu Mar 9 13:30:33 2017 +0900 Committer: Masatake Iwasaki Committed: Thu Mar 9 21:13:50 2017 +0900 -- .../server/blockmanagement/BlockManager.java| 10 +++- .../apache/hadoop/hdfs/TestDecommission.java| 48 ++ .../hadoop/hdfs/TestMaintenanceState.java | 51 3 files changed, 108 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/hadoop/blob/385d2cb7/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java -- diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java index 9ec28f9..5dc40fa 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java @@ -891,7 +891,15 @@ public class BlockManager implements BlockStatsMXBean { lastBlock.getUnderConstructionFeature() .updateStorageScheduledSize((BlockInfoStriped) lastBlock); } -if (hasMinStorage(lastBlock)) { + +// Count replicas on decommissioning nodes, as these will not be +// decommissioned unless recovery/completing last block has finished +NumberReplicas numReplicas = countNodes(lastBlock); +int numUsableReplicas = numReplicas.liveReplicas() + +numReplicas.decommissioning() +
+numReplicas.liveEnteringMaintenanceReplicas(); + +if (hasMinStorage(lastBlock, numUsableReplicas)) { if (committed) { addExpectedReplicasToPending(lastBlock); } http://git-wip-us.apache.org/repos/asf/hadoop/blob/385d2cb7/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java -- diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java index 94e8946..dc0edcc 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java @@ -33,6 +33,7 @@ import java.util.concurrent.ExecutionException; import com.google.common.base.Supplier; import com.google.common.collect.Lists; import org.apache.hadoop.fs.BlockLocation; +import org.apache.hadoop.fs.CommonConfigurationKeys; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -646,6 +647,53 @@ public class TestDecommission extends AdminStatesBaseTest { fdos.close(); } + + @Test(timeout = 36) + public void testDecommissionWithOpenFileAndBlockRecovery() + throws IOException, InterruptedException { +startCluster(1, 6); +getCluster().waitActive(); + +Path file = new Path("/testRecoveryDecommission"); + +// Create a file and never close the output stream to trigger recovery +DistributedFileSystem dfs = getCluster().getFileSystem(); +FSDataOutputStream out = dfs.create(file, true, +getConf().getInt(CommonConfigurationKeys.IO_FILE_BUFFER_SIZE_KEY, 4096), +(short) 3, blockSize); + +// Write data to the file +long writtenBytes = 0; +while (writtenBytes < fileSize) { + out.writeLong(writtenBytes); + writtenBytes += 8; +} +out.hsync(); + +DatanodeInfo[] lastBlockLocations = NameNodeAdapter.getBlockLocations( + getCluster().getNameNode(), 
"/testRecoveryDecommission", 0, fileSize) + .getLastLocatedBlock().getLocations(); + +// Decommission all nodes of the last block +ArrayList toDecom = new ArrayList<>(); +for (DatanodeInfo dnDecom : lastBlockLocations) { + toDecom.add(dnDecom.getXferAddr()); +} +initExcludeHosts(toDecom); +refreshNodes(0); + +// Make sure hard lease expires to trigger replica recovery +getCluster().setLeasePeriod(300L, 300L); +
[28/33] hadoop git commit: HDFS-11499. Decommissioning stuck because of failing recovery. Contributed by Lukas Majercak and Manoj Govindassamy.
HDFS-11499. Decommissioning stuck because of failing recovery. Contributed by Lukas Majercak and Manoj Govindassamy. Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/385d2cb7 Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/385d2cb7 Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/385d2cb7 Branch: refs/heads/YARN-5734 Commit: 385d2cb777a0272ac20c62336c944fad295d5d12 Parents: 570827a Author: Masatake Iwasaki Authored: Thu Mar 9 13:30:33 2017 +0900 Committer: Masatake Iwasaki Committed: Thu Mar 9 21:13:50 2017 +0900 -- .../server/blockmanagement/BlockManager.java| 10 +++- .../apache/hadoop/hdfs/TestDecommission.java| 48 ++ .../hadoop/hdfs/TestMaintenanceState.java | 51 3 files changed, 108 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/hadoop/blob/385d2cb7/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java -- diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java index 9ec28f9..5dc40fa 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java @@ -891,7 +891,15 @@ public class BlockManager implements BlockStatsMXBean { lastBlock.getUnderConstructionFeature() .updateStorageScheduledSize((BlockInfoStriped) lastBlock); } -if (hasMinStorage(lastBlock)) { + +// Count replicas on decommissioning nodes, as these will not be +// decommissioned unless recovery/completing last block has finished +NumberReplicas numReplicas = countNodes(lastBlock); +int numUsableReplicas = numReplicas.liveReplicas() + +numReplicas.decommissioning() +
+numReplicas.liveEnteringMaintenanceReplicas(); + +if (hasMinStorage(lastBlock, numUsableReplicas)) { if (committed) { addExpectedReplicasToPending(lastBlock); } http://git-wip-us.apache.org/repos/asf/hadoop/blob/385d2cb7/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java -- diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java index 94e8946..dc0edcc 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java @@ -33,6 +33,7 @@ import java.util.concurrent.ExecutionException; import com.google.common.base.Supplier; import com.google.common.collect.Lists; import org.apache.hadoop.fs.BlockLocation; +import org.apache.hadoop.fs.CommonConfigurationKeys; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -646,6 +647,53 @@ public class TestDecommission extends AdminStatesBaseTest { fdos.close(); } + + @Test(timeout = 36) + public void testDecommissionWithOpenFileAndBlockRecovery() + throws IOException, InterruptedException { +startCluster(1, 6); +getCluster().waitActive(); + +Path file = new Path("/testRecoveryDecommission"); + +// Create a file and never close the output stream to trigger recovery +DistributedFileSystem dfs = getCluster().getFileSystem(); +FSDataOutputStream out = dfs.create(file, true, +getConf().getInt(CommonConfigurationKeys.IO_FILE_BUFFER_SIZE_KEY, 4096), +(short) 3, blockSize); + +// Write data to the file +long writtenBytes = 0; +while (writtenBytes < fileSize) { + out.writeLong(writtenBytes); + writtenBytes += 8; +} +out.hsync(); + +DatanodeInfo[] lastBlockLocations = NameNodeAdapter.getBlockLocations( + getCluster().getNameNode(), 
"/testRecoveryDecommission", 0, fileSize) + .getLastLocatedBlock().getLocations(); + +// Decommission all nodes of the last block +ArrayList toDecom = new ArrayList<>(); +for (DatanodeInfo dnDecom : lastBlockLocations) { + toDecom.add(dnDecom.getXferAddr()); +} +initExcludeHosts(toDecom); +refreshNodes(0); + +// Make sure hard lease expires to trigger replica recovery +getCluster().setLeasePeriod(300L, 300L); +
hadoop git commit: HDFS-11499. Decommissioning stuck because of failing recovery. Contributed by Lukas Majercak and Manoj Govindassamy.
Repository: hadoop Updated Branches: refs/heads/branch-2 4a1187238 -> 60be2e5d8 HDFS-11499. Decommissioning stuck because of failing recovery. Contributed by Lukas Majercak and Manoj Govindassamy. (cherry picked from commit 385d2cb777a0272ac20c62336c944fad295d5d12) Conflicts: hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/60be2e5d Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/60be2e5d Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/60be2e5d Branch: refs/heads/branch-2 Commit: 60be2e5d8a1a6a8921c68f8b0f428b55152d05db Parents: 4a11872 Author: Masatake Iwasaki Authored: Thu Mar 9 13:30:33 2017 +0900 Committer: Masatake Iwasaki Committed: Thu Mar 9 23:37:04 2017 +0900 -- .../server/blockmanagement/BlockManager.java| 10 +++- .../apache/hadoop/hdfs/TestDecommission.java| 48 ++ .../hadoop/hdfs/TestMaintenanceState.java | 51 3 files changed, 108 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/hadoop/blob/60be2e5d/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java -- diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java index dad82d2..5d5706d 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java @@ -801,7 +801,15 @@ public class BlockManager implements BlockStatsMXBean { return false; // already completed (e.g.
by syncBlock) final boolean committed = commitBlock(lastBlock, commitBlock); -if (countNodes(lastBlock).liveReplicas() >= minReplication) { + +// Count replicas on decommissioning nodes, as these will not be +// decommissioned unless recovery/completing last block has finished +NumberReplicas numReplicas = countNodes(lastBlock); +int numUsableReplicas = numReplicas.liveReplicas() + +numReplicas.decommissioning() + +numReplicas.liveEnteringMaintenanceReplicas(); + +if (numUsableReplicas >= minReplication) { if (committed) { addExpectedReplicasToPending(lastBlock); } http://git-wip-us.apache.org/repos/asf/hadoop/blob/60be2e5d/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java -- diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java index 5551782..b34f047 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java @@ -33,6 +33,7 @@ import java.util.concurrent.ExecutionException; import com.google.common.base.Supplier; import com.google.common.collect.Lists; import org.apache.hadoop.fs.BlockLocation; +import org.apache.hadoop.fs.CommonConfigurationKeys; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -647,6 +648,53 @@ public class TestDecommission extends AdminStatesBaseTest { fdos.close(); } + + @Test(timeout = 36) + public void testDecommissionWithOpenFileAndBlockRecovery() + throws IOException, InterruptedException { +startCluster(1, 6); +getCluster().waitActive(); + +Path file = new Path("/testRecoveryDecommission"); + +// Create a file and never close the output stream to trigger recovery +DistributedFileSystem dfs = getCluster().getFileSystem(); +FSDataOutputStream out 
= dfs.create(file, true, +getConf().getInt(CommonConfigurationKeys.IO_FILE_BUFFER_SIZE_KEY, 4096), +(short) 3, blockSize); + +// Write data to the file +long writtenBytes = 0; +while (writtenBytes < fileSize) { + out.writeLong(writtenBytes); + writtenBytes += 8; +} +out.hsync(); + +DatanodeInfo[] lastBlockLocations = NameNodeAdapter.getBlockLocations( + getCluster().getNameNode(), "/testRecoveryDecommission", 0, fileSize) + .getLastLocatedBlock().getLocations(); + +// Decommission all nodes of the last block +
hadoop git commit: HDFS-11499. Decommissioning stuck because of failing recovery. Contributed by Lukas Majercak and Manoj Govindassamy.
Repository: hadoop Updated Branches: refs/heads/trunk 570827a81 -> 385d2cb77 HDFS-11499. Decommissioning stuck because of failing recovery. Contributed by Lukas Majercak and Manoj Govindassamy. Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/385d2cb7 Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/385d2cb7 Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/385d2cb7 Branch: refs/heads/trunk Commit: 385d2cb777a0272ac20c62336c944fad295d5d12 Parents: 570827a Author: Masatake Iwasaki Authored: Thu Mar 9 13:30:33 2017 +0900 Committer: Masatake Iwasaki Committed: Thu Mar 9 21:13:50 2017 +0900 -- .../server/blockmanagement/BlockManager.java| 10 +++- .../apache/hadoop/hdfs/TestDecommission.java| 48 ++ .../hadoop/hdfs/TestMaintenanceState.java | 51 3 files changed, 108 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/hadoop/blob/385d2cb7/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java -- diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java index 9ec28f9..5dc40fa 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java @@ -891,7 +891,15 @@ public class BlockManager implements BlockStatsMXBean { lastBlock.getUnderConstructionFeature() .updateStorageScheduledSize((BlockInfoStriped) lastBlock); } -if (hasMinStorage(lastBlock)) { + +// Count replicas on decommissioning nodes, as these will not be +// decommissioned unless recovery/completing last block has finished +NumberReplicas numReplicas = countNodes(lastBlock); +int numUsableReplicas =
numReplicas.liveReplicas() + +numReplicas.decommissioning() + +numReplicas.liveEnteringMaintenanceReplicas(); + +if (hasMinStorage(lastBlock, numUsableReplicas)) { if (committed) { addExpectedReplicasToPending(lastBlock); } http://git-wip-us.apache.org/repos/asf/hadoop/blob/385d2cb7/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java -- diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java index 94e8946..dc0edcc 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDecommission.java @@ -33,6 +33,7 @@ import java.util.concurrent.ExecutionException; import com.google.common.base.Supplier; import com.google.common.collect.Lists; import org.apache.hadoop.fs.BlockLocation; +import org.apache.hadoop.fs.CommonConfigurationKeys; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -646,6 +647,53 @@ public class TestDecommission extends AdminStatesBaseTest { fdos.close(); } + + @Test(timeout = 36) + public void testDecommissionWithOpenFileAndBlockRecovery() + throws IOException, InterruptedException { +startCluster(1, 6); +getCluster().waitActive(); + +Path file = new Path("/testRecoveryDecommission"); + +// Create a file and never close the output stream to trigger recovery +DistributedFileSystem dfs = getCluster().getFileSystem(); +FSDataOutputStream out = dfs.create(file, true, +getConf().getInt(CommonConfigurationKeys.IO_FILE_BUFFER_SIZE_KEY, 4096), +(short) 3, blockSize); + +// Write data to the file +long writtenBytes = 0; +while (writtenBytes < fileSize) { + out.writeLong(writtenBytes); + writtenBytes += 8; +} +out.hsync(); + +DatanodeInfo[] lastBlockLocations = 
NameNodeAdapter.getBlockLocations( + getCluster().getNameNode(), "/testRecoveryDecommission", 0, fileSize) + .getLastLocatedBlock().getLocations(); + +// Decommission all nodes of the last block +ArrayList toDecom = new ArrayList<>(); +for (DatanodeInfo dnDecom : lastBlockLocations) { + toDecom.add(dnDecom.getXferAddr()); +} +initExcludeHosts(toDecom); +refreshNodes(0); + +// Make sure hard lease expires to