Repository: hadoop
Updated Branches:
  refs/heads/branch-2.8 23a658c4e -> 4722cd9f3
HDFS-12070. Failed block recovery leaves files open indefinitely and at risk for data loss. Contributed by Kihwal Lee.

(cherry picked from commit 4b43f2aa566322317a7f3163027bf5fd0a247207)

Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/4722cd9f
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/4722cd9f
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/4722cd9f

Branch: refs/heads/branch-2.8
Commit: 4722cd9f35a8ff3efb106fe297d48b73c849f776
Parents: 23a658c
Author: Kihwal Lee <[email protected]>
Authored: Mon Feb 26 11:15:06 2018 -0600
Committer: Kihwal Lee <[email protected]>
Committed: Mon Feb 26 11:16:44 2018 -0600

----------------------------------------------------------------------
 .../server/datanode/BlockRecoveryWorker.java  |  6 +--
 .../apache/hadoop/hdfs/TestLeaseRecovery.java | 44 ++++++++++++++++++++
 2 files changed, 46 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hadoop/blob/4722cd9f/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockRecoveryWorker.java
----------------------------------------------------------------------
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockRecoveryWorker.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockRecoveryWorker.java
index 86fead2..b19e51d 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockRecoveryWorker.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockRecoveryWorker.java
@@ -294,10 +294,8 @@ public class BlockRecoveryWorker {
       }
     }
 
-    // If any of the data-nodes failed, the recovery fails, because
-    // we never know the actual state of the replica on failed data-nodes.
-    // The recovery should be started over.
-    if (!failedList.isEmpty()) {
+    // Abort if all failed.
+    if (successList.isEmpty()) {
       StringBuilder b = new StringBuilder();
       for(DatanodeID id : failedList) {
         b.append("\n  " + id);
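The condition change above is small but significant: previously a single
failed DataNode in the second stage of block recovery aborted the whole
attempt, so a replica that could never respond correctly (for example, one
whose meta file was lost) made every retry fail the same way and left the
file open indefinitely. With this commit, recovery aborts only when no node
succeeded, and otherwise proceeds with the successful subset. A minimal,
self-contained sketch of the new decision rule (the class name and the
string-based node IDs are illustrative stand-ins, not the committed Hadoop
code):

  import java.io.IOException;
  import java.util.List;

  class RecoveryDecision {
    // Old rule: abort when ANY node failed -- one permanently bad replica
    // makes every retry fail, so the file is never closed.
    // New rule: abort only when ALL nodes failed; otherwise proceed with
    // the successful subset so the block can be committed and the file
    // closed.
    static void check(List<String> successList, List<String> failedList)
        throws IOException {
      if (successList.isEmpty()) {    // was: if (!failedList.isEmpty())
        StringBuilder b = new StringBuilder();
        for (String id : failedList) {
          b.append("\n  ").append(id);
        }
        throw new IOException(
            "Cannot recover the block, no data-node succeeded:" + b);
      }
      // continue recovery using successList only
    }
  }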
http://git-wip-us.apache.org/repos/asf/hadoop/blob/4722cd9f/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestLeaseRecovery.java
----------------------------------------------------------------------
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestLeaseRecovery.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestLeaseRecovery.java
index d62194c..c82b47c 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestLeaseRecovery.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestLeaseRecovery.java
@@ -228,6 +228,50 @@ public class TestLeaseRecovery {
   }
 
   /**
+   * Block/lease recovery should be retried with failed nodes from the second
+   * stage removed to avoid perpetual recovery failures.
+   */
+  @Test
+  public void testBlockRecoveryRetryAfterFailedRecovery() throws Exception {
+    Configuration conf = new Configuration();
+    cluster = new MiniDFSCluster.Builder(conf).numDataNodes(3).build();
+    Path file = new Path("/testBlockRecoveryRetryAfterFailedRecovery");
+    DistributedFileSystem dfs = cluster.getFileSystem();
+
+    // Create a file.
+    FSDataOutputStream out = dfs.create(file);
+    final int FILE_SIZE = 128 * 1024;
+    int count = 0;
+    while (count < FILE_SIZE) {
+      out.writeBytes("DE K9SUL");
+      count += 8;
+    }
+    out.hsync();
+
+    // Abort the original stream.
+    ((DFSOutputStream) out.getWrappedStream()).abort();
+
+    LocatedBlocks locations = cluster.getNameNodeRpc().getBlockLocations(
+        file.toString(), 0, count);
+    ExtendedBlock block = locations.get(0).getBlock();
+
+    // Finalize one replica to simulate a partial close failure.
+    cluster.getDataNodes().get(0).getFSDataset().finalizeBlock(block, false);
+    // Delete the meta file to simulate a rename/move failure.
+    cluster.deleteMeta(0, block);
+
+    // Try to recover the lease.
+    DistributedFileSystem newDfs = (DistributedFileSystem) FileSystem
+        .newInstance(cluster.getConfiguration(0));
+    count = 0;
+    while (count++ < 15 && !newDfs.recoverLease(file)) {
+      Thread.sleep(1000);
+    }
+    // The lease should have been recovered.
+    assertTrue("File should be closed", newDfs.recoverLease(file));
+  }
+
+  /**
    * Recover the lease on a file and append file from another client.
    */
   @Test
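The test drives recovery through DistributedFileSystem#recoverLease, the same
entry point a client or operator uses to force-close an orphaned open file.
A standalone sketch of that polling pattern, assuming an existing HDFS
Configuration (the class name, retry count, and sleep interval here are
illustrative choices mirroring the test, not fixed API values):

  import org.apache.hadoop.conf.Configuration;
  import org.apache.hadoop.fs.FileSystem;
  import org.apache.hadoop.fs.Path;
  import org.apache.hadoop.hdfs.DistributedFileSystem;

  public class LeaseRecoveryClient {
    /** Polls recoverLease() until the file is closed or retries run out. */
    public static boolean waitForLeaseRecovery(Configuration conf, Path file)
        throws Exception {
      DistributedFileSystem dfs =
          (DistributedFileSystem) FileSystem.newInstance(conf);
      try {
        for (int i = 0; i < 15; i++) {
          // recoverLease() returns true once the file is closed; before
          // this fix, a permanently failing replica could make it return
          // false on every call.
          if (dfs.recoverLease(file)) {
            return true;
          }
          Thread.sleep(1000);  // recovery is asynchronous; poll and wait
        }
        return false;
      } finally {
        dfs.close();
      }
    }
  }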
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]