Repository: hadoop Updated Branches: refs/heads/branch-3.1 cb260a2d3 -> 33f82323b
HDFS-12070. Failed block recovery leaves files open indefinitely and at risk for data loss. Contributed by Kihwal Lee. (cherry picked from commit 451265a83d8798624ae2a144bc58fa41db826704) Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/33f82323 Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/33f82323 Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/33f82323 Branch: refs/heads/branch-3.1 Commit: 33f82323b0db22f1dc884babbbb59bbc367311c0 Parents: cb260a2 Author: Kihwal Lee <kih...@apache.org> Authored: Mon Feb 26 10:29:28 2018 -0600 Committer: Kihwal Lee <kih...@apache.org> Committed: Mon Feb 26 10:29:28 2018 -0600 ---------------------------------------------------------------------- .../server/datanode/BlockRecoveryWorker.java | 6 +-- .../apache/hadoop/hdfs/TestLeaseRecovery.java | 44 ++++++++++++++++++++ 2 files changed, 46 insertions(+), 4 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hadoop/blob/33f82323/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockRecoveryWorker.java ---------------------------------------------------------------------- diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockRecoveryWorker.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockRecoveryWorker.java index 2ecd986..94835e2 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockRecoveryWorker.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockRecoveryWorker.java @@ -307,10 +307,8 @@ public class BlockRecoveryWorker { } } - // If any of the data-nodes failed, the recovery fails, because - // we never know the actual state of the replica on failed data-nodes. - // The recovery should be started over. - if (!failedList.isEmpty()) { + // Abort if all failed. + if (successList.isEmpty()) { throw new IOException("Cannot recover " + block + ", the following datanodes failed: " + failedList); } http://git-wip-us.apache.org/repos/asf/hadoop/blob/33f82323/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestLeaseRecovery.java ---------------------------------------------------------------------- diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestLeaseRecovery.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestLeaseRecovery.java index d62194c..c82b47c 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestLeaseRecovery.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestLeaseRecovery.java @@ -228,6 +228,50 @@ public class TestLeaseRecovery { } /** + * Block/lease recovery should be retried with failed nodes from the second + * stage removed to avoid perpetual recovery failures. + */ + @Test + public void testBlockRecoveryRetryAfterFailedRecovery() throws Exception { + Configuration conf = new Configuration(); + cluster = new MiniDFSCluster.Builder(conf).numDataNodes(3).build(); + Path file = new Path("/testBlockRecoveryRetryAfterFailedRecovery"); + DistributedFileSystem dfs = cluster.getFileSystem(); + + // Create a file. + FSDataOutputStream out = dfs.create(file); + final int FILE_SIZE = 128 * 1024; + int count = 0; + while (count < FILE_SIZE) { + out.writeBytes("DE K9SUL"); + count += 8; + } + out.hsync(); + + // Abort the original stream. + ((DFSOutputStream) out.getWrappedStream()).abort(); + + LocatedBlocks locations = cluster.getNameNodeRpc().getBlockLocations( + file.toString(), 0, count); + ExtendedBlock block = locations.get(0).getBlock(); + + // Finalize one replica to simulate a partial close failure. + cluster.getDataNodes().get(0).getFSDataset().finalizeBlock(block, false); + // Delete the meta file to simulate a rename/move failure. + cluster.deleteMeta(0, block); + + // Try to recover the lease. + DistributedFileSystem newDfs = (DistributedFileSystem) FileSystem + .newInstance(cluster.getConfiguration(0)); + count = 0; + while (count++ < 15 && !newDfs.recoverLease(file)) { + Thread.sleep(1000); + } + // The lease should have been recovered. + assertTrue("File should be closed", newDfs.recoverLease(file)); + } + + /** * Recover the lease on a file and append file from another client. */ @Test --------------------------------------------------------------------- To unsubscribe, e-mail: common-commits-unsubscr...@hadoop.apache.org For additional commands, e-mail: common-commits-h...@hadoop.apache.org