[21/50] [abbrv] hadoop git commit: HDFS-12070. Failed block recovery leaves files open indefinitely and at risk for data loss. Contributed by Kihwal Lee.

2018-03-01 Thread rohithsharmaks
HDFS-12070. Failed block recovery leaves files open indefinitely and at risk 
for data loss. Contributed by Kihwal Lee.


Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/362272bc
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/362272bc
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/362272bc

Branch: refs/heads/YARN-7055
Commit: 362272bc3afc87e3fe15f557366c0bcd0a87a238
Parents: dc86ff4
Author: Kihwal Lee 
Authored: Mon Feb 26 10:28:04 2018 -0600
Committer: Rohith Sharma K S 
Committed: Fri Mar 2 11:08:28 2018 +0530

--
 .../server/datanode/BlockRecoveryWorker.java|  6 +--
 .../apache/hadoop/hdfs/TestLeaseRecovery.java   | 44 
 2 files changed, 46 insertions(+), 4 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/hadoop/blob/362272bc/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockRecoveryWorker.java
--
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockRecoveryWorker.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockRecoveryWorker.java
index 2ecd986..94835e2 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockRecoveryWorker.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockRecoveryWorker.java
@@ -307,10 +307,8 @@ public class BlockRecoveryWorker {
         }
       }
 
-      // If any of the data-nodes failed, the recovery fails, because
-      // we never know the actual state of the replica on failed data-nodes.
-      // The recovery should be started over.
-      if (!failedList.isEmpty()) {
+      // Abort if all failed.
+      if (successList.isEmpty()) {
         throw new IOException("Cannot recover " + block
             + ", the following datanodes failed: " + failedList);
       }
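
For readers skimming the diff: the changed guard ends the second stage of block
recovery, in which the primary datanode asks every participating replica to
commit the recovered block under a new generation stamp. The sketch below
illustrates that surrounding control flow; "Replica" and commit() are
hypothetical stand-ins for Hadoop's BlockRecord and its
updateReplicaUnderRecovery() call, and the body is an abridged paraphrase of
BlockRecoveryWorker#syncBlock, not the verbatim method.

    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.List;

    // Abridged sketch of the second recovery stage. "Replica" and
    // commit() are hypothetical stand-ins for Hadoop's BlockRecord and
    // updateReplicaUnderRecovery(); only the partitioning logic and the
    // changed guard are faithful to the patch.
    class SyncBlockSketch {
      interface Replica {
        String id();
        void commit() throws IOException;
      }

      static List<String> syncBlock(List<Replica> participating, String block)
          throws IOException {
        List<String> successList = new ArrayList<>();
        List<String> failedList = new ArrayList<>();
        for (Replica r : participating) {
          try {
            r.commit();              // datanode finalizes the recovered replica
            successList.add(r.id());
          } catch (IOException e) {
            failedList.add(r.id());  // note the failure, but keep going
          }
        }
        // Before HDFS-12070 this guard was `if (!failedList.isEmpty())`:
        // one persistently bad replica (e.g. a missing meta file) made
        // every recovery attempt fail the same way, leaving the file
        // open indefinitely. Now recovery proceeds when at least one
        // replica succeeded, and only the successful nodes are reported
        // back to the NameNode.
        if (successList.isEmpty()) {
          throw new IOException("Cannot recover " + block
              + ", the following datanodes failed: " + failedList);
        }
        return successList;
      }
    }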

http://git-wip-us.apache.org/repos/asf/hadoop/blob/362272bc/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestLeaseRecovery.java
--
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestLeaseRecovery.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestLeaseRecovery.java
index d62194c..c82b47c 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestLeaseRecovery.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestLeaseRecovery.java
@@ -228,6 +228,50 @@ public class TestLeaseRecovery {
   }
 
   /**
+   * Block/lease recovery should be retried with failed nodes from the second
+   * stage removed to avoid perpetual recovery failures.
+   */
+  @Test
+  public void testBlockRecoveryRetryAfterFailedRecovery() throws Exception {
+    Configuration conf = new Configuration();
+    cluster = new MiniDFSCluster.Builder(conf).numDataNodes(3).build();
+    Path file = new Path("/testBlockRecoveryRetryAfterFailedRecovery");
+    DistributedFileSystem dfs = cluster.getFileSystem();
+
+    // Create a file.
+    FSDataOutputStream out = dfs.create(file);
+    final int FILE_SIZE = 128 * 1024;
+    int count = 0;
+    while (count < FILE_SIZE) {
+      out.writeBytes("DE K9SUL");
+      count += 8;
+    }
+    out.hsync();
+
+    // Abort the original stream.
+    ((DFSOutputStream) out.getWrappedStream()).abort();
+
+    LocatedBlocks locations = cluster.getNameNodeRpc().getBlockLocations(
+        file.toString(), 0, count);
+    ExtendedBlock block = locations.get(0).getBlock();
+
+    // Finalize one replica to simulate a partial close failure.
+    cluster.getDataNodes().get(0).getFSDataset().finalizeBlock(block, false);
+    // Delete the meta file to simulate a rename/move failure.
+    cluster.deleteMeta(0, block);
+
+    // Try to recover the lease.
+    DistributedFileSystem newDfs = (DistributedFileSystem) FileSystem
+        .newInstance(cluster.getConfiguration(0));
+    count = 0;
+    while (count++ < 15 && !newDfs.recoverLease(file)) {
+      Thread.sleep(1000);
+    }
+    // The lease should have been recovered.
+    assertTrue("File should be closed", newDfs.recoverLease(file));
+  }
+
+  /**
    * Recover the lease on a file and append file from another client.
    */
   @Test
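
The test above drives recovery through DistributedFileSystem#recoverLease,
polling until the NameNode reports the file closed. A minimal client-side
sketch of the same pattern follows; the path is hypothetical, while
recoverLease() and isFileClosed() are existing DistributedFileSystem methods.

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.hdfs.DistributedFileSystem;

    // Reclaim a file left open by a dead writer. recoverLease() returns
    // true once the file is closed; it may need several attempts while
    // block recovery runs in the background on the datanodes.
    public class LeaseRecoveryClient {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        DistributedFileSystem fs =
            (DistributedFileSystem) FileSystem.get(conf); // assumes HDFS default FS
        Path file = new Path("/data/app/stuck-file");     // hypothetical path
        int attempts = 0;
        while (attempts++ < 15 && !fs.recoverLease(file)) {
          Thread.sleep(1000); // give block recovery time to complete
        }
        if (!fs.isFileClosed(file)) {
          throw new IllegalStateException("lease recovery did not complete");
        }
        fs.close();
      }
    }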





[58/59] [abbrv] hadoop git commit: HDFS-12070. Failed block recovery leaves files open indefinitely and at risk for data loss. Contributed by Kihwal Lee.

2018-02-26 Thread xyao
HDFS-12070. Failed block recovery leaves files open indefinitely and at risk 
for data loss. Contributed by Kihwal Lee.


Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/451265a8
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/451265a8
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/451265a8

Branch: refs/heads/HDFS-7240
Commit: 451265a83d8798624ae2a144bc58fa41db826704
Parents: 2fa7963
Author: Kihwal Lee 
Authored: Mon Feb 26 10:28:04 2018 -0600
Committer: Kihwal Lee 
Committed: Mon Feb 26 10:28:04 2018 -0600

--
 .../server/datanode/BlockRecoveryWorker.java|  6 +--
 .../apache/hadoop/hdfs/TestLeaseRecovery.java   | 44 
 2 files changed, 46 insertions(+), 4 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/hadoop/blob/451265a8/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockRecoveryWorker.java
--
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockRecoveryWorker.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockRecoveryWorker.java
index 2ecd986..94835e2 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockRecoveryWorker.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockRecoveryWorker.java
@@ -307,10 +307,8 @@ public class BlockRecoveryWorker {
         }
       }
 
-      // If any of the data-nodes failed, the recovery fails, because
-      // we never know the actual state of the replica on failed data-nodes.
-      // The recovery should be started over.
-      if (!failedList.isEmpty()) {
+      // Abort if all failed.
+      if (successList.isEmpty()) {
         throw new IOException("Cannot recover " + block
             + ", the following datanodes failed: " + failedList);
       }

http://git-wip-us.apache.org/repos/asf/hadoop/blob/451265a8/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestLeaseRecovery.java
--
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestLeaseRecovery.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestLeaseRecovery.java
index d62194c..c82b47c 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestLeaseRecovery.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestLeaseRecovery.java
@@ -228,6 +228,50 @@ public class TestLeaseRecovery {
   }
 
   /**
+   * Block/lease recovery should be retried with failed nodes from the second
+   * stage removed to avoid perpetual recovery failures.
+   */
+  @Test
+  public void testBlockRecoveryRetryAfterFailedRecovery() throws Exception {
+    Configuration conf = new Configuration();
+    cluster = new MiniDFSCluster.Builder(conf).numDataNodes(3).build();
+    Path file = new Path("/testBlockRecoveryRetryAfterFailedRecovery");
+    DistributedFileSystem dfs = cluster.getFileSystem();
+
+    // Create a file.
+    FSDataOutputStream out = dfs.create(file);
+    final int FILE_SIZE = 128 * 1024;
+    int count = 0;
+    while (count < FILE_SIZE) {
+      out.writeBytes("DE K9SUL");
+      count += 8;
+    }
+    out.hsync();
+
+    // Abort the original stream.
+    ((DFSOutputStream) out.getWrappedStream()).abort();
+
+    LocatedBlocks locations = cluster.getNameNodeRpc().getBlockLocations(
+        file.toString(), 0, count);
+    ExtendedBlock block = locations.get(0).getBlock();
+
+    // Finalize one replica to simulate a partial close failure.
+    cluster.getDataNodes().get(0).getFSDataset().finalizeBlock(block, false);
+    // Delete the meta file to simulate a rename/move failure.
+    cluster.deleteMeta(0, block);
+
+    // Try to recover the lease.
+    DistributedFileSystem newDfs = (DistributedFileSystem) FileSystem
+        .newInstance(cluster.getConfiguration(0));
+    count = 0;
+    while (count++ < 15 && !newDfs.recoverLease(file)) {
+      Thread.sleep(1000);
+    }
+    // The lease should have been recovered.
+    assertTrue("File should be closed", newDfs.recoverLease(file));
+  }
+
+  /**
    * Recover the lease on a file and append file from another client.
    */
   @Test





hadoop git commit: HDFS-12070. Failed block recovery leaves files open indefinitely and at risk for data loss. Contributed by Kihwal Lee.

2018-02-26 Thread kihwal
Repository: hadoop
Updated Branches:
  refs/heads/branch-2.8 23a658c4e -> 4722cd9f3


HDFS-12070. Failed block recovery leaves files open indefinitely and at risk 
for data loss. Contributed by Kihwal Lee.

(cherry picked from commit 4b43f2aa566322317a7f3163027bf5fd0a247207)


Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/4722cd9f
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/4722cd9f
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/4722cd9f

Branch: refs/heads/branch-2.8
Commit: 4722cd9f35a8ff3efb106fe297d48b73c849f776
Parents: 23a658c
Author: Kihwal Lee 
Authored: Mon Feb 26 11:15:06 2018 -0600
Committer: Kihwal Lee 
Committed: Mon Feb 26 11:16:44 2018 -0600

--
 .../server/datanode/BlockRecoveryWorker.java|  6 +--
 .../apache/hadoop/hdfs/TestLeaseRecovery.java   | 44 
 2 files changed, 46 insertions(+), 4 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/hadoop/blob/4722cd9f/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockRecoveryWorker.java
--
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockRecoveryWorker.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockRecoveryWorker.java
index 86fead2..b19e51d 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockRecoveryWorker.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockRecoveryWorker.java
@@ -294,10 +294,8 @@ public class BlockRecoveryWorker {
         }
       }
 
-      // If any of the data-nodes failed, the recovery fails, because
-      // we never know the actual state of the replica on failed data-nodes.
-      // The recovery should be started over.
-      if (!failedList.isEmpty()) {
+      // Abort if all failed.
+      if (successList.isEmpty()) {
         StringBuilder b = new StringBuilder();
         for(DatanodeID id : failedList) {
           b.append("\n  " + id);

http://git-wip-us.apache.org/repos/asf/hadoop/blob/4722cd9f/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestLeaseRecovery.java
--
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestLeaseRecovery.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestLeaseRecovery.java
index d62194c..c82b47c 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestLeaseRecovery.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestLeaseRecovery.java
@@ -228,6 +228,50 @@ public class TestLeaseRecovery {
   }
 
   /**
+   * Block/lease recovery should be retried with failed nodes from the second
+   * stage removed to avoid perpetual recovery failures.
+   */
+  @Test
+  public void testBlockRecoveryRetryAfterFailedRecovery() throws Exception {
+    Configuration conf = new Configuration();
+    cluster = new MiniDFSCluster.Builder(conf).numDataNodes(3).build();
+    Path file = new Path("/testBlockRecoveryRetryAfterFailedRecovery");
+    DistributedFileSystem dfs = cluster.getFileSystem();
+
+    // Create a file.
+    FSDataOutputStream out = dfs.create(file);
+    final int FILE_SIZE = 128 * 1024;
+    int count = 0;
+    while (count < FILE_SIZE) {
+      out.writeBytes("DE K9SUL");
+      count += 8;
+    }
+    out.hsync();
+
+    // Abort the original stream.
+    ((DFSOutputStream) out.getWrappedStream()).abort();
+
+    LocatedBlocks locations = cluster.getNameNodeRpc().getBlockLocations(
+        file.toString(), 0, count);
+    ExtendedBlock block = locations.get(0).getBlock();
+
+    // Finalize one replica to simulate a partial close failure.
+    cluster.getDataNodes().get(0).getFSDataset().finalizeBlock(block, false);
+    // Delete the meta file to simulate a rename/move failure.
+    cluster.deleteMeta(0, block);
+
+    // Try to recover the lease.
+    DistributedFileSystem newDfs = (DistributedFileSystem) FileSystem
+        .newInstance(cluster.getConfiguration(0));
+    count = 0;
+    while (count++ < 15 && !newDfs.recoverLease(file)) {
+      Thread.sleep(1000);
+    }
+    // The lease should have been recovered.
+    assertTrue("File should be closed", newDfs.recoverLease(file));
+  }
+
+  /**
    * Recover the lease on a file and append file from another client.
    */
   @Test



hadoop git commit: HDFS-12070. Failed block recovery leaves files open indefinitely and at risk for data loss. Contributed by Kihwal Lee.

2018-02-26 Thread kihwal
Repository: hadoop
Updated Branches:
  refs/heads/branch-2.9 627a32375 -> a6343ff80


HDFS-12070. Failed block recovery leaves files open indefinitely and at risk 
for data loss. Contributed by Kihwal Lee.

(cherry picked from commit 4b43f2aa566322317a7f3163027bf5fd0a247207)


Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/a6343ff8
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/a6343ff8
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/a6343ff8

Branch: refs/heads/branch-2.9
Commit: a6343ff808dcdabfa11b0f713a445cdb30474fa7
Parents: 627a323
Author: Kihwal Lee 
Authored: Mon Feb 26 10:59:09 2018 -0600
Committer: Kihwal Lee 
Committed: Mon Feb 26 10:59:47 2018 -0600

--
 .../server/datanode/BlockRecoveryWorker.java|  6 +--
 .../apache/hadoop/hdfs/TestLeaseRecovery.java   | 44 
 2 files changed, 46 insertions(+), 4 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/hadoop/blob/a6343ff8/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockRecoveryWorker.java
--
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockRecoveryWorker.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockRecoveryWorker.java
index aa36247..8d218ae 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockRecoveryWorker.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockRecoveryWorker.java
@@ -293,10 +293,8 @@ public class BlockRecoveryWorker {
         }
       }
 
-      // If any of the data-nodes failed, the recovery fails, because
-      // we never know the actual state of the replica on failed data-nodes.
-      // The recovery should be started over.
-      if (!failedList.isEmpty()) {
+      // Abort if all failed.
+      if (successList.isEmpty()) {
         StringBuilder b = new StringBuilder();
         for(DatanodeID id : failedList) {
           b.append("\n  " + id);

http://git-wip-us.apache.org/repos/asf/hadoop/blob/a6343ff8/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestLeaseRecovery.java
--
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestLeaseRecovery.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestLeaseRecovery.java
index d62194c..c82b47c 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestLeaseRecovery.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestLeaseRecovery.java
@@ -228,6 +228,50 @@ public class TestLeaseRecovery {
   }
 
   /**
+   * Block/lease recovery should be retried with failed nodes from the second
+   * stage removed to avoid perpetual recovery failures.
+   */
+  @Test
+  public void testBlockRecoveryRetryAfterFailedRecovery() throws Exception {
+    Configuration conf = new Configuration();
+    cluster = new MiniDFSCluster.Builder(conf).numDataNodes(3).build();
+    Path file = new Path("/testBlockRecoveryRetryAfterFailedRecovery");
+    DistributedFileSystem dfs = cluster.getFileSystem();
+
+    // Create a file.
+    FSDataOutputStream out = dfs.create(file);
+    final int FILE_SIZE = 128 * 1024;
+    int count = 0;
+    while (count < FILE_SIZE) {
+      out.writeBytes("DE K9SUL");
+      count += 8;
+    }
+    out.hsync();
+
+    // Abort the original stream.
+    ((DFSOutputStream) out.getWrappedStream()).abort();
+
+    LocatedBlocks locations = cluster.getNameNodeRpc().getBlockLocations(
+        file.toString(), 0, count);
+    ExtendedBlock block = locations.get(0).getBlock();
+
+    // Finalize one replica to simulate a partial close failure.
+    cluster.getDataNodes().get(0).getFSDataset().finalizeBlock(block, false);
+    // Delete the meta file to simulate a rename/move failure.
+    cluster.deleteMeta(0, block);
+
+    // Try to recover the lease.
+    DistributedFileSystem newDfs = (DistributedFileSystem) FileSystem
+        .newInstance(cluster.getConfiguration(0));
+    count = 0;
+    while (count++ < 15 && !newDfs.recoverLease(file)) {
+      Thread.sleep(1000);
+    }
+    // The lease should have been recovered.
+    assertTrue("File should be closed", newDfs.recoverLease(file));
+  }
+
+  /**
    * Recover the lease on a file and append file from another client.
    */
   @Test



hadoop git commit: HDFS-12070. Failed block recovery leaves files open indefinitely and at risk for data loss. Contributed by Kihwal Lee.

2018-02-26 Thread kihwal
Repository: hadoop
Updated Branches:
  refs/heads/branch-2 79af42f09 -> 4b43f2aa5


HDFS-12070. Failed block recovery leaves files open indefinitely and at risk 
for data loss. Contributed by Kihwal Lee.

(cherry picked from commit 451265a83d8798624ae2a144bc58fa41db826704)

Conflicts:

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockRecoveryWorker.java


Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/4b43f2aa
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/4b43f2aa
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/4b43f2aa

Branch: refs/heads/branch-2
Commit: 4b43f2aa566322317a7f3163027bf5fd0a247207
Parents: 79af42f
Author: Kihwal Lee 
Authored: Mon Feb 26 10:58:07 2018 -0600
Committer: Kihwal Lee 
Committed: Mon Feb 26 10:58:07 2018 -0600

--
 .../server/datanode/BlockRecoveryWorker.java|  6 +--
 .../apache/hadoop/hdfs/TestLeaseRecovery.java   | 44 
 2 files changed, 46 insertions(+), 4 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/hadoop/blob/4b43f2aa/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockRecoveryWorker.java
--
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockRecoveryWorker.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockRecoveryWorker.java
index aa36247..8d218ae 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockRecoveryWorker.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockRecoveryWorker.java
@@ -293,10 +293,8 @@ public class BlockRecoveryWorker {
         }
       }
 
-      // If any of the data-nodes failed, the recovery fails, because
-      // we never know the actual state of the replica on failed data-nodes.
-      // The recovery should be started over.
-      if (!failedList.isEmpty()) {
+      // Abort if all failed.
+      if (successList.isEmpty()) {
         StringBuilder b = new StringBuilder();
         for(DatanodeID id : failedList) {
           b.append("\n  " + id);

http://git-wip-us.apache.org/repos/asf/hadoop/blob/4b43f2aa/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestLeaseRecovery.java
--
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestLeaseRecovery.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestLeaseRecovery.java
index d62194c..c82b47c 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestLeaseRecovery.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestLeaseRecovery.java
@@ -228,6 +228,50 @@ public class TestLeaseRecovery {
   }
 
   /**
+   * Block/lease recovery should be retried with failed nodes from the second
+   * stage removed to avoid perpetual recovery failures.
+   */
+  @Test
+  public void testBlockRecoveryRetryAfterFailedRecovery() throws Exception {
+    Configuration conf = new Configuration();
+    cluster = new MiniDFSCluster.Builder(conf).numDataNodes(3).build();
+    Path file = new Path("/testBlockRecoveryRetryAfterFailedRecovery");
+    DistributedFileSystem dfs = cluster.getFileSystem();
+
+    // Create a file.
+    FSDataOutputStream out = dfs.create(file);
+    final int FILE_SIZE = 128 * 1024;
+    int count = 0;
+    while (count < FILE_SIZE) {
+      out.writeBytes("DE K9SUL");
+      count += 8;
+    }
+    out.hsync();
+
+    // Abort the original stream.
+    ((DFSOutputStream) out.getWrappedStream()).abort();
+
+    LocatedBlocks locations = cluster.getNameNodeRpc().getBlockLocations(
+        file.toString(), 0, count);
+    ExtendedBlock block = locations.get(0).getBlock();
+
+    // Finalize one replica to simulate a partial close failure.
+    cluster.getDataNodes().get(0).getFSDataset().finalizeBlock(block, false);
+    // Delete the meta file to simulate a rename/move failure.
+    cluster.deleteMeta(0, block);
+
+    // Try to recover the lease.
+    DistributedFileSystem newDfs = (DistributedFileSystem) FileSystem
+        .newInstance(cluster.getConfiguration(0));
+    count = 0;
+    while (count++ < 15 && !newDfs.recoverLease(file)) {
+      Thread.sleep(1000);
+    }
+    // The lease should have been recovered.
+    assertTrue("File should be closed", newDfs.recoverLease(file));
+  }
+
+  /**
    * Recover the lease on a file and append file from another client.
    */
   @Test



hadoop git commit: HDFS-12070. Failed block recovery leaves files open indefinitely and at risk for data loss. Contributed by Kihwal Lee.

2018-02-26 Thread kihwal
Repository: hadoop
Updated Branches:
  refs/heads/branch-3.0 21d4b5fd2 -> 1087b9af8


HDFS-12070. Failed block recovery leaves files open indefinitely and at risk 
for data loss. Contributed by Kihwal Lee.

(cherry picked from commit 451265a83d8798624ae2a144bc58fa41db826704)


Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/1087b9af
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/1087b9af
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/1087b9af

Branch: refs/heads/branch-3.0
Commit: 1087b9af8c34742bdcf90f2e5b809bddb9f79315
Parents: 21d4b5f
Author: Kihwal Lee 
Authored: Mon Feb 26 10:30:50 2018 -0600
Committer: Kihwal Lee 
Committed: Mon Feb 26 10:30:50 2018 -0600

--
 .../server/datanode/BlockRecoveryWorker.java|  6 +--
 .../apache/hadoop/hdfs/TestLeaseRecovery.java   | 44 
 2 files changed, 46 insertions(+), 4 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/hadoop/blob/1087b9af/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockRecoveryWorker.java
--
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockRecoveryWorker.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockRecoveryWorker.java
index 2ecd986..94835e2 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockRecoveryWorker.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockRecoveryWorker.java
@@ -307,10 +307,8 @@ public class BlockRecoveryWorker {
         }
       }
 
-      // If any of the data-nodes failed, the recovery fails, because
-      // we never know the actual state of the replica on failed data-nodes.
-      // The recovery should be started over.
-      if (!failedList.isEmpty()) {
+      // Abort if all failed.
+      if (successList.isEmpty()) {
         throw new IOException("Cannot recover " + block
             + ", the following datanodes failed: " + failedList);
       }

http://git-wip-us.apache.org/repos/asf/hadoop/blob/1087b9af/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestLeaseRecovery.java
--
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestLeaseRecovery.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestLeaseRecovery.java
index d62194c..c82b47c 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestLeaseRecovery.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestLeaseRecovery.java
@@ -228,6 +228,50 @@ public class TestLeaseRecovery {
   }
 
   /**
+   * Block/lease recovery should be retried with failed nodes from the second
+   * stage removed to avoid perpetual recovery failures.
+   */
+  @Test
+  public void testBlockRecoveryRetryAfterFailedRecovery() throws Exception {
+    Configuration conf = new Configuration();
+    cluster = new MiniDFSCluster.Builder(conf).numDataNodes(3).build();
+    Path file = new Path("/testBlockRecoveryRetryAfterFailedRecovery");
+    DistributedFileSystem dfs = cluster.getFileSystem();
+
+    // Create a file.
+    FSDataOutputStream out = dfs.create(file);
+    final int FILE_SIZE = 128 * 1024;
+    int count = 0;
+    while (count < FILE_SIZE) {
+      out.writeBytes("DE K9SUL");
+      count += 8;
+    }
+    out.hsync();
+
+    // Abort the original stream.
+    ((DFSOutputStream) out.getWrappedStream()).abort();
+
+    LocatedBlocks locations = cluster.getNameNodeRpc().getBlockLocations(
+        file.toString(), 0, count);
+    ExtendedBlock block = locations.get(0).getBlock();
+
+    // Finalize one replica to simulate a partial close failure.
+    cluster.getDataNodes().get(0).getFSDataset().finalizeBlock(block, false);
+    // Delete the meta file to simulate a rename/move failure.
+    cluster.deleteMeta(0, block);
+
+    // Try to recover the lease.
+    DistributedFileSystem newDfs = (DistributedFileSystem) FileSystem
+        .newInstance(cluster.getConfiguration(0));
+    count = 0;
+    while (count++ < 15 && !newDfs.recoverLease(file)) {
+      Thread.sleep(1000);
+    }
+    // The lease should have been recovered.
+    assertTrue("File should be closed", newDfs.recoverLease(file));
+  }
+
+  /**
    * Recover the lease on a file and append file from another client.
    */
   @Test



hadoop git commit: HDFS-12070. Failed block recovery leaves files open indefinitely and at risk for data loss. Contributed by Kihwal Lee.

2018-02-26 Thread kihwal
Repository: hadoop
Updated Branches:
  refs/heads/branch-3.1 cb260a2d3 -> 33f82323b


HDFS-12070. Failed block recovery leaves files open indefinitely and at risk 
for data loss. Contributed by Kihwal Lee.

(cherry picked from commit 451265a83d8798624ae2a144bc58fa41db826704)


Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/33f82323
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/33f82323
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/33f82323

Branch: refs/heads/branch-3.1
Commit: 33f82323b0db22f1dc884ba59bbc367311c0
Parents: cb260a2
Author: Kihwal Lee 
Authored: Mon Feb 26 10:29:28 2018 -0600
Committer: Kihwal Lee 
Committed: Mon Feb 26 10:29:28 2018 -0600

--
 .../server/datanode/BlockRecoveryWorker.java|  6 +--
 .../apache/hadoop/hdfs/TestLeaseRecovery.java   | 44 
 2 files changed, 46 insertions(+), 4 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/hadoop/blob/33f82323/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockRecoveryWorker.java
--
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockRecoveryWorker.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockRecoveryWorker.java
index 2ecd986..94835e2 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockRecoveryWorker.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockRecoveryWorker.java
@@ -307,10 +307,8 @@ public class BlockRecoveryWorker {
         }
       }
 
-      // If any of the data-nodes failed, the recovery fails, because
-      // we never know the actual state of the replica on failed data-nodes.
-      // The recovery should be started over.
-      if (!failedList.isEmpty()) {
+      // Abort if all failed.
+      if (successList.isEmpty()) {
         throw new IOException("Cannot recover " + block
             + ", the following datanodes failed: " + failedList);
       }

http://git-wip-us.apache.org/repos/asf/hadoop/blob/33f82323/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestLeaseRecovery.java
--
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestLeaseRecovery.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestLeaseRecovery.java
index d62194c..c82b47c 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestLeaseRecovery.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestLeaseRecovery.java
@@ -228,6 +228,50 @@ public class TestLeaseRecovery {
   }
 
   /**
+   * Block/lease recovery should be retried with failed nodes from the second
+   * stage removed to avoid perpetual recovery failures.
+   */
+  @Test
+  public void testBlockRecoveryRetryAfterFailedRecovery() throws Exception {
+    Configuration conf = new Configuration();
+    cluster = new MiniDFSCluster.Builder(conf).numDataNodes(3).build();
+    Path file = new Path("/testBlockRecoveryRetryAfterFailedRecovery");
+    DistributedFileSystem dfs = cluster.getFileSystem();
+
+    // Create a file.
+    FSDataOutputStream out = dfs.create(file);
+    final int FILE_SIZE = 128 * 1024;
+    int count = 0;
+    while (count < FILE_SIZE) {
+      out.writeBytes("DE K9SUL");
+      count += 8;
+    }
+    out.hsync();
+
+    // Abort the original stream.
+    ((DFSOutputStream) out.getWrappedStream()).abort();
+
+    LocatedBlocks locations = cluster.getNameNodeRpc().getBlockLocations(
+        file.toString(), 0, count);
+    ExtendedBlock block = locations.get(0).getBlock();
+
+    // Finalize one replica to simulate a partial close failure.
+    cluster.getDataNodes().get(0).getFSDataset().finalizeBlock(block, false);
+    // Delete the meta file to simulate a rename/move failure.
+    cluster.deleteMeta(0, block);
+
+    // Try to recover the lease.
+    DistributedFileSystem newDfs = (DistributedFileSystem) FileSystem
+        .newInstance(cluster.getConfiguration(0));
+    count = 0;
+    while (count++ < 15 && !newDfs.recoverLease(file)) {
+      Thread.sleep(1000);
+    }
+    // The lease should have been recovered.
+    assertTrue("File should be closed", newDfs.recoverLease(file));
+  }
+
+  /**
    * Recover the lease on a file and append file from another client.
    */
   @Test



hadoop git commit: HDFS-12070. Failed block recovery leaves files open indefinitely and at risk for data loss. Contributed by Kihwal Lee.

2018-02-26 Thread kihwal
Repository: hadoop
Updated Branches:
  refs/heads/trunk 2fa7963c3 -> 451265a83


HDFS-12070. Failed block recovery leaves files open indefinitely and at risk 
for data loss. Contributed by Kihwal Lee.


Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/451265a8
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/451265a8
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/451265a8

Branch: refs/heads/trunk
Commit: 451265a83d8798624ae2a144bc58fa41db826704
Parents: 2fa7963
Author: Kihwal Lee 
Authored: Mon Feb 26 10:28:04 2018 -0600
Committer: Kihwal Lee 
Committed: Mon Feb 26 10:28:04 2018 -0600

--
 .../server/datanode/BlockRecoveryWorker.java|  6 +--
 .../apache/hadoop/hdfs/TestLeaseRecovery.java   | 44 
 2 files changed, 46 insertions(+), 4 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/hadoop/blob/451265a8/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockRecoveryWorker.java
--
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockRecoveryWorker.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockRecoveryWorker.java
index 2ecd986..94835e2 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockRecoveryWorker.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockRecoveryWorker.java
@@ -307,10 +307,8 @@ public class BlockRecoveryWorker {
         }
       }
 
-      // If any of the data-nodes failed, the recovery fails, because
-      // we never know the actual state of the replica on failed data-nodes.
-      // The recovery should be started over.
-      if (!failedList.isEmpty()) {
+      // Abort if all failed.
+      if (successList.isEmpty()) {
         throw new IOException("Cannot recover " + block
             + ", the following datanodes failed: " + failedList);
       }

http://git-wip-us.apache.org/repos/asf/hadoop/blob/451265a8/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestLeaseRecovery.java
--
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestLeaseRecovery.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestLeaseRecovery.java
index d62194c..c82b47c 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestLeaseRecovery.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestLeaseRecovery.java
@@ -228,6 +228,50 @@ public class TestLeaseRecovery {
   }
 
   /**
+   * Block/lease recovery should be retried with failed nodes from the second
+   * stage removed to avoid perpetual recovery failures.
+   */
+  @Test
+  public void testBlockRecoveryRetryAfterFailedRecovery() throws Exception {
+    Configuration conf = new Configuration();
+    cluster = new MiniDFSCluster.Builder(conf).numDataNodes(3).build();
+    Path file = new Path("/testBlockRecoveryRetryAfterFailedRecovery");
+    DistributedFileSystem dfs = cluster.getFileSystem();
+
+    // Create a file.
+    FSDataOutputStream out = dfs.create(file);
+    final int FILE_SIZE = 128 * 1024;
+    int count = 0;
+    while (count < FILE_SIZE) {
+      out.writeBytes("DE K9SUL");
+      count += 8;
+    }
+    out.hsync();
+
+    // Abort the original stream.
+    ((DFSOutputStream) out.getWrappedStream()).abort();
+
+    LocatedBlocks locations = cluster.getNameNodeRpc().getBlockLocations(
+        file.toString(), 0, count);
+    ExtendedBlock block = locations.get(0).getBlock();
+
+    // Finalize one replica to simulate a partial close failure.
+    cluster.getDataNodes().get(0).getFSDataset().finalizeBlock(block, false);
+    // Delete the meta file to simulate a rename/move failure.
+    cluster.deleteMeta(0, block);
+
+    // Try to recover the lease.
+    DistributedFileSystem newDfs = (DistributedFileSystem) FileSystem
+        .newInstance(cluster.getConfiguration(0));
+    count = 0;
+    while (count++ < 15 && !newDfs.recoverLease(file)) {
+      Thread.sleep(1000);
+    }
+    // The lease should have been recovered.
+    assertTrue("File should be closed", newDfs.recoverLease(file));
+  }
+
+  /**
    * Recover the lease on a file and append file from another client.
    */
   @Test

